public inbox for linux-rdma@vger.kernel.org
 help / color / mirror / Atom feed
* [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found] ` <1405605697-11583-1-git-send-email-devesh.sharma-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
@ 2014-07-17 14:01   ` Devesh Sharma
       [not found]     ` <3e39e90f-7095-4eb9-a844-516672a355ad-3RiH6ntJJkOPfaB/Gd0HpljyZtpTMMwT@public.gmane.org>
  2014-07-17 14:01   ` [for-next 2/2] xprtrdma: fix deallocation sequence of pd Devesh Sharma
  1 sibling, 1 reply; 38+ messages in thread
From: Devesh Sharma @ 2014-07-17 14:01 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: chuck.lever-QHcLZuEGTsvQT0dZR+AlfA, Devesh Sharma

If verndor driver is attempted for removal while xprtrdma still has an
active mount, the removal of driver may never complete and can cause
unseen races or in worst case system crash.

To solve this, xprtrdma module should get reference of struct ib_device
structure for every mount. Reference is taken after local device address
resolution is completed successfuly.

reference to the struct ib_device pointer is put just before cm_id destruction.

Signed-off-by: Devesh Sharma <devesh.sharma-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 net/sunrpc/xprtrdma/verbs.c     |   17 +++++++++++++++--
 net/sunrpc/xprtrdma/xprt_rdma.h |    1 +
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 6ead5df..b00e55e 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -457,6 +457,11 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
 	}
 	wait_for_completion_interruptible_timeout(&ia->ri_done,
 				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
+	if (!ia->ri_async_rc && !try_module_get(id->device->owner)) {
+		dprintk("RPC:       %s: Failed to get device module\n",
+			__func__);
+		ia->ri_async_rc = -ENODEV;
+	}
 	rc = ia->ri_async_rc;
 	if (rc)
 		goto out;
@@ -466,16 +471,18 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
 	if (rc) {
 		dprintk("RPC:       %s: rdma_resolve_route() failed %i\n",
 			__func__, rc);
-		goto out;
+		goto out_put;
 	}
 	wait_for_completion_interruptible_timeout(&ia->ri_done,
 				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
 	rc = ia->ri_async_rc;
 	if (rc)
-		goto out;
+		goto out_put;
 
 	return id;
 
+out_put:
+	module_put(id->device->owner);
 out:
 	rdma_destroy_id(id);
 	return ERR_PTR(rc);
@@ -613,6 +620,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 	rwlock_init(&ia->ri_qplock);
 	return 0;
 out2:
+	module_put(ia->ri_id->device->owner);
 	rdma_destroy_id(ia->ri_id);
 	ia->ri_id = NULL;
 out1:
@@ -638,9 +646,11 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
 	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
 		if (ia->ri_id->qp)
 			rdma_destroy_qp(ia->ri_id);
+		module_put(ia->ri_id->device->owner);
 		rdma_destroy_id(ia->ri_id);
 		ia->ri_id = NULL;
 	}
+
 	if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
 		rc = ib_dealloc_pd(ia->ri_pd);
 		dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
@@ -886,6 +896,7 @@ retry:
 		if (ia->ri_id->device != id->device) {
 			printk("RPC:       %s: can't reconnect on "
 				"different device!\n", __func__);
+			module_put(id->device->owner);
 			rdma_destroy_id(id);
 			rc = -ENETUNREACH;
 			goto out;
@@ -895,6 +906,7 @@ retry:
 		if (rc) {
 			dprintk("RPC:       %s: rdma_create_qp failed %i\n",
 				__func__, rc);
+			module_put(id->device->owner);
 			rdma_destroy_id(id);
 			rc = -ENETUNREACH;
 			goto out;
@@ -906,6 +918,7 @@ retry:
 		write_unlock(&ia->ri_qplock);
 
 		rdma_destroy_qp(old);
+		module_put(old->device->owner);
 		rdma_destroy_id(old);
 	} else {
 		dprintk("RPC:       %s: connecting...\n", __func__);
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index c419498..b35fa21 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -44,6 +44,7 @@
 #include <linux/spinlock.h> 		/* spinlock_t, etc */
 #include <linux/atomic.h>			/* atomic_t, etc */
 #include <linux/workqueue.h>		/* struct work_struct */
+#include <linux/module.h>		/* try_module_get()/module_put() */
 
 #include <rdma/rdma_cm.h>		/* RDMA connection api */
 #include <rdma/ib_verbs.h>		/* RDMA verbs api */
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [for-next 2/2] xprtrdma: fix deallocation sequence of pd
       [not found] ` <1405605697-11583-1-git-send-email-devesh.sharma-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
  2014-07-17 14:01   ` [for-next 1/2] xprtrdma: take reference of rdma provider module Devesh Sharma
@ 2014-07-17 14:01   ` Devesh Sharma
       [not found]     ` <3fdcf67f-2e90-4c61-92da-a8f7743cf54a-3RiH6ntJJkOPfaB/Gd0HpljyZtpTMMwT@public.gmane.org>
  1 sibling, 1 reply; 38+ messages in thread
From: Devesh Sharma @ 2014-07-17 14:01 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: chuck.lever-QHcLZuEGTsvQT0dZR+AlfA, Devesh Sharma

xprtrdma tries to destroy pd after destruction of cm_id. However,
pd should be deallocated before destruction of cm_id.

Signed-off-by: Devesh Sharma <devesh.sharma-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 net/sunrpc/xprtrdma/verbs.c |   22 +++++++++++++---------
 1 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index b00e55e..096e94b 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -544,7 +544,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 	if (rc) {
 		dprintk("RPC:       %s: ib_query_device failed %d\n",
 			__func__, rc);
-		goto out2;
+		goto out3;
 	}
 
 	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
@@ -602,14 +602,14 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 				"phys register failed with %lX\n",
 				__func__, PTR_ERR(ia->ri_bind_mem));
 			rc = -ENOMEM;
-			goto out2;
+			goto out3;
 		}
 		break;
 	default:
 		printk(KERN_ERR "RPC: Unsupported memory "
 				"registration mode: %d\n", memreg);
 		rc = -ENOMEM;
-		goto out2;
+		goto out3;
 	}
 	dprintk("RPC:       %s: memory registration strategy is %d\n",
 		__func__, memreg);
@@ -619,6 +619,9 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 
 	rwlock_init(&ia->ri_qplock);
 	return 0;
+out3:
+	ib_dealloc_pd(ia->ri_pd);
+	ia->ri_pd = NULL;
 out2:
 	module_put(ia->ri_id->device->owner);
 	rdma_destroy_id(ia->ri_id);
@@ -643,19 +646,20 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
 		dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
 			__func__, rc);
 	}
+
 	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
 		if (ia->ri_id->qp)
 			rdma_destroy_qp(ia->ri_id);
+
+		if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
+			rc = ib_dealloc_pd(ia->ri_pd);
+			dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
+				__func__, rc);
+		}
 		module_put(ia->ri_id->device->owner);
 		rdma_destroy_id(ia->ri_id);
 		ia->ri_id = NULL;
 	}
-
-	if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
-		rc = ib_dealloc_pd(ia->ri_pd);
-		dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
-			__func__, rc);
-	}
 }
 
 /*
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]     ` <3e39e90f-7095-4eb9-a844-516672a355ad-3RiH6ntJJkOPfaB/Gd0HpljyZtpTMMwT@public.gmane.org>
@ 2014-07-17 15:01       ` Steve Wise
       [not found]         ` <53C7E546.3080008-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Steve Wise @ 2014-07-17 15:01 UTC (permalink / raw)
  To: Devesh Sharma, Roland Dreier, Hefty, Sean
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	chuck.lever-QHcLZuEGTsvQT0dZR+AlfA



On 7/17/2014 9:01 AM, Devesh Sharma wrote:
> If verndor driver is attempted for removal while xprtrdma still has an
> active mount, the removal of driver may never complete and can cause
> unseen races or in worst case system crash.
>
> To solve this, xprtrdma module should get reference of struct ib_device
> structure for every mount. Reference is taken after local device address
> resolution is completed successfuly.
>
> reference to the struct ib_device pointer is put just before cm_id destruction.
>
> Signed-off-by: Devesh Sharma <devesh.sharma-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>

This seems like an issue with the rdma-cm or rdma core, not xprtrdma.  I 
see that user rdma applications cause a ref on the provider module here 
in ib_uverbs_open():

         if (!try_module_get(dev->ib_dev->owner)) {
                 ret = -ENODEV;
                 goto err;


Maybe kernel applications that allocate device resources should cause a 
ref on the provider's module.

Sean/Roland,  is there some history here as to how rdma provider module 
removal should be handled?

Steve.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [for-next 2/2] xprtrdma: fix deallocation sequence of pd
       [not found]     ` <3fdcf67f-2e90-4c61-92da-a8f7743cf54a-3RiH6ntJJkOPfaB/Gd0HpljyZtpTMMwT@public.gmane.org>
@ 2014-07-17 15:05       ` Steve Wise
       [not found]         ` <53C7E64D.90501-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Steve Wise @ 2014-07-17 15:05 UTC (permalink / raw)
  To: Devesh Sharma, linux-rdma-u79uwXL29TY76Z2rM5mHXA
  Cc: chuck.lever-QHcLZuEGTsvQT0dZR+AlfA

On 7/17/2014 9:01 AM, Devesh Sharma wrote:
> xprtrdma tries to destroy pd after destruction of cm_id. However,
> pd should be deallocated before destruction of cm_id.

Why?

I think you really mean that the pd dealloc needs to be done before the 
module deref that you added in the first patch.  But let us see what 
Roland says about how device removal is supposed to be handled when 
kernel applications are using the device...



> Signed-off-by: Devesh Sharma <devesh.sharma-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
> ---
>   net/sunrpc/xprtrdma/verbs.c |   22 +++++++++++++---------
>   1 files changed, 13 insertions(+), 9 deletions(-)
>
> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
> index b00e55e..096e94b 100644
> --- a/net/sunrpc/xprtrdma/verbs.c
> +++ b/net/sunrpc/xprtrdma/verbs.c
> @@ -544,7 +544,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
>   	if (rc) {
>   		dprintk("RPC:       %s: ib_query_device failed %d\n",
>   			__func__, rc);
> -		goto out2;
> +		goto out3;
>   	}
>   
>   	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
> @@ -602,14 +602,14 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
>   				"phys register failed with %lX\n",
>   				__func__, PTR_ERR(ia->ri_bind_mem));
>   			rc = -ENOMEM;
> -			goto out2;
> +			goto out3;
>   		}
>   		break;
>   	default:
>   		printk(KERN_ERR "RPC: Unsupported memory "
>   				"registration mode: %d\n", memreg);
>   		rc = -ENOMEM;
> -		goto out2;
> +		goto out3;
>   	}
>   	dprintk("RPC:       %s: memory registration strategy is %d\n",
>   		__func__, memreg);
> @@ -619,6 +619,9 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
>   
>   	rwlock_init(&ia->ri_qplock);
>   	return 0;
> +out3:
> +	ib_dealloc_pd(ia->ri_pd);
> +	ia->ri_pd = NULL;
>   out2:
>   	module_put(ia->ri_id->device->owner);
>   	rdma_destroy_id(ia->ri_id);
> @@ -643,19 +646,20 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
>   		dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
>   			__func__, rc);
>   	}
> +
>   	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
>   		if (ia->ri_id->qp)
>   			rdma_destroy_qp(ia->ri_id);
> +
> +		if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
> +			rc = ib_dealloc_pd(ia->ri_pd);
> +			dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
> +				__func__, rc);
> +		}
>   		module_put(ia->ri_id->device->owner);
>   		rdma_destroy_id(ia->ri_id);
>   		ia->ri_id = NULL;
>   	}
> -
> -	if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
> -		rc = ib_dealloc_pd(ia->ri_pd);
> -		dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
> -			__func__, rc);
> -	}
>   }
>   
>   /*

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]         ` <53C7E546.3080008-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
@ 2014-07-17 15:05           ` Chuck Lever
       [not found]             ` <78A77C48-AC73-4C01-B139-A00B4F674C70-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
  2014-07-17 15:20           ` Devesh Sharma
  2014-07-17 16:06           ` Hefty, Sean
  2 siblings, 1 reply; 38+ messages in thread
From: Chuck Lever @ 2014-07-17 15:05 UTC (permalink / raw)
  To: Steve Wise
  Cc: Devesh Sharma, Roland Dreier, Hefty, Sean,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA


On Jul 17, 2014, at 11:01 AM, Steve Wise <swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org> wrote:

> 
> 
> On 7/17/2014 9:01 AM, Devesh Sharma wrote:
>> If verndor driver is attempted for removal while xprtrdma still has an
>> active mount, the removal of driver may never complete and can cause
>> unseen races or in worst case system crash.
>> 
>> To solve this, xprtrdma module should get reference of struct ib_device
>> structure for every mount. Reference is taken after local device address
>> resolution is completed successfuly.
>> 
>> reference to the struct ib_device pointer is put just before cm_id destruction.
>> 
>> Signed-off-by: Devesh Sharma <devesh.sharma-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
> 
> This seems like an issue with the rdma-cm or rdma core, not xprtrdma.

Good point. I was wondering if svcrdma might have a similar issue.

Infrequently I see a hang during shutdown that appears to be related to
the ordering of removal of one of the RDMA modules, but I’ve never had
time to chase it.

See also: https://bugzilla.linux-nfs.org/show_bug.cgi?id=252


> I see that user rdma applications cause a ref on the provider module here in ib_uverbs_open():
> 
>        if (!try_module_get(dev->ib_dev->owner)) {
>                ret = -ENODEV;
>                goto err;
> 
> 
> Maybe kernel applications that allocate device resources should cause a ref on the provider's module.
> 
> Sean/Roland,  is there some history here as to how rdma provider module removal should be handled?
> 
> Steve.

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]         ` <53C7E546.3080008-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
  2014-07-17 15:05           ` Chuck Lever
@ 2014-07-17 15:20           ` Devesh Sharma
  2014-07-17 16:06           ` Hefty, Sean
  2 siblings, 0 replies; 38+ messages in thread
From: Devesh Sharma @ 2014-07-17 15:20 UTC (permalink / raw)
  To: Steve Wise, Roland Dreier, Hefty, Sean
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org

________________________________________
From: Steve Wise [swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org]
Sent: Thursday, July 17, 2014 8:31 PM
To: Devesh Sharma; Roland Dreier; Hefty, Sean
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org
Subject: Re: [for-next 1/2] xprtrdma: take reference of rdma provider module

On 7/17/2014 9:01 AM, Devesh Sharma wrote:
> If verndor driver is attempted for removal while xprtrdma still has an
> active mount, the removal of driver may never complete and can cause
> unseen races or in worst case system crash.
>
> To solve this, xprtrdma module should get reference of struct ib_device
> structure for every mount. Reference is taken after local device address
> resolution is completed successfuly.
>
> reference to the struct ib_device pointer is put just before cm_id destruction.
>
> Signed-off-by: Devesh Sharma <devesh.sharma-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>

This seems like an issue with the rdma-cm or rdma core, not xprtrdma.  I
see that user rdma applications cause a ref on the provider module here
in ib_uverbs_open():

         if (!try_module_get(dev->ib_dev->owner)) {
                 ret = -ENODEV;
                 goto err;


Maybe kernel applications that allocate device resources should cause a
ref on the provider's module.

Yes, To me it looks like the right place to get reference of provider driver is
rdma_resolve_addr()-->acuire_ib_dev(). However this patch provides a workaround.

Lets wait for Roland/Sean's input.


Sean/Roland,  is there some history here as to how rdma provider module
removal should be handled?

Steve.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]             ` <78A77C48-AC73-4C01-B139-A00B4F674C70-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
@ 2014-07-17 15:31               ` Devesh Sharma
  0 siblings, 0 replies; 38+ messages in thread
From: Devesh Sharma @ 2014-07-17 15:31 UTC (permalink / raw)
  To: Chuck Lever, Steve Wise
  Cc: Roland Dreier, Hefty, Sean,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

 Replying again, due to mail formatting issue:

> -----Original Message-----
> From: Chuck Lever [mailto:chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org]
> Sent: Thursday, July 17, 2014 8:36 PM
> To: Steve Wise
> Cc: Devesh Sharma; Roland Dreier; Hefty, Sean; linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Subject: Re: [for-next 1/2] xprtrdma: take reference of rdma provider
> module
> 
> 
> On Jul 17, 2014, at 11:01 AM, Steve Wise <swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
> wrote:
> 
> >
> >
> > On 7/17/2014 9:01 AM, Devesh Sharma wrote:
> >> If verndor driver is attempted for removal while xprtrdma still has
> >> an active mount, the removal of driver may never complete and can
> >> cause unseen races or in worst case system crash.
> >>
> >> To solve this, xprtrdma module should get reference of struct
> >> ib_device structure for every mount. Reference is taken after local
> >> device address resolution is completed successfuly.
> >>
> >> reference to the struct ib_device pointer is put just before cm_id
> destruction.
> >>
> >> Signed-off-by: Devesh Sharma <devesh.sharma-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
> >
> > This seems like an issue with the rdma-cm or rdma core, not xprtrdma.
> 
> Good point. I was wondering if svcrdma might have a similar issue.
> 
> Infrequently I see a hang during shutdown that appears to be related to the
> ordering of removal of one of the RDMA modules, but I've never had time to
> chase it.
> 
> See also: https://bugzilla.linux-nfs.org/show_bug.cgi?id=252
> 
> 
> > I see that user rdma applications cause a ref on the provider module here
> in ib_uverbs_open():
> >
> >        if (!try_module_get(dev->ib_dev->owner)) {
> >                ret = -ENODEV;
> >                goto err;
> >
> >
> > Maybe kernel applications that allocate device resources should cause a ref
> on the provider's module.

Yes, To me it looks like the right place to get reference of provider driver is rdma_resolve_addr()-->acuire_ib_dev(). However this patch provides a workaround.

Let's wait for Roland/Sean's input.

> >
> > Sean/Roland,  is there some history here as to how rdma provider module
> removal should be handled?
> >
> > Steve.
> 
> --
> Chuck Lever
> chuck[dot]lever[at]oracle[dot]com
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* RE: [for-next 2/2] xprtrdma: fix deallocation sequence of pd
       [not found]         ` <53C7E64D.90501-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
@ 2014-07-17 15:35           ` Devesh Sharma
  0 siblings, 0 replies; 38+ messages in thread
From: Devesh Sharma @ 2014-07-17 15:35 UTC (permalink / raw)
  To: Steve Wise, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
  Cc: chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org

> -----Original Message-----
> From: Steve Wise [mailto:swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org]
> Sent: Thursday, July 17, 2014 8:36 PM
> To: Devesh Sharma; linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Cc: chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org
> Subject: Re: [for-next 2/2] xprtrdma: fix deallocation sequence of pd
> 
> On 7/17/2014 9:01 AM, Devesh Sharma wrote:
> > xprtrdma tries to destroy pd after destruction of cm_id. However, pd
> > should be deallocated before destruction of cm_id.
> 
> Why?
> 
> I think you really mean that the pd dealloc needs to be done before the
> module deref that you added in the first patch.  But let us see what Roland
> says about how device removal is supposed to be handled when kernel
> applications are using the device...

Partially Yes, PD de-allocation should not be done after taking out the module reference. In the allocation
Sequence PD is allocated after cm_id allocation hence should be de-allocated before cm-id destroy.

> 
> 
> 
> > Signed-off-by: Devesh Sharma <devesh.sharma-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
> > ---
> >   net/sunrpc/xprtrdma/verbs.c |   22 +++++++++++++---------
> >   1 files changed, 13 insertions(+), 9 deletions(-)
> >
> > diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
> > index b00e55e..096e94b 100644
> > --- a/net/sunrpc/xprtrdma/verbs.c
> > +++ b/net/sunrpc/xprtrdma/verbs.c
> > @@ -544,7 +544,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct
> sockaddr *addr, int memreg)
> >   	if (rc) {
> >   		dprintk("RPC:       %s: ib_query_device failed %d\n",
> >   			__func__, rc);
> > -		goto out2;
> > +		goto out3;
> >   	}
> >
> >   	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) { @@
> > -602,14 +602,14 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct
> sockaddr *addr, int memreg)
> >   				"phys register failed with %lX\n",
> >   				__func__, PTR_ERR(ia->ri_bind_mem));
> >   			rc = -ENOMEM;
> > -			goto out2;
> > +			goto out3;
> >   		}
> >   		break;
> >   	default:
> >   		printk(KERN_ERR "RPC: Unsupported memory "
> >   				"registration mode: %d\n", memreg);
> >   		rc = -ENOMEM;
> > -		goto out2;
> > +		goto out3;
> >   	}
> >   	dprintk("RPC:       %s: memory registration strategy is %d\n",
> >   		__func__, memreg);
> > @@ -619,6 +619,9 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct
> > sockaddr *addr, int memreg)
> >
> >   	rwlock_init(&ia->ri_qplock);
> >   	return 0;
> > +out3:
> > +	ib_dealloc_pd(ia->ri_pd);
> > +	ia->ri_pd = NULL;
> >   out2:
> >   	module_put(ia->ri_id->device->owner);
> >   	rdma_destroy_id(ia->ri_id);
> > @@ -643,19 +646,20 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
> >   		dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
> >   			__func__, rc);
> >   	}
> > +
> >   	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
> >   		if (ia->ri_id->qp)
> >   			rdma_destroy_qp(ia->ri_id);
> > +
> > +		if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
> > +			rc = ib_dealloc_pd(ia->ri_pd);
> > +			dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
> > +				__func__, rc);
> > +		}
> >   		module_put(ia->ri_id->device->owner);
> >   		rdma_destroy_id(ia->ri_id);
> >   		ia->ri_id = NULL;
> >   	}
> > -
> > -	if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
> > -		rc = ib_dealloc_pd(ia->ri_pd);
> > -		dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
> > -			__func__, rc);
> > -	}
> >   }
> >
> >   /*

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]         ` <53C7E546.3080008-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
  2014-07-17 15:05           ` Chuck Lever
  2014-07-17 15:20           ` Devesh Sharma
@ 2014-07-17 16:06           ` Hefty, Sean
       [not found]             ` <1828884A29C6694DAF28B7E6B8A823739933FCA3-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
  2 siblings, 1 reply; 38+ messages in thread
From: Hefty, Sean @ 2014-07-17 16:06 UTC (permalink / raw)
  To: Steve Wise, Devesh Sharma, Roland Dreier
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org

> On 7/17/2014 9:01 AM, Devesh Sharma wrote:
> > If verndor driver is attempted for removal while xprtrdma still has an
> > active mount, the removal of driver may never complete and can cause
> > unseen races or in worst case system crash.
> >
> > To solve this, xprtrdma module should get reference of struct ib_device
> > structure for every mount. Reference is taken after local device address
> > resolution is completed successfuly.
> >
> > reference to the struct ib_device pointer is put just before cm_id
> destruction.
> >
> > Signed-off-by: Devesh Sharma <devesh.sharma-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
> 
> This seems like an issue with the rdma-cm or rdma core, not xprtrdma.  I
> see that user rdma applications cause a ref on the provider module here
> in ib_uverbs_open():
> 
>          if (!try_module_get(dev->ib_dev->owner)) {
>                  ret = -ENODEV;
>                  goto err;
> 
> 
> Maybe kernel applications that allocate device resources should cause a
> ref on the provider's module.
> 
> Sean/Roland,  is there some history here as to how rdma provider module
> removal should be handled?

The kernel modules should are not expected to access the rdma devices after their remove device callback has been invoked.  The rdma cm basically forwards the device removal on a per id basis.  Apps are expected to destroy the id after receiving that callback.  The rdma cm should block in the remove device call until all id's associated with the removed device have been destroyed.

- Sean
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]             ` <1828884A29C6694DAF28B7E6B8A823739933FCA3-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2014-07-17 18:57               ` Shirley Ma
       [not found]                 ` <53C81CB7.2030000-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Shirley Ma @ 2014-07-17 18:57 UTC (permalink / raw)
  To: Hefty, Sean, Steve Wise, Devesh Sharma, Roland Dreier
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org



On 07/17/2014 09:06 AM, Hefty, Sean wrote:
>> On 7/17/2014 9:01 AM, Devesh Sharma wrote:
>>> If verndor driver is attempted for removal while xprtrdma still has an
>>> active mount, the removal of driver may never complete and can cause
>>> unseen races or in worst case system crash.
>>>
>>> To solve this, xprtrdma module should get reference of struct ib_device
>>> structure for every mount. Reference is taken after local device address
>>> resolution is completed successfuly.
>>>
>>> reference to the struct ib_device pointer is put just before cm_id
>> destruction.
>>>
>>> Signed-off-by: Devesh Sharma <devesh.sharma-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
>>
>> This seems like an issue with the rdma-cm or rdma core, not xprtrdma.  I
>> see that user rdma applications cause a ref on the provider module here
>> in ib_uverbs_open():
>>
>>          if (!try_module_get(dev->ib_dev->owner)) {
>>                  ret = -ENODEV;
>>                  goto err;
>>
>>
>> Maybe kernel applications that allocate device resources should cause a
>> ref on the provider's module.
>>
>> Sean/Roland,  is there some history here as to how rdma provider module
>> removal should be handled?
> 
> The kernel modules should are not expected to access the rdma devices after their remove device callback has been invoked.  The rdma cm basically forwards the device removal on a per id basis.  Apps are expected to destroy the id after receiving that callback.  The rdma cm should block in the remove device call until all id's associated with the removed device have been destroyed.

So the rdma cm is expected to increase the driver reference count (try_module_get) for each new cm id, then deference count (module_put) when cm id is destroyed?

> - Sean
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]                 ` <53C81CB7.2030000-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
@ 2014-07-17 19:07                   ` Steve Wise
  2014-07-17 19:50                     ` Hefty, Sean
  0 siblings, 1 reply; 38+ messages in thread
From: Steve Wise @ 2014-07-17 19:07 UTC (permalink / raw)
  To: 'Shirley Ma', 'Hefty, Sean',
	'Devesh Sharma', 'Roland Dreier'
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	chuck.lever-QHcLZuEGTsvQT0dZR+AlfA



> -----Original Message-----
> From: Shirley Ma [mailto:shirley.ma-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org]
> Sent: Thursday, July 17, 2014 1:58 PM
> To: Hefty, Sean; Steve Wise; Devesh Sharma; Roland Dreier
> Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org
> Subject: Re: [for-next 1/2] xprtrdma: take reference of rdma provider module
> 
> 
> 
> On 07/17/2014 09:06 AM, Hefty, Sean wrote:
> >> On 7/17/2014 9:01 AM, Devesh Sharma wrote:
> >>> If verndor driver is attempted for removal while xprtrdma still has an
> >>> active mount, the removal of driver may never complete and can cause
> >>> unseen races or in worst case system crash.
> >>>
> >>> To solve this, xprtrdma module should get reference of struct ib_device
> >>> structure for every mount. Reference is taken after local device address
> >>> resolution is completed successfuly.
> >>>
> >>> reference to the struct ib_device pointer is put just before cm_id
> >> destruction.
> >>>
> >>> Signed-off-by: Devesh Sharma <devesh.sharma-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
> >>
> >> This seems like an issue with the rdma-cm or rdma core, not xprtrdma.  I
> >> see that user rdma applications cause a ref on the provider module here
> >> in ib_uverbs_open():
> >>
> >>          if (!try_module_get(dev->ib_dev->owner)) {
> >>                  ret = -ENODEV;
> >>                  goto err;
> >>
> >>
> >> Maybe kernel applications that allocate device resources should cause a
> >> ref on the provider's module.
> >>
> >> Sean/Roland,  is there some history here as to how rdma provider module
> >> removal should be handled?
> >
> > The kernel modules should are not expected to access the rdma devices after their
> remove device callback has been invoked.  The rdma cm basically forwards the device
> removal on a per id basis.  Apps are expected to destroy the id after receiving that
callback.
> The rdma cm should block in the remove device call until all id's associated with the
> removed device have been destroyed.
> 
> So the rdma cm is expected to increase the driver reference count (try_module_get) for
> each new cm id, then deference count (module_put) when cm id is destroyed?
> 

No, I think he's saying the rdma-cm posts a RDMA_CM_DEVICE_REMOVAL event  to each
application with rdmacm objects allocated, and each application is expected to destroy all
the objects it has allocated before returning from the event handler.

And I think the ib_verbs core calls each ib_client's remove handler when an rdma provider
unregisters with the core.  

Steve.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
  2014-07-17 19:07                   ` Steve Wise
@ 2014-07-17 19:50                     ` Hefty, Sean
       [not found]                       ` <1828884A29C6694DAF28B7E6B8A823739933FDEA-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
  2014-07-17 20:08                       ` Steve Wise
  0 siblings, 2 replies; 38+ messages in thread
From: Hefty, Sean @ 2014-07-17 19:50 UTC (permalink / raw)
  To: Steve Wise, 'Shirley Ma', 'Devesh Sharma',
	'Roland Dreier'
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org

> > So the rdma cm is expected to increase the driver reference count
> (try_module_get) for
> > each new cm id, then deference count (module_put) when cm id is
> destroyed?
> >
> 
> No, I think he's saying the rdma-cm posts a RDMA_CM_DEVICE_REMOVAL event
> to each
> application with rdmacm objects allocated, and each application is expected
> to destroy all
> the objects it has allocated before returning from the event handler.

This is almost correct.  The applications do not have to destroy all the objects that it has allocated before returning from their event handler.  E.g. an app can queue a work item that does the destruction.  The rdmacm will block in its ib_client remove handler until all relevant rdma_cm_id's have been destroyed.

> And I think the ib_verbs core calls each ib_client's remove handler when an
> rdma provider
> unregisters with the core.

yes
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]                       ` <1828884A29C6694DAF28B7E6B8A823739933FDEA-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2014-07-17 19:55                         ` Steve Wise
  2014-07-17 20:23                           ` Shirley Ma
  0 siblings, 1 reply; 38+ messages in thread
From: Steve Wise @ 2014-07-17 19:55 UTC (permalink / raw)
  To: 'Hefty, Sean', 'Shirley Ma',
	'Devesh Sharma', 'Roland Dreier'
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	chuck.lever-QHcLZuEGTsvQT0dZR+AlfA



> -----Original Message-----
> From: Hefty, Sean [mailto:sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org]
> Sent: Thursday, July 17, 2014 2:50 PM
> To: Steve Wise; 'Shirley Ma'; 'Devesh Sharma'; 'Roland Dreier'
> Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org
> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
> 
> > > So the rdma cm is expected to increase the driver reference count
> > (try_module_get) for
> > > each new cm id, then deference count (module_put) when cm id is
> > destroyed?
> > >
> >
> > No, I think he's saying the rdma-cm posts a RDMA_CM_DEVICE_REMOVAL event
> > to each
> > application with rdmacm objects allocated, and each application is expected
> > to destroy all
> > the objects it has allocated before returning from the event handler.
> 
> This is almost correct.  The applications do not have to destroy all the objects that it
has
> allocated before returning from their event handler.  E.g. an app can queue a work item
> that does the destruction.  The rdmacm will block in its ib_client remove handler until
all
> relevant rdma_cm_id's have been destroyed.
> 

Thanks for the clarification.  

> > And I think the ib_verbs core calls each ib_client's remove handler when an
> > rdma provider
> > unregisters with the core.
> 
> yes

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
  2014-07-17 19:50                     ` Hefty, Sean
       [not found]                       ` <1828884A29C6694DAF28B7E6B8A823739933FDEA-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2014-07-17 20:08                       ` Steve Wise
  2014-07-17 20:41                         ` Chuck Lever
  2014-07-18  6:19                         ` Devesh Sharma
  1 sibling, 2 replies; 38+ messages in thread
From: Steve Wise @ 2014-07-17 20:08 UTC (permalink / raw)
  To: 'Hefty, Sean', 'Shirley Ma',
	'Devesh Sharma', 'Roland Dreier'
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	chuck.lever-QHcLZuEGTsvQT0dZR+AlfA



> -----Original Message-----
> From: Steve Wise [mailto:swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org]
> Sent: Thursday, July 17, 2014 2:56 PM
> To: 'Hefty, Sean'; 'Shirley Ma'; 'Devesh Sharma'; 'Roland Dreier'
> Cc: 'linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org'; 'chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org'
> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
> 
> 
> 
> > -----Original Message-----
> > From: Hefty, Sean [mailto:sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org]
> > Sent: Thursday, July 17, 2014 2:50 PM
> > To: Steve Wise; 'Shirley Ma'; 'Devesh Sharma'; 'Roland Dreier'
> > Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org
> > Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
> >
> > > > So the rdma cm is expected to increase the driver reference count
> > > (try_module_get) for
> > > > each new cm id, then deference count (module_put) when cm id is
> > > destroyed?
> > > >
> > >
> > > No, I think he's saying the rdma-cm posts a RDMA_CM_DEVICE_REMOVAL event
> > > to each
> > > application with rdmacm objects allocated, and each application is expected
> > > to destroy all
> > > the objects it has allocated before returning from the event handler.
> >
> > This is almost correct.  The applications do not have to destroy all the objects that
it has
> > allocated before returning from their event handler.  E.g. an app can queue a work
item
> > that does the destruction.  The rdmacm will block in its ib_client remove handler
until all
> > relevant rdma_cm_id's have been destroyed.
> >
> 
> Thanks for the clarification.
> 

And looking at xprtrdma, it does handle the DEVICE_REMOVAL event in rpcrdma_conn_upcall().
It sets ep->rep_connected to -ENODEV, wakes everybody up, and calls rpcrdma_conn_func()
for that endpoint, which schedules rep_connect_worker...  and I gave up following the code
path at this point... :)  

For this to all work correctly, it would need to destroy all the QPs, MRs, CQs, etc for
that device _before_ destroying the rdma cm ids.  Otherwise the provider module could be
unloaded too soon...

Steve.




--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [for-next 1/2] xprtrdma: take reference of rdma provider module
  2014-07-17 19:55                         ` Steve Wise
@ 2014-07-17 20:23                           ` Shirley Ma
  0 siblings, 0 replies; 38+ messages in thread
From: Shirley Ma @ 2014-07-17 20:23 UTC (permalink / raw)
  To: Steve Wise, 'Hefty, Sean', 'Devesh Sharma',
	'Roland Dreier'
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	chuck.lever-QHcLZuEGTsvQT0dZR+AlfA



On 07/17/2014 12:55 PM, Steve Wise wrote:
> 
> 
>> -----Original Message-----
>> From: Hefty, Sean [mailto:sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org]
>> Sent: Thursday, July 17, 2014 2:50 PM
>> To: Steve Wise; 'Shirley Ma'; 'Devesh Sharma'; 'Roland Dreier'
>> Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org
>> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
>>
>>>> So the rdma cm is expected to increase the driver reference count
>>> (try_module_get) for
>>>> each new cm id, then deference count (module_put) when cm id is
>>> destroyed?
>>>>
>>>
>>> No, I think he's saying the rdma-cm posts a RDMA_CM_DEVICE_REMOVAL event
>>> to each
>>> application with rdmacm objects allocated, and each application is expected
>>> to destroy all
>>> the objects it has allocated before returning from the event handler.
>>
>> This is almost correct.  The applications do not have to destroy all the objects that it
> has
>> allocated before returning from their event handler.  E.g. an app can queue a work item
>> that does the destruction.  The rdmacm will block in its ib_client remove handler until
> all
>> relevant rdma_cm_id's have been destroyed.
>>
> 
> Thanks for the clarification.  

Thanks, checked the cma code, the reference count maintains there.

>>> And I think the ib_verbs core calls each ib_client's remove handler when an
>>> rdma provider
>>> unregisters with the core.
>>
>> yes
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [for-next 1/2] xprtrdma: take reference of rdma provider module
  2014-07-17 20:08                       ` Steve Wise
@ 2014-07-17 20:41                         ` Chuck Lever
       [not found]                           ` <DF7CE85B-288D-4CC2-AD51-B326D5F1EE1A-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
  2014-07-18  6:19                         ` Devesh Sharma
  1 sibling, 1 reply; 38+ messages in thread
From: Chuck Lever @ 2014-07-17 20:41 UTC (permalink / raw)
  To: Steve Wise
  Cc: Hefty, Sean, Shirley Ma, Devesh Sharma, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA


On Jul 17, 2014, at 4:08 PM, Steve Wise <swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org> wrote:

> 
> 
>> -----Original Message-----
>> From: Steve Wise [mailto:swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org]
>> Sent: Thursday, July 17, 2014 2:56 PM
>> To: 'Hefty, Sean'; 'Shirley Ma'; 'Devesh Sharma'; 'Roland Dreier'
>> Cc: 'linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org'; 'chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org'
>> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
>> 
>> 
>> 
>>> -----Original Message-----
>>> From: Hefty, Sean [mailto:sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org]
>>> Sent: Thursday, July 17, 2014 2:50 PM
>>> To: Steve Wise; 'Shirley Ma'; 'Devesh Sharma'; 'Roland Dreier'
>>> Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org
>>> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
>>> 
>>>>> So the rdma cm is expected to increase the driver reference count
>>>> (try_module_get) for
>>>>> each new cm id, then deference count (module_put) when cm id is
>>>> destroyed?
>>>>> 
>>>> 
>>>> No, I think he's saying the rdma-cm posts a RDMA_CM_DEVICE_REMOVAL event
>>>> to each
>>>> application with rdmacm objects allocated, and each application is expected
>>>> to destroy all
>>>> the objects it has allocated before returning from the event handler.
>>> 
>>> This is almost correct.  The applications do not have to destroy all the objects that
> it has
>>> allocated before returning from their event handler.  E.g. an app can queue a work
> item
>>> that does the destruction.  The rdmacm will block in its ib_client remove handler
> until all
>>> relevant rdma_cm_id's have been destroyed.
>>> 
>> 
>> Thanks for the clarification.
>> 
> 
> And looking at xprtrdma, it does handle the DEVICE_REMOVAL event in rpcrdma_conn_upcall().
> It sets ep->rep_connected to -ENODEV, wakes everybody up, and calls rpcrdma_conn_func()
> for that endpoint, which schedules rep_connect_worker...  and I gave up following the code
> path at this point... :)  
> 
> For this to all work correctly, it would need to destroy all the QPs, MRs, CQs, etc for
> that device _before_ destroying the rdma cm ids.  Otherwise the provider module could be
> unloaded too soon…

We can’t really deal with a CM_DEVICE_REMOVE event while there are active
NFS mounts.

System shutdown ordering should guarantee (one would hope) that NFS
mount points are unmounted before the RDMA/IB core infrastructure is
torn down. Ordering shouldn’t matter as long all NFS activity has
ceased before the CM tries to remove the device.

So if something is hanging up the CM, there’s something xprtrdma is not
cleaning up properly.

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]                           ` <DF7CE85B-288D-4CC2-AD51-B326D5F1EE1A-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
@ 2014-07-17 20:59                             ` Steve Wise
  2014-07-18  5:05                               ` Devesh Sharma
  2014-07-17 21:25                             ` Shirley Ma
  1 sibling, 1 reply; 38+ messages in thread
From: Steve Wise @ 2014-07-17 20:59 UTC (permalink / raw)
  To: 'Chuck Lever'
  Cc: 'Hefty, Sean', 'Shirley Ma',
	'Devesh Sharma', 'Roland Dreier',
	linux-rdma-u79uwXL29TY76Z2rM5mHXA



> -----Original Message-----
> From: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org [mailto:linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On
> Behalf Of Chuck Lever
> Sent: Thursday, July 17, 2014 3:42 PM
> To: Steve Wise
> Cc: Hefty, Sean; Shirley Ma; Devesh Sharma; Roland Dreier; linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Subject: Re: [for-next 1/2] xprtrdma: take reference of rdma provider module
> 
> 
> On Jul 17, 2014, at 4:08 PM, Steve Wise <swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org> wrote:
> 
> >
> >
> >> -----Original Message-----
> >> From: Steve Wise [mailto:swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org]
> >> Sent: Thursday, July 17, 2014 2:56 PM
> >> To: 'Hefty, Sean'; 'Shirley Ma'; 'Devesh Sharma'; 'Roland Dreier'
> >> Cc: 'linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org'; 'chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org'
> >> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
> >>
> >>
> >>
> >>> -----Original Message-----
> >>> From: Hefty, Sean [mailto:sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org]
> >>> Sent: Thursday, July 17, 2014 2:50 PM
> >>> To: Steve Wise; 'Shirley Ma'; 'Devesh Sharma'; 'Roland Dreier'
> >>> Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org
> >>> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
> >>>
> >>>>> So the rdma cm is expected to increase the driver reference count
> >>>> (try_module_get) for
> >>>>> each new cm id, then deference count (module_put) when cm id is
> >>>> destroyed?
> >>>>>
> >>>>
> >>>> No, I think he's saying the rdma-cm posts a RDMA_CM_DEVICE_REMOVAL event
> >>>> to each
> >>>> application with rdmacm objects allocated, and each application is expected
> >>>> to destroy all
> >>>> the objects it has allocated before returning from the event handler.
> >>>
> >>> This is almost correct.  The applications do not have to destroy all the objects
that
> > it has
> >>> allocated before returning from their event handler.  E.g. an app can queue a work
> > item
> >>> that does the destruction.  The rdmacm will block in its ib_client remove handler
> > until all
> >>> relevant rdma_cm_id's have been destroyed.
> >>>
> >>
> >> Thanks for the clarification.
> >>
> >
> > And looking at xprtrdma, it does handle the DEVICE_REMOVAL event in
> rpcrdma_conn_upcall().
> > It sets ep->rep_connected to -ENODEV, wakes everybody up, and calls
> rpcrdma_conn_func()
> > for that endpoint, which schedules rep_connect_worker...  and I gave up following the
> code
> > path at this point... :)
> >
> > For this to all work correctly, it would need to destroy all the QPs, MRs, CQs, etc
for
> > that device _before_ destroying the rdma cm ids.  Otherwise the provider module could
> be
> > unloaded too soon.
> 
> We can't really deal with a CM_DEVICE_REMOVE event while there are active
> NFS mounts.
> 
> System shutdown ordering should guarantee (one would hope) that NFS
> mount points are unmounted before the RDMA/IB core infrastructure is
> torn down. Ordering shouldn't matter as long all NFS activity has
> ceased before the CM tries to remove the device.
> 
> So if something is hanging up the CM, there's something xprtrdma is not
> cleaning up properly.
> 


Devesh, how are you reproducing this?  Are you just rmmod'ing the ocrdma module while
there are active mounts?






--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]                           ` <DF7CE85B-288D-4CC2-AD51-B326D5F1EE1A-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
  2014-07-17 20:59                             ` Steve Wise
@ 2014-07-17 21:25                             ` Shirley Ma
  1 sibling, 0 replies; 38+ messages in thread
From: Shirley Ma @ 2014-07-17 21:25 UTC (permalink / raw)
  To: Chuck Lever, Steve Wise
  Cc: Hefty, Sean, Devesh Sharma, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA


On 07/17/2014 01:41 PM, Chuck Lever wrote:
> On Jul 17, 2014, at 4:08 PM, Steve Wise <swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org> wrote:
> 
>> > 
>> > 
>>> >> -----Original Message-----
>>> >> From: Steve Wise [mailto:swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org]
>>> >> Sent: Thursday, July 17, 2014 2:56 PM
>>> >> To: 'Hefty, Sean'; 'Shirley Ma'; 'Devesh Sharma'; 'Roland Dreier'
>>> >> Cc: 'linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org'; 'chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org'
>>> >> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
>>> >> 
>>> >> 
>>> >> 
>>>> >>> -----Original Message-----
>>>> >>> From: Hefty, Sean [mailto:sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org]
>>>> >>> Sent: Thursday, July 17, 2014 2:50 PM
>>>> >>> To: Steve Wise; 'Shirley Ma'; 'Devesh Sharma'; 'Roland Dreier'
>>>> >>> Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org
>>>> >>> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
>>>> >>> 
>>>>>> >>>>> So the rdma cm is expected to increase the driver reference count
>>>>> >>>> (try_module_get) for
>>>>>> >>>>> each new cm id, then deference count (module_put) when cm id is
>>>>> >>>> destroyed?
>>>>>> >>>>> 
>>>>> >>>> 
>>>>> >>>> No, I think he's saying the rdma-cm posts a RDMA_CM_DEVICE_REMOVAL event
>>>>> >>>> to each
>>>>> >>>> application with rdmacm objects allocated, and each application is expected
>>>>> >>>> to destroy all
>>>>> >>>> the objects it has allocated before returning from the event handler.
>>>> >>> 
>>>> >>> This is almost correct.  The applications do not have to destroy all the objects that
>> > it has
>>>> >>> allocated before returning from their event handler.  E.g. an app can queue a work
>> > item
>>>> >>> that does the destruction.  The rdmacm will block in its ib_client remove handler
>> > until all
>>>> >>> relevant rdma_cm_id's have been destroyed.
>>>> >>> 
>>> >> 
>>> >> Thanks for the clarification.
>>> >> 
>> > 
>> > And looking at xprtrdma, it does handle the DEVICE_REMOVAL event in rpcrdma_conn_upcall().
>> > It sets ep->rep_connected to -ENODEV, wakes everybody up, and calls rpcrdma_conn_func()
>> > for that endpoint, which schedules rep_connect_worker...  and I gave up following the code
>> > path at this point... :)  
>> > 
>> > For this to all work correctly, it would need to destroy all the QPs, MRs, CQs, etc for
>> > that device _before_ destroying the rdma cm ids.  Otherwise the provider module could be
>> > unloaded too soon…
> We can’t really deal with a CM_DEVICE_REMOVE event while there are active
> NFS mounts.
> 
> System shutdown ordering should guarantee (one would hope) that NFS
> mount points are unmounted before the RDMA/IB core infrastructure is
> torn down. Ordering shouldn’t matter as long all NFS activity has
> ceased before the CM tries to remove the device.
> 
> So if something is hanging up the CM, there’s something xprtrdma is not
> cleaning up properly.

I saw a problem once, restart the system without umounting the NFS. CM was hung on waiting for completion. It looks like a  bug in xprtrdma cleanup up. I couldn't reproduce it.

Call Trace:
 [<ffffffff815c9aa9>] schedule+0x29/0x70
 [<ffffffff815c8d35>] schedule_timeout+0x165/0x200
 [<ffffffff815ca9ff>] ? wait_for_completion+0xcf/0x110
 [<ffffffff810a708e>] ? __lock_release+0x9e/0x1f0
 [<ffffffff815ca9ff>] ? wait_for_completion+0xcf/0x110
 [<ffffffff815caa07>] wait_for_completion+0xd7/0x110
 [<ffffffff8108bce0>] ? try_to_wake_up+0x260/0x260
 [<ffffffffa064cb6e>] cma_process_remove+0xee/0x110 [rdma_cm]
 [<ffffffffa064cbdc>] cma_remove_one+0x4c/0x60 [rdma_cm]
 [<ffffffffa0279e0f>] ib_unregister_device+0x4f/0x100 [ib_core]
 [<ffffffffa02f76ee>] mlx4_ib_remove+0x2e/0x260 [mlx4_ib]
 [<ffffffffa01754c9>] mlx4_remove_device+0x69/0x80 [mlx4_core]
 [<ffffffffa01755b3>] mlx4_unregister_interface+0x43/0x80 [mlx4_core]
 [<ffffffffa030970c>] mlx4_ib_cleanup+0x10/0x23 [mlx4_ib]
 [<ffffffff810d9183>] SyS_delete_module+0x183/0x1e0
 [<ffffffff810f7c94>] ? __audit_syscall_entry+0x94/0x100
 [<ffffffff812c5789>] ? lockdep_sys_exit_thunk+0x35/0x67
 [<ffffffff815cec92>] system_call_fastpath+0x16/0x1b


Shirley
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
  2014-07-17 20:59                             ` Steve Wise
@ 2014-07-18  5:05                               ` Devesh Sharma
       [not found]                                 ` <EE7902D3F51F404C82415C4803930ACD3FE1482F-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Devesh Sharma @ 2014-07-18  5:05 UTC (permalink / raw)
  To: Steve Wise, 'Chuck Lever'
  Cc: 'Hefty, Sean', 'Shirley Ma',
	'Roland Dreier',
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

> -----Original Message-----
> From: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org [mailto:linux-rdma-
> owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On Behalf Of Steve Wise
> Sent: Friday, July 18, 2014 2:29 AM
> To: 'Chuck Lever'
> Cc: 'Hefty, Sean'; 'Shirley Ma'; Devesh Sharma; 'Roland Dreier'; linux-
> rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma provider
> module
> 
> 
> 
> > -----Original Message-----
> > From: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > [mailto:linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On Behalf Of Chuck Lever
> > Sent: Thursday, July 17, 2014 3:42 PM
> > To: Steve Wise
> > Cc: Hefty, Sean; Shirley Ma; Devesh Sharma; Roland Dreier;
> > linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > Subject: Re: [for-next 1/2] xprtrdma: take reference of rdma provider
> > module
> >
> >
> > On Jul 17, 2014, at 4:08 PM, Steve Wise <swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
> wrote:
> >
> > >
> > >
> > >> -----Original Message-----
> > >> From: Steve Wise [mailto:swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org]
> > >> Sent: Thursday, July 17, 2014 2:56 PM
> > >> To: 'Hefty, Sean'; 'Shirley Ma'; 'Devesh Sharma'; 'Roland Dreier'
> > >> Cc: 'linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org'; 'chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org'
> > >> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma
> > >> provider module
> > >>
> > >>
> > >>
> > >>> -----Original Message-----
> > >>> From: Hefty, Sean [mailto:sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org]
> > >>> Sent: Thursday, July 17, 2014 2:50 PM
> > >>> To: Steve Wise; 'Shirley Ma'; 'Devesh Sharma'; 'Roland Dreier'
> > >>> Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org
> > >>> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma
> > >>> provider module
> > >>>
> > >>>>> So the rdma cm is expected to increase the driver reference
> > >>>>> count
> > >>>> (try_module_get) for
> > >>>>> each new cm id, then deference count (module_put) when cm id is
> > >>>> destroyed?
> > >>>>>
> > >>>>
> > >>>> No, I think he's saying the rdma-cm posts a
> > >>>> RDMA_CM_DEVICE_REMOVAL event to each application with
> rdmacm
> > >>>> objects allocated, and each application is expected to destroy
> > >>>> all the objects it has allocated before returning from the event
> > >>>> handler.
> > >>>
> > >>> This is almost correct.  The applications do not have to destroy
> > >>> all the objects
> that
> > > it has
> > >>> allocated before returning from their event handler.  E.g. an app
> > >>> can queue a work
> > > item
> > >>> that does the destruction.  The rdmacm will block in its ib_client
> > >>> remove handler
> > > until all
> > >>> relevant rdma_cm_id's have been destroyed.
> > >>>
> > >>
> > >> Thanks for the clarification.
> > >>
> > >
> > > And looking at xprtrdma, it does handle the DEVICE_REMOVAL event in
> > rpcrdma_conn_upcall().
> > > It sets ep->rep_connected to -ENODEV, wakes everybody up, and calls
> > rpcrdma_conn_func()
> > > for that endpoint, which schedules rep_connect_worker...  and I gave
> > > up following the
> > code
> > > path at this point... :)
> > >
> > > For this to all work correctly, it would need to destroy all the
> > > QPs, MRs, CQs, etc
> for
> > > that device _before_ destroying the rdma cm ids.  Otherwise the
> > > provider module could
> > be
> > > unloaded too soon.
> >
> > We can't really deal with a CM_DEVICE_REMOVE event while there are
> > active NFS mounts.
> >
> > System shutdown ordering should guarantee (one would hope) that NFS
> > mount points are unmounted before the RDMA/IB core infrastructure is
> > torn down. Ordering shouldn't matter as long all NFS activity has
> > ceased before the CM tries to remove the device.
> >
> > So if something is hanging up the CM, there's something xprtrdma is
> > not cleaning up properly.
> >
> 
> 
> Devesh, how are you reproducing this?  Are you just rmmod'ing the ocrdma
> module while there are active mounts?

Yes, I am issuing rmmod while there is an active mount. In my case rmmod ocrdma remains blocked forever.
Off-the-course of this discussion: Is there a reasoning behind not using ib_register_client()/ib_unregister_client() framework?
> 
> 
> 
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the
> body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at
> http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
  2014-07-17 20:08                       ` Steve Wise
  2014-07-17 20:41                         ` Chuck Lever
@ 2014-07-18  6:19                         ` Devesh Sharma
       [not found]                           ` <EE7902D3F51F404C82415C4803930ACD3FE1686F-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
  1 sibling, 1 reply; 38+ messages in thread
From: Devesh Sharma @ 2014-07-18  6:19 UTC (permalink / raw)
  To: Steve Wise, 'Hefty, Sean', 'Shirley Ma',
	'Roland Dreier'
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org

> -----Original Message-----
> From: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org [mailto:linux-rdma-
> owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On Behalf Of Steve Wise
> Sent: Friday, July 18, 2014 1:39 AM
> To: 'Hefty, Sean'; 'Shirley Ma'; Devesh Sharma; 'Roland Dreier'
> Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org
> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma provider
> module
> 
> 
> 
> > -----Original Message-----
> > From: Steve Wise [mailto:swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org]
> > Sent: Thursday, July 17, 2014 2:56 PM
> > To: 'Hefty, Sean'; 'Shirley Ma'; 'Devesh Sharma'; 'Roland Dreier'
> > Cc: 'linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org'; 'chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org'
> > Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma provider
> > module
> >
> >
> >
> > > -----Original Message-----
> > > From: Hefty, Sean [mailto:sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org]
> > > Sent: Thursday, July 17, 2014 2:50 PM
> > > To: Steve Wise; 'Shirley Ma'; 'Devesh Sharma'; 'Roland Dreier'
> > > Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org
> > > Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma
> > > provider module
> > >
> > > > > So the rdma cm is expected to increase the driver reference
> > > > > count
> > > > (try_module_get) for
> > > > > each new cm id, then deference count (module_put) when cm id is
> > > > destroyed?
> > > > >
> > > >
> > > > No, I think he's saying the rdma-cm posts a
> RDMA_CM_DEVICE_REMOVAL
> > > > event to each application with rdmacm objects allocated, and each
> > > > application is expected to destroy all the objects it has
> > > > allocated before returning from the event handler.
> > >
> > > This is almost correct.  The applications do not have to destroy all
> > > the objects that
> it has
> > > allocated before returning from their event handler.  E.g. an app
> > > can queue a work
> item
> > > that does the destruction.  The rdmacm will block in its ib_client
> > > remove handler
> until all
> > > relevant rdma_cm_id's have been destroyed.
> > >
> >
> > Thanks for the clarification.
> >
> 
> And looking at xprtrdma, it does handle the DEVICE_REMOVAL event in
> rpcrdma_conn_upcall().
> It sets ep->rep_connected to -ENODEV, wakes everybody up, and calls
> rpcrdma_conn_func() for that endpoint, which schedules
> rep_connect_worker...  and I gave up following the code path at this point...
> :)
> 
> For this to all work correctly, it would need to destroy all the QPs, MRs, CQs,
> etc for that device _before_ destroying the rdma cm ids.  Otherwise the
> provider module could be unloaded too soon...

Okay, Should I try to handle device removal in this proposed fashion and post the v1.

> 
> Steve.
> 
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the
> body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at
> http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]                                 ` <EE7902D3F51F404C82415C4803930ACD3FE1482F-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
@ 2014-07-18 13:27                                   ` Steve Wise
  2014-07-18 15:47                                     ` Shirley Ma
  2014-07-21  5:23                                     ` Devesh Sharma
  0 siblings, 2 replies; 38+ messages in thread
From: Steve Wise @ 2014-07-18 13:27 UTC (permalink / raw)
  To: 'Devesh Sharma', 'Chuck Lever'
  Cc: 'Hefty, Sean', 'Shirley Ma',
	'Roland Dreier', linux-rdma-u79uwXL29TY76Z2rM5mHXA

> > > We can't really deal with a CM_DEVICE_REMOVE event while there are
> > > active NFS mounts.
> > >
> > > System shutdown ordering should guarantee (one would hope) that NFS
> > > mount points are unmounted before the RDMA/IB core infrastructure is
> > > torn down. Ordering shouldn't matter as long all NFS activity has
> > > ceased before the CM tries to remove the device.
> > >
> > > So if something is hanging up the CM, there's something xprtrdma is
> > > not cleaning up properly.
> > >
> >
> >
> > Devesh, how are you reproducing this?  Are you just rmmod'ing the ocrdma
> > module while there are active mounts?
> 
> Yes, I am issuing rmmod while there is an active mount. In my case rmmod ocrdma remains
> blocked forever.
> Off-the-course of this discussion: Is there a reasoning behind not using
> ib_register_client()/ib_unregister_client() framework?

I think the idea is that you don't need to use it if you are transport-independent and use
the rdmacm...



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]                           ` <EE7902D3F51F404C82415C4803930ACD3FE1686F-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
@ 2014-07-18 15:27                             ` Chuck Lever
       [not found]                               ` <D9783B2E-8D18-442E-9BFE-0863F9DD6B96-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Chuck Lever @ 2014-07-18 15:27 UTC (permalink / raw)
  To: Devesh Sharma
  Cc: Steve Wise, Hefty, Sean, Shirley Ma, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org


On Jul 18, 2014, at 2:19 AM, Devesh Sharma <Devesh.Sharma-iH1Dq9VlAzfQT0dZR+AlfA@public.gmane.org> wrote:

>> -----Original Message-----
>> From: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org [mailto:linux-rdma-
>> owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On Behalf Of Steve Wise
>> Sent: Friday, July 18, 2014 1:39 AM
>> To: 'Hefty, Sean'; 'Shirley Ma'; Devesh Sharma; 'Roland Dreier'
>> Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org
>> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma provider
>> module
>> 
>> 
>> 
>>> -----Original Message-----
>>> From: Steve Wise [mailto:swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org]
>>> Sent: Thursday, July 17, 2014 2:56 PM
>>> To: 'Hefty, Sean'; 'Shirley Ma'; 'Devesh Sharma'; 'Roland Dreier'
>>> Cc: 'linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org'; 'chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org'
>>> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma provider
>>> module
>>> 
>>> 
>>> 
>>>> -----Original Message-----
>>>> From: Hefty, Sean [mailto:sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org]
>>>> Sent: Thursday, July 17, 2014 2:50 PM
>>>> To: Steve Wise; 'Shirley Ma'; 'Devesh Sharma'; 'Roland Dreier'
>>>> Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org
>>>> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma
>>>> provider module
>>>> 
>>>>>> So the rdma cm is expected to increase the driver reference
>>>>>> count
>>>>> (try_module_get) for
>>>>>> each new cm id, then deference count (module_put) when cm id is
>>>>> destroyed?
>>>>>> 
>>>>> 
>>>>> No, I think he's saying the rdma-cm posts a
>> RDMA_CM_DEVICE_REMOVAL
>>>>> event to each application with rdmacm objects allocated, and each
>>>>> application is expected to destroy all the objects it has
>>>>> allocated before returning from the event handler.
>>>> 
>>>> This is almost correct.  The applications do not have to destroy all
>>>> the objects that
>> it has
>>>> allocated before returning from their event handler.  E.g. an app
>>>> can queue a work
>> item
>>>> that does the destruction.  The rdmacm will block in its ib_client
>>>> remove handler
>> until all
>>>> relevant rdma_cm_id's have been destroyed.
>>>> 
>>> 
>>> Thanks for the clarification.
>>> 
>> 
>> And looking at xprtrdma, it does handle the DEVICE_REMOVAL event in
>> rpcrdma_conn_upcall().
>> It sets ep->rep_connected to -ENODEV, wakes everybody up, and calls
>> rpcrdma_conn_func() for that endpoint, which schedules
>> rep_connect_worker...  and I gave up following the code path at this point...
>> :)
>> 
>> For this to all work correctly, it would need to destroy all the QPs, MRs, CQs,
>> etc for that device _before_ destroying the rdma cm ids.  Otherwise the
>> provider module could be unloaded too soon...
> 
> Okay, Should I try to handle device removal in this proposed fashion and post the v1.

Hi Devesh,

To make it work, xprtrdma is going to have to allow the device to be
removed and added back while there are active NFS mounts and pending RPCs.
AFAICT the code is not structured to do that today.

Probably the place to start is to see how much work is needed to leverage
the existing logic to watch for ENODEV and do the right things to suspend
RPC activity until another device is inserted. It would have to work like
a network partition that causes a transport reconnect.

However, replacing everything, including all MRs and the PD, will require
significant code churn and additional (undesirable) serialization around
the use of QPs and cm_ids. Thus I would like to understand how much of a
priority this is.

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [for-next 1/2] xprtrdma: take reference of rdma provider module
  2014-07-18 13:27                                   ` Steve Wise
@ 2014-07-18 15:47                                     ` Shirley Ma
       [not found]                                       ` <53C94199.4050601-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
  2014-07-21  5:23                                     ` Devesh Sharma
  1 sibling, 1 reply; 38+ messages in thread
From: Shirley Ma @ 2014-07-18 15:47 UTC (permalink / raw)
  To: Steve Wise, 'Devesh Sharma', 'Chuck Lever'
  Cc: 'Hefty, Sean', 'Roland Dreier',
	linux-rdma-u79uwXL29TY76Z2rM5mHXA


On 07/18/2014 06:27 AM, Steve Wise wrote:
>>>> We can't really deal with a CM_DEVICE_REMOVE event while there are
>>>> active NFS mounts.
>>>>
>>>> System shutdown ordering should guarantee (one would hope) that NFS
>>>> mount points are unmounted before the RDMA/IB core infrastructure is
>>>> torn down. Ordering shouldn't matter as long all NFS activity has
>>>> ceased before the CM tries to remove the device.
>>>>
>>>> So if something is hanging up the CM, there's something xprtrdma is
>>>> not cleaning up properly.
>>>>
>>>
>>>
>>> Devesh, how are you reproducing this?  Are you just rmmod'ing the ocrdma
>>> module while there are active mounts?
>>
>> Yes, I am issuing rmmod while there is an active mount. In my case rmmod ocrdma remains
>> blocked forever.
Where is it blocked?

>> Off-the-course of this discussion: Is there a reasoning behind not using
>> ib_register_client()/ib_unregister_client() framework?
> 
> I think the idea is that you don't need to use it if you are transport-independent and use
> the rdmacm...
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
  2014-07-18 13:27                                   ` Steve Wise
  2014-07-18 15:47                                     ` Shirley Ma
@ 2014-07-21  5:23                                     ` Devesh Sharma
  1 sibling, 0 replies; 38+ messages in thread
From: Devesh Sharma @ 2014-07-21  5:23 UTC (permalink / raw)
  To: Steve Wise, 'Chuck Lever'
  Cc: 'Hefty, Sean', 'Shirley Ma',
	'Roland Dreier',
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

> -----Original Message-----
> From: Steve Wise [mailto:swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org]
> Sent: Friday, July 18, 2014 6:58 PM
> To: Devesh Sharma; 'Chuck Lever'
> Cc: 'Hefty, Sean'; 'Shirley Ma'; 'Roland Dreier'; linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma provider
> module
> 
> > > > We can't really deal with a CM_DEVICE_REMOVE event while there are
> > > > active NFS mounts.
> > > >
> > > > System shutdown ordering should guarantee (one would hope) that
> > > > NFS mount points are unmounted before the RDMA/IB core
> > > > infrastructure is torn down. Ordering shouldn't matter as long all
> > > > NFS activity has ceased before the CM tries to remove the device.
> > > >
> > > > So if something is hanging up the CM, there's something xprtrdma
> > > > is not cleaning up properly.
> > > >
> > >
> > >
> > > Devesh, how are you reproducing this?  Are you just rmmod'ing the
> > > ocrdma module while there are active mounts?
> >
> > Yes, I am issuing rmmod while there is an active mount. In my case
> > rmmod ocrdma remains blocked forever.
> > Off-the-course of this discussion: Is there a reasoning behind not
> > using
> > ib_register_client()/ib_unregister_client() framework?
> 
> I think the idea is that you don't need to use it if you are transport-
> independent and use the rdmacm...

Okay, Makes sense..

> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]                               ` <D9783B2E-8D18-442E-9BFE-0863F9DD6B96-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
@ 2014-07-21  5:40                                 ` Devesh Sharma
  0 siblings, 0 replies; 38+ messages in thread
From: Devesh Sharma @ 2014-07-21  5:40 UTC (permalink / raw)
  To: Chuck Lever
  Cc: Steve Wise, Hefty, Sean, Shirley Ma, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

> -----Original Message-----
> From: Chuck Lever [mailto:chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org]
> Sent: Friday, July 18, 2014 8:57 PM
> To: Devesh Sharma
> Cc: Steve Wise; Hefty, Sean; Shirley Ma; Roland Dreier; linux-
> rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Subject: Re: [for-next 1/2] xprtrdma: take reference of rdma provider
> module
> 
> 
> On Jul 18, 2014, at 2:19 AM, Devesh Sharma <Devesh.Sharma-iH1Dq9VlAzfQT0dZR+AlfA@public.gmane.org>
> wrote:
> 
> >> -----Original Message-----
> >> From: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org [mailto:linux-rdma-
> >> owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On Behalf Of Steve Wise
> >> Sent: Friday, July 18, 2014 1:39 AM
> >> To: 'Hefty, Sean'; 'Shirley Ma'; Devesh Sharma; 'Roland Dreier'
> >> Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org
> >> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma provider
> >> module
> >>
> >>
> >>
> >>> -----Original Message-----
> >>> From: Steve Wise [mailto:swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org]
> >>> Sent: Thursday, July 17, 2014 2:56 PM
> >>> To: 'Hefty, Sean'; 'Shirley Ma'; 'Devesh Sharma'; 'Roland Dreier'
> >>> Cc: 'linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org'; 'chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org'
> >>> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma
> >>> provider module
> >>>
> >>>
> >>>
> >>>> -----Original Message-----
> >>>> From: Hefty, Sean [mailto:sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org]
> >>>> Sent: Thursday, July 17, 2014 2:50 PM
> >>>> To: Steve Wise; 'Shirley Ma'; 'Devesh Sharma'; 'Roland Dreier'
> >>>> Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org
> >>>> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma
> >>>> provider module
> >>>>
> >>>>>> So the rdma cm is expected to increase the driver reference count
> >>>>> (try_module_get) for
> >>>>>> each new cm id, then deference count (module_put) when cm id is
> >>>>> destroyed?
> >>>>>>
> >>>>>
> >>>>> No, I think he's saying the rdma-cm posts a
> >> RDMA_CM_DEVICE_REMOVAL
> >>>>> event to each application with rdmacm objects allocated, and each
> >>>>> application is expected to destroy all the objects it has
> >>>>> allocated before returning from the event handler.
> >>>>
> >>>> This is almost correct.  The applications do not have to destroy
> >>>> all the objects that
> >> it has
> >>>> allocated before returning from their event handler.  E.g. an app
> >>>> can queue a work
> >> item
> >>>> that does the destruction.  The rdmacm will block in its ib_client
> >>>> remove handler
> >> until all
> >>>> relevant rdma_cm_id's have been destroyed.
> >>>>
> >>>
> >>> Thanks for the clarification.
> >>>
> >>
> >> And looking at xprtrdma, it does handle the DEVICE_REMOVAL event in
> >> rpcrdma_conn_upcall().
> >> It sets ep->rep_connected to -ENODEV, wakes everybody up, and calls
> >> rpcrdma_conn_func() for that endpoint, which schedules
> >> rep_connect_worker...  and I gave up following the code path at this
> point...
> >> :)
> >>
> >> For this to all work correctly, it would need to destroy all the QPs,
> >> MRs, CQs, etc for that device _before_ destroying the rdma cm ids.
> >> Otherwise the provider module could be unloaded too soon...
> >
> > Okay, Should I try to handle device removal in this proposed fashion and
> post the v1.
> 
> Hi Devesh,
> 
> To make it work, xprtrdma is going to have to allow the device to be removed
> and added back while there are active NFS mounts and pending RPCs.
> AFAICT the code is not structured to do that today.
> 
> Probably the place to start is to see how much work is needed to leverage
> the existing logic to watch for ENODEV and do the right things to suspend
> RPC activity until another device is inserted. It would have to work like a
> network partition that causes a transport reconnect.
> 
> However, replacing everything, including all MRs and the PD, will require
> significant code churn and additional (undesirable) serialization around the
> use of QPs and cm_ids. Thus I would like to understand how much of a
> priority this is.

Sure, I got your point, this is good for long term stability that we understand
and do the right thing. However if it's not feasible to change the entire infrastructure
in one go OR in the near future, do we have other ideas/options to handle this problem

In the real world situations it's quite possible that someone attempts to reload/replace/unload the 
vendor driver without knowing if there is an active mount or not.

> 
> --
> Chuck Lever
> chuck[dot]lever[at]oracle[dot]com
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]                                       ` <53C94199.4050601-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
@ 2014-07-21  6:11                                         ` Devesh Sharma
       [not found]                                           ` <EE7902D3F51F404C82415C4803930ACD3FE1C7B7-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
       [not found]                                           ` <a6345162-863d -447c-b7c2-059ced190a13@CMEXHTCAS1.ad.emulex.com>
  0 siblings, 2 replies; 38+ messages in thread
From: Devesh Sharma @ 2014-07-21  6:11 UTC (permalink / raw)
  To: Shirley Ma, Steve Wise, 'Chuck Lever'
  Cc: 'Hefty, Sean', 'Roland Dreier',
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

Shirley,

Once rmmod is issued, the connection corresponding to the active mount is destroyed and all the associated resources 
Are freed. As per the processing logic of DEVICE-REMOVAL event, nfs-rdma wakes-up all the  waiters, This results into
Re-establishment efforts, since the device is not present any more, rdma_resolve_address() fails with CM resolution
Error. This loop continues forever.

I am yet to find out which part of ocrdma is blocked. I am putting some debug messages to find it out. I will get back to 
The group with an update.

-Regards
 Devesh

> -----Original Message-----
> From: Shirley Ma [mailto:shirley.ma-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org]
> Sent: Friday, July 18, 2014 9:18 PM
> To: Steve Wise; Devesh Sharma; 'Chuck Lever'
> Cc: 'Hefty, Sean'; 'Roland Dreier'; linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Subject: Re: [for-next 1/2] xprtrdma: take reference of rdma provider
> module
> 
> 
> On 07/18/2014 06:27 AM, Steve Wise wrote:
> >>>> We can't really deal with a CM_DEVICE_REMOVE event while there are
> >>>> active NFS mounts.
> >>>>
> >>>> System shutdown ordering should guarantee (one would hope) that
> NFS
> >>>> mount points are unmounted before the RDMA/IB core infrastructure
> >>>> is torn down. Ordering shouldn't matter as long all NFS activity
> >>>> has ceased before the CM tries to remove the device.
> >>>>
> >>>> So if something is hanging up the CM, there's something xprtrdma is
> >>>> not cleaning up properly.
> >>>>
> >>>
> >>>
> >>> Devesh, how are you reproducing this?  Are you just rmmod'ing the
> >>> ocrdma module while there are active mounts?
> >>
> >> Yes, I am issuing rmmod while there is an active mount. In my case
> >> rmmod ocrdma remains blocked forever.
> Where is it blocked?
> 
> >> Off-the-course of this discussion: Is there a reasoning behind not
> >> using
> >> ib_register_client()/ib_unregister_client() framework?
> >
> > I think the idea is that you don't need to use it if you are
> > transport-independent and use the rdmacm...
> >
> >
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-rdma"
> > in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More
> majordomo
> > info at  http://vger.kernel.org/majordomo-info.html
> >
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]                                           ` <EE7902D3F51F404C82415C4803930ACD3FE1C7B7-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
@ 2014-07-21 11:48                                             ` Devesh Sharma
  0 siblings, 0 replies; 38+ messages in thread
From: Devesh Sharma @ 2014-07-21 11:48 UTC (permalink / raw)
  To: Devesh Sharma, Shirley Ma, Steve Wise, 'Chuck Lever'
  Cc: 'Hefty, Sean', 'Roland Dreier',
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

In rpcrdma_ep_connect():

write_lock(&ia->ri_qplock);
                old = ia->ri_id;
                ia->ri_id = id;
                write_unlock(&ia->ri_qplock);

                rdma_destroy_qp(old);
                rdma_destroy_id(old);  =============> Cm -id is destroyed here.


If following code fails in rpcrdma_ep_connect():
id = rpcrdma_create_id(xprt, ia,
                                (struct sockaddr *)&xprt->rx_data.addr);
                if (IS_ERR(id)) {
                        rc = -EHOSTUNREACH;
                        goto out;
                }

it leaves old cm-id still alive. This will always fail if Device is removed abruptly.

In rdma_resolve_addr()/rdma_destroy_id() cm_dev is referenced/de-referenced here (cma.c):

static int cma_acquire_dev(struct rdma_id_private *id_priv,
                           struct rdma_id_private *listen_id_priv) {
.
.
if (!ret)
                cma_attach_to_dev(id_priv, cma_dev);
}

static void cma_release_dev(struct rdma_id_private *id_priv)
{
        mutex_lock(&lock);
        list_del(&id_priv->list);
        cma_deref_dev(id_priv->cma_dev);
.
.
}

Since as per design of nfs-rdma at-least previously known good cm-id always remains live utill
another good cm-id is created, cma_dev->refcount never becomes 0 upon device removal .
Thus blocking the rmmod <vendor driver> forever.

-Regards
 Devesh

> -----Original Message-----
> From: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org [mailto:linux-rdma-
> owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On Behalf Of Devesh Sharma
> Sent: Monday, July 21, 2014 11:42 AM
> To: Shirley Ma; Steve Wise; 'Chuck Lever'
> Cc: 'Hefty, Sean'; 'Roland Dreier'; linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma provider
> module
> 
> Shirley,
> 
> Once rmmod is issued, the connection corresponding to the active mount is
> destroyed and all the associated resources Are freed. As per the processing
> logic of DEVICE-REMOVAL event, nfs-rdma wakes-up all the  waiters, This
> results into Re-establishment efforts, since the device is not present any
> more, rdma_resolve_address() fails with CM resolution Error. This loop
> continues forever.
> 
> I am yet to find out which part of ocrdma is blocked. I am putting some debug
> messages to find it out. I will get back to The group with an update.
> 
> -Regards
>  Devesh
> 
> > -----Original Message-----
> > From: Shirley Ma [mailto:shirley.ma-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org]
> > Sent: Friday, July 18, 2014 9:18 PM
> > To: Steve Wise; Devesh Sharma; 'Chuck Lever'
> > Cc: 'Hefty, Sean'; 'Roland Dreier'; linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > Subject: Re: [for-next 1/2] xprtrdma: take reference of rdma provider
> > module
> >
> >
> > On 07/18/2014 06:27 AM, Steve Wise wrote:
> > >>>> We can't really deal with a CM_DEVICE_REMOVE event while there
> > >>>> are active NFS mounts.
> > >>>>
> > >>>> System shutdown ordering should guarantee (one would hope) that
> > NFS
> > >>>> mount points are unmounted before the RDMA/IB core
> infrastructure
> > >>>> is torn down. Ordering shouldn't matter as long all NFS activity
> > >>>> has ceased before the CM tries to remove the device.
> > >>>>
> > >>>> So if something is hanging up the CM, there's something xprtrdma
> > >>>> is not cleaning up properly.
> > >>>>
> > >>>
> > >>>
> > >>> Devesh, how are you reproducing this?  Are you just rmmod'ing the
> > >>> ocrdma module while there are active mounts?
> > >>
> > >> Yes, I am issuing rmmod while there is an active mount. In my case
> > >> rmmod ocrdma remains blocked forever.
> > Where is it blocked?
> >
> > >> Off-the-course of this discussion: Is there a reasoning behind not
> > >> using
> > >> ib_register_client()/ib_unregister_client() framework?
> > >
> > > I think the idea is that you don't need to use it if you are
> > > transport-independent and use the rdmacm...
> > >
> > >
> > >
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-rdma"
> > > in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More
> > majordomo
> > > info at  http://vger.kernel.org/majordomo-info.html
> > >
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the
> body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at
> http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]                                             ` <a6345162-863d-447c-b7c2-059ced190a13-3RiH6ntJJkP8BX6JNMqfyFjyZtpTMMwT@public.gmane.org>
@ 2014-07-21 14:53                                               ` Chuck Lever
       [not found]                                                 ` <27ACE237-161A-4CA5-AA5C-6349CC4118E3-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Chuck Lever @ 2014-07-21 14:53 UTC (permalink / raw)
  To: Devesh Sharma
  Cc: Shirley Ma, Steve Wise, Hefty, Sean, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

Hi Devesh-

Thanks for drilling into this further.

On Jul 21, 2014, at 7:48 AM, Devesh Sharma <Devesh.Sharma-iH1Dq9VlAzfQT0dZR+AlfA@public.gmane.org> wrote:

> In rpcrdma_ep_connect():
> 
> write_lock(&ia->ri_qplock);
>                old = ia->ri_id;
>                ia->ri_id = id;
>                write_unlock(&ia->ri_qplock);
> 
>                rdma_destroy_qp(old);
>                rdma_destroy_id(old);  =============> Cm -id is destroyed here.
> 
> 
> If following code fails in rpcrdma_ep_connect():
> id = rpcrdma_create_id(xprt, ia,
>                                (struct sockaddr *)&xprt->rx_data.addr);
>                if (IS_ERR(id)) {
>                        rc = -EHOSTUNREACH;
>                        goto out;
>                }
> 
> it leaves old cm-id still alive. This will always fail if Device is removed abruptly.

For CM_EVENT_DEVICE_REMOVAL, rpcrdma_conn_upcall() sets ep->rep_connected
to -ENODEV.

Then:

 929 int
 930 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 931 {
 932         struct rdma_cm_id *id, *old;
 933         int rc = 0;
 934         int retry_count = 0;
 935 
 936         if (ep->rep_connected != 0) {
 937                 struct rpcrdma_xprt *xprt;
 938 retry:
 939                 dprintk("RPC:       %s: reconnecting...\n", __func__);

ep->rep_connected is probably -ENODEV after a device removal. It would be
possible for the connect worker to destroy everything associated with this
connection in that case to ensure the underlying object reference counts
are cleared.

The immediate danger is that if there are pending RPCs, they could exit while
qp/cm_id are NULL, triggering a panic in rpcrdma_deregister_frmr_external().
Checking for NULL pointers inside the ri_qplock would prevent that.

However, NFS mounts via this adapter will hang indefinitely after all
transports are torn down and the adapter is gone. The only thing that can be
done is something drastic like “echo b > /proc/sysrq_trigger” on the client.

Thus, IMO hot-plugging or passive fail-over are the only scenarios where
this makes sense. If we have an immediate problem here, is it a problem with
system shutdown ordering that can be addressed in some other way?

Until that support is in place, obviously I would prefer that the removal of
the underlying driver be prevented while there are NFS mounts in place. I
think that’s what NFS users have come to expect.

In other words, don’t allow device removal until we have support for device
insertion :-)


> In rdma_resolve_addr()/rdma_destroy_id() cm_dev is referenced/de-referenced here (cma.c):
> 
> static int cma_acquire_dev(struct rdma_id_private *id_priv,
>                           struct rdma_id_private *listen_id_priv) {
> .
> .
> if (!ret)
>                cma_attach_to_dev(id_priv, cma_dev);
> }
> 
> static void cma_release_dev(struct rdma_id_private *id_priv)
> {
>        mutex_lock(&lock);
>        list_del(&id_priv->list);
>        cma_deref_dev(id_priv->cma_dev);
> .
> .
> }
> 
> Since as per design of nfs-rdma at-least previously known good cm-id always remains live utill
> another good cm-id is created, cma_dev->refcount never becomes 0 upon device removal .
> Thus blocking the rmmod <vendor driver> forever.
> 
> -Regards
> Devesh
> 
>> -----Original Message-----
>> From: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org [mailto:linux-rdma-
>> owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On Behalf Of Devesh Sharma
>> Sent: Monday, July 21, 2014 11:42 AM
>> To: Shirley Ma; Steve Wise; 'Chuck Lever'
>> Cc: 'Hefty, Sean'; 'Roland Dreier'; linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma provider
>> module
>> 
>> Shirley,
>> 
>> Once rmmod is issued, the connection corresponding to the active mount is
>> destroyed and all the associated resources Are freed. As per the processing
>> logic of DEVICE-REMOVAL event, nfs-rdma wakes-up all the  waiters, This
>> results into Re-establishment efforts, since the device is not present any
>> more, rdma_resolve_address() fails with CM resolution Error. This loop
>> continues forever.
>> 
>> I am yet to find out which part of ocrdma is blocked. I am putting some debug
>> messages to find it out. I will get back to The group with an update.
>> 
>> -Regards
>> Devesh
>> 
>>> -----Original Message-----
>>> From: Shirley Ma [mailto:shirley.ma-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org]
>>> Sent: Friday, July 18, 2014 9:18 PM
>>> To: Steve Wise; Devesh Sharma; 'Chuck Lever'
>>> Cc: 'Hefty, Sean'; 'Roland Dreier'; linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>>> Subject: Re: [for-next 1/2] xprtrdma: take reference of rdma provider
>>> module
>>> 
>>> 
>>> On 07/18/2014 06:27 AM, Steve Wise wrote:
>>>>>>> We can't really deal with a CM_DEVICE_REMOVE event while there
>>>>>>> are active NFS mounts.
>>>>>>> 
>>>>>>> System shutdown ordering should guarantee (one would hope) that
>>> NFS
>>>>>>> mount points are unmounted before the RDMA/IB core
>> infrastructure
>>>>>>> is torn down. Ordering shouldn't matter as long all NFS activity
>>>>>>> has ceased before the CM tries to remove the device.
>>>>>>> 
>>>>>>> So if something is hanging up the CM, there's something xprtrdma
>>>>>>> is not cleaning up properly.
>>>>>>> 
>>>>>> 
>>>>>> 
>>>>>> Devesh, how are you reproducing this?  Are you just rmmod'ing the
>>>>>> ocrdma module while there are active mounts?
>>>>> 
>>>>> Yes, I am issuing rmmod while there is an active mount. In my case
>>>>> rmmod ocrdma remains blocked forever.
>>> Where is it blocked?
>>> 
>>>>> Off-the-course of this discussion: Is there a reasoning behind not
>>>>> using
>>>>> ib_register_client()/ib_unregister_client() framework?
>>>> 
>>>> I think the idea is that you don't need to use it if you are
>>>> transport-independent and use the rdmacm...
>>>> 
>>>> 
>>>> 
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe linux-rdma"
>>>> in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More
>>> majordomo
>>>> info at  http://vger.kernel.org/majordomo-info.html
>>>> 
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the
>> body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at
>> http://vger.kernel.org/majordomo-info.html

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]                                                 ` <27ACE237-161A-4CA5-AA5C-6349CC4118E3-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
@ 2014-07-21 15:03                                                   ` Steve Wise
  2014-07-21 15:20                                                     ` Chuck Lever
       [not found]                                                     ` <D88D1952-83A1-4FF9-B028-AAE7A859A 5B1@oracle.com>
  0 siblings, 2 replies; 38+ messages in thread
From: Steve Wise @ 2014-07-21 15:03 UTC (permalink / raw)
  To: 'Chuck Lever', 'Devesh Sharma'
  Cc: 'Shirley Ma', 'Hefty, Sean',
	'Roland Dreier', linux-rdma-u79uwXL29TY76Z2rM5mHXA



> -----Original Message-----
> From: Chuck Lever [mailto:chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org]
> Sent: Monday, July 21, 2014 9:54 AM
> To: Devesh Sharma
> Cc: Shirley Ma; Steve Wise; Hefty, Sean; Roland Dreier; linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Subject: Re: [for-next 1/2] xprtrdma: take reference of rdma provider module
> 
> Hi Devesh-
> 
> Thanks for drilling into this further.
> 
> On Jul 21, 2014, at 7:48 AM, Devesh Sharma <Devesh.Sharma-iH1Dq9VlAzfQT0dZR+AlfA@public.gmane.org> wrote:
> 
> > In rpcrdma_ep_connect():
> >
> > write_lock(&ia->ri_qplock);
> >                old = ia->ri_id;
> >                ia->ri_id = id;
> >                write_unlock(&ia->ri_qplock);
> >
> >                rdma_destroy_qp(old);
> >                rdma_destroy_id(old);  =============> Cm -id is destroyed here.
> >
> >
> > If following code fails in rpcrdma_ep_connect():
> > id = rpcrdma_create_id(xprt, ia,
> >                                (struct sockaddr *)&xprt->rx_data.addr);
> >                if (IS_ERR(id)) {
> >                        rc = -EHOSTUNREACH;
> >                        goto out;
> >                }
> >
> > it leaves old cm-id still alive. This will always fail if Device is removed abruptly.
> 
> For CM_EVENT_DEVICE_REMOVAL, rpcrdma_conn_upcall() sets ep->rep_connected
> to -ENODEV.
> 
> Then:
> 
>  929 int
>  930 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
>  931 {
>  932         struct rdma_cm_id *id, *old;
>  933         int rc = 0;
>  934         int retry_count = 0;
>  935
>  936         if (ep->rep_connected != 0) {
>  937                 struct rpcrdma_xprt *xprt;
>  938 retry:
>  939                 dprintk("RPC:       %s: reconnecting...\n", __func__);
> 
> ep->rep_connected is probably -ENODEV after a device removal. It would be
> possible for the connect worker to destroy everything associated with this
> connection in that case to ensure the underlying object reference counts
> are cleared.
> 
> The immediate danger is that if there are pending RPCs, they could exit while
> qp/cm_id are NULL, triggering a panic in rpcrdma_deregister_frmr_external().
> Checking for NULL pointers inside the ri_qplock would prevent that.
> 
> However, NFS mounts via this adapter will hang indefinitely after all
> transports are torn down and the adapter is gone. The only thing that can be
> done is something drastic like "echo b > /proc/sysrq_trigger" on the client.
> 
> Thus, IMO hot-plugging or passive fail-over are the only scenarios where
> this makes sense. If we have an immediate problem here, is it a problem with
> system shutdown ordering that can be addressed in some other way?
> 
> Until that support is in place, obviously I would prefer that the removal of
> the underlying driver be prevented while there are NFS mounts in place. I
> think that's what NFS users have come to expect.
> 
> In other words, don't allow device removal until we have support for device
> insertion :-)
> 
> 


If we fix the above problems on provider unload, shouldn't the mount recover if the
provider module is subsequently loaded?  Or another provider configured such that
rdma_resolve_addr/route() then picks an active device?



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [for-next 1/2] xprtrdma: take reference of rdma provider module
  2014-07-21 15:03                                                   ` Steve Wise
@ 2014-07-21 15:20                                                     ` Chuck Lever
       [not found]                                                     ` <D88D1952-83A1-4FF9-B028-AAE7A859A 5B1@oracle.com>
  1 sibling, 0 replies; 38+ messages in thread
From: Chuck Lever @ 2014-07-21 15:20 UTC (permalink / raw)
  To: Steve Wise
  Cc: Devesh Sharma, Shirley Ma, Hefty, Sean, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA


On Jul 21, 2014, at 11:03 AM, Steve Wise <swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org> wrote:

> 
> 
>> -----Original Message-----
>> From: Chuck Lever [mailto:chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org]
>> Sent: Monday, July 21, 2014 9:54 AM
>> To: Devesh Sharma
>> Cc: Shirley Ma; Steve Wise; Hefty, Sean; Roland Dreier; linux-rdma@vger.kernel.org
>> Subject: Re: [for-next 1/2] xprtrdma: take reference of rdma provider module
>> 
>> Hi Devesh-
>> 
>> Thanks for drilling into this further.
>> 
>> On Jul 21, 2014, at 7:48 AM, Devesh Sharma <Devesh.Sharma-iH1Dq9VlAzfQT0dZR+AlfA@public.gmane.org> wrote:
>> 
>>> In rpcrdma_ep_connect():
>>> 
>>> write_lock(&ia->ri_qplock);
>>>               old = ia->ri_id;
>>>               ia->ri_id = id;
>>>               write_unlock(&ia->ri_qplock);
>>> 
>>>               rdma_destroy_qp(old);
>>>               rdma_destroy_id(old);  =============> Cm -id is destroyed here.
>>> 
>>> 
>>> If following code fails in rpcrdma_ep_connect():
>>> id = rpcrdma_create_id(xprt, ia,
>>>                               (struct sockaddr *)&xprt->rx_data.addr);
>>>               if (IS_ERR(id)) {
>>>                       rc = -EHOSTUNREACH;
>>>                       goto out;
>>>               }
>>> 
>>> it leaves old cm-id still alive. This will always fail if Device is removed abruptly.
>> 
>> For CM_EVENT_DEVICE_REMOVAL, rpcrdma_conn_upcall() sets ep->rep_connected
>> to -ENODEV.
>> 
>> Then:
>> 
>> 929 int
>> 930 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
>> 931 {
>> 932         struct rdma_cm_id *id, *old;
>> 933         int rc = 0;
>> 934         int retry_count = 0;
>> 935
>> 936         if (ep->rep_connected != 0) {
>> 937                 struct rpcrdma_xprt *xprt;
>> 938 retry:
>> 939                 dprintk("RPC:       %s: reconnecting...\n", __func__);
>> 
>> ep->rep_connected is probably -ENODEV after a device removal. It would be
>> possible for the connect worker to destroy everything associated with this
>> connection in that case to ensure the underlying object reference counts
>> are cleared.
>> 
>> The immediate danger is that if there are pending RPCs, they could exit while
>> qp/cm_id are NULL, triggering a panic in rpcrdma_deregister_frmr_external().
>> Checking for NULL pointers inside the ri_qplock would prevent that.
>> 
>> However, NFS mounts via this adapter will hang indefinitely after all
>> transports are torn down and the adapter is gone. The only thing that can be
>> done is something drastic like "echo b > /proc/sysrq_trigger" on the client.
>> 
>> Thus, IMO hot-plugging or passive fail-over are the only scenarios where
>> this makes sense. If we have an immediate problem here, is it a problem with
>> system shutdown ordering that can be addressed in some other way?
>> 
>> Until that support is in place, obviously I would prefer that the removal of
>> the underlying driver be prevented while there are NFS mounts in place. I
>> think that's what NFS users have come to expect.
>> 
>> In other words, don't allow device removal until we have support for device
>> insertion :-)
>> 
>> 
> 
> 
> If we fix the above problems on provider unload, shouldn't the mount recover if the
> provider module is subsequently loaded?  Or another provider configured such that
> rdma_resolve_addr/route() then picks an active device?

On device removal, we have to destroy everything.

On insertion, we’ll have to create a fresh PD and MRs, which isn’t done now
during reconnect. So, insertion needs work too.

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]                                                       ` <D88D1952-83A1-4FF9-B028-AAE7A859A5B1-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
@ 2014-07-21 15:22                                                         ` Steve Wise
  2014-07-21 17:07                                                           ` Devesh Sharma
  0 siblings, 1 reply; 38+ messages in thread
From: Steve Wise @ 2014-07-21 15:22 UTC (permalink / raw)
  To: 'Chuck Lever'
  Cc: 'Devesh Sharma', 'Shirley Ma',
	'Hefty, Sean', 'Roland Dreier',
	linux-rdma-u79uwXL29TY76Z2rM5mHXA



> -----Original Message-----
> From: Chuck Lever [mailto:chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org]
> Sent: Monday, July 21, 2014 10:21 AM
> To: Steve Wise
> Cc: Devesh Sharma; Shirley Ma; Hefty, Sean; Roland Dreier; linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Subject: Re: [for-next 1/2] xprtrdma: take reference of rdma provider module
> 
> 
> On Jul 21, 2014, at 11:03 AM, Steve Wise <swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org> wrote:
> 
> >
> >
> >> -----Original Message-----
> >> From: Chuck Lever [mailto:chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org]
> >> Sent: Monday, July 21, 2014 9:54 AM
> >> To: Devesh Sharma
> >> Cc: Shirley Ma; Steve Wise; Hefty, Sean; Roland Dreier; linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> >> Subject: Re: [for-next 1/2] xprtrdma: take reference of rdma provider module
> >>
> >> Hi Devesh-
> >>
> >> Thanks for drilling into this further.
> >>
> >> On Jul 21, 2014, at 7:48 AM, Devesh Sharma <Devesh.Sharma-iH1Dq9VlAzfQT0dZR+AlfA@public.gmane.org> wrote:
> >>
> >>> In rpcrdma_ep_connect():
> >>>
> >>> write_lock(&ia->ri_qplock);
> >>>               old = ia->ri_id;
> >>>               ia->ri_id = id;
> >>>               write_unlock(&ia->ri_qplock);
> >>>
> >>>               rdma_destroy_qp(old);
> >>>               rdma_destroy_id(old);  =============> Cm -id is destroyed here.
> >>>
> >>>
> >>> If following code fails in rpcrdma_ep_connect():
> >>> id = rpcrdma_create_id(xprt, ia,
> >>>                               (struct sockaddr *)&xprt->rx_data.addr);
> >>>               if (IS_ERR(id)) {
> >>>                       rc = -EHOSTUNREACH;
> >>>                       goto out;
> >>>               }
> >>>
> >>> it leaves old cm-id still alive. This will always fail if Device is removed
abruptly.
> >>
> >> For CM_EVENT_DEVICE_REMOVAL, rpcrdma_conn_upcall() sets ep->rep_connected
> >> to -ENODEV.
> >>
> >> Then:
> >>
> >> 929 int
> >> 930 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
> >> 931 {
> >> 932         struct rdma_cm_id *id, *old;
> >> 933         int rc = 0;
> >> 934         int retry_count = 0;
> >> 935
> >> 936         if (ep->rep_connected != 0) {
> >> 937                 struct rpcrdma_xprt *xprt;
> >> 938 retry:
> >> 939                 dprintk("RPC:       %s: reconnecting...\n", __func__);
> >>
> >> ep->rep_connected is probably -ENODEV after a device removal. It would be
> >> possible for the connect worker to destroy everything associated with this
> >> connection in that case to ensure the underlying object reference counts
> >> are cleared.
> >>
> >> The immediate danger is that if there are pending RPCs, they could exit while
> >> qp/cm_id are NULL, triggering a panic in rpcrdma_deregister_frmr_external().
> >> Checking for NULL pointers inside the ri_qplock would prevent that.
> >>
> >> However, NFS mounts via this adapter will hang indefinitely after all
> >> transports are torn down and the adapter is gone. The only thing that can be
> >> done is something drastic like "echo b > /proc/sysrq_trigger" on the client.
> >>
> >> Thus, IMO hot-plugging or passive fail-over are the only scenarios where
> >> this makes sense. If we have an immediate problem here, is it a problem with
> >> system shutdown ordering that can be addressed in some other way?
> >>
> >> Until that support is in place, obviously I would prefer that the removal of
> >> the underlying driver be prevented while there are NFS mounts in place. I
> >> think that's what NFS users have come to expect.
> >>
> >> In other words, don't allow device removal until we have support for device
> >> insertion :-)
> >>
> >>
> >
> >
> > If we fix the above problems on provider unload, shouldn't the mount recover if the
> > provider module is subsequently loaded?  Or another provider configured such that
> > rdma_resolve_addr/route() then picks an active device?
> 
> On device removal, we have to destroy everything.
> 
> On insertion, we'll have to create a fresh PD and MRs, which isn't done now
> during reconnect. So, insertion needs work too.
> 

Got it, thanks! 



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
  2014-07-21 15:22                                                         ` Steve Wise
@ 2014-07-21 17:07                                                           ` Devesh Sharma
       [not found]                                                             ` <EE7902D3F51F404C82415C4803930ACD3FE1D9CF-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Devesh Sharma @ 2014-07-21 17:07 UTC (permalink / raw)
  To: Steve Wise, 'Chuck Lever'
  Cc: 'Shirley Ma', 'Hefty, Sean',
	'Roland Dreier',
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

> -----Original Message-----
> From: Steve Wise [mailto:swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org]
> Sent: Monday, July 21, 2014 8:53 PM
> To: 'Chuck Lever'
> Cc: Devesh Sharma; 'Shirley Ma'; 'Hefty, Sean'; 'Roland Dreier'; linux-
> rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma provider
> module
> 
> 
> 
> > -----Original Message-----
> > From: Chuck Lever [mailto:chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org]
> > Sent: Monday, July 21, 2014 10:21 AM
> > To: Steve Wise
> > Cc: Devesh Sharma; Shirley Ma; Hefty, Sean; Roland Dreier;
> > linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > Subject: Re: [for-next 1/2] xprtrdma: take reference of rdma provider
> > module
> >
> >
> > On Jul 21, 2014, at 11:03 AM, Steve Wise <swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
> wrote:
> >
> > >
> > >
> > >> -----Original Message-----
> > >> From: Chuck Lever [mailto:chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org]
> > >> Sent: Monday, July 21, 2014 9:54 AM
> > >> To: Devesh Sharma
> > >> Cc: Shirley Ma; Steve Wise; Hefty, Sean; Roland Dreier;
> > >> linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > >> Subject: Re: [for-next 1/2] xprtrdma: take reference of rdma
> > >> provider module
> > >>
> > >> Hi Devesh-
> > >>
> > >> Thanks for drilling into this further.
> > >>
> > >> On Jul 21, 2014, at 7:48 AM, Devesh Sharma
> <Devesh.Sharma-iH1Dq9VlAzfQT0dZR+AlfA@public.gmane.org> wrote:
> > >>
> > >>> In rpcrdma_ep_connect():
> > >>>
> > >>> write_lock(&ia->ri_qplock);
> > >>>               old = ia->ri_id;
> > >>>               ia->ri_id = id;
> > >>>               write_unlock(&ia->ri_qplock);
> > >>>
> > >>>               rdma_destroy_qp(old);
> > >>>               rdma_destroy_id(old);  =============> Cm -id is destroyed
> here.
> > >>>
> > >>>
> > >>> If following code fails in rpcrdma_ep_connect():
> > >>> id = rpcrdma_create_id(xprt, ia,
> > >>>                               (struct sockaddr *)&xprt->rx_data.addr);
> > >>>               if (IS_ERR(id)) {
> > >>>                       rc = -EHOSTUNREACH;
> > >>>                       goto out;
> > >>>               }
> > >>>
> > >>> it leaves old cm-id still alive. This will always fail if Device
> > >>> is removed
> abruptly.
> > >>
> > >> For CM_EVENT_DEVICE_REMOVAL, rpcrdma_conn_upcall() sets
> > >> ep->rep_connected to -ENODEV.
> > >>
> > >> Then:
> > >>
> > >> 929 int
> > >> 930 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia
> > >> *ia)
> > >> 931 {
> > >> 932         struct rdma_cm_id *id, *old;
> > >> 933         int rc = 0;
> > >> 934         int retry_count = 0;
> > >> 935
> > >> 936         if (ep->rep_connected != 0) {
> > >> 937                 struct rpcrdma_xprt *xprt;
> > >> 938 retry:
> > >> 939                 dprintk("RPC:       %s: reconnecting...\n", __func__);
> > >>
> > >> ep->rep_connected is probably -ENODEV after a device removal. It
> > >> ep->would be
> > >> possible for the connect worker to destroy everything associated
> > >> with this connection in that case to ensure the underlying object
> > >> reference counts are cleared.
> > >>
> > >> The immediate danger is that if there are pending RPCs, they could
> > >> exit while qp/cm_id are NULL, triggering a panic in
> rpcrdma_deregister_frmr_external().
> > >> Checking for NULL pointers inside the ri_qplock would prevent that.
> > >>
> > >> However, NFS mounts via this adapter will hang indefinitely after
> > >> all transports are torn down and the adapter is gone. The only
> > >> thing that can be done is something drastic like "echo b >
> /proc/sysrq_trigger" on the client.
> > >>
> > >> Thus, IMO hot-plugging or passive fail-over are the only scenarios
> > >> where this makes sense. If we have an immediate problem here, is it
> > >> a problem with system shutdown ordering that can be addressed in
> some other way?
> > >>
> > >> Until that support is in place, obviously I would prefer that the
> > >> removal of the underlying driver be prevented while there are NFS
> > >> mounts in place. I think that's what NFS users have come to expect.
> > >>
> > >> In other words, don't allow device removal until we have support
> > >> for device insertion :-)

This needs a fresh series of patches?

> > >>
> > >>
> > >
> > >
> > > If we fix the above problems on provider unload, shouldn't the mount
> > > recover if the provider module is subsequently loaded?  Or another
> > > provider configured such that
> > > rdma_resolve_addr/route() then picks an active device?
> >
> > On device removal, we have to destroy everything.
> >
> > On insertion, we'll have to create a fresh PD and MRs, which isn't
> > done now during reconnect. So, insertion needs work too.

Makes Sense to me too.
Thanks Chuck.

-Regards
 Devesh
> >
> 
> Got it, thanks!
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]                                                             ` <EE7902D3F51F404C82415C4803930ACD3FE1D9CF-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
@ 2014-07-21 17:30                                                               ` Chuck Lever
       [not found]                                                                 ` <0CDA5340-DDD6-42F8-8359-0069BBC9E24C-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Chuck Lever @ 2014-07-21 17:30 UTC (permalink / raw)
  To: Devesh Sharma
  Cc: Steve Wise, Shirley Ma, Hefty, Sean, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org


On Jul 21, 2014, at 1:07 PM, Devesh Sharma <Devesh.Sharma-iH1Dq9VlAzfQT0dZR+AlfA@public.gmane.org> wrote:

>>>>> Until that support is in place, obviously I would prefer that the
>>>>> removal of the underlying driver be prevented while there are NFS
>>>>> mounts in place. I think that's what NFS users have come to expect.
>>>>> 
>>>>> In other words, don't allow device removal until we have support
>>>>> for device insertion :-)
> 
> This needs a fresh series of patches?

“do not allow removal” would likely take an approach similar to what you
originally posted, unless someone has an idea how to use a CM_EVENT to
make this work, or there is an objection from the kernel RDMA folks.

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]                                                                 ` <0CDA5340-DDD6-42F8-8359-0069BBC9E24C-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
@ 2014-07-22  5:06                                                                   ` Devesh Sharma
       [not found]                                                                     ` <EE7902D3F51F404C82415C4803930ACD3FE1DB1D-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Devesh Sharma @ 2014-07-22  5:06 UTC (permalink / raw)
  To: Chuck Lever
  Cc: Steve Wise, Shirley Ma, Hefty, Sean, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

> -----Original Message-----
> From: Chuck Lever [mailto:chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org]
> Sent: Monday, July 21, 2014 11:01 PM
> To: Devesh Sharma
> Cc: Steve Wise; Shirley Ma; Hefty, Sean; Roland Dreier; linux-
> rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Subject: Re: [for-next 1/2] xprtrdma: take reference of rdma provider
> module
> 
> 
> On Jul 21, 2014, at 1:07 PM, Devesh Sharma <Devesh.Sharma-iH1Dq9VlAzfQT0dZR+AlfA@public.gmane.org>
> wrote:
> 
> >>>>> Until that support is in place, obviously I would prefer that the
> >>>>> removal of the underlying driver be prevented while there are NFS
> >>>>> mounts in place. I think that's what NFS users have come to expect.
> >>>>>
> >>>>> In other words, don't allow device removal until we have support
> >>>>> for device insertion :-)
> >
> > This needs a fresh series of patches?
> 
> "do not allow removal" would likely take an approach similar to what you
> originally posted, unless someone has an idea how to use a CM_EVENT to
> make this work, or there is an objection from the kernel RDMA folks.

Okay, I will re-work the patch if need be.

> 
> --
> Chuck Lever
> chuck[dot]lever[at]oracle[dot]com
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]                                                                     ` <EE7902D3F51F404C82415C4803930ACD3FE1DB1D-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
@ 2014-07-30 18:39                                                                       ` Chuck Lever
       [not found]                                                                         ` <A40CDF7D-7ED2-4D67-957F-8F977D567774-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Chuck Lever @ 2014-07-30 18:39 UTC (permalink / raw)
  To: Devesh Sharma
  Cc: Steve Wise, Shirley Ma, Hefty, Sean, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org


On Jul 22, 2014, at 1:06 AM, Devesh Sharma <Devesh.Sharma-iH1Dq9VlAzfQT0dZR+AlfA@public.gmane.org> wrote:

>> -----Original Message-----
>> From: Chuck Lever [mailto:chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org]
>> Sent: Monday, July 21, 2014 11:01 PM
>> To: Devesh Sharma
>> Cc: Steve Wise; Shirley Ma; Hefty, Sean; Roland Dreier; linux-
>> rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>> Subject: Re: [for-next 1/2] xprtrdma: take reference of rdma provider
>> module
>> 
>> 
>> On Jul 21, 2014, at 1:07 PM, Devesh Sharma <Devesh.Sharma-iH1Dq9VlAzfQT0dZR+AlfA@public.gmane.org>
>> wrote:
>> 
>>>>>>> Until that support is in place, obviously I would prefer that the
>>>>>>> removal of the underlying driver be prevented while there are NFS
>>>>>>> mounts in place. I think that's what NFS users have come to expect.
>>>>>>> 
>>>>>>> In other words, don't allow device removal until we have support
>>>>>>> for device insertion :-)
>>> 
>>> This needs a fresh series of patches?
>> 
>> "do not allow removal" would likely take an approach similar to what you
>> originally posted, unless someone has an idea how to use a CM_EVENT to
>> make this work, or there is an objection from the kernel RDMA folks.
> 
> Okay, I will re-work the patch if need be.

Devesh, if there isn’t one already, could you file an upstream bug at

  http://bugzilla.linux-nfs.org

that documents the shutdown hang and perhaps summarizes this thread?
Thanks!

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]                                                                         ` <A40CDF7D-7ED2-4D67-957F-8F977D567774-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
@ 2014-07-31  5:14                                                                           ` Devesh Sharma
       [not found]                                                                             ` <EE7902D3F51F404C82415C4803930ACD3FE23695-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Devesh Sharma @ 2014-07-31  5:14 UTC (permalink / raw)
  To: Chuck Lever
  Cc: Steve Wise, Shirley Ma, Hefty, Sean, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

> -----Original Message-----
> From: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org [mailto:linux-rdma-
> owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On Behalf Of Chuck Lever
> Sent: Thursday, July 31, 2014 12:09 AM
> To: Devesh Sharma
> Cc: Steve Wise; Shirley Ma; Hefty, Sean; Roland Dreier; linux-
> rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Subject: Re: [for-next 1/2] xprtrdma: take reference of rdma provider
> module
> 
> 
> On Jul 22, 2014, at 1:06 AM, Devesh Sharma <Devesh.Sharma-iH1Dq9VlAzfQT0dZR+AlfA@public.gmane.org>
> wrote:
> 
> >> -----Original Message-----
> >> From: Chuck Lever [mailto:chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org]
> >> Sent: Monday, July 21, 2014 11:01 PM
> >> To: Devesh Sharma
> >> Cc: Steve Wise; Shirley Ma; Hefty, Sean; Roland Dreier; linux-
> >> rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> >> Subject: Re: [for-next 1/2] xprtrdma: take reference of rdma provider
> >> module
> >>
> >>
> >> On Jul 21, 2014, at 1:07 PM, Devesh Sharma
> <Devesh.Sharma-iH1Dq9VlAzfQT0dZR+AlfA@public.gmane.org>
> >> wrote:
> >>
> >>>>>>> Until that support is in place, obviously I would prefer that
> >>>>>>> the removal of the underlying driver be prevented while there
> >>>>>>> are NFS mounts in place. I think that's what NFS users have come to
> expect.
> >>>>>>>
> >>>>>>> In other words, don't allow device removal until we have support
> >>>>>>> for device insertion :-)
> >>>
> >>> This needs a fresh series of patches?
> >>
> >> "do not allow removal" would likely take an approach similar to what
> >> you originally posted, unless someone has an idea how to use a
> >> CM_EVENT to make this work, or there is an objection from the kernel
> RDMA folks.
> >
> > Okay, I will re-work the patch if need be.
> 
> Devesh, if there isn't one already, could you file an upstream bug at
> 
>   http://bugzilla.linux-nfs.org
> 
> that documents the shutdown hang and perhaps summarizes this thread?
> Thanks!

A bug has been filed
https://bugzilla.linux-nfs.org/show_bug.cgi?id=266

> 
> --
> Chuck Lever
> chuck[dot]lever[at]oracle[dot]com
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the
> body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at
> http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* RE: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]                                                                             ` <EE7902D3F51F404C82415C4803930ACD3FE23695-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
@ 2014-08-18  9:52                                                                               ` Devesh Sharma
       [not found]                                                                                 ` <6a71f6a5-f335-42c6-b8b7-8b4bac5aae83-3RiH6ntJJkP8BX6JNMqfyFjyZtpTMMwT@public.gmane.org>
  0 siblings, 1 reply; 38+ messages in thread
From: Devesh Sharma @ 2014-08-18  9:52 UTC (permalink / raw)
  To: Devesh Sharma, Chuck Lever
  Cc: Steve Wise, Shirley Ma, Hefty, Sean, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

Hi Chuk,

Can this patch be lined up for merger. Looks like there are no oppositions to this change

-Regards
 Devesh

> -----Original Message-----
> From: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org [mailto:linux-rdma-
> owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On Behalf Of Devesh Sharma
> Sent: Thursday, July 31, 2014 10:45 AM
> To: Chuck Lever
> Cc: Steve Wise; Shirley Ma; Hefty, Sean; Roland Dreier; linux-
> rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma provider
> module
> 
> > -----Original Message-----
> > From: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org [mailto:linux-rdma-
> > owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On Behalf Of Chuck Lever
> > Sent: Thursday, July 31, 2014 12:09 AM
> > To: Devesh Sharma
> > Cc: Steve Wise; Shirley Ma; Hefty, Sean; Roland Dreier; linux-
> > rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > Subject: Re: [for-next 1/2] xprtrdma: take reference of rdma provider
> > module
> >
> >
> > On Jul 22, 2014, at 1:06 AM, Devesh Sharma
> <Devesh.Sharma-iH1Dq9VlAzfQT0dZR+AlfA@public.gmane.org>
> > wrote:
> >
> > >> -----Original Message-----
> > >> From: Chuck Lever [mailto:chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org]
> > >> Sent: Monday, July 21, 2014 11:01 PM
> > >> To: Devesh Sharma
> > >> Cc: Steve Wise; Shirley Ma; Hefty, Sean; Roland Dreier; linux-
> > >> rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > >> Subject: Re: [for-next 1/2] xprtrdma: take reference of rdma
> > >> provider module
> > >>
> > >>
> > >> On Jul 21, 2014, at 1:07 PM, Devesh Sharma
> > <Devesh.Sharma-iH1Dq9VlAzfQT0dZR+AlfA@public.gmane.org>
> > >> wrote:
> > >>
> > >>>>>>> Until that support is in place, obviously I would prefer that
> > >>>>>>> the removal of the underlying driver be prevented while there
> > >>>>>>> are NFS mounts in place. I think that's what NFS users have
> > >>>>>>> come to
> > expect.
> > >>>>>>>
> > >>>>>>> In other words, don't allow device removal until we have
> > >>>>>>> support for device insertion :-)
> > >>>
> > >>> This needs a fresh series of patches?
> > >>
> > >> "do not allow removal" would likely take an approach similar to
> > >> what you originally posted, unless someone has an idea how to use a
> > >> CM_EVENT to make this work, or there is an objection from the
> > >> kernel
> > RDMA folks.
> > >
> > > Okay, I will re-work the patch if need be.
> >
> > Devesh, if there isn't one already, could you file an upstream bug at
> >
> >   http://bugzilla.linux-nfs.org
> >
> > that documents the shutdown hang and perhaps summarizes this thread?
> > Thanks!
> 
> A bug has been filed
> https://bugzilla.linux-nfs.org/show_bug.cgi?id=266
> 
> >
> > --
> > Chuck Lever
> > chuck[dot]lever[at]oracle[dot]com
> >
> >
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-rdma"
> > in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More
> majordomo
> > info at http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the
> body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at
> http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [for-next 1/2] xprtrdma: take reference of rdma provider module
       [not found]                                                                                 ` <6a71f6a5-f335-42c6-b8b7-8b4bac5aae83-3RiH6ntJJkP8BX6JNMqfyFjyZtpTMMwT@public.gmane.org>
@ 2014-08-18 13:13                                                                                   ` Chuck Lever
  0 siblings, 0 replies; 38+ messages in thread
From: Chuck Lever @ 2014-08-18 13:13 UTC (permalink / raw)
  To: Devesh Sharma
  Cc: Steve Wise, Shirley Ma, Hefty, Sean, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org



> On Aug 18, 2014, at 5:52 AM, Devesh Sharma <Devesh.Sharma-iH1Dq9VlAzfQT0dZR+AlfA@public.gmane.org> wrote:
> 
> Hi Chuk,
> 
> Can this patch be lined up for merger. Looks like there are no oppositions to this change

Sorry, I was expecting a fresh rewritten patch. I haven't done a deep review of the original.

Anything you want to merge should be independently tested, of course. For example Steve should include it in his testing branch.


> 
> -Regards
> Devesh
> 
>> -----Original Message-----
>> From: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org [mailto:linux-rdma-
>> owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On Behalf Of Devesh Sharma
>> Sent: Thursday, July 31, 2014 10:45 AM
>> To: Chuck Lever
>> Cc: Steve Wise; Shirley Ma; Hefty, Sean; Roland Dreier; linux-
>> rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>> Subject: RE: [for-next 1/2] xprtrdma: take reference of rdma provider
>> module
>> 
>>> -----Original Message-----
>>> From: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org [mailto:linux-rdma-
>>> owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On Behalf Of Chuck Lever
>>> Sent: Thursday, July 31, 2014 12:09 AM
>>> To: Devesh Sharma
>>> Cc: Steve Wise; Shirley Ma; Hefty, Sean; Roland Dreier; linux-
>>> rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>>> Subject: Re: [for-next 1/2] xprtrdma: take reference of rdma provider
>>> module
>>> 
>>> 
>>> On Jul 22, 2014, at 1:06 AM, Devesh Sharma
>> <Devesh.Sharma-iH1Dq9VlAzfQT0dZR+AlfA@public.gmane.org>
>>> wrote:
>>> 
>>>>> -----Original Message-----
>>>>> From: Chuck Lever [mailto:chuck.lever-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org]
>>>>> Sent: Monday, July 21, 2014 11:01 PM
>>>>> To: Devesh Sharma
>>>>> Cc: Steve Wise; Shirley Ma; Hefty, Sean; Roland Dreier; linux-
>>>>> rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>>>>> Subject: Re: [for-next 1/2] xprtrdma: take reference of rdma
>>>>> provider module
>>>>> 
>>>>> 
>>>>> On Jul 21, 2014, at 1:07 PM, Devesh Sharma
>>> <Devesh.Sharma-iH1Dq9VlAzfQT0dZR+AlfA@public.gmane.org>
>>>>> wrote:
>>>>> 
>>>>>>>>>> Until that support is in place, obviously I would prefer that
>>>>>>>>>> the removal of the underlying driver be prevented while there
>>>>>>>>>> are NFS mounts in place. I think that's what NFS users have
>>>>>>>>>> come to
>>> expect.
>>>>>>>>>> 
>>>>>>>>>> In other words, don't allow device removal until we have
>>>>>>>>>> support for device insertion :-)
>>>>>> 
>>>>>> This needs a fresh series of patches?
>>>>> 
>>>>> "do not allow removal" would likely take an approach similar to
>>>>> what you originally posted, unless someone has an idea how to use a
>>>>> CM_EVENT to make this work, or there is an objection from the
>>>>> kernel
>>> RDMA folks.
>>>> 
>>>> Okay, I will re-work the patch if need be.
>>> 
>>> Devesh, if there isn't one already, could you file an upstream bug at
>>> 
>>>  http://bugzilla.linux-nfs.org
>>> 
>>> that documents the shutdown hang and perhaps summarizes this thread?
>>> Thanks!
>> 
>> A bug has been filed
>> https://bugzilla.linux-nfs.org/show_bug.cgi?id=266
>> 
>>> 
>>> --
>>> Chuck Lever
>>> chuck[dot]lever[at]oracle[dot]com
>>> 
>>> 
>>> 
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-rdma"
>>> in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More
>> majordomo
>>> info at http://vger.kernel.org/majordomo-info.html
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the
>> body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at
>> http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 38+ messages in thread

end of thread, other threads:[~2014-08-18 13:13 UTC | newest]

Thread overview: 38+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <1405605697-11583-1-git-send-email-devesh.sharma@emulex.com>
     [not found] ` <1405605697-11583-1-git-send-email-devesh.sharma-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
2014-07-17 14:01   ` [for-next 1/2] xprtrdma: take reference of rdma provider module Devesh Sharma
     [not found]     ` <3e39e90f-7095-4eb9-a844-516672a355ad-3RiH6ntJJkOPfaB/Gd0HpljyZtpTMMwT@public.gmane.org>
2014-07-17 15:01       ` Steve Wise
     [not found]         ` <53C7E546.3080008-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
2014-07-17 15:05           ` Chuck Lever
     [not found]             ` <78A77C48-AC73-4C01-B139-A00B4F674C70-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
2014-07-17 15:31               ` Devesh Sharma
2014-07-17 15:20           ` Devesh Sharma
2014-07-17 16:06           ` Hefty, Sean
     [not found]             ` <1828884A29C6694DAF28B7E6B8A823739933FCA3-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
2014-07-17 18:57               ` Shirley Ma
     [not found]                 ` <53C81CB7.2030000-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
2014-07-17 19:07                   ` Steve Wise
2014-07-17 19:50                     ` Hefty, Sean
     [not found]                       ` <1828884A29C6694DAF28B7E6B8A823739933FDEA-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
2014-07-17 19:55                         ` Steve Wise
2014-07-17 20:23                           ` Shirley Ma
2014-07-17 20:08                       ` Steve Wise
2014-07-17 20:41                         ` Chuck Lever
     [not found]                           ` <DF7CE85B-288D-4CC2-AD51-B326D5F1EE1A-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
2014-07-17 20:59                             ` Steve Wise
2014-07-18  5:05                               ` Devesh Sharma
     [not found]                                 ` <EE7902D3F51F404C82415C4803930ACD3FE1482F-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
2014-07-18 13:27                                   ` Steve Wise
2014-07-18 15:47                                     ` Shirley Ma
     [not found]                                       ` <53C94199.4050601-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
2014-07-21  6:11                                         ` Devesh Sharma
     [not found]                                           ` <EE7902D3F51F404C82415C4803930ACD3FE1C7B7-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
2014-07-21 11:48                                             ` Devesh Sharma
     [not found]                                           ` <a6345162-863d -447c-b7c2-059ced190a13@CMEXHTCAS1.ad.emulex.com>
     [not found]                                             ` <a6345162-863d-447c-b7c2-059ced190a13-3RiH6ntJJkP8BX6JNMqfyFjyZtpTMMwT@public.gmane.org>
2014-07-21 14:53                                               ` Chuck Lever
     [not found]                                                 ` <27ACE237-161A-4CA5-AA5C-6349CC4118E3-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
2014-07-21 15:03                                                   ` Steve Wise
2014-07-21 15:20                                                     ` Chuck Lever
     [not found]                                                     ` <D88D1952-83A1-4FF9-B028-AAE7A859A 5B1@oracle.com>
     [not found]                                                       ` <D88D1952-83A1-4FF9-B028-AAE7A859A5B1-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
2014-07-21 15:22                                                         ` Steve Wise
2014-07-21 17:07                                                           ` Devesh Sharma
     [not found]                                                             ` <EE7902D3F51F404C82415C4803930ACD3FE1D9CF-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
2014-07-21 17:30                                                               ` Chuck Lever
     [not found]                                                                 ` <0CDA5340-DDD6-42F8-8359-0069BBC9E24C-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
2014-07-22  5:06                                                                   ` Devesh Sharma
     [not found]                                                                     ` <EE7902D3F51F404C82415C4803930ACD3FE1DB1D-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
2014-07-30 18:39                                                                       ` Chuck Lever
     [not found]                                                                         ` <A40CDF7D-7ED2-4D67-957F-8F977D567774-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
2014-07-31  5:14                                                                           ` Devesh Sharma
     [not found]                                                                             ` <EE7902D3F51F404C82415C4803930ACD3FE23695-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
2014-08-18  9:52                                                                               ` Devesh Sharma
     [not found]                                                                                 ` <6a71f6a5-f335-42c6-b8b7-8b4bac5aae83-3RiH6ntJJkP8BX6JNMqfyFjyZtpTMMwT@public.gmane.org>
2014-08-18 13:13                                                                                   ` Chuck Lever
2014-07-21  5:23                                     ` Devesh Sharma
2014-07-17 21:25                             ` Shirley Ma
2014-07-18  6:19                         ` Devesh Sharma
     [not found]                           ` <EE7902D3F51F404C82415C4803930ACD3FE1686F-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
2014-07-18 15:27                             ` Chuck Lever
     [not found]                               ` <D9783B2E-8D18-442E-9BFE-0863F9DD6B96-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
2014-07-21  5:40                                 ` Devesh Sharma
2014-07-17 14:01   ` [for-next 2/2] xprtrdma: fix deallocation sequence of pd Devesh Sharma
     [not found]     ` <3fdcf67f-2e90-4c61-92da-a8f7743cf54a-3RiH6ntJJkOPfaB/Gd0HpljyZtpTMMwT@public.gmane.org>
2014-07-17 15:05       ` Steve Wise
     [not found]         ` <53C7E64D.90501-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
2014-07-17 15:35           ` Devesh Sharma

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox