[PATCH 04/10] IB/srp: Use SRP transport layer error recovery

linux-rdma.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: Bart Van Assche <bvanassche-HInyCGIudOg@public.gmane.org>
To: David Dillow <dillowda-1Heg1YXhbW8@public.gmane.org>,
	Roland Dreier <roland-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>,
	Vu Pham <vu-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>,
	Sebastian Riemer
	<sebastian.riemer-EIkl63zCoXaH+58JC4qpiA@public.gmane.org>,
	linux-rdma <linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org>
Subject: [PATCH 04/10] IB/srp: Use SRP transport layer error recovery
Date: Thu, 10 Oct 2013 14:13:26 +0200	[thread overview]
Message-ID: <525699E6.3050605@acm.org> (raw)
In-Reply-To: <52569806.9080201-HInyCGIudOg@public.gmane.org>

Enable fast_io_fail_tmo and dev_loss_tmo functionality for the IB
SRP initiator. Add kernel module parameters that allow to specify
default values for these parameters.

Signed-off-by: Bart Van Assche <bvanassche-HInyCGIudOg@public.gmane.org>
Acked-by: David Dillow <dillowda-1Heg1YXhbW8@public.gmane.org>
Cc: Roland Dreier <roland-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
Cc: Vu Pham <vu-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Cc: Sebastian Riemer <sebastian.riemer-EIkl63zCoXaH+58JC4qpiA@public.gmane.org>
---
 drivers/infiniband/ulp/srp/ib_srp.c | 141 ++++++++++++++++++++++++++----------
 drivers/infiniband/ulp/srp/ib_srp.h |   1 -
 2 files changed, 101 insertions(+), 41 deletions(-)

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 414fd02..5551f3b 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -86,6 +86,27 @@ module_param(topspin_workarounds, int, 0444);
 MODULE_PARM_DESC(topspin_workarounds,
 		 "Enable workarounds for Topspin/Cisco SRP target bugs if != 0");
 
+static struct kernel_param_ops srp_tmo_ops;
+
+static int srp_fast_io_fail_tmo = 15;
+module_param_cb(fast_io_fail_tmo, &srp_tmo_ops, &srp_fast_io_fail_tmo,
+		S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(fast_io_fail_tmo,
+		 "Number of seconds between the observation of a transport"
+		 " layer error and failing all I/O. \"off\" means that this"
+		 " functionality is disabled.");
+
+static int srp_dev_loss_tmo = 60;
+module_param_cb(dev_loss_tmo, &srp_tmo_ops, &srp_dev_loss_tmo,
+		S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dev_loss_tmo,
+		 "Maximum number of seconds that the SRP transport should"
+		 " insulate transport layer errors. After this time has been"
+		 " exceeded the SCSI host is removed. Should be"
+		 " between 1 and " __stringify(SCSI_DEVICE_BLOCK_MAX_TIMEOUT)
+		 " if fast_io_fail_tmo has not been set. \"off\" means that"
+		 " this functionality is disabled.");
+
 static void srp_add_one(struct ib_device *device);
 static void srp_remove_one(struct ib_device *device);
 static void srp_recv_completion(struct ib_cq *cq, void *target_ptr);
@@ -102,6 +123,44 @@ static struct ib_client srp_client = {
 
 static struct ib_sa_client srp_sa_client;
 
+static int srp_tmo_get(char *buffer, const struct kernel_param *kp)
+{
+	int tmo = *(int *)kp->arg;
+
+	if (tmo >= 0)
+		return sprintf(buffer, "%d", tmo);
+	else
+		return sprintf(buffer, "off");
+}
+
+static int srp_tmo_set(const char *val, const struct kernel_param *kp)
+{
+	int tmo, res;
+
+	if (strncmp(val, "off", 3) != 0) {
+		res = kstrtoint(val, 0, &tmo);
+		if (res)
+			goto out;
+	} else {
+		tmo = -1;
+	}
+	if (kp->arg == &srp_fast_io_fail_tmo)
+		res = srp_tmo_valid(tmo, srp_dev_loss_tmo);
+	else
+		res = srp_tmo_valid(srp_fast_io_fail_tmo, tmo);
+	if (res)
+		goto out;
+	*(int *)kp->arg = tmo;
+
+out:
+	return res;
+}
+
+static struct kernel_param_ops srp_tmo_ops = {
+	.get = srp_tmo_get,
+	.set = srp_tmo_set,
+};
+
 static inline struct srp_target_port *host_to_target(struct Scsi_Host *host)
 {
 	return (struct srp_target_port *) host->hostdata;
@@ -689,23 +748,42 @@ static void srp_free_req(struct srp_target_port *target,
 	spin_unlock_irqrestore(&target->lock, flags);
 }
 
-static void srp_reset_req(struct srp_target_port *target, struct srp_request *req)
+static void srp_finish_req(struct srp_target_port *target,
+			   struct srp_request *req, int result)
 {
 	struct scsi_cmnd *scmnd = srp_claim_req(target, req, NULL);
 
 	if (scmnd) {
 		srp_free_req(target, req, scmnd, 0);
-		scmnd->result = DID_RESET << 16;
+		scmnd->result = result;
 		scmnd->scsi_done(scmnd);
 	}
 }
 
-static int srp_reconnect_target(struct srp_target_port *target)
+static void srp_terminate_io(struct srp_rport *rport)
 {
-	struct Scsi_Host *shost = target->scsi_host;
-	int i, ret;
+	struct srp_target_port *target = rport->lld_data;
+	int i;
+
+	for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) {
+		struct srp_request *req = &target->req_ring[i];
+		srp_finish_req(target, req, DID_TRANSPORT_FAILFAST << 16);
+	}
+}
 
-	scsi_target_block(&shost->shost_gendev);
+/*
+ * It is up to the caller to ensure that srp_rport_reconnect() calls are
+ * serialized and that no concurrent srp_queuecommand(), srp_abort(),
+ * srp_reset_device() or srp_reset_host() calls will occur while this function
+ * is in progress. One way to realize that is not to call this function
+ * directly but to call srp_reconnect_rport() instead since that last function
+ * serializes calls of this function via rport->mutex and also blocks
+ * srp_queuecommand() calls before invoking this function.
+ */
+static int srp_rport_reconnect(struct srp_rport *rport)
+{
+	struct srp_target_port *target = rport->lld_data;
+	int i, ret;
 
 	srp_disconnect_target(target);
 	/*
@@ -726,8 +804,7 @@ static int srp_reconnect_target(struct srp_target_port *target)
 
 	for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) {
 		struct srp_request *req = &target->req_ring[i];
-		if (req->scmnd)
-			srp_reset_req(target, req);
+		srp_finish_req(target, req, DID_RESET << 16);
 	}
 
 	INIT_LIST_HEAD(&target->free_tx);
@@ -737,28 +814,9 @@ static int srp_reconnect_target(struct srp_target_port *target)
 	if (ret == 0)
 		ret = srp_connect_target(target);
 
-	scsi_target_unblock(&shost->shost_gendev, ret == 0 ? SDEV_RUNNING :
-			    SDEV_TRANSPORT_OFFLINE);
-	target->transport_offline = !!ret;
-
-	if (ret)
-		goto err;
-
-	shost_printk(KERN_INFO, target->scsi_host, PFX "reconnect succeeded\n");
-
-	return ret;
-
-err:
-	shost_printk(KERN_ERR, target->scsi_host,
-		     PFX "reconnect failed (%d), removing target port.\n", ret);
-
-	/*
-	 * We couldn't reconnect, so kill our target port off.
-	 * However, we have to defer the real removal because we
-	 * are in the context of the SCSI error handler now, which
-	 * will deadlock if we call scsi_remove_host().
-	 */
-	srp_queue_remove_work(target);
+	if (ret == 0)
+		shost_printk(KERN_INFO, target->scsi_host,
+			     PFX "reconnect succeeded\n");
 
 	return ret;
 }
@@ -1356,10 +1414,11 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
 	struct srp_cmd *cmd;
 	struct ib_device *dev;
 	unsigned long flags;
-	int len;
+	int len, result;
 
-	if (unlikely(target->transport_offline)) {
-		scmnd->result = DID_NO_CONNECT << 16;
+	result = srp_chkready(target->rport);
+	if (unlikely(result)) {
+		scmnd->result = result;
 		scmnd->scsi_done(scmnd);
 		return 0;
 	}
@@ -1757,7 +1816,7 @@ static int srp_abort(struct scsi_cmnd *scmnd)
 	if (srp_send_tsk_mgmt(target, req->index, scmnd->device->lun,
 			      SRP_TSK_ABORT_TASK) == 0)
 		ret = SUCCESS;
-	else if (target->transport_offline)
+	else if (target->rport->state == SRP_RPORT_LOST)
 		ret = FAST_IO_FAIL;
 	else
 		ret = FAILED;
@@ -1784,7 +1843,7 @@ static int srp_reset_device(struct scsi_cmnd *scmnd)
 	for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) {
 		struct srp_request *req = &target->req_ring[i];
 		if (req->scmnd && req->scmnd->device == scmnd->device)
-			srp_reset_req(target, req);
+			srp_finish_req(target, req, DID_RESET << 16);
 	}
 
 	return SUCCESS;
@@ -1793,14 +1852,10 @@ static int srp_reset_device(struct scsi_cmnd *scmnd)
 static int srp_reset_host(struct scsi_cmnd *scmnd)
 {
 	struct srp_target_port *target = host_to_target(scmnd->device->host);
-	int ret = FAILED;
 
 	shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n");
 
-	if (!srp_reconnect_target(target))
-		ret = SUCCESS;
-
-	return ret;
+	return srp_reconnect_rport(target->rport) == 0 ? SUCCESS : FAILED;
 }
 
 static int srp_slave_configure(struct scsi_device *sdev)
@@ -2637,7 +2692,13 @@ static void srp_remove_one(struct ib_device *device)
 }
 
 static struct srp_function_template ib_srp_transport_functions = {
+	.has_rport_state	 = true,
+	.reset_timer_if_blocked	 = true,
+	.fast_io_fail_tmo	 = &srp_fast_io_fail_tmo,
+	.dev_loss_tmo		 = &srp_dev_loss_tmo,
+	.reconnect		 = srp_rport_reconnect,
 	.rport_delete		 = srp_rport_delete,
+	.terminate_rport_io	 = srp_terminate_io,
 };
 
 static int __init srp_init_module(void)
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index 2a1768f..fd1817e 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -140,7 +140,6 @@ struct srp_target_port {
 	unsigned int		cmd_sg_cnt;
 	unsigned int		indirect_size;
 	bool			allow_ext_sg;
-	bool			transport_offline;
 
 	/* Everything above this point is used in the hot path of
 	 * command processing. Try to keep them packed into cachelines.
-- 
1.8.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

next prev parent reply	other threads:[~2013-10-10 12:13 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-10-10 12:05 [PATCH 0/10] IB SRP initiator patches for kernel 3.12 Bart Van Assche
2013-10-10 12:10 ` [PATCH 02/10] IB/srp: Keep rport as long as the IB transport layer Bart Van Assche
     [not found] ` <52569806.9080201-HInyCGIudOg@public.gmane.org>
2013-10-10 12:08   ` [PATCH 01/10] IB/srp: Make transport layer retry count configurable Bart Van Assche
2013-10-10 12:12   ` [PATCH 03/10] scsi_transport_srp: Add transport layer error handling Bart Van Assche
     [not found]     ` <525699A9.6090802-HInyCGIudOg@public.gmane.org>
2013-10-19 16:13       ` Bart Van Assche
     [not found]         ` <5262AF8D.7070700-HInyCGIudOg@public.gmane.org>
2013-10-25 22:42           ` David Dillow
2013-10-26  6:30             ` Bart Van Assche
2013-10-10 12:13   ` Bart Van Assche [this message]
2013-10-10 12:14   ` [PATCH 05/10] IB/srp: Start timers if a transport layer error occurs Bart Van Assche
2013-10-10 12:15   ` [PATCH 06/10] scsi_transport_srp: Add periodic reconnect support Bart Van Assche
2013-10-10 12:16   ` [PATCH 07/10] IB/srp: Add periodic reconnect functionality Bart Van Assche
2013-10-10 12:18   ` [PATCH 08/10] IB/srp: Export sgid to sysfs Bart Van Assche
2013-10-10 12:18   ` [PATCH 09/10] IB/srp: Introduce srp_alloc_req_data() Bart Van Assche
2013-10-10 12:19   ` [PATCH 10/10] IB/srp: Make queue size configurable Bart Van Assche
     [not found]     ` <52569B6A.30301-HInyCGIudOg@public.gmane.org>
2013-10-11 11:34       ` Jack Wang
2013-10-25 22:13       ` David Dillow
     [not found]         ` <1382739213.17280.3.camel-zHLflQxYYDO4Hhoo1DtQwJ9G+ZOsUmrO@public.gmane.org>
2013-10-26  6:13           ` Bart Van Assche
2013-10-25 22:14   ` [PATCH 0/10] IB SRP initiator patches for kernel 3.12 David Dillow

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:414fd02 dfblob:5551f3b dfblob:2a1768f dfblob:fd1817e )
 OR (
bs:"[PATCH 04/10] IB/srp: Use SRP transport layer error recovery" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=525699E6.3050605@acm.org \
    --to=bvanassche-hinycgiudog@public.gmane.org \
    --cc=dillowda-1Heg1YXhbW8@public.gmane.org \
    --cc=linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    --cc=roland-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org \
    --cc=sebastian.riemer-EIkl63zCoXaH+58JC4qpiA@public.gmane.org \
    --cc=vu-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).