From mboxrd@z Thu Jan 1 00:00:00 1970 From: Vu Pham Subject: Re: [ofa-general][PATCH 3/4] SRP fail-over faster Date: Thu, 22 Oct 2009 16:13:34 -0700 Message-ID: <4AE0E71E.20309@mellanox.com> References: <4AD3B453.3030109@mellanox.com> <4AD63681.6080901@mellanox.com> <4AD63DB1.3060906@mellanox.com> <1255570760.13845.4.camel@obelisk.thedillows.org> <4AD74C88.8030604@mellanox.com> <1255634715.29829.9.camel@lap75545.ornl.gov> <20091015213512.GW5191@obsidianresearch.com> Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="------------050203040503010704060205" Return-path: In-Reply-To: <20091015213512.GW5191-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org> Sender: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org To: Roland Dreier Cc: Jason Gunthorpe , David Dillow , Linux RDMA list , Bart Van Assche List-Id: linux-rdma@vger.kernel.org This is a multi-part message in MIME format. --------------050203040503010704060205 Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit Jason Gunthorpe wrote: > On Thu, Oct 15, 2009 at 03:25:15PM -0400, David Dillow wrote: > >> On Thu, 2009-10-15 at 09:23 -0700, Vu Pham wrote: >> >>> David Dillow wrote: >>> >>>> And if I want to disable this completely? >>>> >>>> >>> Unless these patches are bad and affect the stability of the driver, why >>> do you want to disable it? If you don't use multipath/device-mapper and >>> use /dev/sd**, everything will be same >>> >> I use multipath with ALUA, and I don't mind if the link flaps a bit. 60 >> seconds is near my SCSI timeout of 77 seconds, so it doesn't buy me >> much. I'd rather multipath be delivering traffic to the backup path than >> sitting on its thumbs for 60 seconds doing nothing. >> > > I've been left with a similar impression for several multipath things > I've seen in the past. 
True active/active multipath setups should have > a shorter timeout - there is no penalty for directing more traffic to > the 2nd path (the paths should be load balancing existing traffic in > the standard case anyhow). > > An active/passive configuration might be different... > > Certainly an enforced lower limit in the kernel is silly, and a > per-device setting does make some sense. > > Jason > Here is the updated patch, which implements the device_loss_timeout for each target instead of a module parameter. It also reflects changes from previous feedback. Please review --------------050203040503010704060205 Content-Type: text/plain; name="srp_3_device_loss_timeout.patch" Content-Transfer-Encoding: 7bit Content-Disposition: inline; filename="srp_3_device_loss_timeout.patch" Introducing device_loss_timeout at per-target granularity. Creating a timer to clean up the connection after device_loss_timeout expires. During device_loss_timeout, the QP is in error state, srp will return DID_RESET for outstanding I/Os and return FAILED for abort_cmd, reset_lun, and return SUCCESS (without retrying reconnect) on reset_host Signed-off-by: Vu Pham ----------------------------------------------------------------------- drivers/infiniband/ulp/srp/ib_srp.c | 94 ++++++++++++++++++++++++++++++++++- drivers/infiniband/ulp/srp/ib_srp.h | 3 + 2 files changed, 95 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index e44939a..12404d5 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -433,6 +433,10 @@ static void srp_remove_work(struct work_struct *work) return; } target->state = SRP_TARGET_REMOVED; + + if (timer_pending(&target->qp_err_timer)) + del_timer_sync(&target->qp_err_timer); + spin_unlock_irq(target->scsi_host->host_lock); spin_lock(&target->srp_host->target_lock); @@ -896,6 +900,50 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc) 
DMA_FROM_DEVICE); } +static void srp_reconnect_work(struct work_struct *work) +{ + struct srp_target_port *target = + container_of(work, struct srp_target_port, work); + + srp_reconnect_target(target); + spin_lock_irq(target->scsi_host->host_lock); + target->work_in_progress = 0; + spin_unlock_irq(target->scsi_host->host_lock); +} + +static void srp_qp_in_err_timer(unsigned long data) +{ + struct srp_target_port *target = (struct srp_target_port *)data; + struct srp_request *req, *tmp; + + if (target->state != SRP_TARGET_LIVE) + return; + + spin_lock_irq(target->scsi_host->host_lock); + list_for_each_entry_safe(req, tmp, &target->req_queue, list) + srp_reset_req(target, req); + spin_unlock_irq(target->scsi_host->host_lock); + + spin_lock_irq(target->scsi_host->host_lock); + if (!target->work_in_progress) { + target->work_in_progress = 1; + INIT_WORK(&target->work, srp_reconnect_work); + schedule_work(&target->work); + } + spin_unlock_irq(target->scsi_host->host_lock); +} + +static void srp_qp_err_add_timer(struct srp_target_port *target, int time) +{ + if (!timer_pending(&target->qp_err_timer)) { + setup_timer(&target->qp_err_timer, + srp_qp_in_err_timer, + (unsigned long)target); + target->qp_err_timer.expires = round_jiffies(time*HZ + jiffies); + add_timer(&target->qp_err_timer); + } +} + static void srp_completion(struct ib_cq *cq, void *target_ptr) { struct srp_target_port *target = target_ptr; @@ -904,11 +952,19 @@ static void srp_completion(struct ib_cq *cq, void *target_ptr) ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); while (ib_poll_cq(cq, 1, &wc) > 0) { if (wc.status) { + unsigned long flags; + shost_printk(KERN_ERR, target->scsi_host, PFX "failed %s status %d\n", wc.wr_id & SRP_OP_RECV ? 
"receive" : "send", wc.status); - target->qp_in_error = 1; + spin_lock_irqsave(target->scsi_host->host_lock, flags); + if (!target->qp_in_error && + target->state == SRP_TARGET_LIVE) { + target->qp_in_error = 1; + srp_qp_err_add_timer(target, 5); + } + spin_unlock_irqrestore(target->scsi_host->host_lock, flags); break; } @@ -1212,6 +1268,7 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) int attr_mask = 0; int comp = 0; int opcode = 0; + unsigned long flags; switch (event->event) { case IB_CM_REQ_ERROR: @@ -1299,6 +1356,13 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) shost_printk(KERN_ERR, target->scsi_host, PFX "connection closed\n"); + spin_lock_irqsave(target->scsi_host->host_lock, flags); + if (!target->qp_in_error && + target->state == SRP_TARGET_LIVE) { + target->qp_in_error = 1; + srp_qp_err_add_timer(target, 5); + } + spin_unlock_irqrestore(target->scsi_host->host_lock, flags); target->status = 0; break; @@ -1441,9 +1505,22 @@ static int srp_reset_device(struct scsi_cmnd *scmnd) static int srp_reset_host(struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(scmnd->device->host); + struct srp_request *req, *tmp; int ret = FAILED; - shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n"); + shost_printk(KERN_ERR, target->scsi_host, + PFX "SRP reset_host called state %d qp_err %d\n", + target->state, target->qp_in_error); + + spin_lock_irq(target->scsi_host->host_lock); + if (timer_pending(&target->qp_err_timer) || target->qp_in_error || + target->state != SRP_TARGET_LIVE) { + list_for_each_entry_safe(req, tmp, &target->req_queue, list) + srp_reset_req(target, req); + spin_unlock_irq(target->scsi_host->host_lock); + return SUCCESS; + } + spin_unlock_irq(target->scsi_host->host_lock); if (!srp_reconnect_target(target)) ret = SUCCESS; @@ -1657,6 +1734,7 @@ enum { SRP_OPT_MAX_CMD_PER_LUN = 1 << 6, SRP_OPT_IO_CLASS = 1 << 7, SRP_OPT_INITIATOR_EXT = 1 << 8, + 
SRP_OPT_DEVICE_LOSS_TMO = 1 << 9, SRP_OPT_ALL = (SRP_OPT_ID_EXT | SRP_OPT_IOC_GUID | SRP_OPT_DGID | @@ -1674,6 +1752,7 @@ static const match_table_t srp_opt_tokens = { { SRP_OPT_MAX_CMD_PER_LUN, "max_cmd_per_lun=%d" }, { SRP_OPT_IO_CLASS, "io_class=%x" }, { SRP_OPT_INITIATOR_EXT, "initiator_ext=%s" }, + { SRP_OPT_DEVICE_LOSS_TMO, "device_loss_timeout=%d"}, { SRP_OPT_ERR, NULL } }; @@ -1801,6 +1880,14 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) kfree(p); break; + case SRP_OPT_DEVICE_LOSS_TMO: + if (match_int(args, &token)) { + printk(KERN_WARNING PFX "bad device loss timeout '%s'\n", p); + goto out; + } + target->device_loss_timeout = token; + break; + default: printk(KERN_WARNING PFX "unknown parameter or missing value " "'%s' in target creation request\n", p); @@ -1860,6 +1947,9 @@ static ssize_t srp_create_target(struct device *dev, if (ret) goto err; + if (target->device_loss_timeout <= 0) + target->device_loss_timeout = 30; + ib_query_gid(host->srp_dev->dev, host->port, 0, &target->path.sgid); shost_printk(KERN_DEBUG, target->scsi_host, PFX diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index e185b90..daa4bf7 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -153,12 +153,15 @@ struct srp_target_port { struct srp_request req_ring[SRP_SQ_SIZE]; struct work_struct work; + int work_in_progress; struct list_head list; struct completion done; int status; enum srp_target_state state; int qp_in_error; + struct timer_list qp_err_timer; + int device_loss_timeout; }; struct srp_iu { --------------050203040503010704060205-- -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html