linux-scsi.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 00/11] hpsa: minor fixes and cleanups
@ 2013-11-07 16:45 Stephen M. Cameron
  2013-11-07 16:45 ` [PATCH 01/11] hpsa: use workqueue instead of kernel thread for lockup detection Stephen M. Cameron
                   ` (10 more replies)
  0 siblings, 11 replies; 23+ messages in thread
From: Stephen M. Cameron @ 2013-11-07 16:45 UTC (permalink / raw)
  To: james.bottomley; +Cc: stephenmcameron, mikem, thenzl, linux-scsi, scott.teel

The following series implements some minor fixes and cleanups for hpsa

---

Stephen M. Cameron (11):
      hpsa: use workqueue instead of kernel thread for lockup detection
      hpsa: do not attempt to flush the cache on locked up controllers
      hpsa: add 5 second delay after doorbell reset
      hpsa: do not discard scsi status on aborted commands
      hpsa: remove unneeded include of seq_file.h
      hpsa: fix memory leak in CCISS_BIG_PASSTHRU ioctl
      hpsa: add MSA 2040 to list of external target devices
      hpsa: cap CCISS_PASSTHRU at 20 concurrent commands.
      hpsa: prevent stalled i/o
      hpsa: rename scsi prefetch field
      hpsa: enable unit attention reporting


 drivers/scsi/hpsa.c     |  252 +++++++++++++++++++++++++++--------------------
 drivers/scsi/hpsa.h     |    9 +-
 drivers/scsi/hpsa_cmd.h |    4 +
 3 files changed, 157 insertions(+), 108 deletions(-)

-- 
-- steve

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 01/11] hpsa: use workqueue instead of kernel thread for lockup detection
  2013-11-07 16:45 [PATCH 00/11] hpsa: minor fixes and cleanups Stephen M. Cameron
@ 2013-11-07 16:45 ` Stephen M. Cameron
  2013-12-02 18:00   ` James Bottomley
  2013-11-07 16:45 ` [PATCH 02/11] hpsa: do not attempt to flush the cache on locked up controllers Stephen M. Cameron
                   ` (9 subsequent siblings)
  10 siblings, 1 reply; 23+ messages in thread
From: Stephen M. Cameron @ 2013-11-07 16:45 UTC (permalink / raw)
  To: james.bottomley; +Cc: stephenmcameron, mikem, thenzl, linux-scsi, scott.teel

From: Stephen M. Cameron <scameron@beardog.cce.hp.com>

Much simpler and avoids races starting/stopping the thread.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
---
 drivers/scsi/hpsa.c |  103 +++++++++++++--------------------------------------
 drivers/scsi/hpsa.h |    3 +
 2 files changed, 28 insertions(+), 78 deletions(-)

diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
index 3c97974..1264b51 100644
--- a/drivers/scsi/hpsa.c
+++ b/drivers/scsi/hpsa.c
@@ -172,10 +172,6 @@ static struct board_type products[] = {
 
 static int number_of_controllers;
 
-static struct list_head hpsa_ctlr_list = LIST_HEAD_INIT(hpsa_ctlr_list);
-static spinlock_t lockup_detector_lock;
-static struct task_struct *hpsa_lockup_detector;
-
 static irqreturn_t do_hpsa_intr_intx(int irq, void *dev_id);
 static irqreturn_t do_hpsa_intr_msi(int irq, void *dev_id);
 static int hpsa_ioctl(struct scsi_device *dev, int cmd, void *arg);
@@ -4638,16 +4634,6 @@ static void hpsa_undo_allocations_after_kdump_soft_reset(struct ctlr_info *h)
 	kfree(h);
 }
 
-static void remove_ctlr_from_lockup_detector_list(struct ctlr_info *h)
-{
-	assert_spin_locked(&lockup_detector_lock);
-	if (!hpsa_lockup_detector)
-		return;
-	if (h->lockup_detected)
-		return; /* already stopped the lockup detector */
-	list_del(&h->lockup_list);
-}
-
 /* Called when controller lockup detected. */
 static void fail_all_cmds_on_list(struct ctlr_info *h, struct list_head *list)
 {
@@ -4666,8 +4652,6 @@ static void controller_lockup_detected(struct ctlr_info *h)
 {
 	unsigned long flags;
 
-	assert_spin_locked(&lockup_detector_lock);
-	remove_ctlr_from_lockup_detector_list(h);
 	h->access.set_intr_mask(h, HPSA_INTR_OFF);
 	spin_lock_irqsave(&h->lock, flags);
 	h->lockup_detected = readl(h->vaddr + SA5_SCRATCHPAD_OFFSET);
@@ -4687,7 +4671,6 @@ static void detect_controller_lockup(struct ctlr_info *h)
 	u32 heartbeat;
 	unsigned long flags;
 
-	assert_spin_locked(&lockup_detector_lock);
 	now = get_jiffies_64();
 	/* If we've received an interrupt recently, we're ok. */
 	if (time_after64(h->last_intr_timestamp +
@@ -4717,68 +4700,22 @@ static void detect_controller_lockup(struct ctlr_info *h)
 	h->last_heartbeat_timestamp = now;
 }
 
-static int detect_controller_lockup_thread(void *notused)
-{
-	struct ctlr_info *h;
-	unsigned long flags;
-
-	while (1) {
-		struct list_head *this, *tmp;
-
-		schedule_timeout_interruptible(HEARTBEAT_SAMPLE_INTERVAL);
-		if (kthread_should_stop())
-			break;
-		spin_lock_irqsave(&lockup_detector_lock, flags);
-		list_for_each_safe(this, tmp, &hpsa_ctlr_list) {
-			h = list_entry(this, struct ctlr_info, lockup_list);
-			detect_controller_lockup(h);
-		}
-		spin_unlock_irqrestore(&lockup_detector_lock, flags);
-	}
-	return 0;
-}
-
-static void add_ctlr_to_lockup_detector_list(struct ctlr_info *h)
+static void hpsa_monitor_ctlr_worker(struct work_struct *work)
 {
 	unsigned long flags;
-
-	h->heartbeat_sample_interval = HEARTBEAT_SAMPLE_INTERVAL;
-	spin_lock_irqsave(&lockup_detector_lock, flags);
-	list_add_tail(&h->lockup_list, &hpsa_ctlr_list);
-	spin_unlock_irqrestore(&lockup_detector_lock, flags);
-}
-
-static void start_controller_lockup_detector(struct ctlr_info *h)
-{
-	/* Start the lockup detector thread if not already started */
-	if (!hpsa_lockup_detector) {
-		spin_lock_init(&lockup_detector_lock);
-		hpsa_lockup_detector =
-			kthread_run(detect_controller_lockup_thread,
-						NULL, HPSA);
-	}
-	if (!hpsa_lockup_detector) {
-		dev_warn(&h->pdev->dev,
-			"Could not start lockup detector thread\n");
+	struct ctlr_info *h = container_of(to_delayed_work(work),
+					struct ctlr_info, monitor_ctlr_work);
+	detect_controller_lockup(h);
+	if (h->lockup_detected)
+		return;
+	spin_lock_irqsave(&h->lock, flags);
+	if (h->remove_in_progress) {
+		spin_unlock_irqrestore(&h->lock, flags);
 		return;
 	}
-	add_ctlr_to_lockup_detector_list(h);
-}
-
-static void stop_controller_lockup_detector(struct ctlr_info *h)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&lockup_detector_lock, flags);
-	remove_ctlr_from_lockup_detector_list(h);
-	/* If the list of ctlr's to monitor is empty, stop the thread */
-	if (list_empty(&hpsa_ctlr_list)) {
-		spin_unlock_irqrestore(&lockup_detector_lock, flags);
-		kthread_stop(hpsa_lockup_detector);
-		spin_lock_irqsave(&lockup_detector_lock, flags);
-		hpsa_lockup_detector = NULL;
-	}
-	spin_unlock_irqrestore(&lockup_detector_lock, flags);
+	schedule_delayed_work(&h->monitor_ctlr_work,
+				h->heartbeat_sample_interval);
+	spin_unlock_irqrestore(&h->lock, flags);
 }
 
 static int hpsa_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
@@ -4925,7 +4862,12 @@ reinit_after_soft_reset:
 
 	hpsa_hba_inquiry(h);
 	hpsa_register_scsi(h);	/* hook ourselves into SCSI subsystem */
-	start_controller_lockup_detector(h);
+
+	/* Monitor the controller for firmware lockups */
+	h->heartbeat_sample_interval = HEARTBEAT_SAMPLE_INTERVAL;
+	INIT_DELAYED_WORK(&h->monitor_ctlr_work, hpsa_monitor_ctlr_worker);
+	schedule_delayed_work(&h->monitor_ctlr_work,
+				h->heartbeat_sample_interval);
 	return 0;
 
 clean4:
@@ -4991,13 +4933,20 @@ static void hpsa_free_device_info(struct ctlr_info *h)
 static void hpsa_remove_one(struct pci_dev *pdev)
 {
 	struct ctlr_info *h;
+	unsigned long flags;
 
 	if (pci_get_drvdata(pdev) == NULL) {
 		dev_err(&pdev->dev, "unable to remove device\n");
 		return;
 	}
 	h = pci_get_drvdata(pdev);
-	stop_controller_lockup_detector(h);
+
+	/* Get rid of any controller monitoring work items */
+	spin_lock_irqsave(&h->lock, flags);
+	h->remove_in_progress = 1;
+	cancel_delayed_work(&h->monitor_ctlr_work);
+	spin_unlock_irqrestore(&h->lock, flags);
+
 	hpsa_unregister_scsi(h);	/* unhook from SCSI subsystem */
 	hpsa_shutdown(pdev);
 	iounmap(h->vaddr);
diff --git a/drivers/scsi/hpsa.h b/drivers/scsi/hpsa.h
index bc85e72..b48f1de 100644
--- a/drivers/scsi/hpsa.h
+++ b/drivers/scsi/hpsa.h
@@ -130,7 +130,8 @@ struct ctlr_info {
 	u32 heartbeat_sample_interval;
 	atomic_t firmware_flash_in_progress;
 	u32 lockup_detected;
-	struct list_head lockup_list;
+	struct delayed_work monitor_ctlr_work;
+	int remove_in_progress;
 	/* Address of h->q[x] is passed to intr handler to know which queue */
 	u8 q[MAX_REPLY_QUEUES];
 	u32 TMFSupportFlags; /* cache what task mgmt funcs are supported. */


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 02/11] hpsa: do not attempt to flush the cache on locked up controllers
  2013-11-07 16:45 [PATCH 00/11] hpsa: minor fixes and cleanups Stephen M. Cameron
  2013-11-07 16:45 ` [PATCH 01/11] hpsa: use workqueue instead of kernel thread for lockup detection Stephen M. Cameron
@ 2013-11-07 16:45 ` Stephen M. Cameron
  2013-11-07 16:45 ` [PATCH 03/11] hpsa: add 5 second delay after doorbell reset Stephen M. Cameron
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 23+ messages in thread
From: Stephen M. Cameron @ 2013-11-07 16:45 UTC (permalink / raw)
  To: james.bottomley; +Cc: stephenmcameron, mikem, thenzl, linux-scsi, scott.teel

From: Stephen M. Cameron <scameron@beardog.cce.hp.com>

There's no point in trying since it can't work, and if you do
try, it will just hang the system on shutdown.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
---
 drivers/scsi/hpsa.c |    9 +++++++++
 1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
index 1264b51..20fc598 100644
--- a/drivers/scsi/hpsa.c
+++ b/drivers/scsi/hpsa.c
@@ -4884,6 +4884,15 @@ static void hpsa_flush_cache(struct ctlr_info *h)
 {
 	char *flush_buf;
 	struct CommandList *c;
+	unsigned long flags;
+
+	/* Don't bother trying to flush the cache if locked up */
+	spin_lock_irqsave(&h->lock, flags);
+	if (unlikely(h->lockup_detected)) {
+		spin_unlock_irqrestore(&h->lock, flags);
+		return;
+	}
+	spin_unlock_irqrestore(&h->lock, flags);
 
 	flush_buf = kzalloc(4, GFP_KERNEL);
 	if (!flush_buf)


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 03/11] hpsa: add 5 second delay after doorbell reset
  2013-11-07 16:45 [PATCH 00/11] hpsa: minor fixes and cleanups Stephen M. Cameron
  2013-11-07 16:45 ` [PATCH 01/11] hpsa: use workqueue instead of kernel thread for lockup detection Stephen M. Cameron
  2013-11-07 16:45 ` [PATCH 02/11] hpsa: do not attempt to flush the cache on locked up controllers Stephen M. Cameron
@ 2013-11-07 16:45 ` Stephen M. Cameron
  2013-11-08 13:51   ` Tomas Henzl
  2013-11-07 16:45 ` [PATCH 04/11] hpsa: do not discard scsi status on aborted commands Stephen M. Cameron
                   ` (7 subsequent siblings)
  10 siblings, 1 reply; 23+ messages in thread
From: Stephen M. Cameron @ 2013-11-07 16:45 UTC (permalink / raw)
  To: james.bottomley; +Cc: stephenmcameron, mikem, thenzl, linux-scsi, scott.teel

From: Stephen M. Cameron <scameron@beardog.cce.hp.com>

The hardware guys tell us that after initiating a software
reset via the doorbell register we need to wait 5 seconds before
attempting to talk to the board *at all*.  This means that we
cannot watch the board to verify it transitions from "ready" to
to "not ready" then back "ready", since this transition will
most likely happen during those 5 seconds (though we can still
verify the reset happens by watching the "driver version" field
get cleared.)

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
---
 drivers/scsi/hpsa.c |   32 +++++++++++++++++++++++---------
 1 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
index 20fc598..fff5fd3 100644
--- a/drivers/scsi/hpsa.c
+++ b/drivers/scsi/hpsa.c
@@ -3781,6 +3781,13 @@ static int hpsa_controller_hard_reset(struct pci_dev *pdev,
 		 */
 		dev_info(&pdev->dev, "using doorbell to reset controller\n");
 		writel(use_doorbell, vaddr + SA5_DOORBELL);
+
+		/* PMC hardware guys tell us we need a 5 second delay after
+		 * doorbell reset and before any attempt to talk to the board
+		 * at all to ensure that this actually works and doesn't fall
+		 * over in some weird corner cases.
+		 */
+		msleep(5000);
 	} else { /* Try to do it the PCI power state way */
 
 		/* Quoting from the Open CISS Specification: "The Power
@@ -3977,15 +3984,22 @@ static int hpsa_kdump_hard_reset_controller(struct pci_dev *pdev)
 	   need a little pause here */
 	msleep(HPSA_POST_RESET_PAUSE_MSECS);
 
-	/* Wait for board to become not ready, then ready. */
-	dev_info(&pdev->dev, "Waiting for board to reset.\n");
-	rc = hpsa_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY);
-	if (rc) {
-		dev_warn(&pdev->dev,
-			"failed waiting for board to reset."
-			" Will try soft reset.\n");
-		rc = -ENOTSUPP; /* Not expected, but try soft reset later */
-		goto unmap_cfgtable;
+	if (!use_doorbell) {
+		/* Wait for board to become not ready, then ready.
+		 * (if we used the doorbell, then we already waited 5 secs
+		 * so the "not ready" state is already gone by so we
+		 * won't catch it.)
+		 */
+		dev_info(&pdev->dev, "Waiting for board to reset.\n");
+		rc = hpsa_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY);
+		if (rc) {
+			dev_warn(&pdev->dev,
+				"failed waiting for board to reset."
+				" Will try soft reset.\n");
+			/* Not expected, but try soft reset later */
+			rc = -ENOTSUPP;
+			goto unmap_cfgtable;
+		}
 	}
 	rc = hpsa_wait_for_board_state(pdev, vaddr, BOARD_READY);
 	if (rc) {


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 04/11] hpsa: do not discard scsi status on aborted commands
  2013-11-07 16:45 [PATCH 00/11] hpsa: minor fixes and cleanups Stephen M. Cameron
                   ` (2 preceding siblings ...)
  2013-11-07 16:45 ` [PATCH 03/11] hpsa: add 5 second delay after doorbell reset Stephen M. Cameron
@ 2013-11-07 16:45 ` Stephen M. Cameron
  2013-11-07 16:45 ` [PATCH 05/11] hpsa: remove unneeded include of seq_file.h Stephen M. Cameron
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 23+ messages in thread
From: Stephen M. Cameron @ 2013-11-07 16:45 UTC (permalink / raw)
  To: james.bottomley; +Cc: stephenmcameron, mikem, thenzl, linux-scsi, scott.teel

From: Stephen M. Cameron <scameron@beardog.cce.hp.com>

We inadvertantly discarded the scsi status for aborted commands.
For some commands (e.g. reads from tape drives) these can't be retried,
and if we discarded the scsi status, the scsi mid layer couldn't notice
anything was wrong and the error was not reported.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
Cc: stable@vger.kernel.org
---
 drivers/scsi/hpsa.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
index fff5fd3..6faa4fb 100644
--- a/drivers/scsi/hpsa.c
+++ b/drivers/scsi/hpsa.c
@@ -1285,7 +1285,7 @@ static void complete_scsi_command(struct CommandList *cp)
 					"has check condition: aborted command: "
 					"ASC: 0x%x, ASCQ: 0x%x\n",
 					cp, asc, ascq);
-				cmd->result = DID_SOFT_ERROR << 16;
+				cmd->result |= DID_SOFT_ERROR << 16;
 				break;
 			}
 			/* Must be some other type of check condition */


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 05/11] hpsa: remove unneeded include of seq_file.h
  2013-11-07 16:45 [PATCH 00/11] hpsa: minor fixes and cleanups Stephen M. Cameron
                   ` (3 preceding siblings ...)
  2013-11-07 16:45 ` [PATCH 04/11] hpsa: do not discard scsi status on aborted commands Stephen M. Cameron
@ 2013-11-07 16:45 ` Stephen M. Cameron
  2013-11-07 16:45 ` [PATCH 06/11] hpsa: fix memory leak in CCISS_BIG_PASSTHRU ioctl Stephen M. Cameron
                   ` (5 subsequent siblings)
  10 siblings, 0 replies; 23+ messages in thread
From: Stephen M. Cameron @ 2013-11-07 16:45 UTC (permalink / raw)
  To: james.bottomley; +Cc: stephenmcameron, mikem, thenzl, linux-scsi, scott.teel

From: Stephen M. Cameron <scameron@beardog.cce.hp.com>

Signed-off-by: Scott Teel <scott.teel@hp.com>
Acked-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
---
 drivers/scsi/hpsa.c |    1 -
 1 files changed, 0 insertions(+), 1 deletions(-)

diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
index 6faa4fb..b99a0a1 100644
--- a/drivers/scsi/hpsa.c
+++ b/drivers/scsi/hpsa.c
@@ -29,7 +29,6 @@
 #include <linux/delay.h>
 #include <linux/fs.h>
 #include <linux/timer.h>
-#include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/spinlock.h>
 #include <linux/compat.h>


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 06/11] hpsa: fix memory leak in CCISS_BIG_PASSTHRU ioctl
  2013-11-07 16:45 [PATCH 00/11] hpsa: minor fixes and cleanups Stephen M. Cameron
                   ` (4 preceding siblings ...)
  2013-11-07 16:45 ` [PATCH 05/11] hpsa: remove unneeded include of seq_file.h Stephen M. Cameron
@ 2013-11-07 16:45 ` Stephen M. Cameron
  2013-11-07 16:46 ` [PATCH 07/11] hpsa: add MSA 2040 to list of external target devices Stephen M. Cameron
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 23+ messages in thread
From: Stephen M. Cameron @ 2013-11-07 16:45 UTC (permalink / raw)
  To: james.bottomley; +Cc: stephenmcameron, mikem, thenzl, linux-scsi, scott.teel

From: Stephen M. Cameron <scameron@beardog.cce.hp.com>

We were leaking a command buffer if a DMA mapping error was
encountered in the CCISS_BIG_PASSTHRU ioctl.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
---
 drivers/scsi/hpsa.c |   11 +++++------
 1 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
index b99a0a1..752d234 100644
--- a/drivers/scsi/hpsa.c
+++ b/drivers/scsi/hpsa.c
@@ -3166,7 +3166,7 @@ static int hpsa_big_passthru_ioctl(struct ctlr_info *h, void __user *argp)
 				hpsa_pci_unmap(h->pdev, c, i,
 					PCI_DMA_BIDIRECTIONAL);
 				status = -ENOMEM;
-				goto cleanup1;
+				goto cleanup0;
 			}
 			c->SG[i].Addr.lower = temp64.val32.lower;
 			c->SG[i].Addr.upper = temp64.val32.upper;
@@ -3182,24 +3182,23 @@ static int hpsa_big_passthru_ioctl(struct ctlr_info *h, void __user *argp)
 	/* Copy the error information out */
 	memcpy(&ioc->error_info, c->err_info, sizeof(ioc->error_info));
 	if (copy_to_user(argp, ioc, sizeof(*ioc))) {
-		cmd_special_free(h, c);
 		status = -EFAULT;
-		goto cleanup1;
+		goto cleanup0;
 	}
 	if (ioc->Request.Type.Direction == XFER_READ && ioc->buf_size > 0) {
 		/* Copy the data out of the buffer we created */
 		BYTE __user *ptr = ioc->buf;
 		for (i = 0; i < sg_used; i++) {
 			if (copy_to_user(ptr, buff[i], buff_size[i])) {
-				cmd_special_free(h, c);
 				status = -EFAULT;
-				goto cleanup1;
+				goto cleanup0;
 			}
 			ptr += buff_size[i];
 		}
 	}
-	cmd_special_free(h, c);
 	status = 0;
+cleanup0:
+	cmd_special_free(h, c);
 cleanup1:
 	if (buff) {
 		for (i = 0; i < sg_used; i++)


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 07/11] hpsa: add MSA 2040 to list of external target devices
  2013-11-07 16:45 [PATCH 00/11] hpsa: minor fixes and cleanups Stephen M. Cameron
                   ` (5 preceding siblings ...)
  2013-11-07 16:45 ` [PATCH 06/11] hpsa: fix memory leak in CCISS_BIG_PASSTHRU ioctl Stephen M. Cameron
@ 2013-11-07 16:46 ` Stephen M. Cameron
  2013-11-07 16:46 ` [PATCH 08/11] hpsa: cap CCISS_PASSTHRU at 20 concurrent commands Stephen M. Cameron
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 23+ messages in thread
From: Stephen M. Cameron @ 2013-11-07 16:46 UTC (permalink / raw)
  To: james.bottomley; +Cc: stephenmcameron, mikem, thenzl, linux-scsi, scott.teel

From: Stephen M. Cameron <scameron@beardog.cce.hp.com>

Signed-off-by: Scott Teel <scott.teel@hp.com>
Acked-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
---
 drivers/scsi/hpsa.c |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
index 752d234..fdc2228 100644
--- a/drivers/scsi/hpsa.c
+++ b/drivers/scsi/hpsa.c
@@ -1778,6 +1778,7 @@ static unsigned char *ext_target_model[] = {
 	"MSA2312",
 	"MSA2324",
 	"P2000 G3 SAS",
+	"MSA 2040 SAS",
 	NULL,
 };
 


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 08/11] hpsa: cap CCISS_PASSTHRU at 20 concurrent commands.
  2013-11-07 16:45 [PATCH 00/11] hpsa: minor fixes and cleanups Stephen M. Cameron
                   ` (6 preceding siblings ...)
  2013-11-07 16:46 ` [PATCH 07/11] hpsa: add MSA 2040 to list of external target devices Stephen M. Cameron
@ 2013-11-07 16:46 ` Stephen M. Cameron
  2013-11-07 16:46 ` [PATCH 09/11] hpsa: prevent stalled i/o Stephen M. Cameron
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 23+ messages in thread
From: Stephen M. Cameron @ 2013-11-07 16:46 UTC (permalink / raw)
  To: james.bottomley; +Cc: stephenmcameron, mikem, thenzl, linux-scsi, scott.teel

From: Stephen M. Cameron <scameron@beardog.cce.hp.com>

Cap CCISS_BIG_PASSTHRU as well.  If an attempt is made
to exceed this, ioctl() will return -1 with errno == EAGAIN.

This is to prevent a userland program from exhausting all of
pci_alloc_consistent memory.  I've only seen this problem when
running a special test program designed to provoke it.  20
concurrent commands via the passthru ioctls (not counting SG_IO)
should be more than enough.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
---
 drivers/scsi/hpsa.c |   44 ++++++++++++++++++++++++++++++++++++++++++--
 drivers/scsi/hpsa.h |    5 +++++
 2 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
index fdc2228..ebb7144 100644
--- a/drivers/scsi/hpsa.c
+++ b/drivers/scsi/hpsa.c
@@ -3218,6 +3218,36 @@ static void check_ioctl_unit_attention(struct ctlr_info *h,
 			c->err_info->ScsiStatus != SAM_STAT_CHECK_CONDITION)
 		(void) check_for_unit_attention(h, c);
 }
+
+static int increment_passthru_count(struct ctlr_info *h)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&h->passthru_count_lock, flags);
+	if (h->passthru_count >= HPSA_MAX_CONCURRENT_PASSTHRUS) {
+		spin_unlock_irqrestore(&h->passthru_count_lock, flags);
+		return -1;
+	}
+	h->passthru_count++;
+	spin_unlock_irqrestore(&h->passthru_count_lock, flags);
+	return 0;
+}
+
+static void decrement_passthru_count(struct ctlr_info *h)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&h->passthru_count_lock, flags);
+	if (h->passthru_count <= 0) {
+		spin_unlock_irqrestore(&h->passthru_count_lock, flags);
+		/* not expecting to get here. */
+		dev_warn(&h->pdev->dev, "Bug detected, passthru_count seems to be incorrect.\n");
+		return;
+	}
+	h->passthru_count--;
+	spin_unlock_irqrestore(&h->passthru_count_lock, flags);
+}
+
 /*
  * ioctl
  */
@@ -3225,6 +3255,7 @@ static int hpsa_ioctl(struct scsi_device *dev, int cmd, void *arg)
 {
 	struct ctlr_info *h;
 	void __user *argp = (void __user *)arg;
+	int rc;
 
 	h = sdev_to_hba(dev);
 
@@ -3239,9 +3270,17 @@ static int hpsa_ioctl(struct scsi_device *dev, int cmd, void *arg)
 	case CCISS_GETDRIVVER:
 		return hpsa_getdrivver_ioctl(h, argp);
 	case CCISS_PASSTHRU:
-		return hpsa_passthru_ioctl(h, argp);
+		if (increment_passthru_count(h))
+			return -EAGAIN;
+		rc = hpsa_passthru_ioctl(h, argp);
+		decrement_passthru_count(h);
+		return rc;
 	case CCISS_BIG_PASSTHRU:
-		return hpsa_big_passthru_ioctl(h, argp);
+		if (increment_passthru_count(h))
+			return -EAGAIN;
+		rc = hpsa_big_passthru_ioctl(h, argp);
+		decrement_passthru_count(h);
+		return rc;
 	default:
 		return -ENOTTY;
 	}
@@ -4772,6 +4811,7 @@ reinit_after_soft_reset:
 	INIT_LIST_HEAD(&h->reqQ);
 	spin_lock_init(&h->lock);
 	spin_lock_init(&h->scan_lock);
+	spin_lock_init(&h->passthru_count_lock);
 	rc = hpsa_pci_init(h);
 	if (rc != 0)
 		goto clean1;
diff --git a/drivers/scsi/hpsa.h b/drivers/scsi/hpsa.h
index b48f1de..fd9910a 100644
--- a/drivers/scsi/hpsa.h
+++ b/drivers/scsi/hpsa.h
@@ -114,6 +114,11 @@ struct ctlr_info {
 	struct TransTable_struct *transtable;
 	unsigned long transMethod;
 
+	/* cap concurrent passthrus at some reasonable maximum */
+#define HPSA_MAX_CONCURRENT_PASSTHRUS (20)
+	spinlock_t passthru_count_lock; /* protects passthru_count */
+	int passthru_count;
+
 	/*
 	 * Performant mode completion buffers
 	 */


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 09/11] hpsa: prevent stalled i/o
  2013-11-07 16:45 [PATCH 00/11] hpsa: minor fixes and cleanups Stephen M. Cameron
                   ` (7 preceding siblings ...)
  2013-11-07 16:46 ` [PATCH 08/11] hpsa: cap CCISS_PASSTHRU at 20 concurrent commands Stephen M. Cameron
@ 2013-11-07 16:46 ` Stephen M. Cameron
  2013-11-07 16:46 ` [PATCH 10/11] hpsa: rename scsi prefetch field Stephen M. Cameron
  2013-11-07 16:46 ` [PATCH 11/11] hpsa: enable unit attention reporting Stephen M. Cameron
  10 siblings, 0 replies; 23+ messages in thread
From: Stephen M. Cameron @ 2013-11-07 16:46 UTC (permalink / raw)
  To: james.bottomley; +Cc: stephenmcameron, mikem, thenzl, linux-scsi, scott.teel

From: Stephen M. Cameron <scameron@beardog.cce.hp.com>

If a fifo full condition is encountered, i/o requests will stack
up in the h->reqQ queue.  The only thing which empties this queue
is start_io, which only gets called when new i/o requests come in.
If none are forthcoming, i/o in h->reqQ will be stalled.

To fix this, whenever fifo full condition is encountered, this
is recorded, and the interrupt handler examines this to see
if a fifo full condition was recently encountered when a
command completes and will call start_io to prevent i/o's in
h->reqQ from getting stuck.

I've only ever seen this problem occur when running specialized
test programs that pound on the the CCISS_PASSTHRU ioctl.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
---
 drivers/scsi/hpsa.c |   32 ++++++++++++++++++++++++++++++--
 drivers/scsi/hpsa.h |    1 +
 2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
index ebb7144..7256b29 100644
--- a/drivers/scsi/hpsa.c
+++ b/drivers/scsi/hpsa.c
@@ -3479,9 +3479,11 @@ static void start_io(struct ctlr_info *h)
 		c = list_entry(h->reqQ.next, struct CommandList, list);
 		/* can't do anything if fifo is full */
 		if ((h->access.fifo_full(h))) {
+			h->fifo_recently_full = 1;
 			dev_warn(&h->pdev->dev, "fifo full\n");
 			break;
 		}
+		h->fifo_recently_full = 0;
 
 		/* Get the first entry from the Request Q */
 		removeQ(c);
@@ -3535,15 +3537,41 @@ static inline int bad_tag(struct ctlr_info *h, u32 tag_index,
 static inline void finish_cmd(struct CommandList *c)
 {
 	unsigned long flags;
+	int io_may_be_stalled = 0;
+	struct ctlr_info *h = c->h;
 
-	spin_lock_irqsave(&c->h->lock, flags);
+	spin_lock_irqsave(&h->lock, flags);
 	removeQ(c);
-	spin_unlock_irqrestore(&c->h->lock, flags);
+
+	/*
+	 * Check for possibly stalled i/o.
+	 *
+	 * If a fifo_full condition is encountered, requests will back up
+	 * in h->reqQ.  This queue is only emptied out by start_io which is
+	 * only called when a new i/o request comes in.  If no i/o's are
+	 * forthcoming, the i/o's in h->reqQ can get stuck.  So we call
+	 * start_io from here if we detect such a danger.
+	 *
+	 * Normally, we shouldn't hit this case, but pounding on the
+	 * CCISS_PASSTHRU ioctl can provoke it.  Only call start_io if
+	 * commands_outstanding is low.  We want to avoid calling
+	 * start_io from in here as much as possible, and esp. don't
+	 * want to get in a cycle where we call start_io every time
+	 * through here.
+	 */
+	if (unlikely(h->fifo_recently_full) &&
+		h->commands_outstanding < 5)
+		io_may_be_stalled = 1;
+
+	spin_unlock_irqrestore(&h->lock, flags);
+
 	dial_up_lockup_detection_on_fw_flash_complete(c->h, c);
 	if (likely(c->cmd_type == CMD_SCSI))
 		complete_scsi_command(c);
 	else if (c->cmd_type == CMD_IOCTL_PEND)
 		complete(c->waiting);
+	if (unlikely(io_may_be_stalled))
+		start_io(h);
 }
 
 static inline u32 hpsa_tag_contains_index(u32 tag)
diff --git a/drivers/scsi/hpsa.h b/drivers/scsi/hpsa.h
index fd9910a..01c3283 100644
--- a/drivers/scsi/hpsa.h
+++ b/drivers/scsi/hpsa.h
@@ -137,6 +137,7 @@ struct ctlr_info {
 	u32 lockup_detected;
 	struct delayed_work monitor_ctlr_work;
 	int remove_in_progress;
+	u32 fifo_recently_full;
 	/* Address of h->q[x] is passed to intr handler to know which queue */
 	u8 q[MAX_REPLY_QUEUES];
 	u32 TMFSupportFlags; /* cache what task mgmt funcs are supported. */


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 10/11] hpsa: rename scsi prefetch field
  2013-11-07 16:45 [PATCH 00/11] hpsa: minor fixes and cleanups Stephen M. Cameron
                   ` (8 preceding siblings ...)
  2013-11-07 16:46 ` [PATCH 09/11] hpsa: prevent stalled i/o Stephen M. Cameron
@ 2013-11-07 16:46 ` Stephen M. Cameron
  2013-11-07 16:46 ` [PATCH 11/11] hpsa: enable unit attention reporting Stephen M. Cameron
  10 siblings, 0 replies; 23+ messages in thread
From: Stephen M. Cameron @ 2013-11-07 16:46 UTC (permalink / raw)
  To: james.bottomley; +Cc: stephenmcameron, mikem, thenzl, linux-scsi, scott.teel

From: Stephen M. Cameron <scameron@beardog.cce.hp.com>

The field contains more bits than just the one
to indicate whether scsi prefetch should be turned on.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
---
 drivers/scsi/hpsa.c     |   14 +++++++-------
 drivers/scsi/hpsa_cmd.h |    3 ++-
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
index 7256b29..542d4cb 100644
--- a/drivers/scsi/hpsa.c
+++ b/drivers/scsi/hpsa.c
@@ -4384,15 +4384,15 @@ static inline bool hpsa_CISS_signature_present(struct ctlr_info *h)
 	return true;
 }
 
-/* Need to enable prefetch in the SCSI core for 6400 in x86 */
-static inline void hpsa_enable_scsi_prefetch(struct ctlr_info *h)
+static inline void hpsa_set_driver_support_bits(struct ctlr_info *h)
 {
 #ifdef CONFIG_X86
-	u32 prefetch;
+	/* Need to enable prefetch in the SCSI core for 6400 in x86 */
+	u32 driver_support;
 
-	prefetch = readl(&(h->cfgtable->SCSI_Prefetch));
-	prefetch |= 0x100;
-	writel(prefetch, &(h->cfgtable->SCSI_Prefetch));
+	driver_support = readl(&(h->cfgtable->driver_support));
+	driver_support |= ENABLE_SCSI_PREFETCH;
+	writel(driver_support, &(h->cfgtable->driver_support));
 #endif
 }
 
@@ -4503,7 +4503,7 @@ static int hpsa_pci_init(struct ctlr_info *h)
 		err = -ENODEV;
 		goto err_out_free_res;
 	}
-	hpsa_enable_scsi_prefetch(h);
+	hpsa_set_driver_support_bits(h);
 	hpsa_p600_dma_prefetch_quirk(h);
 	err = hpsa_enter_simple_mode(h);
 	if (err)
diff --git a/drivers/scsi/hpsa_cmd.h b/drivers/scsi/hpsa_cmd.h
index a894f2e..5158709 100644
--- a/drivers/scsi/hpsa_cmd.h
+++ b/drivers/scsi/hpsa_cmd.h
@@ -356,7 +356,8 @@ struct CfgTable {
 	u32           TransMethodOffset;
 	u8            ServerName[16];
 	u32           HeartBeat;
-	u32           SCSI_Prefetch;
+	u32           driver_support;
+#define			ENABLE_SCSI_PREFETCH 0x100
 	u32	 	MaxScatterGatherElements;
 	u32		MaxLogicalUnits;
 	u32		MaxPhysicalDevices;


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 11/11] hpsa: enable unit attention reporting
  2013-11-07 16:45 [PATCH 00/11] hpsa: minor fixes and cleanups Stephen M. Cameron
                   ` (9 preceding siblings ...)
  2013-11-07 16:46 ` [PATCH 10/11] hpsa: rename scsi prefetch field Stephen M. Cameron
@ 2013-11-07 16:46 ` Stephen M. Cameron
  10 siblings, 0 replies; 23+ messages in thread
From: Stephen M. Cameron @ 2013-11-07 16:46 UTC (permalink / raw)
  To: james.bottomley; +Cc: stephenmcameron, mikem, thenzl, linux-scsi, scott.teel

From: Stephen M. Cameron <scameron@beardog.cce.hp.com>

This used to be the default, but at some point the firmware guys
changed the default and I failed to notice.  Now to get unit
attention notifications, you must twiddle a bit indicating you
want them.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
---
 drivers/scsi/hpsa.c     |    7 ++++---
 drivers/scsi/hpsa_cmd.h |    1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
index 542d4cb..be9ea76 100644
--- a/drivers/scsi/hpsa.c
+++ b/drivers/scsi/hpsa.c
@@ -4386,14 +4386,15 @@ static inline bool hpsa_CISS_signature_present(struct ctlr_info *h)
 
 static inline void hpsa_set_driver_support_bits(struct ctlr_info *h)
 {
-#ifdef CONFIG_X86
-	/* Need to enable prefetch in the SCSI core for 6400 in x86 */
 	u32 driver_support;
 
+#ifdef CONFIG_X86
+	/* Need to enable prefetch in the SCSI core for 6400 in x86 */
 	driver_support = readl(&(h->cfgtable->driver_support));
 	driver_support |= ENABLE_SCSI_PREFETCH;
-	writel(driver_support, &(h->cfgtable->driver_support));
 #endif
+	driver_support |= ENABLE_UNIT_ATTN;
+	writel(driver_support, &(h->cfgtable->driver_support));
 }
 
 /* Disable DMA prefetch for the P600.  Otherwise an ASIC bug may result
diff --git a/drivers/scsi/hpsa_cmd.h b/drivers/scsi/hpsa_cmd.h
index 5158709..bfc8c4e 100644
--- a/drivers/scsi/hpsa_cmd.h
+++ b/drivers/scsi/hpsa_cmd.h
@@ -358,6 +358,7 @@ struct CfgTable {
 	u32           HeartBeat;
 	u32           driver_support;
 #define			ENABLE_SCSI_PREFETCH 0x100
+#define			ENABLE_UNIT_ATTN 0x01
 	u32	 	MaxScatterGatherElements;
 	u32		MaxLogicalUnits;
 	u32		MaxPhysicalDevices;


^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH 03/11] hpsa: add 5 second delay after doorbell reset
  2013-11-07 16:45 ` [PATCH 03/11] hpsa: add 5 second delay after doorbell reset Stephen M. Cameron
@ 2013-11-08 13:51   ` Tomas Henzl
  2013-11-08 14:44     ` scameron
  0 siblings, 1 reply; 23+ messages in thread
From: Tomas Henzl @ 2013-11-08 13:51 UTC (permalink / raw)
  To: Stephen M. Cameron, james.bottomley
  Cc: stephenmcameron, mikem, linux-scsi, scott.teel

On 11/07/2013 05:45 PM, Stephen M. Cameron wrote:
> From: Stephen M. Cameron <scameron@beardog.cce.hp.com>
>
> The hardware guys tell us that after initiating a software
> reset via the doorbell register we need to wait 5 seconds before
> attempting to talk to the board *at all*.  This means that we
> cannot watch the board to verify it transitions from "ready" to
> to "not ready" then back "ready", since this transition will
> most likely happen during those 5 seconds (though we can still
> verify the reset happens by watching the "driver version" field
> get cleared.)
>
> Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
> ---
>  drivers/scsi/hpsa.c |   32 +++++++++++++++++++++++---------
>  1 files changed, 23 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
> index 20fc598..fff5fd3 100644
> --- a/drivers/scsi/hpsa.c
> +++ b/drivers/scsi/hpsa.c
> @@ -3781,6 +3781,13 @@ static int hpsa_controller_hard_reset(struct pci_dev *pdev,
>  		 */
>  		dev_info(&pdev->dev, "using doorbell to reset controller\n");
>  		writel(use_doorbell, vaddr + SA5_DOORBELL);
> +
> +		/* PMC hardware guys tell us we need a 5 second delay after
> +		 * doorbell reset and before any attempt to talk to the board
> +		 * at all to ensure that this actually works and doesn't fall
> +		 * over in some weird corner cases.
> +		 */
> +		msleep(5000);
>  	} else { /* Try to do it the PCI power state way */
>  
>  		/* Quoting from the Open CISS Specification: "The Power
> @@ -3977,15 +3984,22 @@ static int hpsa_kdump_hard_reset_controller(struct pci_dev *pdev)
>  	   need a little pause here */
>  	msleep(HPSA_POST_RESET_PAUSE_MSECS);

I know it's complicated with a lot of different devices and fw versions,
but here^ we wait for 3sec - isn't the method - wait for 3s then wait for board not ready
a bit fragile, what if a board comes up faster?
When the method "watching the "driver version"" works why don't you want to use it  
regardless of the reset method used?

>  
> -	/* Wait for board to become not ready, then ready. */
> -	dev_info(&pdev->dev, "Waiting for board to reset.\n");
> -	rc = hpsa_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY);
> -	if (rc) {
> -		dev_warn(&pdev->dev,
> -			"failed waiting for board to reset."
> -			" Will try soft reset.\n");
> -		rc = -ENOTSUPP; /* Not expected, but try soft reset later */
> -		goto unmap_cfgtable;
> +	if (!use_doorbell) {
> +		/* Wait for board to become not ready, then ready.
> +		 * (if we used the doorbell, then we already waited 5 secs
> +		 * so the "not ready" state is already gone by so we
> +		 * won't catch it.)
> +		 */
> +		dev_info(&pdev->dev, "Waiting for board to reset.\n");
> +		rc = hpsa_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY);
> +		if (rc) {
> +			dev_warn(&pdev->dev,
> +				"failed waiting for board to reset."
> +				" Will try soft reset.\n");
> +			/* Not expected, but try soft reset later */
> +			rc = -ENOTSUPP;
> +			goto unmap_cfgtable;
> +		}
>  	}
>  	rc = hpsa_wait_for_board_state(pdev, vaddr, BOARD_READY);
>  	if (rc) {
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 03/11] hpsa: add 5 second delay after doorbell reset
  2013-11-08 13:51   ` Tomas Henzl
@ 2013-11-08 14:44     ` scameron
  2013-11-08 15:02       ` Tomas Henzl
  0 siblings, 1 reply; 23+ messages in thread
From: scameron @ 2013-11-08 14:44 UTC (permalink / raw)
  To: Tomas Henzl
  Cc: james.bottomley, stephenmcameron, mikem, linux-scsi, scott.teel,
	scameron

On Fri, Nov 08, 2013 at 02:51:37PM +0100, Tomas Henzl wrote:
> On 11/07/2013 05:45 PM, Stephen M. Cameron wrote:
> > From: Stephen M. Cameron <scameron@beardog.cce.hp.com>
> >
> > The hardware guys tell us that after initiating a software
> > reset via the doorbell register we need to wait 5 seconds before
> > attempting to talk to the board *at all*.  This means that we
> > cannot watch the board to verify it transitions from "ready" to
> > to "not ready" then back "ready", since this transition will
> > most likely happen during those 5 seconds (though we can still
> > verify the reset happens by watching the "driver version" field
> > get cleared.)
> >
> > Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
> > ---
> >  drivers/scsi/hpsa.c |   32 +++++++++++++++++++++++---------
> >  1 files changed, 23 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
> > index 20fc598..fff5fd3 100644
> > --- a/drivers/scsi/hpsa.c
> > +++ b/drivers/scsi/hpsa.c
> > @@ -3781,6 +3781,13 @@ static int hpsa_controller_hard_reset(struct pci_dev *pdev,
> >  		 */
> >  		dev_info(&pdev->dev, "using doorbell to reset controller\n");
> >  		writel(use_doorbell, vaddr + SA5_DOORBELL);
> > +
> > +		/* PMC hardware guys tell us we need a 5 second delay after
> > +		 * doorbell reset and before any attempt to talk to the board
> > +		 * at all to ensure that this actually works and doesn't fall
> > +		 * over in some weird corner cases.
> > +		 */
> > +		msleep(5000);
> >  	} else { /* Try to do it the PCI power state way */
> >  
> >  		/* Quoting from the Open CISS Specification: "The Power
> > @@ -3977,15 +3984,22 @@ static int hpsa_kdump_hard_reset_controller(struct pci_dev *pdev)
> >  	   need a little pause here */
> >  	msleep(HPSA_POST_RESET_PAUSE_MSECS);
> 
> I know it's complicated with a lot of different devices and fw versions,
> but here^ we wait for 3sec - isn't the method - wait for 3s then wait for board not ready
> a bit fragile, what if a board comes up faster?
> When the method "watching the "driver version"" works why don't you want to use it  
> regardless of the reset method used?

The "watching the driver version" thing is only there to catch if
the firmware guys break things and turn the reset into a no-op
(which happened with the PCI power manaegment based reset and we
didn't catch it for a year or so because we didn't have that check)

We aren't supposed to look at the driver version field (or anything)
until we first verify the scratch pad register says the firmware is
ready.  In the case of those boards that use the "doorbell" reset,
we aren't supposed to look at *anything* for the first five seconds.

I have been bugging the firmware/hardware guys for a sane reset
procedure that actually works reliably for years with no luck.

For the SCSI over PCIe driver, being tired of this crap, I simply
unconditionally reset the device on driver load every single time,
and did this from the beginning.  This kind of forced the firmware
and hardware guys to make the reset on that thing work reliably
and quickly, and since I did that from the earliest days, they didn't
have a chance to screw it up without it being caught immediately.
For Smart Array, obviously it's too late for that approach.

-- steve

> 
> >  
> > -	/* Wait for board to become not ready, then ready. */
> > -	dev_info(&pdev->dev, "Waiting for board to reset.\n");
> > -	rc = hpsa_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY);
> > -	if (rc) {
> > -		dev_warn(&pdev->dev,
> > -			"failed waiting for board to reset."
> > -			" Will try soft reset.\n");
> > -		rc = -ENOTSUPP; /* Not expected, but try soft reset later */
> > -		goto unmap_cfgtable;
> > +	if (!use_doorbell) {
> > +		/* Wait for board to become not ready, then ready.
> > +		 * (if we used the doorbell, then we already waited 5 secs
> > +		 * so the "not ready" state is already gone by so we
> > +		 * won't catch it.)
> > +		 */
> > +		dev_info(&pdev->dev, "Waiting for board to reset.\n");
> > +		rc = hpsa_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY);
> > +		if (rc) {
> > +			dev_warn(&pdev->dev,
> > +				"failed waiting for board to reset."
> > +				" Will try soft reset.\n");
> > +			/* Not expected, but try soft reset later */
> > +			rc = -ENOTSUPP;
> > +			goto unmap_cfgtable;
> > +		}
> >  	}
> >  	rc = hpsa_wait_for_board_state(pdev, vaddr, BOARD_READY);
> >  	if (rc) {
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 03/11] hpsa: add 5 second delay after doorbell reset
  2013-11-08 14:44     ` scameron
@ 2013-11-08 15:02       ` Tomas Henzl
  2013-11-08 15:31         ` scameron
  0 siblings, 1 reply; 23+ messages in thread
From: Tomas Henzl @ 2013-11-08 15:02 UTC (permalink / raw)
  To: scameron; +Cc: james.bottomley, stephenmcameron, mikem, linux-scsi, scott.teel

On 11/08/2013 03:44 PM, scameron@beardog.cce.hp.com wrote:
> On Fri, Nov 08, 2013 at 02:51:37PM +0100, Tomas Henzl wrote:
>> On 11/07/2013 05:45 PM, Stephen M. Cameron wrote:
>>> From: Stephen M. Cameron <scameron@beardog.cce.hp.com>
>>>
>>> The hardware guys tell us that after initiating a software
>>> reset via the doorbell register we need to wait 5 seconds before
>>> attempting to talk to the board *at all*.  This means that we
>>> cannot watch the board to verify it transitions from "ready" to
>>> to "not ready" then back "ready", since this transition will
>>> most likely happen during those 5 seconds (though we can still
>>> verify the reset happens by watching the "driver version" field
>>> get cleared.)
>>>
>>> Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
>>> ---
>>>  drivers/scsi/hpsa.c |   32 +++++++++++++++++++++++---------
>>>  1 files changed, 23 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
>>> index 20fc598..fff5fd3 100644
>>> --- a/drivers/scsi/hpsa.c
>>> +++ b/drivers/scsi/hpsa.c
>>> @@ -3781,6 +3781,13 @@ static int hpsa_controller_hard_reset(struct pci_dev *pdev,
>>>  		 */
>>>  		dev_info(&pdev->dev, "using doorbell to reset controller\n");
>>>  		writel(use_doorbell, vaddr + SA5_DOORBELL);
>>> +
>>> +		/* PMC hardware guys tell us we need a 5 second delay after
>>> +		 * doorbell reset and before any attempt to talk to the board
>>> +		 * at all to ensure that this actually works and doesn't fall
>>> +		 * over in some weird corner cases.
>>> +		 */
>>> +		msleep(5000);
>>>  	} else { /* Try to do it the PCI power state way */
>>>  
>>>  		/* Quoting from the Open CISS Specification: "The Power
>>> @@ -3977,15 +3984,22 @@ static int hpsa_kdump_hard_reset_controller(struct pci_dev *pdev)
>>>  	   need a little pause here */
>>>  	msleep(HPSA_POST_RESET_PAUSE_MSECS);
>> I know it's complicated with a lot of different devices and fw versions,
>> but here^ we wait for 3sec - isn't the method - wait for 3s then wait for board not ready
>> a bit fragile, what if a board comes up faster?
>> When the method "watching the "driver version"" works why don't you want to use it  
>> regardless of the reset method used?
> The "watching the driver version" thing is only there to catch if
> the firmware guys break things and turn the reset into a no-op
> (which happened with the PCI power manaegment based reset and we
> didn't catch it for a year or so because we didn't have that check)
>
> We aren't supposed to look at the driver version field (or anything)
> until we first verify the scratch pad register says the firmware is
> ready.  In the case of those boards that use the "doorbell" reset,
> we aren't supposed to look at *anything* for the first five seconds.
>
> I have been bugging the firmware/hardware guys for a sane reset
> procedure that actually works reliably for years with no luck.
>
> For the SCSI over PCIe driver, being tired of this crap, I simply
> unconditionally reset the device on driver load every single time,
> and did this from the beginning.  This kind of forced the firmware
> and hardware guys to make the reset on that thing work reliably
> and quickly, and since I did that from the earliest days, they didn't
> have a chance to screw it up without it being caught immediately.
> For Smart Array, obviously it's too late for that approach.

OK, my question was more or less if this:
msleep(HPSA_POST_RESET_PAUSE_MSECS);
just before waiting for the board to enter BOARD_NOT_READY state
isn't dangerous - when the board enters a ready state in the first 3sec
it will wait indefinitely for the not_ready state
thus whether the test for not ready state shouldn't be removed.
The mechanism now works somehow and maybe it's better
not to touch it, I just wanted to draw your attention to that
potential problem.


>
> -- steve
>
>>>  
>>> -	/* Wait for board to become not ready, then ready. */
>>> -	dev_info(&pdev->dev, "Waiting for board to reset.\n");
>>> -	rc = hpsa_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY);
>>> -	if (rc) {
>>> -		dev_warn(&pdev->dev,
>>> -			"failed waiting for board to reset."
>>> -			" Will try soft reset.\n");
>>> -		rc = -ENOTSUPP; /* Not expected, but try soft reset later */
>>> -		goto unmap_cfgtable;
>>> +	if (!use_doorbell) {
>>> +		/* Wait for board to become not ready, then ready.
>>> +		 * (if we used the doorbell, then we already waited 5 secs
>>> +		 * so the "not ready" state is already gone by so we
>>> +		 * won't catch it.)
>>> +		 */
>>> +		dev_info(&pdev->dev, "Waiting for board to reset.\n");
>>> +		rc = hpsa_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY);
>>> +		if (rc) {
>>> +			dev_warn(&pdev->dev,
>>> +				"failed waiting for board to reset."
>>> +				" Will try soft reset.\n");
>>> +			/* Not expected, but try soft reset later */
>>> +			rc = -ENOTSUPP;
>>> +			goto unmap_cfgtable;
>>> +		}
>>>  	}
>>>  	rc = hpsa_wait_for_board_state(pdev, vaddr, BOARD_READY);
>>>  	if (rc) {
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 03/11] hpsa: add 5 second delay after doorbell reset
  2013-11-08 15:02       ` Tomas Henzl
@ 2013-11-08 15:31         ` scameron
  2013-12-01  0:42           ` James Bottomley
  0 siblings, 1 reply; 23+ messages in thread
From: scameron @ 2013-11-08 15:31 UTC (permalink / raw)
  To: Tomas Henzl
  Cc: james.bottomley, stephenmcameron, mikem, linux-scsi, scott.teel,
	scameron

On Fri, Nov 08, 2013 at 04:02:20PM +0100, Tomas Henzl wrote:
> On 11/08/2013 03:44 PM, scameron@beardog.cce.hp.com wrote:
> > On Fri, Nov 08, 2013 at 02:51:37PM +0100, Tomas Henzl wrote:
> >> On 11/07/2013 05:45 PM, Stephen M. Cameron wrote:
> >>> From: Stephen M. Cameron <scameron@beardog.cce.hp.com>
> >>>
> >>> The hardware guys tell us that after initiating a software
> >>> reset via the doorbell register we need to wait 5 seconds before
> >>> attempting to talk to the board *at all*.  This means that we
> >>> cannot watch the board to verify it transitions from "ready" to
> >>> to "not ready" then back "ready", since this transition will
> >>> most likely happen during those 5 seconds (though we can still
> >>> verify the reset happens by watching the "driver version" field
> >>> get cleared.)
> >>>
> >>> Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
> >>> ---
> >>>  drivers/scsi/hpsa.c |   32 +++++++++++++++++++++++---------
> >>>  1 files changed, 23 insertions(+), 9 deletions(-)
> >>>
> >>> diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
> >>> index 20fc598..fff5fd3 100644
> >>> --- a/drivers/scsi/hpsa.c
> >>> +++ b/drivers/scsi/hpsa.c
> >>> @@ -3781,6 +3781,13 @@ static int hpsa_controller_hard_reset(struct pci_dev *pdev,
> >>>  		 */
> >>>  		dev_info(&pdev->dev, "using doorbell to reset controller\n");
> >>>  		writel(use_doorbell, vaddr + SA5_DOORBELL);
> >>> +
> >>> +		/* PMC hardware guys tell us we need a 5 second delay after
> >>> +		 * doorbell reset and before any attempt to talk to the board
> >>> +		 * at all to ensure that this actually works and doesn't fall
> >>> +		 * over in some weird corner cases.
> >>> +		 */
> >>> +		msleep(5000);
> >>>  	} else { /* Try to do it the PCI power state way */
> >>>  
> >>>  		/* Quoting from the Open CISS Specification: "The Power
> >>> @@ -3977,15 +3984,22 @@ static int hpsa_kdump_hard_reset_controller(struct pci_dev *pdev)
> >>>  	   need a little pause here */
> >>>  	msleep(HPSA_POST_RESET_PAUSE_MSECS);
> >> I know it's complicated with a lot of different devices and fw versions,
> >> but here^ we wait for 3sec - isn't the method - wait for 3s then wait for board not ready
> >> a bit fragile, what if a board comes up faster?
> >> When the method "watching the "driver version"" works why don't you want to use it  
> >> regardless of the reset method used?
> > The "watching the driver version" thing is only there to catch if
> > the firmware guys break things and turn the reset into a no-op
> > (which happened with the PCI power manaegment based reset and we
> > didn't catch it for a year or so because we didn't have that check)
> >
> > We aren't supposed to look at the driver version field (or anything)
> > until we first verify the scratch pad register says the firmware is
> > ready.  In the case of those boards that use the "doorbell" reset,
> > we aren't supposed to look at *anything* for the first five seconds.
> >
> > I have been bugging the firmware/hardware guys for a sane reset
> > procedure that actually works reliably for years with no luck.
> >
> > For the SCSI over PCIe driver, being tired of this crap, I simply
> > unconditionally reset the device on driver load every single time,
> > and did this from the beginning.  This kind of forced the firmware
> > and hardware guys to make the reset on that thing work reliably
> > and quickly, and since I did that from the earliest days, they didn't
> > have a chance to screw it up without it being caught immediately.
> > For Smart Array, obviously it's too late for that approach.
> 
> OK, my question was more or less if this:
> msleep(HPSA_POST_RESET_PAUSE_MSECS);
> just before waiting for the board to enter BOARD_NOT_READY state
> isn't dangerous - when the board enters a ready state in the first 3sec
> it will wait indefinitely for the not_ready state
> thus whether the test for not ready state shouldn't be removed.
> The mechanism now works somehow and maybe it's better
> not to touch it, I just wanted to draw your attention to that
> potential problem.

Oh ok, I see.  Thanks, yes that does look questionable.  So you
are suggesting to skip the check for transition from NOT READY to 
READY in the scratch pad register in all cases, since we have all
these ridiculous delay requirements preventing us from watching the
board closely enough and so that may mean that we would miss such a
transition.

Let me talk it over with Mike Miller, but it seems reasonable.

-- steve

> 
> 
> >
> > -- steve
> >
> >>>  
> >>> -	/* Wait for board to become not ready, then ready. */
> >>> -	dev_info(&pdev->dev, "Waiting for board to reset.\n");
> >>> -	rc = hpsa_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY);
> >>> -	if (rc) {
> >>> -		dev_warn(&pdev->dev,
> >>> -			"failed waiting for board to reset."
> >>> -			" Will try soft reset.\n");
> >>> -		rc = -ENOTSUPP; /* Not expected, but try soft reset later */
> >>> -		goto unmap_cfgtable;
> >>> +	if (!use_doorbell) {
> >>> +		/* Wait for board to become not ready, then ready.
> >>> +		 * (if we used the doorbell, then we already waited 5 secs
> >>> +		 * so the "not ready" state is already gone by so we
> >>> +		 * won't catch it.)
> >>> +		 */
> >>> +		dev_info(&pdev->dev, "Waiting for board to reset.\n");
> >>> +		rc = hpsa_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY);
> >>> +		if (rc) {
> >>> +			dev_warn(&pdev->dev,
> >>> +				"failed waiting for board to reset."
> >>> +				" Will try soft reset.\n");
> >>> +			/* Not expected, but try soft reset later */
> >>> +			rc = -ENOTSUPP;
> >>> +			goto unmap_cfgtable;
> >>> +		}
> >>>  	}
> >>>  	rc = hpsa_wait_for_board_state(pdev, vaddr, BOARD_READY);
> >>>  	if (rc) {
> >>>
> >>> --
> >>> To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
> >>> the body of a message to majordomo@vger.kernel.org
> >>> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 03/11] hpsa: add 5 second delay after doorbell reset
  2013-11-08 15:31         ` scameron
@ 2013-12-01  0:42           ` James Bottomley
  2013-12-02 17:15             ` Mike Miller
  0 siblings, 1 reply; 23+ messages in thread
From: James Bottomley @ 2013-12-01  0:42 UTC (permalink / raw)
  To: scameron; +Cc: Tomas Henzl, stephenmcameron, mikem, linux-scsi, scott.teel

On Fri, 2013-11-08 at 09:31 -0600, scameron@beardog.cce.hp.com wrote:
> On Fri, Nov 08, 2013 at 04:02:20PM +0100, Tomas Henzl wrote:
> > On 11/08/2013 03:44 PM, scameron@beardog.cce.hp.com wrote:
> > > On Fri, Nov 08, 2013 at 02:51:37PM +0100, Tomas Henzl wrote:
> > >> On 11/07/2013 05:45 PM, Stephen M. Cameron wrote:
> > >>> From: Stephen M. Cameron <scameron@beardog.cce.hp.com>
> > >>>
> > >>> The hardware guys tell us that after initiating a software
> > >>> reset via the doorbell register we need to wait 5 seconds before
> > >>> attempting to talk to the board *at all*.  This means that we
> > >>> cannot watch the board to verify it transitions from "ready" to
> > >>> to "not ready" then back "ready", since this transition will
> > >>> most likely happen during those 5 seconds (though we can still
> > >>> verify the reset happens by watching the "driver version" field
> > >>> get cleared.)
> > >>>
> > >>> Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
> > >>> ---
> > >>>  drivers/scsi/hpsa.c |   32 +++++++++++++++++++++++---------
> > >>>  1 files changed, 23 insertions(+), 9 deletions(-)
> > >>>
> > >>> diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
> > >>> index 20fc598..fff5fd3 100644
> > >>> --- a/drivers/scsi/hpsa.c
> > >>> +++ b/drivers/scsi/hpsa.c
> > >>> @@ -3781,6 +3781,13 @@ static int hpsa_controller_hard_reset(struct pci_dev *pdev,
> > >>>  		 */
> > >>>  		dev_info(&pdev->dev, "using doorbell to reset controller\n");
> > >>>  		writel(use_doorbell, vaddr + SA5_DOORBELL);
> > >>> +
> > >>> +		/* PMC hardware guys tell us we need a 5 second delay after
> > >>> +		 * doorbell reset and before any attempt to talk to the board
> > >>> +		 * at all to ensure that this actually works and doesn't fall
> > >>> +		 * over in some weird corner cases.
> > >>> +		 */
> > >>> +		msleep(5000);
> > >>>  	} else { /* Try to do it the PCI power state way */
> > >>>  
> > >>>  		/* Quoting from the Open CISS Specification: "The Power
> > >>> @@ -3977,15 +3984,22 @@ static int hpsa_kdump_hard_reset_controller(struct pci_dev *pdev)
> > >>>  	   need a little pause here */
> > >>>  	msleep(HPSA_POST_RESET_PAUSE_MSECS);
> > >> I know it's complicated with a lot of different devices and fw versions,
> > >> but here^ we wait for 3sec - isn't the method - wait for 3s then wait for board not ready
> > >> a bit fragile, what if a board comes up faster?
> > >> When the method "watching the "driver version"" works why don't you want to use it  
> > >> regardless of the reset method used?
> > > The "watching the driver version" thing is only there to catch if
> > > the firmware guys break things and turn the reset into a no-op
> > > (which happened with the PCI power manaegment based reset and we
> > > didn't catch it for a year or so because we didn't have that check)
> > >
> > > We aren't supposed to look at the driver version field (or anything)
> > > until we first verify the scratch pad register says the firmware is
> > > ready.  In the case of those boards that use the "doorbell" reset,
> > > we aren't supposed to look at *anything* for the first five seconds.
> > >
> > > I have been bugging the firmware/hardware guys for a sane reset
> > > procedure that actually works reliably for years with no luck.
> > >
> > > For the SCSI over PCIe driver, being tired of this crap, I simply
> > > unconditionally reset the device on driver load every single time,
> > > and did this from the beginning.  This kind of forced the firmware
> > > and hardware guys to make the reset on that thing work reliably
> > > and quickly, and since I did that from the earliest days, they didn't
> > > have a chance to screw it up without it being caught immediately.
> > > For Smart Array, obviously it's too late for that approach.
> > 
> > OK, my question was more or less if this:
> > msleep(HPSA_POST_RESET_PAUSE_MSECS);
> > just before waiting for the board to enter BOARD_NOT_READY state
> > isn't dangerous - when the board enters a ready state in the first 3sec
> > it will wait indefinitely for the not_ready state
> > thus whether the test for not ready state shouldn't be removed.
> > The mechanism now works somehow and maybe it's better
> > not to touch it, I just wanted to draw your attention to that
> > potential problem.
> 
> Oh ok, I see.  Thanks, yes that does look questionable.  So you
> are suggesting to skip the check for transition from NOT READY to 
> READY in the scratch pad register in all cases, since we have all
> these ridiculous delay requirements preventing us from watching the
> board closely enough and so that may mean that we would miss such a
> transition.
> 
> Let me talk it over with Mike Miller, but it seems reasonable.

Is there a resolution on this?  It's holding up the patch series.

James



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 03/11] hpsa: add 5 second delay after doorbell reset
  2013-12-01  0:42           ` James Bottomley
@ 2013-12-02 17:15             ` Mike Miller
  2013-12-02 17:23               ` James Bottomley
  0 siblings, 1 reply; 23+ messages in thread
From: Mike Miller @ 2013-12-02 17:15 UTC (permalink / raw)
  To: James Bottomley
  Cc: scameron, Tomas Henzl, stephenmcameron, linux-scsi, scott.teel

On Sat, Nov 30, 2013 at 04:42:02PM -0800, James Bottomley wrote:
> On Fri, 2013-11-08 at 09:31 -0600, scameron@beardog.cce.hp.com wrote:
> > On Fri, Nov 08, 2013 at 04:02:20PM +0100, Tomas Henzl wrote:
> > > On 11/08/2013 03:44 PM, scameron@beardog.cce.hp.com wrote:
> > > > On Fri, Nov 08, 2013 at 02:51:37PM +0100, Tomas Henzl wrote:
> > > >> On 11/07/2013 05:45 PM, Stephen M. Cameron wrote:
> > > >>> From: Stephen M. Cameron <scameron@beardog.cce.hp.com>
> > > >>>
> > > >>> The hardware guys tell us that after initiating a software
> > > >>> reset via the doorbell register we need to wait 5 seconds before
> > > >>> attempting to talk to the board *at all*.  This means that we
> > > >>> cannot watch the board to verify it transitions from "ready" to
> > > >>> to "not ready" then back "ready", since this transition will
> > > >>> most likely happen during those 5 seconds (though we can still
> > > >>> verify the reset happens by watching the "driver version" field
> > > >>> get cleared.)
> > > >>>
> > > >>> Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
> > > >>> ---
> > > >>>  drivers/scsi/hpsa.c |   32 +++++++++++++++++++++++---------
> > > >>>  1 files changed, 23 insertions(+), 9 deletions(-)
> > > >>>
> > > >>> diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
> > > >>> index 20fc598..fff5fd3 100644
> > > >>> --- a/drivers/scsi/hpsa.c
> > > >>> +++ b/drivers/scsi/hpsa.c
> > > >>> @@ -3781,6 +3781,13 @@ static int hpsa_controller_hard_reset(struct pci_dev *pdev,
> > > >>>  		 */
> > > >>>  		dev_info(&pdev->dev, "using doorbell to reset controller\n");
> > > >>>  		writel(use_doorbell, vaddr + SA5_DOORBELL);
> > > >>> +
> > > >>> +		/* PMC hardware guys tell us we need a 5 second delay after
> > > >>> +		 * doorbell reset and before any attempt to talk to the board
> > > >>> +		 * at all to ensure that this actually works and doesn't fall
> > > >>> +		 * over in some weird corner cases.
> > > >>> +		 */
> > > >>> +		msleep(5000);
> > > >>>  	} else { /* Try to do it the PCI power state way */
> > > >>>  
> > > >>>  		/* Quoting from the Open CISS Specification: "The Power
> > > >>> @@ -3977,15 +3984,22 @@ static int hpsa_kdump_hard_reset_controller(struct pci_dev *pdev)
> > > >>>  	   need a little pause here */
> > > >>>  	msleep(HPSA_POST_RESET_PAUSE_MSECS);
> > > >> I know it's complicated with a lot of different devices and fw versions,
> > > >> but here^ we wait for 3sec - isn't the method - wait for 3s then wait for board not ready
> > > >> a bit fragile, what if a board comes up faster?
> > > >> When the method "watching the "driver version"" works why don't you want to use it  
> > > >> regardless of the reset method used?
> > > > The "watching the driver version" thing is only there to catch if
> > > > the firmware guys break things and turn the reset into a no-op
> > > > (which happened with the PCI power manaegment based reset and we
> > > > didn't catch it for a year or so because we didn't have that check)
> > > >
> > > > We aren't supposed to look at the driver version field (or anything)
> > > > until we first verify the scratch pad register says the firmware is
> > > > ready.  In the case of those boards that use the "doorbell" reset,
> > > > we aren't supposed to look at *anything* for the first five seconds.
> > > >
> > > > I have been bugging the firmware/hardware guys for a sane reset
> > > > procedure that actually works reliably for years with no luck.
> > > >
> > > > For the SCSI over PCIe driver, being tired of this crap, I simply
> > > > unconditionally reset the device on driver load every single time,
> > > > and did this from the beginning.  This kind of forced the firmware
> > > > and hardware guys to make the reset on that thing work reliably
> > > > and quickly, and since I did that from the earliest days, they didn't
> > > > have a chance to screw it up without it being caught immediately.
> > > > For Smart Array, obviously it's too late for that approach.
> > > 
> > > OK, my question was more or less if this:
> > > msleep(HPSA_POST_RESET_PAUSE_MSECS);
> > > just before waiting for the board to enter BOARD_NOT_READY state
> > > isn't dangerous - when the board enters a ready state in the first 3sec
> > > it will wait indefinitely for the not_ready state
> > > thus whether the test for not ready state shouldn't be removed.
> > > The mechanism now works somehow and maybe it's better
> > > not to touch it, I just wanted to draw your attention to that
> > > potential problem.
> > 
> > Oh ok, I see.  Thanks, yes that does look questionable.  So you
> > are suggesting to skip the check for transition from NOT READY to 
> > READY in the scratch pad register in all cases, since we have all
> > these ridiculous delay requirements preventing us from watching the
> > board closely enough and so that may mean that we would miss such a
> > transition.
> > 
> > Let me talk it over with Mike Miller, but it seems reasonable.
> 
> Is there a resolution on this?  It's holding up the patch series.
> 
> James
> 
James,
Let's omit the check for board ready/not ready. With these arbitrary delays
already in place we're likely to miss the transition.

Do you need me to resubmit the patch set?

-- mikem

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 03/11] hpsa: add 5 second delay after doorbell reset
  2013-12-02 17:15             ` Mike Miller
@ 2013-12-02 17:23               ` James Bottomley
  2013-12-02 17:24                 ` Miller, Mike (OS Dev)
  0 siblings, 1 reply; 23+ messages in thread
From: James Bottomley @ 2013-12-02 17:23 UTC (permalink / raw)
  To: Mike Miller
  Cc: scameron, Tomas Henzl, stephenmcameron, linux-scsi, scott.teel

On Mon, 2013-12-02 at 11:15 -0600, Mike Miller wrote:
> On Sat, Nov 30, 2013 at 04:42:02PM -0800, James Bottomley wrote:
> > On Fri, 2013-11-08 at 09:31 -0600, scameron@beardog.cce.hp.com wrote:
> > > On Fri, Nov 08, 2013 at 04:02:20PM +0100, Tomas Henzl wrote:
> > > > On 11/08/2013 03:44 PM, scameron@beardog.cce.hp.com wrote:
> > > > > On Fri, Nov 08, 2013 at 02:51:37PM +0100, Tomas Henzl wrote:
> > > > >> On 11/07/2013 05:45 PM, Stephen M. Cameron wrote:
> > > > >>> From: Stephen M. Cameron <scameron@beardog.cce.hp.com>
> > > > >>>
> > > > >>> The hardware guys tell us that after initiating a software
> > > > >>> reset via the doorbell register we need to wait 5 seconds before
> > > > >>> attempting to talk to the board *at all*.  This means that we
> > > > >>> cannot watch the board to verify it transitions from "ready" to
> > > > >>> to "not ready" then back "ready", since this transition will
> > > > >>> most likely happen during those 5 seconds (though we can still
> > > > >>> verify the reset happens by watching the "driver version" field
> > > > >>> get cleared.)
> > > > >>>
> > > > >>> Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
> > > > >>> ---
> > > > >>>  drivers/scsi/hpsa.c |   32 +++++++++++++++++++++++---------
> > > > >>>  1 files changed, 23 insertions(+), 9 deletions(-)
> > > > >>>
> > > > >>> diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c
> > > > >>> index 20fc598..fff5fd3 100644
> > > > >>> --- a/drivers/scsi/hpsa.c
> > > > >>> +++ b/drivers/scsi/hpsa.c
> > > > >>> @@ -3781,6 +3781,13 @@ static int hpsa_controller_hard_reset(struct pci_dev *pdev,
> > > > >>>  		 */
> > > > >>>  		dev_info(&pdev->dev, "using doorbell to reset controller\n");
> > > > >>>  		writel(use_doorbell, vaddr + SA5_DOORBELL);
> > > > >>> +
> > > > >>> +		/* PMC hardware guys tell us we need a 5 second delay after
> > > > >>> +		 * doorbell reset and before any attempt to talk to the board
> > > > >>> +		 * at all to ensure that this actually works and doesn't fall
> > > > >>> +		 * over in some weird corner cases.
> > > > >>> +		 */
> > > > >>> +		msleep(5000);
> > > > >>>  	} else { /* Try to do it the PCI power state way */
> > > > >>>  
> > > > >>>  		/* Quoting from the Open CISS Specification: "The Power
> > > > >>> @@ -3977,15 +3984,22 @@ static int hpsa_kdump_hard_reset_controller(struct pci_dev *pdev)
> > > > >>>  	   need a little pause here */
> > > > >>>  	msleep(HPSA_POST_RESET_PAUSE_MSECS);
> > > > >> I know it's complicated with a lot of different devices and fw versions,
> > > > >> but here^ we wait for 3sec - isn't the method - wait for 3s then wait for board not ready
> > > > >> a bit fragile, what if a board comes up faster?
> > > > >> When the method "watching the "driver version"" works why don't you want to use it  
> > > > >> regardless of the reset method used?
> > > > > The "watching the driver version" thing is only there to catch if
> > > > > the firmware guys break things and turn the reset into a no-op
> > > > > (which happened with the PCI power manaegment based reset and we
> > > > > didn't catch it for a year or so because we didn't have that check)
> > > > >
> > > > > We aren't supposed to look at the driver version field (or anything)
> > > > > until we first verify the scratch pad register says the firmware is
> > > > > ready.  In the case of those boards that use the "doorbell" reset,
> > > > > we aren't supposed to look at *anything* for the first five seconds.
> > > > >
> > > > > I have been bugging the firmware/hardware guys for a sane reset
> > > > > procedure that actually works reliably for years with no luck.
> > > > >
> > > > > For the SCSI over PCIe driver, being tired of this crap, I simply
> > > > > unconditionally reset the device on driver load every single time,
> > > > > and did this from the beginning.  This kind of forced the firmware
> > > > > and hardware guys to make the reset on that thing work reliably
> > > > > and quickly, and since I did that from the earliest days, they didn't
> > > > > have a chance to screw it up without it being caught immediately.
> > > > > For Smart Array, obviously it's too late for that approach.
> > > > 
> > > > OK, my question was more or less if this:
> > > > msleep(HPSA_POST_RESET_PAUSE_MSECS);
> > > > just before waiting for the board to enter BOARD_NOT_READY state
> > > > isn't dangerous - when the board enters a ready state in the first 3sec
> > > > it will wait indefinitely for the not_ready state
> > > > thus whether the test for not ready state shouldn't be removed.
> > > > The mechanism now works somehow and maybe it's better
> > > > not to touch it, I just wanted to draw your attention to that
> > > > potential problem.
> > > 
> > > Oh ok, I see.  Thanks, yes that does look questionable.  So you
> > > are suggesting to skip the check for transition from NOT READY to 
> > > READY in the scratch pad register in all cases, since we have all
> > > these ridiculous delay requirements preventing us from watching the
> > > board closely enough and so that may mean that we would miss such a
> > > transition.
> > > 
> > > Let me talk it over with Mike Miller, but it seems reasonable.
> > 
> > Is there a resolution on this?  It's holding up the patch series.
> > 
> > James
> > 
> James,
> Let's omit the check for board ready/not ready. With these arbitrary delays
> already in place we're likely to miss the transition.
> 
> Do you need me to resubmit the patch set?

No; you want 1-2;4-11 of the current one ... I can do that.

James




^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [PATCH 03/11] hpsa: add 5 second delay after doorbell reset
  2013-12-02 17:23               ` James Bottomley
@ 2013-12-02 17:24                 ` Miller, Mike (OS Dev)
  0 siblings, 0 replies; 23+ messages in thread
From: Miller, Mike (OS Dev) @ 2013-12-02 17:24 UTC (permalink / raw)
  To: James Bottomley
  Cc: scameron@beardog.cce.hp.com, Tomas Henzl,
	stephenmcameron@gmail.com, linux-scsi@vger.kernel.org,
	Teel, Scott Stacy



> -----Original Message-----
> From: James Bottomley [mailto:James.Bottomley@HansenPartnership.com]
> Sent: Monday, December 02, 2013 11:23 AM
> To: Miller, Mike (OS Dev)
> Cc: scameron@beardog.cce.hp.com; Tomas Henzl;
> stephenmcameron@gmail.com; linux-scsi@vger.kernel.org; Teel, Scott Stacy
> Subject: Re: [PATCH 03/11] hpsa: add 5 second delay after doorbell reset
> 
> On Mon, 2013-12-02 at 11:15 -0600, Mike Miller wrote:
> > On Sat, Nov 30, 2013 at 04:42:02PM -0800, James Bottomley wrote:
> > > On Fri, 2013-11-08 at 09:31 -0600, scameron@beardog.cce.hp.com wrote:
> > > > On Fri, Nov 08, 2013 at 04:02:20PM +0100, Tomas Henzl wrote:
> > > > > On 11/08/2013 03:44 PM, scameron@beardog.cce.hp.com wrote:
> > > > > > On Fri, Nov 08, 2013 at 02:51:37PM +0100, Tomas Henzl wrote:
> > > > > >> On 11/07/2013 05:45 PM, Stephen M. Cameron wrote:
> > > > > >>> From: Stephen M. Cameron <scameron@beardog.cce.hp.com>
> > > > > >>>
> > > > > >>> The hardware guys tell us that after initiating a software
> > > > > >>> reset via the doorbell register we need to wait 5 seconds
> > > > > >>> before attempting to talk to the board *at all*.  This means
> > > > > >>> that we cannot watch the board to verify it transitions from
> > > > > >>> "ready" to to "not ready" then back "ready", since this
> > > > > >>> transition will most likely happen during those 5 seconds
> > > > > >>> (though we can still verify the reset happens by watching
> > > > > >>> the "driver version" field get cleared.)
> > > > > >>>
> > > > > >>> Signed-off-by: Stephen M. Cameron
> > > > > >>> <scameron@beardog.cce.hp.com>
> > > > > >>> ---
> > > > > >>>  drivers/scsi/hpsa.c |   32 +++++++++++++++++++++++---------
> > > > > >>>  1 files changed, 23 insertions(+), 9 deletions(-)
> > > > > >>>
> > > > > >>> diff --git a/drivers/scsi/hpsa.c b/drivers/scsi/hpsa.c index
> > > > > >>> 20fc598..fff5fd3 100644
> > > > > >>> --- a/drivers/scsi/hpsa.c
> > > > > >>> +++ b/drivers/scsi/hpsa.c
> > > > > >>> @@ -3781,6 +3781,13 @@ static int
> hpsa_controller_hard_reset(struct pci_dev *pdev,
> > > > > >>>  		 */
> > > > > >>>  		dev_info(&pdev->dev, "using doorbell to reset
> controller\n");
> > > > > >>>  		writel(use_doorbell, vaddr + SA5_DOORBELL);
> > > > > >>> +
> > > > > >>> +		/* PMC hardware guys tell us we need a 5 second
> delay after
> > > > > >>> +		 * doorbell reset and before any attempt to talk to
> the board
> > > > > >>> +		 * at all to ensure that this actually works and doesn't
> fall
> > > > > >>> +		 * over in some weird corner cases.
> > > > > >>> +		 */
> > > > > >>> +		msleep(5000);
> > > > > >>>  	} else { /* Try to do it the PCI power state way */
> > > > > >>>
> > > > > >>>  		/* Quoting from the Open CISS Specification: "The
> Power
> > > > > >>> @@ -3977,15 +3984,22 @@ static int
> hpsa_kdump_hard_reset_controller(struct pci_dev *pdev)
> > > > > >>>  	   need a little pause here */
> > > > > >>>  	msleep(HPSA_POST_RESET_PAUSE_MSECS);
> > > > > >> I know it's complicated with a lot of different devices and
> > > > > >> fw versions, but here^ we wait for 3sec - isn't the method -
> > > > > >> wait for 3s then wait for board not ready a bit fragile, what if a
> board comes up faster?
> > > > > >> When the method "watching the "driver version"" works why
> > > > > >> don't you want to use it regardless of the reset method used?
> > > > > > The "watching the driver version" thing is only there to catch
> > > > > > if the firmware guys break things and turn the reset into a
> > > > > > no-op (which happened with the PCI power manaegment based
> > > > > > reset and we didn't catch it for a year or so because we
> > > > > > didn't have that check)
> > > > > >
> > > > > > We aren't supposed to look at the driver version field (or
> > > > > > anything) until we first verify the scratch pad register says
> > > > > > the firmware is ready.  In the case of those boards that use
> > > > > > the "doorbell" reset, we aren't supposed to look at *anything* for
> the first five seconds.
> > > > > >
> > > > > > I have been bugging the firmware/hardware guys for a sane
> > > > > > reset procedure that actually works reliably for years with no luck.
> > > > > >
> > > > > > For the SCSI over PCIe driver, being tired of this crap, I
> > > > > > simply unconditionally reset the device on driver load every
> > > > > > single time, and did this from the beginning.  This kind of
> > > > > > forced the firmware and hardware guys to make the reset on
> > > > > > that thing work reliably and quickly, and since I did that
> > > > > > from the earliest days, they didn't have a chance to screw it up
> without it being caught immediately.
> > > > > > For Smart Array, obviously it's too late for that approach.
> > > > >
> > > > > OK, my question was more or less if this:
> > > > > msleep(HPSA_POST_RESET_PAUSE_MSECS);
> > > > > just before waiting for the board to enter BOARD_NOT_READY state
> > > > > isn't dangerous - when the board enters a ready state in the
> > > > > first 3sec it will wait indefinitely for the not_ready state
> > > > > thus whether the test for not ready state shouldn't be removed.
> > > > > The mechanism now works somehow and maybe it's better not to
> > > > > touch it, I just wanted to draw your attention to that potential
> > > > > problem.
> > > >
> > > > Oh ok, I see.  Thanks, yes that does look questionable.  So you
> > > > are suggesting to skip the check for transition from NOT READY to
> > > > READY in the scratch pad register in all cases, since we have all
> > > > these ridiculous delay requirements preventing us from watching
> > > > the board closely enough and so that may mean that we would miss
> > > > such a transition.
> > > >
> > > > Let me talk it over with Mike Miller, but it seems reasonable.
> > >
> > > Is there a resolution on this?  It's holding up the patch series.
> > >
> > > James
> > >
> > James,
> > Let's omit the check for board ready/not ready. With these arbitrary
> > delays already in place we're likely to miss the transition.
> >
> > Do you need me to resubmit the patch set?
> 
> No; you want 1-2;4-11 of the current one ... I can do that.
> 
> James
> 

Yes. Thank you.

-- mikem
> 


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 01/11] hpsa: use workqueue instead of kernel thread for lockup detection
  2013-11-07 16:45 ` [PATCH 01/11] hpsa: use workqueue instead of kernel thread for lockup detection Stephen M. Cameron
@ 2013-12-02 18:00   ` James Bottomley
  2013-12-04 16:31     ` scameron
  0 siblings, 1 reply; 23+ messages in thread
From: James Bottomley @ 2013-12-02 18:00 UTC (permalink / raw)
  To: Stephen M. Cameron; +Cc: stephenmcameron, mikem, thenzl, linux-scsi, scott.teel

On Thu, 2013-11-07 at 10:45 -0600, Stephen M. Cameron wrote:
> From: Stephen M. Cameron <scameron@beardog.cce.hp.com>
> 
> Much simpler and avoids races starting/stopping the thread.
> 
> Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>

Actually, perhaps you do need to resubmit the series: this is rejecting
against scsi-misc because of your prior hpsa patches:

patching file drivers/scsi/hpsa.c
Hunk #1 succeeded at 170 (offset -2 lines).
Hunk #2 succeeded at 4881 (offset 247 lines).
Hunk #3 succeeded at 4899 (offset 247 lines).
Hunk #4 succeeded at 4918 (offset 247 lines).
Hunk #5 succeeded at 4947 (offset 247 lines).
Hunk #6 FAILED at 4862.
Hunk #7 FAILED at 4928.
2 out of 7 hunks FAILED -- saving rejects to file
drivers/scsi/hpsa.c.rej
patching file drivers/scsi/hpsa.h
Hunk #1 FAILED at 130.
1 out of 1 hunk FAILED -- saving rejects to file drivers/scsi/hpsa.h.rej

James



^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 01/11] hpsa: use workqueue instead of kernel thread for lockup detection
  2013-12-02 18:00   ` James Bottomley
@ 2013-12-04 16:31     ` scameron
  2013-12-04 17:42       ` James Bottomley
  0 siblings, 1 reply; 23+ messages in thread
From: scameron @ 2013-12-04 16:31 UTC (permalink / raw)
  To: James Bottomley
  Cc: stephenmcameron, mikem, thenzl, linux-scsi, scott.teel, scameron

On Mon, Dec 02, 2013 at 10:00:02AM -0800, James Bottomley wrote:
> On Thu, 2013-11-07 at 10:45 -0600, Stephen M. Cameron wrote:
> > From: Stephen M. Cameron <scameron@beardog.cce.hp.com>
> > 
> > Much simpler and avoids races starting/stopping the thread.
> > 
> > Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
> 
> Actually, perhaps you do need to resubmit the series: this is rejecting
> against scsi-misc because of your prior hpsa patches:

Ah, I did not realize you had taken these into your tree:

2f2ec0b [SCSI] hpsa: bring logical drives online when format completes
c99298f [SCSI] hpsa: hide logical drives with format in progress from linux

Since you had complained about the races, and suggested using a
workqueue rather than a kernel thread, I thought that meant you
had dropped them, so I had gone back to the drawing board.  I hope
to have a new version of the series for you this afternoon.

-- steve

> 
> patching file drivers/scsi/hpsa.c
> Hunk #1 succeeded at 170 (offset -2 lines).
> Hunk #2 succeeded at 4881 (offset 247 lines).
> Hunk #3 succeeded at 4899 (offset 247 lines).
> Hunk #4 succeeded at 4918 (offset 247 lines).
> Hunk #5 succeeded at 4947 (offset 247 lines).
> Hunk #6 FAILED at 4862.
> Hunk #7 FAILED at 4928.
> 2 out of 7 hunks FAILED -- saving rejects to file
> drivers/scsi/hpsa.c.rej
> patching file drivers/scsi/hpsa.h
> Hunk #1 FAILED at 130.
> 1 out of 1 hunk FAILED -- saving rejects to file drivers/scsi/hpsa.h.rej
> 
> James
> 

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 01/11] hpsa: use workqueue instead of kernel thread for lockup detection
  2013-12-04 16:31     ` scameron
@ 2013-12-04 17:42       ` James Bottomley
  0 siblings, 0 replies; 23+ messages in thread
From: James Bottomley @ 2013-12-04 17:42 UTC (permalink / raw)
  To: scameron; +Cc: stephenmcameron, mikem, thenzl, linux-scsi, scott.teel

On Wed, 2013-12-04 at 10:31 -0600, scameron@beardog.cce.hp.com wrote:
> On Mon, Dec 02, 2013 at 10:00:02AM -0800, James Bottomley wrote:
> > On Thu, 2013-11-07 at 10:45 -0600, Stephen M. Cameron wrote:
> > > From: Stephen M. Cameron <scameron@beardog.cce.hp.com>
> > > 
> > > Much simpler and avoids races starting/stopping the thread.
> > > 
> > > Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
> > 
> > Actually, perhaps you do need to resubmit the series: this is rejecting
> > against scsi-misc because of your prior hpsa patches:
> 
> Ah, I did not realize you had taken these into your tree:
> 
> 2f2ec0b [SCSI] hpsa: bring logical drives online when format completes
> c99298f [SCSI] hpsa: hide logical drives with format in progress from linux

That's OK, I can drop them again ... I've been having a lot of trouble
with unsetting the imap labels I use to mark patches.

James



^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2013-12-04 17:42 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-11-07 16:45 [PATCH 00/11] hpsa: minor fixes and cleanups Stephen M. Cameron
2013-11-07 16:45 ` [PATCH 01/11] hpsa: use workqueue instead of kernel thread for lockup detection Stephen M. Cameron
2013-12-02 18:00   ` James Bottomley
2013-12-04 16:31     ` scameron
2013-12-04 17:42       ` James Bottomley
2013-11-07 16:45 ` [PATCH 02/11] hpsa: do not attempt to flush the cache on locked up controllers Stephen M. Cameron
2013-11-07 16:45 ` [PATCH 03/11] hpsa: add 5 second delay after doorbell reset Stephen M. Cameron
2013-11-08 13:51   ` Tomas Henzl
2013-11-08 14:44     ` scameron
2013-11-08 15:02       ` Tomas Henzl
2013-11-08 15:31         ` scameron
2013-12-01  0:42           ` James Bottomley
2013-12-02 17:15             ` Mike Miller
2013-12-02 17:23               ` James Bottomley
2013-12-02 17:24                 ` Miller, Mike (OS Dev)
2013-11-07 16:45 ` [PATCH 04/11] hpsa: do not discard scsi status on aborted commands Stephen M. Cameron
2013-11-07 16:45 ` [PATCH 05/11] hpsa: remove unneeded include of seq_file.h Stephen M. Cameron
2013-11-07 16:45 ` [PATCH 06/11] hpsa: fix memory leak in CCISS_BIG_PASSTHRU ioctl Stephen M. Cameron
2013-11-07 16:46 ` [PATCH 07/11] hpsa: add MSA 2040 to list of external target devices Stephen M. Cameron
2013-11-07 16:46 ` [PATCH 08/11] hpsa: cap CCISS_PASSTHRU at 20 concurrent commands Stephen M. Cameron
2013-11-07 16:46 ` [PATCH 09/11] hpsa: prevent stalled i/o Stephen M. Cameron
2013-11-07 16:46 ` [PATCH 10/11] hpsa: rename scsi prefetch field Stephen M. Cameron
2013-11-07 16:46 ` [PATCH 11/11] hpsa: enable unit attention reporting Stephen M. Cameron

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).