[PATCH 12/16] cciss: do soft reset if hard reset is broken

All of lore.kernel.org
 help / color / mirror / Atom feed

From: "Stephen M. Cameron" <scameron@beardog.cce.hp.com>
To: axboe@kernel.dk
Cc: mikem@beardog.cce.hp.com, akpm@linux-foundation.org,
	thenzl@redhat.com, linux-kernel@vger.kernel.org,
	smcameron@yahoo.com
Subject: [PATCH 12/16] cciss: do soft reset if hard reset is broken
Date: Tue, 03 May 2011 14:53:52 -0500	[thread overview]
Message-ID: <20110503195352.5154.5183.stgit@beardog.cce.hp.com> (raw)
In-Reply-To: <20110503194919.5154.78352.stgit@beardog.cce.hp.com>

From: Stephen M. Cameron <scameron@beardog.cce.hp.com>

on driver load, if reset_devices is set, and the hard reset
attempts fail, try to bring up the controller to the point that
a command can be sent, and send it a soft reset command, then
after the reset undo whatever driver initialization was done to get
it to the point to take a command, and re-do it after the reset.

This is to get kdump to work on all the "non-resettable" controllers
(except 64xx controllers which can't be reset due to the potentially
shared cache module.)

Signed-off-by: Stephen M. Cameron <scameron@beardog.cce.hp.com>
---
 drivers/block/cciss.c |  236 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 files changed, 221 insertions(+), 15 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index df58c59..23b0ba4 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -2477,6 +2477,31 @@ static int deregister_disk(ctlr_info_t *h, int drv_index,
 	return 0;
 }
 
+static int __devinit cciss_send_reset(ctlr_info_t *h, unsigned char *scsi3addr,
+	u8 reset_type)
+{
+	CommandList_struct *c;
+	int return_status;
+
+	c = cmd_alloc(h);
+	if (!c)
+		return -ENOMEM;
+	return_status = fill_cmd(h, c, CCISS_RESET_MSG, NULL, 0, 0,
+		CTLR_LUNID, TYPE_MSG);
+	c->Request.CDB[1] = reset_type; /* fill_cmd defaults to target reset */
+	if (return_status != IO_OK) {
+		cmd_special_free(h, c);
+		return return_status;
+	}
+	c->waiting = NULL;
+	enqueue_cmd_and_start_io(h, c);
+	/* Don't wait for completion, the reset won't complete.  Don't free
+	 * the command either.  This is the last command we will send before
+	 * re-initializing everything, so it doesn't matter and won't leak.
+	 */
+	return 0;
+}
+
 static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff,
 		size_t size, __u8 page_code, unsigned char *scsi3addr,
 		int cmd_type)
@@ -3463,6 +3488,63 @@ static inline u32 process_nonindexed_cmd(ctlr_info_t *h, u32 raw_tag)
 	return next_command(h);
 }
 
+/* Some controllers, like p400, will give us one interrupt
+ * after a soft reset, even if we turned interrupts off.
+ * Only need to check for this in the cciss_xxx_discard_completions
+ * functions.
+ */
+static int ignore_bogus_interrupt(ctlr_info_t *h)
+{
+	if (likely(!reset_devices))
+		return 0;
+
+	if (likely(h->interrupts_enabled))
+		return 0;
+
+	dev_info(&h->pdev->dev, "Received interrupt while interrupts disabled "
+		"(known firmware bug.)  Ignoring.\n");
+
+	return 1;
+}
+
+static irqreturn_t cciss_intx_discard_completions(int irq, void *dev_id)
+{
+	ctlr_info_t *h = dev_id;
+	unsigned long flags;
+	u32 raw_tag;
+
+	if (ignore_bogus_interrupt(h))
+		return IRQ_NONE;
+
+	if (interrupt_not_for_us(h))
+		return IRQ_NONE;
+	spin_lock_irqsave(&h->lock, flags);
+	while (interrupt_pending(h)) {
+		raw_tag = get_next_completion(h);
+		while (raw_tag != FIFO_EMPTY)
+			raw_tag = next_command(h);
+	}
+	spin_unlock_irqrestore(&h->lock, flags);
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t cciss_msix_discard_completions(int irq, void *dev_id)
+{
+	ctlr_info_t *h = dev_id;
+	unsigned long flags;
+	u32 raw_tag;
+
+	if (ignore_bogus_interrupt(h))
+		return IRQ_NONE;
+
+	spin_lock_irqsave(&h->lock, flags);
+	raw_tag = get_next_completion(h);
+	while (raw_tag != FIFO_EMPTY)
+		raw_tag = next_command(h);
+	spin_unlock_irqrestore(&h->lock, flags);
+	return IRQ_HANDLED;
+}
+
 static irqreturn_t do_cciss_intx(int irq, void *dev_id)
 {
 	ctlr_info_t *h = dev_id;
@@ -4380,7 +4462,6 @@ static __devinit int cciss_message(struct pci_dev *pdev, unsigned char opcode, u
 	return 0;
 }
 
-#define cciss_soft_reset_controller(p) cciss_message(p, 1, 0)
 #define cciss_noop(p) cciss_message(p, 3, 0)
 
 static int cciss_controller_hard_reset(struct pci_dev *pdev,
@@ -4591,13 +4672,17 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
 	/* Wait for board to become not ready, then ready. */
 	dev_info(&pdev->dev, "Waiting for board to reset.\n");
 	rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY);
-	if (rc) /* Don't bail, might be E500, etc. which can't be reset */
-		dev_warn(&pdev->dev,
-			"failed waiting for board to reset\n");
+	if (rc) {
+		dev_warn(&pdev->dev, "Failed waiting for board to hard reset."
+				"  Will try soft reset.\n");
+		rc = -ENOTSUPP; /* Not expected, but try soft reset later */
+		goto unmap_cfgtable;
+	}
 	rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_READY);
 	if (rc) {
 		dev_warn(&pdev->dev,
-			"failed waiting for board to become ready\n");
+			"failed waiting for board to become ready "
+			"after hard reset\n");
 		goto unmap_cfgtable;
 	}
 
@@ -4605,16 +4690,13 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
 	if (rc < 0)
 		goto unmap_cfgtable;
 	if (rc) {
-		dev_warn(&pdev->dev, "Unable to successfully reset controller,"
-			" Ignoring controller.\n");
-		rc = -ENODEV;
-		goto unmap_cfgtable;
+		dev_warn(&pdev->dev, "Unable to successfully hard reset "
+			"controller. Will try soft reset.\n");
+		rc = -ENOTSUPP; /* Not expected, but try soft reset later */
 	} else {
-		dev_info(&pdev->dev, "board ready.\n");
+		dev_info(&pdev->dev, "Board ready after hard reset.\n");
 	}
 
-	dev_info(&pdev->dev, "board ready.\n");
-
 unmap_cfgtable:
 	iounmap(cfgtable);
 
@@ -4639,7 +4721,7 @@ static __devinit int cciss_init_reset_devices(struct pci_dev *pdev)
 	 * due to concerns about shared bbwc between 6402/6404 pair.
 	 */
 	if (rc == -ENOTSUPP)
-		return 0; /* just try to do the kdump anyhow. */
+		return rc; /* just try to do the kdump anyhow. */
 	if (rc)
 		return -ENODEV;
 
@@ -4745,6 +4827,60 @@ static int cciss_request_irq(ctlr_info_t *h,
 	return -1;
 }
 
+static int __devinit cciss_kdump_soft_reset(ctlr_info_t *h)
+{
+	if (cciss_send_reset(h, CTLR_LUNID, CCISS_RESET_TYPE_CONTROLLER)) {
+		dev_warn(&h->pdev->dev, "Resetting array controller failed.\n");
+		return -EIO;
+	}
+
+	dev_info(&h->pdev->dev, "Waiting for board to soft reset.\n");
+	if (cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_NOT_READY)) {
+		dev_warn(&h->pdev->dev, "Soft reset had no effect.\n");
+		return -1;
+	}
+
+	dev_info(&h->pdev->dev, "Board reset, awaiting READY status.\n");
+	if (cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_READY)) {
+		dev_warn(&h->pdev->dev, "Board failed to become ready "
+			"after soft reset.\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+static void cciss_undo_allocations_after_kdump_soft_reset(ctlr_info_t *h)
+{
+	int ctlr = h->ctlr;
+
+	free_irq(h->intr[PERF_MODE_INT], h);
+#ifdef CONFIG_PCI_MSI
+	if (h->msix_vector)
+		pci_disable_msix(h->pdev);
+	else if (h->msi_vector)
+		pci_disable_msi(h->pdev);
+#endif /* CONFIG_PCI_MSI */
+	cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds);
+	cciss_free_scatterlists(h);
+	cciss_free_cmd_pool(h);
+	kfree(h->blockFetchTable);
+	if (h->reply_pool)
+		pci_free_consistent(h->pdev, h->max_commands * sizeof(__u64),
+				h->reply_pool, h->reply_pool_dhandle);
+	if (h->transtable)
+		iounmap(h->transtable);
+	if (h->cfgtable)
+		iounmap(h->cfgtable);
+	if (h->vaddr)
+		iounmap(h->vaddr);
+	unregister_blkdev(h->major, h->devname);
+	cciss_destroy_hba_sysfs_entry(h);
+	pci_release_regions(h->pdev);
+	kfree(h);
+	hba[ctlr] = NULL;
+}
+
 /*
  *  This is it.  Find all the controllers and register them.  I really hate
  *  stealing all these major device numbers.
@@ -4756,13 +4892,27 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 	int i;
 	int j = 0;
 	int rc;
+	int try_soft_reset = 0;
 	int dac, return_code;
 	InquiryData_struct *inq_buff;
 	ctlr_info_t *h;
+	unsigned long flags;
 
 	rc = cciss_init_reset_devices(pdev);
-	if (rc)
-		return rc;
+	if (rc) {
+		if (rc != -ENOTSUPP)
+			return rc;
+		/* If the reset fails in a particular way (it has no way to do
+		 * a proper hard reset, so returns -ENOTSUPP) we can try to do
+		 * a soft reset once we get the controller configured up to the
+		 * point that it can accept a command.
+		 */
+		try_soft_reset = 1;
+		rc = 0;
+	}
+
+reinit_after_soft_reset:
+
 	i = alloc_cciss_hba(pdev);
 	if (i < 0)
 		return -1;
@@ -4852,6 +5002,62 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 		h->gendisk[j] = NULL;
 	}
 
+	/* At this point, the controller is ready to take commands.
+	 * Now, if reset_devices and the hard reset didn't work, try
+	 * the soft reset and see if that works.
+	 */
+	if (try_soft_reset) {
+
+		/* This is kind of gross.  We may or may not get a completion
+		 * from the soft reset command, and if we do, then the value
+		 * from the fifo may or may not be valid.  So, we wait 10 secs
+		 * after the reset throwing away any completions we get during
+		 * that time.  Unregister the interrupt handler and register
+		 * fake ones to scoop up any residual completions.
+		 */
+		spin_lock_irqsave(&h->lock, flags);
+		h->access.set_intr_mask(h, CCISS_INTR_OFF);
+		spin_unlock_irqrestore(&h->lock, flags);
+		free_irq(h->intr[PERF_MODE_INT], h);
+		rc = cciss_request_irq(h, cciss_msix_discard_completions,
+					cciss_intx_discard_completions);
+		if (rc) {
+			dev_warn(&h->pdev->dev, "Failed to request_irq after "
+				"soft reset.\n");
+			goto clean4;
+		}
+
+		rc = cciss_kdump_soft_reset(h);
+		if (rc) {
+			dev_warn(&h->pdev->dev, "Soft reset failed.\n");
+			goto clean4;
+		}
+
+		dev_info(&h->pdev->dev, "Board READY.\n");
+		dev_info(&h->pdev->dev,
+			"Waiting for stale completions to drain.\n");
+		h->access.set_intr_mask(h, CCISS_INTR_ON);
+		msleep(10000);
+		h->access.set_intr_mask(h, CCISS_INTR_OFF);
+
+		rc = controller_reset_failed(h->cfgtable);
+		if (rc)
+			dev_info(&h->pdev->dev,
+				"Soft reset appears to have failed.\n");
+
+		/* since the controller's reset, we have to go back and re-init
+		 * everything.  Easiest to just forget what we've done and do it
+		 * all over again.
+		 */
+		cciss_undo_allocations_after_kdump_soft_reset(h);
+		try_soft_reset = 0;
+		if (rc)
+			/* don't go to clean4, we already unallocated */
+			return -ENODEV;
+
+		goto reinit_after_soft_reset;
+	}
+
 	cciss_scsi_setup(h);
 
 	/* Turn the interrupts on so we can service requests */

next prev parent reply	other threads:[~2011-05-03 20:02 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-05-03 19:52 [PATCH 00/16] cciss: May 3, 2011 updates Stephen M. Cameron
2011-05-03 19:52 ` [PATCH 01/16] cciss: add readl after writel in interrupt mask setting code Stephen M. Cameron
2011-05-03 19:53 ` [PATCH 02/16] cciss: do a better job of detecting controller reset failure Stephen M. Cameron
2011-05-03 19:53 ` [PATCH 03/16] cciss: factor out command pool allocation functions Stephen M. Cameron
2011-05-03 19:53 ` [PATCH 04/16] cciss: factor out scatterlist " Stephen M. Cameron
2011-05-03 19:53 ` [PATCH 05/16] cciss: factor out irq request code Stephen M. Cameron
2011-05-03 19:53 ` [PATCH 06/16] cciss: fix reply pool and block fetch table memory leaks Stephen M. Cameron
2011-05-03 19:53 ` [PATCH 07/16] cciss: get rid of message related magic numbers Stephen M. Cameron
2011-05-03 19:53 ` [PATCH 08/16] cciss: increase time to wait for board reset to start Stephen M. Cameron
2011-05-03 19:53 ` [PATCH 09/16] cciss: clarify messages around reset behavior Stephen M. Cameron
2011-05-03 19:53 ` [PATCH 10/16] cciss: increase timeouts for post-reset no-ops Stephen M. Cameron
2011-05-03 19:53 ` [PATCH 11/16] cciss: use new doorbell-bit-5 reset method Stephen M. Cameron
2011-05-03 19:53 ` Stephen M. Cameron [this message]
2011-05-03 19:53 ` [PATCH 13/16] cciss: remove superfluous sleeps around reset code Stephen M. Cameron
2011-05-03 19:54 ` [PATCH 14/16] cciss: do not attempt PCI power management reset method if we know it won't work Stephen M. Cameron
2011-05-03 19:54 ` [PATCH 15/16] cciss: do not use bit 2 doorbell reset Stephen M. Cameron
2011-05-03 19:54 ` [PATCH 16/16] cciss: add cciss_tape_cmds module paramter Stephen M. Cameron
2011-05-06 14:29 ` [PATCH 00/16] cciss: May 3, 2011 updates Jens Axboe

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:df58c59 dfblob:23b0ba4 )
 OR (
bs:"[PATCH 12/16] cciss: do soft reset if hard reset is broken" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20110503195352.5154.5183.stgit@beardog.cce.hp.com \
    --to=scameron@beardog.cce.hp.com \
    --cc=akpm@linux-foundation.org \
    --cc=axboe@kernel.dk \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mikem@beardog.cce.hp.com \
    --cc=smcameron@yahoo.com \
    --cc=thenzl@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.