public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Sasha Levin <sashal@kernel.org>
To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
Cc: Ranjan Kumar <ranjan.kumar@broadcom.com>,
	Sumit Saxena <sumit.saxena@broadcom.com>,
	"Martin K . Petersen" <martin.petersen@oracle.com>,
	Sasha Levin <sashal@kernel.org>,
	sathya.prakash@broadcom.com, kashyap.desai@broadcom.com,
	sreekanth.reddy@broadcom.com,
	James.Bottomley@HansenPartnership.com,
	mpi3mr-linuxdrv.pdl@broadcom.com, linux-scsi@vger.kernel.org
Subject: [PATCH AUTOSEL 6.14 09/54] scsi: mpi3mr: Synchronous access b/w reset and tm thread for reply queue
Date: Thu,  3 Apr 2025 15:01:24 -0400	[thread overview]
Message-ID: <20250403190209.2675485-9-sashal@kernel.org> (raw)
In-Reply-To: <20250403190209.2675485-1-sashal@kernel.org>

From: Ranjan Kumar <ranjan.kumar@broadcom.com>

[ Upstream commit f195fc060c738d303a21fae146dbf85e1595fb4c ]

When the task management thread processes reply queues while the reset
thread resets them, the task management thread accesses an invalid queue ID
(0xFFFF), set by the reset thread, which points to unallocated memory,
causing a crash.

Add flag 'io_admin_reset_sync' to synchronize access between the reset,
I/O, and admin threads. Before a reset, the reset handler sets this flag to
block I/O and admin processing threads. If any thread bypasses the initial
check, the reset thread waits up to 10 seconds for processing to finish. If
the wait exceeds 10 seconds, the controller is marked as unrecoverable.

Signed-off-by: Sumit Saxena <sumit.saxena@broadcom.com>
Signed-off-by: Ranjan Kumar <ranjan.kumar@broadcom.com>
Link: https://lore.kernel.org/r/20250129100850.25430-4-ranjan.kumar@broadcom.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/scsi/mpi3mr/mpi3mr.h    |  2 +
 drivers/scsi/mpi3mr/mpi3mr_fw.c | 67 +++++++++++++++++++++++++++++++--
 2 files changed, 66 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/mpi3mr/mpi3mr.h b/drivers/scsi/mpi3mr/mpi3mr.h
index 9ed20ed581be6..6e3f337ace9f8 100644
--- a/drivers/scsi/mpi3mr/mpi3mr.h
+++ b/drivers/scsi/mpi3mr/mpi3mr.h
@@ -1096,6 +1096,7 @@ struct scmd_priv {
  * @ts_update_interval: Timestamp update interval
  * @reset_in_progress: Reset in progress flag
  * @unrecoverable: Controller unrecoverable flag
+ * @io_admin_reset_sync: Manage state of I/O ops during an admin reset process
  * @prev_reset_result: Result of previous reset
  * @reset_mutex: Controller reset mutex
  * @reset_waitq: Controller reset  wait queue
@@ -1284,6 +1285,7 @@ struct mpi3mr_ioc {
 	u16 ts_update_interval;
 	u8 reset_in_progress;
 	u8 unrecoverable;
+	u8 io_admin_reset_sync;
 	int prev_reset_result;
 	struct mutex reset_mutex;
 	wait_queue_head_t reset_waitq;
diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c
index 656108dd2ee30..ec5b1ab287177 100644
--- a/drivers/scsi/mpi3mr/mpi3mr_fw.c
+++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c
@@ -17,7 +17,7 @@ static void mpi3mr_process_factsdata(struct mpi3mr_ioc *mrioc,
 	struct mpi3_ioc_facts_data *facts_data);
 static void mpi3mr_pel_wait_complete(struct mpi3mr_ioc *mrioc,
 	struct mpi3mr_drv_cmd *drv_cmd);
-
+static int mpi3mr_check_op_admin_proc(struct mpi3mr_ioc *mrioc);
 static int poll_queues;
 module_param(poll_queues, int, 0444);
 MODULE_PARM_DESC(poll_queues, "Number of queues for io_uring poll mode. (Range 1 - 126)");
@@ -459,7 +459,7 @@ int mpi3mr_process_admin_reply_q(struct mpi3mr_ioc *mrioc)
 	}
 
 	do {
-		if (mrioc->unrecoverable)
+		if (mrioc->unrecoverable || mrioc->io_admin_reset_sync)
 			break;
 
 		mrioc->admin_req_ci = le16_to_cpu(reply_desc->request_queue_ci);
@@ -554,7 +554,7 @@ int mpi3mr_process_op_reply_q(struct mpi3mr_ioc *mrioc,
 	}
 
 	do {
-		if (mrioc->unrecoverable)
+		if (mrioc->unrecoverable || mrioc->io_admin_reset_sync)
 			break;
 
 		req_q_idx = le16_to_cpu(reply_desc->request_queue_id) - 1;
@@ -4394,6 +4394,7 @@ int mpi3mr_reinit_ioc(struct mpi3mr_ioc *mrioc, u8 is_resume)
 		goto out_failed_noretry;
 	}
 
+	mrioc->io_admin_reset_sync = 0;
 	if (is_resume || mrioc->block_on_pci_err) {
 		dprint_reset(mrioc, "setting up single ISR\n");
 		retval = mpi3mr_setup_isr(mrioc, 1);
@@ -5252,6 +5253,55 @@ void mpi3mr_pel_get_seqnum_complete(struct mpi3mr_ioc *mrioc,
 	drv_cmd->retry_count = 0;
 }
 
+/**
+ * mpi3mr_check_op_admin_proc -
+ * @mrioc: Adapter instance reference
+ *
+ * Check if any of the operation reply queues
+ * or the admin reply queue are currently in use.
+ * If any queue is in use, this function waits for
+ * a maximum of 10 seconds for them to become available.
+ *
+ * Return: 0 on success, non-zero on failure.
+ */
+static int mpi3mr_check_op_admin_proc(struct mpi3mr_ioc *mrioc)
+{
+
+	u16 timeout = 10 * 10;
+	u16 elapsed_time = 0;
+	bool op_admin_in_use = false;
+
+	do {
+		op_admin_in_use = false;
+
+		/* Check admin_reply queue first to exit early */
+		if (atomic_read(&mrioc->admin_reply_q_in_use) == 1)
+			op_admin_in_use = true;
+		else {
+			/* Check op_reply queues */
+			int i;
+
+			for (i = 0; i < mrioc->num_queues; i++) {
+				if (atomic_read(&mrioc->op_reply_qinfo[i].in_use) == 1) {
+					op_admin_in_use = true;
+					break;
+				}
+			}
+		}
+
+		if (!op_admin_in_use)
+			break;
+
+		msleep(100);
+
+	} while (++elapsed_time < timeout);
+
+	if (op_admin_in_use)
+		return 1;
+
+	return 0;
+}
+
 /**
  * mpi3mr_soft_reset_handler - Reset the controller
  * @mrioc: Adapter instance reference
@@ -5332,6 +5382,7 @@ int mpi3mr_soft_reset_handler(struct mpi3mr_ioc *mrioc,
 	mpi3mr_wait_for_host_io(mrioc, MPI3MR_RESET_HOST_IOWAIT_TIMEOUT);
 
 	mpi3mr_ioc_disable_intr(mrioc);
+	mrioc->io_admin_reset_sync = 1;
 
 	if (snapdump) {
 		mpi3mr_set_diagsave(mrioc);
@@ -5359,6 +5410,16 @@ int mpi3mr_soft_reset_handler(struct mpi3mr_ioc *mrioc,
 		ioc_err(mrioc, "Failed to issue soft reset to the ioc\n");
 		goto out;
 	}
+
+	retval = mpi3mr_check_op_admin_proc(mrioc);
+	if (retval) {
+		ioc_err(mrioc, "Soft reset failed due to an Admin or I/O queue polling\n"
+				"thread still processing replies even after a 10 second\n"
+				"timeout. Marking the controller as unrecoverable!\n");
+
+		goto out;
+	}
+
 	if (mrioc->num_io_throttle_group !=
 	    mrioc->facts.max_io_throttle_group) {
 		ioc_err(mrioc,
-- 
2.39.5


  parent reply	other threads:[~2025-04-03 19:02 UTC|newest]

Thread overview: 60+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-04-03 19:01 [PATCH AUTOSEL 6.14 01/54] wifi: ath9k: use unsigned long for activity check timestamp Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 02/54] wifi: ath11k: Fix DMA buffer allocation to resolve SWIOTLB issues Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 03/54] wifi: ath11k: fix memory leak in ath11k_xxx_remove() Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 04/54] wifi: ath12k: fix memory leak in ath12k_pci_remove() Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 05/54] wifi: ath12k: Fix invalid entry fetch in ath12k_dp_mon_srng_process Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 06/54] wifi: ath12k: Avoid memory leak while enabling statistics Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 07/54] ata: libata-core: Add 'external' to the libata.force kernel parameter Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 08/54] scsi: mpi3mr: Avoid reply queue full condition Sasha Levin
2025-04-03 19:01 ` Sasha Levin [this message]
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 10/54] net: page_pool: don't cast mp param to devmem Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 11/54] f2fs: don't retry IO for corrupted data scenario Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 12/54] wifi: mac80211: add strict mode disabling workarounds Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 13/54] wifi: mac80211: ensure sdata->work is canceled before initialized Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 14/54] scsi: target: spc: Fix RSOC parameter data header size Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 15/54] net: usb: asix_devices: add FiberGecko DeviceID Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 16/54] page_pool: avoid infinite loop to schedule delayed worker Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 17/54] can: flexcan: Add quirk to handle separate interrupt lines for mailboxes Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 18/54] can: flexcan: add NXP S32G2/S32G3 SoC support Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 19/54] jfs: Fix uninit-value access of imap allocated in the diMount() function Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 20/54] mptcp: move the whole rx path under msk socket lock protection Sasha Levin
2025-04-10 11:05   ` Matthieu Baerts
2025-04-14  0:12     ` Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 21/54] fs/jfs: cast inactags to s64 to prevent potential overflow Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 22/54] fs/jfs: Prevent integer overflow in AG size calculation Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 23/54] jfs: Prevent copying of nlink with value 0 from disk inode Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 24/54] jfs: add sanity check for agwidth in dbMount Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 25/54] wifi: rtw88: Add support for Mercusys MA30N and D-Link DWA-T185 rev. A1 Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 26/54] ata: libata-eh: Do not use ATAPI DMA for a device limited to PIO mode Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 27/54] net: sfp: add quirk for 2.5G OEM BX SFP Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 28/54] wifi: ath12k: Fix invalid data access in ath12k_dp_rx_h_undecap_nwifi Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 29/54] null_blk: replace null_process_cmd() call in null_zone_write() Sasha Levin
2025-04-04  3:31   ` Shinichiro Kawasaki
2025-04-14  0:12     ` Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 30/54] f2fs: fix to avoid out-of-bounds access in f2fs_truncate_inode_blocks() Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 31/54] net: sfp: add quirk for FS SFP-10GM-T copper SFP+ module Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 32/54] ahci: add PCI ID for Marvell 88SE9215 SATA Controller Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 33/54] ext4: protect ext4_release_dquot against freezing Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 34/54] Revert "f2fs: rebuild nat_bits during umount" Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 35/54] wifi: mac80211: fix userspace_selectors corruption Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 36/54] ext4: ignore xattrs past end Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 37/54] cdc_ether|r8152: ThinkPad Hybrid USB-C/A Dock quirk Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 38/54] scsi: st: Fix array overflow in st_setup() Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 39/54] ahci: Marvell 88SE9215 controllers prefer DMA for ATAPI Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 40/54] btrfs: reject out-of-band dirty folios during writeback Sasha Levin
2025-04-03 19:37   ` David Sterba
2025-04-14  0:11     ` Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 41/54] btrfs: harden block_group::bg_list against list_del() races Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 42/54] wifi: mt76: mt76x2u: add TP-Link TL-WDN6200 ID to device table Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 43/54] net: vlan: don't propagate flags on open Sasha Levin
2025-04-03 19:01 ` [PATCH AUTOSEL 6.14 44/54] tracing: fix return value in __ftrace_event_enable_disable for TRACE_REG_UNREGISTER Sasha Levin
2025-04-03 19:02 ` [PATCH AUTOSEL 6.14 45/54] Bluetooth: btusb: Add new VID/PID for WCN785x Sasha Levin
2025-04-03 19:02 ` [PATCH AUTOSEL 6.14 46/54] Bluetooth: btintel_pcie: Add device id of Whale Peak Sasha Levin
2025-04-03 19:02 ` [PATCH AUTOSEL 6.14 47/54] Bluetooth: btusb: Add 13 USB device IDs for Qualcomm WCN785x Sasha Levin
2025-04-03 19:02 ` [PATCH AUTOSEL 6.14 48/54] Bluetooth: hci_uart: fix race during initialization Sasha Levin
2025-04-03 19:02 ` [PATCH AUTOSEL 6.14 49/54] Bluetooth: btusb: Add 2 HWIDs for MT7922 Sasha Levin
2025-04-03 19:02 ` [PATCH AUTOSEL 6.14 50/54] Bluetooth: hci_qca: use the power sequencer for wcn6750 Sasha Levin
2025-04-03 19:02 ` [PATCH AUTOSEL 6.14 51/54] Bluetooth: qca: simplify WCN399x NVM loading Sasha Levin
2025-04-03 19:02 ` [PATCH AUTOSEL 6.14 52/54] Bluetooth: qca: add WCN3950 support Sasha Levin
2025-04-03 19:02 ` [PATCH AUTOSEL 6.14 53/54] Bluetooth: Add quirk for broken READ_VOICE_SETTING Sasha Levin
2025-04-03 19:02 ` [PATCH AUTOSEL 6.14 54/54] Bluetooth: Add quirk for broken READ_PAGE_SCAN_TYPE Sasha Levin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250403190209.2675485-9-sashal@kernel.org \
    --to=sashal@kernel.org \
    --cc=James.Bottomley@HansenPartnership.com \
    --cc=kashyap.desai@broadcom.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-scsi@vger.kernel.org \
    --cc=martin.petersen@oracle.com \
    --cc=mpi3mr-linuxdrv.pdl@broadcom.com \
    --cc=ranjan.kumar@broadcom.com \
    --cc=sathya.prakash@broadcom.com \
    --cc=sreekanth.reddy@broadcom.com \
    --cc=stable@vger.kernel.org \
    --cc=sumit.saxena@broadcom.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox