linux-pci.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v1] Bluetooth: btintel_pcie: Support function level reset
@ 2025-03-14 10:16 Chandrashekar Devegowda
  2025-03-14 19:40 ` Bjorn Helgaas
  0 siblings, 1 reply; 11+ messages in thread
From: Chandrashekar Devegowda @ 2025-03-14 10:16 UTC (permalink / raw)
  To: linux-bluetooth
  Cc: linux-pci, bhelgaas, ravishankar.srivatsa, chethan.tumkur.narayan,
	Chandrashekar Devegowda, Kiran K

Support function level reset (flr) on hardware exception to recover
controller. Driver also implements the back-off time of 5 seconds and
the maximum number of retries are limited to 5 before giving up.

The function btintel_acpi_reset_method() is made public and reused to
invoke flr for pcie products.

Signed-off-by: Chandrashekar Devegowda <chandrashekar.devegowda@intel.com>
Signed-off-by: Kiran K <kiran.k@intel.com>
---
 drivers/bluetooth/btintel.c      |  11 +-
 drivers/bluetooth/btintel.h      |  12 ++
 drivers/bluetooth/btintel_pcie.c | 246 ++++++++++++++++++++++++++++++-
 3 files changed, 260 insertions(+), 9 deletions(-)

diff --git a/drivers/bluetooth/btintel.c b/drivers/bluetooth/btintel.c
index ec5e2c7a56ae..aae6a85d9f64 100644
--- a/drivers/bluetooth/btintel.c
+++ b/drivers/bluetooth/btintel.c
@@ -30,11 +30,6 @@
 
 #define BTINTEL_EFI_DSBR	L"UefiCnvCommonDSBR"
 
-enum {
-	DSM_SET_WDISABLE2_DELAY = 1,
-	DSM_SET_RESET_METHOD = 3,
-};
-
 #define CMD_WRITE_BOOT_PARAMS	0xfc0e
 struct cmd_write_boot_params {
 	__le32 boot_addr;
@@ -49,9 +44,10 @@ static struct {
 	u32        fw_build_num;
 } coredump_info;
 
-static const guid_t btintel_guid_dsm =
+const guid_t btintel_guid_dsm =
 	GUID_INIT(0xaa10f4e0, 0x81ac, 0x4233,
 		  0xab, 0xf6, 0x3b, 0x2a, 0xc5, 0x0e, 0x28, 0xd9);
+EXPORT_SYMBOL_GPL(btintel_guid_dsm);
 
 int btintel_check_bdaddr(struct hci_dev *hdev)
 {
@@ -2547,7 +2543,7 @@ static void btintel_set_ppag(struct hci_dev *hdev, struct intel_version_tlv *ver
 	kfree_skb(skb);
 }
 
-static int btintel_acpi_reset_method(struct hci_dev *hdev)
+int btintel_acpi_reset_method(struct hci_dev *hdev)
 {
 	int ret = 0;
 	acpi_status status;
@@ -2586,6 +2582,7 @@ static int btintel_acpi_reset_method(struct hci_dev *hdev)
 	kfree(buffer.pointer);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(btintel_acpi_reset_method);
 
 static void btintel_set_dsm_reset_method(struct hci_dev *hdev,
 					 struct intel_version_tlv *ver_tlv)
diff --git a/drivers/bluetooth/btintel.h b/drivers/bluetooth/btintel.h
index 19530ea14905..bc4edfe08216 100644
--- a/drivers/bluetooth/btintel.h
+++ b/drivers/bluetooth/btintel.h
@@ -64,6 +64,12 @@ struct intel_tlv {
 
 #define BTINTEL_FWID_MAXLEN 64
 
+enum {
+	DSM_SET_WDISABLE2_DELAY = 1,
+	DSM_SET_RESET_METHOD = 3,
+	DSM_SET_RESET_METHOD_PCIE = 5 /* Used for PCIe products */
+};
+
 struct intel_version_tlv {
 	u32	cnvi_top;
 	u32	cnvr_top;
@@ -255,6 +261,7 @@ int btintel_shutdown_combined(struct hci_dev *hdev);
 void btintel_hw_error(struct hci_dev *hdev, u8 code);
 void btintel_print_fseq_info(struct hci_dev *hdev);
 int btintel_diagnostics(struct hci_dev *hdev, struct sk_buff *skb);
+int btintel_acpi_reset_method(struct hci_dev *hdev);
 #else
 
 static inline int btintel_check_bdaddr(struct hci_dev *hdev)
@@ -393,4 +400,9 @@ static inline int btintel_diagnostics(struct hci_dev *hdev, struct sk_buff *skb)
 {
 	return -EOPNOTSUPP;
 }
+
+static int btintel_acpi_reset_method(struct hci_dev *hdev)
+{
+	return -EOPNOTSUPP;
+}
 #endif
diff --git a/drivers/bluetooth/btintel_pcie.c b/drivers/bluetooth/btintel_pcie.c
index 9114be1fc3ce..9346c8d68636 100644
--- a/drivers/bluetooth/btintel_pcie.c
+++ b/drivers/bluetooth/btintel_pcie.c
@@ -13,6 +13,7 @@
 #include <linux/wait.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
+#include <linux/acpi.h>
 
 #include <linux/unaligned.h>
 
@@ -22,6 +23,8 @@
 #include "btintel.h"
 #include "btintel_pcie.h"
 
+extern const guid_t btintel_guid_dsm;
+
 #define VERSION "0.1"
 
 #define BTINTEL_PCI_DEVICE(dev, subdev)	\
@@ -41,6 +44,13 @@ static const struct pci_device_id btintel_pcie_table[] = {
 };
 MODULE_DEVICE_TABLE(pci, btintel_pcie_table);
 
+struct btintel_pcie_dev_restart_data {
+	struct list_head list;
+	u8 restart_count;
+	time64_t last_error;
+	char name[];
+};
+
 /* Intel PCIe uses 4 bytes of HCI type instead of 1 byte BT SIG HCI type */
 #define BTINTEL_PCIE_HCI_TYPE_LEN	4
 #define BTINTEL_PCIE_HCI_CMD_PKT	0x00000001
@@ -62,6 +72,9 @@ MODULE_DEVICE_TABLE(pci, btintel_pcie_table);
 #define BTINTEL_PCIE_TRIGGER_REASON_USER_TRIGGER	0x17A2
 #define BTINTEL_PCIE_TRIGGER_REASON_FW_ASSERT		0x1E61
 
+#define BTINTEL_PCIE_RESET_OK_TIME_SECS		5
+#define BTINTEL_PCIE_FLR_RESET_MAX_RETRY	5
+
 /* Alive interrupt context */
 enum {
 	BTINTEL_PCIE_ROM,
@@ -99,6 +112,14 @@ struct btintel_pcie_dbgc_ctxt {
 	struct btintel_pcie_dbgc_ctxt_buf bufs[BTINTEL_PCIE_DBGC_BUFFER_COUNT];
 };
 
+struct btintel_pcie_removal {
+	struct pci_dev *pdev;
+	struct work_struct work;
+};
+
+static LIST_HEAD(btintel_pcie_restart_data_list);
+static DEFINE_SPINLOCK(btintel_pcie_restart_data_lock);
+
 /* This function initializes the memory for DBGC buffers and formats the
  * DBGC fragment which consists header info and DBGC buffer's LSB, MSB and
  * size as the payload
@@ -1804,6 +1825,9 @@ static int btintel_pcie_send_frame(struct hci_dev *hdev,
 	u32 type;
 	u32 old_ctxt;
 
+	if (test_bit(BTINTEL_PCIE_CORE_HALTED, &data->flags))
+		return -ENODEV;
+
 	/* Due to the fw limitation, the type header of the packet should be
 	 * 4 bytes unlike 1 byte for UART. In UART, the firmware can read
 	 * the first byte to get the packet type and redirect the rest of data
@@ -2041,6 +2065,210 @@ static int btintel_pcie_setup(struct hci_dev *hdev)
 	return err;
 }
 
+static struct btintel_pcie_dev_restart_data *btintel_pcie_get_restart_data(struct pci_dev *pdev,
+									   struct device *dev)
+{
+	struct btintel_pcie_dev_restart_data *tmp, *data = NULL;
+	const char *name = pci_name(pdev);
+	struct hci_dev *hdev = to_hci_dev(dev);
+
+	spin_lock(&btintel_pcie_restart_data_lock);
+	list_for_each_entry(tmp, &btintel_pcie_restart_data_list, list) {
+		if (strcmp(tmp->name, name))
+			continue;
+		data = tmp;
+		break;
+	}
+	spin_unlock(&btintel_pcie_restart_data_lock);
+
+	if (data) {
+		bt_dev_dbg(hdev, "Found restart data for BDF:%s", data->name);
+		return data;
+	}
+
+	/* First time allocate */
+	data = kzalloc(struct_size(data, name, strlen(name) + 1), GFP_ATOMIC);
+	if (!data)
+		return NULL;
+
+	strscpy(data->name, name, strlen(data->name) + 1);
+	spin_lock(&btintel_pcie_restart_data_lock);
+	list_add_tail(&data->list, &btintel_pcie_restart_data_list);
+	spin_unlock(&btintel_pcie_restart_data_lock);
+
+	return data;
+}
+
+static void btintel_pcie_free_restart_list(void)
+{
+	struct btintel_pcie_dev_restart_data *tmp;
+
+	while ((tmp = list_first_entry_or_null(&btintel_pcie_restart_data_list,
+					       typeof(*tmp), list))) {
+		list_del(&tmp->list);
+		kfree(tmp);
+	}
+}
+
+static void btintel_pcie_inc_restart_count(struct pci_dev *pdev, struct device *dev)
+{
+	struct btintel_pcie_dev_restart_data *data;
+	struct hci_dev *hdev = to_hci_dev(dev);
+	time64_t retry_window;
+
+	data = btintel_pcie_get_restart_data(pdev, dev);
+	if (!data)
+		return;
+
+	retry_window = ktime_get_boottime_seconds() - data->last_error;
+	if (data->restart_count == 0) {
+		/* First iteration initialise the time and counter */
+		data->last_error = ktime_get_boottime_seconds();
+		data->restart_count++;
+		bt_dev_dbg(hdev, "First iteration initialise. last_error:%lld seconds restart_count:%d",
+			   data->last_error, data->restart_count);
+	} else if (retry_window < BTINTEL_PCIE_RESET_OK_TIME_SECS &&
+		   data->restart_count <= BTINTEL_PCIE_FLR_RESET_MAX_RETRY) {
+		/* FLR triggered still within the Max retry time so
+		 * increment the counter
+		 */
+		data->restart_count++;
+		bt_dev_err(hdev, "FLR triggered still within the Max retry time so increment the restart_count:%d",
+			   data->restart_count);
+	} else if (retry_window > BTINTEL_PCIE_RESET_OK_TIME_SECS) {
+		/* FLR triggered out of the retry window so reset */
+		bt_dev_err(hdev, "FLR triggered OUT of the retry window so RESET counters last_error:%lld seconds restart_count:%d",
+			   data->last_error, data->restart_count);
+		data->last_error = 0;
+		data->restart_count = 0;
+	}
+}
+
+static void btintel_pcie_removal_work(struct work_struct *wk)
+{
+	struct btintel_pcie_removal *removal =
+		container_of(wk, struct btintel_pcie_removal, work);
+	struct pci_dev *pdev = removal->pdev;
+	struct pci_bus *bus;
+	struct btintel_pcie_data *data;
+
+	data = pci_get_drvdata(pdev);
+
+	pci_lock_rescan_remove();
+
+	bus = pdev->bus;
+	if (!bus)
+		goto out;
+
+	btintel_acpi_reset_method(data->hdev);
+	pci_stop_and_remove_bus_device(pdev);
+	pci_dev_put(pdev);
+
+	if (bus->parent)
+		bus = bus->parent;
+	pci_rescan_bus(bus);
+
+out:
+	pci_unlock_rescan_remove();
+	kfree(removal);
+}
+
+static void btintel_pcie_reset(struct hci_dev *hdev)
+{
+	struct btintel_pcie_removal *removal;
+	struct btintel_pcie_data *data;
+	union acpi_object *obj, argv4;
+	acpi_handle handle;
+	struct pldr_mode {
+		u16	cmd_type;
+		u16	cmd_payload;
+	} __packed mode;
+
+	data = hci_get_drvdata(hdev);
+	if (!btintel_test_flag(hdev, INTEL_FIRMWARE_LOADED)) {
+		bt_dev_dbg(hdev, "Firmware not loaded, so reset not supported");
+		return;
+	}
+
+	handle = ACPI_HANDLE(GET_HCIDEV_DEV(hdev));
+	if (!handle) {
+		bt_dev_dbg(hdev, "No support for bluetooth device in ACPI firmware");
+		return;
+	}
+
+	if (!acpi_has_method(handle, "_PRR")) {
+		bt_dev_err(hdev, "No support for _PRR ACPI method");
+		return;
+	}
+
+	if (!acpi_check_dsm(handle, &btintel_guid_dsm, 0,
+			    BIT(DSM_SET_RESET_METHOD_PCIE))) {
+		bt_dev_err(hdev, "No dsm support to set reset method for _PRR");
+		return;
+	}
+
+	/* set 1 for_PRR Mode */
+	mode.cmd_type = 1;
+	/* 0 – BT Core Reset (Default)
+	 * 1 – Product Reset (PLDR Abort flow)
+	 */
+	mode.cmd_payload = 0;
+
+	argv4.buffer.type = ACPI_TYPE_BUFFER;
+	argv4.buffer.length = sizeof(mode);
+	argv4.buffer.pointer = (void *)&mode;
+
+	obj = acpi_evaluate_dsm(handle, &btintel_guid_dsm, 0,
+				DSM_SET_RESET_METHOD_PCIE, &argv4);
+	if (!obj) {
+		bt_dev_err(hdev, "Failed to call dsm to set reset method");
+		return;
+	}
+	ACPI_FREE(obj);
+
+	removal = kzalloc(sizeof(*removal), GFP_ATOMIC);
+	if (!removal)
+		return;
+
+	removal->pdev = data->pdev;
+	INIT_WORK(&removal->work, btintel_pcie_removal_work);
+	pci_dev_get(removal->pdev);
+	schedule_work(&removal->work);
+}
+
+static void btintel_pcie_hw_error(struct hci_dev *hdev, u8 code)
+{
+	struct  btintel_pcie_dev_restart_data *data;
+	struct btintel_pcie_data *dev_data = hci_get_drvdata(hdev);
+	struct pci_dev *pdev = dev_data->pdev;
+	time64_t retry_window;
+
+	if (code == 0x13) {
+		bt_dev_err(hdev, "Encountered top exception");
+		return;
+	}
+
+	data = btintel_pcie_get_restart_data(pdev, &hdev->dev);
+	if (!data)
+		return;
+
+	retry_window = ktime_get_boottime_seconds() - data->last_error;
+
+	/* If within 5 seconds max 5 attempts have already been made
+	 * then stop any more retry and indicate to user for cold boot
+	 */
+	if (retry_window < BTINTEL_PCIE_RESET_OK_TIME_SECS &&
+	    data->restart_count >= BTINTEL_PCIE_FLR_RESET_MAX_RETRY) {
+		bt_dev_err(hdev, "Recovery options exhausted, Please cold boot!");
+		bt_dev_err(hdev, "Boot time:%lld seconds first_flr at:%lld seconds restart_count:%d",
+			   ktime_get_boottime_seconds(), data->last_error,
+			   data->restart_count);
+		return;
+	}
+	btintel_pcie_inc_restart_count(pdev, &hdev->dev);
+	btintel_pcie_reset(hdev);
+}
+
 static int btintel_pcie_setup_hdev(struct btintel_pcie_data *data)
 {
 	int err;
@@ -2062,9 +2290,10 @@ static int btintel_pcie_setup_hdev(struct btintel_pcie_data *data)
 	hdev->send = btintel_pcie_send_frame;
 	hdev->setup = btintel_pcie_setup;
 	hdev->shutdown = btintel_shutdown_combined;
-	hdev->hw_error = btintel_hw_error;
+	hdev->hw_error = btintel_pcie_hw_error;
 	hdev->set_diag = btintel_set_diag;
 	hdev->set_bdaddr = btintel_set_bdaddr;
+	hdev->reset = btintel_pcie_reset;
 
 	err = hci_register_dev(hdev);
 	if (err < 0) {
@@ -2208,7 +2437,20 @@ static struct pci_driver btintel_pcie_driver = {
 	.driver.coredump = btintel_pcie_coredump
 #endif
 };
-module_pci_driver(btintel_pcie_driver);
+
+static int __init btintel_pcie_init(void)
+{
+	return pci_register_driver(&btintel_pcie_driver);
+}
+
+static void __exit btintel_pcie_exit(void)
+{
+	pci_unregister_driver(&btintel_pcie_driver);
+	btintel_pcie_free_restart_list();
+}
+
+module_init(btintel_pcie_init);
+module_exit(btintel_pcie_exit);
 
 MODULE_AUTHOR("Tedd Ho-Jeong An <tedd.an@intel.com>");
 MODULE_DESCRIPTION("Intel Bluetooth PCIe transport driver ver " VERSION);
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v1] Bluetooth: btintel_pcie: Support function level reset
  2025-03-14 10:16 [PATCH v1] Bluetooth: btintel_pcie: Support function level reset Chandrashekar Devegowda
@ 2025-03-14 19:40 ` Bjorn Helgaas
  2025-03-18 14:55   ` Luiz Augusto von Dentz
  0 siblings, 1 reply; 11+ messages in thread
From: Bjorn Helgaas @ 2025-03-14 19:40 UTC (permalink / raw)
  To: Chandrashekar Devegowda
  Cc: linux-bluetooth, linux-pci, bhelgaas, ravishankar.srivatsa,
	chethan.tumkur.narayan, Kiran K

On Fri, Mar 14, 2025 at 12:16:13PM +0200, Chandrashekar Devegowda wrote:
> Support function level reset (flr) on hardware exception to recover
> controller. Driver also implements the back-off time of 5 seconds and
> the maximum number of retries are limited to 5 before giving up.

Sort of weird that the commit log mentions FLR, but it's not mentioned
in the patch itself except for BTINTEL_PCIE_FLR_RESET_MAX_RETRY.
Apparently the assumption is that DSM_SET_RESET_METHOD_PCIE performs
an FLR.

Since this is an ACPI _DSM, presumably this mechanism only works for
devices built into the platform, not for any potential plug-in devices
that would not be described via ACPI.  I guess this driver probably
already only works for built-in devices because it also uses
DSM_SET_WDISABLE2_DELAY and DSM_SET_RESET_METHOD.

There is a generic PCI core way to do FLR (pcie_reset_flr()), so I
assume the _DSM exists because the device needs some additional
device-specific work around the FLR.

> +static void btintel_pcie_removal_work(struct work_struct *wk)
> +{
> +	struct btintel_pcie_removal *removal =
> +		container_of(wk, struct btintel_pcie_removal, work);
> +	struct pci_dev *pdev = removal->pdev;
> +	struct pci_bus *bus;
> +	struct btintel_pcie_data *data;
> +
> +	data = pci_get_drvdata(pdev);
> +
> +	pci_lock_rescan_remove();
> +
> +	bus = pdev->bus;
> +	if (!bus)
> +		goto out;
> +
> +	btintel_acpi_reset_method(data->hdev);
> +	pci_stop_and_remove_bus_device(pdev);
> +	pci_dev_put(pdev);
> +
> +	if (bus->parent)
> +		bus = bus->parent;
> +	pci_rescan_bus(bus);

This remove and rescan by a driver that's bound to the device subverts
the driver model.  pci_stop_and_remove_bus_device() detaches the
driver from the device.  After the driver is detached, we should not
be running any driver code.

There are a couple other drivers that remove their own device (ath9k,
iwlwifi, asus_wmi, eeepc-laptop), but I think those are broken and
it's a mistake to add this pattern to more drivers.

What's the reason for doing the remove and rescan?  The PCI core
doesn't reset the device when you do this, so it's not a "bigger
hammer reset".

> +out:
> +	pci_unlock_rescan_remove();
> +	kfree(removal);
> +}

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v1] Bluetooth: btintel_pcie: Support function level reset
  2025-03-14 19:40 ` Bjorn Helgaas
@ 2025-03-18 14:55   ` Luiz Augusto von Dentz
  2025-03-18 15:47     ` Bjorn Helgaas
  0 siblings, 1 reply; 11+ messages in thread
From: Luiz Augusto von Dentz @ 2025-03-18 14:55 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Chandrashekar Devegowda, linux-bluetooth, linux-pci, bhelgaas,
	ravishankar.srivatsa, chethan.tumkur.narayan, Kiran K

Hi Bjorn,

On Fri, Mar 14, 2025 at 3:40 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> On Fri, Mar 14, 2025 at 12:16:13PM +0200, Chandrashekar Devegowda wrote:
> > Support function level reset (flr) on hardware exception to recover
> > controller. Driver also implements the back-off time of 5 seconds and
> > the maximum number of retries are limited to 5 before giving up.
>
> Sort of weird that the commit log mentions FLR, but it's not mentioned
> in the patch itself except for BTINTEL_PCIE_FLR_RESET_MAX_RETRY.
> Apparently the assumption is that DSM_SET_RESET_METHOD_PCIE performs
> an FLR.
>
> Since this is an ACPI _DSM, presumably this mechanism only works for
> devices built into the platform, not for any potential plug-in devices
> that would not be described via ACPI.  I guess this driver probably
> already only works for built-in devices because it also uses
> DSM_SET_WDISABLE2_DELAY and DSM_SET_RESET_METHOD.
>
> There is a generic PCI core way to do FLR (pcie_reset_flr()), so I
> assume the _DSM exists because the device needs some additional
> device-specific work around the FLR.
>
> > +static void btintel_pcie_removal_work(struct work_struct *wk)
> > +{
> > +     struct btintel_pcie_removal *removal =
> > +             container_of(wk, struct btintel_pcie_removal, work);
> > +     struct pci_dev *pdev = removal->pdev;
> > +     struct pci_bus *bus;
> > +     struct btintel_pcie_data *data;
> > +
> > +     data = pci_get_drvdata(pdev);
> > +
> > +     pci_lock_rescan_remove();
> > +
> > +     bus = pdev->bus;
> > +     if (!bus)
> > +             goto out;
> > +
> > +     btintel_acpi_reset_method(data->hdev);
> > +     pci_stop_and_remove_bus_device(pdev);
> > +     pci_dev_put(pdev);
> > +
> > +     if (bus->parent)
> > +             bus = bus->parent;
> > +     pci_rescan_bus(bus);
>
> This remove and rescan by a driver that's bound to the device subverts
> the driver model.  pci_stop_and_remove_bus_device() detaches the
> driver from the device.  After the driver is detached, we should not
> be running any driver code.

Yeah, this self removal was sort of bugging me as well, although I'm
not familiar enough with the pci subsystem, having the driver remove
and continue running code like pci_rescan_bus seems wrong as we may
end up with multiple instances of the same driver.

> There are a couple other drivers that remove their own device (ath9k,
> iwlwifi, asus_wmi, eeepc-laptop), but I think those are broken and
> it's a mistake to add this pattern to more drivers.
>
> What's the reason for doing the remove and rescan?  The PCI core
> doesn't reset the device when you do this, so it's not a "bigger
> hammer reset".

I guess it was more of the expectation of Chandru to have a sort of
hard reset, driver remove+probe, instead of a soft reset where the
driver will just need to reinit itself after performing
pcie_reset_flr.

> > +out:
> > +     pci_unlock_rescan_remove();
> > +     kfree(removal);
> > +}
>


-- 
Luiz Augusto von Dentz

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v1] Bluetooth: btintel_pcie: Support function level reset
  2025-03-18 14:55   ` Luiz Augusto von Dentz
@ 2025-03-18 15:47     ` Bjorn Helgaas
  2025-05-25 10:30       ` K, Kiran
  2025-10-23  9:42       ` Devegowda, Chandrashekar
  0 siblings, 2 replies; 11+ messages in thread
From: Bjorn Helgaas @ 2025-03-18 15:47 UTC (permalink / raw)
  To: Luiz Augusto von Dentz
  Cc: Chandrashekar Devegowda, linux-bluetooth, linux-pci, bhelgaas,
	ravishankar.srivatsa, chethan.tumkur.narayan, Kiran K

On Tue, Mar 18, 2025 at 10:55:06AM -0400, Luiz Augusto von Dentz wrote:
> On Fri, Mar 14, 2025 at 3:40 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > On Fri, Mar 14, 2025 at 12:16:13PM +0200, Chandrashekar Devegowda wrote:
> > > Support function level reset (flr) on hardware exception to recover
> > > controller. Driver also implements the back-off time of 5 seconds and
> > > the maximum number of retries are limited to 5 before giving up.
> >
> > Sort of weird that the commit log mentions FLR, but it's not mentioned
> > in the patch itself except for BTINTEL_PCIE_FLR_RESET_MAX_RETRY.
> > Apparently the assumption is that DSM_SET_RESET_METHOD_PCIE performs
> > an FLR.
> >
> > Since this is an ACPI _DSM, presumably this mechanism only works for
> > devices built into the platform, not for any potential plug-in devices
> > that would not be described via ACPI.  I guess this driver probably
> > already only works for built-in devices because it also uses
> > DSM_SET_WDISABLE2_DELAY and DSM_SET_RESET_METHOD.
> >
> > There is a generic PCI core way to do FLR (pcie_reset_flr()), so I
> > assume the _DSM exists because the device needs some additional
> > device-specific work around the FLR.
> >
> > > +static void btintel_pcie_removal_work(struct work_struct *wk)
> > > +{
> > > +     struct btintel_pcie_removal *removal =
> > > +             container_of(wk, struct btintel_pcie_removal, work);
> > > +     struct pci_dev *pdev = removal->pdev;
> > > +     struct pci_bus *bus;
> > > +     struct btintel_pcie_data *data;
> > > +
> > > +     data = pci_get_drvdata(pdev);
> > > +
> > > +     pci_lock_rescan_remove();
> > > +
> > > +     bus = pdev->bus;
> > > +     if (!bus)
> > > +             goto out;
> > > +
> > > +     btintel_acpi_reset_method(data->hdev);
> > > +     pci_stop_and_remove_bus_device(pdev);
> > > +     pci_dev_put(pdev);
> > > +
> > > +     if (bus->parent)
> > > +             bus = bus->parent;
> > > +     pci_rescan_bus(bus);
> >
> > This remove and rescan by a driver that's bound to the device subverts
> > the driver model.  pci_stop_and_remove_bus_device() detaches the
> > driver from the device.  After the driver is detached, we should not
> > be running any driver code.
> 
> Yeah, this self removal was sort of bugging me as well, although I'm
> not familiar enough with the pci subsystem, having the driver remove
> and continue running code like pci_rescan_bus seems wrong as we may
> end up with multiple instances of the same driver.
> 
> > There are a couple other drivers that remove their own device (ath9k,
> > iwlwifi, asus_wmi, eeepc-laptop), but I think those are broken and
> > it's a mistake to add this pattern to more drivers.
> >
> > What's the reason for doing the remove and rescan?  The PCI core
> > doesn't reset the device when you do this, so it's not a "bigger
> > hammer reset".
> 
> I guess it was more of the expectation of Chandru to have a sort of
> hard reset, driver remove+probe, instead of a soft reset where the
> driver will just need to reinit itself after performing
> pcie_reset_flr.

If the object is just to reinitialize the driver, I think this hack of
removing and rescanning is a bad way to do it.  If you reset the
device, you now know the state of the device and you can make the
driver state match it.  If necessary you can always reuse part or all
of the .remove() and .probe() methods yourself, without this dance of
calling pci_stop_and_remove_bus_device() and pci_rescan_bus().

Bjorn

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [PATCH v1] Bluetooth: btintel_pcie: Support function level reset
  2025-03-18 15:47     ` Bjorn Helgaas
@ 2025-05-25 10:30       ` K, Kiran
  2025-10-23  9:42       ` Devegowda, Chandrashekar
  1 sibling, 0 replies; 11+ messages in thread
From: K, Kiran @ 2025-05-25 10:30 UTC (permalink / raw)
  To: Bjorn Helgaas, Luiz Augusto von Dentz
  Cc: Devegowda, Chandrashekar, linux-bluetooth@vger.kernel.org,
	linux-pci@vger.kernel.org, bhelgaas@google.com,
	Srivatsa, Ravishankar, Tumkur Narayan, Chethan

Hi Bjorn, Luiz,

Thanks for the comments. I have published v2 version of the patch addressing the comments.

>Subject: Re: [PATCH v1] Bluetooth: btintel_pcie: Support function level reset
>
>On Tue, Mar 18, 2025 at 10:55:06AM -0400, Luiz Augusto von Dentz wrote:
>> On Fri, Mar 14, 2025 at 3:40 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
>> > On Fri, Mar 14, 2025 at 12:16:13PM +0200, Chandrashekar Devegowda
>wrote:
>> > > Support function level reset (flr) on hardware exception to
>> > > recover controller. Driver also implements the back-off time of 5
>> > > seconds and the maximum number of retries are limited to 5 before
>giving up.
>> >
>> > Sort of weird that the commit log mentions FLR, but it's not
>> > mentioned in the patch itself except for
>BTINTEL_PCIE_FLR_RESET_MAX_RETRY.
>> > Apparently the assumption is that DSM_SET_RESET_METHOD_PCIE
>performs
>> > an FLR.
>> >
>> > Since this is an ACPI _DSM, presumably this mechanism only works for
>> > devices built into the platform, not for any potential plug-in
>> > devices that would not be described via ACPI.  I guess this driver
>> > probably already only works for built-in devices because it also
>> > uses DSM_SET_WDISABLE2_DELAY and DSM_SET_RESET_METHOD.
>> >
>> > There is a generic PCI core way to do FLR (pcie_reset_flr()), so I
>> > assume the _DSM exists because the device needs some additional
>> > device-specific work around the FLR.
>> >
>> > > +static void btintel_pcie_removal_work(struct work_struct *wk) {
>> > > +     struct btintel_pcie_removal *removal =
>> > > +             container_of(wk, struct btintel_pcie_removal, work);
>> > > +     struct pci_dev *pdev = removal->pdev;
>> > > +     struct pci_bus *bus;
>> > > +     struct btintel_pcie_data *data;
>> > > +
>> > > +     data = pci_get_drvdata(pdev);
>> > > +
>> > > +     pci_lock_rescan_remove();
>> > > +
>> > > +     bus = pdev->bus;
>> > > +     if (!bus)
>> > > +             goto out;
>> > > +
>> > > +     btintel_acpi_reset_method(data->hdev);
>> > > +     pci_stop_and_remove_bus_device(pdev);
>> > > +     pci_dev_put(pdev);
>> > > +
>> > > +     if (bus->parent)
>> > > +             bus = bus->parent;
>> > > +     pci_rescan_bus(bus);
>> >
>> > This remove and rescan by a driver that's bound to the device
>> > subverts the driver model.  pci_stop_and_remove_bus_device()
>> > detaches the driver from the device.  After the driver is detached,
>> > we should not be running any driver code.
>>
>> Yeah, this self removal was sort of bugging me as well, although I'm
>> not familiar enough with the pci subsystem, having the driver remove
>> and continue running code like pci_rescan_bus seems wrong as we may
>> end up with multiple instances of the same driver.
>>
>> > There are a couple other drivers that remove their own device
>> > (ath9k, iwlwifi, asus_wmi, eeepc-laptop), but I think those are
>> > broken and it's a mistake to add this pattern to more drivers.
>> >
>> > What's the reason for doing the remove and rescan?  The PCI core
>> > doesn't reset the device when you do this, so it's not a "bigger
>> > hammer reset".
>>
>> I guess it was more of the expectation of Chandru to have a sort of
>> hard reset, driver remove+probe, instead of a soft reset where the
>> driver will just need to reinit itself after performing
>> pcie_reset_flr.
>
>If the object is just to reinitialize the driver, I think this hack of removing and
>rescanning is a bad way to do it.  If you reset the device, you now know the
>state of the device and you can make the driver state match it.  If necessary
>you can always reuse part or all of the .remove() and .probe() methods
>yourself, without this dance of calling pci_stop_and_remove_bus_device() and
>pci_rescan_bus().
>
>Bjorn

Thanks,
Kiran


^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [PATCH v1] Bluetooth: btintel_pcie: Support function level reset
  2025-03-18 15:47     ` Bjorn Helgaas
  2025-05-25 10:30       ` K, Kiran
@ 2025-10-23  9:42       ` Devegowda, Chandrashekar
  2025-10-23 20:36         ` Bjorn Helgaas
  1 sibling, 1 reply; 11+ messages in thread
From: Devegowda, Chandrashekar @ 2025-10-23  9:42 UTC (permalink / raw)
  To: Bjorn Helgaas, Luiz Augusto von Dentz
  Cc: linux-bluetooth@vger.kernel.org, linux-pci@vger.kernel.org,
	bhelgaas@google.com, Srivatsa, Ravishankar,
	Tumkur Narayan, Chethan, K, Kiran, Ben Ami, Golan, Berg, Johannes

+ Golan , Johannes from Intel WiFi PCIe driver team.

Hi Bjorn,


> Subject: Re: [PATCH v1] Bluetooth: btintel_pcie: Support function level reset
> 
> On Tue, Mar 18, 2025 at 10:55:06AM -0400, Luiz Augusto von Dentz wrote:
> > On Fri, Mar 14, 2025 at 3:40 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > > On Fri, Mar 14, 2025 at 12:16:13PM +0200, Chandrashekar Devegowda
> wrote:
> > > > Support function level reset (flr) on hardware exception to
> > > > recover controller. Driver also implements the back-off time of 5
> > > > seconds and the maximum number of retries are limited to 5 before
> giving up.
> > >
> > > Sort of weird that the commit log mentions FLR, but it's not
> > > mentioned in the patch itself except for
> BTINTEL_PCIE_FLR_RESET_MAX_RETRY.
> > > Apparently the assumption is that DSM_SET_RESET_METHOD_PCIE
> performs
> > > an FLR.
> > >
> > > Since this is an ACPI _DSM, presumably this mechanism only works for
> > > devices built into the platform, not for any potential plug-in
> > > devices that would not be described via ACPI.  I guess this driver
> > > probably already only works for built-in devices because it also
> > > uses DSM_SET_WDISABLE2_DELAY and DSM_SET_RESET_METHOD.
> > >
> > > There is a generic PCI core way to do FLR (pcie_reset_flr()), so I
> > > assume the _DSM exists because the device needs some additional
> > > device-specific work around the FLR.
> > >
> > > > +static void btintel_pcie_removal_work(struct work_struct *wk) {
> > > > +     struct btintel_pcie_removal *removal =
> > > > +             container_of(wk, struct btintel_pcie_removal, work);
> > > > +     struct pci_dev *pdev = removal->pdev;
> > > > +     struct pci_bus *bus;
> > > > +     struct btintel_pcie_data *data;
> > > > +
> > > > +     data = pci_get_drvdata(pdev);
> > > > +
> > > > +     pci_lock_rescan_remove();
> > > > +
> > > > +     bus = pdev->bus;
> > > > +     if (!bus)
> > > > +             goto out;
> > > > +
> > > > +     btintel_acpi_reset_method(data->hdev);
> > > > +     pci_stop_and_remove_bus_device(pdev);
> > > > +     pci_dev_put(pdev);
> > > > +
> > > > +     if (bus->parent)
> > > > +             bus = bus->parent;
> > > > +     pci_rescan_bus(bus);
> > >
> > > This remove and rescan by a driver that's bound to the device
> > > subverts the driver model.  pci_stop_and_remove_bus_device()
> > > detaches the driver from the device.  After the driver is detached,
> > > we should not be running any driver code.
> >
> > Yeah, this self removal was sort of bugging me as well, although I'm
> > not familiar enough with the pci subsystem, having the driver remove
> > and continue running code like pci_rescan_bus seems wrong as we may
> > end up with multiple instances of the same driver.
> >
> > > There are a couple other drivers that remove their own device
> > > (ath9k, iwlwifi, asus_wmi, eeepc-laptop), but I think those are
> > > broken and it's a mistake to add this pattern to more drivers.
> > >
> > > What's the reason for doing the remove and rescan?  The PCI core
> > > doesn't reset the device when you do this, so it's not a "bigger
> > > hammer reset".
> >
> > I guess it was more of the expectation of Chandru to have a sort of
> > hard reset, driver remove+probe, instead of a soft reset where the
> > driver will just need to reinit itself after performing
> > pcie_reset_flr.
> 
> If the object is just to reinitialize the driver, I think this hack of removing and
> rescanning is a bad way to do it.  If you reset the device, you now know the
> state of the device and you can make the driver state match it.  If necessary
> you can always reuse part or all of the .remove() and .probe() methods
> yourself, without this dance of calling pci_stop_and_remove_bus_device() and
> pci_rescan_bus().

I’m sharing insights from our recent work on the PLDR for the BT driver , The
above method supports FLR effectively, but for PLDR it is required to unload
Wifi driver before doing PLDR via ACPI method.

Currently, calling pci_rescan_bus() successfully rebinds both the WiFi and BT
drivers. This approach follows the method used for the WiFi driver, as seen
here:

https://elixir.bootlin.com/linux/v6.18-rc1/source/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/trans.c#L2182

Could you please suggest any alternative methods to avoid using pci_rescan_bus()
for this purpose?

> 
> Bjorn

Regards,
Chandru


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v1] Bluetooth: btintel_pcie: Support function level reset
  2025-10-23  9:42       ` Devegowda, Chandrashekar
@ 2025-10-23 20:36         ` Bjorn Helgaas
  2025-10-27 10:08           ` Johannes Berg
  0 siblings, 1 reply; 11+ messages in thread
From: Bjorn Helgaas @ 2025-10-23 20:36 UTC (permalink / raw)
  To: Devegowda, Chandrashekar
  Cc: Luiz Augusto von Dentz, linux-bluetooth@vger.kernel.org,
	linux-pci@vger.kernel.org, bhelgaas@google.com,
	Srivatsa, Ravishankar, Tumkur Narayan, Chethan, K, Kiran,
	Ben Ami, Golan, Berg, Johannes

On Thu, Oct 23, 2025 at 09:42:16AM +0000, Devegowda, Chandrashekar wrote:
> > On Tue, Mar 18, 2025 at 10:55:06AM -0400, Luiz Augusto von Dentz wrote:
> > > On Fri, Mar 14, 2025 at 3:40 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > > > On Fri, Mar 14, 2025 at 12:16:13PM +0200, Chandrashekar Devegowda
> > wrote:
> > > > > Support function level reset (flr) on hardware exception to
> > > > > recover controller. Driver also implements the back-off time
> > > > > of 5 seconds and the maximum number of retries are limited
> > > > > to 5 before giving up.
> > > >
> > > > Sort of weird that the commit log mentions FLR, but it's not
> > > > mentioned in the patch itself except for
> > > > BTINTEL_PCIE_FLR_RESET_MAX_RETRY.  Apparently the assumption
> > > > is that DSM_SET_RESET_METHOD_PCIE performs an FLR.
> > > >
> > > > Since this is an ACPI _DSM, presumably this mechanism only
> > > > works for devices built into the platform, not for any
> > > > potential plug-in devices that would not be described via
> > > > ACPI.  I guess this driver probably already only works for
> > > > built-in devices because it also uses DSM_SET_WDISABLE2_DELAY
> > > > and DSM_SET_RESET_METHOD.
> > > >
> > > > There is a generic PCI core way to do FLR (pcie_reset_flr()),
> > > > so I assume the _DSM exists because the device needs some
> > > > additional device-specific work around the FLR.
> > > >
> > > > > +static void btintel_pcie_removal_work(struct work_struct *wk) {
> > > > > +     struct btintel_pcie_removal *removal =
> > > > > +             container_of(wk, struct btintel_pcie_removal, work);
> > > > > +     struct pci_dev *pdev = removal->pdev;
> > > > > +     struct pci_bus *bus;
> > > > > +     struct btintel_pcie_data *data;
> > > > > +
> > > > > +     data = pci_get_drvdata(pdev);
> > > > > +
> > > > > +     pci_lock_rescan_remove();
> > > > > +
> > > > > +     bus = pdev->bus;
> > > > > +     if (!bus)
> > > > > +             goto out;
> > > > > +
> > > > > +     btintel_acpi_reset_method(data->hdev);
> > > > > +     pci_stop_and_remove_bus_device(pdev);
> > > > > +     pci_dev_put(pdev);
> > > > > +
> > > > > +     if (bus->parent)
> > > > > +             bus = bus->parent;
> > > > > +     pci_rescan_bus(bus);
> > > >
> > > > This remove and rescan by a driver that's bound to the device
> > > > subverts the driver model.  pci_stop_and_remove_bus_device()
> > > > detaches the driver from the device.  After the driver is
> > > > detached, we should not be running any driver code.
> > >
> > > Yeah, this self removal was sort of bugging me as well, although
> > > I'm not familiar enough with the pci subsystem, having the
> > > driver remove and continue running code like pci_rescan_bus
> > > seems wrong as we may end up with multiple instances of the same
> > > driver.
> > >
> > > > There are a couple other drivers that remove their own device
> > > > (ath9k, iwlwifi, asus_wmi, eeepc-laptop), but I think those
> > > > are broken and it's a mistake to add this pattern to more
> > > > drivers.
> > > >
> > > > What's the reason for doing the remove and rescan?  The PCI
> > > > core doesn't reset the device when you do this, so it's not a
> > > > "bigger hammer reset".
> > >
> > > I guess it was more of the expectation of Chandru to have a sort
> > > of hard reset, driver remove+probe, instead of a soft reset
> > > where the driver will just need to reinit itself after
> > > performing pcie_reset_flr.
> > 
> > If the object is just to reinitialize the driver, I think this
> > hack of removing and rescanning is a bad way to do it.  If you
> > reset the device, you now know the state of the device and you can
> > make the driver state match it.  If necessary you can always reuse
> > part or all of the .remove() and .probe() methods yourself,
> > without this dance of calling pci_stop_and_remove_bus_device() and
> > pci_rescan_bus().
> 
> I’m sharing insights from our recent work on the PLDR for the BT
> driver. The above method supports FLR effectively, but for PLDR it
> is required to unload Wifi driver before doing PLDR via ACPI method.

IIUC "PLDR" is an ACPI method that does a reset, and you want to reset
a BT device.

> Currently, calling pci_rescan_bus() successfully rebinds both the
> WiFi and BT drivers. This approach follows the method used for the
> WiFi driver, as seen here:
> 
> https://elixir.bootlin.com/linux/v6.18-rc1/source/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/trans.c#L2182

It looks like this is a multi-function device, iwlwifi is bound to
function 0, and btintel is bound to function 1.

Then it looks like iwl_trans_pcie_removal_wk() starts with the wifi
device, finds the corresponding BT device, removes the BT device, runs
PLDR on the wifi device, removes the wifi device, and rescans to find
both devices again:

  iwl_trans_pcie_removal_wk
    wifi = removal->pdev			# WiFi device
    bt = pci_get_slot(...)			# BT device
    pci_stop_and_remove_bus_device(bt)
    iwl_trans_pcie_set_product_reset(wifi)	# do PLDR on WiFi
    pci_stop_and_remove_bus_device(wifi)
    pci_rescan_bus

It seems problematic to me for the WiFi driver to remove the BT
driver.  What if BT was active at the time?  Why is it ok to yank the
rug out from under it?

Why does the BT driver have to be unloaded before resetting the WiFi
device?  Why does the WiFi driver have to be unloaded before resetting
the BT device?

Theoretically, the functions of a PCI multi-function device are
independent and really don't have any influence on each other.

If you had a single driver that claimed both devices, that driver
could coordinate this since it would know about both and could
synchronize their activity when needed.

Maybe you could have a wrapper driver that claims both and delegates
each function to either iwlwifi or btintel as needed?  Since the
wrapper, iwlwifi, and btintel would be linked into a single module,
you could arrange callbacks between them to synchronize and handle
these resets.

If you reset the device, you know the state of the device afterward,
and the driver should be able to initialize its own data structures
accordingly.  This should not require any PCI device removal or
rescan.

Bjorn

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v1] Bluetooth: btintel_pcie: Support function level reset
  2025-10-23 20:36         ` Bjorn Helgaas
@ 2025-10-27 10:08           ` Johannes Berg
  2025-10-28 21:06             ` Bjorn Helgaas
  0 siblings, 1 reply; 11+ messages in thread
From: Johannes Berg @ 2025-10-27 10:08 UTC (permalink / raw)
  To: Bjorn Helgaas, Devegowda, Chandrashekar
  Cc: Luiz Augusto von Dentz, linux-bluetooth@vger.kernel.org,
	linux-pci@vger.kernel.org, bhelgaas@google.com,
	Srivatsa, Ravishankar, Tumkur Narayan, Chethan, K, Kiran,
	Ben Ami, Golan

Hi,

So I've been asked to chime in here, mostly on behalf of iwlwifi, and
I'll actually respond to two of your messages a bit.

(from your previous email first:)

> Sort of weird that the commit log mentions FLR, but it's not mentioned
> in the patch itself except for BTINTEL_PCIE_FLR_RESET_MAX_RETRY.
> Apparently the assumption is that DSM_SET_RESET_METHOD_PCIE performs
> an FLR.

It's not just weird, it's simply wrong. This is not about FLR at all.

> Since this is an ACPI _DSM, presumably this mechanism only works for
> devices built into the platform, not for any potential plug-in devices
> that would not be described via ACPI.  I guess this driver probably
> already only works for built-in devices because it also uses
> DSM_SET_WDISABLE2_DELAY and DSM_SET_RESET_METHOD.

It ... depends. Sometimes it can work even for discrete devices if the
platform ACPI is prepared for it. But if you just take it into a
platform not prepared to handle it, then yes, obviously it cannot work.
The driver overall should and can work for discrete devices plugged into
an arbitrary platform (I have one plugged into an ARM board), but it
generally will not have all the platform integration features. (Though
some people are actually asking about DT support. I guess taken to the
logical conclusion for the reset discussion at hand that would mean
having a voltage regulator or so linked that the driver can use to power
off and on the device.)

> There is a generic PCI core way to do FLR (pcie_reset_flr()), so I
> assume the _DSM exists because the device needs some additional
> device-specific work around the FLR.

Nah, FLR was misleading. This is not related to FLR, see below for a
more complete discussion.

(the email I'm responding to:)

> IIUC "PLDR" is an ACPI method that does a reset, and you want to reset
> a BT device.

Not really.

PLDR is a Windows-ism, we really shouldn't even use this name.

(PLDR means "product level device reset", which is really trying to
reset more than each function. There's some magic required in Windows to
achieve this at their driver abstraction level.)

From a hardware perspective, it has a number of shared components [1],
as well as BT and WiFi functions on the device. Doing function reset on
either will only reset the individual function parts, not the shared
components.

[1] I could go into more details but it's not really for this discussion


To actually do that reset we need ACPI help, first calling the product
reset DSM (see iwl_trans_pcie_set_product_reset()) to enable the right
kind of reset among the various options, and then calling _RST on the
corresponding _PRR object. I _think_ this might even be specified
somewhere, so maybe pci_dev_acpi_reset() should try to find a _PRR
object, and we could remove this code? But I'm no ACPI expert. AFAICT
there's no _RST method on the object itself, only on the corresponding
_PRR object.

> > Currently, calling pci_rescan_bus() successfully rebinds both the
> > WiFi and BT drivers. This approach follows the method used for the
> > WiFi driver, as seen here:
> > 
> > https://elixir.bootlin.com/linux/v6.18-rc1/source/drivers/net/wireless/intel/iwlwifi/pcie/gen1_2/trans.c#L2182
> 
> It looks like this is a multi-function device, iwlwifi is bound to
> function 0, and btintel is bound to function 1.

Yes.

> Then it looks like iwl_trans_pcie_removal_wk() starts with the wifi
> device, finds the corresponding BT device, removes the BT device, runs
> PLDR on the wifi device, removes the wifi device, and rescans to find
> both devices again:

More or less, yes, see also above for a full discussion.

>   iwl_trans_pcie_removal_wk
>     wifi = removal->pdev			# WiFi device
>     bt = pci_get_slot(...)			# BT device
>     pci_stop_and_remove_bus_device(bt)
>     iwl_trans_pcie_set_product_reset(wifi)	# do PLDR on WiFi
>     pci_stop_and_remove_bus_device(wifi)
>     pci_rescan_bus
> 
> It seems problematic to me for the WiFi driver to remove the BT
> driver.  What if BT was active at the time?  Why is it ok to yank the
> rug out from under it?

So first of all, it's not really different from unbinding it via sysfs,
so this is necessarily always supported. There's even PCIe hotplug, as
you surely know, and this device could be behind a Thunderbolt port
(I've actually done this in the, albeit fairly distant, past, when
admittedly BT was still USB.)

From a hardware perspective, the reason to do this is if one side (BT or
WiFi) detects that there's an error in a shared hardware components.
This may be either side, but it's not always predictable which side will
first detect the error, say WiFi is mostly idle then it won't
necessarily detect it.

Crucially though, the hardware is basically dead at this point anyway -
both BT and WiFi will be affected. One of them might just not know it
yet, and might not even notice until eventually getting a command
timeout to a random command, which might not be triggered until
"something happens" such as userspace triggering a scan.

> Why does the BT driver have to be unloaded before resetting the WiFi
> device?  Why does the WiFi driver have to be unloaded before resetting
> the BT device?

See above, I think.

> Theoretically, the functions of a PCI multi-function device are
> independent and really don't have any influence on each other.

"Theoretically", see above.

> If you had a single driver that claimed both devices, that driver
> could coordinate this since it would know about both and could
> synchronize their activity when needed.

Yes, but it'd be very complex to maintain that across two unrelated
subsystems, and wouldn't actually _help_ much.

> Maybe you could have a wrapper driver that claims both and delegates
> each function to either iwlwifi or btintel as needed?  Since the
> wrapper, iwlwifi, and btintel would be linked into a single module,
> you could arrange callbacks between them to synchronize and handle
> these resets.

While it would be possible to build such a driver, it'd be rather
complex to maintain, and wouldn't actually help anything.

> If you reset the device, you know the state of the device afterward,
> and the driver should be able to initialize its own data structures
> accordingly.  This should not require any PCI device removal or
> rescan.

Obviously, we know the state of the device, but ... it _does_ require
PCI removal *and* rescan, because the device completely falls off the
bus and needs to be rediscovered. The drivers also fundamentally have to
be unbound from it, since all state of the device (including BAR setup)
is lost. I'm fairly certain that if you were to query even the device
IDs after the reset, you'd see 0xFFFFFFFF, but in truth I don't fully
understand how this works at the PCIe bus level.

In any case, I didn't see any other way to do this when I implemented
it, and don't see it now either.


(going back to your earlier email:)

> This remove and rescan by a driver that's bound to the device subverts
> the driver model.  pci_stop_and_remove_bus_device() detaches the
> driver from the device.  After the driver is detached, we should not
> be running any driver code.

I mean, I _guess_ you could argue that? I don't really think of it that
way - the code that is running to do all this isn't actually part of the
driver context of this module. Now, if you're going to argue that a
"driver module" cannot have code (other than init/exit I guess) running
outside of the driver context I suppose you could do that, but that's
probably broken by half the modules in the tree anyway.

I also don't think that it's actually possible for the driver to unbind
itself within the context of the driver model, so in effect you're
arguing that a driver is also not allowed to call device_reprobe() for
it's own device, which I'd think some calls do today. The code here is
just a slightly more complex PCI-specific case of this because of the
device hardware behaviour described above.

Do we have to be careful with this vs. say module unload? Obviously,
just like anything else that isn't pure driver code, which also has to
be careful, just against remove() callback rather than module exit. But
this code isn't really tied to the driver model, it's running outside of
that context.


I hope this clarifies things. I apologise on behalf of the BT team for
misleading you on pretty much all of the terminology.

johannes

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v1] Bluetooth: btintel_pcie: Support function level reset
  2025-10-27 10:08           ` Johannes Berg
@ 2025-10-28 21:06             ` Bjorn Helgaas
  2025-10-29 10:38               ` Johannes Berg
  0 siblings, 1 reply; 11+ messages in thread
From: Bjorn Helgaas @ 2025-10-28 21:06 UTC (permalink / raw)
  To: Johannes Berg
  Cc: Devegowda, Chandrashekar, Luiz Augusto von Dentz,
	linux-bluetooth@vger.kernel.org, linux-pci@vger.kernel.org,
	bhelgaas@google.com, Srivatsa, Ravishankar,
	Tumkur Narayan, Chethan, K, Kiran, Ben Ami, Golan

On Mon, Oct 27, 2025 at 11:08:22AM +0100, Johannes Berg wrote:
> Hi,
> 
> So I've been asked to chime in here, mostly on behalf of iwlwifi, and
> I'll actually respond to two of your messages a bit.
> 
> (from your previous email first:)
> 
> > Sort of weird that the commit log mentions FLR, but it's not mentioned
> > in the patch itself except for BTINTEL_PCIE_FLR_RESET_MAX_RETRY.
> > Apparently the assumption is that DSM_SET_RESET_METHOD_PCIE performs
> > an FLR.
> 
> It's not just weird, it's simply wrong. This is not about FLR at all.

Thanks for clearing that up.

> > If you reset the device, you know the state of the device afterward,
> > and the driver should be able to initialize its own data structures
> > accordingly.  This should not require any PCI device removal or
> > rescan.
> 
> Obviously, we know the state of the device, but ... it _does_ require
> PCI removal *and* rescan, because the device completely falls off the
> bus and needs to be rediscovered. The drivers also fundamentally have to
> be unbound from it, since all state of the device (including BAR setup)
> is lost. I'm fairly certain that if you were to query even the device
> IDs after the reset, you'd see 0xFFFFFFFF, but in truth I don't fully
> understand how this works at the PCIe bus level.

It might be different for other buses, but the PCI core really doesn't
do anything to the device during removal or rescan.  It does turn off
power management interrupts from the device and the like, but I'm
pretty sure it doesn't reset the device or do anything to make it
start responding to PCI transactions again.

Obviously remove and rescan reinitializes the *driver* because the PCI
core calls the driver .remove() method, reads the Vendor and Device
IDs, reads and if necessary programs the BARs, and calls the driver
.probe() method.

I think it's really the PLDR that's making the difference here, not
the remove and rescan.  I guess you could experimentally read some
config registers after the PLDR and before the remove/rescan.

Since you know the other device is dead already, I don't have a
problem with resetting the shared parts of the device, so you do need
some way to poke the other driver to reinit.  But I think using the
PCI core remove/rescan to do that makes it more complicated than
necessary and distracts from what's really happening.

Bjorn

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v1] Bluetooth: btintel_pcie: Support function level reset
  2025-10-28 21:06             ` Bjorn Helgaas
@ 2025-10-29 10:38               ` Johannes Berg
  2025-10-29 16:26                 ` Bjorn Helgaas
  0 siblings, 1 reply; 11+ messages in thread
From: Johannes Berg @ 2025-10-29 10:38 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Devegowda, Chandrashekar, Luiz Augusto von Dentz,
	linux-bluetooth@vger.kernel.org, linux-pci@vger.kernel.org,
	bhelgaas@google.com, Srivatsa, Ravishankar,
	Tumkur Narayan, Chethan, K, Kiran, Ben Ami, Golan

On Tue, 2025-10-28 at 16:06 -0500, Bjorn Helgaas wrote:
> > Obviously, we know the state of the device, but ... it _does_ require
> > PCI removal *and* rescan, because the device completely falls off the
> > bus and needs to be rediscovered. The drivers also fundamentally have to
> > be unbound from it, since all state of the device (including BAR setup)
> > is lost. I'm fairly certain that if you were to query even the device
> > IDs after the reset, you'd see 0xFFFFFFFF, but in truth I don't fully
> > understand how this works at the PCIe bus level.
> 
> It might be different for other buses, but the PCI core really doesn't
> do anything to the device during removal or rescan.  It does turn off
> power management interrupts from the device and the like, but I'm
> pretty sure it doesn't reset the device or do anything to make it
> start responding to PCI transactions again.

OK, fair. I have hit various weird issues with cached config space in
the past (such as [1], which we never resolved), so I guess I'm possibly
being ultra careful here in what I'm assuming or not.

[1] https://lore.kernel.org/linux-pci/20230605203519.bc4232207449.Idbaa55b93f780838af44ebccb84c36f60716df04@changeid/


> Obviously remove and rescan reinitializes the *driver* because the PCI
> core calls the driver .remove() method, reads the Vendor and Device
> IDs, reads and if necessary programs the BARs, and calls the driver
> .probe() method.

Right. So I guess in effect you're saying that device_reprobe() ought to
be sufficient.

For WiFi, this code goes back to another issue we had (somewhat
intentionally) where under certain circumstances during initialisation
the firmware can do a product reset without the driver being aware, and
then the driver just has to detect it by PCIe transactions failing with
0xFFFF'FFFF being read all the time. It's going to be hard to test this
case now, but we can still test the product reset.

For BT detecting the error and initiation product reset, it does seem
that doing (only) device_reprobe() for both functions is actually
sufficient. I believe the FLR code in BT is broken though, so we're
going to (re-)check all of this.

> I think it's really the PLDR that's making the difference here, not
> the remove and rescan.  I guess you could experimentally read some
> config registers after the PLDR and before the remove/rescan.

Yeah, just need to find real hardware with all the BIOS integration to
run it ;-) (Most of our testing uses VMs.)

> Since you know the other device is dead already, I don't have a
> problem with resetting the shared parts of the device, so you do need
> some way to poke the other driver to reinit.  But I think using the
> PCI core remove/rescan to do that makes it more complicated than
> necessary and distracts from what's really happening.

Fair. I think the easiest way to achieve this is still device_reprobe()
on the other device - eventually even if we tell the other driver then
it still will simply call device_reprobe() on itself, so there's no big
difference.

Thanks,
johannes

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v1] Bluetooth: btintel_pcie: Support function level reset
  2025-10-29 10:38               ` Johannes Berg
@ 2025-10-29 16:26                 ` Bjorn Helgaas
  0 siblings, 0 replies; 11+ messages in thread
From: Bjorn Helgaas @ 2025-10-29 16:26 UTC (permalink / raw)
  To: Johannes Berg
  Cc: Devegowda, Chandrashekar, Luiz Augusto von Dentz,
	linux-bluetooth@vger.kernel.org, linux-pci@vger.kernel.org,
	bhelgaas@google.com, Srivatsa, Ravishankar,
	Tumkur Narayan, Chethan, K, Kiran, Ben Ami, Golan

On Wed, Oct 29, 2025 at 11:38:31AM +0100, Johannes Berg wrote:
> On Tue, 2025-10-28 at 16:06 -0500, Bjorn Helgaas wrote:
> > > Obviously, we know the state of the device, but ... it _does_ require
> > > PCI removal *and* rescan, because the device completely falls off the
> > > bus and needs to be rediscovered. The drivers also fundamentally have to
> > > be unbound from it, since all state of the device (including BAR setup)
> > > is lost. I'm fairly certain that if you were to query even the device
> > > IDs after the reset, you'd see 0xFFFFFFFF, but in truth I don't fully
> > > understand how this works at the PCIe bus level.
> > 
> > It might be different for other buses, but the PCI core really doesn't
> > do anything to the device during removal or rescan.  It does turn off
> > power management interrupts from the device and the like, but I'm
> > pretty sure it doesn't reset the device or do anything to make it
> > start responding to PCI transactions again.
> 
> OK, fair. I have hit various weird issues with cached config space in
> the past (such as [1], which we never resolved), so I guess I'm possibly
> being ultra careful here in what I'm assuming or not.
> 
> [1] https://lore.kernel.org/linux-pci/20230605203519.bc4232207449.Idbaa55b93f780838af44ebccb84c36f60716df04@changeid/
> 
> > Obviously remove and rescan reinitializes the *driver* because the PCI
> > core calls the driver .remove() method, reads the Vendor and Device
> > IDs, reads and if necessary programs the BARs, and calls the driver
> > .probe() method.
> 
> Right. So I guess in effect you're saying that device_reprobe() ought to
> be sufficient.
> 
> For WiFi, this code goes back to another issue we had (somewhat
> intentionally) where under certain circumstances during initialisation
> the firmware can do a product reset without the driver being aware, and
> then the driver just has to detect it by PCIe transactions failing with
> 0xFFFF'FFFF being read all the time. It's going to be hard to test this
> case now, but we can still test the product reset.

I think there are also some cases (maybe mostly embedded arm and
arm64?) where PCIe failures cause synchronous aborts or SError 
interrupts instead of just returning 0xFFFF'FFFF data.  Maybe nothing
you can do about that other than adding delay after initialization or
something.

> For BT detecting the error and initiation product reset, it does seem
> that doing (only) device_reprobe() for both functions is actually
> sufficient. I believe the FLR code in BT is broken though, so we're
> going to (re-)check all of this.
> 
> > I think it's really the PLDR that's making the difference here, not
> > the remove and rescan.  I guess you could experimentally read some
> > config registers after the PLDR and before the remove/rescan.
> 
> Yeah, just need to find real hardware with all the BIOS integration to
> run it ;-) (Most of our testing uses VMs.)
> 
> > Since you know the other device is dead already, I don't have a
> > problem with resetting the shared parts of the device, so you do need
> > some way to poke the other driver to reinit.  But I think using the
> > PCI core remove/rescan to do that makes it more complicated than
> > necessary and distracts from what's really happening.
> 
> Fair. I think the easiest way to achieve this is still device_reprobe()
> on the other device - eventually even if we tell the other driver then
> it still will simply call device_reprobe() on itself, so there's no big
> difference.

I hadn't heard of device_reprobe() until you pointed it out, but this
usage doesn't really seem to fit the intended purpose since the
probing critera haven't changed:

 * This function detaches the attached driver (if any) for the given
 * device and restarts the driver probing process.  It is intended
 * to use if probing criteria changed during a devices lifetime and
 * driver attachment should change accordingly.

But I definitely like it better than remove/rescan.  I suppose either
way results in udev remove/add events that are really spurious.  And
maybe PM artifacts like what you tripped over at [1] (although there
may still be a harmful PCI core race there).

Bjorn

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2025-10-29 16:26 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-03-14 10:16 [PATCH v1] Bluetooth: btintel_pcie: Support function level reset Chandrashekar Devegowda
2025-03-14 19:40 ` Bjorn Helgaas
2025-03-18 14:55   ` Luiz Augusto von Dentz
2025-03-18 15:47     ` Bjorn Helgaas
2025-05-25 10:30       ` K, Kiran
2025-10-23  9:42       ` Devegowda, Chandrashekar
2025-10-23 20:36         ` Bjorn Helgaas
2025-10-27 10:08           ` Johannes Berg
2025-10-28 21:06             ` Bjorn Helgaas
2025-10-29 10:38               ` Johannes Berg
2025-10-29 16:26                 ` Bjorn Helgaas

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).