Linux Documentation

Linux Documentation
 help / color / mirror / Atom feed

* [PATCH v3 2/7] net: wwan: t9xx: Add control plane transaction layer
From: Jack Wu via B4 Relay @ 2026-06-24 10:04 UTC (permalink / raw)
  To: Loic Poulain, Sergey Ryazanov, Johannes Berg, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Jack Wu, Wen-Zhi Huang, Shi-Wei Yeh, Minano Tseng,
	Matthias Brugger, AngeloGioacchino Del Regno, Simon Horman,
	Jonathan Corbet, Shuah Khan
  Cc: linux-kernel, netdev, linux-arm-kernel, linux-mediatek, linux-doc
In-Reply-To: <20260624-t9xx_driver_v1-v3-0-73ff03f60c48@compal.com>

From: Jack Wu <jackbb_wu@compal.com>

The control plane implements TX services that reside in the
transaction layer. The services receive the packets from the
port layer and call the corresponding DMA components to
transmit data to the device. Meanwhile, TX services receive
and manage the port control commands from the port layer.

The control plane implements RX services that reside in the
transaction layer. The services receive the downlink packets
from the modem and transfer the packets to the corresponding
port layer interfaces.

Signed-off-by: Jack Wu <jackbb_wu@compal.com>
---
 drivers/net/wwan/Kconfig                    |  5 +++
 drivers/net/wwan/t9xx/Makefile              |  5 +--
 drivers/net/wwan/t9xx/mtk_ctrl_plane.c      | 48 +++++++++++++++++++++++++++++
 drivers/net/wwan/t9xx/mtk_ctrl_plane.h      | 22 +++++++++++++
 drivers/net/wwan/t9xx/mtk_dev.c             | 44 ++++++++++++++++++++++++++
 drivers/net/wwan/t9xx/mtk_dev.h             |  5 +++
 drivers/net/wwan/t9xx/pcie/Makefile         | 10 ++++++
 drivers/net/wwan/t9xx/pcie/mtk_pci.c        | 10 +++---
 drivers/net/wwan/t9xx/pcie/mtk_trans_ctrl.h | 21 +++++++++++++
 9 files changed, 163 insertions(+), 7 deletions(-)

diff --git a/drivers/net/wwan/Kconfig b/drivers/net/wwan/Kconfig
index 4cee537c739f..7019b44494f8 100644
--- a/drivers/net/wwan/Kconfig
+++ b/drivers/net/wwan/Kconfig
@@ -124,6 +124,7 @@ config MTK_T7XX
 config MTK_T9XX
 	tristate "MediaTek PCIe 5G WWAN modem T9xx device"
 	depends on PCI
+	select MTK_T9XX_PCI
 	select NET_DEVLINK
 	help
 	  Enables MediaTek PCIe based 5G WWAN modem (T9xx series) device.
@@ -133,6 +134,10 @@ config MTK_T9XX
 
 	  If unsure, say N.
 
+config MTK_T9XX_PCI
+	tristate
+	depends on PCI
+
 endif # WWAN
 
 endmenu
diff --git a/drivers/net/wwan/t9xx/Makefile b/drivers/net/wwan/t9xx/Makefile
index 6f2dd3f91454..ae9d6f2344ab 100644
--- a/drivers/net/wwan/t9xx/Makefile
+++ b/drivers/net/wwan/t9xx/Makefile
@@ -4,7 +4,8 @@ ccflags-y += -I$(src)/pcie
 ccflags-y += -I$(src)
 
 obj-$(CONFIG_MTK_T9XX) += mtk_t9xx.o
+obj-$(CONFIG_MTK_T9XX_PCI) += pcie/
 
 mtk_t9xx-y := \
-	pcie/mtk_pci.o \
-	pcie/mtk_pci_drv_m9xx.o
+	mtk_dev.o \
+	mtk_ctrl_plane.o
diff --git a/drivers/net/wwan/t9xx/mtk_ctrl_plane.c b/drivers/net/wwan/t9xx/mtk_ctrl_plane.c
new file mode 100644
index 000000000000..07938f3e6fe2
--- /dev/null
+++ b/drivers/net/wwan/t9xx/mtk_ctrl_plane.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2022, MediaTek Inc.
+ * Copyright (c) 2022-2023, Intel Corporation.
+ */
+
+#include <linux/device.h>
+
+#include "mtk_ctrl_plane.h"
+
+/**
+ * mtk_ctrl_init() - Initialize the control plane block.
+ * @mdev: Pointer to the MTK modem device.
+ *
+ * Allocates and initializes the control plane block
+ * associated with @mdev.
+ *
+ * Return: 0 on success, -ENOMEM on allocation failure.
+ */
+int mtk_ctrl_init(struct mtk_md_dev *mdev)
+{
+	struct mtk_ctrl_blk *ctrl_blk;
+
+	ctrl_blk = devm_kzalloc(mdev->dev, sizeof(*ctrl_blk), GFP_KERNEL);
+	if (!ctrl_blk)
+		return -ENOMEM;
+
+	ctrl_blk->mdev = mdev;
+	mdev->ctrl_blk = ctrl_blk;
+
+	return 0;
+}
+EXPORT_SYMBOL(mtk_ctrl_init);
+
+/**
+ * mtk_ctrl_exit() - Clean up the control plane block.
+ * @mdev: Pointer to the MTK modem device.
+ *
+ * Frees the control plane block associated with @mdev.
+ */
+void mtk_ctrl_exit(struct mtk_md_dev *mdev)
+{
+	struct mtk_ctrl_blk *ctrl_blk = mdev->ctrl_blk;
+
+	devm_kfree(mdev->dev, ctrl_blk);
+	mdev->ctrl_blk = NULL;
+}
+EXPORT_SYMBOL(mtk_ctrl_exit);
diff --git a/drivers/net/wwan/t9xx/mtk_ctrl_plane.h b/drivers/net/wwan/t9xx/mtk_ctrl_plane.h
new file mode 100644
index 000000000000..c141876ef95d
--- /dev/null
+++ b/drivers/net/wwan/t9xx/mtk_ctrl_plane.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-only
+ *
+ * Copyright (c) 2022, MediaTek Inc.
+ */
+
+#ifndef __MTK_CTRL_PLANE_H__
+#define __MTK_CTRL_PLANE_H__
+
+#include <linux/kref.h>
+#include <linux/skbuff.h>
+
+#include "mtk_dev.h"
+
+struct mtk_ctrl_blk {
+	struct mtk_md_dev *mdev;
+	struct mtk_ctrl_trans *trans;
+};
+
+int mtk_ctrl_init(struct mtk_md_dev *mdev);
+void mtk_ctrl_exit(struct mtk_md_dev *mdev);
+
+#endif /* __MTK_CTRL_PLANE_H__ */
diff --git a/drivers/net/wwan/t9xx/mtk_dev.c b/drivers/net/wwan/t9xx/mtk_dev.c
new file mode 100644
index 000000000000..f254ca7ed877
--- /dev/null
+++ b/drivers/net/wwan/t9xx/mtk_dev.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2022, MediaTek Inc.
+ */
+
+#include <linux/module.h>
+
+#include "mtk_dev.h"
+
+struct mtk_md_dev *mtk_dev_alloc(struct device *pdev, const struct mtk_dev_ops *dev_ops)
+{
+	struct mtk_md_dev *mdev;
+
+	mdev = devm_kzalloc(pdev, sizeof(*mdev), GFP_KERNEL);
+	if (!mdev)
+		return NULL;
+
+	mdev->dev_ops = dev_ops;
+	mdev->dev = pdev;
+	return mdev;
+}
+EXPORT_SYMBOL(mtk_dev_alloc);
+
+void mtk_dev_free(struct mtk_md_dev *mdev)
+{
+	struct device *dev = mdev->dev;
+
+	devm_kfree(dev, mdev);
+}
+EXPORT_SYMBOL(mtk_dev_free);
+
+static int __init mtk_common_drv_init(void)
+{
+	return 0;
+}
+module_init(mtk_common_drv_init);
+
+static void __exit mtk_common_drv_exit(void)
+{
+}
+module_exit(mtk_common_drv_exit);
+
+MODULE_DESCRIPTION("MediaTek T9xx PCIe WWAN driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/wwan/t9xx/mtk_dev.h b/drivers/net/wwan/t9xx/mtk_dev.h
index 8278a0e2875e..bb3ea68890ea 100644
--- a/drivers/net/wwan/t9xx/mtk_dev.h
+++ b/drivers/net/wwan/t9xx/mtk_dev.h
@@ -36,6 +36,7 @@ enum mtk_dev_evt_d2h {
 };
 
 struct mtk_md_dev;
+struct mtk_ctrl_blk;
 
 struct mtk_dev_ops {
 	u32 (*get_dev_state)(struct mtk_md_dev *mdev);
@@ -57,6 +58,7 @@ struct mtk_md_dev {
 	void *hw_priv;
 	u32 hw_ver;
 	char dev_str[MTK_DEV_STR_LEN];
+	struct mtk_ctrl_blk *ctrl_blk;
 };
 
 static inline u32 mtk_dev_get_dev_state(struct mtk_md_dev *mdev)
@@ -105,4 +107,7 @@ static inline int mtk_dev_send_dev_evt(struct mtk_md_dev *mdev, u32 dev_evt)
 	return mdev->dev_ops->send_dev_evt(mdev, dev_evt);
 }
 
+struct mtk_md_dev *mtk_dev_alloc(struct device *pdev, const struct mtk_dev_ops *dev_ops);
+void mtk_dev_free(struct mtk_md_dev *mdev);
+
 #endif /* __MTK_DEV_H__ */
diff --git a/drivers/net/wwan/t9xx/pcie/Makefile b/drivers/net/wwan/t9xx/pcie/Makefile
new file mode 100644
index 000000000000..7410d1796d27
--- /dev/null
+++ b/drivers/net/wwan/t9xx/pcie/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+ccflags-y += -I$(src)
+ccflags-y += -I$(src)/..
+
+obj-$(CONFIG_MTK_T9XX_PCI) += mtk_t9xx_pcie.o
+
+mtk_t9xx_pcie-y := \
+	mtk_pci_drv_m9xx.o \
+	mtk_pci.o
diff --git a/drivers/net/wwan/t9xx/pcie/mtk_pci.c b/drivers/net/wwan/t9xx/pcie/mtk_pci.c
index c6a7196fcdd6..90b33dd6effd 100644
--- a/drivers/net/wwan/t9xx/pcie/mtk_pci.c
+++ b/drivers/net/wwan/t9xx/pcie/mtk_pci.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 
 #include "mtk_dev.h"
+#include "mtk_trans_ctrl.h"
 #include "mtk_pci.h"
 #include "mtk_pci_reg.h"
 
@@ -467,6 +468,7 @@ static u32 mtk_pci_ext_h2d_evt_hw_bits(u32 chs)
 
 	SET_HW_BITS(hw_bits, chs, MHCCIF_RC2EP_EVT_DEVICE_RESET,
 		    DEV_EVT_H2D_DEVICE_RESET);
+
 	return LE32_TO_U32(cpu_to_le32(hw_bits));
 }
 
@@ -908,13 +910,11 @@ static int mtk_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	struct mtk_md_dev *mdev;
 	int ret;
 
-	mdev = devm_kzalloc(dev, sizeof(*mdev), GFP_KERNEL);
+	mdev = mtk_dev_alloc(dev, &pci_hw_ops);
 	if (!mdev) {
 		ret = -ENOMEM;
 		goto log_err;
 	}
-	mdev->dev_ops = &pci_hw_ops;
-	mdev->dev = dev;
 
 	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
 	if (!priv) {
@@ -991,7 +991,7 @@ static int mtk_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 free_priv_data:
 	devm_kfree(dev, priv);
 free_cntx_data:
-	devm_kfree(dev, mdev);
+	mtk_dev_free(mdev);
 log_err:
 	dev_err(dev, "Failed to probe device, ret=%d\n", ret);
 
@@ -1017,7 +1017,7 @@ static void mtk_pci_remove(struct pci_dev *pdev)
 	pci_load_and_free_saved_state(pdev, &priv->saved_state);
 
 	devm_kfree(dev, priv);
-	devm_kfree(dev, mdev);
+	mtk_dev_free(mdev);
 }
 
 static pci_ers_result_t mtk_pci_error_detected(struct pci_dev *pdev,
diff --git a/drivers/net/wwan/t9xx/pcie/mtk_trans_ctrl.h b/drivers/net/wwan/t9xx/pcie/mtk_trans_ctrl.h
new file mode 100644
index 000000000000..d6de4c43b529
--- /dev/null
+++ b/drivers/net/wwan/t9xx/pcie/mtk_trans_ctrl.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-only
+ *
+ * Copyright (c) 2022, MediaTek Inc.
+ */
+
+#ifndef __MTK_TRANS_CTRL_H__
+#define __MTK_TRANS_CTRL_H__
+
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+
+#include "mtk_dev.h"
+
+struct mtk_ctrl_trans {
+	struct mtk_ctrl_blk *ctrl_blk;
+	struct mtk_md_dev *mdev;
+};
+
+#endif

-- 
2.34.1



^ permalink raw reply related

* [PATCH v3 1/7] net: wwan: t9xx: Add PCIe core
From: Jack Wu via B4 Relay @ 2026-06-24 10:04 UTC (permalink / raw)
  To: Loic Poulain, Sergey Ryazanov, Johannes Berg, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Jack Wu, Wen-Zhi Huang, Shi-Wei Yeh, Minano Tseng,
	Matthias Brugger, AngeloGioacchino Del Regno, Simon Horman,
	Jonathan Corbet, Shuah Khan
  Cc: linux-kernel, netdev, linux-arm-kernel, linux-mediatek, linux-doc
In-Reply-To: <20260624-t9xx_driver_v1-v3-0-73ff03f60c48@compal.com>

From: Jack Wu <jackbb_wu@compal.com>

Registers the T900 device driver with the kernel. Set up all
the fundamental configurations for the device: PCIe layer,
Modem Host Cross Core Interface (MHCCIF), Reset Generation
Unit (RGU), modem common control operations and build
infrastructure.

* PCIe layer code implements driver probe and removal, MSI-X
  interrupt initialization and de-initialization, and the way
  of resetting the device.
* MHCCIF provides interrupt channels to communicate events
  such as handshake, PM and port enumeration.
* RGU provides interrupt channels to generate notifications
  from the device so that the T900 driver could get the
  device reset.
* Modem common control operations provide the basic read/write
  functions of the device's hardware registers,
  mask/unmask/get/clear functions of the device's interrupt
  registers and inquiry functions of the device's status.

Signed-off-by: Jack Wu <jackbb_wu@compal.com>
---
 drivers/net/wwan/Kconfig                      |   12 +
 drivers/net/wwan/Makefile                     |    1 +
 drivers/net/wwan/t9xx/Makefile                |   10 +
 drivers/net/wwan/t9xx/mtk_dev.h               |  108 +++
 drivers/net/wwan/t9xx/pcie/mtk_pci.c          | 1049 +++++++++++++++++++++++++
 drivers/net/wwan/t9xx/pcie/mtk_pci.h          |  234 ++++++
 drivers/net/wwan/t9xx/pcie/mtk_pci_drv_m9xx.c |   69 ++
 drivers/net/wwan/t9xx/pcie/mtk_pci_reg.h      |   70 ++
 8 files changed, 1553 insertions(+)

diff --git a/drivers/net/wwan/Kconfig b/drivers/net/wwan/Kconfig
index 88df55d78d90..4cee537c739f 100644
--- a/drivers/net/wwan/Kconfig
+++ b/drivers/net/wwan/Kconfig
@@ -121,6 +121,18 @@ config MTK_T7XX
 
 	  If unsure, say N.
 
+config MTK_T9XX
+	tristate "MediaTek PCIe 5G WWAN modem T9xx device"
+	depends on PCI
+	select NET_DEVLINK
+	help
+	  Enables MediaTek PCIe based 5G WWAN modem (T9xx series) device.
+
+	  To compile this driver as a module, choose M here: the module will be
+	  called mtk_t9xx.
+
+	  If unsure, say N.
+
 endif # WWAN
 
 endmenu
diff --git a/drivers/net/wwan/Makefile b/drivers/net/wwan/Makefile
index 3960c0ae2445..7361eef4c472 100644
--- a/drivers/net/wwan/Makefile
+++ b/drivers/net/wwan/Makefile
@@ -14,3 +14,4 @@ obj-$(CONFIG_QCOM_BAM_DMUX) += qcom_bam_dmux.o
 obj-$(CONFIG_RPMSG_WWAN_CTRL) += rpmsg_wwan_ctrl.o
 obj-$(CONFIG_IOSM) += iosm/
 obj-$(CONFIG_MTK_T7XX) += t7xx/
+obj-$(CONFIG_MTK_T9XX) += t9xx/
diff --git a/drivers/net/wwan/t9xx/Makefile b/drivers/net/wwan/t9xx/Makefile
new file mode 100644
index 000000000000..6f2dd3f91454
--- /dev/null
+++ b/drivers/net/wwan/t9xx/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+ccflags-y += -I$(src)/pcie
+ccflags-y += -I$(src)
+
+obj-$(CONFIG_MTK_T9XX) += mtk_t9xx.o
+
+mtk_t9xx-y := \
+	pcie/mtk_pci.o \
+	pcie/mtk_pci_drv_m9xx.o
diff --git a/drivers/net/wwan/t9xx/mtk_dev.h b/drivers/net/wwan/t9xx/mtk_dev.h
new file mode 100644
index 000000000000..8278a0e2875e
--- /dev/null
+++ b/drivers/net/wwan/t9xx/mtk_dev.h
@@ -0,0 +1,108 @@
+/* SPDX-License-Identifier: GPL-2.0-only
+ *
+ * Copyright (c) 2022, MediaTek Inc.
+ */
+
+#ifndef __MTK_DEV_H__
+#define __MTK_DEV_H__
+
+#include <linux/dma-mapping.h>
+#include <linux/dmapool.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#define MTK_DEV_STR_LEN 16
+
+enum mtk_user_id {
+	MTK_USER_MIN,
+	MTK_USER_CTRL,
+	MTK_USER_DATA,
+	MTK_USER_MAX
+};
+
+enum mtk_dev_evt_h2d {
+	DEV_EVT_H2D_DEVICE_RESET	= BIT(2),
+	DEV_EVT_H2D_MAX			= BIT(5)
+};
+
+enum mtk_dev_evt_d2h {
+	DEV_EVT_D2H_BOOT_FLOW_SYNC	= BIT(4),
+	DEV_EVT_D2H_ASYNC_HS_NOTIFY_SAP = BIT(5),
+	DEV_EVT_D2H_ASYNC_HS_NOTIFY_MD	= BIT(6),
+	DEV_EVT_D2H_MAX			= BIT(11)
+};
+
+struct mtk_md_dev;
+
+struct mtk_dev_ops {
+	u32 (*get_dev_state)(struct mtk_md_dev *mdev);
+	void (*ack_dev_state)(struct mtk_md_dev *mdev, u32 state);
+	u32 (*get_dev_cfg)(struct mtk_md_dev *mdev);
+	int (*register_dev_evt)(struct mtk_md_dev *mdev, u32 dev_evt,
+				int (*evt_cb)(u32 status, void *data), void *data);
+	void (*unregister_dev_evt)(struct mtk_md_dev *mdev, u32 dev_evt);
+	void (*mask_dev_evt)(struct mtk_md_dev *mdev, u32 dev_evt);
+	void (*unmask_dev_evt)(struct mtk_md_dev *mdev, u32 dev_evt);
+	void (*clear_dev_evt)(struct mtk_md_dev *mdev, u32 dev_evt);
+	int (*send_dev_evt)(struct mtk_md_dev *mdev, u32 dev_evt);
+};
+
+/* mtk_md_dev defines the structure of MTK modem device */
+struct mtk_md_dev {
+	struct device *dev;
+	const struct mtk_dev_ops *dev_ops;
+	void *hw_priv;
+	u32 hw_ver;
+	char dev_str[MTK_DEV_STR_LEN];
+};
+
+static inline u32 mtk_dev_get_dev_state(struct mtk_md_dev *mdev)
+{
+	return mdev->dev_ops->get_dev_state(mdev);
+}
+
+static inline void mtk_dev_ack_dev_state(struct mtk_md_dev *mdev, u32 state)
+{
+	return mdev->dev_ops->ack_dev_state(mdev, state);
+}
+
+static inline u32 mtk_dev_get_dev_cfg(struct mtk_md_dev *mdev)
+{
+	return mdev->dev_ops->get_dev_cfg(mdev);
+}
+
+static inline int mtk_dev_register_dev_evt(struct mtk_md_dev *mdev, u32 dev_evt,
+					   int (*evt_cb)(u32 status, void *data), void *data)
+{
+	return mdev->dev_ops->register_dev_evt(mdev, dev_evt, evt_cb, data);
+}
+
+static inline void mtk_dev_unregister_dev_evt(struct mtk_md_dev *mdev, u32 dev_evt)
+{
+	mdev->dev_ops->unregister_dev_evt(mdev, dev_evt);
+}
+
+static inline void mtk_dev_mask_dev_evt(struct mtk_md_dev *mdev, u32 dev_evt)
+{
+	mdev->dev_ops->mask_dev_evt(mdev, dev_evt);
+}
+
+static inline void mtk_dev_unmask_dev_evt(struct mtk_md_dev *mdev, u32 dev_evt)
+{
+	mdev->dev_ops->unmask_dev_evt(mdev, dev_evt);
+}
+
+static inline void mtk_dev_clear_dev_evt(struct mtk_md_dev *mdev, u32 dev_evt)
+{
+	mdev->dev_ops->clear_dev_evt(mdev, dev_evt);
+}
+
+static inline int mtk_dev_send_dev_evt(struct mtk_md_dev *mdev, u32 dev_evt)
+{
+	return mdev->dev_ops->send_dev_evt(mdev, dev_evt);
+}
+
+#endif /* __MTK_DEV_H__ */
diff --git a/drivers/net/wwan/t9xx/pcie/mtk_pci.c b/drivers/net/wwan/t9xx/pcie/mtk_pci.c
new file mode 100644
index 000000000000..c6a7196fcdd6
--- /dev/null
+++ b/drivers/net/wwan/t9xx/pcie/mtk_pci.c
@@ -0,0 +1,1049 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2022, MediaTek Inc.
+ */
+
+#include <linux/acpi.h>
+#include <linux/aer.h>
+#include <linux/bitfield.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include "mtk_dev.h"
+#include "mtk_pci.h"
+#include "mtk_pci_reg.h"
+
+#define MTK_PCI_BAR_NUM		6
+#define MTK_PCI_TRANSPARENT_ATR_SIZE	(0x3F)
+#define MTK_PCI_MINIMUM_ATR_SIZE	(0x1000)
+#define ATR_SIZE_LO32_MASK		GENMASK_ULL(31, 0)
+#define ATR_SIZE_HI32_MASK		GENMASK_ULL(63, 32)
+#define ATR_SIZE_BIAS_FROM_LO32		2
+#define ATR_ADDR_ALIGN_MASK		0xFFFFF000
+#define ATR_EN				BIT(0)
+#define ATR_PARAM_OFFSET		16
+/* Delay between ACPI PXP._OFF and _ON for modem power cycle stabilization */
+#define MTK_PLDR_POWER_OFF_DELAY_MS	500
+#define LE32_TO_U32(x) ((__force u32)(__le32)(x))
+#define SET_HW_BITS(dest, chs, mhccif, dev)		\
+	({						\
+		if ((chs) & (dev))					\
+			(dest) |= FIELD_PREP(mhccif, 1);		\
+	})
+
+struct mtk_mhccif_cb {
+	struct list_head entry;
+	int (*evt_cb)(u32 status, void *data);
+	void *data;
+	u32 chs;
+};
+
+/**
+ * mtk_pci_setup_atr() - Configure a PCIe address translation rule
+ * @mdev: MTK MD device
+ * @cfg: ATR configuration parameters
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int mtk_pci_setup_atr(struct mtk_md_dev *mdev, struct mtk_atr_cfg *cfg)
+{
+	struct mtk_pci_priv *priv = mdev->hw_priv;
+	u32 addr, val, size_h, size_l;
+	int atr_size, pos, offset;
+
+	if (cfg->transparent) {
+		/* No address conversion is performed */
+		atr_size = MTK_PCI_TRANSPARENT_ATR_SIZE;
+	} else {
+		if (cfg->size < MTK_PCI_MINIMUM_ATR_SIZE)
+			cfg->size = MTK_PCI_MINIMUM_ATR_SIZE;
+
+		if (cfg->src_addr & (cfg->size - 1)) {
+			dev_err((mdev)->dev, "Invalid atr src addr is not aligned to size\n");
+			return -EFAULT;
+		}
+
+		if (cfg->trsl_addr & (cfg->size - 1)) {
+			dev_err((mdev)->dev,
+				"Invalid atr trsl addr is not aligned to size, %llx, %llx\n",
+				cfg->trsl_addr, cfg->size - 1);
+			return -EFAULT;
+		}
+
+		size_l = FIELD_GET(ATR_SIZE_LO32_MASK, cfg->size);
+		size_h = FIELD_GET(ATR_SIZE_HI32_MASK, cfg->size);
+		pos = ffs(size_l);
+		if (pos) {
+			atr_size = pos - ATR_SIZE_BIAS_FROM_LO32;
+		} else {
+			pos = ffs(size_h);
+			atr_size = pos + 32 - ATR_SIZE_BIAS_FROM_LO32;
+		}
+	}
+
+	/* Calculate table offset */
+	offset = ATR_PORT_OFFSET * cfg->port + ATR_TABLE_OFFSET * cfg->table;
+	addr = REG_ATR_PCIE_WIN0_T0_SRC_ADDR_MSB + offset;
+	val = (u32)(cfg->src_addr >> 32);
+	mtk_pci_mac_write32(priv, addr, val);
+
+	addr = REG_ATR_PCIE_WIN0_T0_SRC_ADDR_LSB + offset;
+	val = (u32)(cfg->src_addr & ATR_ADDR_ALIGN_MASK) | (atr_size << 1) | ATR_EN;
+	mtk_pci_mac_write32(priv, addr, val);
+
+	addr = REG_ATR_PCIE_WIN0_T0_TRSL_ADDR_MSB + offset;
+	val = (u32)(cfg->trsl_addr >> 32);
+	mtk_pci_mac_write32(priv, addr, val);
+
+	addr = REG_ATR_PCIE_WIN0_T0_TRSL_ADDR_LSB + offset;
+	val = (u32)(cfg->trsl_addr & ATR_ADDR_ALIGN_MASK);
+	mtk_pci_mac_write32(priv, addr, val);
+
+	/* TRSL_PARAM */
+	addr = REG_ATR_PCIE_WIN0_T0_TRSL_PARAM + offset;
+	val = (cfg->trsl_param << ATR_PARAM_OFFSET) | cfg->trsl_id;
+	mtk_pci_mac_write32(priv, addr, val);
+
+	return 0;
+}
+
+/**
+ * mtk_pci_atr_disable() - Disable all PCIe address translation rules
+ * @priv: MTK PCI private data
+ */
+void mtk_pci_atr_disable(struct mtk_pci_priv *priv)
+{
+	int port, tbl, offset;
+	u32 val;
+
+	/* Disable all ATR table for all ports */
+	for (port = ATR_SRC_PCI_WIN0; port <= ATR_SRC_AXIS_3; port++)
+		for (tbl = 0; tbl < ATR_TABLE_NUM_PER_ATR; tbl++) {
+			/* Calculate table offset */
+			offset = ATR_PORT_OFFSET * port + ATR_TABLE_OFFSET * tbl;
+			val = mtk_pci_mac_read32(priv, REG_ATR_PCIE_WIN0_T0_SRC_ADDR_LSB + offset);
+			val = val & (~BIT(0));
+			/* Disable table by SRC_ADDR_L */
+			mtk_pci_mac_write32(priv, REG_ATR_PCIE_WIN0_T0_SRC_ADDR_LSB + offset, val);
+		}
+}
+
+static void mtk_pci_set_msix_merged(struct mtk_pci_priv *priv, int irq_cnt)
+{
+	mtk_pci_mac_write32(priv, REG_PCIE_CFG_MSIX, ffs(irq_cnt) * 2 - 1);
+}
+
+/**
+ * mtk_pci_get_dev_state() - Read the device state from the modem
+ * @mdev: MTK MD device
+ *
+ * Return: Device state value.
+ */
+u32 mtk_pci_get_dev_state(struct mtk_md_dev *mdev)
+{
+	return mtk_pci_mac_read32(mdev->hw_priv, REG_PCIE_DEBUG_DUMMY_7);
+}
+
+/**
+ * mtk_pci_ack_dev_state() - Acknowledge the device state to the modem
+ * @mdev: MTK MD device
+ * @state: State value to acknowledge
+ */
+void mtk_pci_ack_dev_state(struct mtk_md_dev *mdev, u32 state)
+{
+	mtk_pci_mac_write32(mdev->hw_priv, REG_PCIE_DEBUG_DUMMY_7, state);
+}
+
+/**
+ * mtk_pci_get_irq_id() - Map an IRQ source to its hardware IRQ ID
+ * @mdev: MTK MD device
+ * @irq_src: IRQ source enum
+ *
+ * Return: IRQ ID on success, -EINVAL on failure.
+ */
+int mtk_pci_get_irq_id(struct mtk_md_dev *mdev, enum mtk_irq_src irq_src)
+{
+	struct mtk_pci_priv *priv = mdev->hw_priv;
+	const int *irq_tbl = priv->cfg->irq_tbl;
+	int irq_id = -EINVAL;
+
+	if (irq_src > MTK_IRQ_SRC_MIN && irq_src < MTK_IRQ_SRC_MAX) {
+		irq_id = irq_tbl[irq_src];
+		if (irq_id < 0 || irq_id >= MTK_IRQ_CNT_MAX)
+			irq_id = -EINVAL;
+	}
+
+	return irq_id;
+}
+
+/**
+ * mtk_pci_get_virq_id() - Get the Linux virtual IRQ for a hardware IRQ ID
+ * @mdev: MTK MD device
+ * @irq_id: Hardware IRQ ID
+ *
+ * Return: Virtual IRQ number on success, negative error code on failure.
+ */
+int mtk_pci_get_virq_id(struct mtk_md_dev *mdev, int irq_id)
+{
+	struct pci_dev *pdev = to_pci_dev(mdev->dev);
+	struct mtk_pci_priv *priv = mdev->hw_priv;
+
+	if (!priv->irq_cnt || irq_id < 0)
+		return -EINVAL;
+
+	return pci_irq_vector(pdev, irq_id % priv->irq_cnt);
+}
+
+/**
+ * mtk_pci_register_irq() - Register a callback for a hardware IRQ
+ * @mdev: MTK MD device
+ * @irq_id: Hardware IRQ ID
+ * @irq_cb: Callback function
+ * @data: Private data passed to callback
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int mtk_pci_register_irq(struct mtk_md_dev *mdev, int irq_id,
+			 int (*irq_cb)(int irq_id, void *data), void *data)
+{
+	struct mtk_pci_priv *priv = mdev->hw_priv;
+
+	if ((irq_id < 0 || irq_id >= MTK_IRQ_CNT_MAX) || !irq_cb)
+		return -EINVAL;
+
+	if (priv->irq_cb_list[irq_id]) {
+		dev_err((mdev)->dev,
+			"Unable to register irq, irq_id=%d, it's already been register by %ps.\n",
+			irq_id, priv->irq_cb_list[irq_id]);
+		return -EFAULT;
+	}
+	priv->irq_cb_list[irq_id] = irq_cb;
+	priv->irq_cb_data[irq_id] = data;
+
+	return 0;
+}
+
+/**
+ * mtk_pci_unregister_irq() - Unregister a hardware IRQ callback
+ * @mdev: MTK MD device
+ * @irq_id: Hardware IRQ ID
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int mtk_pci_unregister_irq(struct mtk_md_dev *mdev, int irq_id)
+{
+	struct mtk_pci_priv *priv = mdev->hw_priv;
+
+	if (irq_id < 0 || irq_id >= MTK_IRQ_CNT_MAX)
+		return -EINVAL;
+
+	if (!priv->irq_cb_list[irq_id]) {
+		dev_err((mdev)->dev, "irq_id=%d has not been registered\n", irq_id);
+		return -EFAULT;
+	}
+	priv->irq_cb_list[irq_id] = NULL;
+	priv->irq_cb_data[irq_id] = NULL;
+
+	return 0;
+}
+
+/**
+ * mtk_pci_mask_irq() - Mask (disable) a hardware IRQ
+ * @mdev: MTK MD device
+ * @irq_id: Hardware IRQ ID
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int mtk_pci_mask_irq(struct mtk_md_dev *mdev, int irq_id)
+{
+	struct mtk_pci_priv *priv = mdev->hw_priv;
+
+	if (irq_id < 0 || irq_id >= MTK_IRQ_CNT_MAX ||
+	    priv->irq_type != PCI_IRQ_MSIX) {
+		dev_err(mdev->dev, "Failed to mask irq: input irq_id=%d\n", irq_id);
+		return -EINVAL;
+	}
+
+	mtk_pci_mac_write32(priv, REG_IMASK_HOST_MSIX_CLR_GRP0_0, BIT(irq_id));
+
+	return 0;
+}
+
+/**
+ * mtk_pci_unmask_irq() - Unmask (enable) a hardware IRQ
+ * @mdev: MTK MD device
+ * @irq_id: Hardware IRQ ID
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int mtk_pci_unmask_irq(struct mtk_md_dev *mdev, int irq_id)
+{
+	struct mtk_pci_priv *priv = mdev->hw_priv;
+
+	if (irq_id < 0 || irq_id >= MTK_IRQ_CNT_MAX ||
+	    priv->irq_type != PCI_IRQ_MSIX) {
+		dev_err(mdev->dev, "Failed to unmask irq: input irq_id=%d\n", irq_id);
+		return -EINVAL;
+	}
+
+	mtk_pci_mac_write32(priv, REG_IMASK_HOST_MSIX_SET_GRP0_0, BIT(irq_id));
+
+	return 0;
+}
+
+/**
+ * mtk_pci_clear_irq() - Clear (acknowledge) a hardware IRQ
+ * @mdev: MTK MD device
+ * @irq_id: Hardware IRQ ID
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int mtk_pci_clear_irq(struct mtk_md_dev *mdev, int irq_id)
+{
+	struct mtk_pci_priv *priv = mdev->hw_priv;
+
+	if (irq_id < 0 || irq_id >= MTK_IRQ_CNT_MAX ||
+	    priv->irq_type != PCI_IRQ_MSIX) {
+		dev_err(mdev->dev, "Failed to clear irq: input irq_id=%d\n", irq_id);
+		return -EINVAL;
+	}
+
+	mtk_pci_mac_write32(priv, REG_MSIX_ISTATUS_HOST_GRP0_0, BIT(irq_id));
+
+	return 0;
+}
+
+static u32 mtk_pci_ext_d2h_evt_hw_bits(u32 chs)
+{
+	u32 hw_bits = 0;
+
+	SET_HW_BITS(hw_bits, chs, MHCCIF_EP2RC_EVT_BOOT_FLOW_SYNC,
+		    DEV_EVT_D2H_BOOT_FLOW_SYNC);
+	SET_HW_BITS(hw_bits, chs, MHCCIF_EP2RC_EVT_ASYNC_HS_NOTIFY_SAP,
+		    DEV_EVT_D2H_ASYNC_HS_NOTIFY_SAP);
+	SET_HW_BITS(hw_bits, chs, MHCCIF_EP2RC_EVT_ASYNC_HS_NOTIFY_MD,
+		    DEV_EVT_D2H_ASYNC_HS_NOTIFY_MD);
+
+	return LE32_TO_U32(cpu_to_le32(hw_bits));
+}
+
+static u32 mtk_pci_ext_d2h_evt_chs(u32 hw_bits)
+{
+	u32 chs = 0;
+
+	if (!hw_bits)
+		return chs;
+
+	chs = FIELD_PREP(DEV_EVT_D2H_BOOT_FLOW_SYNC,
+			 FIELD_GET(MHCCIF_EP2RC_EVT_BOOT_FLOW_SYNC, hw_bits)) |
+	      FIELD_PREP(DEV_EVT_D2H_ASYNC_HS_NOTIFY_SAP,
+			 FIELD_GET(MHCCIF_EP2RC_EVT_ASYNC_HS_NOTIFY_SAP, hw_bits)) |
+	      FIELD_PREP(DEV_EVT_D2H_ASYNC_HS_NOTIFY_MD,
+			 FIELD_GET(MHCCIF_EP2RC_EVT_ASYNC_HS_NOTIFY_MD, hw_bits));
+
+	return chs;
+}
+
+/**
+ * mtk_pci_register_ext_evt() - Register a callback for MHCCIF device events
+ * @mdev: MTK MD device
+ * @chs: Bitmask of event channels to register
+ * @evt_cb: Callback function
+ * @data: Private data passed to callback
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int mtk_pci_register_ext_evt(struct mtk_md_dev *mdev, u32 chs,
+			     int (*evt_cb)(u32 status, void *data), void *data)
+{
+	struct mtk_pci_priv *priv = mdev->hw_priv;
+	struct mtk_mhccif_cb *cb;
+	int ret = 0;
+
+	if (!chs || !evt_cb)
+		return -EINVAL;
+
+	spin_lock_bh(&priv->mhccif_lock);
+	list_for_each_entry(cb, &priv->mhccif_cb_list, entry) {
+		if (cb->chs & chs) {
+			ret = -EFAULT;
+			dev_err((mdev)->dev,
+				"Unable to register evt, intersection: chs=0x%08x&0x%08x cb=%ps\n",
+				chs, cb->chs, cb->evt_cb);
+			goto err_spin_unlock;
+		}
+	}
+	cb = devm_kzalloc(mdev->dev, sizeof(*cb), GFP_ATOMIC);
+	if (!cb) {
+		ret = -ENOMEM;
+		goto err_spin_unlock;
+	}
+	cb->evt_cb = evt_cb;
+	cb->data = data;
+	cb->chs = chs;
+	list_add_tail(&cb->entry, &priv->mhccif_cb_list);
+err_spin_unlock:
+	spin_unlock_bh(&priv->mhccif_lock);
+
+	return ret;
+}
+
+/**
+ * mtk_pci_unregister_ext_evt() - Unregister an MHCCIF device event callback
+ * @mdev: MTK MD device
+ * @chs: Bitmask of event channels to unregister
+ */
+void mtk_pci_unregister_ext_evt(struct mtk_md_dev *mdev, u32 chs)
+{
+	struct mtk_pci_priv *priv = mdev->hw_priv;
+	struct mtk_mhccif_cb *cb, *next;
+
+	if (!chs)
+		return;
+
+	spin_lock_bh(&priv->mhccif_lock);
+	list_for_each_entry_safe(cb, next, &priv->mhccif_cb_list, entry) {
+		if (cb->chs == chs) {
+			list_del(&cb->entry);
+			devm_kfree(mdev->dev, cb);
+			goto out;
+		}
+	}
+	dev_warn((mdev)->dev,
+		 "Unable to unregister evt, no chs=0x%08x has been registered.\n", chs);
+out:
+	spin_unlock_bh(&priv->mhccif_lock);
+}
+
+/**
+ * mtk_pci_mask_ext_evt() - Mask (disable) MHCCIF device events
+ * @mdev: MTK MD device
+ * @chs: Bitmask of event channels to mask
+ */
+void mtk_pci_mask_ext_evt(struct mtk_md_dev *mdev, u32 chs)
+{
+	struct mtk_pci_priv *priv = mdev->hw_priv;
+	u32 hw_bits = mtk_pci_ext_d2h_evt_hw_bits(chs);
+
+	mtk_pci_write32(mdev, priv->cfg->mhccif_rc_base_addr +
+			MHCCIF_EP2RC_SW_INT_EAP_MASK_SET, hw_bits);
+}
+
+/**
+ * mtk_pci_unmask_ext_evt() - Unmask (enable) MHCCIF device events
+ * @mdev: MTK MD device
+ * @chs: Bitmask of event channels to unmask
+ */
+void mtk_pci_unmask_ext_evt(struct mtk_md_dev *mdev, u32 chs)
+{
+	struct mtk_pci_priv *priv = mdev->hw_priv;
+	u32 hw_bits = mtk_pci_ext_d2h_evt_hw_bits(chs);
+
+	mtk_pci_write32(mdev, priv->cfg->mhccif_rc_base_addr +
+			MHCCIF_EP2RC_SW_INT_EAP_MASK_CLR, hw_bits);
+}
+
+/**
+ * mtk_pci_clear_ext_evt() - Clear (acknowledge) MHCCIF device events
+ * @mdev: MTK MD device
+ * @chs: Bitmask of event channels to clear
+ */
+void mtk_pci_clear_ext_evt(struct mtk_md_dev *mdev, u32 chs)
+{
+	struct mtk_pci_priv *priv = mdev->hw_priv;
+	u32 hw_bits = mtk_pci_ext_d2h_evt_hw_bits(chs);
+
+	mtk_pci_write32(mdev, priv->cfg->mhccif_rc_base_addr +
+			MHCCIF_EP2RC_SW_INT_ACK, hw_bits);
+}
+
+static u32 mtk_pci_ext_h2d_evt_hw_bits(u32 chs)
+{
+	u32 hw_bits = 0;
+
+	SET_HW_BITS(hw_bits, chs, MHCCIF_RC2EP_EVT_DEVICE_RESET,
+		    DEV_EVT_H2D_DEVICE_RESET);
+	return LE32_TO_U32(cpu_to_le32(hw_bits));
+}
+
+/**
+ * mtk_pci_send_ext_evt() - Send an MHCCIF event to the modem
+ * @mdev: MTK MD device
+ * @ch: Event channel to trigger (must be a single bit)
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int mtk_pci_send_ext_evt(struct mtk_md_dev *mdev, u32 ch)
+{
+	struct mtk_pci_priv *priv = mdev->hw_priv;
+	u32 rc_base, hw_bits;
+
+	rc_base = priv->cfg->mhccif_rc_base_addr;
+
+	/* Only allow one ch to be triggered at a time */
+	if (!is_power_of_2(ch)) {
+		dev_err((mdev)->dev, "Unsupported ext evt ch=0x%08x\n", ch);
+		return -EINVAL;
+	}
+
+	hw_bits = mtk_pci_ext_h2d_evt_hw_bits(ch);
+	mtk_pci_write32(mdev, rc_base + MHCCIF_RC2EP_SW_BSY, hw_bits);
+	mtk_pci_write32(mdev, rc_base + MHCCIF_RC2EP_SW_TCHNUM, ffs(hw_bits) - 1);
+	return 0;
+}
+
+static u32 mtk_pci_get_ext_evt_hw_status(struct mtk_md_dev *mdev)
+{
+	struct mtk_pci_priv *priv = mdev->hw_priv;
+
+	return mtk_pci_read32(mdev, priv->cfg->mhccif_rc_base_addr +
+			      MHCCIF_EP2RC_SW_INT_STS);
+}
+
+/**
+ * mtk_pci_fldr() - Perform a Function Level Device Reset via ACPI _RST
+ * @mdev: MTK MD device
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int mtk_pci_fldr(struct mtk_md_dev *mdev)
+{
+#ifdef CONFIG_ACPI
+	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+	acpi_status acpi_ret;
+	acpi_handle handle;
+
+	if (acpi_disabled) {
+		dev_err((mdev)->dev, "Unsupported, acpi function isn't enable\n");
+		return -ENODEV;
+	}
+
+	handle = ACPI_HANDLE(mdev->dev);
+
+	if (!handle) {
+		dev_err((mdev)->dev, "Unsupported, acpi handle isn't found\n");
+		return -ENODEV;
+	}
+
+	if (!acpi_has_method(handle, "_RST")) {
+		dev_err((mdev)->dev, "Unsupported, _RST method isn't found\n");
+		return -ENODEV;
+	}
+
+	acpi_ret = acpi_evaluate_object(handle, "_RST", NULL, &buffer);
+	if (ACPI_FAILURE(acpi_ret)) {
+		dev_err((mdev)->dev, "Failed to execute _RST method: %s\n",
+			acpi_format_exception(acpi_ret));
+		return -EFAULT;
+	}
+
+	acpi_os_free(buffer.pointer);
+
+	return 0;
+#else /* !CONFIG_ACPI */
+	dev_err((mdev)->dev, "Unsupported, CONFIG ACPI hasn't been set to 'y'\n");
+
+	return -ENODEV;
+#endif /* !CONFIG_ACPI */
+}
+
+/**
+ * mtk_pci_pldr() - Perform a PCIe Link Down Reset via ACPI PXP._OFF/_ON
+ * @mdev: MTK MD device
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int mtk_pci_pldr(struct mtk_md_dev *mdev)
+{
+#ifdef CONFIG_ACPI
+	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+	struct pci_dev *bridge;
+	acpi_status acpi_ret;
+	acpi_handle handle;
+
+	if (acpi_disabled) {
+		dev_err((mdev)->dev, "Unsupported, acpi function isn't enable\n");
+		return -ENODEV;
+	}
+
+	bridge = pci_upstream_bridge(to_pci_dev(mdev->dev));
+	if (!bridge) {
+		dev_err((mdev)->dev, "Unable to find bridge\n");
+		return -ENODEV;
+	}
+
+	handle = ACPI_HANDLE(&bridge->dev);
+	if (!handle) {
+		dev_err((mdev)->dev, "Unsupported, acpi handle isn't found\n");
+		return -ENODEV;
+	}
+	if (!acpi_has_method(handle, "PXP._OFF") ||
+	    !acpi_has_method(handle, "PXP._ON")) {
+		dev_err((mdev)->dev, "Unsupported, pldr method isn't supported\n");
+		return -ENODEV;
+	}
+	acpi_ret = acpi_evaluate_object(handle, "PXP._OFF", NULL, &buffer);
+	if (ACPI_FAILURE(acpi_ret)) {
+		dev_err((mdev)->dev, "Failed to execute _OFF method: %s\n",
+			acpi_format_exception(acpi_ret));
+		return -EFAULT;
+	}
+	acpi_os_free(buffer.pointer);
+
+	msleep(MTK_PLDR_POWER_OFF_DELAY_MS);
+
+	buffer.length = ACPI_ALLOCATE_BUFFER;
+	buffer.pointer = NULL;
+	acpi_ret = acpi_evaluate_object(handle, "PXP._ON", NULL, &buffer);
+	if (ACPI_FAILURE(acpi_ret)) {
+		dev_err((mdev)->dev, "Failed to execute _ON method: %s\n",
+			acpi_format_exception(acpi_ret));
+		return -EFAULT;
+	}
+	acpi_os_free(buffer.pointer);
+
+	return 0;
+#else
+	dev_err((mdev)->dev, "Unsupported, CONFIG ACPI hasn't been set to 'y'\n");
+
+	return -ENODEV;
+#endif
+}
+
+/**
+ * mtk_pci_get_dev_cfg() - Read the device configuration from the modem
+ * @mdev: MTK MD device
+ *
+ * Return: Device configuration value.
+ */
+u32 mtk_pci_get_dev_cfg(struct mtk_md_dev *mdev)
+{
+	u32 val;
+
+	val = mtk_pci_mac_read32(mdev->hw_priv, REG_PCIE_DEBUG_DUMMY_4);
+	return (val >> MTK_CFG_INFO_BIT_SHIFT);
+}
+
+static int mtk_pci_dev_reset(struct mtk_md_dev *mdev, enum mtk_reset_type type)
+{
+	switch (type) {
+	case RESET_MHCCIF:
+		return mtk_pci_send_ext_evt(mdev, DEV_EVT_H2D_DEVICE_RESET);
+	case RESET_FLDR:
+		return mtk_pci_fldr(mdev);
+	case RESET_PLDR:
+		return mtk_pci_pldr(mdev);
+	default:
+		return -EINVAL;
+	}
+}
+
+/**
+ * mtk_pci_reset() - Reset the modem device
+ * @mdev: MTK MD device
+ * @type: Reset type (MHCCIF, FLDR, or PLDR)
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int mtk_pci_reset(struct mtk_md_dev *mdev, enum mtk_reset_type type)
+{
+	return mtk_pci_dev_reset(mdev, type);
+}
+
+/**
+ * mtk_pci_link_check() - Check if the PCIe link to the modem is active
+ * @mdev: MTK MD device
+ *
+ * Return: true if the device is present, false otherwise.
+ */
+bool mtk_pci_link_check(struct mtk_md_dev *mdev)
+{
+	return pci_device_is_present(to_pci_dev(mdev->dev));
+}
+
+static void mtk_mhccif_isr_work(struct work_struct *work)
+{
+	struct mtk_pci_priv *priv =
+		container_of(work, struct mtk_pci_priv, mhccif_work);
+	struct mtk_md_dev *mdev = priv->irq_desc->mdev;
+	struct mtk_mhccif_cb *cb;
+	u32 stat, mask, chs;
+
+	stat = mtk_pci_get_ext_evt_hw_status(mdev);
+	mask = mtk_pci_read32(mdev, priv->cfg->mhccif_rc_base_addr
+		+ MHCCIF_EP2RC_SW_INT_EAP_MASK);
+	if (unlikely(stat == U32_MAX && !(mtk_pci_link_check(mdev)))) {
+		/* When link failed, we don't need to unmask/clear. */
+		dev_err((mdev)->dev, "Failed to check link in MHCCIF handler.\n");
+		return;
+	}
+
+	stat &= ~mask;
+	chs = mtk_pci_ext_d2h_evt_chs(stat);
+	spin_lock_bh(&priv->mhccif_lock);
+	list_for_each_entry(cb, &priv->mhccif_cb_list, entry) {
+		if (cb->chs & chs)
+			cb->evt_cb(cb->chs & chs, cb->data);
+	}
+	spin_unlock_bh(&priv->mhccif_lock);
+
+	mtk_pci_clear_irq(mdev, priv->mhccif_irq_id);
+	mtk_pci_unmask_irq(mdev, priv->mhccif_irq_id);
+}
+
+static const struct  pci_device_id t9xx_pci_table[] = {
+	MTK_PCI_DEV_CFG(0x0900, mtk_dev_cfg_0900),
+	CEI_PCI_DEV_CFG(0x01CA, mtk_dev_cfg_0900),
+	{/* end: all zeroes */}
+};
+
+MODULE_DEVICE_TABLE(pci, t9xx_pci_table);
+
+static int mtk_pci_bar_init(struct mtk_md_dev *mdev)
+{
+	struct pci_dev *pdev = to_pci_dev(mdev->dev);
+	struct mtk_pci_priv *priv = mdev->hw_priv;
+	u32 bar[MTK_PCI_BAR_NUM];
+	int i, ret;
+
+	for (i = 0; i < MTK_PCI_BAR_NUM; i++)
+		pci_read_config_dword(to_pci_dev(mdev->dev),
+				      PCI_BASE_ADDRESS_0 + (i << 2), bar + i);
+
+	ret = pcim_iomap_regions(pdev, MTK_REQUESTED_BARS, mdev->dev_str);
+	if (ret) {
+		dev_err((mdev)->dev, "Failed to init MMIO. ret=%d\n", ret);
+		return ret;
+	}
+
+	/* get ioremapped memory */
+	priv->mac_reg_base = pcim_iomap_table(pdev)[MTK_BAR_0_1_IDX];
+	priv->bar23_addr = pcim_iomap_table(pdev)[MTK_BAR_2_3_IDX];
+	if (!priv->mac_reg_base || !priv->bar23_addr) {
+		dev_err((mdev)->dev, "Failed to init BAR.\n");
+		return -EINVAL;
+	}
+	/* We use MD view base address "0" to observe registers */
+	priv->ext_reg_base = priv->bar23_addr - ATR_PCIE_REG_TRSL_ADDR;
+
+	return 0;
+}
+
+static int mtk_mhccif_irq_cb(int irq_id, void *data)
+{
+	struct mtk_md_dev *mdev = data;
+	struct mtk_pci_priv *priv;
+
+	priv = mdev->hw_priv;
+	queue_work(system_highpri_wq, &priv->mhccif_work);
+
+	return 0;
+}
+
+static int mtk_mhccif_init(struct mtk_md_dev *mdev)
+{
+	struct mtk_pci_priv *priv = mdev->hw_priv;
+	int ret;
+
+	INIT_LIST_HEAD(&priv->mhccif_cb_list);
+	spin_lock_init(&priv->mhccif_lock);
+	INIT_WORK(&priv->mhccif_work, mtk_mhccif_isr_work);
+
+	ret = mtk_pci_get_irq_id(mdev, MTK_IRQ_SRC_MHCCIF);
+	if (ret < 0) {
+		dev_err((mdev)->dev, "Failed to get mhccif_irq_id. ret=%d\n", ret);
+		return ret;
+	}
+	priv->mhccif_irq_id = ret;
+
+	ret = mtk_pci_register_irq(mdev, priv->mhccif_irq_id, mtk_mhccif_irq_cb, mdev);
+	if (ret) {
+		dev_err((mdev)->dev, "Failed to register mhccif_irq callback\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static void mtk_mhccif_exit(struct mtk_md_dev *mdev)
+{
+	struct mtk_pci_priv *priv = mdev->hw_priv;
+
+	mtk_pci_unregister_irq(mdev, priv->mhccif_irq_id);
+	cancel_work_sync(&priv->mhccif_work);
+}
+
+static irqreturn_t mtk_pci_irq_handler(struct mtk_md_dev *mdev, u32 irq_state)
+{
+	struct mtk_pci_priv *priv = mdev->hw_priv;
+	int irq_id;
+
+	/* Check whether each set bit has a callback, if has, call it */
+	do {
+		irq_id = fls(irq_state) - 1;
+		irq_state &= ~BIT(irq_id);
+		if (likely(priv->irq_cb_list[irq_id]))
+			priv->irq_cb_list[irq_id](irq_id, priv->irq_cb_data[irq_id]);
+		else
+			dev_err((mdev)->dev, "Unhandled irq_id=%d, no callback for it.\n", irq_id);
+	} while (irq_state);
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t mtk_pci_irq_msix(int irq, void *data)
+{
+	struct mtk_pci_irq_desc *irq_desc = data;
+	struct mtk_md_dev *mdev = irq_desc->mdev;
+	struct mtk_pci_priv *priv;
+	u32 irq_state, irq_enable;
+
+	priv = mdev->hw_priv;
+	irq_state = mtk_pci_mac_read32(priv, REG_MSIX_ISTATUS_HOST_GRP0_0);
+	irq_enable = mtk_pci_mac_read32(priv, REG_IMASK_HOST_MSIX_GRP0_0);
+	irq_state &= irq_enable;
+
+	if (unlikely(!irq_state) ||
+	    unlikely(!((irq_state & GENMASK(priv->irq_cnt - 1, 0)) &
+		      irq_desc->msix_bits)))
+		return IRQ_NONE;
+
+	/* Mask the bit and user needs to unmask by itself */
+	mtk_pci_mac_write32(priv, REG_IMASK_HOST_MSIX_CLR_GRP0_0,
+			    irq_state & ~BIT(30));
+
+	return mtk_pci_irq_handler(mdev, irq_state);
+}
+
+static int mtk_pci_request_irq_msix(struct mtk_md_dev *mdev,
+				    int irq_cnt_allocated)
+{
+	struct mtk_pci_priv *priv = mdev->hw_priv;
+	struct mtk_pci_irq_desc *irq_desc;
+	struct pci_dev *pdev;
+	int irq_cnt;
+	int ret, i;
+
+	/* calculate the nearest 2's power number */
+	irq_cnt = BIT(fls(irq_cnt_allocated) - 1);
+	pdev = to_pci_dev(mdev->dev);
+	irq_desc = priv->irq_desc;
+	for (i = 0; i < irq_cnt; i++) {
+		irq_desc[i].mdev = mdev;
+		irq_desc[i].msix_bits = BIT(i);
+		snprintf(irq_desc[i].name, MTK_IRQ_NAME_LEN, "msix%d-%s", i, mdev->dev_str);
+		ret = pci_request_irq(pdev, i, mtk_pci_irq_msix, NULL,
+				      &irq_desc[i], irq_desc[i].name);
+		if (ret) {
+			dev_err((mdev)->dev, "Failed to request %s: ret=%d\n",
+				irq_desc[i].name, ret);
+			for (i--; i >= 0; i--)
+				pci_free_irq(pdev, i, &irq_desc[i]);
+			return ret;
+		}
+	}
+	priv->irq_cnt = irq_cnt;
+	priv->irq_type = PCI_IRQ_MSIX;
+
+	if (irq_cnt != MTK_IRQ_CNT_MAX)
+		mtk_pci_set_msix_merged(priv, irq_cnt);
+
+	return 0;
+}
+
+static int mtk_pci_request_irq(struct mtk_md_dev *mdev)
+{
+	struct pci_dev *pdev = to_pci_dev(mdev->dev);
+	int irq_cnt, ret;
+
+	irq_cnt = pci_alloc_irq_vectors(pdev, MTK_IRQ_CNT_MIN,
+					MTK_IRQ_CNT_MAX, PCI_IRQ_MSIX);
+
+	if (irq_cnt < MTK_IRQ_CNT_MIN) {
+		dev_err(mdev->dev,
+			"Unable to alloc pci irq vectors. ret=%d maxirqcnt=%d irqtype=0x%x\n",
+			irq_cnt, MTK_IRQ_CNT_MAX, PCI_IRQ_MSIX);
+		return -EFAULT;
+	}
+
+	ret = mtk_pci_request_irq_msix(mdev, irq_cnt);
+	if (ret)
+		pci_free_irq_vectors(pdev);
+
+	return ret;
+}
+
+static void mtk_pci_free_irq(struct mtk_md_dev *mdev)
+{
+	struct pci_dev *pdev = to_pci_dev(mdev->dev);
+	struct mtk_pci_priv *priv = mdev->hw_priv;
+	int i;
+
+	for (i = 0; i < priv->irq_cnt; i++)
+		pci_free_irq(pdev, i, &priv->irq_desc[i]);
+
+	pci_free_irq_vectors(pdev);
+}
+
+static const struct mtk_dev_ops pci_hw_ops = {
+	.get_dev_state = mtk_pci_get_dev_state,
+	.ack_dev_state = mtk_pci_ack_dev_state,
+	.get_dev_cfg = mtk_pci_get_dev_cfg,
+	.register_dev_evt = mtk_pci_register_ext_evt,
+	.unregister_dev_evt = mtk_pci_unregister_ext_evt,
+	.mask_dev_evt = mtk_pci_mask_ext_evt,
+	.unmask_dev_evt = mtk_pci_unmask_ext_evt,
+	.clear_dev_evt = mtk_pci_clear_ext_evt,
+	.send_dev_evt = mtk_pci_send_ext_evt,
+};
+
+static int mtk_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct device *dev = &pdev->dev;
+	struct mtk_pci_priv *priv;
+	struct mtk_md_dev *mdev;
+	int ret;
+
+	mdev = devm_kzalloc(dev, sizeof(*mdev), GFP_KERNEL);
+	if (!mdev) {
+		ret = -ENOMEM;
+		goto log_err;
+	}
+	mdev->dev_ops = &pci_hw_ops;
+	mdev->dev = dev;
+
+	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv) {
+		ret = -ENOMEM;
+		goto free_cntx_data;
+	}
+
+	pci_set_drvdata(pdev, mdev);
+	priv->cfg = (void *)id->driver_data;
+	priv->mdev = mdev;
+	mdev->hw_ver  = pdev->device;
+	mdev->hw_priv = priv;
+	mdev->dev     = dev;
+	snprintf(mdev->dev_str, MTK_DEV_STR_LEN, "%02x%02x%d",
+		 pdev->bus->number, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));
+	if (pdev->state_saved)
+		pci_restore_state(pdev);
+
+	ret = pcim_enable_device(pdev);
+	if (ret) {
+		dev_err((mdev)->dev, "Failed to enable pci device.\n");
+		goto free_priv_data;
+	}
+
+	ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
+	if (ret) {
+		dev_err((mdev)->dev, "Failed to set DMA Mask and Coherent. (ret=%d)\n", ret);
+		goto free_priv_data;
+	}
+
+	ret = mtk_pci_bar_init(mdev);
+	if (ret)
+		goto free_priv_data;
+
+	ret = priv->cfg->atr_init(mdev);
+	if (ret)
+		goto free_priv_data;
+
+	ret = mtk_mhccif_init(mdev);
+	if (ret)
+		goto free_priv_data;
+
+	/* mask all irqs */
+	if (priv->cfg->flag & MTK_CFG_IRQ_DFLT_MASK)
+		mtk_pci_mac_write32(priv, REG_IMASK_HOST_MSIX_CLR_GRP0_0, U32_MAX);
+
+	ret = mtk_pci_request_irq(mdev);
+	if (ret)
+		goto free_mhccif;
+
+	pci_set_master(pdev);
+	mtk_pci_unmask_irq(mdev, priv->mhccif_irq_id);
+
+	if (mtk_pci_link_check(mdev)) {
+		pci_save_state(pdev);
+	} else {
+		ret = -ENOLINK;
+		goto clear_master;
+	}
+
+	priv->saved_state = pci_store_saved_state(pdev);
+	if (!priv->saved_state) {
+		ret = -EFAULT;
+		goto clear_master;
+	}
+
+	return 0;
+
+clear_master:
+	pci_clear_master(pdev);
+	mtk_pci_free_irq(mdev);
+free_mhccif:
+	mtk_mhccif_exit(mdev);
+free_priv_data:
+	devm_kfree(dev, priv);
+free_cntx_data:
+	devm_kfree(dev, mdev);
+log_err:
+	dev_err(dev, "Failed to probe device, ret=%d\n", ret);
+
+	return ret;
+}
+
+static void mtk_pci_remove(struct pci_dev *pdev)
+{
+	struct mtk_md_dev *mdev = pci_get_drvdata(pdev);
+	struct mtk_pci_priv *priv = mdev->hw_priv;
+	struct device *dev = &pdev->dev;
+
+	mtk_pci_mask_irq(mdev, priv->mhccif_irq_id);
+
+	if (mtk_pci_pldr(mdev)) {
+		dev_warn(dev, "Failed to execute PLDR, try external event\n");
+		mtk_pci_reset(mdev, RESET_MHCCIF);
+	}
+
+	pci_clear_master(pdev);
+	mtk_pci_free_irq(mdev);
+	mtk_mhccif_exit(mdev);
+	pci_load_and_free_saved_state(pdev, &priv->saved_state);
+
+	devm_kfree(dev, priv);
+	devm_kfree(dev, mdev);
+}
+
+static pci_ers_result_t mtk_pci_error_detected(struct pci_dev *pdev,
+					       pci_channel_state_t state)
+{
+	struct mtk_md_dev *mdev = pci_get_drvdata(pdev);
+
+	dev_err((mdev)->dev, "AER detected: pci_channel_state_t=%d\n", state);
+
+	/* Request a slot reset. */
+	return PCI_ERS_RESULT_CAN_RECOVER;
+}
+
+static const struct pci_error_handlers mtk_pci_err_handler = {
+	.error_detected = mtk_pci_error_detected,
+};
+
+static struct pci_driver mtk_pci_drv = {
+	.name = "mtk_pci_drv",
+	.id_table = t9xx_pci_table,
+	.probe = mtk_pci_probe,
+	.remove = mtk_pci_remove,
+	.err_handler = &mtk_pci_err_handler
+};
+
+module_pci_driver(mtk_pci_drv);
+
+MODULE_DESCRIPTION("MediaTek T9xx PCIe WWAN driver pcie layer");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/wwan/t9xx/pcie/mtk_pci.h b/drivers/net/wwan/t9xx/pcie/mtk_pci.h
new file mode 100644
index 000000000000..9819a1b07c1b
--- /dev/null
+++ b/drivers/net/wwan/t9xx/pcie/mtk_pci.h
@@ -0,0 +1,234 @@
+/* SPDX-License-Identifier: GPL-2.0-only
+ *
+ * Copyright (c) 2022, MediaTek Inc.
+ */
+
+#ifndef __MTK_PCI_H__
+#define __MTK_PCI_H__
+
+#include <linux/pci.h>
+
+#include "../mtk_dev.h"
+
+enum mtk_irq_src {
+	MTK_IRQ_SRC_MIN,
+	MTK_IRQ_SRC_MHCCIF,
+	MTK_IRQ_SRC_DPMAIF,
+	MTK_IRQ_SRC_DPMAIF2,
+	MTK_IRQ_SRC_CLDMA0,
+	MTK_IRQ_SRC_CLDMA1,
+	MTK_IRQ_SRC_CLDMA2,
+	MTK_IRQ_SRC_CLDMA3,
+	MTK_IRQ_SRC_PM_LOCK,
+	MTK_IRQ_SRC_DPMAIF3,
+	MTK_IRQ_SRC_DPMAIF6,
+	MTK_IRQ_SRC_MAX
+};
+
+enum mtk_reset_type {
+	RESET_FLDR,
+	RESET_PLDR,
+	RESET_MHCCIF,
+};
+
+enum mtk_atr_type {
+	ATR_PCI2AXI = 0,
+	ATR_AXI2PCI,
+};
+
+enum mtk_atr_src_port {
+	ATR_SRC_PCI_WIN0 = 0,
+	ATR_SRC_PCI_WIN1,
+	ATR_SRC_AXIS_0,
+	ATR_SRC_AXIS_1,
+	ATR_SRC_AXIS_2,
+	ATR_SRC_AXIS_3,
+};
+
+enum mtk_atr_dst_port {
+	ATR_DST_PCI_TRX = 0,
+	ATR_DST_AXIM_0 = 4,
+	ATR_DST_AXIM_1,
+	ATR_DST_AXIM_2,
+	ATR_DST_AXIM_3,
+};
+
+enum mtk_pci_evt_h2d {
+	DEV_EVT_H2D_EXTEND_BASE            = DEV_EVT_H2D_MAX,
+	EXT_EVT_H2D_RESERVED_FOR_CLDMA0    = DEV_EVT_H2D_EXTEND_BASE << 1,
+	EXT_EVT_H2D_RESERVED_FOR_CLDMA1    = DEV_EVT_H2D_EXTEND_BASE << 2,
+	EXT_EVT_H2D_RESERVED_FOR_CLDMA3    = DEV_EVT_H2D_EXTEND_BASE << 3,
+	EXT_EVT_H2D_RESERVED_FOR_CLDMA2    = DEV_EVT_H2D_EXTEND_BASE << 4,
+	EXT_EVT_H2D_RESERVED_FOR_DPMAIF    = DEV_EVT_H2D_EXTEND_BASE << 5,
+	EXT_EVT_H2D_PCIE_PM_SUSPEND_REQ    = DEV_EVT_H2D_EXTEND_BASE << 6,
+	EXT_EVT_H2D_PCIE_PM_RESUME_REQ     = DEV_EVT_H2D_EXTEND_BASE << 7,
+	EXT_EVT_H2D_PCIE_PM_SUSPEND_REQ_AP = DEV_EVT_H2D_EXTEND_BASE << 8,
+	EXT_EVT_H2D_PCIE_PM_RESUME_REQ_AP  = DEV_EVT_H2D_EXTEND_BASE << 9,
+	EXT_EVT_H2D_RESERVED_FOR_TEST      = DEV_EVT_H2D_EXTEND_BASE << 11,
+};
+
+enum mtk_pci_evt_d2h {
+	DEV_EVT_D2H_EXTEND_BASE            = DEV_EVT_D2H_MAX,
+	EXT_EVT_D2H_RESERVED_FOR_CLDMA0    = DEV_EVT_D2H_EXTEND_BASE << 1,
+	EXT_EVT_D2H_RESERVED_FOR_CLDMA1    = DEV_EVT_D2H_EXTEND_BASE << 2,
+	EXT_EVT_D2H_RESERVED_FOR_CLDMA3    = DEV_EVT_D2H_EXTEND_BASE << 3,
+	EXT_EVT_D2H_RESERVED_FOR_CLDMA2    = DEV_EVT_D2H_EXTEND_BASE << 4,
+	EXT_EVT_D2H_RESERVED_FOR_DPMAIF    = DEV_EVT_D2H_EXTEND_BASE << 5,
+	EXT_EVT_D2H_PCIE_PM_SUSPEND_ACK    = DEV_EVT_D2H_EXTEND_BASE << 6,
+	EXT_EVT_D2H_PCIE_PM_RESUME_ACK     = DEV_EVT_D2H_EXTEND_BASE << 7,
+	EXT_EVT_D2H_PCIE_PM_SUSPEND_ACK_AP = DEV_EVT_D2H_EXTEND_BASE << 8,
+	EXT_EVT_D2H_PCIE_PM_RESUME_ACK_AP  = DEV_EVT_D2H_EXTEND_BASE << 9,
+	EXT_EVT_D2H_SOFT_OFF_NOTIFY        = DEV_EVT_D2H_EXTEND_BASE << 10,
+	EXT_EVT_D2H_FRC_DONE_NOTIFY        = DEV_EVT_D2H_EXTEND_BASE << 11,
+	EXT_EVT_D2H_RESERVED_FOR_TEST1	   = DEV_EVT_D2H_EXTEND_BASE << 12,
+	EXT_EVT_D2H_RESERVED_FOR_TEST2	   = DEV_EVT_D2H_EXTEND_BASE << 13,
+};
+
+#define MTK_PCI_CLASS                 0x0D4000
+#define MTK_PCI_VENDOR_ID             0x14C3
+#define CEI_PCI_VENDOR_ID             0x03F0
+
+#define MTK_CFG_INFO_BIT_SHIFT        4
+
+#define MTK_PCI_DEV_CFG(id, cfg) \
+{ \
+	PCI_DEVICE(MTK_PCI_VENDOR_ID, id), \
+	MTK_PCI_CLASS, PCI_ANY_ID, \
+	.driver_data = (kernel_ulong_t)&(cfg), \
+}
+
+#define CEI_PCI_DEV_CFG(id, cfg) \
+{ \
+	PCI_DEVICE(CEI_PCI_VENDOR_ID, id), \
+	MTK_PCI_CLASS, PCI_ANY_ID, \
+	.driver_data = (kernel_ulong_t)&(cfg), \
+}
+
+#define MTK_CFG_IRQ_DFLT_MASK		BIT(0)
+#define MTK_CFG_DISABLE_AP_DRM		BIT(2)
+#define MTK_CFG_PM_SW_IRQ		BIT(6)
+
+#define MTK_BAR_0_1_IDX                 0
+#define MTK_BAR_2_3_IDX                 2
+
+#define MTK_REQUESTED_BARS \
+	((1 << MTK_BAR_0_1_IDX) | \
+	 (1 << MTK_BAR_2_3_IDX))
+
+#define MTK_IRQ_CNT_MIN				1
+#define MTK_IRQ_CNT_MAX				32
+#define MTK_IRQ_NAME_LEN			32
+
+#define ATR_PORT_OFFSET				0x100
+#define ATR_TABLE_OFFSET			0x20
+#define ATR_TABLE_NUM_PER_ATR			8
+#define ATR_PCIE_REG_TRSL_ADDR			0x10000000
+#define ATR_PCIE_REG_SIZE			0x00400000
+#define ATR_PCIE_REG_PORT			ATR_SRC_PCI_WIN0
+#define ATR_PCIE_REG_TABLE_NUM			1
+#define ATR_PCIE_REG_TRSL_PORT			ATR_DST_AXIM_0
+#define ATR_PCIE_DEV_DMA_SRC_ADDR		0x00000000
+#define ATR_PCIE_DEV_DMA_TRANSPARENT		1
+#define ATR_PCIE_DEV_DMA_SIZE			0
+#define ATR_PCIE_DEV_DMA_TABLE_NUM		0
+#define ATR_PCIE_DEV_DMA_TRSL_ADDR		0x00000000
+
+struct mtk_pci_irq_desc {
+	struct mtk_md_dev *mdev;
+	u32 msix_bits;
+	char name[MTK_IRQ_NAME_LEN];
+};
+
+struct mtk_pci_dev_cfg {
+	u32 flag;
+	u32 mhccif_rc_base_addr;
+	u32 istatus_host_ctrl_addr;
+	int irq_tbl[MTK_IRQ_SRC_MAX];
+	int (*atr_init)(struct mtk_md_dev *mdev);
+};
+
+extern const struct mtk_pci_dev_cfg mtk_dev_cfg_0900;
+
+struct mtk_pci_priv {
+	struct mtk_md_dev *mdev;
+	const struct mtk_pci_dev_cfg *cfg;
+	void __iomem *bar23_addr;
+	void __iomem *mac_reg_base;
+	void __iomem *ext_reg_base;
+	int irq_cnt;
+	int irq_type;
+	void *irq_cb_data[MTK_IRQ_CNT_MAX];
+
+	int (*irq_cb_list[MTK_IRQ_CNT_MAX])(int irq_id, void *data);
+	struct mtk_pci_irq_desc irq_desc[MTK_IRQ_CNT_MAX];
+	struct list_head mhccif_cb_list;
+	/* mhccif_lock: lock to protect mhccif_cb_list */
+	spinlock_t mhccif_lock;
+	struct work_struct mhccif_work;
+	int mhccif_irq_id;
+	struct pci_saved_state *saved_state;
+};
+
+struct mtk_atr_cfg {
+	u64 src_addr;
+	u64 trsl_addr;
+	u64 size;
+	u32 type;      /* Port type */
+	u32 port;      /* Port number */
+	u32 table;     /* Table number (8 tables for each port) */
+	u32 trsl_id;
+	u32 trsl_param;
+	u32 transparent;
+};
+
+/* BAR 0/1 MMIO access */
+static inline u32 mtk_pci_mac_read32(struct mtk_pci_priv *priv, u64 addr)
+{
+	return ioread32(priv->mac_reg_base + addr);
+}
+
+static inline void mtk_pci_mac_write32(struct mtk_pci_priv *priv, u64 addr, u32 val)
+{
+	iowrite32(val, priv->mac_reg_base + addr);
+}
+
+/* BAR 2/3 MMIO access */
+static inline u32 mtk_pci_read32(struct mtk_md_dev *mdev, u64 addr)
+{
+	return ioread32(((struct mtk_pci_priv *)mdev->hw_priv)->ext_reg_base + addr);
+}
+
+static inline void mtk_pci_write32(struct mtk_md_dev *mdev, u64 addr, u32 val)
+{
+	iowrite32(val, ((struct mtk_pci_priv *)mdev->hw_priv)->ext_reg_base + addr);
+}
+
+/* Device operations */
+u32 mtk_pci_get_dev_state(struct mtk_md_dev *mdev);
+void mtk_pci_ack_dev_state(struct mtk_md_dev *mdev, u32 state);
+u32 mtk_pci_get_dev_cfg(struct mtk_md_dev *mdev);
+/* IRQ Related operations */
+int mtk_pci_get_irq_id(struct mtk_md_dev *mdev, enum mtk_irq_src irq_src);
+int mtk_pci_get_virq_id(struct mtk_md_dev *mdev, int irq_id);
+int mtk_pci_register_irq(struct mtk_md_dev *mdev, int irq_id,
+			 int (*irq_cb)(int irq_id, void *data), void *data);
+int mtk_pci_unregister_irq(struct mtk_md_dev *mdev, int irq_id);
+int mtk_pci_mask_irq(struct mtk_md_dev *mdev, int irq_id);
+int mtk_pci_unmask_irq(struct mtk_md_dev *mdev, int irq_id);
+int mtk_pci_clear_irq(struct mtk_md_dev *mdev, int irq_id);
+/* External event related */
+int mtk_pci_register_ext_evt(struct mtk_md_dev *mdev, u32 chs,
+			     int (*evt_cb)(u32 status, void *data), void *data);
+void mtk_pci_unregister_ext_evt(struct mtk_md_dev *mdev, u32 chs);
+void mtk_pci_mask_ext_evt(struct mtk_md_dev *mdev, u32 chs);
+void mtk_pci_unmask_ext_evt(struct mtk_md_dev *mdev, u32 chs);
+void mtk_pci_clear_ext_evt(struct mtk_md_dev *mdev, u32 chs);
+int mtk_pci_send_ext_evt(struct mtk_md_dev *mdev, u32 ch);
+int mtk_pci_fldr(struct mtk_md_dev *mdev);
+int mtk_pci_pldr(struct mtk_md_dev *mdev);
+int mtk_pci_reset(struct mtk_md_dev *mdev, enum mtk_reset_type type);
+bool mtk_pci_link_check(struct mtk_md_dev *mdev);
+int mtk_pci_setup_atr(struct mtk_md_dev *mdev, struct mtk_atr_cfg *cfg);
+void mtk_pci_atr_disable(struct mtk_pci_priv *priv);
+
+#endif /* __MTK_PCI_H__ */
diff --git a/drivers/net/wwan/t9xx/pcie/mtk_pci_drv_m9xx.c b/drivers/net/wwan/t9xx/pcie/mtk_pci_drv_m9xx.c
new file mode 100644
index 000000000000..88b44142afb7
--- /dev/null
+++ b/drivers/net/wwan/t9xx/pcie/mtk_pci_drv_m9xx.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2022, MediaTek Inc.
+ */
+#include <linux/types.h>
+#include "mtk_pci.h"
+#include "mtk_pci_reg.h"
+
+static int mtk_pci_atr_init_m9xx(struct mtk_md_dev *mdev)
+{
+	struct pci_dev *pdev = to_pci_dev(mdev->dev);
+	struct mtk_pci_priv *priv = mdev->hw_priv;
+	struct mtk_atr_cfg cfg;
+	int port, ret;
+
+	mtk_pci_atr_disable(priv);
+
+	/* Config ATR for RC to access device's register */
+	cfg.src_addr = pci_resource_start(pdev, MTK_BAR_2_3_IDX);
+	cfg.size = ATR_PCIE_REG_SIZE;
+	cfg.trsl_addr = ATR_PCIE_REG_TRSL_ADDR;
+	cfg.type = ATR_PCI2AXI;
+	cfg.port = ATR_PCIE_REG_PORT;
+	cfg.table = ATR_PCIE_REG_TABLE_NUM;
+	cfg.trsl_id = ATR_PCIE_REG_TRSL_PORT;
+	cfg.trsl_param = 0x0;
+	cfg.transparent = 0x0;
+	ret = mtk_pci_setup_atr(mdev, &cfg);
+	if (ret)
+		return ret;
+
+	/* Config ATR for EP to access RC's memory */
+	for (port = ATR_SRC_AXIS_0; port <= ATR_SRC_AXIS_3; port++) {
+		cfg.src_addr = ATR_PCIE_DEV_DMA_SRC_ADDR;
+		cfg.size = ATR_PCIE_DEV_DMA_SIZE;
+		cfg.trsl_addr = ATR_PCIE_DEV_DMA_TRSL_ADDR;
+		cfg.type = ATR_AXI2PCI;
+		cfg.port = port;
+		cfg.table = ATR_PCIE_DEV_DMA_TABLE_NUM;
+		cfg.trsl_id = ATR_DST_PCI_TRX;
+		cfg.trsl_param = 0x0;
+		/* Enable transparent translation */
+		cfg.transparent = ATR_PCIE_DEV_DMA_TRANSPARENT;
+		ret = mtk_pci_setup_atr(mdev, &cfg);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+const struct mtk_pci_dev_cfg mtk_dev_cfg_0900 = {
+	.flag = MTK_CFG_PM_SW_IRQ,
+	.mhccif_rc_base_addr = 0x1000A000,
+	.istatus_host_ctrl_addr = REG_ISTATUS_HOST_CTRL_NEW,
+	.irq_tbl = {
+		[MTK_IRQ_SRC_DPMAIF]  = 24,
+		[MTK_IRQ_SRC_CLDMA0]  = 27,
+		[MTK_IRQ_SRC_CLDMA1]  = 26,
+		[MTK_IRQ_SRC_CLDMA2]  = 25,
+		[MTK_IRQ_SRC_MHCCIF]  = 28,
+		[MTK_IRQ_SRC_DPMAIF2] = 29,
+		[MTK_IRQ_SRC_CLDMA3]  = 31,
+		[MTK_IRQ_SRC_PM_LOCK] = 0,
+		[MTK_IRQ_SRC_DPMAIF3] = 7,
+		[MTK_IRQ_SRC_DPMAIF6]  = 10,
+	},
+	.atr_init = mtk_pci_atr_init_m9xx,
+};
diff --git a/drivers/net/wwan/t9xx/pcie/mtk_pci_reg.h b/drivers/net/wwan/t9xx/pcie/mtk_pci_reg.h
new file mode 100644
index 000000000000..3f0667e8a846
--- /dev/null
+++ b/drivers/net/wwan/t9xx/pcie/mtk_pci_reg.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0-only
+ *
+ * Copyright (c) 2022, MediaTek Inc.
+ */
+
+#ifndef __MTK_PCI_REG_H__
+#define __MTK_PCI_REG_H__
+
+#define REG_ISTATUS_HOST_CTRL_NEW		0x031C
+#define REG_PCIE_MISC_CTRL			0x0348
+#define REG_PCIE_CFG_MSIX			0x03EC
+#define REG_ATR_PCIE_WIN0_T0_SRC_ADDR_LSB	0x0600
+#define REG_ATR_PCIE_WIN0_T0_SRC_ADDR_MSB	0x0604
+#define REG_ATR_PCIE_WIN0_T0_TRSL_ADDR_LSB	0x0608
+#define REG_ATR_PCIE_WIN0_T0_TRSL_ADDR_MSB	0x060C
+#define REG_ATR_PCIE_WIN0_T0_TRSL_PARAM		0x0610
+#define REG_PCIE_DEBUG_DUMMY_3			0x0D0C
+#define REG_PCIE_DEBUG_DUMMY_4			0x0D10
+#define REG_PCIE_DEBUG_DUMMY_7			0x0D1C
+#define REG_MSIX_ISTATUS_HOST_GRP0_0		0x0F00
+#define REG_IMASK_HOST_MSIX_SET_GRP0_0		0x3000
+#define REG_IMASK_HOST_MSIX_CLR_GRP0_0		0x3080
+#define REG_IMASK_HOST_MSIX_GRP0_0		0x3100
+
+/* mhccif registers */
+#define MHCCIF_RC2EP_SW_BSY			0x4
+#define MHCCIF_RC2EP_SW_TCHNUM			0xC
+#define MHCCIF_RC2EP_EVT_RESERVED_FOR_CLDMA0	BIT(4)
+#define MHCCIF_RC2EP_EVT_RESERVED_FOR_CLDMA1	BIT(5)
+#define MHCCIF_RC2EP_EVT_RESERVED_FOR_CLDMA3	BIT(6)
+#define MHCCIF_RC2EP_EVT_RESERVED_FOR_CLDMA2	BIT(7)
+#define MHCCIF_RC2EP_EVT_RESERVED_FOR_DPMAIF	BIT(8)
+#define MHCCIF_RC2EP_EVT_PCIE_PM_SUSPEND_REQ	BIT(9)
+#define MHCCIF_RC2EP_EVT_PCIE_PM_RESUME_REQ	BIT(10)
+#define MHCCIF_RC2EP_EVT_PCIE_PM_SUSPEND_REQ_AP	BIT(11)
+#define MHCCIF_RC2EP_EVT_PCIE_PM_RESUME_REQ_AP	BIT(12)
+#define MHCCIF_RC2EP_EVT_DEVICE_RESET		BIT(13)
+#define MHCCIF_RC2EP_EVT_RESERVED_FOR_TEST	BIT(31)
+
+#define MHCCIF_EP2RC_SW_INT_STS			0x10
+#define MHCCIF_EP2RC_SW_INT_ACK			0x14
+#define MHCCIF_EP2RC_SW_INT_EAP_MASK		0x20
+#define MHCCIF_EP2RC_SW_INT_EAP_MASK_SET	0x30
+#define MHCCIF_EP2RC_SW_INT_EAP_MASK_CLR	0x40
+#define MHCCIF_EP2RC_SPARE_REG_1		0x0104
+#define MHCCIF_EP2RC_SPARE_REG_5		0x0114
+#define MHCCIF_EP2RC_SPARE_REG_13		0x0134
+#define MHCCIF_EP2RC_SPARE_REG_14		0x0138
+#define MHCCIF_EP2RC_EVT_BOOT_FLOW_SYNC		BIT(5)
+#define MHCCIF_EP2RC_EVT_RESERVED_FOR_CLDMA0	BIT(6)
+#define MHCCIF_EP2RC_EVT_RESERVED_FOR_CLDMA1	BIT(7)
+#define MHCCIF_EP2RC_EVT_RESERVED_FOR_CLDMA3	BIT(8)
+#define MHCCIF_EP2RC_EVT_RESERVED_FOR_CLDMA2	BIT(9)
+#define MHCCIF_EP2RC_EVT_RESERVED_FOR_DPMAIF	BIT(10)
+#define MHCCIF_EP2RC_EVT_PCIE_PM_SUSPEND_ACK	BIT(11)
+#define MHCCIF_EP2RC_EVT_PCIE_PM_RESUME_ACK	BIT(12)
+#define MHCCIF_EP2RC_EVT_PCIE_PM_SUSPEND_ACK_AP	BIT(13)
+#define MHCCIF_EP2RC_EVT_PCIE_PM_RESUME_ACK_AP	BIT(14)
+#define MHCCIF_EP2RC_EVT_ASYNC_HS_NOTIFY_SAP	BIT(15)
+#define MHCCIF_EP2RC_EVT_ASYNC_HS_NOTIFY_MD	BIT(16)
+#define MHCCIF_EP2RC_EVT_SOFT_OFF_NOTIFY	BIT(17)
+#define MHCCIF_EP2RC_EVT_MD_REBOOT		BIT(19)
+#define MHCCIF_EP2RC_EVT_MD_POWEROFF		BIT(20)
+#define MHCCIF_EP2RC_EVT_GNSS_ENABLE		BIT(21)
+#define MHCCIF_EP2RC_EVT_GNSS_DISABLE		BIT(22)
+#define MHCCIF_EP2RC_EVT_FRC_DONE_NOTIFY	BIT(24)
+#define MHCCIF_EP2RC_EVT_RESERVED_FOR_TEST1	BIT(30)
+#define MHCCIF_EP2RC_EVT_RESERVED_FOR_TEST2	BIT(31)
+
+#endif /* __MTK_PCI_REG_H__ */

-- 
2.34.1



^ permalink raw reply related

* [PATCH v3 0/7] net: wwan: t9xx: Add MediaTek T9XX WWAN driver
From: Jack Wu via B4 Relay @ 2026-06-24 10:04 UTC (permalink / raw)
  To: Loic Poulain, Sergey Ryazanov, Johannes Berg, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Jack Wu, Wen-Zhi Huang, Shi-Wei Yeh, Minano Tseng,
	Matthias Brugger, AngeloGioacchino Del Regno, Simon Horman,
	Jonathan Corbet, Shuah Khan
  Cc: linux-kernel, netdev, linux-arm-kernel, linux-mediatek, linux-doc

T9XX is the PCIe host device driver for MediaTek's
t900 modem. The driver uses the WWAN framework
infrastructure to create the following control ports
and network interfaces for data transactions.
* /dev/wwan0at0 - Interface that supports AT commands.
* /dev/wwan0mbim0 - Interface conforming to the MBIM
  protocol.
* wwan0-X - Primary network interface for IP traffic.

The main blocks in the T9XX driver are:
* HW layer - Abstracts the hardware bus operations for
   the device, and provides generic interfaces for the
   transaction layer to get the device's information and
   control the device's behavior. It includes:

   * PCIe - Implements probe, removal and interrupt
     handling.
   * MHCCIF (Modem Host Cross-Core Interface) - Provides
     interrupt channels for bidirectional event
     notification such as handshake and port enumeration.

* Transaction layer - Implements data transactions for
   the control plane and the data plane. It includes:

   * DPMAIF (Data Plane Modem AP Interface) - Controls
     the hardware that provides uplink and downlink
     queues for the data path. The data exchange takes
     place using circular buffers to share data buffer
     addresses and metadata to describe the packets.
   * CLDMA (Cross Layer DMA) - Manages the hardware
     used by the port layer to send control messages to
     the device using MediaTek's CCCI (Cross-Core
     Communication Interface) protocol.
   * TX Services - Dispatch packets from the port layer
     to the device.
   * RX Services - Dispatch packets to the port layer
     when receiving packets from the device.

* Port layer - Provides control plane and data plane
   interfaces to userspace. It includes:

   * Control Plane - Provides device node interfaces
     for controlling data transactions.
   * Data Plane - Provides network link interfaces
     wwanX (0, 1, 2...) for IP data transactions.

* Core logic - Contains the core logic to keep the
   device working. It includes:

   * FSM (Finite State Machine) - Monitors the state
     of the device, and notifies each module when the
     state changes.

The compilation of the T9XX driver is enabled by the
CONFIG_MTK_T9XX and CONFIG_MTK_T9XX_PCI config option
which depends on CONFIG_WWAN.

This v2 submission covers the control plane only
(patches 1-6). The data plane will follow in a
separate series once the control plane is accepted.

---
Changes in v3:
- Address sashiko AI code review comments and fix sparse warnings
- Patch 1 (Add PCIe core):
  - Move extern declaration of mtk_dev_cfg_0900 from mtk_pci.c to mtk_pci.h to fix sparse warning
  - Remove mtk_pci_bar_exit(): pcim_iounmap_region() was called with bitmask instead of BAR index, and pcim_iomap_regions() is devres-managed so manual unmap is redundant [sashiko]
  - Remove pci_disable_device() from probe error path and remove path: pcim_enable_device() registers devres cleanup, manual disable causes enable_cnt underflow [sashiko]
- Patch 3 (Add control DMA interface):
  - Add #include "mtk_cldma.h" in mtk_cldma_drv_m9xx.c to fix sparse undeclared symbol warnings for mtk_cldma_regs_m9xx and cldma_drv_ops_m9xx
  - Move extern declaration of mtk_ctrl_info_m9xx from mtk_trans_ctrl.c to mtk_trans_ctrl.h to fix sparse undeclared symbol warning
  - Replace kcalloc + radix_tree_gang_lookup with radix_tree_for_each_slot() in mtk_ctrl_remove_radix_tree() to eliminate allocation in teardown path [sashiko]
  - Add missing mtk_pci_dev_exit() call in mtk_pci_remove() to properly clean up FSM and trans_ctrl resources before device removal [sashiko]
- Patch 4 (Add control port):
  - Replace kcalloc + radix_tree_gang_lookup with radix_tree_for_each_slot() and single-entry gang_lookup in mtk_port_tbl_destroy() to eliminate allocation failure in teardown path [sashiko]
  - Add port = NULL after mtk_port_put_locked() in mtk_port_internal_open() error path to prevent returning un-refcounted pointer [sashiko]
  - Restore skb_unlink + trb_complete for non-EAGAIN errors in mtk_ctrl_trb_handler() TX path to prevent infinite retry loop [sashiko]
- Patch 5 (Add FSM thread):
  - Check mtk_pci_register_irq() return value in mtk_cldma_dev_init() and add err_destroy_wq error label to fix unreachable dead code [sashiko]
  - Propagate specific error codes (ENOMEM/EIO/EINVAL) from mtk_cldma_dev_init() error paths instead of generic -EIO [sashiko]
  - Free head SKB after detaching frag_list in mtk_cldma_rxq_free() scatter-gather mode to fix memory leak [sashiko]
- Patch 6 (Add AT & MBIM WWAN ports):
  - Remove unused write_lock mutex from struct mtk_port and its mutex_init call [sashiko]
- Link to v2: https://patch.msgid.link/20260610-t9xx_driver_v1-v2-0-c65addf23b3f@compal.com

Changes in v2:
- Split series into control plane (this v2) and data plane (follow-up)
- Patch 1 (Add PCIe core):
  - Rename BAR_NUM to MTK_PCI_BAR_NUM for driver prefix consistency
  - Replace magic numbers in mtk_pci_setup_atr() with named defines
  - Remove redundant ATR register comments, use blank line separators
  - Add kernel-doc comments to all non-static functions
  - Convert 4 MMIO wrapper functions to static inline in header [sashiko]
  - Remove unnecessary unlikely() from IRQ validation paths
  - Add irq_cnt == 0 and irq_id < 0 guards in mtk_pci_get_virq_id() [sashiko]
  - Initialize hw_bits at declaration for consistency
  - Merge same-type variable declarations into single lines
  - Add #else/#endif comments for CONFIG_ACPI blocks
  - Add newlines in mtk_pci_pldr() for readability
  - Move return into default case in mtk_pci_dev_reset()
  - Simplify mtk_mhccif_init() error path to use direct returns
  - Change -EFAULT to -ENOLINK for PCIe link check failure
  - Rename goto label "out" to "log_err" in mtk_pci_probe()
  - Wrap long lines to stay within 80 columns
  - Fix IRQ vector leak: add pci_free_irq_vectors() on error path [sashiko]
  - Fix mtk_pci_remove() ordering: free IRQ before cancel_work_sync [sashiko]
  - Fix mtk_pci_pldr() ACPI buffer leak: free first result before second call [sashiko]
  - Replace msleep(500) with MTK_PLDR_POWER_OFF_DELAY_MS define
  - Remove unused EXT_EVT_H2D_DRM_DISABLE_AP and related register define [sashiko]
  - Increase MTK_IRQ_NAME_LEN from 20 to 32 to fix W=1 format-truncation warning [sashiko]
- Patch 2 (Add control plane transaction layer):
  - Add kernel-doc comments to mtk_ctrl_init() and mtk_ctrl_exit()
  - Change mtk_ctrl_exit() return type from int to void
  - Set mdev->ctrl_blk to NULL after freeing in mtk_ctrl_exit() [sashiko]
  - Change ctrl_blk from void* to typed struct mtk_ctrl_blk* [sashiko]
  - Remove redundant "depends on MTK_T9XX" from MTK_T9XX_PCI Kconfig [sashiko]
  - Use mtk_dev_free() instead of devm_kfree() in mtk_pci_probe() error path [sashiko]
- Patch 3 (Add control DMA interface):
  - Add @ops kernel-doc parameter for mtk_ctrl_init()
  - Rename 'err' to 'ret' consistently throughout the patch
  - Reorder variable declarations to follow reverse Christmas tree style
  - Change mtk_cldma_txq_free() return type from int to void
  - Change mtk_cldma_rxq_free() return type from int to void
  - Change mtk_cldma_exit() return type from int to void
  - Remove unnecessary zero-initialization of ret in mtk_cldma_start_xfer()
  - Remove unnecessary zero-initialization of ret in mtk_cldma_tx()
  - Use direct return instead of goto out in mtk_cldma_submit_tx() error paths
  - Move software state before HWO flag in mtk_cldma_submit_tx()
  - Squash variable declarations in mtk_cldma_check_intr_status()
  - Remove unlikely() from validation paths in mtk_cldma_check_ch_cfg()
  - Clamp data_recv_len with min_t to prevent skb_over_panic in mtk_cldma_rx_skb_adjust() [sashiko]
  - Use READ_ONCE() for HWO flag polling in mtk_cldma_check_rx_req() [sashiko]
  - Fix mtk_cldma_rx_done_work() to always unmask interrupt on error path [sashiko]
  - Add DMA address guard in mtk_cldma_txq_free() teardown loop [sashiko]
  - Add IS_ERR() check for kthread_run() in mtk_ctrl_trb_srv_init() [sashiko]
  - Fix queue_info memory leak on validation failure in mtk_pcie_hif_init() [sashiko]
  - Handle non-EAGAIN errors in mtk_ctrl_trb_handler() TX path [sashiko]
  - Fix 'err' typo to 'ret' in mtk_cldma_txbuf_set() error message
  - Remove unused variable mdev in mtk_cldma_rx_check_again() [sashiko]
  - Remove unused variables trans and ctrl_blk in mtk_cldma_txq_free() and mtk_cldma_rxq_free() [sashiko]
- Patch 4 (Add control port):
  - Add @cfg kernel-doc parameter for mtk_ctrl_init()
  - Update mtk_ctrl_init() return description to cover additional error codes
  - Fix double list_del in mtk_port_stale_list_grp_cleanup() [sashiko]
  - Fix direct mtk_port_trb_free() call to use kref_put() in mtk_port_ch_enable() error path [sashiko]
  - Fix direct mtk_port_trb_free() call to use kref_put() in mtk_port_ch_disable() error path [sashiko]
  - Add mtk_port_tbl_destroy() in mtk_port_mngr_init() error path to prevent port memory leak [sashiko]
  - Change port_ops exit/reset/enable/disable callbacks from int to void
  - Move -EIO dispatch comment to where the code was introduced
- Patch 5 (Add FSM thread):
  - Add bounds check for rtft_entry in mtk_fsm_parse_hs2_msg() [sashiko]
  - Add skb length validation before accessing ctrl_msg_header in mtk_fsm_sap_ctrl_msg_handler() [sashiko]
  - Fix skb leak on CTRL_MSG_HS2 mismatch return in mtk_fsm_sap_ctrl_msg_handler() [sashiko]
  - Add skb length validation before accessing ctrl_msg_header in mtk_fsm_md_ctrl_msg_handler() [sashiko]
  - Replace devm_kzalloc/devm_kfree with kzalloc/kfree for FSM events [sashiko]
  - Fix mtk_fsm_evt_submit() to return -ETIMEDOUT on blocking event timeout [sashiko]
  - Change FSM kthread from TASK_INTERRUPTIBLE to TASK_UNINTERRUPTIBLE [sashiko]
  - Remove unused variable hw_id in mtk_cldma_dev_exit() [sashiko]
- Patch 6 (Add AT & MBIM WWAN ports):
  - Use imperative mode in commit message
  - Remove unnecessary zero-initialization of ret in mtk_port_copy_data_from()
  - Change copy_from_user() error code from -EFAULT to -EINVAL in mtk_port_copy_data_from()
  - Return -EINVAL for zero-length write in mtk_port_common_write()
  - Change mtk_port_wwan_exit/enable/disable() return type from int to void
  - Fix packet_size to account for CCCI header reservation in mtk_port_common_write() [sashiko]
  - Fix WWAN tx callbacks to consume skb and return 0 per wwan_port_ops contract [sashiko]
  - Fix wwan_create_port() error path: clear ERR_PTR to NULL and call mtk_port_ch_disable() [sashiko]
- Patch 7 (Add maintainers entry): new patch
- Link to v1: https://patch.msgid.link/20260529-t9xx_driver_v1-v1-0-bdbfe2c01e57@compal.com

---
Jack Wu (7):
      net: wwan: t9xx: Add PCIe core
      net: wwan: t9xx: Add control plane transaction layer
      net: wwan: t9xx: Add control DMA interface
      net: wwan: t9xx: Add control port
      net: wwan: t9xx: Add FSM thread
      net: wwan: t9xx: Add AT & MBIM WWAN ports
      net: wwan: t9xx: Add maintainers entry

 MAINTAINERS                                     |    9 +
 drivers/net/wwan/Kconfig                        |   17 +
 drivers/net/wwan/Makefile                       |    1 +
 drivers/net/wwan/t9xx/Makefile                  |   14 +
 drivers/net/wwan/t9xx/mtk_ctrl_plane.c          |  111 ++
 drivers/net/wwan/t9xx/mtk_ctrl_plane.h          |   88 ++
 drivers/net/wwan/t9xx/mtk_dev.c                 |   55 +
 drivers/net/wwan/t9xx/mtk_dev.h                 |  114 ++
 drivers/net/wwan/t9xx/mtk_fsm.c                 |  948 +++++++++++++++
 drivers/net/wwan/t9xx/mtk_fsm.h                 |  140 +++
 drivers/net/wwan/t9xx/mtk_port.c                |  968 +++++++++++++++
 drivers/net/wwan/t9xx/mtk_port.h                |  174 +++
 drivers/net/wwan/t9xx/mtk_port_io.c             |  573 +++++++++
 drivers/net/wwan/t9xx/mtk_port_io.h             |   41 +
 drivers/net/wwan/t9xx/mtk_utility.h             |   33 +
 drivers/net/wwan/t9xx/pcie/Makefile             |   15 +
 drivers/net/wwan/t9xx/pcie/mtk_cldma.c          | 1420 +++++++++++++++++++++++
 drivers/net/wwan/t9xx/pcie/mtk_cldma.h          |  173 +++
 drivers/net/wwan/t9xx/pcie/mtk_cldma_drv.c      |  371 ++++++
 drivers/net/wwan/t9xx/pcie/mtk_cldma_drv.h      |  174 +++
 drivers/net/wwan/t9xx/pcie/mtk_cldma_drv_m9xx.c |  178 +++
 drivers/net/wwan/t9xx/pcie/mtk_cldma_drv_m9xx.h |  101 ++
 drivers/net/wwan/t9xx/pcie/mtk_ctrl_cfg_m9xx.c  |   55 +
 drivers/net/wwan/t9xx/pcie/mtk_pci.c            | 1102 ++++++++++++++++++
 drivers/net/wwan/t9xx/pcie/mtk_pci.h            |  234 ++++
 drivers/net/wwan/t9xx/pcie/mtk_pci_drv_m9xx.c   |   69 ++
 drivers/net/wwan/t9xx/pcie/mtk_pci_reg.h        |   71 ++
 drivers/net/wwan/t9xx/pcie/mtk_trans_ctrl.c     |  603 ++++++++++
 drivers/net/wwan/t9xx/pcie/mtk_trans_ctrl.h     |  107 ++
 29 files changed, 7959 insertions(+)
---
base-commit: eb3f4b7426cfd2b79d65b7d37155480b32259a11
change-id: 20260529-t9xx_driver_v1-1744f8af7739

Best regards,
--  
Jack Wu <jackbb_wu@compal.com>



^ permalink raw reply

* Re: [PATCH v3 8/8] docs: misc: amd-sbi: Document SBTSI userspace interface
From: Gupta, Akshay @ 2026-06-24 10:03 UTC (permalink / raw)
  To: Randy Dunlap, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-hwmon@vger.kernel.org
  Cc: corbet@lwn.net, skhan@linuxfoundation.org, linux@roeck-us.net,
	arnd@arndb.de, gregkh@linuxfoundation.org,
	Chatradhi, Naveen Krishna, Umarji, Anand, L k, Prathima
In-Reply-To: <f1e4c8f7-b3d9-46a3-b42f-710dcbcf15c0@infradead.org>


On 6/22/2026 10:27 PM, Randy Dunlap wrote:
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>
>
> On 6/22/26 6:58 AM, Akshay Gupta wrote:
>> From: Prathima <Prathima.Lk@amd.com>
>>
>> - Document AMD sideband IOCTL description defined
>>    for SBTSI and its usage.
>>    User space C-APIs are made available by esmi_oob_library [1],
>>    which is provided by the E-SMS project [2].
>>
>>    Link: https://github.com/amd/esmi_oob_library [1]
>>    Link: https://www.amd.com/en/developer/e-sms.html [2]
>>
>> Include a user-space open example for /dev/sbtsi-* and list auxiliary
>> bus sysfs paths.
>>
>> Reviewed-by: Akshay Gupta <Akshay.Gupta@amd.com>
>> Signed-off-by: Prathima <Prathima.Lk@amd.com>
>> ---
>> Changes since v2:
>> - Update misc node names info as per socket
>>
>> Changes since v1:
>> - Elaborate the document
>>   Documentation/misc-devices/amd-sbi.rst | 68 ++++++++++++++++++++++++++
>>   1 file changed, 68 insertions(+)
>>
>> diff --git a/Documentation/misc-devices/amd-sbi.rst b/Documentation/misc-devices/amd-sbi.rst
>> index f91ddadefe48..fbbbc504119f 100644
>> --- a/Documentation/misc-devices/amd-sbi.rst
>> +++ b/Documentation/misc-devices/amd-sbi.rst
>> @@ -48,6 +48,60 @@ Access restrictions:
>>    * APML Mailbox messages and Register xfer access are read-write,
>>    * CPUID and MCA_MSR access is read-only.
>>
>> +SBTSI device
>> +============
>> +
>> +sbtsi driver under the drivers/misc/amd-sbi creates miscdevice
>     The sbtsi driver in the drivers/misc/amd-sbi/ directory creates a miscdevice
>
>> +/dev/sbtsi-* to let user space programs run APML TSI register xfer
>                                                                   transfer
> ?

Hi,

yes, will update to "transfer" in next version. Thank you.

>
>> +commands.
>> +
>> +The driver supports both I2C and I3C transports for SB-TSI targets.
>> +The transport is selected by the bus where the device is enumerated.
>> +
>> +Misc device:
>> + * In 1P socket 0: /dev/sbtsi-4c
>> + * In 2P socket 0: /dev/sbtsi-4c, socket 1: /dev/sbtsi-48
>> +
>> +.. code-block:: bash
>> +
>> +   $ ls -al /dev/sbtsi-4c
>> +   crw-------    1 root     root       10, 116 Apr  2 05:22 /dev/sbtsi-4c
>> +
>> +
>> +Access restrictions:
>> + * Only root user is allowed to open the file.
>> + * APML TSI Register xfer access is read-write.
>                          transfer
> ?
>
>> +
>> +SBTSI hwmon interface
>> +=====================
> [snip]
>
> --
> ~Randy
>

^ permalink raw reply

* Re: [PATCH v3 1/8] hwmon/misc: amd-sbi: Move core sbtsi support from hwmon to misc
From: Gupta, Akshay @ 2026-06-24  9:59 UTC (permalink / raw)
  To: Julian Braha, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-hwmon@vger.kernel.org
  Cc: corbet@lwn.net, skhan@linuxfoundation.org, linux@roeck-us.net,
	arnd@arndb.de, gregkh@linuxfoundation.org,
	Chatradhi, Naveen Krishna, Umarji, Anand, L k, Prathima
In-Reply-To: <458e0f2f-9997-42b5-8f43-68799ff9d4e7@gmail.com>


On 6/23/2026 11:15 PM, Julian Braha wrote:
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>
>
> Hi Akshay,
>
> On 6/22/26 14:58, Akshay Gupta wrote:
>
>> diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
>> index e4c4f2b09732..8f204cf49b6e 100644
>> --- a/drivers/hwmon/Kconfig
>> +++ b/drivers/hwmon/Kconfig
>> @@ -1963,7 +1963,7 @@ config SENSORS_SL28CPLD
>>
>>   config SENSORS_SBTSI
>>        tristate "Emulated SB-TSI temperature sensor"
>> -     depends on I2C
>> +     select AMD_SBTSI
>>        help
>>          If you say yes here you get support for emulated temperature
>>          sensors on AMD SoCs with SB-TSI interface connected to a BMC device.
>> diff --git a/drivers/misc/amd-sbi/Kconfig b/drivers/misc/amd-sbi/Kconfig
>> index 30e7fad7356c..512251690e0e 100644
>> --- a/drivers/misc/amd-sbi/Kconfig
>> +++ b/drivers/misc/amd-sbi/Kconfig
>> @@ -20,3 +20,16 @@ config AMD_SBRMI_HWMON
>>          This provides support for RMI device hardware monitoring. If enabled,
>>          a hardware monitoring device will be created for each socket in
>>          the system.
>> +
>> +config AMD_SBTSI
>> +     tristate "AMD side band TSI support"
>> +     depends on I2C
>> +     depends on ARM || ARM64 || COMPILE_TEST
>> +     select AUXILIARY_BUS
>> +     help
>> +       Enables support for the AMD SB-TSI (Side Band Temperature Sensor
>> +       Interface) driver, which provides access to emulated CPU temperature
>> +       sensors on AMD SoCs via an I2C connected BMC device.
>> +
>> +       This driver can also be built as a module. If so, the module will
>> +       be called sbtsi.
> Your kconfig changes introduce an unmet dependency bug. When I enable
> SENSORS_SBTSI without enabling COMPILE_TEST on x86, I get this:
>
> WARNING: unmet direct dependencies detected for AMD_SBTSI
>    Depends on [n]: I2C [=y] && (ARM || ARM64 || COMPILE_TEST [=n])
>    Selected by [y]:
>    - SENSORS_SBTSI [=y] && HWMON [=y]
>
> - Julian Braha

Hi Julian,

Thank you, will address this in next version.

>

^ permalink raw reply

* Re: [PATCH v4 0/2] PM: dpm_watchdog: Improve DPM watchdog configurability
From: Tzung-Bi Shih @ 2026-06-24  9:31 UTC (permalink / raw)
  To: Jonathan Corbet, Rafael J. Wysocki, Greg Kroah-Hartman,
	Danilo Krummrich
  Cc: Shuah Khan, Pavel Machek, Len Brown, linux-doc, linux-kernel,
	linux-pm, driver-core, tfiga, senozhatsky, Randy Dunlap
In-Reply-To: <20260611021219.2093476-1-tzungbi@kernel.org>

On Thu, Jun 11, 2026 at 02:12:16AM +0000, Tzung-Bi Shih wrote:
> This series improves the configurability of the DPM watchdog.
> 
> Currently, the DPM watchdog timeouts are fixed at compile time, and the
> watchdog is always enabled if compiled in.  Also, the module parameters
> defined in drivers/base/power/main.c use the generic and non-descriptive
> "main" prefix.
> 
> This series addresses these limitations.
> 
> Patch 1 renames the module parameter prefix for drivers/base/power/main.c
> from "main" to "pm_sleep".
> 
> Patch 2 introduces the "dpm_watchdog_enabled" module parameter to allow
> enabling/disabling the watchdog at boot time and runtime.  It also adds
> CONFIG_DPM_WATCHDOG_ENABLED to set default value of the module parameter
> at compile time.

Just realized I messed up the Signed-off-by lines for the patches, so they
haven't been added correctly to the commit messages.  Please disregard this
series.  I'll fix and resend the series after v7.2-rc1 is out.

^ permalink raw reply

* RE: [External Mail] [PATCH v2 1/7] net: wwan: t9xx: Add PCIe core
From: Wu. JackBB (GSM) @ 2026-06-24  9:15 UTC (permalink / raw)
  To: Loic Poulain, Sergey Ryazanov, Johannes Berg, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Wen-Zhi Huang, Shi-Wei Yeh, Minano Tseng, Matthias Brugger,
	AngeloGioacchino Del Regno, Simon Horman, Jonathan Corbet,
	Shuah Khan, Wu. JackBB (GSM)
  Cc: linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org,
	linux-mediatek@lists.infradead.org, linux-doc@vger.kernel.org
In-Reply-To: <20260610-t9xx_driver_v1-v2-1-c65addf23b3f@compal.com>

Hi Jakub,

Addressing sashiko AI code review comments for this patch, as
requested by you in the patch 3/7 review:
https://patchwork.kernel.org/project/netdevbpf/patch/20260610-t9xx_driver_v1-v2-3-c65addf23b3f@compal.com/#27006088

Q1: Does this code perform an incorrect double byte-swap on big-endian
architectures? The hardware bits are manually swapped using cpu_to_le32()
and cast back to u32. This value is later passed to mtk_pci_write32(),
which utilizes iowrite32(). Since iowrite32() internally handles
host-to-little-endian conversion, swapping the value beforehand will cause
a double swap on big-endian platforms.

  This driver targets MediaTek T9xx PCIe WWAN modems, which exist
  exclusively on x86 platforms (little-endian). On little-endian,
  cpu_to_le32() is a no-op and LE32_TO_U32() is a simple cast, so
  no byte-swap occurs. The hardware register layout assumes LE host
  ordering. While the pattern is technically incorrect for big-endian,
  this hardware is not available on BE platforms and the Kconfig
  dependency (depends on PCI) combined with the device's PCIe-only
  nature effectively restricts this to x86/ARM64-LE.

Q2: Does this call to ffs() yield the wrong channel index on big-endian
systems? hw_bits has already been endian-swapped in
mtk_pci_ext_h2d_evt_hw_bits(). Using ffs() on an endian-swapped value
produces a completely incorrect bit index.

  Same as Q1 — on little-endian platforms, no swap occurs, so ffs()
  operates on the correct value. Additionally, all callers of
  mtk_pci_send_ext_evt() pass known-valid channel values through the
  is_power_of_2(ch) check, and the SET_HW_BITS mapping covers all
  valid channels.

Q3: Does clearing the top-level interrupt status at the end of the handler
without a subsequent read loop lead to permanently lost hardware events?
If a new hardware event triggers during the worker's execution, clearing
the write-1-to-clear (W1C) interrupt status after the hardware event
statuses were read at the start will discard the newly asserted events.

  No events are lost. mtk_pci_clear_irq() clears the MSI-X interrupt
  status register (BIT(irq_id) in REG_MSIX_ISTATUS_HOST_GRP0_0), NOT
  the MHCCIF event registers. The MHCCIF events are level-triggered
  from the modem's EP2RC registers — the modem sets event bits and
  they remain set until the modem clears them or the host acknowledges.

  The flow is:
    1. Hardware: modem sets EP2RC event bits -> MSI-X interrupt fires
    2. ISR: masks MSI-X bit, schedules work
    3. Worker: reads EP2RC event status, dispatches callbacks
    4. Worker: clears MSI-X status (BIT(irq_id)), unmasks MSI-X

  If a new MHCCIF event arrives during step 3, the EP2RC register
  gets the new bit set. The MSI-X clear in step 4 only clears the
  MSI-X pending bit, not the MHCCIF source. After unmask, the MHCCIF
  event source re-asserts the MSI-X interrupt because the EP2RC bits
  are still set, triggering a new ISR -> new work -> new read of the
  updated EP2RC register.

Q4: Does this code incorrectly attempt to unmap a bitmask instead of a
single BAR index? MTK_REQUESTED_BARS is defined as a bitmask, but
pcim_iounmap_region() expects a specific BAR index. Furthermore, is
this manual unmapping redundant since the memory was mapped with
pcim_iomap_regions() and will automatically be cleaned up by devres?

  Valid on both points. Fixed in v3 by removing mtk_pci_bar_exit()
  entirely. pcim_iomap_regions() is devres-managed so manual unmap
  is redundant.

Q5: Does this logic mistakenly drop hardware interrupts when fewer than 32
MSI-X vectors are allocated? If fewer than 32 vectors are allocated
(for example, irq_cnt = 1), GENMASK(0, 0) evaluates to BIT(0). A
hardware interrupt source like MHCCIF (bit 28) will result in
BIT(28) & BIT(0) == 0. Could this lead to a complete loss of critical
device events?

  No issue. This is the MSI-X merged mode design. When fewer vectors
  are allocated than interrupt sources, mtk_pci_set_msix_merged()
  configures the hardware to merge multiple interrupt sources into
  the available vectors. With irq_cnt=1, the hardware maps ALL
  interrupt sources to vector 0. The GENMASK filters the irq_state
  to only the allocated vector bits, then checks if the current
  handler's vector bit is set. This correctly handles both full
  allocation (32 vectors, 1:1 mapping) and merged mode (fewer
  vectors, N:1 mapping).

Q6: Will this manual pci_disable_device() call trigger an atomic underflow?
Since the device was initialized using pcim_enable_device(), devres will
automatically disable the device during cleanup. Manually calling
pci_disable_device() drops the enable_cnt to zero. When devres executes
its cleanup, it will call pci_disable_device() again, leading to an
underflow and a kernel warning.

  Valid. Fixed in v3 by removing pci_disable_device() from both the
  probe error path and mtk_pci_remove(). pcim_enable_device() handles
  cleanup via devres.

Q7: Does performing a PCIe Link Down Reset (PLDR) before freeing IRQs cause
PCIe Unsupported Request (UR) faults? mtk_pci_pldr() power-cycles the
device and clears its BARs. Directly after this, mtk_pci_free_irq() is
called, which attempts to access the device's MSI-X MMIO table.

  No UR faults. mtk_pci_pldr() calls ACPI PXP._OFF then PXP._ON.
  The _ON call restores power to the device, bringing the PCIe link
  back up with BARs accessible. pci_free_irq_vectors() writes to PCI
  config space (MSI-X capability enable bit) which is always
  accessible while the device is on the bus. The MSI-X table MMIO
  access for masking individual entries goes through the
  re-established BAR mapping.

Q8: Will this manual pci_disable_device() call in mtk_pci_remove() trigger
an atomic underflow for the same reason as in mtk_pci_probe()?
Is a call to mtk_pci_dev_exit() missing from the remove path?

  Two issues, both valid and fixed in v3:
  1. pci_disable_device() underflow: same as Q6, removed.
  2. Missing mtk_pci_dev_exit(): added in patch 3/7 v3 (where
     mtk_pci_dev_init is introduced).

Thanks.

================================================================================================================================================================
This message may contain information which is private, privileged or confidential of Compal Electronics, Inc. If you are not the intended recipient of this message, please notify the sender and destroy/delete the message. Any review, retransmission, dissemination or other use of, or taking of any action in reliance upon this information, by persons or entities other than the intended recipient is prohibited.
================================================================================================================================================================

^ permalink raw reply

* RE: [External Mail] [PATCH v2 6/7] net: wwan: t9xx: Add AT & MBIM WWAN ports
From: Wu. JackBB (GSM) @ 2026-06-24  9:24 UTC (permalink / raw)
  To: Loic Poulain, Sergey Ryazanov, Johannes Berg, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Wen-Zhi Huang, Shi-Wei Yeh, Minano Tseng, Matthias Brugger,
	AngeloGioacchino Del Regno, Simon Horman, Jonathan Corbet,
	Shuah Khan, Wu. JackBB (GSM)
  Cc: linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org,
	linux-mediatek@lists.infradead.org, linux-doc@vger.kernel.org
In-Reply-To: <20260610-t9xx_driver_v1-v2-6-c65addf23b3f@compal.com>

Hi Jakub,

Addressing sashiko AI code review comments for this patch, as
requested by you in the patch 3/7 review:
https://patchwork.kernel.org/project/netdevbpf/patch/20260610-t9xx_driver_v1-v2-3-c65addf23b3f@compal.com/#27006088

Q1: Is this mutex ever acquired? It is initialized during port setup, but
it does not appear to be used to serialize operations in
mtk_port_common_write() or when sending data.

  Valid. Fixed in v3 by removing the unused write_lock mutex from
  struct mtk_port and its mutex_init call.

Q2: Will concurrent writes safely execute here without holding write_lock?
Could this lack of serialization lead to sequence number corruption?

  The WWAN core framework holds port->ops_lock (a mutex) around the
  tx/tx_blocking callback invocations, serializing write operations
  on the same WWAN port. For internal ports, writes are exclusively
  performed by the FSM kthread, which is single-threaded. Concurrent
  writes to the same port do not occur, and port->tx_seq is always
  modified by a single thread.

Q3: What happens if a blocking write is interrupted by a port teardown?
If PORT_S_WR is cleared, trb->status remains at MTK_DFLT_TRB_STATUS (1).
ret = (!trb->status) ? len : trb->status evaluates to 1, causing an
incorrect byte count to be returned.

  This only occurs during port teardown (PORT_S_WR cleared by
  mtk_port_common_close or disable). At this point the port is being
  shut down and the return value is largely irrelevant — the caller
  cannot meaningfully use the port afterward. The submitted data may
  or may not have been transmitted by DMA, depending on timing. This
  is a teardown-only scenario with no practical impact on data
  integrity.

Q4: Is it safe to mutate the flags directly here? This is a non-atomic
read-modify-write on the shared port structure.

  The WWAN core holds port->ops_lock (a mutex) around tx/tx_blocking
  callbacks, serializing flag modifications. mtk_port_wwan_write()
  and mtk_port_wwan_write_blocking() are never called concurrently
  for the same port.

Q5: Does this silently drop data on partial writes? If
mtk_port_common_write() returns a positive value (partial success),
consume_skb is called unconditionally and 0 (success) is returned.

  This is a design limitation of the WWAN port API: wwan_port_op_tx
  returns 0 for success or negative for error — there is no mechanism
  to report partial writes back to the WWAN core. The already-submitted
  fragments cannot be recalled from the DMA engine. For the AT/MBIM
  control ports in this driver, messages are small (typically under
  1KB, within a single MTU). The multi-fragment path is rarely
  exercised for control plane traffic.

Q6: As with mtk_port_wwan_write(), mutating the shared blocking flag
without atomics could race, and ignoring positive return values could
lead to silent data loss.

  Same as Q4 (flags serialized by WWAN core ops_lock) and Q5
  (partial write is a WWAN API limitation, rare for control messages).

Q7: Is there a race condition here if wwan_create_port() fails? The return
value is directly assigned to w_port without checking IS_ERR() first.
Could concurrent RX pass the error pointer to wwan_port_rx()?

  No race. mtk_port_wwan_enable() is called from the FSM thread
  during the handshake sequence, before the port starts receiving
  data. The CLDMA RX queue for this port has not been opened at this
  point — RX data only arrives after the modem completes its
  handshake. The RX path cannot observe the error pointer because no
  data arrives until after the port is fully enabled.

Q8: Is the WWAN port exposed to userspace before its state is fully
initialized? wwan_create_port() registers the character device and
triggers a uevent. If userspace opens immediately, PORT_S_ENABLE is
not set yet so open returns -ENODEV.

  The window between wwan_create_port() returning and
  set_bit(PORT_S_ENABLE) is a few instructions (nanoseconds). If
  userspace opens in that window, the open returns -ENODEV and the
  application retries. In practice, user space WWAN managers (e.g.,
  ModemManager) wait for udev events to settle before opening ports.
  Reordering to set PORT_S_ENABLE before wwan_create_port is not
  correct either — the port should not be marked enabled before the
  WWAN port object exists.

Thanks.

================================================================================================================================================================
This message may contain information which is private, privileged or confidential of Compal Electronics, Inc. If you are not the intended recipient of this message, please notify the sender and destroy/delete the message. Any review, retransmission, dissemination or other use of, or taking of any action in reliance upon this information, by persons or entities other than the intended recipient is prohibited.
================================================================================================================================================================

^ permalink raw reply

* RE: [External Mail] [PATCH v2 5/7] net: wwan: t9xx: Add FSM thread
From: Wu. JackBB (GSM) @ 2026-06-24  9:23 UTC (permalink / raw)
  To: Loic Poulain, Sergey Ryazanov, Johannes Berg, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Wen-Zhi Huang, Shi-Wei Yeh, Minano Tseng, Matthias Brugger,
	AngeloGioacchino Del Regno, Simon Horman, Jonathan Corbet,
	Shuah Khan, Wu. JackBB (GSM)
  Cc: linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org,
	linux-mediatek@lists.infradead.org, linux-doc@vger.kernel.org
In-Reply-To: <20260610-t9xx_driver_v1-v2-5-c65addf23b3f@compal.com>

Hi Jakub,

Addressing sashiko AI code review comments for this patch, as
requested by you in the patch 3/7 review:
https://patchwork.kernel.org/project/netdevbpf/patch/20260610-t9xx_driver_v1-v2-3-c65addf23b3f@compal.com/#27006088

Q1: Will this compile on standard compilers? The container_of macro relies
on offsetof, which requires an integer constant expression. Evaluating
hs_info->id dynamically at runtime will cause a compilation error.

  This compiles correctly. The Linux kernel's container_of uses
  __builtin_offsetof (GCC extension), which supports variable array
  indices. GCC and Clang both generate runtime offset calculation
  code for container_of(ptr, type, member[variable_index]). This
  pattern is used in multiple places in the kernel.

Q2: Could this result in a NULL pointer dereference? If the device sends
multiple CTRL_MSG_HS2 messages, the second message overwrites the skb
pointer. When the first event is processed, it frees the skb and sets
hs_info->rt_data to NULL, causing a deref on the second event.

  This cannot happen in practice. The handshake follows a strict
  HS1 -> HS2 -> HS3 sequence. The MHCCIF channel interrupt is masked
  in mtk_fsm_hs1_handler() before the FSM event is submitted,
  preventing duplicate HS2 notifications. Only one HS2 message is
  expected per handshake cycle.

Q3: Can this read past the end of the packet? We pass rtft_entry->data to
the action callback before checking if rtft_entry->data_len fits within
the remaining packet length.

  The bounds check at the loop start ensures the rtft_entry struct
  header is within bounds. The action functions only read fixed-size
  fields (e.g., a single __le32 for packet padding mode). The modem
  firmware is a trusted source — the feature query protocol uses
  head_pattern/tail_pattern integrity checks, and the message format
  is guaranteed by the modem firmware.

Q4: Could this result in unaligned memory accesses on the next loop
iteration? Since data_len is not verified to be a multiple of 4, the
next rtft_entry could be misaligned.

  On x86 (the target platform for this PCIe WWAN modem), unaligned
  memory access is handled natively without traps. The feature query
  entries from the modem firmware are always naturally aligned in
  practice (data_len is always a multiple of 4 for the defined
  feature types).

Q5: Does this code ensure the packet is long enough before parsing the
feature query? It casts and reads ft_query fields without verifying if
hs_info->rt_data_len is large enough to hold a feature_query structure.

  The SKB is received from the CLDMA RX path with validated length.
  After skb_pull, the remaining data is the feature query payload.
  The head/tail pattern check immediately following this cast
  validates the message integrity. A truncated message would fail the
  pattern check and take the error path.

Q6: Is there a risk of accessing uninitialized memory here? If the skb is
fragmented, checking skb->len is not sufficient. We should use
pskb_may_pull() before casting and accessing skb->data.

  The RX SKBs are allocated internally by the CLDMA driver using
  __dev_alloc_skb with linear buffers. The CCCI header and control
  message data always reside in the contiguous linear data region.
  These SKBs are not received from the network stack and are always
  linear, so pskb_may_pull is not needed.

Q7: Does this introduce a leak if the device sends multiple messages
rapidly? Overwriting hs_info->rt_data discards the previous skb pointer
without freeing it.

  Same as Q2. The handshake is a strict HS1 -> HS2 -> HS3 sequence
  with the MHCCIF channel masked after HS1. Only one HS2 message
  arrives per handshake cycle. No overwrite can occur.

Q8: Does this permanently hang the device initialization? The interrupt is
masked and cleared at the beginning. If the device stage is not
DEV_STAGE_IDLE, we return without unmasking.

  The early_bootup_handler is registered for the BOOT_FLOW_SYNC
  MHCCIF event. The MTK modem firmware sends this event once, only
  when the modem reaches IDLE stage. The non-IDLE code path handles
  unexpected states defensively — masking prevents spurious interrupts
  from an error state. The normal boot flow always reaches IDLE
  before sending BOOT_FLOW_SYNC.

Q9: Can concurrent modifications corrupt the list traversal? External
modules can call mtk_fsm_notifier_register() to modify these lists
concurrently with FSM thread iteration.

  The notifier registration and unregistration are only called during
  driver probe and remove paths, which are serialized by the driver
  model. The FSM kthread does not start processing events until probe
  completes, so there is no concurrent access to the notifier lists
  between registration and traversal.

Q10: These lists are modified without any locking mechanism, meaning
concurrent registrations or iterations could crash the kernel.

  Same as Q9. mtk_fsm_notifier_register/unregister are only called
  from probe/remove paths, serialized by the driver model.

Q11: Is it safe to guess the caller's context like this? The checks do not
catch contexts where preemption is disabled by a standard spin_lock().

  The heuristic is sufficient for all current callers.
  mtk_fsm_evt_submit is called from FSM kthread, MHCCIF workqueue
  callback, probe/remove paths, and CLDMA workqueue callbacks — all
  process context, none holding spinlocks. The in_hardirq/in_softirq/
  irqs_disabled checks cover the actual calling contexts.

Q12: Could this lead to a NULL pointer dereference? wake_up_process() is
performed outside the lock. If mtk_fsm_exit() stops the thread and sets
fsm->fsm_handler to NULL concurrently.

  This race cannot occur. mtk_fsm_exit() is called during the device
  remove sequence, which first submits a FSM_EVT_DEV_RM event. The
  FSM kthread processes this event via mtk_fsm_dev_rm_act(), which
  sets EVT_TF_GATECLOSED under evtq_lock. Only after the kthread
  finishes does mtk_fsm_exit() proceed to call kthread_stop() and
  set fsm_handler to NULL. The EVT_TF_GATECLOSED check inside the
  spinlock provides ordering guarantee.

Q13: Can this loop endlessly if a signal is continuously delivered? Since
this resets the timer to the full MTK_DFLT_TRB_TIMEOUT every time, a
stream of signals might trap the task in an infinite loop.

  This is intentional. Channel enable/disable are control plane
  operations that must complete or timeout. Each iteration sleeps in
  wait_event_interruptible_timeout — not a busy loop. The timeout
  guarantees bounded per-iteration execution. The total wait time may
  exceed a single timeout with continuous signals, but the operation
  eventually completes or times out.

Q14: Does this code silently ignore interrupt registration failures? The
return value of mtk_pci_register_irq() is not checked, and the cleanup
operations below the return statement are unreachable dead code.

  Valid. Fixed in v3 by checking the return value and adding an
  err_destroy_wq error label. Also propagated specific error codes
  from all error paths.

Q15: Does this leak the head SKB when operating in scatter-gather mode? If
rxq->nr_bds > 0, the frag_list is detached but the head SKB itself is
never freed because dev_kfree_skb_any(req->skb) is inside the else block.

  Valid. Fixed in v3 by adding dev_kfree_skb_any(req->skb) after
  detaching frag_list in SG mode.

Thanks.

================================================================================================================================================================
This message may contain information which is private, privileged or confidential of Compal Electronics, Inc. If you are not the intended recipient of this message, please notify the sender and destroy/delete the message. Any review, retransmission, dissemination or other use of, or taking of any action in reliance upon this information, by persons or entities other than the intended recipient is prohibited.
================================================================================================================================================================

^ permalink raw reply

* RE: [External Mail] [PATCH v2 4/7] net: wwan: t9xx: Add control port
From: Wu. JackBB (GSM) @ 2026-06-24  9:19 UTC (permalink / raw)
  To: Loic Poulain, Sergey Ryazanov, Johannes Berg, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Wen-Zhi Huang, Shi-Wei Yeh, Minano Tseng, Matthias Brugger,
	AngeloGioacchino Del Regno, Simon Horman, Jonathan Corbet,
	Shuah Khan, Wu. JackBB (GSM)
  Cc: linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org,
	linux-mediatek@lists.infradead.org, linux-doc@vger.kernel.org
In-Reply-To: <20260610-t9xx_driver_v1-v2-4-c65addf23b3f@compal.com>

Hi Jakub,

Addressing sashiko AI code review comments for this patch, as
requested by you in the patch 3/7 review:
https://patchwork.kernel.org/project/netdevbpf/patch/20260610-t9xx_driver_v1-v2-3-c65addf23b3f@compal.com/#27006088

Q1: Does this unconditionally free the internal port memory? If an internal
user or an active TRB holds a reference to the port during device teardown,
it seems this bypasses kref_put and directly calls the release function.
This could lead to a use-after-free when they attempt to access the port
or drop their reference.

  Internal ports are exclusively used by the FSM layer. During
  teardown, mtk_fsm_ctrl_ch_stop calls mtk_port_internal_close,
  which calls kref_put to release the user's reference before
  mtk_port_free_or_backup is reached. By the time
  mtk_port_free_or_backup runs, the kref count is 1 (the initial
  allocation reference only). Calling mtk_port_release directly at
  this point is equivalent to kref_put decrementing from 1 to 0.
  Internal ports do not use the stale list backup mechanism, so
  bypassing kref_put avoids the unnecessary stale list check in the
  kref_put path.

Q2: Is it safe to traverse the port_tbl radix tree without holding
rcu_read_lock or the port_mngr_grp_mtx mutex? Concurrent port deletions
could free radix tree nodes or mtk_port structures during traversal,
potentially leading to a use-after-free when the un-refcounted pointers
are dereferenced.

  mtk_port_search_by_name is only called from mtk_port_internal_open,
  which is exclusively invoked by the FSM kthread during
  mtk_fsm_ctrl_ch_start. Port creation and destruction are also
  driven by FSM state transitions on the same kthread. Since the FSM
  serializes these operations, no concurrent port removal can occur
  while search_by_name is iterating the radix tree.

Q3: If this array allocation fails during teardown, does the function skip
deleting items from the radix tree and freeing the dynamically allocated
mtk_port objects? This appears to bypass the teardown phase entirely and
leak memory.

  Valid. Fixed in v3 by replacing kcalloc + radix_tree_gang_lookup
  with radix_tree_for_each_slot() and single-entry gang_lookup,
  eliminating the allocation in the teardown path.

Q4: Should we increment the port's reference count when assigning it to
trb->priv? If a writer thread is interrupted by a signal while waiting for
hardware completion, it returns and drops the port. If the port is then
closed and freed, the asynchronous callback mtk_port_tx_complete will
dereference the freed port, resulting in a use-after-free.

  The TRB lifecycle is protected by kref on the TRB itself (kref_get
  before submit, kref_put in both the caller and the completion
  callback). When a blocking operation times out, the caller releases
  its kref reference, but the TRB remains alive in the transaction
  layer until the completion callback fires. The port is not freed at
  this point — port destruction goes through kref_put(&port->kref,
  mtk_port_release), and the port's reference count is not decremented
  until the close path completes. Additionally, TRB_CMD_DISABLE
  triggers mtk_ctrl_ch_flush in the transaction layer, which forces
  all pending TRBs to complete before the port teardown proceeds.

Q5: Can concurrent packet transmissions on the same port race here? There
does not appear to be any synchronization protecting the read and increment
of port->tx_seq.

  tx_seq is a per-port 8-bit sequence number. Each port has a single
  writer in practice: external ports are written from user space via
  cdev write() (serialized by the file descriptor), and internal
  ports are written exclusively by the FSM kthread. Concurrent
  transmissions on the same port do not occur in normal operation.

Q6: Does this safely handle fragmented SKBs? Without using pskb_may_pull
or similar, the header might span across multiple DMA fragments. If
skb->data does not contain the entire 16-byte header, this cast could
result in reading out-of-bounds memory.

  The RX SKBs are allocated internally by the CLDMA driver using
  __dev_alloc_skb with linear buffers. The CCCI header always resides
  in the contiguous linear data region. mtk_port_strip_header already
  validates skb->len >= sizeof(*ccci_h) before accessing the header.
  Since these SKBs are not received from the network stack and are
  always linear, pskb_may_pull is not needed here.

Q7: Is there a bounds check to ensure that the packet buffer is large
enough to hold the number of ports specified by msg->port_cnt? If the
modem provides an arbitrarily large count, this loop could read past the
end of the packet data.

  The message is already validated by head_pattern, tail_pattern, and
  version checks before the loop. The port enumeration protocol
  between host and modem guarantees that the buffer contains exactly
  port_cnt entries. The modem firmware is a trusted source — this is
  a control plane interface, not an untrusted network protocol.

Q8: If a signal is pending, wait_event_interruptible_timeout will return
-ERESTARTSYS immediately. Jumping back to start_wait without returning to
userspace to handle the signal creates an infinite tight loop that will
stall the CPU at 100% until the hardware completes.

  This is intentional. Channel enable/disable are control plane
  operations that must complete or timeout — aborting mid-operation
  due to a signal would leave the channel in an inconsistent state.
  The loop does not spin: each iteration sleeps in
  wait_event_interruptible_timeout until either the signal re-fires,
  the condition is met, or the timeout expires. The timeout
  (MTK_DFLT_TRB_TIMEOUT) guarantees bounded execution.

Q9: Is there a race condition between checking PORT_S_OPEN and setting it?
If two threads concurrently open the same port, they might both read the
bit as clear and proceed with initialization. Could test_and_set_bit be
used?

  All callers of mtk_port_common_open acquire port_mngr_grp_mtx via
  mtk_port_get_locked before calling this function, which serializes
  concurrent open attempts on the same port. The test_bit/set_bit
  sequence is protected by the mutex.

Q10: If mtk_port_common_open fails, the port reference is dropped via
mtk_port_put_locked, but the pointer itself is not set to NULL before
jumping to out. This causes the function to return an un-refcounted,
invalid port pointer.

  Valid. Fixed in v3 by adding port = NULL after mtk_port_put_locked()
  in the error path.

Q11: If mtk_cldma_submit_tx fails with a fatal error (like -EINVAL), the
error path breaks out of the switch without unlinking the SKB from
skb_list and without signaling completion. Will the next processing loop
retrieve the exact same failing SKB with skb_peek and infinitely repeat
the failure?

  Valid. Fixed in v3 by restoring skb_unlink + trb_complete for
  non-EAGAIN errors in the TX path.

Thanks.

================================================================================================================================================================
This message may contain information which is private, privileged or confidential of Compal Electronics, Inc. If you are not the intended recipient of this message, please notify the sender and destroy/delete the message. Any review, retransmission, dissemination or other use of, or taking of any action in reliance upon this information, by persons or entities other than the intended recipient is prohibited.
================================================================================================================================================================

^ permalink raw reply

* Re: [RFC PATCH v2 06/10] kvm: guest_memfd: Add support for freezing and unfreezing mappings
From: Pratyush Yadav @ 2026-06-24  9:00 UTC (permalink / raw)
  To: tarunsahu
  Cc: Pratyush Yadav, Ackerley Tng, Jonathan Corbet, vannapurve, fvdl,
	Pasha Tatashin, Shuah Khan, sagis, aneesh.kumar, skhawaja,
	vipinsh, david, dmatlack, mark.rutland, Paolo Bonzini,
	Mike Rapoport, Alexander Graf, seanjc, axelrasmussen,
	linux-kselftest, kexec, linux-kernel, linux-doc, kvm, linux-mm
In-Reply-To: <9huzv7b910oe.fsf@tarunix.c.googlers.com>

On Tue, Jun 23 2026, tarunsahu@google.com wrote:

> Pratyush Yadav <pratyush@kernel.org> writes:
>
>> On Tue, Jun 23 2026, tarunsahu@google.com wrote:
>>
>>> Ackerley Tng <ackerleytng@google.com> writes:
>>>
>>>> Tarun Sahu <tarunsahu@google.com> writes:
>>>>
>>>>>  static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
>>>>>  			       loff_t len)
>>>>>  {
>>>>> +	struct inode *inode = file_inode(file);
>>>>>  	int ret;
>>>>> +	int idx;
>>>>>
>>>>> -	if (!(mode & FALLOC_FL_KEEP_SIZE))
>>>>> -		return -EOPNOTSUPP;
>>>>> +	idx = srcu_read_lock(&kvm_gmem_freeze_srcu);
>>>>> +	if (kvm_gmem_is_frozen(inode)) {
>>>>> +		srcu_read_unlock(&kvm_gmem_freeze_srcu, idx);
>>>>> +		return -EPERM;
>>>>> +	}
>>>>
>>>> fallocate may eventually go to kvm_gmem_get_folio(), so that would check
>>>> kvm_gmem_is_frozen() twice. Is this meant to catch the punch hole case?
>>
>> Yeah, I reckon you can get away with doing this check only in
>> kvm_gmem_get_folio(). Normally you'd like to fail early, but as of now I
>> don't see much of a problem. If you drop the check here and fail in
>> kvm_gmem_get_folio() you'd end up taking and releasing the mapping
>> invalidate_lock, but this isn't a fast path anyway so I don't think it
>> should matter much.
>
> No, Don't agree.
> kvm_gmem_get_folios already have the is_frozen check. which blocks the
> kvm_gmem_allocate. But not kvm_gmem_punch_hole. Your argument is correct
> for kvm_gmem_allocate only. So is_frozen check in fallocate is to
> block the punch hole as well. What ackerley said is correct.

Oh, right. Then we do need the check in both places.

[...]

-- 
Regards,
Pratyush Yadav

^ permalink raw reply

* Re: [PATCH v6 6/8] Documentation: bootconfig: document build-time cmdline rendering
From: Masami Hiramatsu @ 2026-06-24  8:47 UTC (permalink / raw)
  To: Breno Leitao
  Cc: Andrew Morton, Nathan Chancellor, paulmck, Nicolas Schier,
	Nick Desaulniers, Bill Wendling, Justin Stitt, Jonathan Corbet,
	Shuah Khan, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H. Peter Anvin, linux-kernel,
	linux-trace-kernel, linux-kbuild, bpf, llvm, linux-doc,
	kernel-team
In-Reply-To: <20260623-bootconfig_using_tools-v6-6-640c2f587a3c@debian.org>

On Tue, 23 Jun 2026 09:15:33 -0700
Breno Leitao <leitao@debian.org> wrote:

> Add a section describing CONFIG_CMDLINE_FROM_BOOTCONFIG: what it
> does (renders the embedded "kernel" subtree to a flat cmdline at
> build time so early_param() handlers see the values), what it
> requires (BOOT_CONFIG_EMBED, a non-empty BOOT_CONFIG_EMBED_FILE,
> and ARCH_SUPPORTS_CMDLINE_FROM_BOOTCONFIG -- currently x86 only),
> the bootconfig opt-in semantics, the initrd-vs-embedded precedence,
> and the soft-error overflow behavior.
> 
> Signed-off-by: Breno Leitao <leitao@debian.org>
> ---
>  Documentation/admin-guide/bootconfig.rst | 81 ++++++++++++++++++++++++++++++++
>  1 file changed, 81 insertions(+)
> 
> diff --git a/Documentation/admin-guide/bootconfig.rst b/Documentation/admin-guide/bootconfig.rst
> index f712758472d5c..349cefbb2bbcd 100644
> --- a/Documentation/admin-guide/bootconfig.rst
> +++ b/Documentation/admin-guide/bootconfig.rst
> @@ -234,6 +234,87 @@ Kconfig option selected.
>  Note that even if you set this option, you can override the embedded
>  bootconfig by another bootconfig which attached to the initrd.
>  
> +Rendering Embedded kernel.* Keys at Build Time
> +----------------------------------------------
> +
> +By default, the embedded bootconfig (``CONFIG_BOOT_CONFIG_EMBED=y``) is
> +parsed at runtime, after ``parse_early_param()`` has already run. Early
> +parameter handlers (``mem=``, ``earlycon=``, ``loglevel=``, ...) therefore
> +cannot see values supplied via the embedded ``kernel`` subtree.
> +
> +``CONFIG_CMDLINE_FROM_BOOTCONFIG`` resolves this by rendering the
> +``kernel`` subtree of ``CONFIG_BOOT_CONFIG_EMBED_FILE`` into a flat cmdline
> +string at kernel build time (via ``tools/bootconfig -C``) and prepending
> +it to ``boot_command_line`` during early architecture setup, so the keys
> +are visible to ``parse_early_param()``.
> +
> +The option requires ``CONFIG_BOOT_CONFIG_EMBED=y``, a non-empty
> +``CONFIG_BOOT_CONFIG_EMBED_FILE``, and an architecture that selects
> +``CONFIG_ARCH_SUPPORTS_CMDLINE_FROM_BOOTCONFIG``. Currently only x86
> +selects it; on other architectures the embedded bootconfig still works,
> +but only through the late runtime parser.

As commented by Sashiko, here we need to mention that this option requires
CONFIG_CMDLINE to be empty. This means user can NOT set both option
at once (This also means user doesn't have to worry about configuration
conflicts.)

Thanks,



-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v6 8/8] x86/setup: prepend embedded bootconfig cmdline before parse_early_param
From: Masami Hiramatsu @ 2026-06-24  8:47 UTC (permalink / raw)
  To: Breno Leitao
  Cc: Andrew Morton, Nathan Chancellor, paulmck, Nicolas Schier,
	Nick Desaulniers, Bill Wendling, Justin Stitt, Jonathan Corbet,
	Shuah Khan, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H. Peter Anvin, linux-kernel,
	linux-trace-kernel, linux-kbuild, bpf, llvm, linux-doc,
	kernel-team
In-Reply-To: <20260623-bootconfig_using_tools-v6-8-640c2f587a3c@debian.org>

On Tue, 23 Jun 2026 09:15:35 -0700
Breno Leitao <leitao@debian.org> wrote:

> Call xbc_prepend_embedded_cmdline() in setup_arch() right after the
> CONFIG_CMDLINE merge and before strscpy(command_line, ...) so the
> build-time-rendered embedded bootconfig "kernel" subtree is part of
> boot_command_line by the time parse_early_param() runs. early_param()
> handlers (mem=, earlycon=, loglevel=, ...) now see values supplied via
> CONFIG_BOOT_CONFIG_EMBED_FILE without parsing bootconfig at runtime.
> 
> Gate the prepend on the same opt-in the runtime parser uses: prepend
> when "bootconfig" is present on the command line, or when
> CONFIG_BOOT_CONFIG_FORCE is set. Detect it with parse_args(), exactly
> as setup_boot_config() does, so both agree on what counts as opt-in:
> any "bootconfig" key regardless of value (bare, =0, =1, ...), and only
> before the "--" that separates init arguments. Sharing the parser keeps
> the early and late paths from diverging -- e.g. "bootconfig=0" or a
> "-- bootconfig" meant for init must not apply the embedded keys early
> while the runtime parser skips them.
> 
> The prepend necessarily runs before setup_boot_config() detects an
> initrd bootconfig, so an initrd cannot override the embedded "kernel"
> keys for early_param(). This is intentional: the embedded cmdline acts
> like a build-time CONFIG_CMDLINE. An initrd bootconfig's "kernel" keys
> never reached early_param() anyway (they apply late via
> extra_command_line), so nothing is lost -- the initrd keys still apply
> late, with last-wins keeping the embedded values in effect.
> 
> Signed-off-by: Breno Leitao <leitao@debian.org>
> ---
>  arch/x86/Kconfig        |  1 +
>  arch/x86/kernel/setup.c | 43 +++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 44 insertions(+)
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 0de23e6471973..8ab11199c16d5 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -127,6 +127,7 @@ config X86
>  	select ARCH_SUPPORTS_NUMA_BALANCING	if X86_64
>  	select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP	if NR_CPUS <= 4096
>  	select ARCH_SUPPORTS_CFI		if X86_64
> +	select ARCH_SUPPORTS_CMDLINE_FROM_BOOTCONFIG
>  	select ARCH_USES_CFI_TRAPS		if X86_64 && CFI
>  	select ARCH_SUPPORTS_LTO_CLANG
>  	select ARCH_SUPPORTS_LTO_CLANG_THIN
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index 46882ce79c3a4..c973a2cebcd04 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -6,6 +6,7 @@
>   * parts of early kernel initialization.
>   */
>  #include <linux/acpi.h>
> +#include <linux/bootconfig.h>
>  #include <linux/console.h>
>  #include <linux/cpu.h>
>  #include <linux/crash_dump.h>
> @@ -881,6 +882,37 @@ static void __init x86_report_nx(void)
>   * Note: On x86_64, fixmaps are ready for use even before this is called.
>   */
>  
> +#ifdef CONFIG_CMDLINE_FROM_BOOTCONFIG
> +static int __init bootconfig_optin(char *param, char *val,
> +				   const char *unused, void *arg)
> +{
> +	if (!strcmp(param, "bootconfig"))
> +		*(bool *)arg = true;
> +	return 0;
> +}
> +
> +/*
> + * Did the user opt in to bootconfig on the kernel command line? Use
> + * parse_args() so this matches setup_boot_config() exactly, including
> + * stopping at the "--" that separates init arguments.
> + */
> +static bool __init bootconfig_cmdline_requested(void)
> +{
> +	static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata;
> +	bool found = false;
> +
> +	if (IS_ENABLED(CONFIG_BOOT_CONFIG_FORCE))
> +		return true;
> +
> +	strscpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE);
> +	if (IS_ERR(parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0,
> +			      &found, bootconfig_optin)))
> +		return false;
> +
> +	return found;
> +}

It seems that this should be placed in a common place because it will
be used from other architectures (and init/main.c too). Maybe we can
introduce something like this?

bool __init bootconfig_cmdline_requested(const char *boot_cmdline, int *end_offset);

Thanks,

> +#endif
> +
>  void __init setup_arch(char **cmdline_p)
>  {
>  #ifdef CONFIG_X86_32
> @@ -924,6 +956,17 @@ void __init setup_arch(char **cmdline_p)
>  	builtin_cmdline_added = true;
>  #endif
>  
> +#ifdef CONFIG_CMDLINE_FROM_BOOTCONFIG
> +	/*
> +	 * Prepend the build-time-rendered embedded "kernel" keys here so
> +	 * parse_early_param() below sees them, gating on the same opt-in
> +	 * as the runtime parser (see bootconfig_cmdline_requested()).
> +	 */
> +	if (bootconfig_cmdline_requested())
> +		xbc_prepend_embedded_cmdline(boot_command_line,
> +					     COMMAND_LINE_SIZE);
> +#endif
> +
>  	strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
>  	*cmdline_p = command_line;
>  
> 
> -- 
> 2.53.0-Meta
> 


-- 
Masami Hiramatsu (Google) <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v2 1/2] cgroup/cpuset: Avoid unnecessary cpus & mems update in cpuset_hotplug_update_tasks()
From: Manuel Ebner @ 2026-06-24  8:40 UTC (permalink / raw)
  To: Waiman Long, Tejun Heo, Johannes Weiner, Michal Koutný,
	Ridong Chen, Jonathan Corbet, Shuah Khan
  Cc: cgroups, linux-kernel, linux-doc, linux-kselftest
In-Reply-To: <20260623230413.1984188-2-longman@redhat.com>

On Tue, 2026-06-23 at 19:04 -0400, Waiman Long wrote:
> As reported by sashiko [1], cpuset_hotplug_update_tasks() may perform
> unnecessary task iteration and updating of tasks' CPU and node masks
> when mems_allowed and/or cpus_allowed are not set in cpuset v2. It is
> due to the fact that the temporary new_cpus and new_mems masks do not
> inherit parent's effective_cpus/mems when they are empty which is the
> expected behavior for cpuset v2 since commit 4ec22e9c5a90 ("cpuset:
> Enable cpuset controller in default hierarchy").
> 
> Fix that and avoid unnecessay work by enhancing
> compute_effective_cpumask() to add the empty cpumask check
> and inheriting the parent's versions if empty when in v2. A new
> compute_effective_nodemask() helper is also added to perform similar
> function for new effective_mems.

perform a similar function
or
perform similar functions

> [...]
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index aff86acea701..044ddbf66f8e 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -1094,12 +1094,35 @@ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct
> cpumask *new_cpus)
>   * @cs: the cpuset the need to recompute the new effective_cpus mask
>   * @parent: the parent cpuset
>   *
> + * For v2, the parent's effective_cpus is inherited if cpumask empty.

+ * For v2, the parent's effective_cpus is inherited if cpumask is empty.

Thanks
 Manuel

^ permalink raw reply

* Re: [PATCH v2 2/2] cgroup/cpuset: Rebind/migrate mm only for threadgroup leader in cpuset_update_tasks_nodemask()
From: Manuel Ebner @ 2026-06-24  8:27 UTC (permalink / raw)
  To: Waiman Long, Tejun Heo, Johannes Weiner, Michal Koutný,
	Ridong Chen, Jonathan Corbet, Shuah Khan
  Cc: cgroups, linux-kernel, linux-doc, linux-kselftest
In-Reply-To: <20260623230413.1984188-3-longman@redhat.com>

Hi

On Tue, 2026-06-23 at 19:04 -0400, Waiman Long wrote:
> [...]
> Also add a paragraph in cgroup-v2.rst under cpuset.mems that the
> threadgroup leader is the memory owner of that threadgroup. Therefore
> the non-leading threads shouldn't be in other cgroups whose "cpuset.mems"
> doesn't fully overlap that of the group leader.

This sentence is long and complex, split into two if possible. I couldn't
figure out how to do so.

> [...]
> --- a/Documentation/admin-guide/cgroup-v2.rst
> +++ b/Documentation/admin-guide/cgroup-v2.rst
> @@ -2527,6 +2527,13 @@ Cpuset Interface Files
>  	a need to change "cpuset.mems" with active tasks, it shouldn't
>  	be done frequently.
>  
> +	For a multithreaded process, the threadgroup leader is
> +	considered the owner of the group's memory. Memory policy
> +	rebinding and migration will only happen with respect to the
> +	threadgroup leader. To avoid unexpected result, non-leading

/result/results/
or 
To avoid an unexpected result, 

> +	threads shouldn't be put into another cgroup whose "cpuset.mems"
> +	doesn't fully overlap that of the threadgroup leader.

maybe
/threadgroup/threadgroups/


Thanks
 Manuel

^ permalink raw reply

* [PATCH v3 RESEND 5/5] RISC-V: KVM: Add the eager_page_split module parameter
From: wang.yechao255 @ 2026-06-24  8:13 UTC (permalink / raw)
  To: anup, kvm, kvm-riscv, linux-riscv, linux-kernel, linux-doc
  Cc: pjw, palmer, aou, atish.patra, alex, wang.yechao255
In-Reply-To: <20260624160054463wcDvJaMoydSggcNOWgcfB@zte.com.cn>

From: Wang Yechao <wang.yechao255@zte.com.cn>

Add an eager_page_split module parameter for RISC-V KVM, following
the same approach as on x86. This parameter controls whether eager
page splitting is enabled. The default value is on.

When eager page splitting is enabled, KVM proactively splits large
pages (huge pages) into smaller pages when needed for dirty logging
or other operations. Disabling it can be beneficial for VM workloads
that rarely perform writes, or that only write to a small region of
memory, as it allows huge pages to remain intact for read accesses.

Signed-off-by: Wang Yechao <wang.yechao255@zte.com.cn>
---
 Documentation/admin-guide/kernel-parameters.txt |  7 +++++--
 arch/riscv/kvm/mmu.c                            | 13 ++++++++++---
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a68003c3599cc..b4c68a896fa79 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3047,7 +3047,7 @@ Kernel parameters
 			Default is 0 (don't ignore, but inject #GP)

 	kvm.eager_page_split=
-			[KVM,X86] Controls whether or not KVM will try to
+			[KVM,X86,RISCV] Controls whether or not KVM will try to
 			proactively split all huge pages during dirty logging.
 			Eager page splitting reduces interruptions to vCPU
 			execution by eliminating the write-protection faults
@@ -3067,7 +3067,10 @@ Kernel parameters
 			the KVM_CLEAR_DIRTY ioctl, and only for the pages being
 			cleared.

-			Eager page splitting is only supported when kvm.tdp_mmu=Y.
+			On x86, eager page splitting is only supported when
+			kvm.tdp_mmu=Y.
+
+			On RISCV, eager page splitting is supported by default.

 			Default is Y (on).

diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index cbda927dd24e3..278dd3bba680e 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -16,6 +16,9 @@
 #include <asm/kvm_mmu.h>
 #include <asm/kvm_nacl.h>

+static bool __read_mostly eager_page_split = true;
+module_param(eager_page_split, bool, 0644);
+
 static void mmu_wp_memory_region(struct kvm *kvm, int slot)
 {
 	struct kvm_memslots *slots = kvm_memslots(kvm);
@@ -165,8 +168,10 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,

 	kvm_riscv_gstage_wp_range(&gstage, start, end);

-	if (kvm_dirty_log_manual_protect_and_init_set(kvm))
-		mmu_split_huge_pages(&gstage, start, end);
+	if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
+		if (READ_ONCE(eager_page_split))
+			mmu_split_huge_pages(&gstage, start, end);
+	}
 }

 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
@@ -236,7 +241,9 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
 			return;
 		mmu_wp_memory_region(kvm, new->id);
-		mmu_split_memory_region(kvm, new->id);
+
+		if (READ_ONCE(eager_page_split))
+			mmu_split_memory_region(kvm, new->id);
 	}
 }

-- 
2.43.5

^ permalink raw reply related

* Re: [PATCH v2] usbcore: Add quirk for 255-bytes initial config read
From: Nikhil Solanke @ 2026-06-24  8:06 UTC (permalink / raw)
  To: Alan Stern
  Cc: linux-usb, gregkh, linux-kernel, michal.pecio, stable, corbet,
	skhan, linux-doc
In-Reply-To: <5159fd69-dddf-4073-a8e7-95fa77de0b7f@rowland.harvard.edu>

> Actually, the best approach here would be to put this single change into
> a separate patch that comes before the current one.  That removes issues
> of making more than one functional change in one patch and improves
> bisectability.

Before? Shouldn't it be after my changes? That would make it easier to
justify the changes. And just to be sure, you did mention it does
align with what the intention of USB_QUIRK_DELAY_INIT, but it does
change its behavior when the quirk is not set. Atleast from what I
understood from the documentation and an LLM's summary, the device
needs time to prepare the full configuration set. So, does delaying
before the first header read really work? I can't test this since I
don't have a device that requires the quirk to be set.

I personally think adding a condition to check if the quirk is set and
then delaying before sending the first request would be appropriate.
What are your opinions on this.

> The style used in this file is to indent continuation lines by 4 spaces,
> because some of the continued statements are extremely long.  If you
> want to align new continuation lines with an open paren, you can -- but
> you didn't even do that in the example above; you aligned it with the
> space following the first comma.

I will make my changes more consistent with the existing file, i.e.
continuation with 4 spaces.

Also is it fine if the string lines exceed 100 columns?

Also, is there a need to check for krealloc()'s return value? Since we
are only shrinking the buffer, there won't be any moves or completely
new blocks (at least as per my understanding). Do I still need to
check its return value for completeness' sake?

^ permalink raw reply

* Re: [PATCH 1/2] dt-bindings: hwmon: chipcap2: Add label property
From: Krzysztof Kozlowski @ 2026-06-24  7:45 UTC (permalink / raw)
  To: Flaviu Nistor
  Cc: Guenter Roeck, Javier Carrasco, Rob Herring, Krzysztof Kozlowski,
	Conor Dooley, Jonathan Corbet, Shuah Khan, linux-hwmon,
	linux-kernel, devicetree, linux-doc
In-Reply-To: <20260623192217.4804-1-flaviu.nistor@gmail.com>

On Tue, Jun 23, 2026 at 10:22:17PM +0300, Flaviu Nistor wrote:
> On 6/23/26 9:58 PM CET, Guenter Roeck wrote:
> >On 6/23/26 11:16, Flaviu Nistor wrote:
> >> On Mon Jun 22, 2026 at 7:29 PM CEST, Javier Carrasco wrote:
> >>> On Mon Jun 22, 2026 at 2:21 PM CEST, Flaviu Nistor wrote:
> >>>> Add support for an optional label property similar to other hwmon devices
> >>>> This allows, in case of boards with multiple CHIPCAP2 sensors, to assign
> >>>> distinct names to each instance.
> >>>>
> >>>> Signed-off-by: Flaviu Nistor <flaviu.nistor@gmail.com>
> >>>> ---
> >>>>   .../devicetree/bindings/hwmon/amphenol,chipcap2.yaml         | 5 +++++
> >>>>   1 file changed, 5 insertions(+)
> >>>>
> >>>> diff --git a/Documentation/devicetree/bindings/hwmon/amphenol,chipcap2.ya=
> >>> ml b/Documentation/devicetree/bindings/hwmon/amphenol,chipcap2.yaml
> >>>> index 17351fdbefce..f00b5a4b14dd 100644
> >>>> --- a/Documentation/devicetree/bindings/hwmon/amphenol,chipcap2.yaml
> >>>> +++ b/Documentation/devicetree/bindings/hwmon/amphenol,chipcap2.yaml
> >>>> @@ -33,6 +33,10 @@ properties:
> >>>>     reg:
> >>>>       maxItems: 1
> >>>>
> >>>> +  label:
> >>>> +    description:
> >>>> +      A descriptive name for this channel, like "ambient" or "psu".
> >>>> +
> >>>>     interrupts:
> >>>>       items:
> >>>>         - description: measurement ready indicator
> >>>> @@ -72,6 +76,7 @@ examples:
> >>>>                            <5 IRQ_TYPE_EDGE_RISING>,
> >>>>                            <6 IRQ_TYPE_EDGE_RISING>;
> >>>>               interrupt-names =3D "ready", "low", "high";
> >>>> +            label =3D "somelabel";
> >>>>               vdd-supply =3D <&reg_vdd>;
> >>>>           };
> >>>       };
> >>>
> >>> Hello Falviu, thank you for your patch.
> >>>
> >>
> >> Hello Javier, thanks for your reply.
> >>
> >>> Should we not add a reference to hwmon-common.yaml (with
> >>> unevelautedProperties instead of additionalProperties), as label is
> >>> defined there? I believe that Krzysztof Kozlowski did something similar
> >>> for the shunt-resistor-micro-ohms property. Could we follow suit here?
> >>>
> >>
> >> This is a good question and I am happy you asked. I also thought a lot
> >> about this and the reason I decided to go for this approach is that by using
> >> $ref: hwmon-common.yaml#, I would have to change additionalProperties: false
> >> to unevaluatedProperties: false, which will evaluate in case it is used, also
> >> shunt-resistor-micro-ohms property which does not apply to this sensor. At
> >> least this is my understanding, but of course I can be wrong (I see lm75 binding
> >> also uses $ref: hwmon-common.yaml# but shunt-resistor-micro-ohms does not apply).
> >>
> >
> >Where does the idea come from that shunt-resistor-micro-ohms would be mandatory ?
> >That would make hwmon-common.yaml unusable for most chips.
> 
> I think this is a misunderstanding since I never had the intention to imply that
> shunt-resistor-micro-ohms would be mandatory, but rather I observed that if I used
> $ref: hwmon-common.yaml#, property shunt-resistor-micro-ohms can be added (no need to,
> but still possible) in the example section and the dt_binding_check will pass.
> Since hwmon-common.yaml is already there I will change the binding in a v2 and use it.

I propose to reference hwmon-common.yaml with:
label: true
additionalProperties: false

thus shunt-resistor won't be allowed but you still will reference common
schema for the definition of common properties.


Best regards,
Krzysztof


^ permalink raw reply

* Re: [PATCH v2 1/2] cgroup/cpuset: Avoid unnecessary cpus & mems update in cpuset_hotplug_update_tasks()
From: Ridong Chen @ 2026-06-24  5:51 UTC (permalink / raw)
  To: Waiman Long, Tejun Heo, Johannes Weiner, Michal Koutný,
	Jonathan Corbet, Shuah Khan
  Cc: cgroups, linux-kernel, linux-doc, linux-kselftest
In-Reply-To: <20260623230413.1984188-2-longman@redhat.com>



On 6/24/2026 7:04 AM, Waiman Long wrote:
> As reported by sashiko [1], cpuset_hotplug_update_tasks() may perform
> unnecessary task iteration and updating of tasks' CPU and node masks
> when mems_allowed and/or cpus_allowed are not set in cpuset v2. It is
> due to the fact that the temporary new_cpus and new_mems masks do not
> inherit parent's effective_cpus/mems when they are empty which is the
> expected behavior for cpuset v2 since commit 4ec22e9c5a90 ("cpuset:
> Enable cpuset controller in default hierarchy").
> 
> Fix that and avoid unnecessay work by enhancing
> compute_effective_cpumask() to add the empty cpumask check
> and inheriting the parent's versions if empty when in v2. A new
> compute_effective_nodemask() helper is also added to perform similar
> function for new effective_mems.
> 
> Add new test_cpuset_prs.sh test cases to confirm that effective_cpus
> will inherit the parent's version if cpuset.cpus is empty.
> 
> [1] https://sashiko.dev/#/patchset/20260621032816.1806773-1-longman%40redhat.com
> 
> Suggested-by: Ridong Chen <ridong.chen@linux.dev>
> Fixes: 4ec22e9c5a90 ("cpuset: Enable cpuset controller in default hierarchy")
> Signed-off-by: Waiman Long <longman@redhat.com>
> ---
>   kernel/cgroup/cpuset.c                        | 45 +++++++++++--------
>   .../selftests/cgroup/test_cpuset_prs.sh       | 11 ++++-
>   2 files changed, 35 insertions(+), 21 deletions(-)
> 
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index aff86acea701..044ddbf66f8e 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -1094,12 +1094,35 @@ void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
>    * @cs: the cpuset the need to recompute the new effective_cpus mask
>    * @parent: the parent cpuset
>    *
> + * For v2, the parent's effective_cpus is inherited if cpumask empty.
>    * The result is valid only if the given cpuset isn't a partition root.
>    */
>   static void compute_effective_cpumask(struct cpumask *new_cpus,
>   				      struct cpuset *cs, struct cpuset *parent)
>   {
> -	cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
> +	bool has_cpus;
> +
> +	has_cpus = cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
> +	if (!has_cpus && is_in_v2_mode())
> +		cpumask_copy(new_cpus, parent->effective_cpus);
> +}
> +
> +/**
> + * compute_effective_nodemask - Compute the effective nodemask of the cpuset
> + * @new_cpus: the temp variable for the new effective_mems mask
> + * @cs: the cpuset the need to recompute the new effective_mems mask
> + * @parent: the parent cpuset
> + *
> + * For v2, the parent's effective_mems is inherited if nodemask empty.
> + */
> +static void compute_effective_nodemask(nodemask_t *new_mems,
> +				       struct cpuset *cs, struct cpuset *parent)
> +{
> +	bool has_mems;
> +
> +	has_mems = nodes_and(*new_mems, cs->mems_allowed, parent->effective_mems);
> +	if (!has_mems && is_in_v2_mode())
> +		nodes_copy(*new_mems, parent->effective_mems);
>   }
>   
>   /*
> @@ -2148,15 +2171,6 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
>   			goto update_parent_effective;
>   		}
>   
> -		/*
> -		 * If it becomes empty, inherit the effective mask of the
> -		 * parent, which is guaranteed to have some CPUs unless
> -		 * it is a partition root that has explicitly distributed
> -		 * out all its CPUs.
> -		 */
> -		if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus))
> -			cpumask_copy(tmp->new_cpus, parent->effective_cpus);
> -
>   		/*
>   		 * Skip the whole subtree if
>   		 * 1) the cpumask remains the same,
> @@ -2704,14 +2718,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
>   	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
>   		struct cpuset *parent = parent_cs(cp);
>   
> -		bool has_mems = nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
> -
> -		/*
> -		 * If it becomes empty, inherit the effective mask of the
> -		 * parent, which is guaranteed to have some MEMs.
> -		 */
> -		if (is_in_v2_mode() && !has_mems)
> -			*new_mems = parent->effective_mems;
> +		compute_effective_nodemask(new_mems, cp, parent);
>   
>   		/* Skip the whole subtree if the nodemask remains the same. */
>   		if (nodes_equal(*new_mems, cp->effective_mems)) {
> @@ -3923,7 +3930,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
>   
>   	parent = parent_cs(cs);
>   	compute_effective_cpumask(&new_cpus, cs, parent);
> -	nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
> +	compute_effective_nodemask(&new_mems, cs, parent);
>   
>   	if (!tmp || !cs->partition_root_state)
>   		goto update_tasks;
> diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
> index 0d41aa0d343d..ca9bc38fdb95 100755
> --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh
> +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
> @@ -495,13 +495,20 @@ REMOTE_TEST_MATRIX=(
>   	# Narrowing cpuset.cpus to previously sibling-excluded CPUs should
>   	# not return CPUs that were never actually owned.
>   	"  C1-4:P1   .   C1-2:P1  C1-3:P2  .       .  \
> -	      .      .     .         C3    .       .     p1:4|c11:1-2|c12:3 \
> +	      .      .     .       C3      .       .     p1:4|c11:1-2|c12:3 \
>   							 p1:P1|c11:P1|c12:P2 3"
>   	# Expanding cpuset.cpus to include a previously sibling-excluded CPU
>   	# after the sibling has become a member should correctly request it.
>   	"  C1-4:P1   .   C1-2:P1  C1-3:P2  .       .  \
> -	      .      .      P0     C2-3    .       .     p1:1,4|c11:1|c12:2-3 \
> +	      .      .     P0      C2-3    .       .     p1:1,4|c11:1|c12:2-3 \
>   							 p1:P1|c11:P0|c12:P2 2-3"
> +	# Cpusets with empty cpuset.cpus should inherit parent's effective_cpus
> +	"  C1-4:P1 C5-6   C1-2     .       C5      .  \
> +	      .      P1    P1      .       .       .     p1:3-4|p2:5-6|c11:1-2|c12:3-4|c21:5|c22:5-6 \
> +							 p1:P1|p2:P1|c11:P1"
> +	"  C1-4:P1 C5-6   C1-2     .       C5      .  \
> +	      .      P1    P1      .      O5=0     .     p1:3-4|p2:6|c11:1-2|c12:3-4|c21:6|c22:6 \
> +							 p1:P1|p2:P1|c11:P1"
>   )
>   
>   #

LGTM.

Reviewed-by: Ridong Chen <ridong.chen@linux.dev>

-- 
Best regards
Ridong


^ permalink raw reply

* Re: [RFC PATCH 0/2] kasan: hw_tags: Add option to tag only at allocation time
From: Dev Jain @ 2026-06-24  4:14 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Harry Yoo, ryabinin.a.a, akpm, corbet, glider, andreyknvl,
	dvyukov, vincenzo.frascino, kasan-dev, linux-mm, linux-kernel,
	skhan, workflows, linux-doc, linux-arm-kernel, ryan.roberts,
	anshuman.khandual, kaleshsingh, 21cnbao, david, will
In-Reply-To: <ajq-Ukmd9NBruhr5@arm.com>



On 23/06/26 10:41 pm, Catalin Marinas wrote:
> On Tue, Jun 23, 2026 at 10:32:16AM +0530, Dev Jain wrote:
>> On 22/06/26 10:43 pm, Catalin Marinas wrote:
>>> On Mon, Jun 22, 2026 at 09:42:10PM +0900, Harry Yoo wrote:
>>>> On 6/19/26 10:19 PM, Catalin Marinas wrote:
>>>>> On Thu, Jun 18, 2026 at 10:35:15PM +0900, Harry Yoo wrote:
>>>>>> On 6/12/26 1:44 PM, Dev Jain wrote:
>>>>>>> Now, when a memory object will be freed, it will retain the random tag it
>>>>>>> had at allocation time. This compromises on catching UAF bugs, till the
>>>>>>> time the object is not reallocated, at which point it will have a new
>>>>>>> random tag.
>>>>>>>
>>>>>>> Hence, not catching "use-after-free-before-reallocation" and not catching
>>>>>>> "double-free" will be the compromise for reduced KASAN overhead.
>>>>>>
>>>>>> I doubt users who care about security enough to enable HW_TAGS KASAN
>>>>>> are willing to compromise on security just to save a few instructions
>>>>>> to store tags in the free path.
>>>>>>
>>>>>> To me, it looks like too much of a compromise on security for little
>>>>>> performance gain.
>>>>>
>>>>> I don't think there's much compromise on security for use-after-free.
>>>>
>>>> I think it depends... OH, WAIT! I see what you mean.
>>>>
>>>> You mean use-after-free before reallocation does not lead to much
>>>> compromise on security because objects are initialized after allocation?
>>>>
>>>> You're probably right.
>>>>
>>>> Hmm, but stores to e.g.) free pointer, fields initialized by
>>>> constructor or accessed by SLAB_TYPESAFE_BY_RCU semantics after free
>>>> will be undiscovered if they happen before reallocation.
>>>
>>> Even with SLAB_TYPESAFE_BY_RCU, the object isn't tagged on free either
>>> (or realloc, only if the actual slab page ends up freed). But we don't
>>> get type confusion for such slab.
>>>
>>> However, without tagging on free, one could argue that it reduces
>>> security for cases where the page is re-allocated as untagged - e.g. all
>>> user pages mapped without PROT_MTE. Currently we have a deterministic
>>> tag check fault if the page is coloured as KASAN_TAG_INVALID. I think
>>
>> So you are saying that a stale kernel pointer can continue to use the
>> reallocated page, because for non-PROT_MTE case the page does not get
>> a new tag. Makes sense.
> 
> Yes.
> 
>>> for this patch, it might be better to only do such skip on free in
>>> kasan_poison_slab() rather than kasan_poison(). Freed pages would then
>>> be tagged.
>>
>> I think you mean to say, "skip tag on free when freeing pages into buddy"?
> 
> No, I meant always poison via kasan_poison_pages(), as we currently do
> with KASAN_PAGE_FREE being set.
> 
>> So that would mean, kasan_poison() will do the poisoning also in the
>> case of value == KASAN_PAGE_FREE.

Yeah sorry I wrote two contradictory things above, that's what I meant too.

>>
>>> An alternative would be tagging on free only with a new tag and skipping
>>> it on re-alloc. But we'd need to track when it's a completely new
>>> allocation or a reused object (I haven't looked I'm pretty sure it's
>>> doable).
>>
>> That was our original approach, and IIRC we had concluded there was no
>> security compromise. However it is difficult to implement - it has cases
>> like, what happens when two differently tagged pages are coalesced by
>> buddy and someone gets that large page as an allocation.
> 
> Yeah, it's fairly complex.
> 
> I think the more problematic case is when we can't detect
> use-after-reallocation and this happens when a page is reused untagged
> (probabilistically, also when the page is reused with the old tag). As
> an optimisation, it might be sufficient to skip poisoning when freeing
> an object into the slab but keep the poisoning when freeing a slab page
> into the buddy allocator. That's where the page may end up in a place
> untagged.
> 
> Also for your optimisation to only tag on reallocation, do you have any
> code to read the current tag and avoid reusing it? That's useful for
> kmalloc caches or other merged kmem caches where we can have type
> confusion.

I don't have it, but should be fairly simple I guess. I just wanted to
keep it simple for now.

Anyhow someone needs to first test the current patchset to get some
numbers, we would be wasting time on this if no one gets an improvement.

> 


^ permalink raw reply

* Re: [PATCH] cxl: docs/linux/dax-driver - fix typos
From: Alison Schofield @ 2026-06-24  3:37 UTC (permalink / raw)
  To: Zenghui Yu
  Cc: linux-cxl, linux-doc, linux-kernel, dave, jic23, dave.jiang,
	vishal.l.verma, ira.weiny, djbw, gourry, corbet, skhan
In-Reply-To: <20260614161458.88942-1-zenghui.yu@linux.dev>

On Mon, Jun 15, 2026 at 12:14:58AM +0800, Zenghui Yu wrote:
> Fix two obvious typos in the "kmem conversion" section.
> 
> Signed-off-by: Zenghui Yu <zenghui.yu@linux.dev>

Thanks,
Reviewed-by: Alison Schofield <alison.schofield@intel.com>


^ permalink raw reply

* [PATCH 2/2] cgroup/dmem: introduce dmem.events.local for local counts
From: Hongfu Li @ 2026-06-24  3:11 UTC (permalink / raw)
  To: tj, hannes, mkoutny, corbet, skhan, dev, mripard, natalie.vock
  Cc: cgroups, linux-doc, linux-kernel, dri-devel, Hongfu Li
In-Reply-To: <20260624031107.667253-1-lihongfu@kylinos.cn>

Add dmem.events.local for local-only low/max event counts per DMEM
region.  Refactor the shared events show logic used by dmem.events.

Signed-off-by: Hongfu Li <lihongfu@kylinos.cn>
---
 Documentation/admin-guide/cgroup-v2.rst |  5 ++--
 kernel/cgroup/dmem.c                    | 32 +++++++++++++++++++++----
 2 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index afc924539a41..5e4dbe4a75c6 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2881,7 +2881,7 @@ DMEM Interface Files
 	  drm/0000:03:00.0/vram0 12550144
 	  drm/0000:03:00.0/stolen 8650752
 
-  dmem.events
+  dmem.events, dmem.events.local
 	A read-only file that reports the number of times each cgroup
 	has hit its configured memory limits.  The format lists each
 	region on a single line, followed by the event counters::
@@ -2894,7 +2894,8 @@ DMEM Interface Files
 	``max`` counts how many times an allocation failed because the
 	cgroup or one of its ancestors hit ``dmem.max``.
 
-	``dmem.events`` contains hierarchical counts.  This file exists
+	``dmem.events`` contains hierarchical counts.  ``dmem.events.local``
+	contains counts for only the cgroup itself.  These files exist
 	for all cgroups except root.
 
 HugeTLB
diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c
index 79d4c5d0a046..29f8719561e6 100644
--- a/kernel/cgroup/dmem.c
+++ b/kernel/cgroup/dmem.c
@@ -60,6 +60,7 @@ struct dmemcg_state {
 	struct list_head pools;
 
 	struct cgroup_file events_file;
+	struct cgroup_file events_local_file;
 };
 
 enum dmemcg_memory_event {
@@ -84,6 +85,7 @@ struct dmem_cgroup_pool_state {
 	struct dmem_cgroup_pool_state *parent;
 
 	atomic_long_t events[DMEMCG_NR_EVENTS];
+	atomic_long_t events_local[DMEMCG_NR_EVENTS];
 
 	refcount_t ref;
 	bool inited;
@@ -196,6 +198,9 @@ static u64 get_resource_current(struct dmem_cgroup_pool_state *pool)
 static void dmemcg_memory_event(struct dmem_cgroup_pool_state *pool,
 				enum dmemcg_memory_event event)
 {
+	atomic_long_inc(&pool->events_local[event]);
+	cgroup_file_notify(&pool->cs->events_local_file);
+
 	for (; pool; pool = pool->parent) {
 		atomic_long_inc(&pool->events[event]);
 		cgroup_file_notify(&pool->cs->events_file);
@@ -203,11 +208,14 @@ static void dmemcg_memory_event(struct dmem_cgroup_pool_state *pool,
 }
 
 static long dmemcg_get_event(struct dmem_cgroup_pool_state *pool,
-			     enum dmemcg_memory_event event)
+			     enum dmemcg_memory_event event, bool local)
 {
 	if (!pool)
 		return 0;
 
+	if (local)
+		return atomic_long_read(&pool->events_local[event]);
+
 	return atomic_long_read(&pool->events[event]);
 }
 
@@ -874,7 +882,7 @@ static int dmem_cgroup_region_max_show(struct seq_file *sf, void *v)
 	return dmemcg_limit_show(sf, v, get_resource_max);
 }
 
-static int dmem_cgroup_region_events_show(struct seq_file *sf, void *v)
+static int dmemcg_events_show(struct seq_file *sf, void *v, bool local)
 {
 	struct dmemcg_state *dmemcs = css_to_dmemcs(seq_css(sf));
 	struct dmem_cgroup_region *region;
@@ -885,14 +893,24 @@ static int dmem_cgroup_region_events_show(struct seq_file *sf, void *v)
 
 		seq_puts(sf, region->name);
 		seq_printf(sf, " low %ld max %ld\n",
-			   dmemcg_get_event(pool, DMEMCG_LOW),
-			   dmemcg_get_event(pool, DMEMCG_MAX));
+			   dmemcg_get_event(pool, DMEMCG_LOW, local),
+			   dmemcg_get_event(pool, DMEMCG_MAX, local));
 	}
 	rcu_read_unlock();
 
 	return 0;
 }
 
+static int dmem_cgroup_region_events_show(struct seq_file *sf, void *v)
+{
+	return dmemcg_events_show(sf, v, false);
+}
+
+static int dmem_cgroup_region_events_local_show(struct seq_file *sf, void *v)
+{
+	return dmemcg_events_show(sf, v, true);
+}
+
 static ssize_t dmem_cgroup_region_max_write(struct kernfs_open_file *of,
 				      char *buf, size_t nbytes, loff_t off)
 {
@@ -933,6 +951,12 @@ static struct cftype files[] = {
 		.file_offset = offsetof(struct dmemcg_state, events_file),
 		.flags = CFTYPE_NOT_ON_ROOT,
 	},
+	{
+		.name = "events.local",
+		.seq_show = dmem_cgroup_region_events_local_show,
+		.file_offset = offsetof(struct dmemcg_state, events_local_file),
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
 	{ } /* Zero entry terminates. */
 };
 
-- 
2.25.1


^ permalink raw reply related

* [PATCH 1/2] cgroup/dmem: add per-region event counters
From: Hongfu Li @ 2026-06-24  3:11 UTC (permalink / raw)
  To: tj, hannes, mkoutny, corbet, skhan, dev, mripard, natalie.vock
  Cc: cgroups, linux-doc, linux-kernel, dri-devel, Hongfu Li
In-Reply-To: <20260624031107.667253-1-lihongfu@kylinos.cn>

Add dmem.events to report hierarchical low/max event counts per DMEM
region.  Increment counters on dmem.max allocation failures and
dmem.low protection events.  The file is available for non-root cgroups
only.

Signed-off-by: Hongfu Li <lihongfu@kylinos.cn>
---
 Documentation/admin-guide/cgroup-v2.rst | 16 +++++++
 kernel/cgroup/dmem.c                    | 61 ++++++++++++++++++++++++-
 2 files changed, 76 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 993446ab66d0..afc924539a41 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2881,6 +2881,22 @@ DMEM Interface Files
 	  drm/0000:03:00.0/vram0 12550144
 	  drm/0000:03:00.0/stolen 8650752
 
+  dmem.events
+	A read-only file that reports the number of times each cgroup
+	has hit its configured memory limits.  The format lists each
+	region on a single line, followed by the event counters::
+
+	  drm/0000:03:00.0/vram0 low 0 max 3
+	  drm/0000:03:00.0/stolen low 0 max 0
+
+	``low`` counts how many times reclaim or eviction considered
+	the cgroup to be below its effective ``dmem.low`` protection.
+	``max`` counts how many times an allocation failed because the
+	cgroup or one of its ancestors hit ``dmem.max``.
+
+	``dmem.events`` contains hierarchical counts.  This file exists
+	for all cgroups except root.
+
 HugeTLB
 -------
 
diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c
index 4753a67d0f0f..79d4c5d0a046 100644
--- a/kernel/cgroup/dmem.c
+++ b/kernel/cgroup/dmem.c
@@ -8,6 +8,7 @@
  * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
  */
 
+#include <linux/atomic.h>
 #include <linux/cgroup.h>
 #include <linux/cgroup_dmem.h>
 #include <linux/list.h>
@@ -57,6 +58,14 @@ struct dmemcg_state {
 	struct cgroup_subsys_state css;
 
 	struct list_head pools;
+
+	struct cgroup_file events_file;
+};
+
+enum dmemcg_memory_event {
+	DMEMCG_LOW,
+	DMEMCG_MAX,
+	DMEMCG_NR_EVENTS,
 };
 
 struct dmem_cgroup_pool_state {
@@ -74,6 +83,8 @@ struct dmem_cgroup_pool_state {
 	struct page_counter cnt;
 	struct dmem_cgroup_pool_state *parent;
 
+	atomic_long_t events[DMEMCG_NR_EVENTS];
+
 	refcount_t ref;
 	bool inited;
 };
@@ -182,6 +193,24 @@ static u64 get_resource_current(struct dmem_cgroup_pool_state *pool)
 	return pool ? page_counter_read(&pool->cnt) : 0;
 }
 
+static void dmemcg_memory_event(struct dmem_cgroup_pool_state *pool,
+				enum dmemcg_memory_event event)
+{
+	for (; pool; pool = pool->parent) {
+		atomic_long_inc(&pool->events[event]);
+		cgroup_file_notify(&pool->cs->events_file);
+	}
+}
+
+static long dmemcg_get_event(struct dmem_cgroup_pool_state *pool,
+			     enum dmemcg_memory_event event)
+{
+	if (!pool)
+		return 0;
+
+	return atomic_long_read(&pool->events[event]);
+}
+
 static void reset_all_resource_limits(struct dmem_cgroup_pool_state *rpool)
 {
 	set_resource_min(rpool, 0);
@@ -345,6 +374,7 @@ bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool,
 			return true;
 
 		*ret_hit_low = true;
+		dmemcg_memory_event(test_pool, DMEMCG_LOW);
 		return false;
 	}
 	return true;
@@ -675,8 +705,12 @@ int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size,
 	}
 
 	if (!page_counter_try_charge(&pool->cnt, size, &fail)) {
+		struct dmem_cgroup_pool_state *limit_pool;
+
+		limit_pool = container_of(fail, struct dmem_cgroup_pool_state, cnt);
+		dmemcg_memory_event(limit_pool, DMEMCG_MAX);
 		if (ret_limit_pool) {
-			*ret_limit_pool = container_of(fail, struct dmem_cgroup_pool_state, cnt);
+			*ret_limit_pool = limit_pool;
 			css_get(&(*ret_limit_pool)->cs->css);
 			dmemcg_pool_get(*ret_limit_pool);
 		}
@@ -840,6 +874,25 @@ static int dmem_cgroup_region_max_show(struct seq_file *sf, void *v)
 	return dmemcg_limit_show(sf, v, get_resource_max);
 }
 
+static int dmem_cgroup_region_events_show(struct seq_file *sf, void *v)
+{
+	struct dmemcg_state *dmemcs = css_to_dmemcs(seq_css(sf));
+	struct dmem_cgroup_region *region;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) {
+		struct dmem_cgroup_pool_state *pool = find_cg_pool_locked(dmemcs, region);
+
+		seq_puts(sf, region->name);
+		seq_printf(sf, " low %ld max %ld\n",
+			   dmemcg_get_event(pool, DMEMCG_LOW),
+			   dmemcg_get_event(pool, DMEMCG_MAX));
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
 static ssize_t dmem_cgroup_region_max_write(struct kernfs_open_file *of,
 				      char *buf, size_t nbytes, loff_t off)
 {
@@ -874,6 +927,12 @@ static struct cftype files[] = {
 		.seq_show = dmem_cgroup_region_max_show,
 		.flags = CFTYPE_NOT_ON_ROOT,
 	},
+	{
+		.name = "events",
+		.seq_show = dmem_cgroup_region_events_show,
+		.file_offset = offsetof(struct dmemcg_state, events_file),
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
 	{ } /* Zero entry terminates. */
 };
 
-- 
2.25.1


^ permalink raw reply related

* [PATCH 0/2] cgroup/dmem: add per-region event counters
From: Hongfu Li @ 2026-06-24  3:11 UTC (permalink / raw)
  To: tj, hannes, mkoutny, corbet, skhan, dev, mripard, natalie.vock
  Cc: cgroups, linux-doc, linux-kernel, dri-devel, Hongfu Li

This patch series adds event counters to the device memory (dmem) cgroup
controller.

The dmem controller exposes per-region limits and current usage, but
not how often those limits are hit.  It is hard to tell whether failures
come from this cgroup, a parent limit, or pressure elsewhere in the
hierarchy.

To provide that visibility, this series introduces:
  - dmem.events:       reports hierarchical low/max counts per region.
  - dmem.events.local: reports per-region low/max counts for this cgroup only.

Patch overview:

Patch 1/2:
  - Add dmem.events with hierarchical low/max counters per region.
  - Record dmem.max allocation failures and dmem.low protection events.
  - Document the interface in cgroup-v2.rst.

Patch 2/2:
  - Add dmem.events.local for local-only per-region counts.
  - Share the events show logic between both files.
  - Update cgroup-v2.rst accordingly.

Example output (dmem.events):

  drm/0000:03:00.0/vram0 low 0 max 3
  drm/0000:03:00.0/stolen low 0 max 0

  low  - reclaim/eviction considered the cgroup below its effective
         dmem.low protection
  max  - allocation failed because the cgroup or an ancestor hit dmem.max

Both files exist for all non-root cgroups, like dmem.max and dmem.current.

These patches have been tested locally.

Hongfu Li (2):
  cgroup/dmem: add per-region event counters
  cgroup/dmem: introduce dmem.events.local for local counts

 Documentation/admin-guide/cgroup-v2.rst | 17 +++++
 kernel/cgroup/dmem.c                    | 85 ++++++++++++++++++++++++-
 2 files changed, 101 insertions(+), 1 deletion(-)

-- 
2.25.1


^ permalink raw reply

* Re: [PATCH] crypto: af_alg - Add af_alg_restrict sysctl, defaulting to 1
From: Demi Marie Obenour @ 2026-06-24  2:09 UTC (permalink / raw)
  To: Eric Biggers, Andy Lutomirski
  Cc: linux-crypto, Herbert Xu, linux-kernel, linux-doc,
	linux-bluetooth, iwd, linux-hardening, Milan Broz
In-Reply-To: <20260623192715.GE1850517@google.com>


[-- Attachment #1.1.1: Type: text/plain, Size: 2658 bytes --]

On 6/23/26 15:27, Eric Biggers wrote:
> On Tue, Jun 23, 2026 at 12:12:24PM -0700, Andy Lutomirski wrote:
>> On Mon, Jun 22, 2026 at 4:49 PM Eric Biggers <ebiggers@kernel.org> wrote:
>>>
>>> AF_ALG is a frequent source of vulnerabilities and a maintenance
>>> nightmare.  It exposes far more functionality to userspace than ever
>>> should have been exposed, especially to unprivileged processes.  Recent
>>> exploits have targeted kernel internal implementation details like
>>> "authencesn" that have zero use case for userspace access.
>>>
>>> Fortunately, AF_ALG is rarely used in practice, as userspace crypto
>>> libraries exist.  And when it is used, only some functionality is known
>>> to be used, and many users are known to hold capabilities already.
>>> iwd for example requires CAP_NET_ADMIN and has a known algorithm list
>>> (https://lore.kernel.org/linux-crypto/bcbbef00-5881-421b-8892-7be6c04b832d@gmail.com/).
>>>
>>> Thus, let's restrict the set of allowed algorithms by default, depending
>>> on the capabilities held.
>>>
>>> Add a sysctl /proc/sys/crypto/af_alg_restrict with meaning:
>>>
>>>     0: unrestricted
>>>     1: limited functionality
>>>     2: completely disabled
>>>
>>> Set the default value to 1, which enables an algorithm allowlist for
>>> unprivileged processes and a slightly longer allowlist for privileged
>>> processes.
>>
>> In our brave new world of containers, this is a bit awkward.  The
>> admin is sort of asking two separate questions:
>>
>> 1. Is the actual running distro and its privileged components capable
>> of working without AF_ALG or with only the parts marked as being
>> unprivileged?
>>
>> 2. Is the system running contains that need the unprivileged parts?
>> (Which is maybe just sha1 for ip?  I really don't know.)
>>
>> Should there maybe be two separate options so that all options are
>> available?  Or maybe something between 2 and 3 that means "limited
>> functionality and privileged modes are completely disabled"?
> 
> If we want to offer more settings we could.  I could see this getting
> quite complex pretty quickly once everyone weighs in, though.  There's
> quite a bit of value in keeping things simple, even if the offered
> settings won't be optimal for every case.
> 
> - Eric

What about exposing both allowlists to userspace and making them
configurable?

I'm mostly concerned about systems running code (possibly
closed-source) that uses algorithms that nobody here knows about.
It would be better to allow a single algorithm than to turn off all
restrictions.
-- 
Sincerely,
Demi Marie Obenour (she/her/hers)

[-- Attachment #1.1.2: OpenPGP public key --]
[-- Type: application/pgp-keys, Size: 7253 bytes --]

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox