Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH] netfilter: xt_realm: fix null-ptr-deref in realm_mt()
From: Florian Westphal @ 2026-04-15  9:44 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: Kito Xu (veritas501), phil, davem, edumazet, kuba, pabeni, horms,
	jengelh, kaber, netfilter-devel, coreteam, netdev, linux-kernel
In-Reply-To: <ad9aDziQEBR0h3U8@chamomile>

Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> On Wed, Apr 15, 2026 at 11:02:15AM +0200, Florian Westphal wrote:
> > Kito Xu (veritas501) <hxzene@gmail.com> wrote:
> > > realm_mt() unconditionally dereferences skb_dst(skb) without a NULL
> > > check. The xt_realm match registers with .family = NFPROTO_UNSPEC,
> > > making it available to all netfilter protocol families. Through the
> > > nftables compat layer (nft_compat), an unprivileged user inside a
> > > user/net namespace can load this match into a bridge-family chain.
> > 
> > I do not think this bug is related to nft_compat.
> > You can also use ebtables setsockopt api to request xt_realm, no?
> > 
> > > Fixes: ab4f21e6fb1c ("netfilter: xtables: use NFPROTO_UNSPEC in more extensions")
> > 
> > Looks correct.  Alternatively we could revert the xt_realm.c change.
> > But I don't have a strong opinion here, patch looks correct.
> 
> Maybe partial revert makes sense, since in ab4f21e6fb1c:
> 
> - xt_MARK: OK
> - xt_NOTRACK: OK
> - xt_comment: OK

Agree.

> - xt_mac: There is a better way to do this in bridge.

Right.

> - xt_owner, no sockets in bridge.

Output/postrouting maybe?

> - xt_physdev, which makes no sense in bridge, this is for br_netfilter
>   only.

Agree.

> - xt_realm (as already mentioned).
> That is, a partial revert of this patch for:
> 
> - xt_mac
> - xt_owner
> - xt_physdev
> - xt_realm

I'm ok with that too.

^ permalink raw reply

* [net-next v1 3/3] net: phy: motorcomm: Add YT8522 100M RMII PHY support
From: Minda Chen @ 2026-04-15  9:26 UTC (permalink / raw)
  To: Frank, Andrew Lunn, Heiner Kallweit, David S . Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, netdev
  Cc: linux-kernel, Minda Chen
In-Reply-To: <20260415092654.64907-1-minda.chen@starfivetech.com>

Add YT8522 100M RMII ethernet PHY base driver support, including
PHY ID and base config init function.

Signed-off-by: Minda Chen <minda.chen@starfivetech.com>
---
 drivers/net/phy/motorcomm.c | 49 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/drivers/net/phy/motorcomm.c b/drivers/net/phy/motorcomm.c
index f3129419f7c9..86396424b042 100644
--- a/drivers/net/phy/motorcomm.c
+++ b/drivers/net/phy/motorcomm.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0+
 /*
- * Motorcomm 8511/8521/8531/8531S/8821 PHY driver.
+ * Motorcomm 8511/8521/8522/8531/8531S/8821 PHY driver.
  *
  * Author: Peter Geis <pgwipeout@gmail.com>
  * Author: Frank <Frank.Sae@motor-comm.com>
@@ -14,6 +14,7 @@
 
 #define PHY_ID_YT8511		0x0000010a
 #define PHY_ID_YT8521		0x0000011a
+#define PHY_ID_YT8522		0x4f51e928
 #define PHY_ID_YT8531		0x4f51e91b
 #define PHY_ID_YT8531S		0x4f51e91a
 #define PHY_ID_YT8821		0x4f51ea19
@@ -227,6 +228,13 @@
 #define YT8521_LED_100_ON_EN			BIT(5)
 #define YT8521_LED_10_ON_EN			BIT(4)
 
+#define YT8522_EXTREG_SLEEP_CONTROL		0x2027
+#define YT8522_EN_SLEEP_SW			15
+
+#define YT8522_EXTENDED_COMBO_CTRL		0x4000
+#define YT8522_RXDV_SEL				BIT(4)
+#define YT8522_RMII_EN				BIT(1)
+
 #define YTPHY_MISC_CONFIG_REG			0xA006
 #define YTPHY_MCR_FIBER_SPEED_MASK		BIT(0)
 #define YTPHY_MCR_FIBER_1000BX			(0x1 << 0)
@@ -1857,6 +1865,36 @@ static int yt8531_config_init(struct phy_device *phydev)
 	return 0;
 }
 
+/* Set RXDV_SEL when RMII is enabled; optionally disable auto sleep via DT. */
+static int yt8522_config_init(struct phy_device *phydev)
+{
+	struct device_node *node = phydev->mdio.dev.of_node;
+	int ret, val;
+
+	val = ytphy_read_ext_with_lock(phydev, YT8522_EXTENDED_COMBO_CTRL);
+	if (val < 0)
+		return val;
+
+	if (val & YT8522_RMII_EN) {
+		ret = ytphy_write_ext_with_lock(phydev,
+						YT8522_EXTENDED_COMBO_CTRL,
+						val | YT8522_RXDV_SEL);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (of_property_read_bool(node, "motorcomm,auto-sleep-disabled")) {
+		/* disable auto sleep; EN_SLEEP_SW is a bit number */
+		ret = ytphy_modify_ext_with_lock(phydev,
+						 YT8522_EXTREG_SLEEP_CONTROL,
+						 BIT(YT8522_EN_SLEEP_SW), 0);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
 /**
  * yt8531_link_change_notify() - Adjust the tx clock direction according to
  * the current speed and dts config.
@@ -3066,6 +3104,14 @@ static struct phy_driver motorcomm_phy_drvs[] = {
 		.led_hw_control_set = yt8521_led_hw_control_set,
 		.led_hw_control_get = yt8521_led_hw_control_get,
 	},
+	{
+		PHY_ID_MATCH_EXACT(PHY_ID_YT8522),
+		.name		= "YT8522 100 Megabit Ethernet",
+		.config_aneg	= genphy_config_aneg,
+		.config_init	= yt8522_config_init,
+		.suspend	= genphy_suspend,
+		.resume		= genphy_resume,
+	},
 	{
 		PHY_ID_MATCH_EXACT(PHY_ID_YT8531),
 		.name		= "YT8531 Gigabit Ethernet",
@@ -3126,6 +3172,7 @@ MODULE_LICENSE("GPL");
 static const struct mdio_device_id __maybe_unused motorcomm_tbl[] = {
 	{ PHY_ID_MATCH_EXACT(PHY_ID_YT8511) },
 	{ PHY_ID_MATCH_EXACT(PHY_ID_YT8521) },
+	{ PHY_ID_MATCH_EXACT(PHY_ID_YT8522) },
 	{ PHY_ID_MATCH_EXACT(PHY_ID_YT8531) },
 	{ PHY_ID_MATCH_EXACT(PHY_ID_YT8531S) },
 	{ PHY_ID_MATCH_EXACT(PHY_ID_YT8821) },
-- 
2.17.1


^ permalink raw reply related

* [net-next v1 0/3] Add motorcomm 8531s set ds func and 8522 driver
From: Minda Chen @ 2026-04-15  9:26 UTC (permalink / raw)
  To: Frank, Andrew Lunn, Heiner Kallweit, David S . Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, netdev
  Cc: linux-kernel, Minda Chen

This series is for the StarFive JHB100 EVB board. The JHB100 contains
one RGMII/RMII and one RMII Synopsys GMAC core. On the EVB board, the
RGMII interface is connected to a YT8531S Ethernet PHY and the RMII
interface is connected to a YT8522 Ethernet PHY. Patches 1-2 are for the
RGMII interface and patch 3 is for the RMII interface.

JHB100 is a new StarFive RISC-V SoC for datacenter BMCs (Baseboard
Management Controllers), similar to the Aspeed 27x0.

The JHB100 minimal system upstream is in progress:
https://patchwork.kernel.org/project/linux-riscv/cover/20260403054945.467700-1-changhuang.liang@starfivetech.com/

This series is based on v7.0-rc5.

Minda Chen (3):
  net: phy: motorcomm: Add yt8531_set_ds() mdio_locked bool parameter
  net: motorcomm: phy: set drive strength in 8531s RGMII case
  net: phy: motorcomm: Add YT8522 100M RMII PHY support

 drivers/net/phy/motorcomm.c | 105 ++++++++++++++++++++++++++++++------
 1 file changed, 88 insertions(+), 17 deletions(-)

base-commit: c369299895a591d96745d6492d4888259b004a9e
-- 
2.17.1

^ permalink raw reply

* [PATCH v3 4/4] rust_binder: report netlink transactions
From: Alice Ryhl @ 2026-04-15  9:37 UTC (permalink / raw)
  To: Miguel Ojeda, Boqun Feng, Gary Guo, Björn Roy Baron,
	Benno Lossin, Andreas Hindborg, Trevor Gross, Danilo Krummrich,
	Donald Hunter, Jakub Kicinski, David S. Miller, Eric Dumazet,
	Paolo Abeni, Simon Horman, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Christian Brauner,
	Carlos Llamas
  Cc: linux-kernel, rust-for-linux, netdev, Alice Ryhl
In-Reply-To: <20260415-binder-netlink-v3-0-84be9ba63ee2@google.com>

From: Carlos Llamas <cmllamas@google.com>

The Android Binder driver supports a netlink API that reports
transaction *failures* to a userspace daemon. This allows devices to
monitor processes with many failed transactions so that they can e.g.
kill misbehaving apps.

One very important thing that this monitors is when many oneway messages
are sent to a frozen process, so there is special handling to ensure
this scenario is surfaced over netlink.

Signed-off-by: Carlos Llamas <cmllamas@google.com>
Signed-off-by: Alice Ryhl <aliceryhl@google.com>
---
 drivers/android/binder/rust_binder_main.rs |  1 -
 drivers/android/binder/thread.rs           | 10 ++++++++
 drivers/android/binder/transaction.rs      | 40 ++++++++++++++++++++++++++++++
 3 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/drivers/android/binder/rust_binder_main.rs b/drivers/android/binder/rust_binder_main.rs
index 9057e5dba7ed..407cda7bd766 100644
--- a/drivers/android/binder/rust_binder_main.rs
+++ b/drivers/android/binder/rust_binder_main.rs
@@ -36,7 +36,6 @@
 mod deferred_close;
 mod defs;
 mod error;
-#[allow(dead_code)]
 mod netlink;
 mod node;
 mod page_range;
diff --git a/drivers/android/binder/thread.rs b/drivers/android/binder/thread.rs
index 97d5f31e8fe3..aa4e93a877ac 100644
--- a/drivers/android/binder/thread.rs
+++ b/drivers/android/binder/thread.rs
@@ -1263,6 +1263,15 @@ fn transaction(self: &Arc<Self>, cmd: u32, reader: &mut UserSliceReader) -> Resu
             }
         }
 
+        if info.oneway_spam_suspect {
+            // If this is both a oneway spam suspect and a failure, we report it twice. This is
+            // useful in case the transaction failed with BR_TRANSACTION_PENDING_FROZEN.
+            info.report_netlink(BR_ONEWAY_SPAM_SUSPECT, &self.process.ctx);
+        }
+        if info.reply != 0 {
+            info.report_netlink(info.reply, &self.process.ctx);
+        }
+
         Ok(())
     }
 
@@ -1332,6 +1341,7 @@ fn reply_inner(self: &Arc<Self>, info: &mut TransactionInfo) -> BinderResult {
             );
             let reply = Err(BR_FAILED_REPLY);
             orig.from.deliver_reply(reply, &orig);
+            info.reply = BR_FAILED_REPLY;
             err.reply = BR_TRANSACTION_COMPLETE;
             err
         });
diff --git a/drivers/android/binder/transaction.rs b/drivers/android/binder/transaction.rs
index 47d5e4d88b07..3fa7091ed8a6 100644
--- a/drivers/android/binder/transaction.rs
+++ b/drivers/android/binder/transaction.rs
@@ -3,6 +3,7 @@
 // Copyright (C) 2025 Google LLC.
 
 use kernel::{
+    netlink::GENLMSG_DEFAULT_SIZE,
     prelude::*,
     seq_file::SeqFile,
     seq_print,
@@ -17,6 +18,7 @@
     allocation::{Allocation, TranslatedFds},
     defs::*,
     error::{BinderError, BinderResult},
+    netlink::Report,
     node::{Node, NodeRef},
     process::{Process, ProcessInner},
     ptr_align,
@@ -49,6 +51,44 @@ impl TransactionInfo {
     pub(crate) fn is_oneway(&self) -> bool {
         self.flags & TF_ONE_WAY != 0
     }
+
+    pub(crate) fn report_netlink(&self, reply: u32, ctx: &crate::Context) {
+        if let Err(err) = self.report_netlink_inner(reply, ctx) {
+            pr_warn!(
+                "{}:{} netlink report failed: {err:?}\n",
+                self.from_pid,
+                self.from_tid
+            );
+        }
+    }
+
+    fn report_netlink_inner(&self, reply: u32, ctx: &crate::Context) -> kernel::error::Result {
+        if !Report::has_listeners() {
+            return Ok(());
+        }
+        let mut report = Report::new(GENLMSG_DEFAULT_SIZE, 0, 0, GFP_KERNEL)?;
+
+        report.error(reply)?;
+        report.context(&ctx.name)?;
+        report.from_pid(self.from_pid as u32)?;
+        report.from_tid(self.from_tid as u32)?;
+        if self.to_pid != 0 {
+            report.to_pid(self.to_pid as u32)?;
+        }
+        if self.to_tid != 0 {
+            report.to_tid(self.to_tid as u32)?;
+        }
+
+        if self.is_reply {
+            report.is_reply()?;
+        }
+        report.flags(self.flags)?;
+        report.code(self.code)?;
+        report.data_size(self.data_size as u32)?;
+
+        report.multicast(0, GFP_KERNEL)?;
+        Ok(())
+    }
 }
 
 use core::mem::offset_of;

-- 
2.54.0.rc0.605.g598a273b03-goog


^ permalink raw reply related

* [PATCH v3 3/4] rust_binder: add generated netlink.rs file
From: Alice Ryhl @ 2026-04-15  9:37 UTC (permalink / raw)
  To: Miguel Ojeda, Boqun Feng, Gary Guo, Björn Roy Baron,
	Benno Lossin, Andreas Hindborg, Trevor Gross, Danilo Krummrich,
	Donald Hunter, Jakub Kicinski, David S. Miller, Eric Dumazet,
	Paolo Abeni, Simon Horman, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Christian Brauner,
	Carlos Llamas
  Cc: linux-kernel, rust-for-linux, netdev, Alice Ryhl
In-Reply-To: <20260415-binder-netlink-v3-0-84be9ba63ee2@google.com>

To use netlink from Rust Binder, add a new generated netlink file using
the new script and Documentation/netlink/specs/binder.yaml.

Signed-off-by: Alice Ryhl <aliceryhl@google.com>
---
 drivers/android/Kconfig                    |   2 +-
 drivers/android/binder/netlink.rs          | 113 +++++++++++++++++++++++++++++
 drivers/android/binder/rust_binder_main.rs |   9 ++-
 rust/uapi/uapi_helper.h                    |   1 +
 4 files changed, 122 insertions(+), 3 deletions(-)

diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig
index e2e402c9d175..606a9d07f774 100644
--- a/drivers/android/Kconfig
+++ b/drivers/android/Kconfig
@@ -16,7 +16,7 @@ config ANDROID_BINDER_IPC
 
 config ANDROID_BINDER_IPC_RUST
 	bool "Rust version of Android Binder IPC Driver"
-	depends on RUST && MMU && !ANDROID_BINDER_IPC
+	depends on RUST && MMU && NET && !ANDROID_BINDER_IPC
 	help
 	  This enables the Rust implementation of the Binder driver.
 
diff --git a/drivers/android/binder/netlink.rs b/drivers/android/binder/netlink.rs
new file mode 100644
index 000000000000..2a842c7b1b33
--- /dev/null
+++ b/drivers/android/binder/netlink.rs
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/binder.yaml */
+/* YNL-GEN rust source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
+
+#![allow(unreachable_pub, clippy::wrong_self_convention)]
+use kernel::netlink::{Family, MulticastGroup};
+use kernel::prelude::*;
+
+pub static BINDER_NL_FAMILY: Family = Family::const_new(
+    &crate::THIS_MODULE,
+    kernel::uapi::BINDER_FAMILY_NAME,
+    kernel::uapi::BINDER_FAMILY_VERSION,
+    &BINDER_NL_FAMILY_MCGRPS,
+);
+
+static BINDER_NL_FAMILY_MCGRPS: [MulticastGroup; 1] = [MulticastGroup::const_new(c"report")];
+
+/// A multicast event sent to userspace subscribers to notify them about
+/// binder transaction failures. The generated report provides the full
+/// details of the specific transaction that failed. The intention is for
+/// programs to monitor these events and react to the failures as needed.
+pub struct Report {
+    skb: kernel::netlink::GenlMsg,
+}
+
+impl Report {
+    /// Create a new multicast message.
+    pub fn new(
+        size: usize,
+        portid: u32,
+        seq: u32,
+        flags: kernel::alloc::Flags,
+    ) -> Result<Self, kernel::alloc::AllocError> {
+        const BINDER_CMD_REPORT: u8 = kernel::uapi::BINDER_CMD_REPORT as u8;
+        let skb = kernel::netlink::NetlinkSkBuff::new(size, flags)?;
+        let skb = skb.genlmsg_put(portid, seq, &BINDER_NL_FAMILY, BINDER_CMD_REPORT)?;
+        Ok(Self { skb })
+    }
+
+    /// Broadcast this message.
+    pub fn multicast(self, portid: u32, flags: kernel::alloc::Flags) -> Result {
+        self.skb.multicast(&BINDER_NL_FAMILY, portid, 0, flags)
+    }
+
+    /// Check if this message type has listeners.
+    pub fn has_listeners() -> bool {
+        BINDER_NL_FAMILY.has_listeners(0)
+    }
+
+    /// The enum binder_driver_return_protocol returned to the sender.
+    pub fn error(&mut self, val: u32) -> Result {
+        const BINDER_A_REPORT_ERROR: c_int = kernel::uapi::BINDER_A_REPORT_ERROR as c_int;
+        self.skb.put_u32(BINDER_A_REPORT_ERROR, val)
+    }
+
+    /// The binder context where the transaction occurred.
+    pub fn context(&mut self, val: &CStr) -> Result {
+        const BINDER_A_REPORT_CONTEXT: c_int = kernel::uapi::BINDER_A_REPORT_CONTEXT as c_int;
+        self.skb.put_string(BINDER_A_REPORT_CONTEXT, val)
+    }
+
+    /// The PID of the sender process.
+    pub fn from_pid(&mut self, val: u32) -> Result {
+        const BINDER_A_REPORT_FROM_PID: c_int = kernel::uapi::BINDER_A_REPORT_FROM_PID as c_int;
+        self.skb.put_u32(BINDER_A_REPORT_FROM_PID, val)
+    }
+
+    /// The TID of the sender thread.
+    pub fn from_tid(&mut self, val: u32) -> Result {
+        const BINDER_A_REPORT_FROM_TID: c_int = kernel::uapi::BINDER_A_REPORT_FROM_TID as c_int;
+        self.skb.put_u32(BINDER_A_REPORT_FROM_TID, val)
+    }
+
+    /// The PID of the recipient process. This attribute may not be present
+    /// if the target could not be determined.
+    pub fn to_pid(&mut self, val: u32) -> Result {
+        const BINDER_A_REPORT_TO_PID: c_int = kernel::uapi::BINDER_A_REPORT_TO_PID as c_int;
+        self.skb.put_u32(BINDER_A_REPORT_TO_PID, val)
+    }
+
+    /// The TID of the recipient thread. This attribute may not be present
+    /// if the target could not be determined.
+    pub fn to_tid(&mut self, val: u32) -> Result {
+        const BINDER_A_REPORT_TO_TID: c_int = kernel::uapi::BINDER_A_REPORT_TO_TID as c_int;
+        self.skb.put_u32(BINDER_A_REPORT_TO_TID, val)
+    }
+
+    /// When present, indicates the failed transaction is a reply.
+    pub fn is_reply(&mut self) -> Result {
+        const BINDER_A_REPORT_IS_REPLY: c_int = kernel::uapi::BINDER_A_REPORT_IS_REPLY as c_int;
+        self.skb.put_flag(BINDER_A_REPORT_IS_REPLY)
+    }
+
+    /// The bitmask of enum transaction_flags from the transaction.
+    pub fn flags(&mut self, val: u32) -> Result {
+        const BINDER_A_REPORT_FLAGS: c_int = kernel::uapi::BINDER_A_REPORT_FLAGS as c_int;
+        self.skb.put_u32(BINDER_A_REPORT_FLAGS, val)
+    }
+
+    /// The application-defined code from the transaction.
+    pub fn code(&mut self, val: u32) -> Result {
+        const BINDER_A_REPORT_CODE: c_int = kernel::uapi::BINDER_A_REPORT_CODE as c_int;
+        self.skb.put_u32(BINDER_A_REPORT_CODE, val)
+    }
+
+    /// The transaction payload size in bytes.
+    pub fn data_size(&mut self, val: u32) -> Result {
+        const BINDER_A_REPORT_DATA_SIZE: c_int = kernel::uapi::BINDER_A_REPORT_DATA_SIZE as c_int;
+        self.skb.put_u32(BINDER_A_REPORT_DATA_SIZE, val)
+    }
+}
diff --git a/drivers/android/binder/rust_binder_main.rs b/drivers/android/binder/rust_binder_main.rs
index 678e987902aa..9057e5dba7ed 100644
--- a/drivers/android/binder/rust_binder_main.rs
+++ b/drivers/android/binder/rust_binder_main.rs
@@ -36,6 +36,8 @@
 mod deferred_close;
 mod defs;
 mod error;
+#[allow(dead_code)]
+mod netlink;
 mod node;
 mod page_range;
 mod process;
@@ -286,19 +288,22 @@ fn ptr_align(value: usize) -> Option<usize> {
 // SAFETY: We call register in `init`.
 static BINDER_SHRINKER: Shrinker = unsafe { Shrinker::new() };
 
-struct BinderModule {}
+struct BinderModule {
+    _netlink: kernel::netlink::Registration,
+}
 
 impl kernel::Module for BinderModule {
     fn init(_module: &'static kernel::ThisModule) -> Result<Self> {
         // SAFETY: The module initializer never runs twice, so we only call this once.
         unsafe { crate::context::CONTEXTS.init() };
 
+        let netlink = crate::netlink::BINDER_NL_FAMILY.register()?;
         BINDER_SHRINKER.register(c"android-binder")?;
 
         // SAFETY: The module is being loaded, so we can initialize binderfs.
         unsafe { kernel::error::to_result(binderfs::init_rust_binderfs())? };
 
-        Ok(Self {})
+        Ok(Self { _netlink: netlink })
     }
 }
 
diff --git a/rust/uapi/uapi_helper.h b/rust/uapi/uapi_helper.h
index 06d7d1a2e8da..86c7b6b284b0 100644
--- a/rust/uapi/uapi_helper.h
+++ b/rust/uapi/uapi_helper.h
@@ -11,6 +11,7 @@
 #include <uapi/drm/nova_drm.h>
 #include <uapi/drm/panthor_drm.h>
 #include <uapi/linux/android/binder.h>
+#include <uapi/linux/android/binder_netlink.h>
 #include <uapi/linux/mdio.h>
 #include <uapi/linux/mii.h>
 #include <uapi/linux/ethtool.h>

-- 
2.54.0.rc0.605.g598a273b03-goog


^ permalink raw reply related

* [PATCH v3 2/4] ynl_gen: generate Rust files from yaml files
From: Alice Ryhl @ 2026-04-15  9:37 UTC (permalink / raw)
  To: Miguel Ojeda, Boqun Feng, Gary Guo, Björn Roy Baron,
	Benno Lossin, Andreas Hindborg, Trevor Gross, Danilo Krummrich,
	Donald Hunter, Jakub Kicinski, David S. Miller, Eric Dumazet,
	Paolo Abeni, Simon Horman, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Christian Brauner,
	Carlos Llamas
  Cc: linux-kernel, rust-for-linux, netdev, Alice Ryhl
In-Reply-To: <20260415-binder-netlink-v3-0-84be9ba63ee2@google.com>

To generate netlink frames from Rust code easily, generate Rust
libraries with methods for generating different netlink messages as
appropriate.

The new 'rust' type corresponds to a Rust version of the C target
'kernel'. There is no Rust version of the 'uapi' target since Rust code
exports its uapi via C headers - choice of language is opaque to
userspace.

This logic is kept in the existing ynl_gen_c.py file to reuse CodeWriter
and other shared pieces of logic in the existing python file. This has
the disadvantage that the gen_c part of the name is now wrong, as it
also generates Rust. One possible solution to this could be to rename
the file.

Signed-off-by: Alice Ryhl <aliceryhl@google.com>
---
 tools/net/ynl/pyynl/ynl_gen_c.py | 139 ++++++++++++++++++++++++++++++++++++++-
 tools/net/ynl/ynl-regen.sh       |   2 +-
 2 files changed, 139 insertions(+), 2 deletions(-)

diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py
index 0e1e486c1185..76b8b2f1ac16 100755
--- a/tools/net/ynl/pyynl/ynl_gen_c.py
+++ b/tools/net/ynl/pyynl/ynl_gen_c.py
@@ -19,6 +19,7 @@ import pathlib
 import os
 import re
 import shutil
+import subprocess
 import sys
 import tempfile
 import yaml as pyyaml
@@ -1744,6 +1745,19 @@ class CodeWriter:
         else:
             self.p('}' + line)
 
+    def array_start(self, line=''):
+        if line:
+            line = line + ' '
+        self.p(line + '[')
+        self._ind += 1
+
+    def array_end(self, line=''):
+        if line and line[0] not in {';', ','}:
+            line = ' ' + line
+        self._ind -= 1
+        self._nl = False
+        self.p(']' + line)
+
     def write_doc_line(self, doc, indent=True):
         words = doc.split()
         line = ' *'
@@ -3415,10 +3429,126 @@ def find_kernel_root(full_path):
             return full_path, sub_path[:-1]
 
 
+def render_rust(family, cw):
+    cw.p('#![allow(unreachable_pub, clippy::wrong_self_convention)]')
+    cw.p('use kernel::netlink::{Family, MulticastGroup};')
+    cw.p('use kernel::prelude::*;')
+    cw.nl()
+
+    family_upper = c_upper(family.ident_name)
+    family_name = f'{family_upper}_NL_FAMILY'
+    mcgrps_name = f'{family_name}_MCGRPS'
+
+    cw.p(f'pub static {family_name}: Family = Family::const_new(')
+    cw._ind += 1
+    cw.p('&crate::THIS_MODULE,')
+    cw.p(f'kernel::uapi::{family.fam_key},')
+    cw.p(f'kernel::uapi::{family.ver_key},')
+    if family.mcgrps['list']:
+        cw.p(f'&{mcgrps_name},')
+    else:
+        cw.p('&[],')
+    cw._ind -= 1
+    cw.p(');')
+    cw.nl()
+
+    if family.mcgrps['list']:
+        cw.array_start(f'static {mcgrps_name}: [MulticastGroup; {len(family.mcgrps["list"])}] = ')
+        for grp in family.mcgrps['list']:
+            cw.p(f'MulticastGroup::const_new(c"{grp["name"]}"),')
+        cw.array_end(';')
+        cw.nl()
+
+    for idx, (op_name, op) in enumerate(item for item in family.msgs.items() if 'event' in item[1]):
+        struct_name = op_name.capitalize()
+
+        if 'doc' in op:
+            doc_lines = op['doc'].strip().split('\n')
+            for line in doc_lines:
+                cw.p(f'/// {line.strip()}')
+
+        cw.block_start(f'pub struct {struct_name}')
+        cw.p('skb: kernel::netlink::GenlMsg,')
+        cw.block_end()
+        cw.nl()
+
+        cw.block_start(f'impl {struct_name}')
+        cw.p('/// Create a new multicast message.')
+        cw.p('pub fn new(')
+        cw._ind += 1
+        cw.p('size: usize,')
+        cw.p('portid: u32,')
+        cw.p('seq: u32,')
+        cw.p('flags: kernel::alloc::Flags,')
+        cw._ind -= 1
+        cw.block_start(') -> Result<Self, kernel::alloc::AllocError>')
+        cw.p(f'const {op.enum_name}: u8 = kernel::uapi::{op.enum_name} as u8;')
+        cw.p('let skb = kernel::netlink::NetlinkSkBuff::new(size, flags)?;')
+        cw.p(f'let skb = skb.genlmsg_put(portid, seq, &{family_name}, {op.enum_name})?;')
+        cw.p('Ok(Self { skb })')
+        cw.block_end()
+        cw.nl()
+
+        grp_idx = 0
+        if 'mcgrp' in op:
+            grp_idx = next(i for i, grp in enumerate(family.mcgrps['list']) if grp['name'] == op['mcgrp'])
+
+        cw.p('/// Broadcast this message.')
+        cw.block_start('pub fn multicast(self, portid: u32, flags: kernel::alloc::Flags) -> Result')
+        cw.p(f'self.skb.multicast(&{family_name}, portid, {grp_idx}, flags)')
+        cw.block_end()
+        cw.nl()
+
+        cw.p('/// Check if this message type has listeners.')
+        cw.block_start('pub fn has_listeners() -> bool')
+        cw.p(f'{family_name}.has_listeners({grp_idx})')
+        cw.block_end()
+
+        attr_set_name = op['attribute-set']
+        attr_set = family.attr_sets[attr_set_name]
+        event_attrs = op['event']['attributes']
+
+        for attr_name in event_attrs:
+            attr = attr_set[attr_name]
+            method_name = attr_name.replace('-', '_')
+
+            if attr.type == 'u32':
+                put_fn = 'put_u32'
+                arg_str = ', val'
+                method_args = '(&mut self, val: u32)'
+            elif attr.type == 'string':
+                put_fn = 'put_string'
+                arg_str = ', val'
+                method_args = '(&mut self, val: &CStr)'
+            elif attr.type == 'flag':
+                put_fn = 'put_flag'
+                arg_str = ''
+                method_args = '(&mut self)'
+            else:
+                put_fn = f'put_{attr.type}'
+                arg_str = ', val'
+                method_args = f'(&mut self, val: {attr.type})'
+
+            cw.nl()
+            if 'doc' in attr.yaml:
+                doc_lines = attr.yaml['doc'].strip().split('\n')
+                for line in doc_lines:
+                    cw.p(f'/// {line.strip()}')
+
+            cw.block_start(f'pub fn {method_name}{method_args} -> Result')
+            cw.p(f'const {attr.enum_name}: c_int = kernel::uapi::{attr.enum_name} as c_int;')
+            cw.p(f'self.skb.{put_fn}({attr.enum_name}{arg_str})')
+            cw.block_end()
+
+        cw.block_end()
+        cw.nl()
+    cw.p(' ')
+
+
 def main():
     parser = argparse.ArgumentParser(description='Netlink simple parsing generator')
     parser.add_argument('--mode', dest='mode', type=str, required=True,
-                        choices=('user', 'kernel', 'uapi'))
+                        choices=('user', 'kernel', 'uapi', 'rust'))
     parser.add_argument('--spec', dest='spec', type=str, required=True)
     parser.add_argument('--header', dest='header', action='store_true', default=None)
     parser.add_argument('--source', dest='header', action='store_false')
@@ -3471,6 +3601,13 @@ def main():
         render_uapi(parsed, cw)
         return
 
+    if args.mode == 'rust':
+        render_rust(parsed, cw)
+        cw.close_out_file()
+        if args.out_file:
+            subprocess.run(['rustfmt', '--edition', '2021', args.out_file])
+        return
+
     hdr_prot = f"_LINUX_{parsed.c_name.upper()}_GEN_H"
     if args.header:
         cw.p('#ifndef ' + hdr_prot)
diff --git a/tools/net/ynl/ynl-regen.sh b/tools/net/ynl/ynl-regen.sh
index d9809276db98..4f5ceb4fe147 100755
--- a/tools/net/ynl/ynl-regen.sh
+++ b/tools/net/ynl/ynl-regen.sh
@@ -17,7 +17,7 @@ done
 KDIR=$(dirname $(dirname $(dirname $(dirname $(realpath $0)))))
 pushd ${search:-$KDIR} >>/dev/null
 
-files=$(git grep --files-with-matches '^/\* YNL-GEN \(kernel\|uapi\|user\)')
+files=$(git grep --files-with-matches '^/\* YNL-GEN \(kernel\|uapi\|user\|rust\)')
 for f in $files; do
     # params:     0       1      2     3
     #         $YAML YNL-GEN kernel $mode

-- 
2.54.0.rc0.605.g598a273b03-goog


^ permalink raw reply related

* [PATCH v3 1/4] rust: netlink: add raw netlink abstraction
From: Alice Ryhl @ 2026-04-15  9:37 UTC (permalink / raw)
  To: Miguel Ojeda, Boqun Feng, Gary Guo, Björn Roy Baron,
	Benno Lossin, Andreas Hindborg, Trevor Gross, Danilo Krummrich,
	Donald Hunter, Jakub Kicinski, David S. Miller, Eric Dumazet,
	Paolo Abeni, Simon Horman, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Christian Brauner,
	Carlos Llamas
  Cc: linux-kernel, rust-for-linux, netdev, Alice Ryhl
In-Reply-To: <20260415-binder-netlink-v3-0-84be9ba63ee2@google.com>

This implements a safe and relatively simple API over the netlink API,
that allows you to add different attributes to a netlink message and
broadcast it. As the first user of this API only makes use of broadcast,
only broadcast messages are supported here.

This API is intended to be safe and to be easy to use in *generated*
code. This is because netlink is generally used with yaml files that
describe the underlying API, and the python generator outputs C code
(or, soon, Rust code) that lets you use the API more easily. So for
example, if there is a string field, the code generator will output a
method that internally calls `put_string()` with the right attr type.

Signed-off-by: Alice Ryhl <aliceryhl@google.com>
---
 rust/bindings/bindings_helper.h |   3 +
 rust/helpers/genetlink.c        |  46 ++++++
 rust/helpers/helpers.c          |   1 +
 rust/kernel/lib.rs              |   1 +
 rust/kernel/netlink.rs          | 329 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 380 insertions(+)

diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h
index 083cc44aa952..8abb626fce6c 100644
--- a/rust/bindings/bindings_helper.h
+++ b/rust/bindings/bindings_helper.h
@@ -88,6 +88,8 @@
 #include <linux/wait.h>
 #include <linux/workqueue.h>
 #include <linux/xarray.h>
+#include <net/genetlink.h>
+#include <net/netlink.h>
 #include <trace/events/rust_sample.h>
 
 /*
@@ -105,6 +107,7 @@
 const size_t RUST_CONST_HELPER_ARCH_SLAB_MINALIGN = ARCH_SLAB_MINALIGN;
 const size_t RUST_CONST_HELPER_ARCH_KMALLOC_MINALIGN = ARCH_KMALLOC_MINALIGN;
 const size_t RUST_CONST_HELPER_PAGE_SIZE = PAGE_SIZE;
+const size_t RUST_CONST_HELPER_GENLMSG_DEFAULT_SIZE = GENLMSG_DEFAULT_SIZE;
 const gfp_t RUST_CONST_HELPER_GFP_ATOMIC = GFP_ATOMIC;
 const gfp_t RUST_CONST_HELPER_GFP_KERNEL = GFP_KERNEL;
 const gfp_t RUST_CONST_HELPER_GFP_KERNEL_ACCOUNT = GFP_KERNEL_ACCOUNT;
diff --git a/rust/helpers/genetlink.c b/rust/helpers/genetlink.c
new file mode 100644
index 000000000000..3530b69f6cf7
--- /dev/null
+++ b/rust/helpers/genetlink.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (C) 2026 Google LLC.
+ */
+
+#include <net/genetlink.h>
+
+#ifdef CONFIG_NET
+
+__rust_helper struct sk_buff *rust_helper_genlmsg_new(size_t payload, gfp_t flags)
+{
+	return genlmsg_new(payload, flags);
+}
+
+__rust_helper
+int rust_helper_genlmsg_multicast(const struct genl_family *family,
+				  struct sk_buff *skb, u32 portid,
+				  unsigned int group, gfp_t flags)
+{
+	return genlmsg_multicast(family, skb, portid, group, flags);
+}
+
+__rust_helper void rust_helper_genlmsg_cancel(struct sk_buff *skb, void *hdr)
+{
+	genlmsg_cancel(skb, hdr);
+}
+
+__rust_helper void rust_helper_genlmsg_end(struct sk_buff *skb, void *hdr)
+{
+	genlmsg_end(skb, hdr);
+}
+
+__rust_helper void rust_helper_nlmsg_free(struct sk_buff *skb)
+{
+	nlmsg_free(skb);
+}
+
+__rust_helper
+int rust_helper_genl_has_listeners(const struct genl_family *family,
+				   struct net *net, unsigned int group)
+{
+	return genl_has_listeners(family, net, group);
+}
+
+#endif
diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c
index a3c42e51f00a..0813185d8760 100644
--- a/rust/helpers/helpers.c
+++ b/rust/helpers/helpers.c
@@ -32,6 +32,7 @@
 #include "err.c"
 #include "irq.c"
 #include "fs.c"
+#include "genetlink.c"
 #include "io.c"
 #include "jump_label.c"
 #include "kunit.c"
diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs
index d93292d47420..f5ea0ae0b6b7 100644
--- a/rust/kernel/lib.rs
+++ b/rust/kernel/lib.rs
@@ -122,6 +122,7 @@
 pub mod module_param;
 #[cfg(CONFIG_NET)]
 pub mod net;
+pub mod netlink;
 pub mod num;
 pub mod of;
 #[cfg(CONFIG_PM_OPP)]
diff --git a/rust/kernel/netlink.rs b/rust/kernel/netlink.rs
new file mode 100644
index 000000000000..21f959c95fdc
--- /dev/null
+++ b/rust/kernel/netlink.rs
@@ -0,0 +1,329 @@
+// SPDX-License-Identifier: GPL-2.0
+
+// Copyright (C) 2026 Google LLC.
+
+//! Rust support for generic netlink.
+//!
+//! Currently only supports exposing multicast groups.
+//!
+//! C header: [`include/net/genetlink.h`](srctree/include/net/genetlink.h)
+#![cfg(CONFIG_NET)]
+
+use kernel::{
+    alloc::{self, AllocError},
+    error::to_result,
+    prelude::*,
+    transmute::AsBytes,
+    types::Opaque,
+    ThisModule,
+};
+
+use core::{
+    mem::ManuallyDrop,
+    ptr::NonNull, //
+};
+
+/// The default netlink message size.
+pub const GENLMSG_DEFAULT_SIZE: usize = bindings::GENLMSG_DEFAULT_SIZE;
+
+/// A wrapper around `struct sk_buff` for generic netlink messages.
+///
+/// This type is intended to be specific for buffers used with netlink only, and other usecases for
+/// `struct sk_buff` are out-of-scope for this abstraction.
+///
+/// # Invariants
+///
+/// The pointer has ownership over a valid `sk_buff`.
+pub struct NetlinkSkBuff {
+    skb: NonNull<kernel::bindings::sk_buff>,
+}
+
+impl NetlinkSkBuff {
+    /// Creates a new `NetlinkSkBuff` with the given size.
+    pub fn new(size: usize, flags: alloc::Flags) -> Result<NetlinkSkBuff, AllocError> {
+        // SAFETY: `genlmsg_new` only requires its arguments to be valid integers.
+        let skb = unsafe { bindings::genlmsg_new(size, flags.as_raw()) };
+        let skb = NonNull::new(skb).ok_or(AllocError)?;
+        Ok(NetlinkSkBuff { skb })
+    }
+
+    /// Puts a generic netlink header into the `NetlinkSkBuff`.
+    pub fn genlmsg_put(
+        self,
+        portid: u32,
+        seq: u32,
+        family: &'static Family,
+        cmd: u8,
+    ) -> Result<GenlMsg, AllocError> {
+        let skb = self.skb.as_ptr();
+        // SAFETY: The skb and family pointers are valid.
+        let hdr = unsafe { bindings::genlmsg_put(skb, portid, seq, family.as_raw(), 0, cmd) };
+        let hdr = NonNull::new(hdr).ok_or(AllocError)?;
+        Ok(GenlMsg { skb: self, hdr })
+    }
+}
+
+impl Drop for NetlinkSkBuff {
+    fn drop(&mut self) {
+        // SAFETY: We have ownership over the `sk_buff`, so we may free it.
+        unsafe { bindings::nlmsg_free(self.skb.as_ptr()) }
+    }
+}
+
+/// A generic netlink message being constructed.
+///
+/// # Invariants
+///
+/// `hdr` references the header in this netlink message.
+pub struct GenlMsg {
+    skb: NetlinkSkBuff,
+    hdr: NonNull<c_void>,
+}
+
+impl GenlMsg {
+    /// Puts an attribute into the message.
+    #[inline]
+    fn put<T>(&mut self, attrtype: c_int, value: &T) -> Result
+    where
+        T: ?Sized + AsBytes,
+    {
+        let skb = self.skb.skb.as_ptr();
+        let len = size_of_val(value);
+        let ptr = core::ptr::from_ref(value).cast::<c_void>();
+        // SAFETY: `skb` is valid by `NetlinkSkBuff` type invariants, and the provided value is
+        // readable and initialized for its `size_of` bytes.
+        to_result(unsafe { bindings::nla_put(skb, attrtype, len as c_int, ptr) })
+    }
+
+    /// Puts a `u32` attribute into the message.
+    #[inline]
+    pub fn put_u32(&mut self, attrtype: c_int, value: u32) -> Result {
+        self.put(attrtype, &value)
+    }
+
+    /// Puts a string attribute into the message.
+    #[inline]
+    pub fn put_string(&mut self, attrtype: c_int, value: &CStr) -> Result {
+        self.put(attrtype, value.to_bytes_with_nul())
+    }
+
+    /// Puts a flag attribute into the message.
+    #[inline]
+    pub fn put_flag(&mut self, attrtype: c_int) -> Result {
+        let skb = self.skb.skb.as_ptr();
+        // SAFETY: `skb` is valid by `NetlinkSkBuff` type invariants, and a null pointer is valid
+        // when the length is zero.
+        to_result(unsafe { bindings::nla_put(skb, attrtype, 0, core::ptr::null()) })
+    }
+
+    /// Sends the generic netlink message as a multicast message.
+    #[inline]
+    pub fn multicast(
+        self,
+        family: &'static Family,
+        portid: u32,
+        group: u32,
+        flags: alloc::Flags,
+    ) -> Result {
+        let me = ManuallyDrop::new(self);
+        // SAFETY: The `skb` and `family` pointers are valid. We pass ownership of the `skb` to
+        // `genlmsg_multicast` by not dropping `self`.
+        unsafe {
+            bindings::genlmsg_end(me.skb.skb.as_ptr(), me.hdr.as_ptr());
+            to_result(bindings::genlmsg_multicast(
+                family.as_raw(),
+                me.skb.skb.as_ptr(),
+                portid,
+                group,
+                flags.as_raw(),
+            ))
+        }
+    }
+}
+impl Drop for GenlMsg {
+    fn drop(&mut self) {
+        // SAFETY: The `hdr` pointer references the header of this generic netlink message.
+        unsafe { bindings::genlmsg_cancel(self.skb.skb.as_ptr(), self.hdr.as_ptr()) };
+    }
+}
+
+/// Flags for a generic netlink family.
+struct FamilyFlags {
+    /// Whether the family supports network namespaces.
+    netnsok: bool,
+    /// Whether the family supports parallel operations.
+    parallel_ops: bool,
+}
+
+impl FamilyFlags {
+    /// Converts the flags to the bitfield representation used by `genl_family`.
+    const fn into_bitfield(self) -> bindings::__BindgenBitfieldUnit<[u8; 1]> {
+        // The below shifts are verified correct by test_family_flags_bitfield() below.
+        //
+        // Although bindgen generates helpers to change bitfields based on the C headers, these
+        // helpers unfortunately can't be used in const context. Since `Family` needs to be filled
+        // out at build-time, we use this helper instead.
+        let mut bits = 0;
+        if self.netnsok {
+            bits |= 1 << 0;
+        }
+        if self.parallel_ops {
+            bits |= 1 << 1;
+        }
+        // SAFETY: This bitfield is represented as an u8.
+        unsafe { core::mem::transmute::<u8, bindings::__BindgenBitfieldUnit<[u8; 1]>>(bits) }
+    }
+}
+
+/// A generic netlink family.
+#[repr(transparent)]
+pub struct Family {
+    inner: Opaque<bindings::genl_family>,
+}
+
+// SAFETY: The `Family` type is thread safe.
+unsafe impl Sync for Family {}
+
+impl Family {
+    /// Creates a new `Family` instance.
+    pub const fn const_new(
+        module: &ThisModule,
+        name: &[u8],
+        version: u32,
+        mcgrps: &'static [MulticastGroup],
+    ) -> Family {
+        let n_mcgrps = mcgrps.len() as u8;
+        if n_mcgrps as usize != mcgrps.len() {
+            panic!("too many mcgrps");
+        }
+        let mut genl_family = bindings::genl_family {
+            version,
+            _bitfield_1: FamilyFlags {
+                netnsok: true,
+                parallel_ops: true,
+            }
+            .into_bitfield(),
+            module: module.as_ptr(),
+            mcgrps: mcgrps.as_ptr().cast(),
+            n_mcgrps,
+            ..pin_init::zeroed()
+        };
+        if CStr::from_bytes_with_nul(name).is_err() {
+            panic!("genl_family name not nul-terminated");
+        }
+        if genl_family.name.len() < name.len() {
+            panic!("genl_family name too long");
+        }
+        let mut i = 0;
+        while i < name.len() {
+            genl_family.name[i] = name[i];
+            i += 1;
+        }
+        Family {
+            inner: Opaque::new(genl_family),
+        }
+    }
+
+    /// Checks if there are any listeners for the given multicast group.
+    pub fn has_listeners(&self, group: u32) -> bool {
+        // SAFETY: The family and init_net pointers are valid.
+        unsafe {
+            bindings::genl_has_listeners(self.as_raw(), &raw mut bindings::init_net, group) != 0
+        }
+    }
+
+    /// Returns a raw pointer to the underlying `genl_family` structure.
+    pub fn as_raw(&self) -> *mut bindings::genl_family {
+        self.inner.get()
+    }
+}
+
+/// A generic netlink multicast group.
+#[repr(transparent)]
+pub struct MulticastGroup {
+    // No Opaque because fully immutable
+    group: bindings::genl_multicast_group,
+}
+
+// SAFETY: Pure data so thread safe.
+unsafe impl Sync for MulticastGroup {}
+
+impl MulticastGroup {
+    /// Creates a new `MulticastGroup` instance.
+    pub const fn const_new(name: &CStr) -> MulticastGroup {
+        let mut group: bindings::genl_multicast_group = pin_init::zeroed();
+
+        let name = name.to_bytes_with_nul();
+        if group.name.len() < name.len() {
+            panic!("genl_multicast_group name too long");
+        }
+        let mut i = 0;
+        while i < name.len() {
+            group.name[i] = name[i];
+            i += 1;
+        }
+
+        MulticastGroup { group }
+    }
+}
+
+/// A registration of a generic netlink family.
+///
+/// This type represents the registration of a [`Family`]. When an instance of this type is
+/// dropped, its respective generic netlink family will be unregistered from the system.
+///
+/// # Invariants
+///
+/// `self.family` always holds a valid reference to an initialized and registered [`Family`].
+pub struct Registration {
+    family: &'static Family,
+}
+
+impl Family {
+    /// Registers the generic netlink family with the kernel.
+    pub fn register(&'static self) -> Result<Registration> {
+        // SAFETY: `self.as_raw()` is a valid pointer to a `genl_family` struct.
+        // The `genl_family` struct is static, so it will outlive the registration.
+        to_result(unsafe { bindings::genl_register_family(self.as_raw()) })?;
+        Ok(Registration { family: self })
+    }
+}
+
+impl Drop for Registration {
+    fn drop(&mut self) {
+        // SAFETY: `self.family.as_raw()` is a valid pointer to a registered `genl_family` struct.
+        // The `Registration` struct ensures that `genl_unregister_family` is called exactly once
+        // for this family when it goes out of scope.
+        unsafe { bindings::genl_unregister_family(self.family.as_raw()) };
+    }
+}
+
+#[macros::kunit_tests(rust_netlink)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_family_flags_bitfield() {
+        for netnsok in [false, true] {
+            for parallel_ops in [false, true] {
+                let mut b_fam = bindings::genl_family {
+                    ..Default::default()
+                };
+                b_fam.set_netnsok(if netnsok { 1 } else { 0 });
+                b_fam.set_parallel_ops(if parallel_ops { 1 } else { 0 });
+
+                let c_bitfield = FamilyFlags {
+                    netnsok,
+                    parallel_ops,
+                }
+                .into_bitfield();
+
+                // SAFETY: The bit field is stored as u8.
+                let b_val: u8 = unsafe { core::mem::transmute(b_fam._bitfield_1) };
+                // SAFETY: The bit field is stored as u8.
+                let c_val: u8 = unsafe { core::mem::transmute(c_bitfield) };
+                assert_eq!(b_val, c_val);
+            }
+        }
+    }
+}

-- 
2.54.0.rc0.605.g598a273b03-goog


^ permalink raw reply related

* [PATCH v3 0/4] Rust netlink support + use in Rust Binder
From: Alice Ryhl @ 2026-04-15  9:37 UTC (permalink / raw)
  To: Miguel Ojeda, Boqun Feng, Gary Guo, Björn Roy Baron,
	Benno Lossin, Andreas Hindborg, Trevor Gross, Danilo Krummrich,
	Donald Hunter, Jakub Kicinski, David S. Miller, Eric Dumazet,
	Paolo Abeni, Simon Horman, Greg Kroah-Hartman,
	Arve Hjønnevåg, Todd Kjos, Christian Brauner,
	Carlos Llamas
  Cc: linux-kernel, rust-for-linux, netdev, Alice Ryhl

The C Binder driver exposes messages over netlink when transactions
fail, so that a userspace daemon can respond to processes with many
failing transactions.

This patch series adds netlink support from Rust, then implements an
equivalent API in Rust Binder.

As Binder only uses broadcast messages, I did not add support for other
kinds of messages.

Based on char-misc-next.

Signed-off-by: Alice Ryhl <aliceryhl@google.com>
---
Changes in v3:
- Fix spurious 'return' statements in Rust helpers (Patch 1).
- Sashiko review:
  - Fix ynl_gen to handle empty multicast groups and correct multicast indexing (Patch 2).
  - Fix transaction failed reply logic to report via Netlink inside reply_inner() (Patch 4).
- Link to v2: https://lore.kernel.org/r/20260408-binder-netlink-v2-0-c0d327d15435@google.com

Changes in v2:
- Make inclusion of to_pid conditional too.
- Add note about file name in second patch.
- Make it clear that the sk_buff wrapper is netlink-specific.
- Better handle bitfield in patch 1.
- Link to v1: https://lore.kernel.org/r/20260306-binder-netlink-v1-0-daceb5bc83f2@google.com

---
Alice Ryhl (3):
      rust: netlink: add raw netlink abstraction
      ynl_gen: generate Rust files from yaml files
      rust_binder: add generated netlink.rs file

Carlos Llamas (1):
      rust_binder: report netlink transactions

 drivers/android/Kconfig                    |   2 +-
 drivers/android/binder/netlink.rs          | 113 ++++++++++
 drivers/android/binder/rust_binder_main.rs |   8 +-
 drivers/android/binder/thread.rs           |  10 +
 drivers/android/binder/transaction.rs      |  40 ++++
 rust/bindings/bindings_helper.h            |   3 +
 rust/helpers/genetlink.c                   |  46 ++++
 rust/helpers/helpers.c                     |   1 +
 rust/kernel/lib.rs                         |   1 +
 rust/kernel/netlink.rs                     | 329 +++++++++++++++++++++++++++++
 rust/uapi/uapi_helper.h                    |   1 +
 tools/net/ynl/pyynl/ynl_gen_c.py           | 139 +++++++++++-
 tools/net/ynl/ynl-regen.sh                 |   2 +-
 13 files changed, 690 insertions(+), 5 deletions(-)
---
base-commit: 0990a71f678aa0f045f2c126b39b6b581844d3b0
change-id: 20260306-binder-netlink-c82110b2fb74

Best regards,
-- 
Alice Ryhl <aliceryhl@google.com>


^ permalink raw reply

* [PATCH net 1/1] mptcp: hold subflow request owners when cloning reqsk
From: Ren Wei @ 2026-04-15  9:31 UTC (permalink / raw)
  To: netdev, mptcp
  Cc: davem, edumazet, kuba, pabeni, horms, ncardwell, kuniyu, dsahern,
	matttbe, martineau, geliang, daniel, kafai, yuantan098, yifanwucs,
	tomapufckgml, bird, caoruide123, enjou1224z, n05ec
In-Reply-To: <cover.1776149210.git.caoruide123@gmail.com>

From: Ruide Cao <caoruide123@gmail.com>

TCP request migration clones pending request sockets with
inet_reqsk_clone(). For MPTCP MP_JOIN requests this raw-copies
subflow_req->msk, but the cloned request does not take a new reference.

Both the original and the cloned request can later drop the same msk in
subflow_req_destructor(), and a migrated request may keep a dangling msk
pointer after the original owner has already been released.

Add a request_sock clone callback and let MPTCP grab a reference for cloned
subflow requests that carry an msk. This keeps ownership balanced across
both successful migrations and failed clone/insert paths without changing
other protocols.

Fixes: c905dee62232 ("tcp: Migrate TCP_NEW_SYN_RECV requests at retransmitting SYN+ACKs.")
Cc: stable@kernel.org
Reported-by: Yuan Tan <yuantan098@gmail.com>
Reported-by: Yifan Wu <yifanwucs@gmail.com>
Reported-by: Juefei Pu <tomapufckgml@gmail.com>
Reported-by: Xin Liu <bird@lzu.edu.cn>
Signed-off-by: Ruide Cao <caoruide123@gmail.com>
Tested-by: Ren Wei <enjou1224z@gmail.com>
Signed-off-by: Ren Wei <n05ec@lzu.edu.cn>
---
 include/net/request_sock.h      |  2 ++
 net/ipv4/inet_connection_sock.c |  3 +++
 net/mptcp/subflow.c             | 13 +++++++++++++
 3 files changed, 18 insertions(+)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 5a9c826a7092..560e464c400f 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -36,6 +36,8 @@ struct request_sock_ops {
 				      struct sk_buff *skb,
 				      enum sk_rst_reason reason);
 	void		(*destructor)(struct request_sock *req);
+	void		(*init_clone)(const struct request_sock *req,
+				      struct request_sock *new_req);
 };
 
 struct saved_syn {
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index e961936b6be7..140a9e96ad58 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -954,6 +954,9 @@ static struct request_sock *inet_reqsk_clone(struct request_sock *req,
 	if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener)
 		rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq);
 
+	if (req->rsk_ops->init_clone)
+		req->rsk_ops->init_clone(req, nreq);
+
 	return nreq;
 }
 
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 4ff5863aa9fd..5f4069647822 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -47,6 +47,17 @@ static void subflow_req_destructor(struct request_sock *req)
 	mptcp_token_destroy_request(req);
 }
 
+static void subflow_req_clone(const struct request_sock *req,
+			      struct request_sock *new_req)
+{
+	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(new_req);
+
+	(void)req;
+
+	if (subflow_req->msk)
+		sock_hold((struct sock *)subflow_req->msk);
+}
+
 static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
 				  void *hmac)
 {
@@ -2143,6 +2154,7 @@ void __init mptcp_subflow_init(void)
 	mptcp_subflow_v4_request_sock_ops = tcp_request_sock_ops;
 	mptcp_subflow_v4_request_sock_ops.slab_name = "request_sock_subflow_v4";
 	mptcp_subflow_v4_request_sock_ops.destructor = subflow_v4_req_destructor;
+	mptcp_subflow_v4_request_sock_ops.init_clone = subflow_req_clone;
 
 	if (subflow_ops_init(&mptcp_subflow_v4_request_sock_ops) != 0)
 		panic("MPTCP: failed to init subflow v4 request sock ops\n");
@@ -2184,6 +2196,7 @@ void __init mptcp_subflow_v6_init(void)
 	mptcp_subflow_v6_request_sock_ops = tcp6_request_sock_ops;
 	mptcp_subflow_v6_request_sock_ops.slab_name = "request_sock_subflow_v6";
 	mptcp_subflow_v6_request_sock_ops.destructor = subflow_v6_req_destructor;
+	mptcp_subflow_v6_request_sock_ops.init_clone = subflow_req_clone;
 
 	if (subflow_ops_init(&mptcp_subflow_v6_request_sock_ops) != 0)
 		panic("MPTCP: failed to init subflow v6 request sock ops\n");
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH] netfilter: xt_realm: fix null-ptr-deref in realm_mt()
From: Pablo Neira Ayuso @ 2026-04-15  9:27 UTC (permalink / raw)
  To: Florian Westphal
  Cc: Kito Xu (veritas501), phil, davem, edumazet, kuba, pabeni, horms,
	jengelh, kaber, netfilter-devel, coreteam, netdev, linux-kernel
In-Reply-To: <ad9UF5Cr12YGJnbi@strlen.de>

On Wed, Apr 15, 2026 at 11:02:15AM +0200, Florian Westphal wrote:
> Kito Xu (veritas501) <hxzene@gmail.com> wrote:
> > realm_mt() unconditionally dereferences skb_dst(skb) without a NULL
> > check. The xt_realm match registers with .family = NFPROTO_UNSPEC,
> > making it available to all netfilter protocol families. Through the
> > nftables compat layer (nft_compat), an unprivileged user inside a
> > user/net namespace can load this match into a bridge-family chain.
> 
> I do not think this bug is related to nft_compat.
> You can also use ebtables setsockopt api to request xt_realm, no?
> 
> > Fixes: ab4f21e6fb1c ("netfilter: xtables: use NFPROTO_UNSPEC in more extensions")
> 
> Looks correct.  Alternatively we could revert the xt_realm.c change.
> But I don't have a strong opinion here, patch looks correct.

Maybe partial revert makes sense, since in ab4f21e6fb1c:

- xt_MARK: OK
- xt_NOTRACK: OK
- xt_comment: OK
- xt_mac: There is a better way to do this in bridge.
- xt_owner, no sockets in bridge.
- xt_physdev, which makes no sense in bridge, this is for br_netfilter
  only.
- xt_realm (as already mentioned).

That is, a partial revert of this patch for:

- xt_mac
- xt_owner
- xt_physdev
- xt_realm

^ permalink raw reply

* [RFC PATCH net-next] pppoe: drop getsockname() syscall
From: Qingfang Deng @ 2026-04-15  9:16 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Qingfang Deng, Kees Cook, Guillaume Nault,
	Eric Woudstra, netdev, linux-kernel
  Cc: Paul Mackerras, linux-ppp, Jaco Kroon, James Carlson,
	Dianne Skoll, Denys Fedoryshchenko

The getsockname syscall is not used by pppd. It also has two flaws:
1. It does not hold the socket lock, so if the struct is being changed
   in pppoe_connect() simultaneously, it may see partial updates.
2. If the lower network device is renamed, this operation still returns
   the original name.

Given it is unused and buggy, remove the syscall.

Signed-off-by: Qingfang Deng <qingfang.deng@linux.dev>
---
Note: sent as RFC, since net-next is closed.

 drivers/net/ppp/pppoe.c | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index d546a7af0d54..0d64a16715e2 100644
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -688,22 +688,6 @@ static int pppoe_connect(struct socket *sock, struct sockaddr_unsized *uservaddr
 	goto end;
 }
 
-static int pppoe_getname(struct socket *sock, struct sockaddr *uaddr,
-		  int peer)
-{
-	int len = sizeof(struct sockaddr_pppox);
-	struct sockaddr_pppox sp;
-
-	sp.sa_family	= AF_PPPOX;
-	sp.sa_protocol	= PX_PROTO_OE;
-	memcpy(&sp.sa_addr.pppoe, &pppox_sk(sock->sk)->pppoe_pa,
-	       sizeof(struct pppoe_addr));
-
-	memcpy(uaddr, &sp, len);
-
-	return len;
-}
-
 static int pppoe_ioctl(struct socket *sock, unsigned int cmd,
 		unsigned long arg)
 {
@@ -1049,7 +1033,7 @@ static const struct proto_ops pppoe_ops = {
 	.connect	= pppoe_connect,
 	.socketpair	= sock_no_socketpair,
 	.accept		= sock_no_accept,
-	.getname	= pppoe_getname,
+	.getname	= sock_no_getname,
 	.poll		= datagram_poll,
 	.listen		= sock_no_listen,
 	.shutdown	= sock_no_shutdown,
-- 
2.43.0


^ permalink raw reply related

* Re: [Intel-wired-lan] [PATCH iwl-net v5] ice: fix missing dpll notifications for SW pins
From: Ivan Vecera @ 2026-04-15  9:04 UTC (permalink / raw)
  To: Jacob Keller, Michal Schmidt, Petr Oros, netdev
  Cc: Tony Nguyen, Przemek Kitszel, Andrew Lunn, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Arkadiusz Kubalewski, intel-wired-lan, linux-kernel
In-Reply-To: <b40362c7-a749-4915-93a9-08243ab09cb8@intel.com>

On 4/14/26 10:46 PM, Jacob Keller wrote:
> On 4/14/2026 12:16 PM, Michal Schmidt wrote:
>> On 4/9/26 12:25, Petr Oros wrote:
>>> ---
>>>    drivers/net/ethernet/intel/ice/ice_dpll.c | 74 +++++++++++++++++++----
>>>    1 file changed, 63 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/
>>> ethernet/intel/ice/ice_dpll.c
>>> index 3f8cd5b8298b57..d817f17dcf1951 100644
>>> --- a/drivers/net/ethernet/intel/ice/ice_dpll.c
>>> +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c
>>> @@ -1154,6 +1154,30 @@ ice_dpll_input_state_get(const struct dpll_pin
>>> *pin, void *pin_priv,
>>>                          extack, ICE_DPLL_PIN_TYPE_INPUT);
>>>    }
>>>    +/**
>>> + * ice_dpll_sw_pin_notify_peer - notify the paired SW pin after a
>>> state change
>>> + * @d: pointer to dplls struct
>>> + * @changed: the SW pin that was explicitly changed (already notified
>>> by dpll core)
>>> + *
>>> + * SMA and U.FL pins share physical signal paths in pairs (SMA1/U.FL1
>>> and
>>> + * SMA2/U.FL2).  When one pin's routing changes via the PCA9575 GPIO
>>> + * expander, the paired pin's state may also change.  Send a change
>>> + * notification for the peer pin so userspace consumers monitoring the
>>> + * peer via dpll netlink learn about the update.
>>> + *
>>> + * Context: Can be called under pf->dplls.lock, dpll_pin_change_ntf()
>>> is safe.
>>> + */
>>> +static void ice_dpll_sw_pin_notify_peer(struct ice_dplls *d,
>>> +                    struct ice_dpll_pin *changed)
>>> +{
>>> +    struct ice_dpll_pin *peer;
>>> +
>>> +    peer = (changed >= d->sma && changed < d->sma +
>>> ICE_DPLL_PIN_SW_NUM) ?
>>> +        &d->ufl[changed->idx] : &d->sma[changed->idx];
>>> +    if (peer->pin)
>>> +        dpll_pin_change_ntf(peer->pin);
>>> +}
>>> +
>>>    /**
>>>     * ice_dpll_sma_direction_set - set direction of SMA pin
>>>     * @p: pointer to a pin
>>> @@ -1233,6 +1257,8 @@ static int ice_dpll_sma_direction_set(struct
>>> ice_dpll_pin *p,
>>>                ret = ice_dpll_pin_state_update(p->pf, target,
>>>                                type, extack);
>>>        }
>>> +    if (!ret)
>>> +        ice_dpll_sw_pin_notify_peer(d, p);
>>>          return ret;
>>>    }
>>
>> ice_dpll_sma_direction_set() runs to process a DPLL_CMD_PIN_SET command
>> from userspace. It runs with dpll_lock held - taken in dpll_pin_pre_doit().
>> ice_dpll_sw_pin_notify_peer() -> dpll_pin_change_ntf() will take
>> dpll_lock again and deadlock.
>>
> 
> Yep. I think you could use __dpll_pin_change_ntf() which is the version
> that assumes the lock is held.. but that function is not exported
> outside of drivers/dpll.
> 
> Either way, this needs to be fixed somehow before I can apply it.
> 
> Thanks,
> Jake

I'm solving a similar situation, where a setting on one output pin
can also change its sibling pin.

E.g. changing frequency on OUTxP also changes frequency on OUTxN in
certain situations (depending on signal format of the output)...

In such cases it would be useful to also send a notification about the
change on the sibling pin.

Thanks,
Ivan


^ permalink raw reply

* Re: [PATCH] netfilter: xt_realm: fix null-ptr-deref in realm_mt()
From: Florian Westphal @ 2026-04-15  9:02 UTC (permalink / raw)
  To: Kito Xu (veritas501)
  Cc: pablo, phil, davem, edumazet, kuba, pabeni, horms, jengelh, kaber,
	netfilter-devel, coreteam, netdev, linux-kernel
In-Reply-To: <20260415034343.107920-1-hxzene@gmail.com>

Kito Xu (veritas501) <hxzene@gmail.com> wrote:
> realm_mt() unconditionally dereferences skb_dst(skb) without a NULL
> check. The xt_realm match registers with .family = NFPROTO_UNSPEC,
> making it available to all netfilter protocol families. Through the
> nftables compat layer (nft_compat), an unprivileged user inside a
> user/net namespace can load this match into a bridge-family chain.

I do not think this bug is related to nft_compat.
You can also use ebtables setsockopt api to request xt_realm, no?

> Fixes: ab4f21e6fb1c ("netfilter: xtables: use NFPROTO_UNSPEC in more extensions")

Looks correct.  Alternatively we could revert the xt_realm.c change.
But I don't have a strong opinion here, patch looks correct.

^ permalink raw reply

* RE: [PATCH iwl-next v2 2/2] idpf: implement pci error handlers
From: Loktionov, Aleksandr @ 2026-04-15  8:53 UTC (permalink / raw)
  To: Tantilov, Emil S, intel-wired-lan@lists.osuosl.org
  Cc: netdev@vger.kernel.org, Kitszel, Przemyslaw, Bhat, Jay,
	Barrera, Ivan D, Zaremba, Larysa, Nguyen, Anthony L,
	andrew+netdev@lunn.ch, davem@davemloft.net, edumazet@google.com,
	kuba@kernel.org, pabeni@redhat.com, Lobakin, Aleksander,
	linux-pci@vger.kernel.org, Chittim, Madhu, decot@google.com,
	willemb@google.com, sheenamo@google.com, lukas@wunner.de
In-Reply-To: <eb07b21f-0133-40c3-ae86-338572e2a64a@intel.com>



> -----Original Message-----
> From: Tantilov, Emil S <emil.s.tantilov@intel.com>
> Sent: Tuesday, April 14, 2026 5:01 PM
> To: Loktionov, Aleksandr <aleksandr.loktionov@intel.com>; intel-wired-
> lan@lists.osuosl.org
> Cc: netdev@vger.kernel.org; Kitszel, Przemyslaw
> <przemyslaw.kitszel@intel.com>; Bhat, Jay <jay.bhat@intel.com>;
> Barrera, Ivan D <ivan.d.barrera@intel.com>; Zaremba, Larysa
> <larysa.zaremba@intel.com>; Nguyen, Anthony L
> <anthony.l.nguyen@intel.com>; andrew+netdev@lunn.ch;
> davem@davemloft.net; edumazet@google.com; kuba@kernel.org;
> pabeni@redhat.com; Lobakin, Aleksander <aleksander.lobakin@intel.com>;
> linux-pci@vger.kernel.org; Chittim, Madhu <madhu.chittim@intel.com>;
> decot@google.com; willemb@google.com; sheenamo@google.com;
> lukas@wunner.de
> Subject: Re: [PATCH iwl-next v2 2/2] idpf: implement pci error
> handlers
> 
> 
> 
> On 4/14/2026 4:09 AM, Loktionov, Aleksandr wrote:
> >
> >
> >> -----Original Message-----
> >> From: Tantilov, Emil S <emil.s.tantilov@intel.com>
> >> Sent: Tuesday, April 14, 2026 5:17 AM
> >> To: intel-wired-lan@lists.osuosl.org
> >> Cc: netdev@vger.kernel.org; Kitszel, Przemyslaw
> >> <przemyslaw.kitszel@intel.com>; Bhat, Jay <jay.bhat@intel.com>;
> >> Barrera, Ivan D <ivan.d.barrera@intel.com>; Loktionov, Aleksandr
> >> <aleksandr.loktionov@intel.com>; Zaremba, Larysa
> >> <larysa.zaremba@intel.com>; Nguyen, Anthony L
> >> <anthony.l.nguyen@intel.com>; andrew+netdev@lunn.ch;
> >> davem@davemloft.net; edumazet@google.com; kuba@kernel.org;
> >> pabeni@redhat.com; Lobakin, Aleksander
> >> <aleksander.lobakin@intel.com>; linux-pci@vger.kernel.org; Chittim,
> >> Madhu <madhu.chittim@intel.com>; decot@google.com;
> >> willemb@google.com; sheenamo@google.com; lukas@wunner.de
> >> Subject: [PATCH iwl-next v2 2/2] idpf: implement pci error handlers
> >>
> >> Add callbacks to handle PCI errors and FLR reset. When preparing to
> >> handle reset on the bus, the driver must stop all operations that
> can
> >> lead to MMIO access in order to prevent HW errors. To accomplish
> this
> >> introduce helper
> >> idpf_reset_prepare() that gets called prior to FLR or when PCI
> error
> >> is detected. Upon resume the recovery is done through the existing
> >> reset path by starting the event task.
> >>
> >> The following callbacks are implemented:
> >> .reset_prepare runs the first portion of the generic reset path
> >> leading up to the part where we wait for the reset to complete.
> >> .reset_done/resume runs the recovery part of the reset handling.
> >> .error_detected is the callback dealing with PCI errors, similar to
> >> the prepare call, we stop all operations, prior to attempting a
> >> recovery.
> >> .slot_reset is the callback attempting to restore the device,
> >> provided a PCI reset was initiated by the AER driver.
> >>
> >> Whereas previously the init logic guaranteed netdevs during reset,
> >> the addition of idpf_detach_and_close() to the PCI callbacks flow
> >> makes it possible for the function to be called without netdevs.
> Add
> >> check to avoid NULL pointer dereference in that case.
> >>
> >> Co-developed-by: Alan Brady <alan.brady@intel.com>
> >> Signed-off-by: Alan Brady <alan.brady@intel.com>
> >> Signed-off-by: Emil Tantilov <emil.s.tantilov@intel.com>
> >> Reviewed-by: Jay Bhat <jay.bhat@intel.com>
> >> Reviewed-by: Madhu Chittim <madhu.chittim@intel.com>
> >> ---
> >>   drivers/net/ethernet/intel/idpf/idpf.h      |   3 +
> >>   drivers/net/ethernet/intel/idpf/idpf_lib.c  |  13 ++-
> >> drivers/net/ethernet/intel/idpf/idpf_main.c | 112
> ++++++++++++++++++++
> >>   3 files changed, 126 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/drivers/net/ethernet/intel/idpf/idpf.h
> >> b/drivers/net/ethernet/intel/idpf/idpf.h
> >> index 1d0e32e47e87..164d2f3e233a 100644
> >> --- a/drivers/net/ethernet/intel/idpf/idpf.h
> >> +++ b/drivers/net/ethernet/intel/idpf/idpf.h
> >> @@ -88,6 +88,7 @@ enum idpf_state {
> >>    * @IDPF_REMOVE_IN_PROG: Driver remove in progress
> >>    * @IDPF_MB_INTR_MODE: Mailbox in interrupt mode
> >>    * @IDPF_VC_CORE_INIT: virtchnl core has been init
> >> + * @IDPF_PCI_CB_RESET: Reset via the PCI callbacks
> >>    * @IDPF_FLAGS_NBITS: Must be last
> >>    */
> >>   enum idpf_flags {
> >> @@ -97,6 +98,7 @@ enum idpf_flags {
> >>   	IDPF_REMOVE_IN_PROG,
> >>   	IDPF_MB_INTR_MODE,
> >>   	IDPF_VC_CORE_INIT,
> >
> > ...
> >
> >> +/**
> >> + * idpf_pci_err_resume - Resume operations after PCI error
> recovery
> >> + * @pdev: PCI device struct
> >> + */
> >> +static void idpf_pci_err_resume(struct pci_dev *pdev) {
> >> +	struct idpf_adapter *adapter = pci_get_drvdata(pdev);
> >> +
> >> +	/* Force a PFR when resuming from PCI error. */
> >> +	if (test_and_set_bit(IDPF_PCI_CB_RESET, adapter->flags))
> >> +		adapter->dev_ops.reg_ops.trigger_reset(adapter,
> >> IDPF_HR_FUNC_RESET);
> > You say "Force a PFR", but PFR is only triggered on the AER path,
> not on the FLR path.
> 
> Hence the "force" - the call to `trigger_reset` results in a PFR and
> is only needed in the case of a PCI error. If this function was called
> because a user issued an FLR, the kernel will trigger it for us. This
> way we can reuse the reset handling path to restore the operation of
> the netdevs.
> 
> Though I may be misunderstanding - are you referring to the wording or
> the logic?
At first glance the comment looks misleading to me.
Please consider rewording.

> 
> Thanks,
> Emil
> 
> >
> > Everything else looks fine
> > Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
> >
> >> +
> >> +	queue_delayed_work(adapter->vc_event_wq,
> >> +			   &adapter->vc_event_task,
> >> +			   msecs_to_jiffies(300));
> >> +}
> >
> > ...
> >
> >>   };
> >>   module_pci_driver(idpf_driver);
> >> --
> >> 2.37.3
> >


^ permalink raw reply

* [PATCH v3] net: wwan: t7xx: validate port_count against message length in t7xx_port_enum_msg_handler
From: Pavitra Jha @ 2026-04-15  8:47 UTC (permalink / raw)
  To: w; +Cc: pabeni, chandrashekar.devegowda, linux-wwan, netdev, stable,
	Pavitra Jha
In-Reply-To: <ad5p7XlSOKoaQC5D@1wt.eu>

t7xx_port_enum_msg_handler() uses the modem-supplied port_count field as
a loop bound over port_msg->data[] without checking that the message buffer
contains sufficient data. A modem sending port_count=65535 in a 12-byte
buffer triggers a slab-out-of-bounds read of up to 262140 bytes.

Add a struct_size() check after extracting port_count and before the loop.
Pass msg_len to t7xx_port_enum_msg_handler() and use it to validate
the message size before accessing port_msg->data[].
Pass msg_len from both call sites: skb->len at the DPMAIF path after
skb_pull(), and the captured rt_feature->data_len at the handshake path.

Fixes: 39d439047f1d ("net: wwan: t7xx: Add control DMA interface")
Cc: stable@vger.kernel.org
Signed-off-by: Pavitra Jha <jhapavitra98@gmail.com>
---
 drivers/net/wwan/t7xx/t7xx_modem_ops.c     | 14 +++++++-------
 drivers/net/wwan/t7xx/t7xx_port_ctrl_msg.c | 12 +++++++++---
 drivers/net/wwan/t7xx/t7xx_port_proxy.h    |  2 +-
 3 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/drivers/net/wwan/t7xx/t7xx_modem_ops.c b/drivers/net/wwan/t7xx/t7xx_modem_ops.c
index 7968e208d..d0559fe16 100644
--- a/drivers/net/wwan/t7xx/t7xx_modem_ops.c
+++ b/drivers/net/wwan/t7xx/t7xx_modem_ops.c
@@ -453,25 +453,25 @@ static int t7xx_parse_host_rt_data(struct t7xx_fsm_ctl *ctl, struct t7xx_sys_inf
 {
 	enum mtk_feature_support_type ft_spt_st, ft_spt_cfg;
 	struct mtk_runtime_feature *rt_feature;
+	size_t feat_data_len;
 	int i, offset;
 
 	offset = sizeof(struct feature_query);
 	for (i = 0; i < FEATURE_COUNT && offset < data_length; i++) {
 		rt_feature = data + offset;
-		offset += sizeof(*rt_feature) + le32_to_cpu(rt_feature->data_len);
-
+		feat_data_len = le32_to_cpu(rt_feature->data_len);
+		offset += sizeof(*rt_feature) + feat_data_len;
 		ft_spt_cfg = FIELD_GET(FEATURE_MSK, core->feature_set[i]);
 		if (ft_spt_cfg != MTK_FEATURE_MUST_BE_SUPPORTED)
 			continue;
 
 		ft_spt_st = FIELD_GET(FEATURE_MSK, rt_feature->support_info);
 		if (ft_spt_st != MTK_FEATURE_MUST_BE_SUPPORTED)
 			return -EINVAL;
 
-		if (i == RT_ID_MD_PORT_ENUM || i == RT_ID_AP_PORT_ENUM)
-			t7xx_port_enum_msg_handler(ctl->md, rt_feature->data);
+		if (i == RT_ID_MD_PORT_ENUM || i == RT_ID_AP_PORT_ENUM) {
+			t7xx_port_enum_msg_handler(ctl->md, rt_feature->data,
+						   feat_data_len);
+		}
 	}
 
 	return 0;
 }
 
diff --git a/drivers/net/wwan/t7xx/t7xx_port_ctrl_msg.c b/drivers/net/wwan/t7xx/t7xx_port_ctrl_msg.c
index ae632ef96..d984a688d 100644
--- a/drivers/net/wwan/t7xx/t7xx_port_ctrl_msg.c
+++ b/drivers/net/wwan/t7xx/t7xx_port_ctrl_msg.c
@@ -124,7 +124,7 @@ static int fsm_ee_message_handler(struct t7xx_port *port, struct t7xx_fsm_ctl *c
  * * 0		- Success.
  * * -EFAULT	- Message check failure.
  */
-int t7xx_port_enum_msg_handler(struct t7xx_modem *md, void *msg)
+int t7xx_port_enum_msg_handler(struct t7xx_modem *md, void *msg, size_t msg_len)
 {
 	struct device *dev = &md->t7xx_dev->pdev->dev;
 	unsigned int version, port_count, i;
@@ -141,6 +141,13 @@ int t7xx_port_enum_msg_handler(struct t7xx_modem *md, void *msg)
 	}
 
 	port_count = FIELD_GET(PORT_MSG_PRT_CNT, le32_to_cpu(port_msg->info));
+
+	if (msg_len < struct_size(port_msg, data, port_count)) {
+		dev_err(dev, "Port enum msg too short: need %zu, have %zu\n",
+			struct_size(port_msg, data, port_count), msg_len);
+		return -EINVAL;
+	}
+
 	for (i = 0; i < port_count; i++) {
 		u32 port_info = le32_to_cpu(port_msg->data[i]);
 		unsigned int ch_id;
@@ -154,7 +161,6 @@ int t7xx_port_enum_msg_handler(struct t7xx_modem *md, void *msg)
 
 	return 0;
 }
 
 static int control_msg_handler(struct t7xx_port *port, struct sk_buff *skb)
 {
 	const struct t7xx_port_conf *port_conf = port->port_conf;
@@ -191,7 +197,7 @@ static int control_msg_handler(struct t7xx_port *port, struct sk_buff *skb)
 
 	case CTL_ID_PORT_ENUM:
 		skb_pull(skb, sizeof(*ctrl_msg_h));
-		ret = t7xx_port_enum_msg_handler(ctl->md, (struct port_msg *)skb->data);
+		ret = t7xx_port_enum_msg_handler(ctl->md, (struct port_msg *)skb->data, skb->len);
 		if (!ret)
 			ret = port_ctl_send_msg_to_md(port, CTL_ID_PORT_ENUM, 0);
 		else
diff --git a/drivers/net/wwan/t7xx/t7xx_port_proxy.h b/drivers/net/wwan/t7xx/t7xx_port_proxy.h
index f0918b36e..7c3190bf0 100644
--- a/drivers/net/wwan/t7xx/t7xx_port_proxy.h
+++ b/drivers/net/wwan/t7xx/t7xx_port_proxy.h
@@ -103,7 +103,7 @@ void t7xx_port_proxy_reset(struct port_proxy *port_prox);
 void t7xx_port_proxy_uninit(struct port_proxy *port_prox);
 int t7xx_port_proxy_init(struct t7xx_modem *md);
 void t7xx_port_proxy_md_status_notify(struct port_proxy *port_prox, unsigned int state);
-int t7xx_port_enum_msg_handler(struct t7xx_modem *md, void *msg);
+int t7xx_port_enum_msg_handler(struct t7xx_modem *md, void *msg, size_t msg_len);
 int t7xx_port_proxy_chl_enable_disable(struct port_proxy *port_prox, unsigned int ch_id,
 				       bool en_flag);
 void t7xx_port_proxy_set_cfg(struct t7xx_modem *md, enum port_cfg_id cfg_id);
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH 1/1] xskmap: reject TX-only AF_XDP sockets
From: Jason Xing @ 2026-04-15  8:43 UTC (permalink / raw)
  To: Linpu Yu
  Cc: magnus.karlsson, maciej.fijalkowski, netdev, bpf, sdf, davem,
	edumazet, kuba, pabeni, horms, ast, daniel, hawk, john.fastabend,
	bjorn, linux-kernel, yuantan098, yifanwucs
In-Reply-To: <6584463e576b7bb3619dc302cfecfb8ca56bc86a.1774701288.git.linpu5433@gmail.com>

On Mon, Mar 30, 2026 at 3:33 AM Linpu Yu <linpu5433@gmail.com> wrote:
>
> Reject TX-only AF_XDP sockets from XSKMAP updates. Redirected
> packets always enter the Rx path, where the kernel expects the
> selected socket to have an Rx ring. A TX-only socket can
> currently be inserted into an XSKMAP, and redirecting a packet
> to it crashes the kernel in xsk_generic_rcv().
>
> Keep TX-only AF_XDP sockets valid for pure Tx use, but prevent
> them from being published through XSKMAP.
>
> Fixes: fbfc504a24f5 ("bpf: introduce new bpf AF_XDP map type BPF_MAP_TYPE_XSKMAP")
> Reported-by: Juefei Pu <tomapufckgml@gmail.com>
> Reported-by: Yuan Tan <yuantan098@gmail.com>
> Signed-off-by: Xin Liu <bird@lzu.edu.cn>
> Signed-off-by: Yifan Wu <yifanwucs@gmail.com>
> Signed-off-by: Linpu Yu <linpu5433@gmail.com>

Hi Linpu,

Any plan to post a v2 with our questions resolved?

Thanks,
Jason

^ permalink raw reply

* Re: [PATCH v2] vsock/virtio: fix accept queue count leak on transport mismatch
From: Luigi Leonardi @ 2026-04-15  8:44 UTC (permalink / raw)
  To: Dudu Lu; +Cc: netdev, stefanha, sgarzare, mst, jasowang
In-Reply-To: <20260413131409.19022-1-phx0fer@gmail.com>

On Mon, Apr 13, 2026 at 09:14:09PM +0800, Dudu Lu wrote:
>virtio_transport_recv_listen() calls sk_acceptq_added() before
>vsock_assign_transport(). If vsock_assign_transport() fails or
>selects a different transport, the error path returns without
>calling sk_acceptq_removed(), permanently incrementing
>sk_ack_backlog.
>
>After approximately backlog+1 such failures, sk_acceptq_is_full()
>returns true, causing the listener to reject all new connections.
>
>Fix by moving sk_acceptq_added() to after the transport validation,
>matching the pattern used by vmci_transport and hyperv_transport.
>
>Fixes: c0cfa2d8a788 ("vsock: add multi-transports support")
>Signed-off-by: Dudu Lu <phx0fer@gmail.com>
>---

You forgot to add the `net` tag to the patch.

Other than that, the code LGTM and testing passed.

Reviewed-by: Luigi Leonardi <leonardi@redhat.com>

> net/vmw_vsock/virtio_transport_common.c | 3 +--
> 1 file changed, 1 insertion(+), 2 deletions(-)
>
>diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>index 8a9fb23c6e85..e01d983488e5 100644
>--- a/net/vmw_vsock/virtio_transport_common.c
>+++ b/net/vmw_vsock/virtio_transport_common.c
>@@ -1560,8 +1560,6 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb,
> 		return -ENOMEM;
> 	}
>
>-	sk_acceptq_added(sk);
>-
> 	lock_sock_nested(child, SINGLE_DEPTH_NESTING);
>
> 	child->sk_state = TCP_ESTABLISHED;
>@@ -1583,6 +1581,7 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb,
> 		return ret;
> 	}
>
>+	sk_acceptq_added(sk);
> 	if (virtio_transport_space_update(child, skb))
> 		child->sk_write_space(child);
>
>-- 
>2.39.3 (Apple Git-145)
>


^ permalink raw reply

* Re: [PATCH iwl-next 1/1] igc: add support for forcing link speed without autonegotiation
From: Abdul Rahim, Faizal @ 2026-04-15  8:37 UTC (permalink / raw)
  To: Simon Horman, khai.wen.tan
  Cc: anthony.l.nguyen, przemyslaw.kitszel, andrew+netdev, davem,
	edumazet, kuba, pabeni, intel-wired-lan, netdev, linux-kernel,
	faizal.abdul.rahim, hector.blanco.alcaine, hong.aun.looi,
	khai.wen.tan
In-Reply-To: <20260413145215.718532-1-horms@kernel.org>



On 13/4/2026 10:52 pm, Simon Horman wrote:
> From: 'Simon Horman' <horms@kernel.org>
> 
> This is an AI-generated review of your patch. The human sending this
> email has considered the AI review valid, or at least plausible.
> Full review at: https://sashiko.dev
> ---
> igc: add support for forcing link speed without autonegotiation
> 
> This commit allows forcing 10/100 Mb/s link speed and duplex settings via
> ethtool when autonegotiation is disabled. It configures the MAC CTRL and
> PHY registers to bypass autonegotiation and apply the requested settings.
> 
>> diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c
>> index 0122009bedd03..e14771532dad8 100644
>> --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c
>> +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c
> [ ... ]
>> +static int igc_handle_autoneg_disabled(struct net_device *netdev, u32 speed,
>> +				       u8 duplex)
>> +{
>> +	struct igc_adapter *adapter = netdev_priv(netdev);
>> +	struct igc_mac_info *mac = &adapter->hw.mac;
>> +	enum igc_forced_speed_duplex forced_speed_duplex;
>>   
>> +	switch (speed) {
>> +	case SPEED_10:
>> +		forced_speed_duplex = (duplex == DUPLEX_FULL) ? IGC_FORCED_10F : IGC_FORCED_10H;
>> +		break;
>> +	case SPEED_100:
>> +		forced_speed_duplex = (duplex == DUPLEX_FULL) ? IGC_FORCED_100F : IGC_FORCED_100H;
>> +		break;
> 
> Does forcing a half-duplex configuration here also require setting flow
> control to none (e.g., hw->fc.requested_mode = igc_fc_none)?
> 
>> diff --git a/drivers/net/ethernet/intel/igc/igc_mac.c b/drivers/net/ethernet/intel/igc/igc_mac.c
>> index 7ac6637f8db73..16b0f2db06896 100644
>> --- a/drivers/net/ethernet/intel/igc/igc_mac.c
>> +++ b/drivers/net/ethernet/intel/igc/igc_mac.c
>> @@ -452,7 +452,7 @@ s32 igc_config_fc_after_link_up(struct igc_hw *hw)
>>   	 * so we had to force link.  In this case, we need to force the
>>   	 * configuration of the MAC to match the "fc" parameter.
>>   	 */
>> -	if (mac->autoneg_failed)
>> +	if (mac->autoneg_failed || !mac->autoneg_enabled)
>>   		ret_val = igc_force_mac_fc(hw);
> 
> If mac->autoneg_enabled is false, igc_force_mac_fc() will configure the MAC
> using the existing hw->fc.current_mode. If current_mode defaults to full flow
> control, this will enable PAUSE frames.
> 
> Since autonegotiation is disabled, the MII status check further down in this
> same function will fail and return early:
> 
> drivers/net/ethernet/intel/igc/igc_mac.c:igc_config_fc_after_link_up() {
>      ...
>      if (!(mii_status_reg & MII_SR_AUTONEG_COMPLETE)) {
>          hw_dbg("Copper PHY and Auto Neg has not completed.\n");
>          goto out;
>      }
>      ...
> }
> 
> This early return skips the half-duplex safety check at the end of
> igc_config_fc_after_link_up() which would normally clear the flow control
> settings:
> 
> drivers/net/ethernet/intel/igc/igc_mac.c:igc_config_fc_after_link_up() {
>      ...
>      if (duplex == HALF_DUPLEX)
>          hw->fc.current_mode = igc_fc_none;
> 
>      ret_val = igc_force_mac_fc(hw);
>      ...
> }
> 
> Does this leave the MAC incorrectly configured to send and receive PAUSE
> frames on a half-duplex link, violating the IEEE 802.3 specification?

Hi Simon,
Thanks for the review.

Yes, you're right, I missed that interaction.

I’ll update the code to also set the following in 
igc_handle_autoneg_disabled() when forcing half‑duplex:
hw->fc.requested_mode = igc_fc_none

I’ll test it and, if everything looks good, send out v2.

Thanks again!




^ permalink raw reply

* [PATCH v12 net-next 00/11] nbl driver for Nebulamatrix NICs
From: illusion.wang @ 2026-04-15  8:29 UTC (permalink / raw)
  To: netdev; +Cc: illusion.wang, open list
In-Reply-To: <20260415033608.2438-2-illusion.wang@nebula-matrix.com>

This patch series represents the first phase. We plan to integrate it in
two phases: the first phase covers mailbox and chip configuration,
while the second phase involves net dev configuration.
Together, they will provide basic PF-based Ethernet port transmission and
reception capabilities.

After that, we will consider other features, such as ethtool support,
flow management, adminq messaging, VF support, debugfs support, etc.

changes v11->v12
Link to v11:https://lore.kernel.org/netdev/20260408093739.56001-1-illusion.wang@nebula-matrix.com/
AI review issues
changes v10->v11
Link to v10:https://lore.kernel.org/netdev/20260401022318.28550-1-illusion.wang@nebula-matrix.com/
1.Issues found by Mohsin
2.AI review issues
changes v9->v10
Link to v9:https://lore.kernel.org/netdev/20260325040048.2313-1-illusion.wang@nebula-matrix.com/
1.Issues found by Jakub
2.AI review issue
changes v8->v9
Link to v8:https://lore.kernel.org/netdev/20260317034533.5600-1-illusion.wang@nebula-matrix.com/
1.Issues found by Jakub
2.AI review issue
Changes v7→v8
Link to v7:https://lore.kernel.org/netdev/20260310120959.22015-1-illusion.wang@nebula-matrix.com/
1.Issues found by Paolo
Changes v6->v7
Link to v6:https://lore.kernel.org/netdev/20260306033451.5196-1-illusion.wang@nebula-matrix.com/
1.Issue found by Jakub
2.AI review issue
Changes v5->v6
Link to V5:https://lore.kernel.org/netdev/20260226073840.3222-1-illusion.wang@nebula-matrix.com/
1.put all standard Linux include files in the .c file which needs them & others
--Andrew
2.AI review issue
Changes v4->v5
Link to V4:https://lore.kernel.org/netdev/20260206021608.85381-1-illusion.wang@nebula-matrix.com/
1.change nbl_core to nbl & change ** pointers to *pointers & others
--Andrew
2.AI review issue
Changes v3->v4
Link to v3: https://lore.kernel.org/netdev/20260123011804.31263-1-illusion.wang@nebula-matrix.com
1.cut down to part of a mini driver(mailbox and chip init)
--Jakub Kicinski Simon Horman(some sort of staged approached)
2.modify issues found by ai.
3. Reverse Christmas tree/nbl_err/devm_kfree/remove some macros/
void type to real type/others
--Andrew Lunn
4.change deprecated pci_enable_msix_range to pci_alloc_irq_vectors
5.delete service layer
6.the style of kconfig---Randy Dunlap
7.add to Documentation/networking/device_drivers/ethernet/index.rst
--Simon Horman
Changes v2 →v3
Link to v2: https://lore.kernel.org/netdev/20260109100146.63569-1-illusion.wang@nebula-matrix.com/
1.cut down to a mini driver:
    delete vf support
    use promisc mode to cut down flow management
    drop patch15 in v2
    delete adminq msg
    delete abnormal handling
    delete some unimportant interfaces
2.modify issues found by ai review
Changes v1->v2
Link to v1: https://lore.kernel.org/netdev/20251223035113.31122-1-illusion.wang@nebula-matrix.com/
1.Format Issues and Compilation Issues
- Paolo Abeni
2.add sysfs patch and drop coexisting patch
- Andrew Lunn
3.delete some unimportant ndo operations
4.add machine generated headers patch
5.Modify the issues found in patch1-2 and apply the same fixes to other
patches
6.modify issues found by nipa

illusion.wang (11):
  net/nebula-matrix: add minimum nbl build framework
  net/nebula-matrix: add our driver architecture
  net/nebula-matrix: add chip related definitions
  net/nebula-matrix: channel msg value and msg struct
  net/nebula-matrix: add channel layer
  net/nebula-matrix: add common resource implementation
  net/nebula-matrix: add intr resource implementation
  net/nebula-matrix: add vsi resource implementation
  net/nebula-matrix: add Dispatch layer implementation
  net/nebula-matrix: add common/ctrl dev init/reinit operation
  net/nebula-matrix: add common dev start/stop operation

 .../device_drivers/ethernet/index.rst         |    1 +
 .../ethernet/nebula-matrix/nbl.rst            |   27 +
 MAINTAINERS                                   |   10 +
 drivers/net/ethernet/Kconfig                  |    1 +
 drivers/net/ethernet/Makefile                 |    1 +
 drivers/net/ethernet/nebula-matrix/Kconfig    |   34 +
 drivers/net/ethernet/nebula-matrix/Makefile   |    6 +
 .../net/ethernet/nebula-matrix/nbl/Makefile   |   16 +
 .../nbl/nbl_channel/nbl_channel.c             |  843 +++++
 .../nbl/nbl_channel/nbl_channel.h             |  163 +
 .../nebula-matrix/nbl/nbl_common/nbl_common.c |  214 ++
 .../nebula-matrix/nbl/nbl_common/nbl_common.h |   33 +
 .../net/ethernet/nebula-matrix/nbl/nbl_core.h |   59 +
 .../nebula-matrix/nbl/nbl_core/nbl_dev.c      |  441 +++
 .../nebula-matrix/nbl/nbl_core/nbl_dev.h      |   58 +
 .../nebula-matrix/nbl/nbl_core/nbl_dispatch.c |  487 +++
 .../nebula-matrix/nbl/nbl_core/nbl_dispatch.h |   56 +
 .../nbl/nbl_hw/nbl_hw_leonis/nbl_hw_leonis.c  |  797 +++++
 .../nbl/nbl_hw/nbl_hw_leonis/nbl_hw_leonis.h  |  493 +++
 .../nbl_hw/nbl_hw_leonis/nbl_hw_leonis_regs.c | 2901 +++++++++++++++++
 .../nbl_hw/nbl_hw_leonis/nbl_hw_leonis_regs.h |   11 +
 .../nbl_hw_leonis/nbl_resource_leonis.c       |  264 ++
 .../nbl_hw_leonis/nbl_resource_leonis.h       |   10 +
 .../nebula-matrix/nbl/nbl_hw/nbl_hw_reg.h     |   68 +
 .../nebula-matrix/nbl/nbl_hw/nbl_interrupt.c  |  246 ++
 .../nebula-matrix/nbl/nbl_hw/nbl_interrupt.h  |   12 +
 .../nebula-matrix/nbl/nbl_hw/nbl_resource.c   |  118 +
 .../nebula-matrix/nbl/nbl_hw/nbl_resource.h   |  116 +
 .../nebula-matrix/nbl/nbl_hw/nbl_vsi.c        |   51 +
 .../nebula-matrix/nbl/nbl_hw/nbl_vsi.h        |   11 +
 .../nbl/nbl_include/nbl_def_channel.h         |  362 ++
 .../nbl/nbl_include/nbl_def_common.h          |   80 +
 .../nbl/nbl_include/nbl_def_dev.h             |   16 +
 .../nbl/nbl_include/nbl_def_dispatch.h        |   42 +
 .../nbl/nbl_include/nbl_def_hw.h              |   54 +
 .../nbl/nbl_include/nbl_def_resource.h        |   37 +
 .../nbl/nbl_include/nbl_include.h             |   79 +
 .../nbl/nbl_include/nbl_product_base.h        |   19 +
 .../net/ethernet/nebula-matrix/nbl/nbl_main.c |  329 ++
 39 files changed, 8566 insertions(+)
 create mode 100644 Documentation/networking/device_drivers/ethernet/nebula-matrix/nbl.rst
 create mode 100644 drivers/net/ethernet/nebula-matrix/Kconfig
 create mode 100644 drivers/net/ethernet/nebula-matrix/Makefile
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/Makefile
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_channel/nbl_channel.c
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_channel/nbl_channel.h
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_common/nbl_common.c
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_common/nbl_common.h
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_core.h
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_core/nbl_dev.c
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_core/nbl_dev.h
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_core/nbl_dispatch.c
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_core/nbl_dispatch.h
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_hw/nbl_hw_leonis/nbl_hw_leonis.c
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_hw/nbl_hw_leonis/nbl_hw_leonis.h
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_hw/nbl_hw_leonis/nbl_hw_leonis_regs.c
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_hw/nbl_hw_leonis/nbl_hw_leonis_regs.h
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_hw/nbl_hw_leonis/nbl_resource_leonis.c
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_hw/nbl_hw_leonis/nbl_resource_leonis.h
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_hw/nbl_hw_reg.h
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_hw/nbl_interrupt.c
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_hw/nbl_interrupt.h
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_hw/nbl_resource.c
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_hw/nbl_resource.h
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_hw/nbl_vsi.c
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_hw/nbl_vsi.h
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_include/nbl_def_channel.h
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_include/nbl_def_common.h
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_include/nbl_def_dev.h
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_include/nbl_def_dispatch.h
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_include/nbl_def_hw.h
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_include/nbl_def_resource.h
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_include/nbl_include.h
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_include/nbl_product_base.h
 create mode 100644 drivers/net/ethernet/nebula-matrix/nbl/nbl_main.c

-- 
2.47.3


^ permalink raw reply

* [PATCH RFC net-next v4 14/14] xsk: optimize xsk_build_skb for batch copy-mode fast path
From: Jason Xing @ 2026-04-15  8:26 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, sdf, ast, daniel, hawk,
	john.fastabend
  Cc: bpf, netdev, Jason Xing
In-Reply-To: <20260415082654.21026-1-kerneljasonxing@gmail.com>

From: Jason Xing <kernelxing@tencent.com>

Three targeted optimizations for the batch copy-mode TX hot path:

Replace skb_store_bits() with memcpy() for single-buffer first-desc
path.  After skb_reserve() + skb_put(), the SKB is freshly allocated
with all data in the linear area and no frags, so skb_store_bits()
degenerates to memcpy(skb->data, buffer, len) but carries unnecessary
function call overhead, offset validation, and frag iteration logic.

Inline UMEM address computation in Phase 3 and pass the pre-computed
buffer pointer to xsk_build_skb(), avoiding the per-packet non-inlined
xp_raw_get_data() (EXPORT_SYMBOL) call chain:
xsk_buff_raw_get_data -> xp_raw_get_data -> __xp_raw_get_addr +
__xp_raw_get_data.
In the batch loop the pool->addrs and pool->unaligned are invariant,
so we cache them once and compute each buffer address inline.

Prefetch the *next* descriptor's UMEM data buffer at the top of the
Phase 3 loop, hiding the memory latency of the upcoming memcpy.

This consistently improves performance by 3-4%.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
 include/net/xdp_sock.h |  3 ++-
 net/core/skbuff.c      | 18 ++++++++++++++++--
 net/xdp/xsk.c          | 15 ++++++---------
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 0609e3b04279..5e05236c7fba 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -139,7 +139,8 @@ void __xsk_map_flush(struct list_head *flush_list);
 INDIRECT_CALLABLE_DECLARE(void xsk_destruct_skb(struct sk_buff *));
 struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 			      struct sk_buff *allocated_skb,
-			      struct xdp_desc *desc);
+			      struct xdp_desc *desc,
+			      void *buffer);
 int xsk_alloc_batch_skb(struct xdp_sock *xs, u32 nb_pkts, u32 nb_descs, int *err);
 int xsk_direct_xmit_batch(struct xdp_sock *xs, struct net_device *dev);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 5726b1566b2b..bef5270e6332 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -752,14 +752,28 @@ int xsk_alloc_batch_skb(struct xdp_sock *xs, u32 nb_pkts, u32 nb_descs, int *err
 	if (total_truesize)
 		refcount_add(total_truesize, &xs->sk.sk_wmem_alloc);
 
-	/* Phase 3: Build SKBs with packet data */
+	/* Phase 3: Build SKBs with packet data. */
+	struct xsk_buff_pool *pool = xs->pool;
+	void *pool_addrs = pool->addrs;
+	bool unaligned = pool->unaligned;
+
 	for (j = 0; j < alloc_descs; j++) {
+		u64 addr = descs[j].addr;
+		void *buffer;
+
+		if (unaligned)
+			addr = xp_unaligned_add_offset_to_addr(addr);
+		buffer = pool_addrs + addr;
+
+		if (j + 1 < alloc_descs)
+			prefetch(pool_addrs + descs[j + 1].addr);
+
 		if (!xs->skb) {
 			skb = skbs[skb_count - 1 - k];
 			k++;
 		}
 
-		skb = xsk_build_skb(xs, skb, &descs[j]);
+		skb = xsk_build_skb(xs, skb, &descs[j], buffer);
 		if (IS_ERR(skb)) {
 			*err = PTR_ERR(skb);
 			break;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index be341290e42c..3bf81b838075 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -811,7 +811,8 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 
 struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 			      struct sk_buff *allocated_skb,
-			      struct xdp_desc *desc)
+			      struct xdp_desc *desc,
+			      void *buffer)
 {
 	struct net_device *dev = xs->dev;
 	struct sk_buff *skb = xs->skb;
@@ -825,11 +826,10 @@ struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 			goto free_err;
 		}
 	} else {
-		u32 hr, tr, len;
-		void *buffer;
+		u32 hr, tr, len = desc->len;
 
-		buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
-		len = desc->len;
+		if (!buffer)
+			buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
 
 		if (!skb) {
 			hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
@@ -844,10 +844,7 @@ struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 
 			skb_reserve(skb, hr);
 			skb_put(skb, len);
-
-			err = skb_store_bits(skb, 0, buffer, len);
-			if (unlikely(err))
-				goto free_err;
+			memcpy(skb->data, buffer, len);
 
 			xsk_skb_init_misc(skb, xs, desc->addr);
 			if (desc->options & XDP_TX_METADATA) {
-- 
2.41.3


^ permalink raw reply related

* [PATCH RFC net-next v4 13/14] xsk: retire old xmit path in copy mode
From: Jason Xing @ 2026-04-15  8:26 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, sdf, ast, daniel, hawk,
	john.fastabend
  Cc: bpf, netdev, Jason Xing
In-Reply-To: <20260415082654.21026-1-kerneljasonxing@gmail.com>

From: Jason Xing <kernelxing@tencent.com>

Add a new helper xsk_init_batch() used in xsk_create() with the default
value 1.

Obsolete __xsk_generic_xmit.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
 net/xdp/xsk.c | 151 +++++++++++++-------------------------------------
 1 file changed, 37 insertions(+), 114 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index e1ad2ac2b39a..be341290e42c 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -1036,101 +1036,14 @@ static int __xsk_generic_xmit_batch(struct xdp_sock *xs)
 	return err;
 }
 
-static int __xsk_generic_xmit(struct sock *sk)
-{
-	struct xdp_sock *xs = xdp_sk(sk);
-	bool sent_frame = false;
-	struct xdp_desc desc;
-	struct sk_buff *skb;
-	u32 max_batch;
-	int err = 0;
-
-	mutex_lock(&xs->mutex);
-
-	/* Since we dropped the RCU read lock, the socket state might have changed. */
-	if (unlikely(!xsk_is_bound(xs))) {
-		err = -ENXIO;
-		goto out;
-	}
-
-	if (xs->queue_id >= xs->dev->real_num_tx_queues)
-		goto out;
-
-	max_batch = READ_ONCE(xs->max_tx_budget);
-	while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
-		if (max_batch-- == 0) {
-			err = -EAGAIN;
-			goto out;
-		}
-
-		/* This is the backpressure mechanism for the Tx path.
-		 * Reserve space in the completion queue and only proceed
-		 * if there is space in it. This avoids having to implement
-		 * any buffering in the Tx path.
-		 */
-		if (!xsk_cq_reserve_locked(xs->pool, 1)) {
-			err = -EAGAIN;
-			goto out;
-		}
-
-		skb = xsk_build_skb(xs, NULL, &desc);
-		if (IS_ERR(skb)) {
-			err = PTR_ERR(skb);
-			if (err != -EOVERFLOW)
-				goto out;
-			err = 0;
-			continue;
-		}
-
-		xskq_cons_release(xs->tx);
-
-		if (xp_mb_desc(&desc)) {
-			xs->skb = skb;
-			continue;
-		}
-
-		err = __dev_direct_xmit(skb, xs->queue_id);
-		if  (err == NETDEV_TX_BUSY) {
-			/* Tell user-space to retry the send */
-			xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb));
-			xsk_consume_skb(skb);
-			err = -EAGAIN;
-			goto out;
-		}
-
-		/* Ignore NET_XMIT_CN as packet might have been sent */
-		if (err == NET_XMIT_DROP) {
-			/* SKB completed but not sent */
-			err = -EBUSY;
-			xs->skb = NULL;
-			goto out;
-		}
-
-		sent_frame = true;
-		xs->skb = NULL;
-	}
-
-	if (xskq_has_descs(xs->tx)) {
-		if (xs->skb)
-			xsk_drop_skb(xs->skb);
-		xskq_cons_release(xs->tx);
-	}
-
-out:
-	if (sent_frame)
-		__xsk_tx_release(xs);
-
-	mutex_unlock(&xs->mutex);
-	return err;
-}
-
 static int xsk_generic_xmit(struct sock *sk)
 {
+	struct xdp_sock *xs = xdp_sk(sk);
 	int ret;
 
 	/* Drop the RCU lock since the SKB path might sleep. */
 	rcu_read_unlock();
-	ret = __xsk_generic_xmit(sk);
+	ret = __xsk_generic_xmit_batch(xs);
 	/* Reaquire RCU lock before going into common code. */
 	rcu_read_lock();
 
@@ -1626,6 +1539,34 @@ struct xdp_umem_reg_v1 {
 	__u32 headroom;
 };
 
+static int xsk_init_batch(struct xsk_batch *batch, unsigned int size)
+{
+	struct xdp_desc *descs;
+	struct sk_buff **skbs;
+	void **data;
+
+	skbs = kmalloc(size * sizeof(struct sk_buff *), GFP_KERNEL);
+	if (!skbs)
+		return -ENOMEM;
+
+	data = kmalloc_array(size, sizeof(void *), GFP_KERNEL);
+	if (!data) {
+		kfree(skbs);
+		return -ENOMEM;
+	}
+
+	descs = kvcalloc(size, sizeof(struct xdp_desc), GFP_KERNEL);
+	if (!descs) {
+		kfree(data);
+		kfree(skbs);
+		return -ENOMEM;
+	}
+
+	xsk_batch_reset(batch, skbs, descs, data, size);
+
+	return 0;
+}
+
 static int xsk_setsockopt(struct socket *sock, int level, int optname,
 			  sockptr_t optval, unsigned int optlen)
 {
@@ -1746,9 +1687,6 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 	{
 		struct xsk_buff_pool *pool = xs->pool;
 		struct xsk_batch *batch = &xs->batch;
-		struct xdp_desc *descs;
-		struct sk_buff **skbs;
-		void **data;
 		unsigned int size;
 		int ret = 0;
 
@@ -1762,27 +1700,7 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
 			return -EACCES;
 
 		mutex_lock(&xs->mutex);
-		skbs = kmalloc(size * sizeof(struct sk_buff *), GFP_KERNEL);
-		if (!skbs) {
-			ret = -ENOMEM;
-			goto out;
-		}
-		data = kmalloc_array(size, sizeof(void *), GFP_KERNEL);
-		if (!data) {
-			kfree(skbs);
-			ret = -ENOMEM;
-			goto out;
-		}
-		descs = kvcalloc(size, sizeof(struct xdp_desc), GFP_KERNEL);
-		if (!descs) {
-			kfree(data);
-			kfree(skbs);
-			ret = -ENOMEM;
-			goto out;
-		}
-
-		xsk_batch_reset(batch, skbs, descs, data, size);
-out:
+		ret = xsk_init_batch(batch, size);
 		mutex_unlock(&xs->mutex);
 		return ret;
 	}
@@ -2056,6 +1974,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
 {
 	struct xdp_sock *xs;
 	struct sock *sk;
+	int ret;
 
 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
 		return -EPERM;
@@ -2071,6 +1990,11 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
 	if (!sk)
 		return -ENOBUFS;
 
+	xs = xdp_sk(sk);
+	ret = xsk_init_batch(&xs->batch, 1);
+	if (ret)
+		return ret;
+
 	sock->ops = &xsk_proto_ops;
 
 	sock_init_data(sock, sk);
@@ -2081,7 +2005,6 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
 
 	sock_set_flag(sk, SOCK_RCU_FREE);
 
-	xs = xdp_sk(sk);
 	xs->state = XSK_READY;
 	xs->max_tx_budget = TX_BATCH_SIZE;
 	mutex_init(&xs->mutex);
-- 
2.41.3


^ permalink raw reply related

* [PATCH RFC net-next v4 12/14] xsk: separate read-mostly and write-heavy fields in xsk_buff_pool
From: Jason Xing @ 2026-04-15  8:26 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, sdf, ast, daniel, hawk,
	john.fastabend
  Cc: bpf, netdev, Jason Xing
In-Reply-To: <20260415082654.21026-1-kerneljasonxing@gmail.com>

From: Jason Xing <kernelxing@tencent.com>

perf c2c profiling of the AF_XDP generic-copy batch TX path reveals
that ~45% of all cache-line contention (HITM) comes from a single
cacheline inside struct xsk_buff_pool.

The sendmsg CPU reads pool geometry fields (addrs, chunk_size,
headroom, tx_metadata_len, etc.) in the validate and build hot
path, while the NAPI TX-completion CPU writes cq_prod_lock (via
xsk_destruct_skb -> xsk_cq_submit_addr_locked) and
cached_need_wakeup (via xsk_set/clear_tx_need_wakeup) on the same
cacheline—classic false sharing.

This adds one extra cacheline (64 bytes) to the per-pool allocation
but eliminates cross-CPU false sharing between the TX sendmsg and
TX completion paths.

This reorganization improves overall performance by 5-6%, which can
be captured by xdpsock.

After this, the only remaining hotspot is the refcount processing (about 6%),
which has already been batched in this series to minimize its impact.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
 include/net/xsk_buff_pool.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index ccb3b350001f..b1b11e3aa273 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -73,23 +73,27 @@ struct xsk_buff_pool {
 	u64 addrs_cnt;
 	u32 free_list_cnt;
 	u32 dma_pages_cnt;
-	u32 free_heads_cnt;
+
+	/* Read-mostly fields */
 	u32 headroom;
 	u32 chunk_size;
 	u32 chunk_shift;
 	u32 frame_len;
 	u32 xdp_zc_max_segs;
 	u8 tx_metadata_len; /* inherited from umem */
-	u8 cached_need_wakeup;
 	bool uses_need_wakeup;
 	bool unaligned;
 	bool tx_sw_csum;
 	void *addrs;
+
+	/* Write-heavy fields */
 	/* Mutual exclusion of the completion ring in the SKB mode.
 	 * Protect: NAPI TX thread and sendmsg error paths in the SKB
 	 * destructor callback.
 	 */
-	spinlock_t cq_prod_lock;
+	spinlock_t cq_prod_lock ____cacheline_aligned_in_smp;
+	u8 cached_need_wakeup;
+	u32 free_heads_cnt;
 	struct xdp_buff_xsk *free_heads[];
 };

-- 
2.41.3

^ permalink raw reply related

* [PATCH RFC net-next v4 11/14] xsk: support batch xmit main logic
From: Jason Xing @ 2026-04-15  8:26 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, sdf, ast, daniel, hawk,
	john.fastabend
  Cc: bpf, netdev, Jason Xing
In-Reply-To: <20260415082654.21026-1-kerneljasonxing@gmail.com>

From: Jason Xing <kernelxing@tencent.com>

__xsk_generic_xmit_batch() is the core function of batch xmit; it
implements a batch version of __xsk_generic_xmit().

The whole logic is divided into sections:
1. check if we have enough available slots in tx ring and completion
   ring.
2. read descriptors from tx ring into batch->desc_cache in batches
3. reserve enough slots in completion ring to avoid backpressure
4. allocate and build skbs in batches
5. send all the possible packets in batches at one time

Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
 net/xdp/xsk.c       | 116 ++++++++++++++++++++++++++++++++++++++++++++
 net/xdp/xsk_queue.h |   8 +++
 2 files changed, 124 insertions(+)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index c26e26cb4dda..e1ad2ac2b39a 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -920,6 +920,122 @@ struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 	return ERR_PTR(err);
 }
 
+static int __xsk_generic_xmit_batch(struct xdp_sock *xs)
+{
+	struct xsk_buff_pool *pool = xs->pool;
+	struct xsk_batch *batch = &xs->batch;
+	struct xdp_desc *descs = batch->desc_cache;
+	struct net_device *dev = xs->dev;
+	u32 max_batch, max_budget;
+	bool sent_frame = false;
+	struct sk_buff *skb;
+	u32 cons_descs;
+	int err = 0;
+	u32 i = 0;
+
+	mutex_lock(&xs->mutex);
+
+	/* Since we dropped the RCU read lock, the socket state might have changed. */
+	if (unlikely(!xsk_is_bound(xs))) {
+		err = -ENXIO;
+		goto out;
+	}
+
+	if (xs->queue_id >= dev->real_num_tx_queues) {
+		err = -ENXIO;
+		goto out;
+	}
+
+	if (unlikely(!netif_running(dev) || !netif_carrier_ok(dev))) {
+		err = -ENETDOWN;
+		goto out;
+	}
+
+	max_budget = READ_ONCE(xs->max_tx_budget);
+	max_batch = batch->generic_xmit_batch;
+
+	for (i = 0; i < max_budget; i += cons_descs) {
+		u32 nb_pkts = 0;
+		u32 nb_descs;
+
+		nb_descs = min(max_batch, max_budget - i);
+		nb_descs = xskq_cons_nb_entries(xs->tx, nb_descs);
+		if (!nb_descs)
+			goto out;
+
+		/* This is the backpressure mechanism for the Tx path. Try to
+		 * reserve space in the completion queue for all packets, but
+		 * if there are fewer slots available, just process that many
+		 * packets. This avoids having to implement any buffering in
+		 * the Tx path.
+		 */
+		nb_descs = xsk_cq_reserve_locked(pool, nb_descs);
+		if (!nb_descs) {
+			err = -EAGAIN;
+			goto out;
+		}
+
+		cons_descs = xskq_cons_read_desc_batch_copy(xs->tx, pool, descs,
+							    nb_descs, &nb_pkts);
+		if (cons_descs < nb_descs) {
+			u32 delta = nb_descs - cons_descs;
+
+			xsk_cq_cancel_locked(pool, delta);
+			xs->tx->queue_empty_descs += delta;
+			if (!cons_descs) {
+				err = -EAGAIN;
+				goto out;
+			}
+			nb_descs = cons_descs;
+		}
+
+		cons_descs = xsk_alloc_batch_skb(xs, nb_pkts, nb_descs, &err);
+		/* Return 'nb_descs - cons_descs' number of descs to the
+		 * pool if the batch allocation partially fails
+		 */
+		if (cons_descs < nb_descs) {
+			xskq_cons_cancel_n(xs->tx, nb_descs - cons_descs);
+			xsk_cq_cancel_locked(pool, nb_descs - cons_descs);
+		}
+
+		if (!skb_queue_empty(&batch->send_queue)) {
+			int err_xmit;
+
+			err_xmit = xsk_direct_xmit_batch(xs, dev);
+			if (err_xmit == NETDEV_TX_BUSY)
+				err = -EAGAIN;
+			else if (err_xmit == NET_XMIT_DROP)
+				err = -EBUSY;
+
+			sent_frame = true;
+		}
+
+		if (err)
+			goto out;
+	}
+
+	/* Maximum budget of descriptors have been consumed */
+	if (xskq_has_descs(xs->tx))
+		err = -EAGAIN;
+
+out:
+	if (xs->skb)
+		xsk_drop_skb(xs->skb);
+
+	/* If send_queue still has pending skbs, we must clear
+	 * the rest of them.
+	 */
+	while ((skb = __skb_dequeue(&batch->send_queue)) != NULL) {
+		xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb));
+		xsk_consume_skb(skb);
+	}
+	if (sent_frame)
+		__xsk_tx_release(xs);
+
+	mutex_unlock(&xs->mutex);
+	return err;
+}
+
 static int __xsk_generic_xmit(struct sock *sk)
 {
 	struct xdp_sock *xs = xdp_sk(sk);
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 34cc07d6115e..c3b97c6f2910 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -314,6 +314,14 @@ xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
 					   NULL, pool->xdp_zc_max_segs);
 }
 
+static inline u32
+xskq_cons_read_desc_batch_copy(struct xsk_queue *q, struct xsk_buff_pool *pool,
+			       struct xdp_desc *descs, u32 max, u32 *nb_pkts)
+{
+	return __xskq_cons_read_desc_batch(q, pool, descs, max,
+					   nb_pkts, MAX_SKB_FRAGS);
+}
+
 /* Functions for consumers */
 
 static inline void __xskq_cons_release(struct xsk_queue *q)
-- 
2.41.3


^ permalink raw reply related

* [PATCH RFC net-next v4 10/14] xsk: extend xsk_cq_reserve_locked() to reserve n slots
From: Jason Xing @ 2026-04-15  8:26 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, sdf, ast, daniel, hawk,
	john.fastabend
  Cc: bpf, netdev, Jason Xing
In-Reply-To: <20260415082654.21026-1-kerneljasonxing@gmail.com>

From: Jason Xing <kernelxing@tencent.com>

Previously it only reserved one slot. This patch extends it to reserve
n slots in order to cover the batch mode.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
 net/xdp/xsk.c       | 12 ++++++++----
 net/xdp/xsk_queue.h | 12 +++++++-----
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 6cd2e58e170c..c26e26cb4dda 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -546,12 +546,17 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
 	return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
 }
 
-static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
+/* The function tries to reserve as many descs as possible. If not
+ * even a single slot can be reserved, return zero. Otherwise, return
+ * how many slots were actually reserved, which may be fewer than the
+ * n requested.
+ */
+static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool, u32 n)
 {
 	int ret;
 
 	spin_lock(&pool->cq->cq_cached_prod_lock);
-	ret = xskq_prod_reserve(pool->cq);
+	ret = xskq_prod_reserve(pool->cq, n);
 	spin_unlock(&pool->cq->cq_cached_prod_lock);
 
 	return ret;
@@ -947,8 +952,7 @@ static int __xsk_generic_xmit(struct sock *sk)
 		 * if there is space in it. This avoids having to implement
 		 * any buffering in the Tx path.
 		 */
-		err = xsk_cq_reserve_locked(xs->pool);
-		if (err) {
+		if (!xsk_cq_reserve_locked(xs->pool, 1)) {
 			err = -EAGAIN;
 			goto out;
 		}
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 354f6fe86893..34cc07d6115e 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -413,14 +413,16 @@ static inline void xskq_prod_cancel_n(struct xsk_queue *q, u32 cnt)
 	q->cached_prod -= cnt;
 }
 
-static inline int xskq_prod_reserve(struct xsk_queue *q)
+static inline int xskq_prod_reserve(struct xsk_queue *q, u32 n)
 {
-	if (xskq_prod_is_full(q))
-		return -ENOSPC;
+	u32 nr_free = xskq_prod_nb_free(q, n);
+
+	if (!nr_free)
+		return 0;
 
 	/* A, matches D */
-	q->cached_prod++;
-	return 0;
+	q->cached_prod += nr_free;
+	return nr_free;
 }
 
 static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
-- 
2.41.3


^ permalink raw reply related

* [PATCH RFC net-next v4 09/14] xsk: extend xskq_cons_read_desc_batch to count nb_pkts
From: Jason Xing @ 2026-04-15  8:26 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, bjorn, magnus.karlsson,
	maciej.fijalkowski, jonathan.lemon, sdf, ast, daniel, hawk,
	john.fastabend
  Cc: bpf, netdev, Jason Xing
In-Reply-To: <20260415082654.21026-1-kerneljasonxing@gmail.com>

From: Jason Xing <kernelxing@tencent.com>

Add a new parameter nb_pkts to count how many packets copy mode
actually needs, determined with the help of the XDP_PKT_CONTD option.

Add a descs parameter so that copy mode can pass xs->desc_cache as the
buffer in which the descriptors are stored.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
 net/xdp/xsk_queue.h | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index ec08d9c102b1..354f6fe86893 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -263,12 +263,12 @@ static inline void parse_desc(struct xsk_queue *q, struct xsk_buff_pool *pool,
 	parsed->mb = xp_mb_desc(desc);
 }
 
-static inline
-u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
-			      u32 max)
+static inline u32
+__xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
+			    struct xdp_desc *descs, u32 max, u32 *nb_pkts,
+			    u32 max_segs)
 {
 	u32 cached_cons = q->cached_cons, nb_entries = 0;
-	struct xdp_desc *descs = pool->tx_descs;
 	u32 total_descs = 0, nr_frags = 0;
 
 	/* track first entry, if stumble upon *any* invalid descriptor, rewind
@@ -288,9 +288,11 @@ u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
 		if (likely(!parsed.mb)) {
 			total_descs += (nr_frags + 1);
 			nr_frags = 0;
+			if (nb_pkts)
+				(*nb_pkts)++;
 		} else {
 			nr_frags++;
-			if (nr_frags == pool->xdp_zc_max_segs) {
+			if (nr_frags == max_segs) {
 				nr_frags = 0;
 				break;
 			}
@@ -304,6 +306,14 @@ u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
 	return total_descs;
 }
 
+static inline u32
+xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
+			  u32 max)
+{
+	return __xskq_cons_read_desc_batch(q, pool, pool->tx_descs, max,
+					   NULL, pool->xdp_zc_max_segs);
+}
+
 /* Functions for consumers */
 
 static inline void __xskq_cons_release(struct xsk_queue *q)
-- 
2.41.3


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox