Netdev List
 help / color / mirror / Atom feed
* [PATCH 09/10] rust: driver: remove duplicate ID table
From: Gary Guo @ 2026-06-18 17:03 UTC (permalink / raw)
  To: Greg Kroah-Hartman, Rafael J. Wysocki, Danilo Krummrich,
	Miguel Ojeda, Boqun Feng, Björn Roy Baron, Benno Lossin,
	Andreas Hindborg, Alice Ryhl, Trevor Gross, Daniel Almeida,
	Tamir Duberstein, Alexandre Courbot, Onur Özkan,
	FUJITA Tomonori, David Airlie, Simona Vetter, Bjorn Helgaas,
	Krzysztof Wilczyński, Abdiel Janulgue, Robin Murphy,
	Dave Ertman, Ira Weiny, Leon Romanovsky, Len Brown, Igor Korotin,
	Rob Herring, Saravana Kannan, Viresh Kumar, Michal Wilczynski,
	Drew Fustini, Guo Ren, Fu Wei, Uwe Kleine-König
  Cc: driver-core, rust-for-linux, linux-kernel, netdev, nova-gpu,
	dri-devel, linux-pci, linux-acpi, devicetree, linux-pm,
	linux-riscv, linux-pwm, Gary Guo
In-Reply-To: <20260618-id_info-v1-0-96af1e559ef9@garyguo.net>

Previously, `IdArray` contained both the device ID table and the info table,
so we kept a separate copy for MODULE_DEVICE_TABLE for hotplug (which needs
to be just the device ID table). With the info now being carried via
pointers, `IdArray` is layout-compatible with the raw ID table and hence
there is no longer a need to keep the distinction.

Deduplicate the code, and remove the redundant copy for hotplug purpose by
just giving the `IdArray` instance a proper symbol name.

Signed-off-by: Gary Guo <gary@garyguo.net>
---
 rust/kernel/device_id.rs | 76 +++++++++++++++++-------------------------------
 1 file changed, 27 insertions(+), 49 deletions(-)

diff --git a/rust/kernel/device_id.rs b/rust/kernel/device_id.rs
index 59453588df0e..26618bcda276 100644
--- a/rust/kernel/device_id.rs
+++ b/rust/kernel/device_id.rs
@@ -86,28 +86,23 @@ unsafe fn info_unchecked_opt<U>(&self) -> Option<&'static U> {
     }
 }
 
-/// A zero-terminated device id array.
+/// A zero-terminated device id array, followed by context data.
 #[repr(C)]
-pub struct RawIdArray<T: RawDeviceId, const N: usize> {
+pub struct IdArray<T: RawDeviceId, U: 'static, const N: usize> {
     // This is `MaybeUninit<T::RawType>` so any bytes inside it can carry provenance in CTFE.
     // If this were `T::RawType`, integer fields would not be able to contain pointers.
     ids: [MaybeUninit<T::RawType>; N],
     sentinel: MaybeUninit<T::RawType>,
+    phantom: PhantomData<&'static U>,
 }
 
-impl<T: RawDeviceId, const N: usize> RawIdArray<T, N> {
-    #[doc(hidden)]
-    pub const fn size(&self) -> usize {
-        core::mem::size_of::<Self>()
-    }
-}
+// SAFETY: device ID is plain data plus a `&'static U` and can thus be sent between threads safely
+// if `&U` can.
+unsafe impl<T: RawDeviceId, U: Sync + 'static, const N: usize> Send for IdArray<T, U, N> {}
 
-/// A zero-terminated device id array, followed by context data.
-#[repr(C)]
-pub struct IdArray<T: RawDeviceId, U: 'static, const N: usize> {
-    raw_ids: RawIdArray<T, N>,
-    phantom: PhantomData<&'static U>,
-}
+// SAFETY: device ID is plain data plus a `&'static U` and can thus be shared between threads safely
+// if `&U` can.
+unsafe impl<T: RawDeviceId, U: Sync + 'static, const N: usize> Sync for IdArray<T, U, N> {}
 
 impl<T: RawDeviceId + RawDeviceIdIndex, U: 'static, const N: usize> IdArray<T, U, N> {
     /// Creates a new instance of the array.
@@ -137,22 +132,13 @@ impl<T: RawDeviceId + RawDeviceIdIndex, U: 'static, const N: usize> IdArray<T, U
         core::mem::forget(ids);
 
         Self {
-            raw_ids: RawIdArray {
-                ids: raw_ids,
-                sentinel: MaybeUninit::zeroed(),
-            },
+            ids: raw_ids,
+            sentinel: MaybeUninit::zeroed(),
             phantom: PhantomData,
         }
     }
 }
 
-impl<T: RawDeviceId, U: 'static, const N: usize> IdArray<T, U, N> {
-    /// Reference to the contained [`RawIdArray`].
-    pub const fn raw_ids(&self) -> &RawIdArray<T, N> {
-        &self.raw_ids
-    }
-}
-
 impl<T: RawDeviceId, const N: usize> IdArray<T, (), N> {
     /// Creates a new instance of the array without writing index values.
     ///
@@ -164,10 +150,8 @@ impl<T: RawDeviceId, const N: usize> IdArray<T, (), N> {
         core::mem::forget(ids);
 
         Self {
-            raw_ids: RawIdArray {
-                ids: raw_ids,
-                sentinel: MaybeUninit::zeroed(),
-            },
+            ids: raw_ids,
+            sentinel: MaybeUninit::zeroed(),
             phantom: PhantomData,
         }
     }
@@ -200,13 +184,17 @@ macro_rules! module_device_table {
         $table_name: ident, $id_info_type: ty,
         [$(($id: expr, $info:expr $(,)?)),* $(,)?]
     ) => {
-        const $table_name: $crate::device_id::IdArray<
+        #[export_name =
+            concat!("__mod_device_table__", line!(),
+                    "__kmod_", module_path!(),
+                    "__", $table_type,
+                    "__", stringify!($table_name))
+        ]
+        static $table_name: $crate::device_id::IdArray<
             $device_id_ty,
             $id_info_type,
             { <[$device_id_ty]>::len(&[$($id,)*]) },
         > = $crate::device_id::IdArray::new([$(($id, &$info),)*]);
-
-        $crate::module_device_table!($table_type, $table_name);
     };
 
     // Case for no ID info.
@@ -215,26 +203,16 @@ macro_rules! module_device_table {
         $table_name: ident, @none,
         [$($id: expr),* $(,)?]
     ) => {
-        const $table_name: $crate::device_id::IdArray<
+        #[export_name =
+            concat!("__mod_device_table__", line!(),
+                    "__kmod_", module_path!(),
+                    "__", $table_type,
+                    "__", stringify!($table_name))
+        ]
+        static $table_name: $crate::device_id::IdArray<
             $device_id_ty,
             (),
             { <[$device_id_ty]>::len(&[$($id,)*]) },
         > = $crate::device_id::IdArray::new_without_index([$($id),*]);
-
-        $crate::module_device_table!($table_type, $table_name);
-    };
-
-    ($table_type: literal, $table_name:ident) => {
-        const _: () = {
-            #[rustfmt::skip]
-            #[export_name =
-                concat!("__mod_device_table__", line!(),
-                        "__kmod_", module_path!(),
-                        "__", $table_type,
-                        "__", stringify!($table_name))
-            ]
-            static TABLE: [::core::mem::MaybeUninit<u8>; $table_name.raw_ids().size()] =
-                unsafe { ::core::mem::transmute_copy($table_name.raw_ids()) };
-        };
     };
 }

-- 
2.54.0


^ permalink raw reply related

* [PATCH 10/10] RFC: rust: driver: support map-like syntax for ID table
From: Gary Guo @ 2026-06-18 17:03 UTC (permalink / raw)
  To: Greg Kroah-Hartman, Rafael J. Wysocki, Danilo Krummrich,
	Miguel Ojeda, Boqun Feng, Björn Roy Baron, Benno Lossin,
	Andreas Hindborg, Alice Ryhl, Trevor Gross, Daniel Almeida,
	Tamir Duberstein, Alexandre Courbot, Onur Özkan,
	FUJITA Tomonori, David Airlie, Simona Vetter, Bjorn Helgaas,
	Krzysztof Wilczyński, Abdiel Janulgue, Robin Murphy,
	Dave Ertman, Ira Weiny, Leon Romanovsky, Len Brown, Igor Korotin,
	Rob Herring, Saravana Kannan, Viresh Kumar, Michal Wilczynski,
	Drew Fustini, Guo Ren, Fu Wei, Uwe Kleine-König
  Cc: driver-core, rust-for-linux, linux-kernel, netdev, nova-gpu,
	dri-devel, linux-pci, linux-acpi, devicetree, linux-pm,
	linux-riscv, linux-pwm, Gary Guo
In-Reply-To: <20260618-id_info-v1-0-96af1e559ef9@garyguo.net>

The device ID table and its associated info is really just a map. Add a
syntax to `module_device_table` macro that reflects that.

Signed-off-by: Gary Guo <gary@garyguo.net>
---
 rust/kernel/device_id.rs        | 11 +++++++++++
 samples/rust/rust_driver_pci.rs |  7 +++----
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/rust/kernel/device_id.rs b/rust/kernel/device_id.rs
index 26618bcda276..7c61cdcc9427 100644
--- a/rust/kernel/device_id.rs
+++ b/rust/kernel/device_id.rs
@@ -183,6 +183,17 @@ macro_rules! module_device_table {
         $table_type: literal, $device_id_ty: ty,
         $table_name: ident, $id_info_type: ty,
         [$(($id: expr, $info:expr $(,)?)),* $(,)?]
+    ) => {
+        $crate::module_device_table!(
+            $table_type, $device_id_ty, $table_name, $id_info_type,
+            {$($id=>$info,)*}
+        );
+    };
+
+    (
+        $table_type: literal, $device_id_ty: ty,
+        $table_name: ident, $id_info_type: ty,
+        {$($id: expr => $info:expr),* $(,)?}
     ) => {
         #[export_name =
             concat!("__mod_device_table__", line!(),
diff --git a/samples/rust/rust_driver_pci.rs b/samples/rust/rust_driver_pci.rs
index 2282191e6292..652819dff082 100644
--- a/samples/rust/rust_driver_pci.rs
+++ b/samples/rust/rust_driver_pci.rs
@@ -75,10 +75,9 @@ struct SampleDriverData<'bound> {
 kernel::pci_device_table!(
     PCI_TABLE,
     <SampleDriver as pci::Driver>::IdInfo,
-    [(
-        pci::DeviceId::from_id(pci::Vendor::REDHAT, 0x5),
-        TestIndex::NO_EVENTFD
-    )]
+    {
+        pci::DeviceId::from_id(pci::Vendor::REDHAT, 0x5) => TestIndex::NO_EVENTFD,
+    }
 );
 
 impl SampleDriverData<'_> {

-- 
2.54.0


^ permalink raw reply related

* Re: [PATCH 1/1] xfrm: nat_keepalive: avoid double free on send error
From: Eyal Birger @ 2026-06-18 17:21 UTC (permalink / raw)
  To: Ren Wei
  Cc: netdev, steffen.klassert, herbert, davem, yuantan098, bird,
	qianyuluo3
In-Reply-To: <46eb334399ce0e25e0897b42f21020541d159300.1781788385.git.qianyuluo3@gmail.com>

On Thu, Jun 18, 2026 at 9:36 AM Ren Wei <n05ec@lzu.edu.cn> wrote:
>
> From: Qianyu Luo <qianyuluo3@gmail.com>
>
> nat_keepalive_send() frees the keepalive skb whenever the IPv4 or IPv6
> send helper reports an error.
>
> That cleanup is only correct before the skb is handed to the output
> path. Once ip_build_and_send_pkt() or ip6_xmit() takes ownership, the
> networking stack may already have consumed the skb before returning an
> error, so freeing it again is unsafe.
>
> Handle the pre-handoff failure cases inside nat_keepalive_send_ipv4()
> and nat_keepalive_send_ipv6(), where the caller still owns the skb, and
> keep nat_keepalive_send() responsible only for family dispatch and the
> unsupported-family cleanup path.

Thanks for the fix!

>
> Fixes: f531d13bdfe3 ("xfrm: support sending NAT keepalives in ESP in UDP states")
> Cc: stable@vger.kernel.org
> Reported-by: Yuan Tan <yuantan098@gmail.com>
> Reported-by: Xin Liu <bird@lzu.edu.cn>
> Signed-off-by: Qianyu Luo <qianyuluo3@gmail.com>
> Signed-off-by: Ren Wei <n05ec@lzu.edu.cn>
> ---
>  net/xfrm/xfrm_nat_keepalive.c | 15 +++++++++------
>  1 file changed, 9 insertions(+), 6 deletions(-)
>
> diff --git a/net/xfrm/xfrm_nat_keepalive.c b/net/xfrm/xfrm_nat_keepalive.c
> index 458931062a04..f71328096f11 100644
> --- a/net/xfrm/xfrm_nat_keepalive.c
> +++ b/net/xfrm/xfrm_nat_keepalive.c
> @@ -55,8 +55,10 @@ static int nat_keepalive_send_ipv4(struct sk_buff *skb,
>                            ka->encap_sport, sock_net_uid(net, NULL));
>
>         rt = ip_route_output_key(net, &fl4);
> -       if (IS_ERR(rt))
> +       if (IS_ERR(rt)) {
> +               kfree_skb(skb);
>                 return PTR_ERR(rt);
> +       }
>
>         skb_dst_set(skb, &rt->dst);
>
> @@ -100,6 +102,7 @@ static int nat_keepalive_send_ipv6(struct sk_buff *skb,
>         sock_net_set(sk, net);
>         dst = ip6_dst_lookup_flow(net, sk, &fl6, NULL);
>         if (IS_ERR(dst)) {
> +               kfree_skb(skb);
>                 local_unlock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock);

Any reason to do the kfree under lock?

Eyal

^ permalink raw reply

* [PATCH v28 0/5] Type2 device basic support
From: alejandro.lucero-palau @ 2026-06-18 18:18 UTC (permalink / raw)
  To: linux-cxl, netdev, djbw, edward.cree, davem, kuba, pabeni,
	edumazet, dave.jiang
  Cc: Alejandro Lucero

From: Alejandro Lucero <alucerop@amd.com>

This series adds the last bits for allowing a CXL Type2 driver to obtain
a CXL region linked to the device HDM decoders committed by the BIOS,
with the driver being the sfc network driver.

Changes from v27:

 - patch 1: make driver probe fail on error in efx_cxl_init (Dan)
 - patch 4: add unmapping if error after efx_cxl_init (Dave)
 - patch 4/5: move cxl_pio_initialised from patch 4 to patch 5 (Dave)

Tested in the cxl_for_7.3 branch.

Alejandro Lucero (5):
  sfc: add cxl support
  cxl/sfc: Map cxl regs
  cxl/sfc: Initialize dpa without a mailbox
  sfc: obtain and map cxl range using devm_cxl_probe_mem
  sfc: support pio mapping based on cxl

 drivers/cxl/core/core.h               |   2 +
 drivers/cxl/core/mbox.c               |  51 +------------
 drivers/cxl/core/memdev.c             |  67 ++++++++++++++++
 drivers/cxl/core/pci.c                |   1 +
 drivers/cxl/core/port.c               |   1 +
 drivers/cxl/core/regs.c               |   1 +
 drivers/cxl/cxlpci.h                  |  12 ---
 drivers/cxl/pci.c                     |   1 +
 drivers/net/ethernet/sfc/Kconfig      |   9 +++
 drivers/net/ethernet/sfc/Makefile     |   1 +
 drivers/net/ethernet/sfc/ef10.c       |  41 ++++++++--
 drivers/net/ethernet/sfc/efx.c        |  18 ++++-
 drivers/net/ethernet/sfc/efx.h        |   1 -
 drivers/net/ethernet/sfc/efx_cxl.c    | 105 ++++++++++++++++++++++++++
 drivers/net/ethernet/sfc/efx_cxl.h    |  32 ++++++++
 drivers/net/ethernet/sfc/net_driver.h |  10 +++
 drivers/net/ethernet/sfc/nic.h        |   3 +
 include/cxl/cxl.h                     |   2 +
 include/cxl/pci.h                     |  22 ++++++
 19 files changed, 309 insertions(+), 71 deletions(-)
 create mode 100644 drivers/net/ethernet/sfc/efx_cxl.c
 create mode 100644 drivers/net/ethernet/sfc/efx_cxl.h
 create mode 100644 include/cxl/pci.h


base-commit: 9b1e70e8f9ec4b5c6ce7fa774a0023bb6894c686
-- 
2.34.1


^ permalink raw reply

* [PATCH v28 1/5] sfc: add cxl support
From: alejandro.lucero-palau @ 2026-06-18 18:18 UTC (permalink / raw)
  To: linux-cxl, netdev, djbw, edward.cree, davem, kuba, pabeni,
	edumazet, dave.jiang
  Cc: Alejandro Lucero, Jonathan Cameron, Edward Cree, Alison Schofield,
	Dan Williams
In-Reply-To: <20260618181806.118745-1-alejandro.lucero-palau@amd.com>

From: Alejandro Lucero <alucerop@amd.com>

Add CXL initialization based on new CXL API for accel drivers and make
it dependent on kernel CXL configuration.

Signed-off-by: Alejandro Lucero <alucerop@amd.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Edward Cree <ecree.xilinx@gmail.com>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/net/ethernet/sfc/Kconfig      |  9 +++++
 drivers/net/ethernet/sfc/Makefile     |  1 +
 drivers/net/ethernet/sfc/efx.c        | 16 ++++++++-
 drivers/net/ethernet/sfc/efx_cxl.c    | 50 +++++++++++++++++++++++++++
 drivers/net/ethernet/sfc/efx_cxl.h    | 29 ++++++++++++++++
 drivers/net/ethernet/sfc/net_driver.h |  9 +++++
 6 files changed, 113 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/sfc/efx_cxl.c
 create mode 100644 drivers/net/ethernet/sfc/efx_cxl.h

diff --git a/drivers/net/ethernet/sfc/Kconfig b/drivers/net/ethernet/sfc/Kconfig
index c4c43434f314..979f2801e2a8 100644
--- a/drivers/net/ethernet/sfc/Kconfig
+++ b/drivers/net/ethernet/sfc/Kconfig
@@ -66,6 +66,15 @@ config SFC_MCDI_LOGGING
 	  Driver-Interface) commands and responses, allowing debugging of
 	  driver/firmware interaction.  The tracing is actually enabled by
 	  a sysfs file 'mcdi_logging' under the PCI device.
+config SFC_CXL
+	bool "Solarflare SFC9100-family CXL support"
+	depends on SFC && CXL_BUS >= SFC
+	default SFC
+	help
+	  This enables SFC CXL support if the kernel is configuring CXL for
+	  using CTPIO with CXL.mem. The SFC device with CXL support and
+	  with a CXL-aware firmware can be used for minimizing latencies
+	  when sending through CTPIO.
 
 source "drivers/net/ethernet/sfc/falcon/Kconfig"
 source "drivers/net/ethernet/sfc/siena/Kconfig"
diff --git a/drivers/net/ethernet/sfc/Makefile b/drivers/net/ethernet/sfc/Makefile
index d99039ec468d..bb0f1891cde6 100644
--- a/drivers/net/ethernet/sfc/Makefile
+++ b/drivers/net/ethernet/sfc/Makefile
@@ -13,6 +13,7 @@ sfc-$(CONFIG_SFC_SRIOV)	+= sriov.o ef10_sriov.o ef100_sriov.o ef100_rep.o \
                            mae.o tc.o tc_bindings.o tc_counters.o \
                            tc_encap_actions.o tc_conntrack.o
 
+sfc-$(CONFIG_SFC_CXL)	+= efx_cxl.o
 obj-$(CONFIG_SFC)	+= sfc.o
 
 obj-$(CONFIG_SFC_FALCON) += falcon/
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 8f136a11d396..da008462096d 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -34,6 +34,7 @@
 #include "selftest.h"
 #include "sriov.h"
 #include "efx_devlink.h"
+#include "efx_cxl.h"
 
 #include "mcdi_port_common.h"
 #include "mcdi_pcol.h"
@@ -981,12 +982,14 @@ static void efx_pci_remove(struct pci_dev *pci_dev)
 	efx_pci_remove_main(efx);
 
 	efx_fini_io(efx);
+
+	probe_data = container_of(efx, struct efx_probe_data, efx);
+
 	pci_dbg(efx->pci_dev, "shutdown successful\n");
 
 	efx_fini_devlink_and_unlock(efx);
 	efx_fini_struct(efx);
 	free_netdev(efx->net_dev);
-	probe_data = container_of(efx, struct efx_probe_data, efx);
 	kfree(probe_data);
 };
 
@@ -1190,6 +1193,17 @@ static int efx_pci_probe(struct pci_dev *pci_dev,
 	if (rc)
 		goto fail2;
 
+	/* A successful cxl initialization implies a CXL region created to be
+	 * used for PIO buffers. If there is no CXL support legacy PIO buffers
+	 * defined at specific PCI BAR regions will be used. If there is CXL
+	 * support and the cxl initialization fails, the driver probe fails.
+	 */
+	rc = efx_cxl_init(probe_data);
+	if (rc) {
+		pci_err(pci_dev, "CXL initialization failed with error %d\n", rc);
+		goto fail2;
+	}
+
 	rc = efx_pci_probe_post_io(efx);
 	if (rc) {
 		/* On failure, retry once immediately.
diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c
new file mode 100644
index 000000000000..be252af972ab
--- /dev/null
+++ b/drivers/net/ethernet/sfc/efx_cxl.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/****************************************************************************
+ *
+ * Driver for AMD network controllers and boards
+ * Copyright (C) 2025, Advanced Micro Devices, Inc.
+ */
+
+#include <linux/pci.h>
+
+#include "net_driver.h"
+#include "efx_cxl.h"
+
+#define EFX_CTPIO_BUFFER_SIZE	SZ_256M
+
+int efx_cxl_init(struct efx_probe_data *probe_data)
+{
+	struct efx_nic *efx = &probe_data->efx;
+	struct pci_dev *pci_dev = efx->pci_dev;
+	struct efx_cxl *cxl;
+	u16 dvsec;
+
+	/* Is the device configured with and using CXL? */
+	if (!pcie_is_cxl(pci_dev))
+		return 0;
+
+	dvsec = pci_find_dvsec_capability(pci_dev, PCI_VENDOR_ID_CXL,
+					  PCI_DVSEC_CXL_DEVICE);
+	if (!dvsec) {
+		pci_info(pci_dev, "CXL_DVSEC_PCIE_DEVICE capability not found\n");
+		return 0;
+	}
+
+	pci_dbg(pci_dev, "CXL_DVSEC_PCIE_DEVICE capability found\n");
+
+	/* Create a cxl_dev_state embedded in the cxl struct using cxl core api
+	 * specifying no mbox available.
+	 */
+	cxl = devm_cxl_dev_state_create(&pci_dev->dev, CXL_DEVTYPE_DEVMEM,
+					pci_get_dsn(pci_dev), dvsec,
+					struct efx_cxl, cxlds, false);
+
+	if (!cxl)
+		return -ENOMEM;
+
+	probe_data->cxl = cxl;
+
+	return 0;
+}
+
+MODULE_IMPORT_NS("CXL");
diff --git a/drivers/net/ethernet/sfc/efx_cxl.h b/drivers/net/ethernet/sfc/efx_cxl.h
new file mode 100644
index 000000000000..04e46278464d
--- /dev/null
+++ b/drivers/net/ethernet/sfc/efx_cxl.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/****************************************************************************
+ * Driver for AMD network controllers and boards
+ * Copyright (C) 2025, Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ */
+
+#ifndef EFX_CXL_H
+#define EFX_CXL_H
+
+#ifdef CONFIG_SFC_CXL
+
+#include <cxl/cxl.h>
+
+struct efx_probe_data;
+
+struct efx_cxl {
+	struct cxl_dev_state cxlds;
+	struct cxl_memdev *cxlmd;
+};
+
+int efx_cxl_init(struct efx_probe_data *probe_data);
+#else
+static inline int efx_cxl_init(struct efx_probe_data *probe_data) { return 0; }
+#endif
+#endif
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index b98c259f672d..de3fc9537662 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -1197,14 +1197,23 @@ struct efx_nic {
 	atomic_t n_rx_noskb_drops;
 };
 
+#ifdef CONFIG_SFC_CXL
+struct efx_cxl;
+#endif
+
 /**
  * struct efx_probe_data - State after hardware probe
  * @pci_dev: The PCI device
  * @efx: Efx NIC details
+ * @cxl: details of related cxl objects
+ * @cxl_pio_initialised: cxl initialization outcome.
  */
 struct efx_probe_data {
 	struct pci_dev *pci_dev;
 	struct efx_nic efx;
+#ifdef CONFIG_SFC_CXL
+	struct efx_cxl *cxl;
+#endif
 };
 
 static inline struct efx_nic *efx_netdev_priv(struct net_device *dev)
-- 
2.34.1


^ permalink raw reply related

* [PATCH v28 2/5] cxl/sfc: Map cxl regs
From: alejandro.lucero-palau @ 2026-06-18 18:18 UTC (permalink / raw)
  To: linux-cxl, netdev, djbw, edward.cree, davem, kuba, pabeni,
	edumazet, dave.jiang
  Cc: Alejandro Lucero, Dan Williams, Jonathan Cameron, Ben Cheatham
In-Reply-To: <20260618181806.118745-1-alejandro.lucero-palau@amd.com>

From: Alejandro Lucero <alucerop@amd.com>

Export cxl core functions so that a Type2 driver is able to discover and
map the device registers.

Use it in sfc driver cxl initialization.

Signed-off-by: Alejandro Lucero <alucerop@amd.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Ben Cheatham <benjamin.cheatham@amd.com>
---
 drivers/cxl/core/pci.c             |  1 +
 drivers/cxl/core/port.c            |  1 +
 drivers/cxl/core/regs.c            |  1 +
 drivers/cxl/cxlpci.h               | 12 ------------
 drivers/cxl/pci.c                  |  1 +
 drivers/net/ethernet/sfc/efx_cxl.c | 26 ++++++++++++++++++++++++++
 include/cxl/pci.h                  | 22 ++++++++++++++++++++++
 7 files changed, 52 insertions(+), 12 deletions(-)
 create mode 100644 include/cxl/pci.h

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index d1f487b3d809..2bcd683aa286 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -6,6 +6,7 @@
 #include <linux/delay.h>
 #include <linux/pci.h>
 #include <linux/pci-doe.h>
+#include <cxl/pci.h>
 #include <linux/aer.h>
 #include <cxlpci.h>
 #include <cxlmem.h>
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index 1215ee4f4035..cb633e19151b 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -11,6 +11,7 @@
 #include <linux/idr.h>
 #include <linux/node.h>
 #include <cxl/einj.h>
+#include <cxl/pci.h>
 #include <cxlmem.h>
 #include <cxlpci.h>
 #include <cxl.h>
diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c
index 93710cf4f0a6..20c2d9fbcfe7 100644
--- a/drivers/cxl/core/regs.c
+++ b/drivers/cxl/core/regs.c
@@ -4,6 +4,7 @@
 #include <linux/device.h>
 #include <linux/slab.h>
 #include <linux/pci.h>
+#include <cxl/pci.h>
 #include <cxlmem.h>
 #include <cxlpci.h>
 #include <pmu.h>
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index b826eb53cf7b..110ec9c44f09 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -13,16 +13,6 @@
  */
 #define CXL_PCI_DEFAULT_MAX_VECTORS 16
 
-/* Register Block Identifier (RBI) */
-enum cxl_regloc_type {
-	CXL_REGLOC_RBI_EMPTY = 0,
-	CXL_REGLOC_RBI_COMPONENT,
-	CXL_REGLOC_RBI_VIRT,
-	CXL_REGLOC_RBI_MEMDEV,
-	CXL_REGLOC_RBI_PMU,
-	CXL_REGLOC_RBI_TYPES
-};
-
 /*
  * Table Access DOE, CDAT Read Entry Response
  *
@@ -112,6 +102,4 @@ static inline void devm_cxl_port_ras_setup(struct cxl_port *port)
 }
 #endif
 
-int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type,
-		       struct cxl_register_map *map);
 #endif /* __CXL_PCI_H__ */
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 267c679b0b3c..bb892dbfdd6d 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -11,6 +11,7 @@
 #include <linux/pci.h>
 #include <linux/aer.h>
 #include <linux/io.h>
+#include <cxl/pci.h>
 #include <cxl/mailbox.h>
 #include "cxlmem.h"
 #include "cxlpci.h"
diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c
index be252af972ab..704b0ebae937 100644
--- a/drivers/net/ethernet/sfc/efx_cxl.c
+++ b/drivers/net/ethernet/sfc/efx_cxl.c
@@ -7,6 +7,8 @@
 
 #include <linux/pci.h>
 
+#include <cxl/cxl.h>
+#include <cxl/pci.h>
 #include "net_driver.h"
 #include "efx_cxl.h"
 
@@ -18,6 +20,7 @@ int efx_cxl_init(struct efx_probe_data *probe_data)
 	struct pci_dev *pci_dev = efx->pci_dev;
 	struct efx_cxl *cxl;
 	u16 dvsec;
+	int rc;
 
 	/* Is the device configured with and using CXL? */
 	if (!pcie_is_cxl(pci_dev))
@@ -42,6 +45,29 @@ int efx_cxl_init(struct efx_probe_data *probe_data)
 	if (!cxl)
 		return -ENOMEM;
 
+	rc = cxl_pci_setup_regs(pci_dev, CXL_REGLOC_RBI_COMPONENT,
+				&cxl->cxlds.reg_map);
+	if (rc) {
+		pci_err(pci_dev, "No component registers\n");
+		return rc;
+	}
+
+	if (!cxl->cxlds.reg_map.component_map.hdm_decoder.valid) {
+		pci_err(pci_dev, "Expected HDM component register not found\n");
+		return -ENODEV;
+	}
+
+	if (!cxl->cxlds.reg_map.component_map.ras.valid) {
+		pci_err(pci_dev, "Expected RAS component register not found\n");
+		return -ENODEV;
+	}
+
+	/* Set media ready explicitly as there are neither mailbox for checking
+	 * this state nor the CXL register involved, both not mandatory for
+	 * type2.
+	 */
+	cxl->cxlds.media_ready = true;
+
 	probe_data->cxl = cxl;
 
 	return 0;
diff --git a/include/cxl/pci.h b/include/cxl/pci.h
new file mode 100644
index 000000000000..3e0000015871
--- /dev/null
+++ b/include/cxl/pci.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright(c) 2020 Intel Corporation. All rights reserved. */
+
+#ifndef __CXL_CXL_PCI_H__
+#define __CXL_CXL_PCI_H__
+
+/* Register Block Identifier (RBI) */
+enum cxl_regloc_type {
+	CXL_REGLOC_RBI_EMPTY = 0,
+	CXL_REGLOC_RBI_COMPONENT,
+	CXL_REGLOC_RBI_VIRT,
+	CXL_REGLOC_RBI_MEMDEV,
+	CXL_REGLOC_RBI_PMU,
+	CXL_REGLOC_RBI_TYPES
+};
+
+struct cxl_register_map;
+struct pci_dev;
+
+int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type,
+		       struct cxl_register_map *map);
+#endif
-- 
2.34.1


^ permalink raw reply related

* [PATCH v28 3/5] cxl/sfc: Initialize dpa without a mailbox
From: alejandro.lucero-palau @ 2026-06-18 18:18 UTC (permalink / raw)
  To: linux-cxl, netdev, djbw, edward.cree, davem, kuba, pabeni,
	edumazet, dave.jiang
  Cc: Alejandro Lucero, Dan Williams, Ben Cheatham, Jonathan Cameron
In-Reply-To: <20260618181806.118745-1-alejandro.lucero-palau@amd.com>

From: Alejandro Lucero <alucerop@amd.com>

Type3 relies on mailbox CXL_MBOX_OP_IDENTIFY command for initializing
memdev state params which end up being used for DPA initialization.

Allow a Type2 driver to initialize DPA simply by giving the size of its
volatile hardware partition.

Move related functions to memdev.

Add sfc driver as the client.

Signed-off-by: Alejandro Lucero <alucerop@amd.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Ben Cheatham <benjamin.cheatham@amd.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/cxl/core/core.h            |  2 +
 drivers/cxl/core/mbox.c            | 51 +----------------------
 drivers/cxl/core/memdev.c          | 67 ++++++++++++++++++++++++++++++
 drivers/net/ethernet/sfc/efx_cxl.c |  5 +++
 include/cxl/cxl.h                  |  2 +
 5 files changed, 77 insertions(+), 50 deletions(-)

diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 07555ae63859..f7cebb026552 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -101,6 +101,8 @@ void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr,
 struct dentry *cxl_debugfs_create_dir(const char *dir);
 int cxl_dpa_set_part(struct cxl_endpoint_decoder *cxled,
 		     enum cxl_partition_mode mode);
+struct cxl_memdev_state;
+int cxl_mem_get_partition_info(struct cxl_memdev_state *mds);
 int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, u64 size);
 int cxl_dpa_free(struct cxl_endpoint_decoder *cxled);
 resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled);
diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 7c6c5b7450a5..97b1e61ad018 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -1152,7 +1152,7 @@ EXPORT_SYMBOL_NS_GPL(cxl_mem_get_event_records, "CXL");
  *
  * See CXL @8.2.9.5.2.1 Get Partition Info
  */
-static int cxl_mem_get_partition_info(struct cxl_memdev_state *mds)
+int cxl_mem_get_partition_info(struct cxl_memdev_state *mds)
 {
 	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
 	struct cxl_mbox_get_partition_info pi;
@@ -1308,55 +1308,6 @@ int cxl_mem_sanitize(struct cxl_memdev *cxlmd, u16 cmd)
 	return -EBUSY;
 }
 
-static void add_part(struct cxl_dpa_info *info, u64 start, u64 size, enum cxl_partition_mode mode)
-{
-	int i = info->nr_partitions;
-
-	if (size == 0)
-		return;
-
-	info->part[i].range = (struct range) {
-		.start = start,
-		.end = start + size - 1,
-	};
-	info->part[i].mode = mode;
-	info->nr_partitions++;
-}
-
-int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info)
-{
-	struct cxl_dev_state *cxlds = &mds->cxlds;
-	struct device *dev = cxlds->dev;
-	int rc;
-
-	if (!cxlds->media_ready) {
-		info->size = 0;
-		return 0;
-	}
-
-	info->size = mds->total_bytes;
-
-	if (mds->partition_align_bytes == 0) {
-		add_part(info, 0, mds->volatile_only_bytes, CXL_PARTMODE_RAM);
-		add_part(info, mds->volatile_only_bytes,
-			 mds->persistent_only_bytes, CXL_PARTMODE_PMEM);
-		return 0;
-	}
-
-	rc = cxl_mem_get_partition_info(mds);
-	if (rc) {
-		dev_err(dev, "Failed to query partition information\n");
-		return rc;
-	}
-
-	add_part(info, 0, mds->active_volatile_bytes, CXL_PARTMODE_RAM);
-	add_part(info, mds->active_volatile_bytes, mds->active_persistent_bytes,
-		 CXL_PARTMODE_PMEM);
-
-	return 0;
-}
-EXPORT_SYMBOL_NS_GPL(cxl_mem_dpa_fetch, "CXL");
-
 int cxl_get_dirty_count(struct cxl_memdev_state *mds, u32 *count)
 {
 	struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
index 33a3d2e7b13a..500f077f935d 100644
--- a/drivers/cxl/core/memdev.c
+++ b/drivers/cxl/core/memdev.c
@@ -594,6 +594,73 @@ bool is_cxl_memdev(const struct device *dev)
 }
 EXPORT_SYMBOL_NS_GPL(is_cxl_memdev, "CXL");
 
+static void add_part(struct cxl_dpa_info *info, u64 start, u64 size, enum cxl_partition_mode mode)
+{
+	int i = info->nr_partitions;
+
+	if (size == 0)
+		return;
+
+	info->part[i].range = (struct range) {
+		.start = start,
+		.end = start + size - 1,
+	};
+	info->part[i].mode = mode;
+	info->nr_partitions++;
+}
+
+int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info)
+{
+	struct cxl_dev_state *cxlds = &mds->cxlds;
+	struct device *dev = cxlds->dev;
+	int rc;
+
+	if (!cxlds->media_ready) {
+		info->size = 0;
+		return 0;
+	}
+
+	info->size = mds->total_bytes;
+
+	if (mds->partition_align_bytes == 0) {
+		add_part(info, 0, mds->volatile_only_bytes, CXL_PARTMODE_RAM);
+		add_part(info, mds->volatile_only_bytes,
+			 mds->persistent_only_bytes, CXL_PARTMODE_PMEM);
+		return 0;
+	}
+
+	rc = cxl_mem_get_partition_info(mds);
+	if (rc) {
+		dev_err(dev, "Failed to query partition information\n");
+		return rc;
+	}
+
+	add_part(info, 0, mds->active_volatile_bytes, CXL_PARTMODE_RAM);
+	add_part(info, mds->active_volatile_bytes, mds->active_persistent_bytes,
+		 CXL_PARTMODE_PMEM);
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_mem_dpa_fetch, "CXL");
+
+
+/**
+ * cxl_set_capacity: initialize dpa by a driver without a mailbox.
+ *
+ * @cxlds: pointer to cxl_dev_state
+ * @capacity: device volatile memory size
+ */
+int cxl_set_capacity(struct cxl_dev_state *cxlds, u64 capacity)
+{
+	struct cxl_dpa_info range_info = {
+		.size = capacity,
+	};
+
+	add_part(&range_info, 0, capacity, CXL_PARTMODE_RAM);
+	return cxl_dpa_setup(cxlds, &range_info);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_set_capacity, "CXL");
+
 /**
  * set_exclusive_cxl_commands() - atomically disable user cxl commands
  * @mds: The device state to operate on
diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c
index 704b0ebae937..18b535b3ea40 100644
--- a/drivers/net/ethernet/sfc/efx_cxl.c
+++ b/drivers/net/ethernet/sfc/efx_cxl.c
@@ -68,6 +68,11 @@ int efx_cxl_init(struct efx_probe_data *probe_data)
 	 */
 	cxl->cxlds.media_ready = true;
 
+	if (cxl_set_capacity(&cxl->cxlds, EFX_CTPIO_BUFFER_SIZE)) {
+		pci_err(pci_dev, "dpa capacity setup failed\n");
+		return -ENODEV;
+	}
+
 	probe_data->cxl = cxl;
 
 	return 0;
diff --git a/include/cxl/cxl.h b/include/cxl/cxl.h
index 016c74fb747c..802b143de83d 100644
--- a/include/cxl/cxl.h
+++ b/include/cxl/cxl.h
@@ -226,4 +226,6 @@ struct cxl_dev_state *_devm_cxl_dev_state_create(struct device *dev,
 
 struct cxl_memdev *devm_cxl_probe_mem(struct cxl_dev_state *cxlds,
 				      struct range *range);
+
+int cxl_set_capacity(struct cxl_dev_state *cxlds, u64 capacity);
 #endif /* __CXL_CXL_H__ */
-- 
2.34.1


^ permalink raw reply related

* [PATCH v28 4/5] sfc: obtain and map cxl range using devm_cxl_probe_mem
From: alejandro.lucero-palau @ 2026-06-18 18:18 UTC (permalink / raw)
  To: linux-cxl, netdev, djbw, edward.cree, davem, kuba, pabeni,
	edumazet, dave.jiang
  Cc: Alejandro Lucero
In-Reply-To: <20260618181806.118745-1-alejandro.lucero-palau@amd.com>

From: Alejandro Lucero <alucerop@amd.com>

Use the core API to safely obtain the CXL range linked to an HDM committed
by the BIOS. Map that range for use as the ctpio buffer.

A potential user space action, such as unbinding through sysfs or removing
the core cxl modules, will trigger sfc driver device detachment. That case
does not race with this mapping, because the mapping is done during driver
probe and is therefore protected by the device lock against those user
space actions.

Signed-off-by: Alejandro Lucero <alucerop@amd.com>
---
 drivers/net/ethernet/sfc/efx.c     |  2 ++
 drivers/net/ethernet/sfc/efx_cxl.c | 23 +++++++++++++++++++++++
 drivers/net/ethernet/sfc/efx_cxl.h |  3 +++
 3 files changed, 28 insertions(+)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index da008462096d..abfa0ce2b4d1 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -984,6 +984,7 @@ static void efx_pci_remove(struct pci_dev *pci_dev)
 	efx_fini_io(efx);
 
 	probe_data = container_of(efx, struct efx_probe_data, efx);
+	efx_cxl_exit(probe_data);
 
 	pci_dbg(efx->pci_dev, "shutdown successful\n");
 
@@ -1244,6 +1245,7 @@ static int efx_pci_probe(struct pci_dev *pci_dev,
  fail3:
 	efx_fini_io(efx);
  fail2:
+	efx_cxl_exit(probe_data);
 	efx_fini_struct(efx);
  fail1:
 	WARN_ON(rc > 0);
diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c
index 18b535b3ea40..3e7c950f83e9 100644
--- a/drivers/net/ethernet/sfc/efx_cxl.c
+++ b/drivers/net/ethernet/sfc/efx_cxl.c
@@ -18,6 +18,7 @@ int efx_cxl_init(struct efx_probe_data *probe_data)
 {
 	struct efx_nic *efx = &probe_data->efx;
 	struct pci_dev *pci_dev = efx->pci_dev;
+	struct range cxl_pio_range;
 	struct efx_cxl *cxl;
 	u16 dvsec;
 	int rc;
@@ -73,9 +74,31 @@ int efx_cxl_init(struct efx_probe_data *probe_data)
 		return -ENODEV;
 	}
 
+	cxl->cxlmd = devm_cxl_probe_mem(&cxl->cxlds, &cxl_pio_range);
+	if (IS_ERR(cxl->cxlmd)) {
+		pci_err(pci_dev, "CXL accel memdev creation failed\n");
+		return PTR_ERR(cxl->cxlmd);
+	}
+
+	cxl->ctpio_cxl = ioremap_wc(cxl_pio_range.start,
+				    range_len(&cxl_pio_range));
+	if (!cxl->ctpio_cxl) {
+		pci_err(pci_dev, "CXL ioremap region (%pra) failed\n",
+			&cxl_pio_range);
+		return -ENOMEM;
+	}
+
 	probe_data->cxl = cxl;
 
 	return 0;
 }
 
+void efx_cxl_exit(struct efx_probe_data *probe_data)
+{
+	if (!probe_data->cxl)
+		return;
+
+	iounmap(probe_data->cxl->ctpio_cxl);
+}
+
 MODULE_IMPORT_NS("CXL");
diff --git a/drivers/net/ethernet/sfc/efx_cxl.h b/drivers/net/ethernet/sfc/efx_cxl.h
index 04e46278464d..3e2705cb063f 100644
--- a/drivers/net/ethernet/sfc/efx_cxl.h
+++ b/drivers/net/ethernet/sfc/efx_cxl.h
@@ -20,10 +20,13 @@ struct efx_probe_data;
 struct efx_cxl {
 	struct cxl_dev_state cxlds;
 	struct cxl_memdev *cxlmd;
+	void __iomem *ctpio_cxl;
 };
 
 int efx_cxl_init(struct efx_probe_data *probe_data);
+void efx_cxl_exit(struct efx_probe_data *probe_data);
 #else
 static inline int efx_cxl_init(struct efx_probe_data *probe_data) { return 0; }
+static inline void efx_cxl_exit(struct efx_probe_data *probe_data) {}
 #endif
 #endif
-- 
2.34.1


^ permalink raw reply related

* [PATCH v28 5/5] sfc: support pio mapping based on cxl
From: alejandro.lucero-palau @ 2026-06-18 18:18 UTC (permalink / raw)
  To: linux-cxl, netdev, djbw, edward.cree, davem, kuba, pabeni,
	edumazet, dave.jiang
  Cc: Alejandro Lucero
In-Reply-To: <20260618181806.118745-1-alejandro.lucero-palau@amd.com>

From: Alejandro Lucero <alucerop@amd.com>

A PIO buffer is a region of device memory to which the driver can write a
packet for TX, with the device handling the transmit doorbell without
requiring a DMA to get the packet data, which helps reduce latency
in certain exchanges. With CXL mem protocol this latency can be lowered
further.

With a device supporting CXL and successfully initialised, use the cxl
region to map the memory range and use this mapping for PIO buffers.

Signed-off-by: Alejandro Lucero <alucerop@amd.com>
---
 drivers/net/ethernet/sfc/ef10.c       | 41 ++++++++++++++++++++++-----
 drivers/net/ethernet/sfc/efx.h        |  1 -
 drivers/net/ethernet/sfc/efx_cxl.c    |  1 +
 drivers/net/ethernet/sfc/net_driver.h |  1 +
 drivers/net/ethernet/sfc/nic.h        |  3 ++
 5 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c
index 7e04f115bbaa..73bc064929f6 100644
--- a/drivers/net/ethernet/sfc/ef10.c
+++ b/drivers/net/ethernet/sfc/ef10.c
@@ -24,6 +24,7 @@
 #include <linux/wait.h>
 #include <linux/workqueue.h>
 #include <net/udp_tunnel.h>
+#include "efx_cxl.h"
 
 /* Hardware control for EF10 architecture including 'Huntington'. */
 
@@ -106,7 +107,7 @@ static int efx_ef10_get_vf_index(struct efx_nic *efx)
 
 static int efx_ef10_init_datapath_caps(struct efx_nic *efx)
 {
-	MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_CAPABILITIES_V4_OUT_LEN);
+	MCDI_DECLARE_BUF(outbuf, MC_CMD_GET_CAPABILITIES_V7_OUT_LEN);
 	struct efx_ef10_nic_data *nic_data = efx->nic_data;
 	size_t outlen;
 	int rc;
@@ -177,6 +178,12 @@ static int efx_ef10_init_datapath_caps(struct efx_nic *efx)
 			  efx->num_mac_stats);
 	}
 
+	if (outlen < MC_CMD_GET_CAPABILITIES_V7_OUT_LEN)
+		nic_data->datapath_caps3 = 0;
+	else
+		nic_data->datapath_caps3 = MCDI_DWORD(outbuf,
+						      GET_CAPABILITIES_V7_OUT_FLAGS3);
+
 	return 0;
 }
 
@@ -1140,6 +1147,9 @@ static int efx_ef10_dimension_resources(struct efx_nic *efx)
 	unsigned int channel_vis, pio_write_vi_base, max_vis;
 	struct efx_ef10_nic_data *nic_data = efx->nic_data;
 	unsigned int uc_mem_map_size, wc_mem_map_size;
+#ifdef CONFIG_SFC_CXL
+	struct efx_probe_data *probe_data;
+#endif
 	void __iomem *membase;
 	int rc;
 
@@ -1263,8 +1273,23 @@ static int efx_ef10_dimension_resources(struct efx_nic *efx)
 	iounmap(efx->membase);
 	efx->membase = membase;
 
-	/* Set up the WC mapping if needed */
-	if (wc_mem_map_size) {
+	if (!wc_mem_map_size)
+		goto skip_pio;
+
+	/* Set up the WC mapping */
+
+#ifdef CONFIG_SFC_CXL
+	probe_data = container_of(efx, struct efx_probe_data, efx);
+	if ((nic_data->datapath_caps3 &
+	    (1 << MC_CMD_GET_CAPABILITIES_V7_OUT_CXL_CONFIG_ENABLE_LBN)) &&
+	    probe_data->cxl_pio_initialised) {
+		/* Using PIO through CXL mapping */
+		nic_data->pio_write_base = probe_data->cxl->ctpio_cxl;
+		nic_data->pio_write_vi_base = pio_write_vi_base;
+	} else
+#endif
+	{
+		/* Using legacy PIO BAR mapping */
 		nic_data->wc_membase = ioremap_wc(efx->membase_phys +
 						  uc_mem_map_size,
 						  wc_mem_map_size);
@@ -1279,12 +1304,14 @@ static int efx_ef10_dimension_resources(struct efx_nic *efx)
 			nic_data->wc_membase +
 			(pio_write_vi_base * efx->vi_stride + ER_DZ_TX_PIOBUF -
 			 uc_mem_map_size);
-
-		rc = efx_ef10_link_piobufs(efx);
-		if (rc)
-			efx_ef10_free_piobufs(efx);
 	}
 
+	rc = efx_ef10_link_piobufs(efx);
+	if (rc)
+		efx_ef10_free_piobufs(efx);
+
+skip_pio:
+
 	netif_dbg(efx, probe, efx->net_dev,
 		  "memory BAR at %pa (virtual %p+%x UC, %p+%x WC)\n",
 		  &efx->membase_phys, efx->membase, uc_mem_map_size,
diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h
index 45e191686625..057d30090894 100644
--- a/drivers/net/ethernet/sfc/efx.h
+++ b/drivers/net/ethernet/sfc/efx.h
@@ -236,5 +236,4 @@ static inline bool efx_rwsem_assert_write_locked(struct rw_semaphore *sem)
 
 int efx_xdp_tx_buffers(struct efx_nic *efx, int n, struct xdp_frame **xdpfs,
 		       bool flush);
-
 #endif /* EFX_EFX_H */
diff --git a/drivers/net/ethernet/sfc/efx_cxl.c b/drivers/net/ethernet/sfc/efx_cxl.c
index 3e7c950f83e9..348d7404cd7a 100644
--- a/drivers/net/ethernet/sfc/efx_cxl.c
+++ b/drivers/net/ethernet/sfc/efx_cxl.c
@@ -88,6 +88,7 @@ int efx_cxl_init(struct efx_probe_data *probe_data)
 		return -ENOMEM;
 	}
 
+	probe_data->cxl_pio_initialised = true;
 	probe_data->cxl = cxl;
 
 	return 0;
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index de3fc9537662..3964b2c56609 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -1213,6 +1213,7 @@ struct efx_probe_data {
 	struct efx_nic efx;
 #ifdef CONFIG_SFC_CXL
 	struct efx_cxl *cxl;
+	bool cxl_pio_initialised;
 #endif
 };
 
diff --git a/drivers/net/ethernet/sfc/nic.h b/drivers/net/ethernet/sfc/nic.h
index ec3b2df43b68..7480f9995dfb 100644
--- a/drivers/net/ethernet/sfc/nic.h
+++ b/drivers/net/ethernet/sfc/nic.h
@@ -152,6 +152,8 @@ enum {
  *	%MC_CMD_GET_CAPABILITIES response)
  * @datapath_caps2: Further Capabilities of datapath firmware (FLAGS2 field of
  * %MC_CMD_GET_CAPABILITIES response)
+ * @datapath_caps3: Further Capabilities of datapath firmware (FLAGS3 field of
+ * %MC_CMD_GET_CAPABILITIES response)
  * @rx_dpcpu_fw_id: Firmware ID of the RxDPCPU
  * @tx_dpcpu_fw_id: Firmware ID of the TxDPCPU
  * @must_probe_vswitching: Flag: vswitching has yet to be setup after MC reboot
@@ -187,6 +189,7 @@ struct efx_ef10_nic_data {
 	bool must_check_datapath_caps;
 	u32 datapath_caps;
 	u32 datapath_caps2;
+	u32 datapath_caps3;
 	unsigned int rx_dpcpu_fw_id;
 	unsigned int tx_dpcpu_fw_id;
 	bool must_probe_vswitching;
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH net] igb: only strip Rx timestamp header on the first buffer of a frame
From: Tony Nguyen @ 2026-06-18 17:38 UTC (permalink / raw)
  To: Kurt Kanzenbach, Tjerk Kusters, netdev@vger.kernel.org
  Cc: intel-wired-lan@lists.osuosl.org, przemyslaw.kitszel@intel.com,
	andrew+netdev@lunn.ch, davem@davemloft.net, edumazet@google.com,
	kuba@kernel.org, pabeni@redhat.com, richardcochran@gmail.com,
	hawk@kernel.org, stable@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <8733yojljf.fsf@jax.kurt.home>



On 6/15/2026 12:43 AM, Kurt Kanzenbach wrote:
> Hi,
> 
> On Fri Jun 12 2026, Tjerk Kusters wrote:
>> Hi,
>>
>> The patch is attached (0001-igb-only-strip-Rx-timestamp-header-on-the-first-buff.patch)
>> as my mail setup cannot send it inline via git send-email; apologies for the
>> attachment.
> 
> b4 has a web submission endpoint. Maybe you can use that one:
> 
> https://b4.docs.kernel.org/en/latest/contributor/send.html
Hi Tjerk,

It would be great if you could get this set up, as it makes patch handling
easier.

> [snip]
> 
>>  From fee3e3452dfcd7e109332369672a3e0090cadeb3 Mon Sep 17 00:00:00 2001
>> From: T Kusters <tkusters@aweta.nl>
>> Date: Tue, 9 Jun 2026 14:06:24 +0200
>> Subject: [PATCH net] igb: only strip Rx timestamp header on the first buffer
>>   of a frame
>>
>> When Rx hardware timestamping is enabled (e.g. ptp4l, which configures
>> HWTSTAMP_FILTER_ALL), the NIC prepends a 16-byte timestamp header to the
>> first Rx buffer of every received frame. igb_clean_rx_irq() strips this
>> header inside its per-buffer loop:
>>
>> 	if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) {
>> 		ts_hdr_len = igb_ptp_rx_pktstamp(rx_ring->q_vector,
>> 						 pktbuf, &timestamp);
>> 		pkt_offset += ts_hdr_len;
>> 		size -= ts_hdr_len;
>> 	}
>>
>> For a frame that spans more than one Rx buffer (e.g. a jumbo frame), this
>> block runs once per buffer. The timestamp header only exists at the start
>> of the first buffer, but igb_ptp_rx_pktstamp() is called for every buffer.
>>
>> On a continuation buffer the data is packet payload, not a timestamp
>> header. igb_ptp_rx_pktstamp() already has two guards against acting on a
>> non-header buffer: it returns 0 if PTP is disabled, and returns 0 if the
>> reserved dwords (the first 8 bytes) are non-zero. Neither is sufficient
>> here: PTP is enabled, and a continuation buffer whose payload happens to
>> begin with 8 zero bytes passes the reserved-dword check. In that case the
>> payload is mistaken for a valid timestamp header and igb_ptp_rx_pktstamp()
>> returns IGB_TS_HDR_LEN, so the caller strips 16 bytes of real data from
>> that buffer. A frame spanning N buffers whose continuation buffers start
>> with zero bytes therefore loses 16 * (N - 1) bytes from its tail.
>>
>> This is easily triggered by a GigE Vision camera streaming dark frames
>> (mostly 0x00 pixel data) over jumbo UDP with PTP active on the receiver:
>> the all-zero frames arrive truncated while frames with non-zero content
>> are fine. There is no error indication.
>>
>> No content-based check can reliably tell a continuation buffer that begins
>> with zero bytes from a real timestamp header, because both are all zero.
>> Fix it structurally instead: only attempt the strip on the first buffer of
>> a frame, which is the only buffer that can contain a timestamp header. In
>> igb_clean_rx_irq() skb is NULL until the first buffer has been processed,
>> so guarding the strip with !skb restricts it to the first buffer
>> regardless of payload content.
>>
>> Fixes: 5379260852b0 ("igb: Fix XDP with PTP enabled")
>> Cc: stable@vger.kernel.org
>> Signed-off-by: T Kusters <tkusters@aweta.nl>

Sign off should be your full name.

Thanks,
Tony

> Great explanation! igb_clean_rx_irq_zc() does not need the same
> treatment, correct?
> 
> Reviewed-by: Kurt Kanzenbach <kurt@linutronix.de>
> 
>> ---
>>   drivers/net/ethernet/intel/igb/igb_main.c | 3 ++-
>>   1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
>> index ce91dda00ec0..abb55cd589a9 100644
>> --- a/drivers/net/ethernet/intel/igb/igb_main.c
>> +++ b/drivers/net/ethernet/intel/igb/igb_main.c
>> @@ -9061,7 +9061,8 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
>>   		pktbuf = page_address(rx_buffer->page) + rx_buffer->page_offset;
>>   
>>   		/* pull rx packet timestamp if available and valid */
>> -		if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) {
>> +		if (!skb &&
>> +		    igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) {
>>   			int ts_hdr_len;
>>   
>>   			ts_hdr_len = igb_ptp_rx_pktstamp(rx_ring->q_vector,
>> -- 
>> 2.27.0
>>


^ permalink raw reply

* Re: [PATCH bpf v3 1/2] bpf, sockmap: fix use-after-free when the stream parser resizes the skb
From: John Fastabend @ 2026-06-18 18:01 UTC (permalink / raw)
  To: Jiayuan Chen; +Cc: netdev, bpf, linux-kernel, Jakub Kicinski, Sechang Lim
In-Reply-To: <34f330b8-60d2-4647-a6b4-a5b001c3715d@linux.dev>

On Thu, Jun 18, 2026 at 07:56:34PM +0800, Jiayuan Chen wrote:
>
>On 6/18/26 6:27 PM, Sechang Lim wrote:
>>sk_psock_strp_parse() runs the BPF_PROG_TYPE_SK_SKB stream-parser program
>>to find the length of the next message. strparser assembles a message out
>>of several received skbs by chaining them onto the head's frag_list and
>>recording where to append the next one in strp->skb_nextp:
>>
>>	*strp->skb_nextp = skb;
>>	strp->skb_nextp = &skb->next;
>>
>>and then calls the parser on the head:
>>
>>	len = (*strp->cb.parse_msg)(strp, head);
>
>[...]
>
>>unaffected and may still modify the skb.
>>
>>Fixes: 8a31db561566 ("bpf: add access to sock fields and pkt data from sk_skb programs")
>
>Is the Fixes tag correct ?
>
>Anyway, I don't think this patch is a fix; it's more of a hardening. 
>So no Fixes tag needed, IMO.
>
>
>>Signed-off-by: Sechang Lim <rhkrqnwk98@gmail.com>
>>---

[...]

>
>
>CI failed:
>https://github.com/kernel-patches/bpf/actions/runs/27754218839/job/82113319982
>   Failed stream parser bpf prog attach
>
>Hi John
>I noticed that bpf_skb_pull_data was added to the skmsg test:
>https://github.com/torvalds/linux/commit/82a8616889d506cb690cfc0afb2ccadda120461d
>
>Can we drop bpf_skb_pull_data in the parser prog (sockmap_parse_prog.c)?
>And are there any scenarios where we need to modify skb len when using 
>strparser ?

We should never modify the skb from strparser. Just remove any tests
that do this and state it's not safe. We haven't used strparser progs
for a long time anyways.

^ permalink raw reply

* [PATCH net] eth: bnxt: improve the timing of stats
From: Jakub Kicinski @ 2026-06-18 18:13 UTC (permalink / raw)
  To: davem
  Cc: netdev, edumazet, pabeni, andrew+netdev, horms, Jakub Kicinski,
	michael.chan, pavan.chebbi

Kernel selftests wait 1.25x of the promised stats refresh time
(as read from ethtool -c). bnxt reports 1sec by default, but
the stats update process has two steps. First device DMAs the
new values, then the service task performs update in full-width
SW counters. So the worst case delay is actually 2x.

Note that there is bnxt_hwrm_port_qstats() but the qstats here
probably stands for "query stats", and the command itself
updates detailed MAC-level stats (MAC errors, RMON histogram etc.)
It must not be updating the stats we care about, otherwise
update would be synchronous, and this patch would make no
difference (and it does help).

The problem of stale stats impacts not only tests but real workloads
which monitor egress bandwidth of a NIC. The inaccuracy causes double
counting in the next cycle and spurious overload alarms.

Try to read from the DMA buffer more aggressively, to mitigate
timing issues between DMA and service task. The SW update should
be cheap.

Fixes: 51f307856b60 ("bnxt_en: Allow statistics DMA to be configurable using ethtool -C.")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
CC: michael.chan@broadcom.com
CC: pavan.chebbi@broadcom.com

With this patch I had 50 clean runs of ntuple.py in a row.
Previously it'd fail within 5 runs at most.

Hopefully this is good enough, in the past I sent an RFC to
convert the driver to use SW stats for everything. That felt
a little drastic.
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.h     |  4 +++
 drivers/net/ethernet/broadcom/bnxt/bnxt.c     | 36 +++++++++++++++++++
 .../net/ethernet/broadcom/bnxt/bnxt_ethtool.c |  8 +++++
 3 files changed, 48 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 6d312259f852..aab6e88c3ca1 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -2620,6 +2620,9 @@ struct bnxt {
 #define BNXT_MIN_STATS_COAL_TICKS	  250000
 #define BNXT_MAX_STATS_COAL_TICKS	 1000000
 
+	spinlock_t		stats_lock;
+	unsigned long		stats_updated_jiffies;
+
 	struct work_struct	sp_task;
 	unsigned long		sp_event;
 #define BNXT_RX_NTP_FLTR_SP_EVENT	1
@@ -3027,6 +3030,7 @@ void bnxt_reenable_sriov(struct bnxt *bp);
 void bnxt_close_nic(struct bnxt *, bool, bool);
 void bnxt_get_ring_drv_stats(struct bnxt *bp,
 			     struct bnxt_total_ring_drv_stats *stats);
+void bnxt_sync_stats(struct bnxt *bp);
 bool bnxt_rfs_capable(struct bnxt *bp, bool new_rss_ctx);
 int bnxt_dbg_hwrm_rd_reg(struct bnxt *bp, u32 reg_off, u16 num_words,
 			 u32 *reg_buf);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 055e93a417b6..25462f854478 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -10575,6 +10575,35 @@ static void bnxt_accumulate_all_stats(struct bnxt *bp)
 	}
 }
 
+/* Re-accumulate stats from DMA buffers if stale.
+ * uAPIs for reading sw_stats should call this first.
+ *
+ * We promise user space update frequency of bp->stats_coal_ticks but
+ * the update is a two step process - first device updates the DMA buffer,
+ * then we have to update from that buffer to driver stats in the service work.
+ * Worst case we would be 2x off from the desired frequency.
+ * Sync the stats sooner, if stale. The 20% threshold was chosen arbitrarily.
+ *
+ * Ideally we would split the user-configured time into two portions,
+ * i.e. also lower the DMA period by the 20%. But the DMA timer seems to have
+ * too coarse granularity to play such tricks.
+ */
+void bnxt_sync_stats(struct bnxt *bp)
+{
+	unsigned long stale;
+
+	if (!netif_running(bp->dev) || !bp->stats_coal_ticks)
+		return;
+
+	spin_lock(&bp->stats_lock);
+	stale = usecs_to_jiffies(bp->stats_coal_ticks / 5);
+	if (time_after_eq(jiffies, bp->stats_updated_jiffies + stale)) {
+		bnxt_accumulate_all_stats(bp);
+		bp->stats_updated_jiffies = jiffies;
+	}
+	spin_unlock(&bp->stats_lock);
+}
+
 static int bnxt_hwrm_port_qstats(struct bnxt *bp, u8 flags)
 {
 	struct hwrm_port_qstats_input *req;
@@ -13577,6 +13606,7 @@ bnxt_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 		return;
 	}
 
+	bnxt_sync_stats(bp);
 	bnxt_get_ring_stats(bp, stats);
 	bnxt_add_prev_stats(bp, stats);
 
@@ -14753,7 +14783,10 @@ static void bnxt_sp_task(struct work_struct *work)
 	if (test_and_clear_bit(BNXT_PERIODIC_STATS_SP_EVENT, &bp->sp_event)) {
 		bnxt_hwrm_port_qstats(bp, 0);
 		bnxt_hwrm_port_qstats_ext(bp, 0);
+		spin_lock(&bp->stats_lock);
 		bnxt_accumulate_all_stats(bp);
+		bp->stats_updated_jiffies = jiffies;
+		spin_unlock(&bp->stats_lock);
 	}
 
 	if (test_and_clear_bit(BNXT_LINK_CHNG_SP_EVENT, &bp->sp_event)) {
@@ -15488,6 +15521,7 @@ static int bnxt_init_board(struct pci_dev *pdev, struct net_device *dev)
 	INIT_DELAYED_WORK(&bp->fw_reset_task, bnxt_fw_reset_task);
 
 	spin_lock_init(&bp->ntp_fltr_lock);
+	spin_lock_init(&bp->stats_lock);
 #if BITS_PER_LONG == 32
 	spin_lock_init(&bp->db_lock);
 #endif
@@ -16056,6 +16090,7 @@ static void bnxt_get_queue_stats_rx(struct net_device *dev, int i,
 	if (!bp->bnapi)
 		return;
 
+	bnxt_sync_stats(bp);
 	cpr = &bp->bnapi[i]->cp_ring;
 	sw = cpr->stats.sw_stats;
 
@@ -16084,6 +16119,7 @@ static void bnxt_get_queue_stats_tx(struct net_device *dev, int i,
 	if (!bp->tx_ring)
 		return;
 
+	bnxt_sync_stats(bp);
 	bnapi = bp->tx_ring[bp->tx_ring_map[i]].bnapi;
 	sw = bnapi->cp_ring.stats.sw_stats;
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 56d74a3c24b7..835b54287579 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -606,6 +606,7 @@ static void bnxt_get_ethtool_stats(struct net_device *dev,
 		goto skip_ring_stats;
 	}
 
+	bnxt_sync_stats(bp);
 	tpa_stats = bnxt_get_num_tpa_ring_stats(bp);
 	for (i = 0; i < bp->cp_nr_rings; i++) {
 		struct bnxt_napi *bnapi = bp->bnapi[i];
@@ -3310,6 +3311,7 @@ static void bnxt_get_fec_stats(struct net_device *dev,
 	if (BNXT_VF(bp) || !(bp->flags & BNXT_FLAG_PORT_STATS_EXT))
 		return;
 
+	bnxt_sync_stats(bp);
 	rx = bp->rx_port_stats_ext.sw_stats;
 	fec_stats->corrected_bits.total =
 		*(rx + BNXT_RX_STATS_EXT_OFFSET(rx_corrected_bits));
@@ -3409,6 +3411,7 @@ static void bnxt_get_pause_stats(struct net_device *dev,
 	if (BNXT_VF(bp) || !(bp->flags & BNXT_FLAG_PORT_STATS))
 		return;
 
+	bnxt_sync_stats(bp);
 	rx = bp->port_stats.sw_stats;
 	tx = bp->port_stats.sw_stats + BNXT_TX_PORT_STATS_BYTE_OFFSET / 8;
 
@@ -5572,6 +5575,7 @@ static void bnxt_get_eth_phy_stats(struct net_device *dev,
 	if (BNXT_VF(bp) || !(bp->flags & BNXT_FLAG_PORT_STATS_EXT))
 		return;
 
+	bnxt_sync_stats(bp);
 	rx = bp->rx_port_stats_ext.sw_stats;
 	phy_stats->SymbolErrorDuringCarrier =
 		*(rx + BNXT_RX_STATS_EXT_OFFSET(rx_pcs_symbol_err));
@@ -5586,6 +5590,7 @@ static void bnxt_get_eth_mac_stats(struct net_device *dev,
 	if (BNXT_VF(bp) || !(bp->flags & BNXT_FLAG_PORT_STATS))
 		return;
 
+	bnxt_sync_stats(bp);
 	rx = bp->port_stats.sw_stats;
 	tx = bp->port_stats.sw_stats + BNXT_TX_PORT_STATS_BYTE_OFFSET / 8;
 
@@ -5610,6 +5615,7 @@ static void bnxt_get_eth_ctrl_stats(struct net_device *dev,
 	if (BNXT_VF(bp) || !(bp->flags & BNXT_FLAG_PORT_STATS))
 		return;
 
+	bnxt_sync_stats(bp);
 	rx = bp->port_stats.sw_stats;
 	ctrl_stats->MACControlFramesReceived =
 		BNXT_GET_RX_PORT_STATS64(rx, rx_ctrl_frames);
@@ -5639,6 +5645,7 @@ static void bnxt_get_rmon_stats(struct net_device *dev,
 	if (BNXT_VF(bp) || !(bp->flags & BNXT_FLAG_PORT_STATS))
 		return;
 
+	bnxt_sync_stats(bp);
 	rx = bp->port_stats.sw_stats;
 	tx = bp->port_stats.sw_stats + BNXT_TX_PORT_STATS_BYTE_OFFSET / 8;
 
@@ -5712,6 +5719,7 @@ static void bnxt_get_link_ext_stats(struct net_device *dev,
 	if (BNXT_VF(bp) || !(bp->flags & BNXT_FLAG_PORT_STATS_EXT))
 		return;
 
+	bnxt_sync_stats(bp);
 	rx = bp->rx_port_stats_ext.sw_stats;
 	stats->link_down_events =
 		*(rx + BNXT_RX_STATS_EXT_OFFSET(link_down_events));
-- 
2.54.0


^ permalink raw reply related

* Re: [PATCH 1/2] fs: Add bpf_sock_read_xattr() kfunc to read socket xattrs
From: John Fastabend @ 2026-06-18 18:20 UTC (permalink / raw)
  To: Christian Brauner
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Alexei Starovoitov, Daniel Borkmann, Alexander Viro, Jan Kara,
	Simon Horman, Kuniyuki Iwashima, Willem de Bruijn, linux-fsdevel,
	netdev, bpf, Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman,
	Kumar Kartikeya Dwivedi, Song Liu, Yonghong Song, Jiri Olsa
In-Reply-To: <20260617-work-bpf-sock-xattr-v1-1-a1276f7c9da3@kernel.org>

On Wed, Jun 17, 2026 at 01:18:27PM +0200, Christian Brauner wrote:
>In c8db08110cbe ("Merge tag 'vfs-7.1-rc1.xattr' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs")
>we added support for extended attributes for sockets. This comes in two
>flavors: sockfs and non-sockfs/filesystem sockets. Filesystem sockets
>are actual filesystem objects so reading xattrs must use dedicated fs
>helpers such as bpf_get_dentry_xattr() and bpf_get_file_xattr(). Those
>are inherently sleeping operations. Sockfs sockets on the other hand
>don't need to use sleeping operations as the underlying data structure
>is lockless. In addition, retrieval of sockfs extended attributes often
>happens from LSM hooks that only provide struct socket and it's
>completely nonsensical to grab a reference to a file, then force a
>sleeping operation to retrieve the xattr and drop the reference. We know
>that the sockfs file cannot go away while the LSM hook runs.

[...]

>
>Link: https://github.com/systemd/systemd/pull/40559 [1]
>Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
>---

Nice this will simplify some of our socket tracking.

Reviewed-by: John Fastabend <john.fastabend@gmail.com>

^ permalink raw reply

* [PATCH v2 bpf-next 0/2] bpf: bpf_redirect_peer egress redirection
From: Jordan Rife @ 2026-06-18 18:20 UTC (permalink / raw)
  To: bpf
  Cc: Jordan Rife, netdev, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Martin KaFai Lau, Stanislav Fomichev,
	Jiayuan Chen, Paul Chaignon

We have several use cases where a pod injects traffic into the datapath
of another so that the traffic appears to have originated from that
pod. One such use case is a synthetic flow generator which injects
synthetic traffic into a pod's datapath to enable dynamic probing and
debugging. Another is a transparent proxy where connections originating
from one pod are redirected towards another which proxies that
connection. The new connection is bound to the IP of the original pod
using IP_TRANSPARENT and its traffic is injected into that pod's
datapath and handled as if it had originated there. This can be used for
mTLS, etc.

We use bpf_redirect(BPF_F_INGRESS) to direct traffic leaving the proxy,
flow generator, etc. towards the target pod, ensuring that eBPF programs
that are meant to intercept traffic leaving that pod are executed.
However, this doesn't work with netkit.

With netkit, an ingress redirection from proxy to workload skips eBPF
programs that are meant to intercept traffic leaving the pod, since they
reside on the netkit peer device. One workaround is to attach the
same program to both the netkit peer device and the TCX ingress hook for
the netkit pair's primary interface, but

a) This seems hacky and we need to be careful not to run the same
   program twice for the same skb in cases where we want to pass that
   traffic to the host stack.
b) We're trying to keep the proxy redirection / traffic injection
   systems as modular and separated from Cilium as possible, the system
   that manages netkit setup and core eBPF programming.

It would be handy if instead we could redirect traffic directly from
one netkit peer device to another. This patch proposes an extension
to bpf_redirect_peer to allow us to do just that.

With this patch, the BPF_F_EGRESS flag tells bpf_redirect_peer to emit
the skb in the egress direction of the target interface's peer device.
While the main use case is netkit, I suppose you could also use this
mode with veth as well if, e.g., there were some eBPF programs attached
to that side of the veth pair that needed to intercept traffic.

 +---------------------------------------------------------------------+
 | +-------------------------+         6. bpf_redirect_neigh(eth0)     |
 | | pod (10.244.0.10)       |           ------------------------      |
 | |                         |          |                        |     |
 | |              +--------+ |          |      +---------+       |     |
 | | 1. packet -->|        | |          |      |         |       |     |
 | |    leaves ^  | netkit |<===========|======| netkit  |       |     |
 | |           |  | peer   |=======(eBPF)=====>| primary |       |     |
 | |           |  |        | |          |      |         |       |     |
 | |           |  +--------+ |          |      +---------+       |     |
 | |           |             |          | 2. bpf_redirect        v     |
 | +-----------|-------------+          |___________________   +-------|
 |             |                                            |  | eth0  |
 |             | 5. bpf_redirect_peer(BPF_F_EGRESS)         |  +-------|
 |             |________________________                    |          |
 | +-------------------------+          |                   |          |
 | | proxy (10.244.0.11)     |          |                   |          |
 | | IP_TRANSPARENT          |          |                   |          |
 | |              +--------+ |          |      +---------+  |          |
 | | 3. packet <--|        | |          |      |         |<--          |
 | |    enters    | netkit |<===========|======| netkit  |             |
 | |    [proxy]   | peer   |=======(eBPF)=====>| primary |             |
 | | 4. packet -->|        | |                 |         |             |
 | |    leaves    +--------+ |                 +---------+             |
 | |    sip=10.244.0.10      |                                         |
 | +-------------------------+                                         |
 +---------------------------------------------------------------------+

Using the proxy use case as an example, in step 5 we would redirect
traffic leaving the proxy towards the pod's peer device using
bpf_redirect_peer(BPF_F_EGRESS).

As a bonus, since the skb doesn't have to go through the backlog queue
it can take full advantage of netkit's performance benefits. I set up a
test where outgoing iperf3 traffic is injected into the datapath of
another pod using either bpf_redirect_peer(BPF_F_EGRESS) or
bpf_redirect(BPF_F_INGRESS). I used Cilium's eBPF host routing mode
which skips the host stack and uses BPF redirect helpers to do all the
routing.

  (net.ipv4.tcp_congestion_control=cubic,mtu=1500,100GiB link,Cilium
   eBPF host routing mode)

BASELINE [bpf_redirect(BPF_F_INGRESS)]
  1. [iperf pod] ==bpf_redirect([pod b], BPF_F_INGRESS)==> [pod b]
  2. [pod b]     ==bpf_redirect_neigh([eth0])==>           eth0
  3. eth0        ==over network==>                         [host b]

  [ ID] Interval           Transfer     Bitrate         Retr
  [  5]   0.00-60.00  sec   231 GBytes  33.0 Gbits/sec  12060     sender
  [  5]   0.00-60.00  sec   230 GBytes  33.0 Gbits/sec            receiver

TEST [bpf_redirect_peer(BPF_F_EGRESS)]
  1. [iperf pod] ==bpf_redirect_peer([pod b], BPF_F_EGRESS)==> [pod b]
  2. [pod b]     ==bpf_redirect_neigh([eth0])==>               eth0
  3. eth0        ==over network==>                             [host b]

  [ ID] Interval           Transfer     Bitrate         Retr
  [  5]   0.00-60.00  sec   272 GBytes  38.9 Gbits/sec    0       sender
  [  5]   0.00-60.00  sec   272 GBytes  38.9 Gbits/sec            receiver

In this test, using bpf_redirect_peer(BPF_F_EGRESS) for the hop from
[iperf pod] to [pod b] led to ~18% more throughput compared to
bpf_redirect(BPF_F_INGRESS).

CHANGES
=======
v1->v2: https://lore.kernel.org/bpf/20260613183424.1198073-1-jordan@jrife.io/
* Introduce and use BPF_F_EGRESS instead of BPF_F_INGRESS (Paul,
  Jiayuan).
    Overall opinion was that BPF_F_EGRESS was clearer, but it was
    acknowledged that this creates some inconsistencies with
    bpf_redirect where 0 means egress implicitly.
* Invert `skb->dev = dev;` and `dev_sw_netstats_rx_add` to make the
  diff cleaner.

Jordan Rife (2):
  bpf: Support BPF_F_EGRESS with bpf_redirect_peer
  selftests/bpf: Add tests for bpf_redirect_peer with BPF_F_EGRESS

 include/uapi/linux/bpf.h                      | 19 +++---
 net/core/filter.c                             | 12 ++--
 tools/include/uapi/linux/bpf.h                | 19 +++---
 .../selftests/bpf/prog_tests/tc_redirect.c    | 68 +++++++++++++++++++
 .../selftests/bpf/progs/test_tc_peer.c        | 22 ++++++
 5 files changed, 119 insertions(+), 21 deletions(-)

-- 
2.43.0


^ permalink raw reply

* [PATCH v2 bpf-next 1/2] bpf: Support BPF_F_EGRESS with bpf_redirect_peer
From: Jordan Rife @ 2026-06-18 18:20 UTC (permalink / raw)
  To: bpf
  Cc: Jordan Rife, netdev, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Martin KaFai Lau, Stanislav Fomichev,
	Jiayuan Chen, Paul Chaignon
In-Reply-To: <20260618182035.43811-1-jordan@jrife.io>

We have several use cases where a pod injects traffic into the datapath
of another so that the traffic appears to have originated from that
pod. One such use case is a synthetic flow generator which injects
synthetic traffic into a pod's datapath to enable dynamic probing and
debugging. Another is a transparent proxy where connections originating
from one pod are redirected towards another which proxies that
connection. The new connection is bound to the IP of the original pod
using IP_TRANSPARENT and its traffic is injected into that pod's
datapath and handled as if it had originated there. This can be used for
mTLS, etc.

We use bpf_redirect(BPF_F_INGRESS) to direct traffic leaving the proxy,
flow generator, etc. towards the target pod, ensuring that eBPF programs
that are meant to intercept traffic leaving that pod are executed.
However, this doesn't work with netkit.

With netkit, an ingress redirection from proxy to workload skips eBPF
programs that are meant to intercept traffic leaving the pod, since they
reside on the netkit peer device. One workaround is to attach the
same program to both the netkit peer device and the TCX ingress hook for
the netkit pair's primary interface, but

a) This seems hacky and we need to be careful not to run the same
   program twice for the same skb in cases where we want to pass that
   traffic to the host stack.
b) We're trying to keep the proxy redirection / traffic injection
   systems as modular and separate as possible from Cilium, the system
   that manages netkit setup and core eBPF programming.

It would be handy if instead we could redirect traffic directly from
one netkit peer device to another. This patch proposes an extension
to bpf_redirect_peer to allow us to do just that.

With this patch, the BPF_F_EGRESS flag tells bpf_redirect_peer to emit
the skb in the egress direction of the target interface's peer device.
While the main use case is netkit, I suppose you could also use this
mode with veth if, e.g., there were some eBPF programs attached
to that side of the veth pair that needed to intercept traffic.

 +---------------------------------------------------------------------+
 | +-------------------------+         6. bpf_redirect_neigh(eth0)     |
 | | pod (10.244.0.10)       |           ------------------------      |
 | |                         |          |                        |     |
 | |              +--------+ |          |      +---------+       |     |
 | | 1. packet -->|        | |          |      |         |       |     |
 | |    leaves ^  | netkit |<===========|======| netkit  |       |     |
 | |           |  | peer   |=======(eBPF)=====>| primary |       |     |
 | |           |  |        | |          |      |         |       |     |
 | |           |  +--------+ |          |      +---------+       |     |
 | |           |             |          | 2. bpf_redirect        v     |
 | +-----------|-------------+          |___________________   +-------|
 |             |                                            |  | eth0  |
 |             | 5. bpf_redirect_peer(BPF_F_EGRESS)         |  +-------|
 |             |________________________                    |          |
 | +-------------------------+          |                   |          |
 | | proxy (10.244.0.11)     |          |                   |          |
 | | IP_TRANSPARENT          |          |                   |          |
 | |              +--------+ |          |      +---------+  |          |
 | | 3. packet <--|        | |          |      |         |<--          |
 | |    enters    | netkit |<===========|======| netkit  |             |
 | |    [proxy]   | peer   |=======(eBPF)=====>| primary |             |
 | | 4. packet -->|        | |                 |         |             |
 | |    leaves    +--------+ |                 +---------+             |
 | |    sip=10.244.0.10      |                                         |
 | +-------------------------+                                         |
 +---------------------------------------------------------------------+

Using the proxy use case as an example, in step 5 we would redirect
traffic leaving the proxy towards the pod's peer device using
bpf_redirect_peer(BPF_F_EGRESS).

As a bonus, since the skb doesn't have to go through the backlog queue
it can take full advantage of netkit's performance benefits. I set up a
test where outgoing iperf3 traffic is injected into the datapath of
another pod using either bpf_redirect_peer(BPF_F_EGRESS) or
bpf_redirect(BPF_F_INGRESS). I used Cilium's eBPF host routing mode
which skips the host stack and uses BPF redirect helpers to do all the
routing.

  (net.ipv4.tcp_congestion_control=cubic,mtu=1500,100GiB link,Cilium
   eBPF host routing mode)

BASELINE [bpf_redirect(BPF_F_INGRESS)]
  1. [iperf pod] ==bpf_redirect([pod b], BPF_F_INGRESS)==> [pod b]
  2. [pod b]     ==bpf_redirect_neigh([eth0])==>           eth0
  3. eth0        ==over network==>                         [host b]

  [ ID] Interval           Transfer     Bitrate         Retr
  [  5]   0.00-60.00  sec   231 GBytes  33.0 Gbits/sec  12060     sender
  [  5]   0.00-60.00  sec   230 GBytes  33.0 Gbits/sec            receiver

TEST [bpf_redirect_peer(BPF_F_EGRESS)]
  1. [iperf pod] ==bpf_redirect_peer([pod b], BPF_F_EGRESS)==> [pod b]
  2. [pod b]     ==bpf_redirect_neigh([eth0])==>               eth0
  3. eth0        ==over network==>                             [host b]

  [ ID] Interval           Transfer     Bitrate         Retr
  [  5]   0.00-60.00  sec   272 GBytes  38.9 Gbits/sec    0       sender
  [  5]   0.00-60.00  sec   272 GBytes  38.9 Gbits/sec            receiver

In this test, using bpf_redirect_peer(BPF_F_EGRESS) for the hop from
[iperf pod] to [pod b] led to ~18% more throughput compared to
bpf_redirect(BPF_F_INGRESS).

Signed-off-by: Jordan Rife <jordan@jrife.io>
---
 include/uapi/linux/bpf.h       | 19 +++++++++++--------
 net/core/filter.c              | 12 +++++++-----
 tools/include/uapi/linux/bpf.h | 19 +++++++++++--------
 3 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 89b36de5fdbb..c91b5a4bda03 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5079,17 +5079,19 @@ union bpf_attr {
  * 	Description
  * 		Redirect the packet to another net device of index *ifindex*.
  * 		This helper is somewhat similar to **bpf_redirect**\ (), except
- * 		that the redirection happens to the *ifindex*' peer device and
- * 		the netns switch takes place from ingress to ingress without
- * 		going through the CPU's backlog queue.
+ * 		that the redirection happens to the *ifindex*' peer device. If
+ * 		*flags* is 0, the netns switch takes place from ingress to
+ * 		ingress without going through the CPU's backlog queue. If the
+ * 		**BPF_F_EGRESS** flag is provided then redirection happens in
+ * 		the egress direction of the peer device.
  *
  * 		*skb*\ **->mark** and *skb*\ **->tstamp** are not cleared during
  * 		the netns switch.
  *
- * 		The *flags* argument is reserved and must be 0. The helper is
- * 		currently only supported for tc BPF program types at the
- * 		ingress hook and for veth and netkit target device types. The
- * 		peer device must reside in a different network namespace.
+ * 		If the *flags* argument is 0, the helper is currently only
+ * 		supported for tc BPF program types at the ingress hook and for
+ * 		veth and netkit target device types. The peer device must reside
+ * 		in a different network namespace.
  * 	Return
  * 		The helper returns **TC_ACT_REDIRECT** on success or
  * 		**TC_ACT_SHOT** on error.
@@ -6336,9 +6338,10 @@ enum {
 /* Flags for bpf_redirect and bpf_redirect_map helpers */
 enum {
 	BPF_F_INGRESS		= (1ULL << 0), /* used for skb path */
+	BPF_F_EGRESS		= (1ULL << 1), /* used for skb path */
 	BPF_F_BROADCAST		= (1ULL << 3), /* used for XDP path */
 	BPF_F_EXCLUDE_INGRESS	= (1ULL << 4), /* used for XDP path */
-#define BPF_F_REDIRECT_FLAGS (BPF_F_INGRESS | BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS)
+#define BPF_F_REDIRECT_FLAGS (BPF_F_INGRESS | BPF_F_EGRESS | BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS)
 };
 
 #define __bpf_md_ptr(type, name)	\
diff --git a/net/core/filter.c b/net/core/filter.c
index 2e96b4b847ce..ce2ef5d8ae44 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2529,16 +2529,18 @@ int skb_do_redirect(struct sk_buff *skb)
 	if (unlikely(!dev))
 		goto out_drop;
 	if (flags & BPF_F_PEER) {
-		if (unlikely(!skb_at_tc_ingress(skb)))
-			goto out_drop;
 		dev = skb_get_peer_dev(dev);
 		if (unlikely(!dev ||
 			     !(dev->flags & IFF_UP) ||
 			     net_eq(net, dev_net(dev))))
 			goto out_drop;
+		skb_scrub_packet(skb, false);
+		if (flags & BPF_F_EGRESS)
+			return __bpf_redirect(skb, dev, 0);
+		if (unlikely(!skb_at_tc_ingress(skb)))
+			goto out_drop;
 		skb->dev = dev;
 		dev_sw_netstats_rx_add(dev, skb->len);
-		skb_scrub_packet(skb, false);
 		return -EAGAIN;
 	}
 	return flags & BPF_F_NEIGH ?
@@ -2575,10 +2577,10 @@ BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
 {
 	struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
 
-	if (unlikely(flags))
+	if (unlikely(flags & ~BPF_F_EGRESS))
 		return TC_ACT_SHOT;
 
-	ri->flags = BPF_F_PEER;
+	ri->flags = BPF_F_PEER | flags;
 	ri->tgt_index = ifindex;
 
 	return TC_ACT_REDIRECT;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 89b36de5fdbb..c91b5a4bda03 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5079,17 +5079,19 @@ union bpf_attr {
  * 	Description
  * 		Redirect the packet to another net device of index *ifindex*.
  * 		This helper is somewhat similar to **bpf_redirect**\ (), except
- * 		that the redirection happens to the *ifindex*' peer device and
- * 		the netns switch takes place from ingress to ingress without
- * 		going through the CPU's backlog queue.
+ * 		that the redirection happens to the *ifindex*' peer device. If
+ * 		*flags* is 0, the netns switch takes place from ingress to
+ * 		ingress without going through the CPU's backlog queue. If the
+ * 		**BPF_F_EGRESS** flag is provided then redirection happens in
+ * 		the egress direction of the peer device.
  *
  * 		*skb*\ **->mark** and *skb*\ **->tstamp** are not cleared during
  * 		the netns switch.
  *
- * 		The *flags* argument is reserved and must be 0. The helper is
- * 		currently only supported for tc BPF program types at the
- * 		ingress hook and for veth and netkit target device types. The
- * 		peer device must reside in a different network namespace.
+ * 		If the *flags* argument is 0, the helper is currently only
+ * 		supported for tc BPF program types at the ingress hook and for
+ * 		veth and netkit target device types. The peer device must reside
+ * 		in a different network namespace.
  * 	Return
  * 		The helper returns **TC_ACT_REDIRECT** on success or
  * 		**TC_ACT_SHOT** on error.
@@ -6336,9 +6338,10 @@ enum {
 /* Flags for bpf_redirect and bpf_redirect_map helpers */
 enum {
 	BPF_F_INGRESS		= (1ULL << 0), /* used for skb path */
+	BPF_F_EGRESS		= (1ULL << 1), /* used for skb path */
 	BPF_F_BROADCAST		= (1ULL << 3), /* used for XDP path */
 	BPF_F_EXCLUDE_INGRESS	= (1ULL << 4), /* used for XDP path */
-#define BPF_F_REDIRECT_FLAGS (BPF_F_INGRESS | BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS)
+#define BPF_F_REDIRECT_FLAGS (BPF_F_INGRESS | BPF_F_EGRESS | BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS)
 };
 
 #define __bpf_md_ptr(type, name)	\
-- 
2.43.0


^ permalink raw reply related

* [PATCH v2 bpf-next 2/2] selftests/bpf: Add tests for bpf_redirect_peer with BPF_F_EGRESS
From: Jordan Rife @ 2026-06-18 18:20 UTC (permalink / raw)
  To: bpf
  Cc: Jordan Rife, netdev, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Martin KaFai Lau, Stanislav Fomichev,
	Jiayuan Chen, Paul Chaignon
In-Reply-To: <20260618182035.43811-1-jordan@jrife.io>

Extend redirect tests to cover bpf_redirect_peer(BPF_F_EGRESS). SRC
redirects to DST using bpf_redirect_peer(BPF_F_EGRESS) then traffic is
hairpinned into DST using bpf_redirect.

Signed-off-by: Jordan Rife <jordan@jrife.io>
---
 .../selftests/bpf/prog_tests/tc_redirect.c    | 68 +++++++++++++++++++
 .../selftests/bpf/progs/test_tc_peer.c        | 22 ++++++
 2 files changed, 90 insertions(+)

diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
index 64fbda082309..af8968b89ad7 100644
--- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
+++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c
@@ -192,6 +192,8 @@ static int create_netkit(int mode, char *prim, char *peer)
 	req.n.nlmsg_len += sizeof(struct ifinfomsg);
 	addattr_l(&req.n, sizeof(req), IFLA_IFNAME, peer, strlen(peer));
 	addattr_nest_end(&req.n, peer_info);
+	addattr32(&req.n, sizeof(req), IFLA_NETKIT_SCRUB,
+		  NETKIT_SCRUB_NONE);
 	addattr_nest_end(&req.n, data);
 	addattr_nest_end(&req.n, linkinfo);
 
@@ -405,6 +407,24 @@ static int netns_load_bpf(const struct bpf_program *src_prog,
 	return -1;
 }
 
+static struct bpf_link *netns_attach_nk(const char *ns, int ifindex,
+					struct bpf_program *prog)
+{
+	LIBBPF_OPTS(bpf_netkit_opts, optl);
+	struct nstoken *nstoken = NULL;
+	struct bpf_link *link = NULL;
+
+	nstoken = open_netns(ns);
+	if (!ASSERT_OK_PTR(nstoken, "setns"))
+		goto cleanup;
+
+	link = bpf_program__attach_netkit(prog, ifindex, &optl);
+cleanup:
+	if (nstoken)
+		close_netns(nstoken);
+	return link;
+}
+
 static void test_tcp(int family, const char *addr, __u16 port)
 {
 	int listen_fd = -1, accept_fd = -1, client_fd = -1;
@@ -1082,6 +1102,53 @@ static void test_tc_redirect_peer(struct netns_setup_result *setup_result)
 	close_netns(nstoken);
 }
 
+static void test_tc_redirect_peer_ing(struct netns_setup_result *setup_result)
+{
+	struct test_tc_peer *skel;
+	struct nstoken *nstoken;
+	int err;
+
+	nstoken = open_netns(NS_FWD);
+	if (!ASSERT_OK_PTR(nstoken, "setns fwd"))
+		return;
+
+	skel = test_tc_peer__open();
+	if (!ASSERT_OK_PTR(skel, "test_tc_peer__open"))
+		goto done;
+
+	skel->rodata->IFINDEX_SRC = setup_result->ifindex_src_fwd;
+	skel->rodata->IFINDEX_DST = setup_result->ifindex_dst_fwd;
+	ASSERT_EQ(bpf_program__set_expected_attach_type(skel->progs.tc_src_ing,
+		  BPF_NETKIT_PRIMARY), 0, "src_prog_attach_type");
+	ASSERT_EQ(bpf_program__set_expected_attach_type(skel->progs.tc_dst_ing,
+		  BPF_NETKIT_PRIMARY), 0, "dst_prog_attach_type");
+
+	err = test_tc_peer__load(skel);
+	if (!ASSERT_OK(err, "test_tc_peer__load"))
+		goto done;
+
+	skel->links.tc_src_ing = netns_attach_nk(NS_SRC,
+						 setup_result->ifindex_src,
+						 skel->progs.tc_src_ing);
+	if (!ASSERT_OK_PTR(skel->links.tc_src_ing, "attach_src"))
+		goto done;
+	skel->links.tc_dst_ing = netns_attach_nk(NS_DST,
+						 setup_result->ifindex_dst,
+						 skel->progs.tc_dst_ing);
+	if (!ASSERT_OK_PTR(skel->links.tc_dst_ing, "attach_dst"))
+		goto done;
+
+	if (!ASSERT_OK(set_forwarding(false), "disable forwarding"))
+		goto done;
+
+	test_connectivity();
+
+done:
+	if (skel)
+		test_tc_peer__destroy(skel);
+	close_netns(nstoken);
+}
+
 static int tun_open(char *name)
 {
 	struct ifreq ifr;
@@ -1280,6 +1347,7 @@ static void *test_tc_redirect_run_tests(void *arg)
 
 	RUN_TEST(tc_redirect_peer, MODE_VETH);
 	RUN_TEST(tc_redirect_peer, MODE_NETKIT);
+	RUN_TEST(tc_redirect_peer_ing, MODE_NETKIT);
 	RUN_TEST(tc_redirect_peer_l3, MODE_VETH);
 	RUN_TEST(tc_redirect_peer_l3, MODE_NETKIT);
 	RUN_TEST(tc_redirect_neigh, MODE_VETH);
diff --git a/tools/testing/selftests/bpf/progs/test_tc_peer.c b/tools/testing/selftests/bpf/progs/test_tc_peer.c
index 365eacb5dc34..cfb9ef7f467c 100644
--- a/tools/testing/selftests/bpf/progs/test_tc_peer.c
+++ b/tools/testing/selftests/bpf/progs/test_tc_peer.c
@@ -34,6 +34,28 @@ int tc_src(struct __sk_buff *skb)
 	return bpf_redirect_peer(IFINDEX_DST, 0);
 }
 
+SEC("tc")
+int tc_dst_ing(struct __sk_buff *skb)
+{
+	if (!skb->mark) {
+		skb->mark = 0x1;
+		return bpf_redirect_peer(IFINDEX_SRC, BPF_F_EGRESS);
+	}
+
+	return bpf_redirect(IFINDEX_DST, 0);
+}
+
+SEC("tc")
+int tc_src_ing(struct __sk_buff *skb)
+{
+	if (!skb->mark) {
+		skb->mark = 0x1;
+		return bpf_redirect_peer(IFINDEX_DST, BPF_F_EGRESS);
+	}
+
+	return bpf_redirect(IFINDEX_SRC, 0);
+}
+
 SEC("tc")
 int tc_dst_l3(struct __sk_buff *skb)
 {
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH 2/2] selftests/bpf: Add test for bpf_sock_read_xattr() kfunc
From: John Fastabend @ 2026-06-18 18:24 UTC (permalink / raw)
  To: Christian Brauner
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Alexei Starovoitov, Daniel Borkmann, Alexander Viro, Jan Kara,
	Simon Horman, Kuniyuki Iwashima, Willem de Bruijn, linux-fsdevel,
	netdev, bpf, Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman,
	Kumar Kartikeya Dwivedi, Song Liu, Yonghong Song, Jiri Olsa
In-Reply-To: <20260617-work-bpf-sock-xattr-v1-2-a1276f7c9da3@kernel.org>

On Wed, Jun 17, 2026 at 01:18:28PM +0200, Christian Brauner wrote:
>Add a selftest that loads the kfunc in sleepable and non-sleepable
>lsm/socket_connect programs and checks that a value set via fsetxattr()
>on a socket is read back.
>
>Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
>---

Reviewed-by: John Fastabend <john.fastabend@gmail.com>

^ permalink raw reply

* RE: [PATCH net] net: mana: Sync page pool RX frags for CPU
From: Haiyang Zhang @ 2026-06-18 18:38 UTC (permalink / raw)
  To: Dexuan Cui, KY Srinivasan, wei.liu@kernel.org, Dexuan Cui,
	Long Li, andrew+netdev@lunn.ch, davem@davemloft.net,
	edumazet@google.com, kuba@kernel.org, pabeni@redhat.com,
	Konstantin Taranov, horms@kernel.org, ernis@linux.microsoft.com,
	dipayanroy@linux.microsoft.com, kees@kernel.org,
	jacob.e.keller@intel.com, ssengar@linux.microsoft.com,
	linux-hyperv@vger.kernel.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-rdma@vger.kernel.org
  Cc: stable@vger.kernel.org
In-Reply-To: <20260618035029.249361-1-decui@microsoft.com>



> -----Original Message-----
> From: Dexuan Cui <decui@microsoft.com>
> Sent: Wednesday, June 17, 2026 11:50 PM
> To: KY Srinivasan <kys@microsoft.com>; Haiyang Zhang
> <haiyangz@microsoft.com>; wei.liu@kernel.org; Dexuan Cui
> <DECUI@microsoft.com>; Long Li <longli@microsoft.com>;
> andrew+netdev@lunn.ch; davem@davemloft.net; edumazet@google.com;
> kuba@kernel.org; pabeni@redhat.com; Konstantin Taranov
> <kotaranov@microsoft.com>; horms@kernel.org; ernis@linux.microsoft.com;
> dipayanroy@linux.microsoft.com; kees@kernel.org; jacob.e.keller@intel.com;
> ssengar@linux.microsoft.com; linux-hyperv@vger.kernel.org;
> netdev@vger.kernel.org; linux-kernel@vger.kernel.org; linux-
> rdma@vger.kernel.org
> Cc: stable@vger.kernel.org
> Subject: [PATCH net] net: mana: Sync page pool RX frags for CPU
> 
> MANA allocates RX buffers from page pool fragments when frag_count is
> greater than 1. In that case the buffers remain DMA mapped by page pool
> and the RX completion path does not call dma_unmap_single(). As a result,
> the implicit sync-for-CPU normally performed by dma_unmap_single() is
> missing before the packet data is passed to the networking stack.
> 
> This breaks RX on configurations which require explicit DMA syncing, for
> example when booted with swiotlb=force.
> 
> Fix this by recording the page pool page and DMA sync offset when the RX
> buffer is allocated, and syncing the received packet range for CPU access
> before handing the RX buffer to the stack.
> 
> Also validate the packet length reported in the RX CQE before using it as
> a DMA sync length or passing it to skb processing. The CQE is supplied
> by the device and should not be blindly trusted by Confidential VMs.
> 
> Fixes: 730ff06d3f5c ("net: mana: Use page pool fragments for RX buffers
> instead of full pages to improve memory efficiency.")
> Cc: stable@vger.kernel.org
> Signed-off-by: Dexuan Cui <decui@microsoft.com>
> ---
>  drivers/net/ethernet/microsoft/mana/mana_en.c | 61 +++++++++++++++----
>  include/net/mana/mana.h                       |  8 +++
>  2 files changed, 57 insertions(+), 12 deletions(-)

Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>



^ permalink raw reply

* Re: [PATCH bpf] bpf, sockmap: fix BUG_ON in skb_to_sgvec() on a resized ingress skb
From: John Fastabend @ 2026-06-18 19:00 UTC (permalink / raw)
  To: Sechang Lim
  Cc: Jakub Sitnicki, David S . Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Liu Jian, Daniel Borkmann, Cong Wang,
	netdev, bpf, linux-kernel
In-Reply-To: <20260613082442.3252576-1-rhkrqnwk98@gmail.com>

On Sat, Jun 13, 2026 at 08:24:31AM +0000, Sechang Lim wrote:
>sk_psock_skb_ingress_enqueue() maps a received message into a scatterlist
>with skb_to_sgvec(skb, sg, off, len). On the SK_SKB strparser path off and
>len come from the message's strp_msg (stm->offset and stm->full_len), set
>by the stream parser. strparser does not trim the skb, so normally
>skb->len - off >= full_len and len is within the skb.
>
>An SK_SKB verdict (or parser) program may call bpf_skb_change_tail() and
>shrink the skb after full_len was recorded. len then covers more bytes than
>the skb holds, __skb_to_sgvec() walks past the data and trips BUG_ON(len):

FWIW this only happens if the strparser program is also attached. If
there is no strparser program, stm->offset = 0 and stm->full_len will be
whatever the verdict program set. So there we would get:

   len = skb->len; // then if it shrinks to skb->len - X its ok.
   off = 0;


>
>  kernel BUG at net/core/skbuff.c:5286!
>  RIP: 0010:__skb_to_sgvec+0x78c/0x790
>  Call Trace:
>   <IRQ>
>   skb_to_sgvec+0x32/0x90
>   sk_psock_skb_ingress_enqueue+0x42/0x370
>   sk_psock_skb_ingress_self+0x1a8/0x200
>   sk_psock_verdict_apply+0x33c/0x360
>   sk_psock_strp_read+0x24a/0x370
>   __strp_recv+0x66d/0xda0
>   __tcp_read_sock+0x13d/0x590
>   tcp_bpf_strp_read_sock+0x195/0x320
>   strp_data_ready+0x267/0x340
>   sk_psock_strp_data_ready+0x1ce/0x350
>   tcp_data_queue+0x1364/0x2fd0
>   </IRQ>
>
>Clamp len to skb->len - off, and drop the message if off is already past
>the skb. sk_psock_skb_ingress_enqueue() is the only skb_to_sgvec() caller
>and both ingress paths (verdict SK_PASS and the backlog worker) reach it.
>The clamp is a no-op unless the skb was shrunk.
>
>Fixes: 7303524e04af ("skmsg: Lose offset info in sk_psock_skb_ingress")
>Signed-off-by: Sechang Lim <rhkrqnwk98@gmail.com>
>---
> net/core/skmsg.c | 4 ++++
> 1 file changed, 4 insertions(+)
>
>diff --git a/net/core/skmsg.c b/net/core/skmsg.c
>index e1850caf1a71..2961178ebd1e 100644
>--- a/net/core/skmsg.c
>+++ b/net/core/skmsg.c
>@@ -550,6 +550,10 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
> {
> 	int num_sge, copied;
>
>+	if (off >= skb->len)
>+		return -EINVAL;
>+	len = min_t(u32, len, skb->len - off);
>+

This blocks the BUG but will break the socket. We should fix it
at the cause. Something like this (untested), although I've never
used the strparser program in any of our cases.

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 2521b643fa05..95347f9d140c 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -542,6 +542,20 @@ static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
         return alloc_sk_msg(GFP_KERNEL);
  }

+static bool sk_psock_skb_strp_range(struct sk_buff *skb, u32 *off, u32 *len)
+{
+       struct strp_msg *stm = strp_msg(skb);
+
+       *off = stm->offset;
+       if (unlikely(*off >= skb->len)) {
+               *len = 0;
+               return false;
+       }
+
+       *len = min_t(u32, stm->full_len, skb->len - *off);
+       return *len;
+}
+
  static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
                                         u32 off, u32 len,
                                         struct sk_psock *psock,
@@ -696,12 +710,8 @@ static void sk_psock_backlog(struct work_struct *work)
         while ((skb = skb_peek(&psock->ingress_skb))) {
                 len = skb->len;
                 off = 0;
-               if (skb_bpf_strparser(skb)) {
-                       struct strp_msg *stm = strp_msg(skb);
-
-                       off = stm->offset;
-                       len = stm->full_len;
-               }
+               if (skb_bpf_strparser(skb))
+                       sk_psock_skb_strp_range(skb, &off, &len);

                 /* Resume processing from previous partial state */
                 if (unlikely(state->len)) {
@@ -709,6 +719,9 @@ static void sk_psock_backlog(struct work_struct *work)
                         off = state->off;
                 }

+               if (unlikely(!len))
+                       goto out_free_skb;
+
                 ingress = skb_bpf_ingress(skb);
                 skb_bpf_redirect_clear(skb);
                 do {
@@ -737,7 +750,8 @@ static void sk_psock_backlog(struct work_struct *work)
                         len -= ret;
                 } while (len);

-               /* The entire skb sent, clear state */
+out_free_skb:
+               /* The skb has been handled, clear state. */
                 sk_psock_skb_state(psock, state, 0, 0);
                 skb = skb_dequeue(&psock->ingress_skb);
                 kfree_skb(skb);
@@ -1020,10 +1034,10 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
                         len = skb->len;
                         off = 0;
                         if (skb_bpf_strparser(skb)) {
-                               struct strp_msg *stm = strp_msg(skb);
-
-                               off = stm->offset;
-                               len = stm->full_len;
+                               if (unlikely(!sk_psock_skb_strp_range(skb, &off, &len))) {
+                                       err = 0;
+                                       goto out_free;
+                               }
                         }
                         err = sk_psock_skb_ingress_self(psock, skb, off, len, false);
		}

^ permalink raw reply related

* Re: lan7801 looses VLAN Filter Table
From: Nicolai Buchwitz @ 2026-06-18 19:00 UTC (permalink / raw)
  To: Sven Schuchmann; +Cc: netdev
In-Reply-To: <BEZP281MB224501E38B30BFDC4BD3D364D9E32@BEZP281MB2245.DEUP281.PROD.OUTLOOK.COM>

Hi Sven

On 18.6.2026 17:18, Sven Schuchmann wrote:
> Hi,
> I have a problem with a lan7801 chip in Kernel 6.18. I configure 
> VLAN-ID (2) and an IP address.
> But if I disconnect and connect the network-cable several times at some 
> point no packets are
> received anymore. Without using VLAN this does not happen.
> 
> I tracked this down that sometimes the VLAN Filter table seems
> to get cleared. I hooked into the lan78xx.c driver to dump the vlan 
> table:
> 
> static void lan78xx_get_stats(struct net_device *netdev,
> 			      struct ethtool_stats *stats, u64 *data)
> {
> 	struct lan78xx_net *dev = netdev_priv(netdev);
> 	struct lan78xx_priv *pdata = (struct lan78xx_priv *)(dev->data[0]);
> 
> 	lan78xx_update_stats(dev);
> 
> 	for (int i = 0; i < 3; i++) {
> 		u32 buf;
> 		lan78xx_dataport_read(dev, DP_SEL_RSEL_VLAN_DA_, i, 1, &buf);
> 		if (pdata->vlan_table[i] != buf)
> 			netdev_err(dev->net, "VLAN TABLE %d: 0x%08x 0x%08x", i, 
> pdata->vlan_table[i], buf);
> 		else
> 			netdev_info(dev->net, "VLAN TABLE %d: 0x%08x 0x%08x", i, 
> pdata->vlan_table[i], buf);
> 	}
> 
> So I can "read out" the table if I do "ethtool -S" and see it in the 
> kernel log.
> Normally the output looks like this:
> VLAN TABLE 0: 0x00000005 0x00000005
> So the table looks as expected. The Local Filter table from pdata is 
> the same as in the chip itself.
> 
> But after some cable disconnects and connects I see this:
> VLAN TABLE 0: 0x00000005 0x00000000
> So the table got cleared or deleted and no packets on VLAN-ID 2 go
> through.
> I even can do this after I read out the table in lan78xx_get_stats():
> 
> 	lan78xx_dataport_write(dev, DP_SEL_RSEL_VLAN_DA_, 0,
> 				DP_SEL_VHF_VLAN_LEN, pdata->vlan_table);
> 
> ...and with this I can "fix" the table again from the ethtool and it 
> starts working again.
> 
> Has someone seen something like this or can point me to a direction 
> where
> I could reinit this table (I already tried at the end of 
> lan78xx_mac_link_up() without success...)

I was able to reproduce your issue on my hardware. AFAIU the vlan table 
is not restored after USB suspend.
I will send a patch shortly. Would be great if you can test it too.

> 
> Thanks!
> 
> Regards, Sven

Thanks
Nicolai

^ permalink raw reply

* [PATCH net] net: usb: lan78xx: restore VLAN filter table after device reset
From: Nicolai Buchwitz @ 2026-06-18 19:11 UTC (permalink / raw)
  To: Thangaraj Samynathan, Rengarajan Sundararajan, UNGLinuxDriver,
	Woojung.Huh
  Cc: Andrew Lunn, David S . Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Sven Schuchmann, netdev, linux-usb, linux-kernel,
	Nicolai Buchwitz

Configured VLANs stop receiving traffic after a USB autosuspend/resume
cycle, e.g. when a cable is unplugged long enough for the device to
suspend and then plugged back in. VLAN filtering stays enabled but all
VLAN-tagged frames are dropped until a VLAN is added or removed again.

The reset on resume clears the hardware VLAN filter table, but unlike
the multicast and address filters it is never reprogrammed from the
driver's shadow copy, so it stays empty.

Restore the VLAN filter table as part of the reset sequence.

Reported-by: Sven Schuchmann <schuchmann@schleissheimer.de>
Closes: https://lore.kernel.org/netdev/BEZP281MB224501E38B30BFDC4BD3D364D9E32@BEZP281MB2245.DEUP281.PROD.OUTLOOK.COM/T/#u
Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver")
Signed-off-by: Nicolai Buchwitz <nb@tipi-net.de>
---
 drivers/net/usb/lan78xx.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index bcf293ea1bd3..52c76de64eb9 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -3065,14 +3065,20 @@ static int lan78xx_set_features(struct net_device *netdev,
 	return lan78xx_write_reg(dev, RFE_CTL, pdata->rfe_ctl);
 }
 
+static int lan78xx_write_vlan_table(struct lan78xx_net *dev)
+{
+	struct lan78xx_priv *pdata = (struct lan78xx_priv *)(dev->data[0]);
+
+	return lan78xx_dataport_write(dev, DP_SEL_RSEL_VLAN_DA_, 0,
+				      DP_SEL_VHF_VLAN_LEN, pdata->vlan_table);
+}
+
 static void lan78xx_deferred_vlan_write(struct work_struct *param)
 {
 	struct lan78xx_priv *pdata =
 			container_of(param, struct lan78xx_priv, set_vlan);
-	struct lan78xx_net *dev = pdata->dev;
 
-	lan78xx_dataport_write(dev, DP_SEL_RSEL_VLAN_DA_, 0,
-			       DP_SEL_VHF_VLAN_LEN, pdata->vlan_table);
+	lan78xx_write_vlan_table(pdata->dev);
 }
 
 static int lan78xx_vlan_rx_add_vid(struct net_device *netdev,
@@ -3353,6 +3359,15 @@ static int lan78xx_reset(struct lan78xx_net *dev)
 
 	lan78xx_set_multicast(dev->net);
 
+	/* The chip reset above also clears the VLAN filter table held in the
+	 * shared VLAN/DA hash RAM. The network stack does not re-add VLANs
+	 * after a silent device reset (e.g. on reset_resume after USB
+	 * autosuspend), so restore the table from our shadow copy here.
+	 */
+	ret = lan78xx_write_vlan_table(dev);
+	if (ret < 0)
+		return ret;
+
 	/* reset PHY */
 	ret = lan78xx_read_reg(dev, PMT_CTL, &buf);
 	if (ret < 0)

base-commit: 7d8297e26b4e20b5d1c3c3fe51fe81a1c7fbc823
-- 
2.53.0


^ permalink raw reply related

* [PATCH iwl-net] idpf: fix max_vport related crash on allocation error during init
From: Emil Tantilov @ 2026-06-18 19:23 UTC (permalink / raw)
  To: intel-wired-lan
  Cc: netdev, anthony.l.nguyen, przemyslaw.kitszel, andrew+netdev,
	davem, edumazet, kuba, pabeni, madhu.chittim

Set adapter->max_vports only after successful allocation of the vports,
netdevs and vport_config buffers. This fixes possible crashes on reset or
rmmod following a failed allocation on init:

[  305.981402] idpf 0000:83:00.0: enabling device (0100 -> 0102)
[  305.994464] idpf 0000:83:00.0: Device HW Reset initiated
[  320.416872] BUG: kernel NULL pointer dereference, address: 0000000000000000
[  320.416918] #PF: supervisor read access in kernel mode
[  320.416942] #PF: error_code(0x0000) - not-present page
[  320.416963] PGD 2099657067 P4D 0
[  320.416983] Oops: Oops: 0000 [#1] SMP NOPTI
...
[  320.417093] RIP: 0010:idpf_remove+0x118/0x200 [idpf]
[  320.417130] Code: 8b bb 98 09 00 00 e8 17 0f 5b e5 48 8b bb e8 08 00 00 e8 0b 0f 5b e5 66 83 bb 28 06 00 00 00 48 8b bb 20 06 00 00 74 49 31 ed <48> 8b 04 ef 48 85 c0 74 2f 48 8b 78 20 e8 66 58 91 e5 48 8b 83 20
[  320.417183] RSP: 0018:ff7322212903fdb8 EFLAGS: 00010246
[  320.417205] RAX: 0000000000000000 RBX: ff4463de40300000 RCX: ff7322212903fd4c
[  320.417228] RDX: 0000000000000001 RSI: ffffffffa7f7d100 RDI: 0000000000000000
[  320.417250] RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000
[  320.417272] R10: 0000000000000001 R11: ff4463de3a638f58 R12: ff4463be89ac7000
[  320.417294] R13: ff4463be89ac7198 R14: ff4463be94fc7198 R15: ffffffffc0f10f20
[  320.417317] FS:  00007f963c0e6740(0000) GS:ff4463fdd65d8000(0000) knlGS:0000000000000000
[  320.417342] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  320.417362] CR2: 0000000000000000 CR3: 00000020ba674002 CR4: 0000000000773ef0
[  320.417385] PKRU: 55555554
[  320.417398] Call Trace:
[  320.417412]  <TASK>
[  320.417429]  pci_device_remove+0x42/0xb0
[  320.417459]  device_release_driver_internal+0x1a9/0x210
[  320.417492]  driver_detach+0x4b/0x90
[  320.417516]  bus_remove_driver+0x70/0x100
[  320.417539]  pci_unregister_driver+0x2e/0xb0
[  320.417564]  __do_sys_delete_module.constprop.0+0x190/0x2f0
[  320.417592]  ? kmem_cache_free+0x31e/0x550
[  320.417619]  ? lockdep_hardirqs_on_prepare+0xde/0x190
[  320.417644]  ? do_syscall_64+0x38/0x6b0
[  320.417665]  do_syscall_64+0xc8/0x6b0
[  320.417683]  ? clear_bhb_loop+0x30/0x80
[  320.417706]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  320.417727] RIP: 0033:0x7f963bb30beb

Fixes: 0fe45467a104 ("idpf: add create vport and netdev configuration")
Reviewed-by: Madhu Chittim <madhu.chittim@intel.com>
Signed-off-by: Emil Tantilov <emil.s.tantilov@intel.com>
---
 drivers/net/ethernet/intel/idpf/idpf_virtchnl.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
index be66f9b2e101..dc5ad784f456 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
@@ -3555,7 +3555,6 @@ int idpf_vc_core_init(struct idpf_adapter *adapter)
 
 	pci_sriov_set_totalvfs(adapter->pdev, idpf_get_max_vfs(adapter));
 	num_max_vports = idpf_get_max_vports(adapter);
-	adapter->max_vports = num_max_vports;
 	adapter->vports = kzalloc_objs(*adapter->vports, num_max_vports);
 	if (!adapter->vports)
 		return -ENOMEM;
@@ -3576,6 +3575,12 @@ int idpf_vc_core_init(struct idpf_adapter *adapter)
 		goto err_netdev_alloc;
 	}
 
+	/* Set max_vports only after vports, netdevs and vport_config buffers
+	 * are allocated to make sure max_vport bound loops don't end up
+	 * crashing, following allocation errors on init.
+	 */
+	adapter->max_vports = num_max_vports;
+
 	/* Start the mailbox task before requesting vectors. This will ensure
 	 * vector information response from mailbox is handled
 	 */
-- 
2.37.3


^ permalink raw reply related

* Re: [PATCH net] net: dsa: realtek: fix memory leak in rtl8366rb_setup_led()
From: Luiz Angelo Daros de Luca @ 2026-06-18 20:12 UTC (permalink / raw)
  To: David Yang
  Cc: netdev, Linus Walleij, Alvin Šipraga, Andrew Lunn,
	Vladimir Oltean, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, linux-kernel
In-Reply-To: <20260618140200.1888707-1-mmyangfl@gmail.com>

Thanks David,


> led_classdev_register_ext() only reads init_data.devicename - it never
> stores the pointer. However, the caller allocated devicename with
> kasprintf() but never freed it, leaking the string memory.
>
> Fix it with a stack buffer to avoid dynamic buffers completely.
>
> Fixes: 32d617005475 ("net: dsa: realtek: add LED drivers for rtl8366rb")
> Signed-off-by: David Yang <mmyangfl@gmail.com>
> ---
>  drivers/net/dsa/realtek/rtl8366rb-leds.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/net/dsa/realtek/rtl8366rb-leds.c b/drivers/net/dsa/realtek/rtl8366rb-leds.c
> index 509ffd3f8db5..ba50d311cb15 100644
> --- a/drivers/net/dsa/realtek/rtl8366rb-leds.c
> +++ b/drivers/net/dsa/realtek/rtl8366rb-leds.c
> @@ -89,6 +89,7 @@ static int rtl8366rb_setup_led(struct realtek_priv *priv, struct dsa_port *dp,
>         struct led_init_data init_data = { };
>         enum led_default_state state;
>         struct rtl8366rb_led *led;
> +       char name[64];
>         u32 led_group;
>         int ret;
>
> @@ -129,10 +130,9 @@ static int rtl8366rb_setup_led(struct realtek_priv *priv, struct dsa_port *dp,
>         init_data.fwnode = led_fwnode;
>         init_data.devname_mandatory = true;
>
> -       init_data.devicename = kasprintf(GFP_KERNEL, "Realtek-%d:0%d:%d",
> -                                        dp->ds->index, dp->index, led_group);

Indeed, it will leak. init_data is local and init_data.devicename is
only read by led_compose_name(), not stored. However, the stack is a
limited space for allocations.
Alternatively, you can fix the leak by using devm_kasprintf() (my
choice) or by adding a kfree() before leaving the function.

> -       if (!init_data.devicename)
> -               return -ENOMEM;
> +       snprintf(name, sizeof(name), "Realtek-%d:0%d:%d",
> +                dp->ds->index, dp->index, led_group);
> +       init_data.devicename = name;
>
>         ret = devm_led_classdev_register_ext(priv->dev, &led->cdev, &init_data);
>         if (ret) {
> --
> 2.53.0
>

^ permalink raw reply

* Re: [Intel-wired-lan] [PATCH net] igb: only strip Rx timestamp header on the first buffer of a frame
From: Jacob Keller @ 2026-06-18 20:25 UTC (permalink / raw)
  To: Tony Nguyen, Kurt Kanzenbach, Tjerk Kusters,
	netdev@vger.kernel.org
  Cc: intel-wired-lan@lists.osuosl.org, przemyslaw.kitszel@intel.com,
	andrew+netdev@lunn.ch, davem@davemloft.net, edumazet@google.com,
	kuba@kernel.org, pabeni@redhat.com, richardcochran@gmail.com,
	hawk@kernel.org, stable@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <55ab9b13-ee51-4ac6-af7b-b3feb159eb51@intel.com>

On 6/18/2026 10:38 AM, Tony Nguyen wrote:
> On 6/15/2026 12:43 AM, Kurt Kanzenbach wrote:
>> On Fri Jun 12 2026, Tjerk Kusters wrote:
>>> Fixes: 5379260852b0 ("igb: Fix XDP with PTP enabled")
>>> Cc: stable@vger.kernel.org
>>> Signed-off-by: T Kusters <tkusters@aweta.nl>
> 
> Sign off should be your full name.
> 
Ideally it should also match whatever you use in the From header of your email.

^ permalink raw reply

* [RFC net-next 0/4] net: dsa: motorcomm: Add LED support
From: David Yang @ 2026-06-18 20:26 UTC (permalink / raw)
  To: netdev
  Cc: David Yang, Andrew Lunn, Vladimir Oltean, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, linux-kernel

Sent as an RFC because net-next is currently closed.

David Yang (4):
  net: dsa: motorcomm: Move to subdirectory
  net: dsa: motorcomm: Split SMI module
  net: dsa: motorcomm: Dynamically allocate port structures
  net: dsa: motorcomm: Add LED support

 MAINTAINERS                                   |   2 +-
 drivers/net/dsa/Kconfig                       |  10 +-
 drivers/net/dsa/Makefile                      |   2 +-
 drivers/net/dsa/motorcomm/Kconfig             |  17 +
 drivers/net/dsa/motorcomm/Makefile            |   5 +
 .../net/dsa/{yt921x.c => motorcomm/chip.c}    | 311 +++-------
 .../net/dsa/{yt921x.h => motorcomm/chip.h}    |  21 +-
 drivers/net/dsa/motorcomm/leds.c              | 530 ++++++++++++++++++
 drivers/net/dsa/motorcomm/leds.h              | 104 ++++
 drivers/net/dsa/motorcomm/smi.c               | 155 +++++
 drivers/net/dsa/motorcomm/smi.h               |  88 +++
 11 files changed, 1003 insertions(+), 242 deletions(-)
 create mode 100644 drivers/net/dsa/motorcomm/Kconfig
 create mode 100644 drivers/net/dsa/motorcomm/Makefile
 rename drivers/net/dsa/{yt921x.c => motorcomm/chip.c} (95%)
 rename drivers/net/dsa/{yt921x.h => motorcomm/chip.h} (99%)
 create mode 100644 drivers/net/dsa/motorcomm/leds.c
 create mode 100644 drivers/net/dsa/motorcomm/leds.h
 create mode 100644 drivers/net/dsa/motorcomm/smi.c
 create mode 100644 drivers/net/dsa/motorcomm/smi.h

-- 
2.53.0


^ permalink raw reply


This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.