* [PATCH iwl-next 2/2] idpf: implement pci error handlers
From: Emil Tantilov @ 2026-04-11 0:39 UTC (permalink / raw)
To: intel-wired-lan
Cc: netdev, przemyslaw.kitszel, jay.bhat, ivan.d.barrera,
aleksandr.loktionov, larysa.zaremba, anthony.l.nguyen,
andrew+netdev, davem, edumazet, kuba, pabeni, aleksander.lobakin,
linux-pci, madhu.chittim, decot, willemb, sheenamo
In-Reply-To: <20260411003959.30959-1-emil.s.tantilov@intel.com>
Add callbacks to handle PCI errors and FLR reset. When preparing to handle
reset on the bus, the driver must stop all operations that can lead to MMIO
access in order to prevent HW errors. To accomplish this, introduce the
helper idpf_reset_prepare(), which gets called prior to FLR or when a PCI
error is detected. Upon resume, the recovery is done through the existing
reset path by starting the event task.
The following callbacks are implemented:
.reset_prepare runs the first portion of the generic reset path leading up
to the part where we wait for the reset to complete.
.reset_done/resume runs the recovery part of the reset handling.
.error_detected is the callback dealing with PCI errors. Similar to the
prepare call, it stops all operations prior to attempting a recovery.
.slot_reset is the callback attempting to restore the device, provided a
PCI reset was initiated by the AER driver.
Whereas previously the init logic guaranteed netdevs during reset, the
addition of idpf_detach_and_close() to the PCI callbacks flow makes it
possible for the function to be called without netdevs. Add a check to
avoid a NULL pointer dereference in that case.
Co-developed-by: Alan Brady <alan.brady@intel.com>
Signed-off-by: Alan Brady <alan.brady@intel.com>
Signed-off-by: Emil Tantilov <emil.s.tantilov@intel.com>
Reviewed-by: Jay Bhat <jay.bhat@intel.com>
Reviewed-by: Madhu Chittim <madhu.chittim@intel.com>
---
drivers/net/ethernet/intel/idpf/idpf.h | 3 +
drivers/net/ethernet/intel/idpf/idpf_lib.c | 13 ++-
drivers/net/ethernet/intel/idpf/idpf_main.c | 114 ++++++++++++++++++++
3 files changed, 128 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/intel/idpf/idpf.h b/drivers/net/ethernet/intel/idpf/idpf.h
index 1d0e32e47e87..164d2f3e233a 100644
--- a/drivers/net/ethernet/intel/idpf/idpf.h
+++ b/drivers/net/ethernet/intel/idpf/idpf.h
@@ -88,6 +88,7 @@ enum idpf_state {
* @IDPF_REMOVE_IN_PROG: Driver remove in progress
* @IDPF_MB_INTR_MODE: Mailbox in interrupt mode
* @IDPF_VC_CORE_INIT: virtchnl core has been init
+ * @IDPF_PCI_CB_RESET: Reset via the PCI callbacks
* @IDPF_FLAGS_NBITS: Must be last
*/
enum idpf_flags {
@@ -97,6 +98,7 @@ enum idpf_flags {
IDPF_REMOVE_IN_PROG,
IDPF_MB_INTR_MODE,
IDPF_VC_CORE_INIT,
+ IDPF_PCI_CB_RESET,
IDPF_FLAGS_NBITS,
};
@@ -1012,4 +1014,5 @@ void idpf_idc_vdev_mtu_event(struct iidc_rdma_vport_dev_info *vdev_info,
int idpf_add_del_fsteer_filters(struct idpf_adapter *adapter,
struct virtchnl2_flow_rule_add_del *rule,
enum virtchnl2_op opcode);
+void idpf_detach_and_close(struct idpf_adapter *adapter);
#endif /* !_IDPF_H_ */
diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c
index 7988836fbae0..1e706beb0098 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_lib.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c
@@ -758,13 +758,16 @@ static int idpf_init_mac_addr(struct idpf_vport *vport,
return 0;
}
-static void idpf_detach_and_close(struct idpf_adapter *adapter)
+void idpf_detach_and_close(struct idpf_adapter *adapter)
{
int max_vports = adapter->max_vports;
for (int i = 0; i < max_vports; i++) {
struct net_device *netdev = adapter->netdevs[i];
+ if (!netdev)
+ continue;
+
/* If the interface is in detached state, that means the
* previous reset was not handled successfully for this
* vport.
@@ -1908,6 +1911,10 @@ static void idpf_init_hard_reset(struct idpf_adapter *adapter)
dev_info(dev, "Device HW Reset initiated\n");
+ /* Reset has already happened, skip to recovery. */
+ if (test_and_clear_bit(IDPF_PCI_CB_RESET, adapter->flags))
+ goto check_rst_complete;
+
/* Prepare for reset */
if (test_bit(IDPF_HR_DRV_LOAD, adapter->flags)) {
reg_ops->trigger_reset(adapter, IDPF_HR_DRV_LOAD);
@@ -1925,6 +1932,7 @@ static void idpf_init_hard_reset(struct idpf_adapter *adapter)
goto unlock_mutex;
}
+check_rst_complete:
/* Wait for reset to complete */
err = idpf_check_reset_complete(adapter, &adapter->reset_reg);
if (err) {
@@ -1984,7 +1992,8 @@ void idpf_vc_event_task(struct work_struct *work)
if (test_bit(IDPF_HR_FUNC_RESET, adapter->flags))
goto func_reset;
- if (test_bit(IDPF_HR_DRV_LOAD, adapter->flags))
+ if (test_bit(IDPF_HR_DRV_LOAD, adapter->flags) ||
+ test_bit(IDPF_PCI_CB_RESET, adapter->flags))
goto drv_load;
return;
diff --git a/drivers/net/ethernet/intel/idpf/idpf_main.c b/drivers/net/ethernet/intel/idpf/idpf_main.c
index d99f759c55e1..cd467695047e 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_main.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_main.c
@@ -234,6 +234,7 @@ static int idpf_cfg_device(struct idpf_adapter *adapter)
if (err)
pci_dbg(pdev, "PCIe PTM is not supported by PCIe bus/controller\n");
+ pci_save_state(pdev);
pci_set_drvdata(pdev, adapter);
return 0;
@@ -360,6 +361,118 @@ static int idpf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
return err;
}
+static void idpf_reset_prepare(struct idpf_adapter *adapter)
+{
+ pci_dbg(adapter->pdev, "resetting\n");
+ set_bit(IDPF_HR_RESET_IN_PROG, adapter->flags);
+ cancel_delayed_work_sync(&adapter->serv_task);
+ cancel_delayed_work_sync(&adapter->vc_event_task);
+ idpf_detach_and_close(adapter);
+ idpf_idc_issue_reset_event(adapter->cdev_info);
+ idpf_vc_core_deinit(adapter);
+}
+
+/**
+ * idpf_pci_err_detected - PCI error detected, about to attempt recovery
+ * @pdev: PCI device struct
+ * @err: err detected
+ *
+ * Return: %PCI_ERS_RESULT_NEED_RESET to attempt recovery,
+ * %PCI_ERS_RESULT_DISCONNECT if recovery is not possible.
+ */
+static pci_ers_result_t
+idpf_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t err)
+{
+ struct idpf_adapter *adapter = pci_get_drvdata(pdev);
+
+ /* Shutdown the mailbox if PCI I/O is in a bad state to avoid MBX
+ * timeouts during the prepare stage.
+ */
+ if (pci_channel_offline(pdev))
+ libie_ctlq_xn_shutdown(adapter->xnm);
+
+ idpf_reset_prepare(adapter);
+
+ if (err == pci_channel_io_perm_failure)
+ return PCI_ERS_RESULT_DISCONNECT;
+
+ /* When called due to PCI error, driver will have to force PFR on
+ * resume, in order to complete the recovery via the event task.
+ */
+ set_bit(IDPF_PCI_CB_RESET, adapter->flags);
+
+ return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/**
+ * idpf_pci_err_slot_reset - PCI undergoing reset
+ * @pdev: PCI device struct
+ *
+ * Reset PCI state and use a register read to see if we're good.
+ *
+ * Return: %PCI_ERS_RESULT_RECOVERED on success,
+ * %PCI_ERS_RESULT_DISCONNECT on failure.
+ */
+static pci_ers_result_t
+idpf_pci_err_slot_reset(struct pci_dev *pdev)
+{
+ struct idpf_adapter *adapter = pci_get_drvdata(pdev);
+
+ pci_restore_state(pdev);
+ pci_set_master(pdev);
+ pci_wake_from_d3(pdev, false);
+ if (readl(adapter->reset_reg.rstat) != 0xFFFFFFFF) {
+ pci_save_state(pdev);
+ return PCI_ERS_RESULT_RECOVERED;
+ }
+
+ return PCI_ERS_RESULT_DISCONNECT;
+}
+
+/**
+ * idpf_pci_err_resume - Resume operations after PCI error recovery
+ * @pdev: PCI device struct
+ */
+static void idpf_pci_err_resume(struct pci_dev *pdev)
+{
+ struct idpf_adapter *adapter = pci_get_drvdata(pdev);
+
+ /* Force a PFR when resuming from PCI error. */
+ if (test_and_set_bit(IDPF_PCI_CB_RESET, adapter->flags))
+ adapter->dev_ops.reg_ops.trigger_reset(adapter, IDPF_HR_FUNC_RESET);
+
+ queue_delayed_work(adapter->vc_event_wq,
+ &adapter->vc_event_task,
+ msecs_to_jiffies(300));
+}
+
+/**
+ * idpf_pci_err_reset_prepare - Prepare driver for PCI reset
+ * @pdev: PCI device struct
+ */
+static void idpf_pci_err_reset_prepare(struct pci_dev *pdev)
+{
+ idpf_reset_prepare(pci_get_drvdata(pdev));
+}
+
+/**
+ * idpf_pci_err_reset_done - PCI err reset recovery complete
+ * @pdev: PCI device struct
+ */
+static void idpf_pci_err_reset_done(struct pci_dev *pdev)
+{
+ pci_dbg(pdev, "reset: done\n");
+ idpf_pci_err_resume(pdev);
+}
+
+static const struct pci_error_handlers idpf_pci_err_handler = {
+ .error_detected = idpf_pci_err_detected,
+ .slot_reset = idpf_pci_err_slot_reset,
+ .reset_prepare = idpf_pci_err_reset_prepare,
+ .reset_done = idpf_pci_err_reset_done,
+ .resume = idpf_pci_err_resume,
+};
+
/* idpf_pci_tbl - PCI Dev idpf ID Table
*/
static const struct pci_device_id idpf_pci_tbl[] = {
@@ -377,5 +490,6 @@ static struct pci_driver idpf_driver = {
.sriov_configure = idpf_sriov_configure,
.remove = idpf_remove,
.shutdown = idpf_shutdown,
+ .err_handler = &idpf_pci_err_handler,
};
module_pci_driver(idpf_driver);
--
2.37.3
^ permalink raw reply related
* [PATCH iwl-next 1/2] idpf: remove conditional MBX deinit from idpf_vc_core_deinit()
From: Emil Tantilov @ 2026-04-11 0:39 UTC (permalink / raw)
To: intel-wired-lan
Cc: netdev, przemyslaw.kitszel, jay.bhat, ivan.d.barrera,
aleksandr.loktionov, larysa.zaremba, anthony.l.nguyen,
andrew+netdev, davem, edumazet, kuba, pabeni, aleksander.lobakin,
linux-pci, madhu.chittim, decot, willemb, sheenamo
In-Reply-To: <20260411003959.30959-1-emil.s.tantilov@intel.com>
Previously it was assumed that idpf_vc_core_deinit() is always being
called during reset handling, with remove being an exception. Ideally
the driver needs to communicate the changes to FW in all instances where
the MBX is not already disabled. Remove the remove_in_prog check from
idpf_vc_core_deinit() as the MBX was already disabled while handling the
reset via libie_ctlq_xn_shutdown() by the service task. This is also
needed by the following patch, introducing PCI callbacks support.
Signed-off-by: Emil Tantilov <emil.s.tantilov@intel.com>
Reviewed-by: Jay Bhat <jay.bhat@intel.com>
Reviewed-by: Madhu Chittim <madhu.chittim@intel.com>
---
drivers/net/ethernet/intel/idpf/idpf_virtchnl.c | 11 +----------
1 file changed, 1 insertion(+), 10 deletions(-)
diff --git a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
index 129c8f6b0faa..fceaf3ec1cd4 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
@@ -3229,24 +3229,15 @@ int idpf_vc_core_init(struct idpf_adapter *adapter)
*/
void idpf_vc_core_deinit(struct idpf_adapter *adapter)
{
- bool remove_in_prog;
-
if (!test_bit(IDPF_VC_CORE_INIT, adapter->flags))
return;
- /* Avoid transaction timeouts when called during reset */
- remove_in_prog = test_bit(IDPF_REMOVE_IN_PROG, adapter->flags);
- if (!remove_in_prog)
- idpf_deinit_dflt_mbx(adapter);
-
idpf_ptp_release(adapter);
idpf_deinit_task(adapter);
idpf_idc_deinit_core_aux_device(adapter);
idpf_rel_rx_pt_lkup(adapter);
idpf_intr_rel(adapter);
-
- if (remove_in_prog)
- idpf_deinit_dflt_mbx(adapter);
+ idpf_deinit_dflt_mbx(adapter);
cancel_delayed_work_sync(&adapter->serv_task);
--
2.37.3
^ permalink raw reply related
* [PATCH iwl-next 0/2] Introduce IDPF PCI callbacks
From: Emil Tantilov @ 2026-04-11 0:39 UTC (permalink / raw)
To: intel-wired-lan
Cc: netdev, przemyslaw.kitszel, jay.bhat, ivan.d.barrera,
aleksandr.loktionov, larysa.zaremba, anthony.l.nguyen,
andrew+netdev, davem, edumazet, kuba, pabeni, aleksander.lobakin,
linux-pci, madhu.chittim, decot, willemb, sheenamo
This series implements PCI callbacks for the purpose of handling FLR and
PCI errors in the IDPF driver.
The first patch removes the conditional deinitialization of the mailbox in
the idpf_vc_core_deinit() function. Aside from being redundant, due to the
shutdown of the mailbox after a reset is detected, the check was also
preventing the driver from sending messages to stop and disable the vports
and queues on FW side, which is needed for the prepare phase of the FLR
handling.
The second patch implements the PCI callbacks. The logic here follows
the reset handling done in idpf_init_hard_reset(), but is split in
prepare and resume phases, where idpf_reset_prepare() stops all driver
operations and the resume callback attempts to recover following the
reset or the PCI error event.
Testing hints:
1. FLR via sysfs:
echo 1 > /sys/class/net/<ifname>/device/reset
Previously this would have been handled by idpf_init_hard_reset() as the
driver detects the reset. Now it will be done by the PCI err callbacks,
so this is the easiest way to test the reset_prepare/resume path.
2. PCI errors can be tested with aer-inject:
./aer-inject -s 83:00.0 examples/<error_type>
3. Stress testing can be done by combining various callbacks with the
reset from step 1:
echo 1 > /sys/class/net/<if>/device/reset& ethtool -L <if> combined 8
ethtool -L <if> combined 16& echo 1 > /sys/class/net/<if>/device/reset
Emil Tantilov (2):
idpf: remove conditional MBX deinit from idpf_vc_core_deinit()
idpf: implement pci error handlers
drivers/net/ethernet/intel/idpf/idpf.h | 3 +
drivers/net/ethernet/intel/idpf/idpf_lib.c | 13 +-
drivers/net/ethernet/intel/idpf/idpf_main.c | 114 ++++++++++++++++++
.../net/ethernet/intel/idpf/idpf_virtchnl.c | 11 +-
4 files changed, 129 insertions(+), 12 deletions(-)
--
2.37.3
^ permalink raw reply
* Re: [PATCH v3] selftests: vsock: avoid races creating Unix socket paths
From: Cao Ruichuang @ 2026-04-11 0:33 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Stefano Garzarella, Bobby Eshleman, virtualization,
linux-kselftest, netdev, linux-kernel
In-Reply-To: <20260410134733.1c10a183@kernel.org>
Hi Jakub,
Thanks for the correction.
No, I do not have a concrete failure reproducer for this one. My change
was based on the mktemp -u TOCTOU window in the selftest setup rather than
on a demonstrated flaky failure.
Given that, I agree this is not strong enough to keep reposting on the
netdev side. I will stop here unless I can come back with a real repro.
I will also follow the netdev posting rules you mentioned for future
updates.
Thanks,
Cao Ruichuang
^ permalink raw reply
* [PATCH bpf-next v1 3/3] bpf: Remove gfp_flags plumbing from bpf_local_storage_update()
From: Amery Hung @ 2026-04-11 0:17 UTC (permalink / raw)
To: bpf
Cc: netdev, alexei.starovoitov, andrii, daniel, martin.lau, memxor,
ameryhung, kernel-team
In-Reply-To: <20260411001711.3418264-1-ameryhung@gmail.com>
Remove the check that rejects sleepable BPF programs from doing
BPF_ANY/BPF_EXIST updates on local storage. This restriction was added
in commit b00fa38a9c1c ("bpf: Enable non-atomic allocations in local
storage") because kzalloc(GFP_KERNEL) could sleep inside
local_storage->lock. This is no longer a concern: all local storage
allocations now use kmalloc_nolock() which never sleeps.
In addition, since kmalloc_nolock() only accepts __GFP_ACCOUNT,
__GFP_ZERO and __GFP_NO_OBJ_EXT, the gfp_flags parameter plumbing from
bpf_*_storage_get() to bpf_local_storage_update() becomes dead code.
Remove gfp_flags from bpf_selem_alloc(), bpf_local_storage_alloc() and
bpf_local_storage_update(). Drop the hidden 5th argument from
bpf_*_storage_get helpers, and remove the verifier patching that
injected GFP_KERNEL/GFP_ATOMIC into the fifth argument.
Signed-off-by: Amery Hung <ameryhung@gmail.com>
---
include/linux/bpf_local_storage.h | 7 +++----
kernel/bpf/bpf_cgrp_storage.c | 9 ++++-----
kernel/bpf/bpf_inode_storage.c | 9 ++++-----
kernel/bpf/bpf_local_storage.c | 16 ++++++----------
kernel/bpf/bpf_task_storage.c | 9 ++++-----
kernel/bpf/verifier.c | 26 --------------------------
net/core/bpf_sk_storage.c | 17 +++++++----------
7 files changed, 28 insertions(+), 65 deletions(-)
diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index dced54e9265f..9e4f5c45c974 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -188,7 +188,7 @@ int bpf_selem_link_map(struct bpf_local_storage_map *smap,
struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
- bool swap_uptrs, gfp_t gfp_flags);
+ bool swap_uptrs);
void bpf_selem_free(struct bpf_local_storage_elem *selem,
bool reuse_now);
@@ -196,12 +196,11 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
int
bpf_local_storage_alloc(void *owner,
struct bpf_local_storage_map *smap,
- struct bpf_local_storage_elem *first_selem,
- gfp_t gfp_flags);
+ struct bpf_local_storage_elem *first_selem);
struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
- void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags);
+ void *value, u64 map_flags, bool swap_uptrs);
u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map);
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index d93ac2866748..c76e9b0fabba 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -76,7 +76,7 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
return PTR_ERR(cgroup);
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
- value, map_flags, false, GFP_ATOMIC);
+ value, map_flags, false);
cgroup_put(cgroup);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -122,9 +122,8 @@ static void cgroup_storage_map_free(struct bpf_map *map)
bpf_local_storage_map_free(map, &cgroup_cache);
}
-/* *gfp_flags* is a hidden argument provided by the verifier */
-BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
- void *, value, u64, flags, gfp_t, gfp_flags)
+BPF_CALL_4(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
+ void *, value, u64, flags)
{
struct bpf_local_storage_data *sdata;
@@ -143,7 +142,7 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
if (!percpu_ref_is_dying(&cgroup->self.refcnt) &&
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
- value, BPF_NOEXIST, false, gfp_flags);
+ value, BPF_NOEXIST, false);
out:
return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index efc8996a4c0a..0da8d923e39d 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -98,7 +98,7 @@ static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
sdata = bpf_local_storage_update(file_inode(fd_file(f)),
(struct bpf_local_storage_map *)map,
- value, map_flags, false, GFP_ATOMIC);
+ value, map_flags, false);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -122,9 +122,8 @@ static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
return inode_storage_delete(file_inode(fd_file(f)), map);
}
-/* *gfp_flags* is a hidden argument provided by the verifier */
-BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
- void *, value, u64, flags, gfp_t, gfp_flags)
+BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
+ void *, value, u64, flags)
{
struct bpf_local_storage_data *sdata;
@@ -150,7 +149,7 @@ BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
sdata = bpf_local_storage_update(
inode, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST, false, gfp_flags);
+ BPF_NOEXIST, false);
return IS_ERR(sdata) ? (unsigned long)NULL :
(unsigned long)sdata->data;
}
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index d0e6070fa68c..77bb1b76fd4b 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -68,7 +68,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
- void *value, bool swap_uptrs, gfp_t gfp_flags)
+ void *value, bool swap_uptrs)
{
struct bpf_local_storage_elem *selem;
@@ -464,8 +464,7 @@ static int check_flags(const struct bpf_local_storage_data *old_sdata,
int bpf_local_storage_alloc(void *owner,
struct bpf_local_storage_map *smap,
- struct bpf_local_storage_elem *first_selem,
- gfp_t gfp_flags)
+ struct bpf_local_storage_elem *first_selem)
{
struct bpf_local_storage *prev_storage, *storage;
struct bpf_local_storage **owner_storage_ptr;
@@ -535,7 +534,7 @@ int bpf_local_storage_alloc(void *owner,
*/
struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
- void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags)
+ void *value, u64 map_flags, bool swap_uptrs)
{
struct bpf_local_storage_data *old_sdata = NULL;
struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
@@ -552,9 +551,6 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
!btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)))
return ERR_PTR(-EINVAL);
- if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST)
- return ERR_PTR(-EINVAL);
-
local_storage = rcu_dereference_check(*owner_storage(smap, owner),
bpf_rcu_lock_held());
if (!local_storage || hlist_empty(&local_storage->list)) {
@@ -563,11 +559,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (err)
return ERR_PTR(err);
- selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
+ selem = bpf_selem_alloc(smap, owner, value, swap_uptrs);
if (!selem)
return ERR_PTR(-ENOMEM);
- err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
+ err = bpf_local_storage_alloc(owner, smap, selem);
if (err) {
bpf_selem_free(selem, true);
mem_uncharge(smap, owner, smap->elem_size);
@@ -597,7 +593,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
/* A lookup has just been done before and concluded a new selem is
* needed. The chance of an unnecessary alloc is unlikely.
*/
- alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
+ alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs);
if (!alloc_selem)
return ERR_PTR(-ENOMEM);
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 55f4f22bb212..4b342be29eac 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -118,7 +118,7 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value, map_flags,
- true, GFP_ATOMIC);
+ true);
err = PTR_ERR_OR_ZERO(sdata);
out:
@@ -165,9 +165,8 @@ static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
return err;
}
-/* *gfp_flags* is a hidden argument provided by the verifier */
-BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
- task, void *, value, u64, flags, gfp_t, gfp_flags)
+BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags)
{
struct bpf_local_storage_data *sdata;
@@ -184,7 +183,7 @@ BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) {
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST, false, gfp_flags);
+ BPF_NOEXIST, false);
return IS_ERR(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7aa06f534cb2..0d2218033fdb 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -603,14 +603,6 @@ static bool is_may_goto_insn_at(struct bpf_verifier_env *env, int insn_idx)
return is_may_goto_insn(&env->prog->insnsi[insn_idx]);
}
-static bool is_storage_get_function(enum bpf_func_id func_id)
-{
- return func_id == BPF_FUNC_sk_storage_get ||
- func_id == BPF_FUNC_inode_storage_get ||
- func_id == BPF_FUNC_task_storage_get ||
- func_id == BPF_FUNC_cgrp_storage_get;
-}
-
static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id,
const struct bpf_map *map)
{
@@ -23893,24 +23885,6 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
goto patch_call_imm;
}
- if (is_storage_get_function(insn->imm)) {
- if (env->insn_aux_data[i + delta].non_sleepable)
- insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
- else
- insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
- insn_buf[1] = *insn;
- cnt = 2;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto patch_call_imm;
- }
-
/* bpf_per_cpu_ptr() and bpf_this_cpu_ptr() */
if (env->insn_aux_data[i + delta].call_with_percpu_alloc_ptr) {
/* patch with 'r1 = *(u64 *)(r1 + 0)' since for percpu data,
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 9fb22e352beb..3aaaf21c00eb 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -106,7 +106,7 @@ static long bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
if (sock) {
sdata = bpf_local_storage_update(
sock->sk, (struct bpf_local_storage_map *)map, value,
- map_flags, false, GFP_ATOMIC);
+ map_flags, false);
sockfd_put(sock);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -227,9 +227,8 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
return ret;
}
-/* *gfp_flags* is a hidden argument provided by the verifier */
-BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
- void *, value, u64, flags, gfp_t, gfp_flags)
+BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
+ void *, value, u64, flags)
{
struct bpf_local_storage_data *sdata;
@@ -250,7 +249,7 @@ BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
refcount_inc_not_zero(&sk->sk_refcnt)) {
sdata = bpf_local_storage_update(
sk, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST, false, gfp_flags);
+ BPF_NOEXIST, false);
/* sk must be a fullsock (guaranteed by verifier),
* so sock_gen_put() is unnecessary.
*/
@@ -383,16 +382,14 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
return false;
}
-/* *gfp_flags* is a hidden argument provided by the verifier */
-BPF_CALL_5(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
- void *, value, u64, flags, gfp_t, gfp_flags)
+BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
+ void *, value, u64, flags)
{
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (in_hardirq() || in_nmi())
return (unsigned long)NULL;
- return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags,
- gfp_flags);
+ return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags);
}
BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map,
--
2.52.0
^ permalink raw reply related
* [PATCH bpf-next v1 2/3] bpf: Use kmalloc_nolock() universally in local storage
From: Amery Hung @ 2026-04-11 0:17 UTC (permalink / raw)
To: bpf
Cc: netdev, alexei.starovoitov, andrii, daniel, martin.lau, memxor,
ameryhung, kernel-team
In-Reply-To: <20260411001711.3418264-1-ameryhung@gmail.com>
Switch to kmalloc_nolock() universally in local storage. For performance
reasons, socket local storage was not moved to kmalloc_nolock() when the
BPF memory allocator was replaced by it. Now that kfree_rcu() supports
freeing memory allocated by kmalloc_nolock(), we can move the remaining
local storages to kmalloc_nolock() and clean up the cluttered free
paths.
Use kfree() instead of kfree_nolock() in bpf_selem_free_trace_rcu() and
bpf_local_storage_free_trace_rcu(). Both callbacks run in process context
where spinning is allowed, so kfree_nolock() is unnecessary.
Benchmark:
./bench -p 1 local-storage-create --storage-type socket \
--batch-size {16,32,64}
The benchmark is a microbenchmark stress-testing how fast local storage
can be created. There is no measurable throughput change for socket local
storage after switching from kzalloc() to kmalloc_nolock().
Socket local storage
batch creation speed diff
--------------- ---- ------------------ ----
Baseline 16 433.9 ± 0.6 k/s
32 434.3 ± 1.4 k/s
64 434.2 ± 0.7 k/s
After 16 439.0 ± 1.9 k/s +1.2%
32 437.3 ± 2.0 k/s +0.7%
64 435.8 ± 2.5 k/s +0.4%
It is also worth noting that the baseline recently got a 5% throughput
boost when sheaves replaced percpu partial slabs [0].
[0] https://lore.kernel.org/bpf/20260123-sheaves-for-all-v4-0-041323d506f7@suse.cz/
Signed-off-by: Amery Hung <ameryhung@gmail.com>
---
include/linux/bpf_local_storage.h | 8 +-
kernel/bpf/bpf_cgrp_storage.c | 2 +-
kernel/bpf/bpf_inode_storage.c | 2 +-
kernel/bpf/bpf_local_storage.c | 130 ++++--------------------------
kernel/bpf/bpf_task_storage.c | 2 +-
net/core/bpf_sk_storage.c | 2 +-
6 files changed, 21 insertions(+), 125 deletions(-)
diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 8157e8da61d4..dced54e9265f 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -54,7 +54,6 @@ struct bpf_local_storage_map {
u32 bucket_log;
u16 elem_size;
u16 cache_idx;
- bool use_kmalloc_nolock;
};
struct bpf_local_storage_data {
@@ -86,8 +85,7 @@ struct bpf_local_storage_elem {
*/
};
atomic_t state;
- bool use_kmalloc_nolock;
- /* 3 bytes hole */
+ /* 4 bytes hole */
/* The data is stored in another cacheline to minimize
* the number of cachelines access during a cache hit.
*/
@@ -104,7 +102,6 @@ struct bpf_local_storage {
rqspinlock_t lock; /* Protect adding/removing from the "list" */
u64 mem_charge; /* Copy of mem charged to owner. Protected by "lock" */
refcount_t owner_refcnt;/* Used to pin owner when map_free is uncharging */
- bool use_kmalloc_nolock;
};
/* U16_MAX is much more than enough for sk local storage
@@ -137,8 +134,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr);
struct bpf_map *
bpf_local_storage_map_alloc(union bpf_attr *attr,
- struct bpf_local_storage_cache *cache,
- bool use_kmalloc_nolock);
+ struct bpf_local_storage_cache *cache);
void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
struct bpf_local_storage_map *smap,
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index c2a2ead1f466..d93ac2866748 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -114,7 +114,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &cgroup_cache, true);
+ return bpf_local_storage_map_alloc(attr, &cgroup_cache);
}
static void cgroup_storage_map_free(struct bpf_map *map)
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index e86734609f3d..efc8996a4c0a 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -179,7 +179,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,
static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &inode_cache, false);
+ return bpf_local_storage_map_alloc(attr, &inode_cache);
}
static void inode_storage_map_free(struct bpf_map *map)
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 9c96a4477f81..d0e6070fa68c 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -75,18 +75,12 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
if (mem_charge(smap, owner, smap->elem_size))
return NULL;
- if (smap->use_kmalloc_nolock) {
- selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
- __GFP_ZERO, NUMA_NO_NODE);
- } else {
- selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
- gfp_flags | __GFP_NOWARN);
- }
+ selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
+ __GFP_ZERO, NUMA_NO_NODE);
if (selem) {
RCU_INIT_POINTER(SDATA(selem)->smap, smap);
atomic_set(&selem->state, 0);
- selem->use_kmalloc_nolock = smap->use_kmalloc_nolock;
if (value) {
/* No need to call check_and_init_map_value as memory is zero init */
@@ -102,8 +96,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return NULL;
}
-/* rcu tasks trace callback for use_kmalloc_nolock == false */
-static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
+static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
@@ -115,47 +108,14 @@ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
kfree(local_storage);
}
-/* Handle use_kmalloc_nolock == false */
-static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
- bool vanilla_rcu)
-{
- if (vanilla_rcu)
- kfree_rcu(local_storage, rcu);
- else
- call_rcu_tasks_trace(&local_storage->rcu,
- __bpf_local_storage_free_trace_rcu);
-}
-
-static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
-{
- struct bpf_local_storage *local_storage;
-
- local_storage = container_of(rcu, struct bpf_local_storage, rcu);
- kfree_nolock(local_storage);
-}
-
-static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
-{
- /*
- * RCU Tasks Trace grace period implies RCU grace period, do
- * kfree() directly.
- */
- bpf_local_storage_free_rcu(rcu);
-}
-
static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
bool reuse_now)
{
if (!local_storage)
return;
- if (!local_storage->use_kmalloc_nolock) {
- __bpf_local_storage_free(local_storage, reuse_now);
- return;
- }
-
if (reuse_now) {
- call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
+ kfree_rcu(local_storage, rcu);
return;
}
@@ -163,42 +123,7 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
bpf_local_storage_free_trace_rcu);
}
-/* rcu callback for use_kmalloc_nolock == false */
-static void __bpf_selem_free_rcu(struct rcu_head *rcu)
-{
- struct bpf_local_storage_elem *selem;
- struct bpf_local_storage_map *smap;
-
- selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
- /* bpf_selem_unlink_nofail may have already cleared smap and freed fields. */
- smap = rcu_dereference_check(SDATA(selem)->smap, 1);
-
- if (smap)
- bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
- kfree(selem);
-}
-
-/* rcu tasks trace callback for use_kmalloc_nolock == false */
-static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
-{
- /*
- * RCU Tasks Trace grace period implies RCU grace period, do
- * kfree() directly.
- */
- __bpf_selem_free_rcu(rcu);
-}
-
-/* Handle use_kmalloc_nolock == false */
-static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
- bool vanilla_rcu)
-{
- if (vanilla_rcu)
- call_rcu(&selem->rcu, __bpf_selem_free_rcu);
- else
- call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
-}
-
-static void bpf_selem_free_rcu(struct rcu_head *rcu)
+static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage_elem *selem;
struct bpf_local_storage_map *smap;
@@ -209,37 +134,24 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
if (smap)
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
- kfree_nolock(selem);
-}
-
-static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
-{
/*
* RCU Tasks Trace grace period implies RCU grace period, do
* kfree() directly.
*/
- bpf_selem_free_rcu(rcu);
+ kfree(selem);
}
void bpf_selem_free(struct bpf_local_storage_elem *selem,
bool reuse_now)
{
- if (!selem->use_kmalloc_nolock) {
- /*
- * No uptr will be unpin even when reuse_now == false since uptr
- * is only supported in task local storage, where
- * smap->use_kmalloc_nolock == true.
- */
- __bpf_selem_free(selem, reuse_now);
- return;
- }
+ struct bpf_local_storage_map *smap;
+
+ smap = rcu_dereference_check(SDATA(selem)->smap, 1);
if (reuse_now) {
- /*
- * While it is okay to call bpf_obj_free_fields() that unpins uptr when
- * reuse_now == true, keep it in bpf_selem_free_rcu() for simplicity.
- */
- call_rcu(&selem->rcu, bpf_selem_free_rcu);
+ if (smap)
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+ kfree_rcu(selem, rcu);
return;
}
@@ -565,12 +477,8 @@ int bpf_local_storage_alloc(void *owner,
if (err)
return err;
- if (smap->use_kmalloc_nolock)
- storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage),
- __GFP_ZERO, NUMA_NO_NODE);
- else
- storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
- gfp_flags | __GFP_NOWARN);
+ storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage),
+ __GFP_ZERO, NUMA_NO_NODE);
if (!storage) {
err = -ENOMEM;
goto uncharge;
@@ -580,7 +488,6 @@ int bpf_local_storage_alloc(void *owner,
raw_res_spin_lock_init(&storage->lock);
storage->owner = owner;
storage->mem_charge = sizeof(*storage);
- storage->use_kmalloc_nolock = smap->use_kmalloc_nolock;
refcount_set(&storage->owner_refcnt, 1);
bpf_selem_link_storage_nolock(storage, first_selem);
@@ -857,8 +764,7 @@ u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
struct bpf_map *
bpf_local_storage_map_alloc(union bpf_attr *attr,
- struct bpf_local_storage_cache *cache,
- bool use_kmalloc_nolock)
+ struct bpf_local_storage_cache *cache)
{
struct bpf_local_storage_map *smap;
unsigned int i;
@@ -890,12 +796,6 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
smap->elem_size = offsetof(struct bpf_local_storage_elem,
sdata.data[attr->value_size]);
- /* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non
- * preemptible context. Thus, enforce all storages to use
- * kmalloc_nolock() when CONFIG_PREEMPT_RT is enabled.
- */
- smap->use_kmalloc_nolock = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : use_kmalloc_nolock;
-
smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
return &smap->map;
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 605506792b5b..55f4f22bb212 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -212,7 +212,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &task_cache, true);
+ return bpf_local_storage_map_alloc(attr, &task_cache);
}
static void task_storage_map_free(struct bpf_map *map)
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index f8338acebf07..9fb22e352beb 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -68,7 +68,7 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &sk_cache, false);
+ return bpf_local_storage_map_alloc(attr, &sk_cache);
}
static int notsupp_get_next_key(struct bpf_map *map, void *key,
--
2.52.0
^ permalink raw reply related
* [PATCH bpf-next v1 1/3] selftests/bpf: Remove kmalloc tracing from local storage create bench
From: Amery Hung @ 2026-04-11 0:17 UTC (permalink / raw)
To: bpf
Cc: netdev, alexei.starovoitov, andrii, daniel, martin.lau, memxor,
ameryhung, kernel-team
In-Reply-To: <20260411001711.3418264-1-ameryhung@gmail.com>
Remove the raw_tp/kmalloc BPF program and its associated reporting from
the local storage create benchmark. The kmalloc count per create is not
a useful metric as different code paths use different allocators (e.g.
kmalloc_nolock vs kzalloc), introducing noise that makes the number
hard to interpret.
Keep total_creates in the summary output as it is useful for normalizing
perf statistics collected alongside the benchmark.
Signed-off-by: Amery Hung <ameryhung@gmail.com>
---
.../bpf/benchs/bench_local_storage_create.c | 21 ++++++-------------
.../bpf/progs/bench_local_storage_create.c | 11 ----------
2 files changed, 6 insertions(+), 26 deletions(-)
diff --git a/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c b/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c
index e2ff8ea1cb79..71e38000ee06 100644
--- a/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c
+++ b/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c
@@ -101,11 +101,6 @@ static void setup(void)
}
}
- if (!bpf_program__attach(skel->progs.kmalloc)) {
- fprintf(stderr, "Error attaching bpf program\n");
- exit(1);
- }
-
threads = calloc(env.producer_cnt, sizeof(*threads));
if (!threads) {
@@ -140,7 +135,6 @@ static void setup(void)
static void measure(struct bench_res *res)
{
res->hits = atomic_swap(&skel->bss->create_cnts, 0);
- res->drops = atomic_swap(&skel->bss->kmalloc_cnts, 0);
}
static void *sk_producer(void *input)
@@ -203,28 +197,25 @@ static void *producer(void *input)
static void report_progress(int iter, struct bench_res *res, long delta_ns)
{
- double creates_per_sec, kmallocs_per_create;
+ double creates_per_sec;
creates_per_sec = res->hits / 1000.0 / (delta_ns / 1000000000.0);
- kmallocs_per_create = (double)res->drops / res->hits;
printf("Iter %3d (%7.3lfus): ",
iter, (delta_ns - 1000000000) / 1000.0);
- printf("creates %8.3lfk/s (%7.3lfk/prod), ",
+ printf("creates %8.3lfk/s (%7.3lfk/prod)\n",
creates_per_sec, creates_per_sec / env.producer_cnt);
- printf("%3.2lf kmallocs/create\n", kmallocs_per_create);
}
static void report_final(struct bench_res res[], int res_cnt)
{
double creates_mean = 0.0, creates_stddev = 0.0;
- long total_creates = 0, total_kmallocs = 0;
+ long total_creates = 0;
int i;
for (i = 0; i < res_cnt; i++) {
creates_mean += res[i].hits / 1000.0 / (0.0 + res_cnt);
total_creates += res[i].hits;
- total_kmallocs += res[i].drops;
}
if (res_cnt > 1) {
@@ -234,9 +225,9 @@ static void report_final(struct bench_res res[], int res_cnt)
(res_cnt - 1.0);
creates_stddev = sqrt(creates_stddev);
}
- printf("Summary: creates %8.3lf \u00B1 %5.3lfk/s (%7.3lfk/prod), ",
- creates_mean, creates_stddev, creates_mean / env.producer_cnt);
- printf("%4.2lf kmallocs/create\n", (double)total_kmallocs / total_creates);
+ printf("Summary: creates %8.3lf \u00B1 %5.3lfk/s (%7.3lfk/prod), %ld total\n",
+ creates_mean, creates_stddev, creates_mean / env.producer_cnt,
+ total_creates);
if (create_owner_errs || skel->bss->create_errs)
printf("%s() errors %ld create_errs %ld\n",
storage_type == BPF_MAP_TYPE_SK_STORAGE ?
diff --git a/tools/testing/selftests/bpf/progs/bench_local_storage_create.c b/tools/testing/selftests/bpf/progs/bench_local_storage_create.c
index c8ec0d0368e4..25ca6045fea3 100644
--- a/tools/testing/selftests/bpf/progs/bench_local_storage_create.c
+++ b/tools/testing/selftests/bpf/progs/bench_local_storage_create.c
@@ -8,7 +8,6 @@
long create_errs = 0;
long create_cnts = 0;
-long kmalloc_cnts = 0;
__u32 bench_pid = 0;
struct storage {
@@ -29,16 +28,6 @@ struct {
__type(value, struct storage);
} task_storage_map SEC(".maps");
-SEC("raw_tp/kmalloc")
-int BPF_PROG(kmalloc, unsigned long call_site, const void *ptr,
- size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags,
- int node)
-{
- __sync_fetch_and_add(&kmalloc_cnts, 1);
-
- return 0;
-}
-
SEC("tp_btf/sched_process_fork")
int BPF_PROG(sched_process_fork, struct task_struct *parent, struct task_struct *child)
{
--
2.52.0
^ permalink raw reply related
* [PATCH bpf-next v1 0/3] Use kmalloc_nolock() universally in BPF local storage
From: Amery Hung @ 2026-04-11 0:17 UTC (permalink / raw)
To: bpf
Cc: netdev, alexei.starovoitov, andrii, daniel, martin.lau, memxor,
ameryhung, kernel-team
Socket local storage did not convert to use kmalloc_nolock() since there
was observable performance degradation due to kfree_nolock() hitting the
slow path and the lack of kfree_rcu()-like batched freeing. Now that
these concerns have been addressed in slub, convert all remaining local
storage flavors to use kmalloc_nolock().
Amery Hung (3):
selftests/bpf: Remove kmalloc tracing from local storage create bench
bpf: Use kmalloc_nolock() universally in local storage
bpf: Remove gfp_flags plumbing from bpf_local_storage_update()
include/linux/bpf_local_storage.h | 15 +-
kernel/bpf/bpf_cgrp_storage.c | 11 +-
kernel/bpf/bpf_inode_storage.c | 11 +-
kernel/bpf/bpf_local_storage.c | 146 +++---------------
kernel/bpf/bpf_task_storage.c | 11 +-
kernel/bpf/verifier.c | 26 ----
net/core/bpf_sk_storage.c | 19 +--
.../bpf/benchs/bench_local_storage_create.c | 21 +--
.../bpf/progs/bench_local_storage_create.c | 11 --
9 files changed, 55 insertions(+), 216 deletions(-)
--
2.52.0
^ permalink raw reply
* [PATCH net-next 2/2] net: dsa: mxl862xx: implement .get_stats64
From: Daniel Golle @ 2026-04-11 0:13 UTC (permalink / raw)
To: Daniel Golle, Andrew Lunn, Vladimir Oltean, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Russell King, netdev,
linux-kernel
Cc: Frank Wunderlich, Chad Monroe, Cezary Wilmanski, Liang Xu,
Benny (Ying-Tsan) Weng, Jose Maria Verdu Munoz, Avinash Jayaraman,
John Crispin
In-Reply-To: <cover.1775865049.git.daniel@makrotopia.org>
Poll free-running firmware RMON counters every 2 seconds and accumulate
deltas into 64-bit per-port statistics. 32-bit packet counters wrap
in ~220s at 10 Gbps line rate with minimum-size frames; the 2s polling
interval provides a comfortable margin. The .get_stats64 callback returns
the accumulated values and then schedules a fresh poll so that the next
query sees up-to-date counters.
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
---
drivers/net/dsa/mxl862xx/mxl862xx-host.c | 8 +-
drivers/net/dsa/mxl862xx/mxl862xx.c | 174 +++++++++++++++++++++++
drivers/net/dsa/mxl862xx/mxl862xx.h | 94 +++++++++++-
3 files changed, 269 insertions(+), 7 deletions(-)
diff --git a/drivers/net/dsa/mxl862xx/mxl862xx-host.c b/drivers/net/dsa/mxl862xx/mxl862xx-host.c
index cadbdb590cf43..d55f9dff6433e 100644
--- a/drivers/net/dsa/mxl862xx/mxl862xx-host.c
+++ b/drivers/net/dsa/mxl862xx/mxl862xx-host.c
@@ -48,7 +48,7 @@ static void mxl862xx_crc_err_work_fn(struct work_struct *work)
dev_close(dp->conduit);
rtnl_unlock();
- clear_bit(0, &priv->crc_err);
+ clear_bit(MXL862XX_FLAG_CRC_ERR, &priv->flags);
}
/* Firmware CRC error codes (outside normal Zephyr errno range). */
@@ -247,7 +247,7 @@ static int mxl862xx_issue_cmd(struct mxl862xx_priv *priv, u16 cmd, u16 len)
ret = mxl862xx_crc6_verify(ctrl_enc, len_enc, &fw_result);
if (ret) {
- if (!test_and_set_bit(0, &priv->crc_err))
+ if (!test_and_set_bit(MXL862XX_FLAG_CRC_ERR, &priv->flags))
schedule_work(&priv->crc_err_work);
return -EIO;
}
@@ -314,7 +314,7 @@ static int mxl862xx_send_cmd(struct mxl862xx_priv *priv, u16 cmd, u16 size,
if (ret < 0) {
if ((ret == MXL862XX_FW_CRC6_ERR ||
ret == MXL862XX_FW_CRC16_ERR) &&
- !test_and_set_bit(0, &priv->crc_err))
+ !test_and_set_bit(MXL862XX_FLAG_CRC_ERR, &priv->flags))
schedule_work(&priv->crc_err_work);
if (!quiet)
dev_err(&priv->mdiodev->dev,
@@ -458,7 +458,7 @@ int mxl862xx_api_wrap(struct mxl862xx_priv *priv, u16 cmd, void *_data,
}
if (crc16(0xffff, (const u8 *)data, size) != crc) {
- if (!test_and_set_bit(0, &priv->crc_err))
+ if (!test_and_set_bit(MXL862XX_FLAG_CRC_ERR, &priv->flags))
schedule_work(&priv->crc_err_work);
ret = -EIO;
goto out;
diff --git a/drivers/net/dsa/mxl862xx/mxl862xx.c b/drivers/net/dsa/mxl862xx/mxl862xx.c
index 8159f6a66d724..01b3ebd62231a 100644
--- a/drivers/net/dsa/mxl862xx/mxl862xx.c
+++ b/drivers/net/dsa/mxl862xx/mxl862xx.c
@@ -30,6 +30,12 @@
#define MXL862XX_API_READ_QUIET(dev, cmd, data) \
mxl862xx_api_wrap(dev, cmd, &(data), sizeof((data)), true, true)
+/* Polling interval for RMON counter accumulation. At 2.5 Gbps with
+ * minimum-size (64-byte) frames, a 32-bit packet counter wraps in ~880s.
+ * 2s gives a comfortable margin.
+ */
+#define MXL862XX_STATS_POLL_INTERVAL (2 * HZ)
+
struct mxl862xx_mib_desc {
unsigned int size;
unsigned int offset;
@@ -686,6 +692,9 @@ static int mxl862xx_setup(struct dsa_switch *ds)
if (ret)
return ret;
+ schedule_delayed_work(&priv->stats_work,
+ MXL862XX_STATS_POLL_INTERVAL);
+
return mxl862xx_setup_mdio(ds);
}
@@ -1879,6 +1888,158 @@ static void mxl862xx_get_pause_stats(struct dsa_switch *ds, int port,
pause_stats->rx_pause_frames = le32_to_cpu(cnt.rx_good_pause_pkts);
}
+/* Compute the delta between two 32-bit free-running counter snapshots,
+ * handling a single wrap-around correctly via unsigned subtraction.
+ */
+static u64 mxl862xx_delta32(u32 cur, u32 prev)
+{
+ return (u32)(cur - prev);
+}
+
+/**
+ * mxl862xx_stats_poll - Read RMON counters and accumulate into 64-bit stats
+ * @ds: DSA switch
+ * @port: port index
+ *
+ * The firmware RMON counters are free-running 32-bit values (64-bit for
+ * byte counters). This function reads the hardware via MDIO (may sleep),
+ * computes deltas from the previous snapshot, and accumulates them into
+ * 64-bit per-port stats under a spinlock.
+ *
+ * Called only from the stats polling workqueue -- serialized by the
+ * single-threaded delayed_work, so no MDIO locking is needed here.
+ */
+static void mxl862xx_stats_poll(struct dsa_switch *ds, int port)
+{
+ struct mxl862xx_priv *priv = ds->priv;
+ struct mxl862xx_port_stats *s = &priv->ports[port].stats;
+ u32 rx_fcserr, rx_under, rx_over, rx_align, tx_drop;
+ u32 rx_drop, rx_evlan, mtu_exc, tx_acm;
+ struct mxl862xx_rmon_port_cnt cnt;
+ u64 rx_bytes, tx_bytes;
+ u32 rx_mcast, tx_coll;
+ u32 rx_pkts, tx_pkts;
+
+ /* MDIO read -- may sleep, done outside the spinlock. */
+ if (mxl862xx_read_rmon(ds, port, &cnt))
+ return;
+
+ rx_pkts = le32_to_cpu(cnt.rx_good_pkts);
+ tx_pkts = le32_to_cpu(cnt.tx_good_pkts);
+ rx_bytes = le64_to_cpu(cnt.rx_good_bytes);
+ tx_bytes = le64_to_cpu(cnt.tx_good_bytes);
+ rx_fcserr = le32_to_cpu(cnt.rx_fcserror_pkts);
+ rx_under = le32_to_cpu(cnt.rx_under_size_error_pkts);
+ rx_over = le32_to_cpu(cnt.rx_oversize_error_pkts);
+ rx_align = le32_to_cpu(cnt.rx_align_error_pkts);
+ tx_drop = le32_to_cpu(cnt.tx_dropped_pkts);
+ rx_drop = le32_to_cpu(cnt.rx_dropped_pkts);
+ rx_evlan = le32_to_cpu(cnt.rx_extended_vlan_discard_pkts);
+ mtu_exc = le32_to_cpu(cnt.mtu_exceed_discard_pkts);
+ tx_acm = le32_to_cpu(cnt.tx_acm_dropped_pkts);
+ rx_mcast = le32_to_cpu(cnt.rx_multicast_pkts);
+ tx_coll = le32_to_cpu(cnt.tx_coll_count);
+
+ /* Accumulate deltas under spinlock -- .get_stats64 reads these. */
+ spin_lock_bh(&priv->ports[port].stats_lock);
+
+ s->rx_packets += mxl862xx_delta32(rx_pkts, s->prev_rx_good_pkts);
+ s->tx_packets += mxl862xx_delta32(tx_pkts, s->prev_tx_good_pkts);
+ s->rx_bytes += rx_bytes - s->prev_rx_good_bytes;
+ s->tx_bytes += tx_bytes - s->prev_tx_good_bytes;
+
+ s->rx_errors +=
+ mxl862xx_delta32(rx_fcserr, s->prev_rx_fcserror_pkts) +
+ mxl862xx_delta32(rx_under, s->prev_rx_under_size_error_pkts) +
+ mxl862xx_delta32(rx_over, s->prev_rx_oversize_error_pkts) +
+ mxl862xx_delta32(rx_align, s->prev_rx_align_error_pkts);
+ s->tx_errors +=
+ mxl862xx_delta32(tx_drop, s->prev_tx_dropped_pkts);
+
+ s->rx_dropped +=
+ mxl862xx_delta32(rx_drop, s->prev_rx_dropped_pkts) +
+ mxl862xx_delta32(rx_evlan, s->prev_rx_evlan_discard_pkts) +
+ mxl862xx_delta32(mtu_exc, s->prev_mtu_exceed_discard_pkts);
+ s->tx_dropped +=
+ mxl862xx_delta32(tx_drop, s->prev_tx_dropped_pkts) +
+ mxl862xx_delta32(tx_acm, s->prev_tx_acm_dropped_pkts);
+
+ s->multicast += mxl862xx_delta32(rx_mcast, s->prev_rx_multicast_pkts);
+ s->collisions += mxl862xx_delta32(tx_coll, s->prev_tx_coll_count);
+
+ s->rx_length_errors +=
+ mxl862xx_delta32(rx_under, s->prev_rx_under_size_error_pkts) +
+ mxl862xx_delta32(rx_over, s->prev_rx_oversize_error_pkts);
+ s->rx_crc_errors +=
+ mxl862xx_delta32(rx_fcserr, s->prev_rx_fcserror_pkts);
+ s->rx_frame_errors +=
+ mxl862xx_delta32(rx_align, s->prev_rx_align_error_pkts);
+
+ s->prev_rx_good_pkts = rx_pkts;
+ s->prev_tx_good_pkts = tx_pkts;
+ s->prev_rx_good_bytes = rx_bytes;
+ s->prev_tx_good_bytes = tx_bytes;
+ s->prev_rx_fcserror_pkts = rx_fcserr;
+ s->prev_rx_under_size_error_pkts = rx_under;
+ s->prev_rx_oversize_error_pkts = rx_over;
+ s->prev_rx_align_error_pkts = rx_align;
+ s->prev_tx_dropped_pkts = tx_drop;
+ s->prev_rx_dropped_pkts = rx_drop;
+ s->prev_rx_evlan_discard_pkts = rx_evlan;
+ s->prev_mtu_exceed_discard_pkts = mtu_exc;
+ s->prev_tx_acm_dropped_pkts = tx_acm;
+ s->prev_rx_multicast_pkts = rx_mcast;
+ s->prev_tx_coll_count = tx_coll;
+
+ spin_unlock_bh(&priv->ports[port].stats_lock);
+}
+
+static void mxl862xx_stats_work_fn(struct work_struct *work)
+{
+ struct mxl862xx_priv *priv =
+ container_of(work, struct mxl862xx_priv, stats_work.work);
+ struct dsa_switch *ds = priv->ds;
+ struct dsa_port *dp;
+
+ dsa_switch_for_each_available_port(dp, ds)
+ mxl862xx_stats_poll(ds, dp->index);
+
+ if (!test_bit(MXL862XX_FLAG_WORK_STOPPED, &priv->flags))
+ schedule_delayed_work(&priv->stats_work,
+ MXL862XX_STATS_POLL_INTERVAL);
+}
+
+static void mxl862xx_get_stats64(struct dsa_switch *ds, int port,
+ struct rtnl_link_stats64 *s)
+{
+ struct mxl862xx_priv *priv = ds->priv;
+ struct mxl862xx_port_stats *ps = &priv->ports[port].stats;
+
+ spin_lock_bh(&priv->ports[port].stats_lock);
+
+ s->rx_packets = ps->rx_packets;
+ s->tx_packets = ps->tx_packets;
+ s->rx_bytes = ps->rx_bytes;
+ s->tx_bytes = ps->tx_bytes;
+ s->rx_errors = ps->rx_errors;
+ s->tx_errors = ps->tx_errors;
+ s->rx_dropped = ps->rx_dropped;
+ s->tx_dropped = ps->tx_dropped;
+ s->multicast = ps->multicast;
+ s->collisions = ps->collisions;
+ s->rx_length_errors = ps->rx_length_errors;
+ s->rx_crc_errors = ps->rx_crc_errors;
+ s->rx_frame_errors = ps->rx_frame_errors;
+
+ spin_unlock_bh(&priv->ports[port].stats_lock);
+
+ /* Trigger a fresh poll so the next read sees up-to-date counters.
+ * No-op if the work is already pending, running, or teardown started.
+ */
+ if (!test_bit(MXL862XX_FLAG_WORK_STOPPED, &priv->flags))
+ schedule_delayed_work(&priv->stats_work, 0);
+}
+
static const struct dsa_switch_ops mxl862xx_switch_ops = {
.get_tag_protocol = mxl862xx_get_tag_protocol,
.setup = mxl862xx_setup,
@@ -1909,6 +2070,7 @@ static const struct dsa_switch_ops mxl862xx_switch_ops = {
.get_eth_mac_stats = mxl862xx_get_eth_mac_stats,
.get_eth_ctrl_stats = mxl862xx_get_eth_ctrl_stats,
.get_pause_stats = mxl862xx_get_pause_stats,
+ .get_stats64 = mxl862xx_get_stats64,
};
static void mxl862xx_phylink_mac_config(struct phylink_config *config,
@@ -1970,16 +2132,22 @@ static int mxl862xx_probe(struct mdio_device *mdiodev)
priv->ports[i].priv = priv;
INIT_WORK(&priv->ports[i].host_flood_work,
mxl862xx_host_flood_work_fn);
+ spin_lock_init(&priv->ports[i].stats_lock);
}
+ INIT_DELAYED_WORK(&priv->stats_work, mxl862xx_stats_work_fn);
+
dev_set_drvdata(dev, ds);
err = dsa_register_switch(ds);
if (err) {
+ set_bit(MXL862XX_FLAG_WORK_STOPPED, &priv->flags);
+ cancel_delayed_work_sync(&priv->stats_work);
mxl862xx_host_shutdown(priv);
for (i = 0; i < MXL862XX_MAX_PORTS; i++)
cancel_work_sync(&priv->ports[i].host_flood_work);
}
+
return err;
}
@@ -1994,6 +2162,9 @@ static void mxl862xx_remove(struct mdio_device *mdiodev)
priv = ds->priv;
+ set_bit(MXL862XX_FLAG_WORK_STOPPED, &priv->flags);
+ cancel_delayed_work_sync(&priv->stats_work);
+
dsa_unregister_switch(ds);
mxl862xx_host_shutdown(priv);
@@ -2020,6 +2191,9 @@ static void mxl862xx_shutdown(struct mdio_device *mdiodev)
dsa_switch_shutdown(ds);
+ set_bit(MXL862XX_FLAG_WORK_STOPPED, &priv->flags);
+ cancel_delayed_work_sync(&priv->stats_work);
+
mxl862xx_host_shutdown(priv);
for (i = 0; i < MXL862XX_MAX_PORTS; i++)
diff --git a/drivers/net/dsa/mxl862xx/mxl862xx.h b/drivers/net/dsa/mxl862xx/mxl862xx.h
index a010cf6b961a9..80053ab40e4ce 100644
--- a/drivers/net/dsa/mxl862xx/mxl862xx.h
+++ b/drivers/net/dsa/mxl862xx/mxl862xx.h
@@ -116,6 +116,79 @@ struct mxl862xx_evlan_block {
u16 n_active;
};
+/**
+ * struct mxl862xx_port_stats - 64-bit accumulated hardware port statistics
+ * @rx_packets: total received packets
+ * @tx_packets: total transmitted packets
+ * @rx_bytes: total received bytes
+ * @tx_bytes: total transmitted bytes
+ * @rx_errors: total receive errors
+ * @tx_errors: total transmit errors
+ * @rx_dropped: total received packets dropped
+ * @tx_dropped: total transmitted packets dropped
+ * @multicast: total received multicast packets
+ * @collisions: total transmit collisions
+ * @rx_length_errors: received length errors (undersize + oversize)
+ * @rx_crc_errors: received FCS errors
+ * @rx_frame_errors: received alignment errors
+ * @prev_rx_good_pkts: previous snapshot of rx good packet counter
+ * @prev_tx_good_pkts: previous snapshot of tx good packet counter
+ * @prev_rx_good_bytes: previous snapshot of rx good byte counter
+ * @prev_tx_good_bytes: previous snapshot of tx good byte counter
+ * @prev_rx_fcserror_pkts: previous snapshot of rx FCS error counter
+ * @prev_rx_under_size_error_pkts: previous snapshot of rx undersize
+ * error counter
+ * @prev_rx_oversize_error_pkts: previous snapshot of rx oversize
+ * error counter
+ * @prev_rx_align_error_pkts: previous snapshot of rx alignment
+ * error counter
+ * @prev_tx_dropped_pkts: previous snapshot of tx dropped counter
+ * @prev_rx_dropped_pkts: previous snapshot of rx dropped counter
+ * @prev_rx_evlan_discard_pkts: previous snapshot of extended VLAN
+ * discard counter
+ * @prev_mtu_exceed_discard_pkts: previous snapshot of MTU exceed
+ * discard counter
+ * @prev_tx_acm_dropped_pkts: previous snapshot of tx ACM dropped
+ * counter
+ * @prev_rx_multicast_pkts: previous snapshot of rx multicast counter
+ * @prev_tx_coll_count: previous snapshot of tx collision counter
+ *
+ * The firmware RMON counters are 32-bit free-running (64-bit for byte
+ * counters). This structure holds 64-bit accumulators alongside the
+ * previous raw snapshot so that deltas can be computed across polls,
+ * handling 32-bit wrap correctly via unsigned subtraction.
+ */
+struct mxl862xx_port_stats {
+ u64 rx_packets;
+ u64 tx_packets;
+ u64 rx_bytes;
+ u64 tx_bytes;
+ u64 rx_errors;
+ u64 tx_errors;
+ u64 rx_dropped;
+ u64 tx_dropped;
+ u64 multicast;
+ u64 collisions;
+ u64 rx_length_errors;
+ u64 rx_crc_errors;
+ u64 rx_frame_errors;
+ u32 prev_rx_good_pkts;
+ u32 prev_tx_good_pkts;
+ u64 prev_rx_good_bytes;
+ u64 prev_tx_good_bytes;
+ u32 prev_rx_fcserror_pkts;
+ u32 prev_rx_under_size_error_pkts;
+ u32 prev_rx_oversize_error_pkts;
+ u32 prev_rx_align_error_pkts;
+ u32 prev_tx_dropped_pkts;
+ u32 prev_rx_dropped_pkts;
+ u32 prev_rx_evlan_discard_pkts;
+ u32 prev_mtu_exceed_discard_pkts;
+ u32 prev_tx_acm_dropped_pkts;
+ u32 prev_rx_multicast_pkts;
+ u32 prev_tx_coll_count;
+};
+
/**
* struct mxl862xx_port - per-port state tracked by the driver
* @priv: back-pointer to switch private data; needed by
@@ -145,6 +218,10 @@ struct mxl862xx_evlan_block {
* The worker acquires rtnl_lock() to serialize with
* DSA callbacks and checks @setup_done to avoid
* acting on torn-down ports.
+ * @stats: 64-bit accumulated hardware statistics; updated
+ * periodically by the stats polling work
+ * @stats_lock: protects accumulator reads in .get_stats64 against
+ * concurrent updates from the polling work
*/
struct mxl862xx_port {
struct mxl862xx_priv *priv;
@@ -160,16 +237,24 @@ struct mxl862xx_port {
bool host_flood_uc;
bool host_flood_mc;
struct work_struct host_flood_work;
+ struct mxl862xx_port_stats stats;
+ spinlock_t stats_lock; /* protects stats accumulators */
};
+/* Bit indices for struct mxl862xx_priv::flags */
+#define MXL862XX_FLAG_CRC_ERR 0
+#define MXL862XX_FLAG_WORK_STOPPED 1
+
/**
* struct mxl862xx_priv - driver private data for an MxL862xx switch
* @ds: pointer to the DSA switch instance
* @mdiodev: MDIO device used to communicate with the switch firmware
* @crc_err_work: deferred work for shutting down all ports on MDIO CRC
* errors
- * @crc_err: set atomically before CRC-triggered shutdown, cleared
- * after
+ * @flags: atomic status flags; %MXL862XX_FLAG_CRC_ERR is set
+ * before CRC-triggered shutdown and cleared after;
+ * %MXL862XX_FLAG_WORK_STOPPED is set before cancelling
+ * stats_work to prevent rescheduling during teardown
* @drop_meter: index of the single shared zero-rate firmware meter
* used to unconditionally drop traffic (used to block
* flooding)
@@ -181,18 +266,21 @@ struct mxl862xx_port {
* @evlan_ingress_size: per-port ingress Extended VLAN block size
* @evlan_egress_size: per-port egress Extended VLAN block size
* @vf_block_size: per-port VLAN Filter block size
+ * @stats_work: periodic work item that polls RMON hardware counters
+ * and accumulates them into 64-bit per-port stats
*/
struct mxl862xx_priv {
struct dsa_switch *ds;
struct mdio_device *mdiodev;
struct work_struct crc_err_work;
- unsigned long crc_err;
+ unsigned long flags;
u16 drop_meter;
struct mxl862xx_port ports[MXL862XX_MAX_PORTS];
u16 bridges[MXL862XX_MAX_BRIDGES + 1];
u16 evlan_ingress_size;
u16 evlan_egress_size;
u16 vf_block_size;
+ struct delayed_work stats_work;
};
#endif /* __MXL862XX_H */
--
2.53.0
^ permalink raw reply related
* [PATCH net-next 1/2] net: dsa: mxl862xx: add ethtool statistics support
From: Daniel Golle @ 2026-04-11 0:13 UTC (permalink / raw)
To: Daniel Golle, Andrew Lunn, Vladimir Oltean, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Russell King, netdev,
linux-kernel
Cc: Frank Wunderlich, Chad Monroe, Cezary Wilmanski, Liang Xu,
Benny (Ying-Tsan) Weng, Jose Maria Verdu Munoz, Avinash Jayaraman,
John Crispin
In-Reply-To: <cover.1775865049.git.daniel@makrotopia.org>
The MxL862xx firmware exposes per-port RMON counters through the
RMON_PORT_GET command, covering standard IEEE 802.3 MAC statistics
(unicast/multicast/broadcast packet and byte counts, collision
counters, pause frames) as well as hardware-specific counters such
as extended VLAN discard and MTU exceed events.
Add the RMON counter firmware API structures and command definitions.
Implement .get_strings, .get_sset_count, and .get_ethtool_stats for
legacy ethtool -S support. Implement .get_eth_mac_stats,
.get_eth_ctrl_stats, and .get_pause_stats for the standardized
IEEE 802.3 statistics interface.
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
---
drivers/net/dsa/mxl862xx/mxl862xx-api.h | 142 ++++++++++++++++++++++
drivers/net/dsa/mxl862xx/mxl862xx-cmd.h | 3 +
drivers/net/dsa/mxl862xx/mxl862xx.c | 151 ++++++++++++++++++++++++
3 files changed, 296 insertions(+)
diff --git a/drivers/net/dsa/mxl862xx/mxl862xx-api.h b/drivers/net/dsa/mxl862xx/mxl862xx-api.h
index c902e90397e5f..fb21ddc1bf1c0 100644
--- a/drivers/net/dsa/mxl862xx/mxl862xx-api.h
+++ b/drivers/net/dsa/mxl862xx/mxl862xx-api.h
@@ -1224,4 +1224,146 @@ struct mxl862xx_sys_fw_image_version {
__le32 iv_build_num;
} __packed;
+/**
+ * enum mxl862xx_port_type - Port Type
+ * @MXL862XX_LOGICAL_PORT: Logical Port
+ * @MXL862XX_PHYSICAL_PORT: Physical Port
+ * @MXL862XX_CTP_PORT: Connectivity Termination Port (CTP)
+ * @MXL862XX_BRIDGE_PORT: Bridge Port
+ */
+enum mxl862xx_port_type {
+ MXL862XX_LOGICAL_PORT = 0,
+ MXL862XX_PHYSICAL_PORT,
+ MXL862XX_CTP_PORT,
+ MXL862XX_BRIDGE_PORT,
+};
+
+/**
+ * enum mxl862xx_rmon_port_type - RMON counter table type
+ * @MXL862XX_RMON_CTP_PORT_RX: CTP RX counters
+ * @MXL862XX_RMON_CTP_PORT_TX: CTP TX counters
+ * @MXL862XX_RMON_BRIDGE_PORT_RX: Bridge port RX counters
+ * @MXL862XX_RMON_BRIDGE_PORT_TX: Bridge port TX counters
+ * @MXL862XX_RMON_CTP_PORT_PCE_BYPASS: CTP PCE bypass counters
+ * @MXL862XX_RMON_TFLOW_RX: TFLOW RX counters
+ * @MXL862XX_RMON_TFLOW_TX: TFLOW TX counters
+ * @MXL862XX_RMON_QMAP: QMAP counters
+ * @MXL862XX_RMON_METER: Meter counters
+ * @MXL862XX_RMON_PMAC: PMAC counters
+ */
+enum mxl862xx_rmon_port_type {
+ MXL862XX_RMON_CTP_PORT_RX = 0,
+ MXL862XX_RMON_CTP_PORT_TX,
+ MXL862XX_RMON_BRIDGE_PORT_RX,
+ MXL862XX_RMON_BRIDGE_PORT_TX,
+ MXL862XX_RMON_CTP_PORT_PCE_BYPASS,
+ MXL862XX_RMON_TFLOW_RX,
+ MXL862XX_RMON_TFLOW_TX,
+ MXL862XX_RMON_QMAP = 0x0e,
+ MXL862XX_RMON_METER = 0x19,
+ MXL862XX_RMON_PMAC = 0x1c,
+};
+
+/**
+ * struct mxl862xx_rmon_port_cnt - RMON counters for a port
+ * @port_type: Port type for counter retrieval (see &enum mxl862xx_port_type)
+ * @port_id: Ethernet port number (zero-based)
+ * @sub_if_id_group: Sub-interface ID group
+ * @pce_bypass: Separate CTP Tx counters when PCE is bypassed
+ * @rx_extended_vlan_discard_pkts: Discarded at extended VLAN operation
+ * @mtu_exceed_discard_pkts: Discarded due to MTU exceeded
+ * @tx_under_size_good_pkts: Tx undersize (<64) packet count
+ * @tx_oversize_good_pkts: Tx oversize (>1518) packet count
+ * @rx_good_pkts: Received good packet count
+ * @rx_unicast_pkts: Received unicast packet count
+ * @rx_broadcast_pkts: Received broadcast packet count
+ * @rx_multicast_pkts: Received multicast packet count
+ * @rx_fcserror_pkts: Received FCS error packet count
+ * @rx_under_size_good_pkts: Received undersize good packet count
+ * @rx_oversize_good_pkts: Received oversize good packet count
+ * @rx_under_size_error_pkts: Received undersize error packet count
+ * @rx_good_pause_pkts: Received good pause packet count
+ * @rx_oversize_error_pkts: Received oversize error packet count
+ * @rx_align_error_pkts: Received alignment error packet count
+ * @rx_filtered_pkts: Filtered packet count
+ * @rx64byte_pkts: Received 64-byte packet count
+ * @rx127byte_pkts: Received 65-127 byte packet count
+ * @rx255byte_pkts: Received 128-255 byte packet count
+ * @rx511byte_pkts: Received 256-511 byte packet count
+ * @rx1023byte_pkts: Received 512-1023 byte packet count
+ * @rx_max_byte_pkts: Received 1024-max byte packet count
+ * @tx_good_pkts: Transmitted good packet count
+ * @tx_unicast_pkts: Transmitted unicast packet count
+ * @tx_broadcast_pkts: Transmitted broadcast packet count
+ * @tx_multicast_pkts: Transmitted multicast packet count
+ * @tx_single_coll_count: Transmit single collision count
+ * @tx_mult_coll_count: Transmit multiple collision count
+ * @tx_late_coll_count: Transmit late collision count
+ * @tx_excess_coll_count: Transmit excessive collision count
+ * @tx_coll_count: Transmit collision count
+ * @tx_pause_count: Transmit pause packet count
+ * @tx64byte_pkts: Transmitted 64-byte packet count
+ * @tx127byte_pkts: Transmitted 65-127 byte packet count
+ * @tx255byte_pkts: Transmitted 128-255 byte packet count
+ * @tx511byte_pkts: Transmitted 256-511 byte packet count
+ * @tx1023byte_pkts: Transmitted 512-1023 byte packet count
+ * @tx_max_byte_pkts: Transmitted 1024-max byte packet count
+ * @tx_dropped_pkts: Transmit dropped packet count
+ * @tx_acm_dropped_pkts: Transmit ACM dropped packet count
+ * @rx_dropped_pkts: Received dropped packet count
+ * @rx_good_bytes: Received good byte count (64-bit)
+ * @rx_bad_bytes: Received bad byte count (64-bit)
+ * @tx_good_bytes: Transmitted good byte count (64-bit)
+ */
+struct mxl862xx_rmon_port_cnt {
+ __le32 port_type; /* enum mxl862xx_port_type */
+ __le16 port_id;
+ __le16 sub_if_id_group;
+ u8 pce_bypass;
+ __le32 rx_extended_vlan_discard_pkts;
+ __le32 mtu_exceed_discard_pkts;
+ __le32 tx_under_size_good_pkts;
+ __le32 tx_oversize_good_pkts;
+ __le32 rx_good_pkts;
+ __le32 rx_unicast_pkts;
+ __le32 rx_broadcast_pkts;
+ __le32 rx_multicast_pkts;
+ __le32 rx_fcserror_pkts;
+ __le32 rx_under_size_good_pkts;
+ __le32 rx_oversize_good_pkts;
+ __le32 rx_under_size_error_pkts;
+ __le32 rx_good_pause_pkts;
+ __le32 rx_oversize_error_pkts;
+ __le32 rx_align_error_pkts;
+ __le32 rx_filtered_pkts;
+ __le32 rx64byte_pkts;
+ __le32 rx127byte_pkts;
+ __le32 rx255byte_pkts;
+ __le32 rx511byte_pkts;
+ __le32 rx1023byte_pkts;
+ __le32 rx_max_byte_pkts;
+ __le32 tx_good_pkts;
+ __le32 tx_unicast_pkts;
+ __le32 tx_broadcast_pkts;
+ __le32 tx_multicast_pkts;
+ __le32 tx_single_coll_count;
+ __le32 tx_mult_coll_count;
+ __le32 tx_late_coll_count;
+ __le32 tx_excess_coll_count;
+ __le32 tx_coll_count;
+ __le32 tx_pause_count;
+ __le32 tx64byte_pkts;
+ __le32 tx127byte_pkts;
+ __le32 tx255byte_pkts;
+ __le32 tx511byte_pkts;
+ __le32 tx1023byte_pkts;
+ __le32 tx_max_byte_pkts;
+ __le32 tx_dropped_pkts;
+ __le32 tx_acm_dropped_pkts;
+ __le32 rx_dropped_pkts;
+ __le64 rx_good_bytes;
+ __le64 rx_bad_bytes;
+ __le64 tx_good_bytes;
+} __packed;
+
#endif /* __MXL862XX_API_H */
diff --git a/drivers/net/dsa/mxl862xx/mxl862xx-cmd.h b/drivers/net/dsa/mxl862xx/mxl862xx-cmd.h
index 45df37cde40d1..f1ea40aa7ea08 100644
--- a/drivers/net/dsa/mxl862xx/mxl862xx-cmd.h
+++ b/drivers/net/dsa/mxl862xx/mxl862xx-cmd.h
@@ -16,6 +16,7 @@
#define MXL862XX_BRDGPORT_MAGIC 0x400
#define MXL862XX_CTP_MAGIC 0x500
#define MXL862XX_QOS_MAGIC 0x600
+#define MXL862XX_RMON_MAGIC 0x700
#define MXL862XX_SWMAC_MAGIC 0xa00
#define MXL862XX_EXTVLAN_MAGIC 0xb00
#define MXL862XX_VLANFILTER_MAGIC 0xc00
@@ -43,6 +44,8 @@
#define MXL862XX_QOS_METERCFGSET (MXL862XX_QOS_MAGIC + 0x2)
#define MXL862XX_QOS_METERALLOC (MXL862XX_QOS_MAGIC + 0x2a)
+#define MXL862XX_RMON_PORT_GET (MXL862XX_RMON_MAGIC + 0x1)
+
#define MXL862XX_MAC_TABLEENTRYADD (MXL862XX_SWMAC_MAGIC + 0x2)
#define MXL862XX_MAC_TABLEENTRYREAD (MXL862XX_SWMAC_MAGIC + 0x3)
#define MXL862XX_MAC_TABLEENTRYQUERY (MXL862XX_SWMAC_MAGIC + 0x4)
diff --git a/drivers/net/dsa/mxl862xx/mxl862xx.c b/drivers/net/dsa/mxl862xx/mxl862xx.c
index fca9a3e36bb69..8159f6a66d724 100644
--- a/drivers/net/dsa/mxl862xx/mxl862xx.c
+++ b/drivers/net/dsa/mxl862xx/mxl862xx.c
@@ -30,6 +30,47 @@
#define MXL862XX_API_READ_QUIET(dev, cmd, data) \
mxl862xx_api_wrap(dev, cmd, &(data), sizeof((data)), true, true)
+struct mxl862xx_mib_desc {
+ unsigned int size;
+ unsigned int offset;
+ const char *name;
+};
+
+#define MIB_DESC(_size, _name, _element) \
+{ \
+ .size = _size, \
+ .name = _name, \
+ .offset = offsetof(struct mxl862xx_rmon_port_cnt, _element) \
+}
+
+static const struct mxl862xx_mib_desc mxl862xx_mib[] = {
+ MIB_DESC(1, "TxUnicastPkts", tx_unicast_pkts),
+ MIB_DESC(1, "Tx64BytePkts", tx64byte_pkts),
+ MIB_DESC(1, "Tx127BytePkts", tx127byte_pkts),
+ MIB_DESC(1, "Tx255BytePkts", tx255byte_pkts),
+ MIB_DESC(1, "Tx511BytePkts", tx511byte_pkts),
+ MIB_DESC(1, "Tx1023BytePkts", tx1023byte_pkts),
+ MIB_DESC(1, "TxMaxBytePkts", tx_max_byte_pkts),
+ MIB_DESC(1, "TxDroppedPkts", tx_dropped_pkts),
+ MIB_DESC(1, "TxAcmDroppedPkts", tx_acm_dropped_pkts),
+ MIB_DESC(1, "TxCollCount", tx_coll_count),
+ MIB_DESC(1, "RxUnicastPkts", rx_unicast_pkts),
+ MIB_DESC(1, "RxUnderSizeGoodPkts", rx_under_size_good_pkts),
+ MIB_DESC(1, "RxOversizeGoodPkts", rx_oversize_good_pkts),
+ MIB_DESC(1, "RxUnderSizeErrorPkts", rx_under_size_error_pkts),
+ MIB_DESC(1, "RxFilteredPkts", rx_filtered_pkts),
+ MIB_DESC(1, "Rx64BytePkts", rx64byte_pkts),
+ MIB_DESC(1, "Rx127BytePkts", rx127byte_pkts),
+ MIB_DESC(1, "Rx255BytePkts", rx255byte_pkts),
+ MIB_DESC(1, "Rx511BytePkts", rx511byte_pkts),
+ MIB_DESC(1, "Rx1023BytePkts", rx1023byte_pkts),
+ MIB_DESC(1, "RxMaxBytePkts", rx_max_byte_pkts),
+ MIB_DESC(1, "RxDroppedPkts", rx_dropped_pkts),
+ MIB_DESC(1, "RxExtendedVlanDiscardPkts", rx_extended_vlan_discard_pkts),
+ MIB_DESC(1, "MtuExceedDiscardPkts", mtu_exceed_discard_pkts),
+ MIB_DESC(2, "RxBadBytes", rx_bad_bytes),
+};
+
#define MXL862XX_SDMA_PCTRLP(p) (0xbc0 + ((p) * 0x6))
#define MXL862XX_SDMA_PCTRL_EN BIT(0)
@@ -1734,6 +1775,110 @@ static int mxl862xx_port_bridge_flags(struct dsa_switch *ds, int port,
return 0;
}
+static void mxl862xx_get_strings(struct dsa_switch *ds, int port,
+ u32 stringset, u8 *data)
+{
+ int i;
+
+ if (stringset != ETH_SS_STATS)
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(mxl862xx_mib); i++)
+ ethtool_puts(&data, mxl862xx_mib[i].name);
+}
+
+static int mxl862xx_get_sset_count(struct dsa_switch *ds, int port, int sset)
+{
+ if (sset != ETH_SS_STATS)
+ return 0;
+
+ return ARRAY_SIZE(mxl862xx_mib);
+}
+
+static int mxl862xx_read_rmon(struct dsa_switch *ds, int port,
+ struct mxl862xx_rmon_port_cnt *cnt)
+{
+ memset(cnt, 0, sizeof(*cnt));
+ cnt->port_type = cpu_to_le32(MXL862XX_CTP_PORT);
+ cnt->port_id = cpu_to_le16(port);
+
+ return MXL862XX_API_READ(ds->priv, MXL862XX_RMON_PORT_GET, *cnt);
+}
+
+static void mxl862xx_get_ethtool_stats(struct dsa_switch *ds, int port,
+ u64 *data)
+{
+ const struct mxl862xx_mib_desc *mib;
+ struct mxl862xx_rmon_port_cnt cnt;
+ int ret, i;
+ void *field;
+
+ ret = mxl862xx_read_rmon(ds, port, &cnt);
+ if (ret) {
+ dev_err(ds->dev, "failed to read RMON stats on port %d\n", port);
+ return;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(mxl862xx_mib); i++) {
+ mib = &mxl862xx_mib[i];
+ field = (u8 *)&cnt + mib->offset;
+
+ if (mib->size == 1)
+ *data++ = le32_to_cpu(*(__le32 *)field);
+ else
+ *data++ = le64_to_cpu(*(__le64 *)field);
+ }
+}
+
+static void mxl862xx_get_eth_mac_stats(struct dsa_switch *ds, int port,
+ struct ethtool_eth_mac_stats *mac_stats)
+{
+ struct mxl862xx_rmon_port_cnt cnt;
+
+ if (mxl862xx_read_rmon(ds, port, &cnt))
+ return;
+
+ mac_stats->FramesTransmittedOK = le32_to_cpu(cnt.tx_good_pkts);
+ mac_stats->SingleCollisionFrames = le32_to_cpu(cnt.tx_single_coll_count);
+ mac_stats->MultipleCollisionFrames = le32_to_cpu(cnt.tx_mult_coll_count);
+ mac_stats->FramesReceivedOK = le32_to_cpu(cnt.rx_good_pkts);
+ mac_stats->FrameCheckSequenceErrors = le32_to_cpu(cnt.rx_fcserror_pkts);
+ mac_stats->AlignmentErrors = le32_to_cpu(cnt.rx_align_error_pkts);
+ mac_stats->OctetsTransmittedOK = le64_to_cpu(cnt.tx_good_bytes);
+ mac_stats->LateCollisions = le32_to_cpu(cnt.tx_late_coll_count);
+ mac_stats->FramesAbortedDueToXSColls = le32_to_cpu(cnt.tx_excess_coll_count);
+ mac_stats->OctetsReceivedOK = le64_to_cpu(cnt.rx_good_bytes);
+ mac_stats->MulticastFramesXmittedOK = le32_to_cpu(cnt.tx_multicast_pkts);
+ mac_stats->BroadcastFramesXmittedOK = le32_to_cpu(cnt.tx_broadcast_pkts);
+ mac_stats->MulticastFramesReceivedOK = le32_to_cpu(cnt.rx_multicast_pkts);
+ mac_stats->BroadcastFramesReceivedOK = le32_to_cpu(cnt.rx_broadcast_pkts);
+ mac_stats->FrameTooLongErrors = le32_to_cpu(cnt.rx_oversize_error_pkts);
+}
+
+static void mxl862xx_get_eth_ctrl_stats(struct dsa_switch *ds, int port,
+ struct ethtool_eth_ctrl_stats *ctrl_stats)
+{
+ struct mxl862xx_rmon_port_cnt cnt;
+
+ if (mxl862xx_read_rmon(ds, port, &cnt))
+ return;
+
+ ctrl_stats->MACControlFramesTransmitted = le32_to_cpu(cnt.tx_pause_count);
+ ctrl_stats->MACControlFramesReceived = le32_to_cpu(cnt.rx_good_pause_pkts);
+}
+
+static void mxl862xx_get_pause_stats(struct dsa_switch *ds, int port,
+ struct ethtool_pause_stats *pause_stats)
+{
+ struct mxl862xx_rmon_port_cnt cnt;
+
+ if (mxl862xx_read_rmon(ds, port, &cnt))
+ return;
+
+ pause_stats->tx_pause_frames = le32_to_cpu(cnt.tx_pause_count);
+ pause_stats->rx_pause_frames = le32_to_cpu(cnt.rx_good_pause_pkts);
+}
+
static const struct dsa_switch_ops mxl862xx_switch_ops = {
.get_tag_protocol = mxl862xx_get_tag_protocol,
.setup = mxl862xx_setup,
@@ -1758,6 +1903,12 @@ static const struct dsa_switch_ops mxl862xx_switch_ops = {
.port_vlan_filtering = mxl862xx_port_vlan_filtering,
.port_vlan_add = mxl862xx_port_vlan_add,
.port_vlan_del = mxl862xx_port_vlan_del,
+ .get_strings = mxl862xx_get_strings,
+ .get_sset_count = mxl862xx_get_sset_count,
+ .get_ethtool_stats = mxl862xx_get_ethtool_stats,
+ .get_eth_mac_stats = mxl862xx_get_eth_mac_stats,
+ .get_eth_ctrl_stats = mxl862xx_get_eth_ctrl_stats,
+ .get_pause_stats = mxl862xx_get_pause_stats,
};
static void mxl862xx_phylink_mac_config(struct phylink_config *config,
--
2.53.0
^ permalink raw reply related
* [PATCH net-next 0/2] net: dsa: mxl862xx: add statistics support
From: Daniel Golle @ 2026-04-11 0:13 UTC (permalink / raw)
To: Daniel Golle, Andrew Lunn, Vladimir Oltean, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Russell King, netdev,
linux-kernel
Cc: Frank Wunderlich, Chad Monroe, Cezary Wilmanski, Liang Xu,
Benny (Ying-Tsan) Weng, Jose Maria Verdu Munoz, Avinash Jayaraman,
John Crispin
Add per-port RMON statistics support for the MxL862xx DSA driver,
covering hardware-specific ethtool -S counters, standard IEEE 802.3
MAC/ctrl/pause statistics, and rtnl_link_stats64 via polled 64-bit
accumulation.
Daniel Golle (2):
net: dsa: mxl862xx: add ethtool statistics support
net: dsa: mxl862xx: implement .get_stats64
drivers/net/dsa/mxl862xx/mxl862xx-api.h | 142 ++++++++++
drivers/net/dsa/mxl862xx/mxl862xx-cmd.h | 3 +
drivers/net/dsa/mxl862xx/mxl862xx-host.c | 8 +-
drivers/net/dsa/mxl862xx/mxl862xx.c | 325 +++++++++++++++++++++++
drivers/net/dsa/mxl862xx/mxl862xx.h | 94 ++++++-
5 files changed, 565 insertions(+), 7 deletions(-)
--
2.53.0
^ permalink raw reply
* [PATCH v1 net] tcp: Don't set treq->req_usec_ts in cookie_tcp_reqsk_init().
From: Kuniyuki Iwashima @ 2026-04-10 23:53 UTC (permalink / raw)
To: Eric Dumazet, Neal Cardwell, David S. Miller, Jakub Kicinski,
Paolo Abeni
Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
Commit de5626b95e13 ("tcp: Factorise cookie-independent fields
initialisation in cookie_v[46]_check().") miscategorised
tcp_rsk(req)->req_usec_ts init to cookie_tcp_reqsk_init(),
which is used by both BPF/non-BPF SYN cookie reqsk.
Rather, it should have been moved to cookie_tcp_reqsk_alloc() by
commit 8e7bab6b9652 ("tcp: Factorise cookie-dependent fields
initialisation in cookie_v[46]_check()") so that only non-BPF SYN
cookie sets tcp_rsk(req)->req_usec_ts to false.
Let's move the initialisation to cookie_tcp_reqsk_alloc() to
respect bpf_tcp_req_attrs.usec_ts_ok.
Fixes: e472f88891ab ("bpf: tcp: Support arbitrary SYN Cookie.")
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
net/ipv4/syncookies.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index fc3affd9c801..b5f0a65c6786 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -286,7 +286,6 @@ static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb,
treq->rcv_isn = ntohl(th->seq) - 1;
treq->snt_isn = ntohl(th->ack_seq) - 1;
treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
- treq->req_usec_ts = false;
#if IS_ENABLED(CONFIG_MPTCP)
treq->is_mptcp = sk_is_mptcp(sk);
@@ -349,6 +348,7 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
ireq->wscale_ok = tcp_opt->wscale_ok;
ireq->ecn_ok = !!(tcp_opt->rcv_tsecr & TS_OPT_ECN);
+ treq->req_usec_ts = false;
treq->ts_off = tsoff;
return req;
--
2.53.0.1213.gd9a14994de-goog
^ permalink raw reply related
* Re: [PATCH net-next] net: phy: call phy_init_hw() in phy resume path
From: Russell King (Oracle) @ 2026-04-10 23:51 UTC (permalink / raw)
To: Biju Das
Cc: Andrew Lunn, Heiner Kallweit, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Ovidiu Panait,
netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
Geert Uytterhoeven, Prabhakar Mahadev Lad,
linux-renesas-soc@vger.kernel.org
In-Reply-To: <TY3PR01MB113466D8DD83EC0B48D8262CA86592@TY3PR01MB11346.jpnprd01.prod.outlook.com>
On Fri, Apr 10, 2026 at 05:27:02PM +0000, Biju Das wrote:
> Hi Russell King/ Andrew,
>
> > -----Original Message-----
> > From: Russell King <linux@armlinux.org.uk>
> > Sent: 10 April 2026 15:55
> > Subject: Re: [PATCH net-next] net: phy: call phy_init_hw() in phy resume path
> >
> > On Fri, Apr 10, 2026 at 03:51:18PM +0100, Russell King (Oracle) wrote:
> > > On Fri, Apr 10, 2026 at 03:29:01PM +0100, Biju wrote:
> > > > From: Ovidiu Panait <ovidiu.panait.rb@renesas.com>
> > > >
> > > > When mac_managed_pm flag is set, mdio_bus_phy_resume() is skipped,
> > > > so phy_init_hw(), which performs soft_reset and config_init, is not
> > > > called during resume.
> > > >
> > > > This is inconsistent with the non-mac_managed_pm path, where
> > > > mdio_bus_phy_resume() calls phy_init_hw() before phy_resume() on
> > > > every resume.
> > > >
> > > > To align both paths, add a phy_init_hw() call at the top of
> > > > __phy_resume(), before invoking the driver's resume callback. This
> > > > guarantees the PHY undergoes soft reset and re-initialization
> > > > regardless of whether PM is managed by the MAC or the MDIO bus.
> > > >
> > > > Signed-off-by: Ovidiu Panait <ovidiu.panait.rb@renesas.com>
> > > > Signed-off-by: Biju Das <biju.das.jz@bp.renesas.com>
> > > > ---
> > > > drivers/net/phy/phy_device.c | 4 ++++
> > > > 1 file changed, 4 insertions(+)
> > > >
> > > > diff --git a/drivers/net/phy/phy_device.c
> > > > b/drivers/net/phy/phy_device.c index 0edff47478c2..8255f4208d66
> > > > 100644
> > > > --- a/drivers/net/phy/phy_device.c
> > > > +++ b/drivers/net/phy/phy_device.c
> > > > @@ -2008,6 +2008,10 @@ int __phy_resume(struct phy_device *phydev)
> > > > if (!phydrv || !phydrv->resume)
> > > > return 0;
> > > >
> > > > + ret = phy_init_hw(phydev);
> > > > + if (ret)
> > > > + return ret;
> > >
> > > Do we want to do this even when phydrv->resume is NULL?
> >
> > I should've also added (sorry, busy packing) - with it always being called even when phydrv->resume is
> > NULL, it means that the call sites to phy_resume() in phylib which are preceded by a call to
> > phy_init_hw() should have that call removed, otherwise we're going to be calling phy_init_hw() twice.
> >
> > As the patch currently stands, that's the case when phydrv->resume is populated, and I think we should
> > avoid that.
> >
> > > Apart from that, looks fine to me - it seems phy_init_hw() can be
> > > called with or without phydev->lock held, and this one will call it
> > > with the lock held which seems to be okay.
>
>
> The new patch will look like this, after moving the phy_init_hw() call
> to where phydev->lock is not held. Please let me know if you are OK with this.
>
>
> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
> index 0edff47478c2..4a2b19d39373 100644
> --- a/drivers/net/phy/phy_device.c
> +++ b/drivers/net/phy/phy_device.c
> @@ -396,10 +396,6 @@ static __maybe_unused int mdio_bus_phy_resume(struct device *dev)
> WARN_ON(phydev->state != PHY_HALTED && phydev->state != PHY_READY &&
> phydev->state != PHY_UP);
>
> - ret = phy_init_hw(phydev);
> - if (ret < 0)
> - return ret;
> -
> ret = phy_resume(phydev);
> if (ret < 0)
> return ret;
> @@ -1857,16 +1853,14 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
> if (dev)
> netif_carrier_off(phydev->attached_dev);
>
> - /* Do initial configuration here, now that
> + /* Do initial configuration inside phy_init_hw(), now that
> * we have certain key parameters
> * (dev_flags and interface)
> */
> - err = phy_init_hw(phydev);
> + err = phy_resume(phydev);
> if (err)
> goto error;
>
> - phy_resume(phydev);
> -
> /**
> * If the external phy used by current mac interface is managed by
> * another mac interface, so we should create a device link between
> @@ -2020,6 +2014,10 @@ int phy_resume(struct phy_device *phydev)
> {
> int ret;
>
> + ret = phy_init_hw(phydev);
> + if (ret)
> + return ret;
> +
> mutex_lock(&phydev->lock);
> ret = __phy_resume(phydev);
> mutex_unlock(&phydev->lock);
Looks good to me. Thanks!
--
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTP is here! 80Mbps down 10Mbps up. Decent connectivity at last!
^ permalink raw reply
* Re: [PATCH net-next] net: phy: call phy_init_hw() in phy resume path
From: Russell King (Oracle) @ 2026-04-10 23:49 UTC (permalink / raw)
To: Biju Das
Cc: Andrew Lunn, biju.das.au, Heiner Kallweit, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Ovidiu Panait,
netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
Geert Uytterhoeven, Prabhakar Mahadev Lad,
linux-renesas-soc@vger.kernel.org
In-Reply-To: <TY3PR01MB11346B6680E5952BD7B7078CA86592@TY3PR01MB11346.jpnprd01.prod.outlook.com>
On Fri, Apr 10, 2026 at 04:41:08PM +0000, Biju Das wrote:
> Hi Andrew,
>
> > -----Original Message-----
> > From: Andrew Lunn <andrew@lunn.ch>
> > Sent: 10 April 2026 16:15
> > Subject: Re: [PATCH net-next] net: phy: call phy_init_hw() in phy resume path
> >
> > > Apart from that, looks fine to me - it seems phy_init_hw() can be
> > > called with or without phydev->lock held, and this one will call it
> > > with the lock held which seems to be okay.
> >
> > Haven't we had deadlocks in this area before?
> >
> > Please test with CONFIG_PROVE_LOCKING enabled.
>
> I haven't faced any issue with the Micrel PHY, but my colleague
> got the below issue with a Microsemi PHY. It doesn't finish booting.
>
> drivers/net/phy/mscc/mscc_main.c
Looking at this driver, I'm wondering why it's taking phydev->lock
in vsc85xx_edge_rate_cntl_set()... phy_modify_paged() is already
a fully locked atomic operation (it takes the bus lock) and taking
phydev->lock gains nothing.
vsc85xx_mac_if_set() is a different matter, and this _should_ be
using phy_modify() to atomically change MSCC_PHY_EXT_PHY_CNTL_1.
phydev->lock doesn't guarantee that e.g. userspace won't access
the register behind this code's back.
vsc8531_pre_init_seq_set() is a repeat of vsc85xx_edge_rate_cntl_set()
except with phy_select_page()..phy_restore_page() which does the
necessary bus locking to ensure the entire sequence is done atomically.
Ditto vsc85xx_eee_init_seq_set().
So, I question whether any of the functions in this driver actually
have a valid reason to take phydev->lock - looks to me like a not
very well written driver.
In cases like this, I don't think we should make things more
difficult in the core just because we have a lockdep splat when that
can be avoided by killing off unnecessary locking.
--
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTP is here! 80Mbps down 10Mbps up. Decent connectivity at last!
^ permalink raw reply
* Re: [PATCH 2/4] net: ionic: Add PHC state page for user space access
From: Allen Hubbe @ 2026-04-10 23:44 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Abhijit Gangurde, jgg, leon, brett.creeley, andrew+netdev, davem,
edumazet, pabeni, nikhil.agarwal, linux-rdma, netdev,
linux-kernel
In-Reply-To: <20260410134311.785683cd@kernel.org>
On 4/10/2026 4:43 PM, Jakub Kicinski wrote:
> On Fri, 10 Apr 2026 09:10:09 -0400 Allen Hubbe wrote:
>>>> +struct ionic_phc_state {
>>>> + __u32 seq;
>>>> + __u32 rsvd;
>>>> + __aligned_u64 mask;
>>>> + __aligned_u64 tick;
>>>> + __aligned_u64 nsec;
>>>> + __aligned_u64 frac;
>>>> + __u32 mult;
>>>> + __u32 shift;
>>>> +};
>>>
>>> You're just exposing kernel timecounter internals.
>>> Why is this ionic uAPI and not something reusable by other drivers?
>>
>> The simple answer is just following the same approach as an existing
>> implementation. See struct mlx5_ib_clock_info and
>> mlx5_update_clock_info_page().
>>
>> Making this common might risk presuming that other implementations will
>> be a similar design. Compare these to the sfc driver. The clock is
>> quite different from ionic and mlx5, not using timecounter, because
>> instead of a free-running cycle counter the hardware itself provides an
>> adjustable clock for timestamping.
>
> So your argument is basically that drivers which don't use sw timecounter
> exist so we shouldn't bother creating common definitions for drivers
> that do? Why do we have a common implementation of timecounter in the
> kernel at all then?
>
> These are rhetorical questions.
There is no suggestion to get rid of timecounter in the kernel.
Maybe I've been overthinking this and misunderstood your first reply.
Did you mean, just, why not move this to ib_user_verbs.h, struct
ib_uverbs_phc_state, and use it from the vendor driver?
^ permalink raw reply
* Re: [PATCH net] netrom: do some basic forms of validation on incoming frames
From: Hugh Blemings @ 2026-04-10 23:38 UTC (permalink / raw)
To: Craig, hugh, Kuniyuki Iwashima, kuba
Cc: davem, edumazet, gregkh, horms, linux-hams, linux-kernel, netdev,
pabeni, stable, workflows, yizhe
In-Reply-To: <761f83cc-58eb-4b4a-ba91-d11412e7b2a6@gmail.com>
On 11/4/2026 08:51, Craig wrote:
>> If the main concern here is ongoing maintenance of these Ham Radio
>> related protocols/drivers, can we pause for a moment on anything as
>> dramatic as removing from the tree entirely ?
>>
>> There is a good cohort of capable kernel folks that either are or
>> were ham radio operators who I believe, upon realising that things
>> have got to this point, will be happy to redouble efforts to ensure
>> this code is maintained and tested to a satisfactory standard.
>>
>> Or, alternatively, as a technical community it may be that the Ham
>> Radio interested folks conclude that out of tree or user space
>> solutions are a better way forward as others have proposed.
>>
>> Give us a few days, please, for the word to be put around that we
>> need to pull ourselves together a bit as a technical group :)
>>
>
> I, for one, really can't imagine pulling an entire network subsytem
> out of the kernel without any
> knowledge of how/if/when it's used. Like intercontinental radio
> networks, global email, ax.25
> keyboard-to-keyboard, BBS and other emergency-communication systems
> throughout the
> world. If you're sure the Internet will never fail, I guess it makes
> sense removing all of this
> since it's inconvenient to maintain.
>
> Global AX.25 keyboard-to-keyboard on 14.105 MHz
>
> https://qsl.net/kb9pvh/105.html
>
> AX.25/netrom VHF routed networks spanning from Oregon to Los Angeles.
>
> https://www.easymapmaker.com/map/80666c4898ec6e8fa0c35add5d03282d
>
> Global radio email using AX.25
>
> https://winlink.org/RMSChannels (1,336 AX.25 email packet nodes on
> the Earth and Space)
>
> This is all in operation by Amateur Radio ARES emergency
> protocols/technologies. This
> will not pass the headline test when it comes to Linux detractors.
>
> Most of this is running on Raspberry Pi / Linux 24/7.
>
> If we want to kill all these apps and somehow force them into user space,
> it's akin to just switching to Windows - and flounder with the
> Microsoft folks
> trying to do the same thing.
Your email Craig neatly encapsulates just some of the practical and
ongoing applications of the kernel code in question - I don't think this
is in dispute.
What's pertinent is if we as the ham/amateur radio community can agree
on whether in tree, out of tree modules, or a userspace device driver
approach make the most sense. If we are to keep code in the kernel in
any form, we as a community need to find someone(s) that have the skills
and bandwidth to keep the in tree code up to date.
I don't think this would be onerous and I have a couple of people in
mind to nudge who may be happy to do so if that proves the right way
forward. At a pinch I could do it, but that'll mean a lot of catching
up. But I think it reasonable that the responsibility here falls to
folks that are closer to the code in question than the wider and
overworked kernel maintainer community.
That said, I think Dan Cross's (KZ2X) earlier email makes a pretty strong
case for moving out of the kernel while still providing a way to have
backward compatibility, perhaps this might be the way forward?
In any case, done well, this approach would not kill the apps or force
anything like switching to Windows! :) Great projects like digipi would
be able to continue with minimal changes.
I wonder if a separate thread in linux-hams makes sense to discuss the
various longer term approaches to maintaining these capabilities - I'll
try to make time later today to kick one off - such deliberations will be
of less interest to the broader LKML and other lists.
Cheers/73
Hugh
>
>
> -craig
> https://digipi.org/
>
>
--
I am slowly moving to hugh@blemings.id.au as my main email address.
If you're using hugh@blemings.org please update your address book accordingly.
Thank you :)
^ permalink raw reply
* Re: [PATCH net-next v2] iavf: fix kernel-doc comment style in iavf_ethtool.c
From: patchwork-bot+netdevbpf @ 2026-04-10 23:10 UTC (permalink / raw)
To: Loktionov, Aleksandr
Cc: intel-wired-lan, anthony.l.nguyen, netdev, leszek.pepiak
In-Reply-To: <20260409093020.3808687-1-aleksandr.loktionov@intel.com>
Hello:
This patch was applied to netdev/net-next.git (main)
by Jakub Kicinski <kuba@kernel.org>:
On Thu, 9 Apr 2026 11:30:20 +0200 you wrote:
> iavf_ethtool.c contains 31 kernel-doc comment blocks using the legacy
> `**/` terminator instead of the correct single `*/`. Two function
> headers also use a colon separator (`iavf_get_channels:`,
> `iavf_set_channels:`) instead of the ` - ` dash required by kernel-doc.
>
> Additionally several comments embed their return-value descriptions in
> the body paragraph, producing `scripts/kernel-doc -Wreturn` warnings.
> Void functions that incorrectly say "Returns ..." are also rephrased.
>
> [...]
Here is the summary with links:
- [net-next,v2] iavf: fix kernel-doc comment style in iavf_ethtool.c
https://git.kernel.org/netdev/net-next/c/3f3a2aefbc66
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply
* Re: [PATCH net-next 0/3] net: dsa: mxl862xx: VLAN support and minor improvements
From: patchwork-bot+netdevbpf @ 2026-04-10 23:10 UTC (permalink / raw)
To: Daniel Golle
Cc: andrew, olteanv, davem, edumazet, kuba, pabeni, linux, netdev,
linux-kernel, frankwu, chad, cezary.wilmanski, lxu, yweng, jverdu,
ajayaraman, john
In-Reply-To: <cover.1775581804.git.daniel@makrotopia.org>
Hello:
This series was applied to netdev/net-next.git (main)
by Jakub Kicinski <kuba@kernel.org>:
On Tue, 7 Apr 2026 18:30:14 +0100 you wrote:
> This series adds VLAN offloading to the mxl862xx DSA driver along
> with two minor improvements to port setup and bridge configuration.
> VLAN support uses a hybrid architecture combining the Extended VLAN
> engine for PVID insertion and tag stripping with the VLAN Filter
> engine for per-port VID membership, both drawing from shared
> 1024-entry hardware pools partitioned across user ports at probe time.
>
> [...]
Here is the summary with links:
- [net-next,1/3] net: dsa: mxl862xx: reject DSA_PORT_TYPE_DSA
https://git.kernel.org/netdev/net-next/c/3a4056ec7ec8
- [net-next,2/3] net: dsa: mxl862xx: don't skip early bridge port configuration
https://git.kernel.org/netdev/net-next/c/71934b9e6f36
- [net-next,3/3] net: dsa: mxl862xx: implement VLAN functionality
https://git.kernel.org/netdev/net-next/c/d587f9b6dcc9
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply
* Re: [PATCH net-next v4 0/3] net: bridge: add stp_mode attribute for STP mode selection
From: patchwork-bot+netdevbpf @ 2026-04-10 23:10 UTC (permalink / raw)
To: Andy Roulin
Cc: netdev, bridge, razor, idosch, andrew+netdev, davem, edumazet,
kuba, pabeni, horms, corbet, shuah, petrm, donald.hunter,
jonas.gorski, linux-doc, linux-kselftest, linux-kernel
In-Reply-To: <20260405205224.3163000-1-aroulin@nvidia.com>
Hello:
This series was applied to netdev/net-next.git (main)
by Jakub Kicinski <kuba@kernel.org>:
On Sun, 5 Apr 2026 13:52:21 -0700 you wrote:
> The bridge-stp usermode helper is currently restricted to the initial
> network namespace, preventing userspace STP daemons like mstpd from
> operating on bridges in other namespaces. Since commit ff62198553e4
> ("bridge: Only call /sbin/bridge-stp for the initial network
> namespace"), bridges in non-init namespaces silently fall back to
> kernel STP with no way to request userspace STP.
>
> [...]
Here is the summary with links:
- [net-next,v4,1/3] net: bridge: add stp_mode attribute for STP mode selection
https://git.kernel.org/netdev/net-next/c/54fc83a17285
- [net-next,v4,2/3] docs: net: bridge: document stp_mode attribute
https://git.kernel.org/netdev/net-next/c/c4f2aab121cd
- [net-next,v4,3/3] selftests: net: add bridge STP mode selection test
https://git.kernel.org/netdev/net-next/c/20ae6d76e381
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply
* Re: [PATCH net] net: airoha: Fix FE_PSE_BUF_SET configuration if PPE2 is available
From: patchwork-bot+netdevbpf @ 2026-04-10 23:10 UTC (permalink / raw)
To: Lorenzo Bianconi
Cc: andrew+netdev, davem, edumazet, kuba, pabeni, horms,
linux-arm-kernel, linux-mediatek, netdev, xuegang.lu
In-Reply-To: <20260408-airoha-reg_fe_pse_buf_set-v1-1-0c4fa8f4d1d9@kernel.org>
Hello:
This patch was applied to netdev/net.git (main)
by Jakub Kicinski <kuba@kernel.org>:
On Wed, 08 Apr 2026 12:20:09 +0200 you wrote:
> airoha_fe_set routine is used to set specified bits to 1 in the selected
> register. In the FE_PSE_BUF_SET case this can lead to an overestimation of
> the required buffers for I/O queues since we can miss to set some bits
> of PSE_ALLRSV_MASK subfield to 0. Fix the issue relying on airoha_fe_rmw
> routine instead.
>
> Fixes: 8e38e08f2c560 ("net: airoha: fix PSE memory configuration in airoha_fe_pse_ports_init()")
> Tested-by: Xuegang Lu <xuegang.lu@airoha.com>
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
>
> [...]
Here is the summary with links:
- [net] net: airoha: Fix FE_PSE_BUF_SET configuration if PPE2 is available
https://git.kernel.org/netdev/net/c/02f729643959
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply
* Re: [PATCH net] netrom: do some basic forms of validation on incoming frames
From: Craig @ 2026-04-10 22:51 UTC (permalink / raw)
To: hugh, Kuniyuki Iwashima, kuba
Cc: davem, edumazet, gregkh, horms, linux-hams, linux-kernel, netdev,
pabeni, stable, workflows, yizhe
In-Reply-To: <4f5810a7-c792-4d6b-9f7c-6c6b289def19@blemings.org>
> If the main concern here is ongoing maintenance of these Ham Radio
> related protocols/drivers, can we pause for a moment on anything as
> dramatic as removing from the tree entirely ?
>
> There is a good cohort of capable kernel folks that either are or were
> ham radio operators who I believe, upon realising that things have got
> to this point, will be happy to redouble efforts to ensure this code
> is maintained and tested to a satisfactory standard.
>
> Or, alternatively, as a technical community it may be that the Ham
> Radio interested folks conclude that out of tree or user space
> solutions are a better way forward as others have proposed.
>
> Give us a few days, please, for the word to be put around that we need
> to pull ourselves together a bit as a technical group :)
>
I, for one, really can't imagine pulling an entire network subsystem out
of the kernel without any
knowledge of how/if/when it's used. Like intercontinental radio
networks, global email, ax.25
keyboard-to-keyboard, BBS and other emergency-communication systems
throughout the
world. If you're sure the Internet will never fail, I guess it makes
sense removing all of this
since it's inconvenient to maintain.
Global AX.25 keyboard-to-keyboard on 14.105 MHz
https://qsl.net/kb9pvh/105.html
AX.25/netrom VHF routed networks spanning from Oregon to Los Angeles.
https://www.easymapmaker.com/map/80666c4898ec6e8fa0c35add5d03282d
Global radio email using AX.25
https://winlink.org/RMSChannels (1,336 AX.25 email packet nodes on
the Earth and Space)
This is all in operation by Amateur Radio ARES emergency
protocols/technologies. This
will not pass the headline test when it comes to Linux detractors.
Most of this is running on Raspberry Pi / Linux 24/7.
If we want to kill all these apps and somehow force them into user space,
it's akin to just switching to Windows - and flounder with the Microsoft
folks
trying to do the same thing.
-craig
https://digipi.org/
^ permalink raw reply
* [PATCH net-next] net: shaper: Reject zero weight in shaper config
From: Mohsin Bashir @ 2026-04-10 22:51 UTC (permalink / raw)
To: netdev
Cc: ast, chuck.lever, davem, donald.hunter, edumazet, horms, kuba,
linux-kernel, matttbe, pabeni, mohsin.bashr
A zero weight is meaningless for DWRR scheduling and can cause
starvation of the affected node. Add a min-value constraint to
the weight attribute in the net_shaper netlink spec so that zero
is rejected at the netlink policy level.
Found while prototyping a new driver, existing drivers are not
affected.
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Mohsin Bashir <hmohsin@meta.com>
---
Documentation/netlink/specs/net_shaper.yaml | 2 ++
net/shaper/shaper_nl_gen.c | 6 +++---
2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/Documentation/netlink/specs/net_shaper.yaml b/Documentation/netlink/specs/net_shaper.yaml
index 3f2ad772b64b..7216568fcfc5 100644
--- a/Documentation/netlink/specs/net_shaper.yaml
+++ b/Documentation/netlink/specs/net_shaper.yaml
@@ -106,6 +106,8 @@ attribute-sets:
-
name: weight
type: u32
+ checks:
+ min: 1
doc: |
Relative weight for round robin scheduling of the
given shaper.
diff --git a/net/shaper/shaper_nl_gen.c b/net/shaper/shaper_nl_gen.c
index 9b29be3ef19a..0cad1a355350 100644
--- a/net/shaper/shaper_nl_gen.c
+++ b/net/shaper/shaper_nl_gen.c
@@ -20,7 +20,7 @@ const struct nla_policy net_shaper_handle_nl_policy[NET_SHAPER_A_HANDLE_ID + 1]
const struct nla_policy net_shaper_leaf_info_nl_policy[NET_SHAPER_A_WEIGHT + 1] = {
[NET_SHAPER_A_HANDLE] = NLA_POLICY_NESTED(net_shaper_handle_nl_policy),
[NET_SHAPER_A_PRIORITY] = { .type = NLA_U32, },
- [NET_SHAPER_A_WEIGHT] = { .type = NLA_U32, },
+ [NET_SHAPER_A_WEIGHT] = NLA_POLICY_MIN(NLA_U32, 1),
};
/* NET_SHAPER_CMD_GET - do */
@@ -43,7 +43,7 @@ static const struct nla_policy net_shaper_set_nl_policy[NET_SHAPER_A_IFINDEX + 1
[NET_SHAPER_A_BW_MAX] = { .type = NLA_UINT, },
[NET_SHAPER_A_BURST] = { .type = NLA_UINT, },
[NET_SHAPER_A_PRIORITY] = { .type = NLA_U32, },
- [NET_SHAPER_A_WEIGHT] = { .type = NLA_U32, },
+ [NET_SHAPER_A_WEIGHT] = NLA_POLICY_MIN(NLA_U32, 1),
};
/* NET_SHAPER_CMD_DELETE - do */
@@ -62,7 +62,7 @@ static const struct nla_policy net_shaper_group_nl_policy[NET_SHAPER_A_LEAVES +
[NET_SHAPER_A_BW_MAX] = { .type = NLA_UINT, },
[NET_SHAPER_A_BURST] = { .type = NLA_UINT, },
[NET_SHAPER_A_PRIORITY] = { .type = NLA_U32, },
- [NET_SHAPER_A_WEIGHT] = { .type = NLA_U32, },
+ [NET_SHAPER_A_WEIGHT] = NLA_POLICY_MIN(NLA_U32, 1),
[NET_SHAPER_A_LEAVES] = NLA_POLICY_NESTED(net_shaper_leaf_info_nl_policy),
};
--
2.52.0
^ permalink raw reply related
* [PATCH iproute 2/2] json_writer: fix builtin test code
From: Stephen Hemminger @ 2026-04-10 22:47 UTC (permalink / raw)
To: netdev; +Cc: Stephen Hemminger
In-Reply-To: <20260410224745.93416-1-stephen@networkplumber.org>
The code under #ifdef was not generating valid JSON
and was missing a test for control characters.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
lib/json_writer.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/lib/json_writer.c b/lib/json_writer.c
index 7202621e..d4a1c72b 100644
--- a/lib/json_writer.c
+++ b/lib/json_writer.c
@@ -369,7 +369,7 @@ int main(int argc, char **argv)
jsonw_null_field(wr, "my_null");
jsonw_name(wr, "special chars");
- jsonw_start_array(wr);
+ jsonw_start_object(wr);
jsonw_string_field(wr, "slash", "/");
jsonw_string_field(wr, "newline", "\n");
jsonw_string_field(wr, "tab", "\t");
@@ -377,7 +377,14 @@ int main(int argc, char **argv)
jsonw_string_field(wr, "quote", "\"");
jsonw_string_field(wr, "tick", "\'");
jsonw_string_field(wr, "backslash", "\\");
- jsonw_end_array(wr);
+ jsonw_end_object(wr);
+
+ jsonw_name(wr, "control chars");
+ jsonw_start_object(wr);
+ jsonw_string_field(wr, "bell", "\a");
+ jsonw_string_field(wr, "esc", "\033");
+ jsonw_string_field(wr, "del", "\177");
+ jsonw_end_object(wr);
jsonw_end_object(wr);
--
2.53.0
^ permalink raw reply related
* [PATCH iproute 1/2] json_writer: support control character escaping
From: Stephen Hemminger @ 2026-04-10 22:47 UTC (permalink / raw)
To: netdev; +Cc: Stephen Hemminger
Iproute2 never handled control characters in strings correctly.
There are some cases where the string is under user control,
such as paths in the ss command. Make iproute2 JSON output conform
to RFC 8259.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
lib/json_writer.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/lib/json_writer.c b/lib/json_writer.c
index 2f3936c2..7202621e 100644
--- a/lib/json_writer.c
+++ b/lib/json_writer.c
@@ -53,7 +53,7 @@ static void jsonw_eor(json_writer_t *self)
/* Output JSON encoded string */
-/* Handles C escapes, does not do Unicode */
+/* Handles C escapes and control characters per RFC 8259 */
static void jsonw_puts(json_writer_t *self, const char *str)
{
putc('"', self->out);
@@ -81,7 +81,10 @@ static void jsonw_puts(json_writer_t *self, const char *str)
fputs("\\\"", self->out);
break;
default:
- putc(*str, self->out);
+ if ((unsigned char)*str < 0x20 || *str == 0x7f)
+ fprintf(self->out, "\\u%04x", *str);
+ else
+ putc(*str, self->out);
}
putc('"', self->out);
}
--
2.53.0
^ permalink raw reply related
* Re: [PATCH net-next 0/2] Add selftests for ntuple (NFC) rules
From: patchwork-bot+netdevbpf @ 2026-04-10 22:40 UTC (permalink / raw)
To: Dimitri Daskalakis
Cc: davem, andrew+netdev, edumazet, kuba, pabeni, shuah, willemb,
petrm, dw, carges, cjubran, daskald, netdev, linux-kselftest
In-Reply-To: <20260407164954.2977820-1-dimitri.daskalakis1@gmail.com>
Hello:
This series was applied to netdev/net-next.git (main)
by Jakub Kicinski <kuba@kernel.org>:
On Tue, 7 Apr 2026 09:49:52 -0700 you wrote:
> From: Dimitri Daskalakis <daskald@meta.com>
>
> Thoroughly testing a device's NFC implementation can be tedious. The more
> features a device supports, the more combinations to validate.
>
> This series aims to ease that burden, validating the most common NFC rule
> combinations.
>
> [...]
Here is the summary with links:
- [net-next,1/2] selftests: drv-net: Add ntuple (NFC) flow steering test
https://git.kernel.org/netdev/net-next/c/18589df9344c
- [net-next,2/2] selftests: drv-net: ntuple: Add dst-ip, src-port, dst-port fields
https://git.kernel.org/netdev/net-next/c/a66374a3eb02
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox