Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH net v6 0/4] Fix i40e/ice/iavf VF bonding after netdev lock changes
From: Simon Horman @ 2026-06-22 10:31 UTC (permalink / raw)
  To: Jose Ignacio Tornos Martinez
  Cc: netdev, intel-wired-lan, przemyslaw.kitszel, aleksandr.loktionov,
	jacob.e.keller, jesse.brandeburg, anthony.l.nguyen, davem,
	edumazet, kuba, pabeni
In-Reply-To: <20260619061321.8554-1-jtornosm@redhat.com>

On Fri, Jun 19, 2026 at 08:13:15AM +0200, Jose Ignacio Tornos Martinez wrote:
> This series fixes VF bonding failures introduced by commit ad7c7b2172c3
> ("net: hold netdev instance lock during sysfs operations").

...

Hi Jose,

Unfortunately the Netdev CI was unable to apply this series cleanly against net.
Would you be able to rebase and repost?

-- 
pw-bot: changes-requested



^ permalink raw reply

* [PATCH net v2] net: usb: lan78xx: restore VLAN and hash filters after link up
From: Nicolai Buchwitz @ 2026-06-22 10:29 UTC (permalink / raw)
  To: Thangaraj Samynathan, Rengarajan Sundararajan, UNGLinuxDriver,
	Woojung.Huh
  Cc: Andrew Lunn, David S . Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Sven Schuchmann, netdev, linux-usb, linux-kernel,
	Nicolai Buchwitz

Configured VLANs intermittently stop receiving traffic after a link
down/up cycle, e.g. when the network cable is unplugged and plugged back
in. VLAN filtering stays enabled but all VLAN-tagged frames are dropped
until a VLAN is added or removed again.

The LAN7801 datasheet (DS00002123E) states:

  "A portion of the MAC operates on clocks generated by the Ethernet
   PHY. During a PHY reset event, this portion of the MAC is designed to
   not be taken out of reset until the PHY clocks are operational"
  (section 8.10, MAC Reset Watchdog Timer)

  "After a reset event, the RFE will automatically initialize the
   contents of the VHF to 0h."
  (section 7.1.4, VHF Organization)

Thus a link down/up cycle stops and restarts the PHY clock, resets the
PHY-clocked portion of the MAC, and the RFE clears its VLAN/DA hash
filter (VHF) memory. The VHF holds both the VLAN filter table and the
multicast hash table, but the driver never reprograms either from its
shadow copy once the link is back, so both stay empty.

Reprogram the VLAN filter and multicast hash tables on link up.

Reported-by: Sven Schuchmann <schuchmann@schleissheimer.de>
Closes: https://lore.kernel.org/netdev/BEZP281MB224501E38B30BFDC4BD3D364D9E32@BEZP281MB2245.DEUP281.PROD.OUTLOOK.COM/T/#u
Tested-by: Sven Schuchmann <schuchmann@schleissheimer.de>
Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver")
Signed-off-by: Nicolai Buchwitz <nb@tipi-net.de>
---
v2:
 - Reprogram in lan78xx_mac_link_up() instead of lan78xx_reset(); the
   table is lost on a plain link down/up cycle, where reset() is not
   called. This also avoids the usb_autopm_get_interface() -EACCES path
   in reset_resume() that was flagged on v1.
 - Also restore the multicast hash table: the RFE clears the whole VHF
   (VLAN + hash) memory, per the LAN7801 datasheet.

v1: https://lore.kernel.org/netdev/20260618191109.4086598-1-nb@tipi-net.de/

 drivers/net/usb/lan78xx.c | 37 +++++++++++++++++++++++++++++++------
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index bcf293ea1bd3..c4cebacabcb5 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -1452,6 +1452,15 @@ static inline u32 lan78xx_hash(char addr[ETH_ALEN])
 	return (ether_crc(ETH_ALEN, addr) >> 23) & 0x1ff;
 }
 
+static int lan78xx_write_mchash_table(struct lan78xx_net *dev)
+{
+	struct lan78xx_priv *pdata = (struct lan78xx_priv *)(dev->data[0]);
+
+	return lan78xx_dataport_write(dev, DP_SEL_RSEL_VLAN_DA_,
+				      DP_SEL_VHF_VLAN_LEN,
+				      DP_SEL_VHF_HASH_LEN, pdata->mchash_table);
+}
+
 static void lan78xx_deferred_multicast_write(struct work_struct *param)
 {
 	struct lan78xx_priv *pdata =
@@ -1462,9 +1471,7 @@ static void lan78xx_deferred_multicast_write(struct work_struct *param)
 	netif_dbg(dev, drv, dev->net, "deferred multicast write 0x%08x\n",
 		  pdata->rfe_ctl);
 
-	ret = lan78xx_dataport_write(dev, DP_SEL_RSEL_VLAN_DA_,
-				     DP_SEL_VHF_VLAN_LEN,
-				     DP_SEL_VHF_HASH_LEN, pdata->mchash_table);
+	ret = lan78xx_write_mchash_table(dev);
 	if (ret < 0)
 		goto multicast_write_done;
 
@@ -1557,6 +1564,7 @@ static void lan78xx_set_multicast(struct net_device *netdev)
 }
 
 static void lan78xx_rx_urb_submit_all(struct lan78xx_net *dev);
+static int lan78xx_write_vlan_table(struct lan78xx_net *dev);
 
 static int lan78xx_mac_reset(struct lan78xx_net *dev)
 {
@@ -2514,6 +2522,17 @@ static void lan78xx_mac_link_up(struct phylink_config *config,
 	if (ret < 0)
 		goto link_up_fail;
 
+	/* The RFE clears the VLAN/DA hash filter (VHF) on a link down/up
+	 * cycle, so reprogram both tables from their shadow copies.
+	 */
+	ret = lan78xx_write_vlan_table(dev);
+	if (ret < 0)
+		goto link_up_fail;
+
+	ret = lan78xx_write_mchash_table(dev);
+	if (ret < 0)
+		goto link_up_fail;
+
 	netif_start_queue(net);
 
 	return;
@@ -3065,14 +3084,20 @@ static int lan78xx_set_features(struct net_device *netdev,
 	return lan78xx_write_reg(dev, RFE_CTL, pdata->rfe_ctl);
 }
 
+static int lan78xx_write_vlan_table(struct lan78xx_net *dev)
+{
+	struct lan78xx_priv *pdata = (struct lan78xx_priv *)(dev->data[0]);
+
+	return lan78xx_dataport_write(dev, DP_SEL_RSEL_VLAN_DA_, 0,
+				      DP_SEL_VHF_VLAN_LEN, pdata->vlan_table);
+}
+
 static void lan78xx_deferred_vlan_write(struct work_struct *param)
 {
 	struct lan78xx_priv *pdata =
 			container_of(param, struct lan78xx_priv, set_vlan);
-	struct lan78xx_net *dev = pdata->dev;
 
-	lan78xx_dataport_write(dev, DP_SEL_RSEL_VLAN_DA_, 0,
-			       DP_SEL_VHF_VLAN_LEN, pdata->vlan_table);
+	lan78xx_write_vlan_table(pdata->dev);
 }
 
 static int lan78xx_vlan_rx_add_vid(struct net_device *netdev,

base-commit: d07d80b6a129a44538cda1549b7acf95154fb197
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH v3 net] net: watchdog: fix refcount tracking races
From: Marek Szyprowski @ 2026-06-22 10:22 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
	netdev, eric.dumazet, syzbot+381d82bbf0253710b35d,
	syzbot+3479efbc2821cb2a79f2
In-Reply-To: <CANn89i+GVoQxFS26=s5w5vUa-ytRUgD1NM6MDZQdtB7FtcXv-w@mail.gmail.com>

On 22.06.2026 10:59, Eric Dumazet wrote:
> On Wed, Jun 17, 2026 at 3:48 AM Marek Szyprowski
> <m.szyprowski@samsung.com> wrote:
>> On 11.06.2026 17:27, Eric Dumazet wrote:
>>> Blamed commit converted the untracked dev_hold()/dev_put() calls
>>> in the watchdog code to use the tracked dev_hold_track()/dev_put_track()
>>> (which were later renamed/interfaced to netdev_hold() and netdev_put()).
>>>
>>> By introducing dev->watchdog_dev_tracker to store the
>>> reference tracking information without adding synchronization
>>> between netdev_watchdog_up() and dev_watchdog(), it enabled the
>>> race condition where this pointer could be overwritten or freed
>>> concurrently, leading to the list corruption crash syzbot reported:
>>>
>>> list_del corruption, ffff888114a18c00->next is NULL
>>>  kernel BUG at lib/list_debug.c:52 !
>>> Oops: invalid opcode: 0000 [#1] SMP KASAN PTI
>>> CPU: 1 UID: 0 PID: 91 Comm: kworker/u8:5 Not tainted syzkaller #0 PREEMPT(lazy)
>>> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/09/2026
>>> Workqueue: events_unbound linkwatch_event
>>>  RIP: 0010:__list_del_entry_valid_or_report.cold+0x22/0x2a lib/list_debug.c:52
>>> Call Trace:
>>>  <TASK>
>>>   __list_del_entry_valid include/linux/list.h:132 [inline]
>>>   __list_del_entry include/linux/list.h:246 [inline]
>>>   list_move_tail include/linux/list.h:341 [inline]
>>>   ref_tracker_free+0x1a7/0x6c0 lib/ref_tracker.c:329
>>>   netdev_tracker_free include/linux/netdevice.h:4491 [inline]
>>>   netdev_put include/linux/netdevice.h:4508 [inline]
>>>   netdev_put include/linux/netdevice.h:4504 [inline]
>>>   netdev_watchdog_down net/sched/sch_generic.c:600 [inline]
>>>   dev_deactivate_many+0x28c/0xfe0 net/sched/sch_generic.c:1363
>>>   dev_deactivate+0x109/0x1d0 net/sched/sch_generic.c:1397
>>>   linkwatch_do_dev net/core/link_watch.c:184 [inline]
>>>   linkwatch_do_dev+0xd3/0x120 net/core/link_watch.c:166
>>>   __linkwatch_run_queue+0x3a5/0x810 net/core/link_watch.c:240
>>>   linkwatch_event+0x8f/0xc0 net/core/link_watch.c:314
>>>   process_one_work+0xa0e/0x1980 kernel/workqueue.c:3314
>>>   process_scheduled_works kernel/workqueue.c:3397 [inline]
>>>   worker_thread+0x5ef/0xe50 kernel/workqueue.c:3478
>>>   kthread+0x370/0x450 kernel/kthread.c:436
>>>   ret_from_fork+0x69a/0xc80 arch/x86/kernel/process.c:158
>>>   ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
>>>
>>> This patch has three coordinated parts:
>>>
>>> 1) Add dev->watchdog_lock and dev->watchdog_ref_held to serialize watchdog operations.
>>>
>>> 2) Remove netdev_watchdog_up() call from netif_carrier_on():
>>>    This ensures netdev_watchdog_up() is only called from process/BH context
>>>    (via linkwatch workqueue dev_activate()), allowing us to use
>>>    spin_lock_bh() for synchronization.
>>>
>>> 3) Synchronize watchdog up and watchdog timer:
>>>    Protect netdev_watchdog_up() with tx_global_lock and watchdog_lock.
>>>    Only allocate a new tracker in netdev_watchdog_up() if one is
>>>    not already present.
>>>    In dev_watchdog(), ensure we don't release the tracker if the
>>>    timer was rescheduled either by dev_watchdog() itself or concurrently
>>>    by netdev_watchdog_up().
>>>
>>> Fixes: f12bf6f3f942 ("net: watchdog: add net device refcount tracker")
>>> Reported-by: syzbot+381d82bbf0253710b35d@syzkaller.appspotmail.com
>>> Closes: https://lore.kernel.org/netdev/6a26b751.c25708ab.1b19ef.0013.GAE@google.com/T/#u
>>> Tested-by: syzbot+3479efbc2821cb2a79f2@syzkaller.appspotmail.com
>>> Signed-off-by: Eric Dumazet <edumazet@google.com>
>> This patch landed recently in linux-next as commit 8eed5519e496 ("net: watchdog:
>> fix refcount tracking races"). In my tests I found that it causes the following
>> deadlock during system suspend/resume on QEmu's ARM64bit 'virt' machine:
>>
>> root@target:~# time rtcwake -s10 -mmem
>> rtcwake: assuming RTC uses UTC ...
>> rtcwake: wakeup from "mem" using /dev/rtc0 at Wed Jun 17 10:46:12 2026
>> PM: suspend entry (s2idle)
>> Filesystems sync: 0.055 seconds
>> Freezing user space processes
>> Freezing user space processes completed (elapsed 0.006 seconds)
>> OOM killer disabled.
>> Freezing remaining freezable tasks
>> Freezing remaining freezable tasks completed (elapsed 0.003 seconds)
>>
>> ============================================
>> WARNING: possible recursive locking detected
>> 7.1.0-rc7+ #13003 Not tainted
>> --------------------------------------------
>> rtcwake/254 is trying to acquire lock:
>> ffff000006de64e8 (&dev->tx_global_lock){+.-.}-{3:3}, at: netdev_watchdog_up+0x40/0x108
>>
>> but task is already holding lock:
>> ffff000006de64e8 (&dev->tx_global_lock){+.-.}-{3:3}, at: netif_tx_lock+0x1c/0x34
>>
>> other info that might help us debug this:
>>  Possible unsafe locking scenario:
>>
>>        CPU0
>>        ----
>>   lock(&dev->tx_global_lock);
>>   lock(&dev->tx_global_lock);
>>
>>  *** DEADLOCK ***
>>
>>  May be due to missing lock nesting notation
>>
>> 6 locks held by rtcwake/254:
>>  #0: ffff0000071ab3e8 (sb_writers#5){.+.+}-{0:0}, at: vfs_write+0x1ec/0x35c
>>  #1: ffff00000d22c480 (&of->mutex#2){+.+.}-{4:4}, at: kernfs_fop_write_iter+0xf0/0x1c4
>>  #2: ffff0000049162c8 (kn->active#61){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x100/0x1c4
>>  #3: ffffaa79533c03b0 (system_transition_mutex){+.+.}-{4:4}, at: pm_suspend+0x98/0x608
>>  #4: ffff000005e3a138 (&dev->mutex){....}-{4:4}, at: device_resume+0xb4/0x254
>>  #5: ffff000006de64e8 (&dev->tx_global_lock){+.-.}-{3:3}, at: netif_tx_lock+0x1c/0x34
>>
>> stack backtrace:
>> CPU: 1 UID: 0 PID: 254 Comm: rtcwake Not tainted 7.1.0-rc7+ #13003 PREEMPT
>> Hardware name: linux,dummy-virt (DT)
>> Call trace:
>>  show_stack+0x18/0x24 (C)
>>  dump_stack_lvl+0x90/0xd0
>>  dump_stack+0x18/0x24
>>  print_deadlock_bug+0x260/0x350
>>  __lock_acquire+0x11b8/0x225c
>>  lock_acquire+0x1c4/0x3f0
>>  _raw_spin_lock_bh+0x50/0x68
>>  netdev_watchdog_up+0x40/0x108
>>  netif_device_attach+0x9c/0xb0
>>  virtnet_restore+0x100/0x21c
>>  virtio_device_restore_priv+0x11c/0x1d0
>>  virtio_device_restore+0x14/0x20
>>  virtio_mmio_restore+0x34/0x40
>>  platform_pm_resume+0x2c/0x68
>>  dpm_run_callback+0xa0/0x240
>>  device_resume+0x120/0x254
>>  dpm_resume+0x1f8/0x2ec
>>  dpm_resume_end+0x18/0x34
>>  suspend_devices_and_enter+0x1d0/0x990
>>  pm_suspend+0x1ec/0x608
>>  state_store+0x8c/0x110
>>  kobj_attr_store+0x18/0x2c
>>  sysfs_kf_write+0x50/0x7c
>>  kernfs_fop_write_iter+0x130/0x1c4
>>  vfs_write+0x2b8/0x35c
>>  ksys_write+0x6c/0x104
>>  __arm64_sys_write+0x1c/0x28
>>  invoke_syscall+0x54/0x110
>>  el0_svc_common.constprop.0+0x40/0xe8
>>  do_el0_svc+0x20/0x2c
>>  el0_svc+0x54/0x338
>>  el0t_64_sync_handler+0xa0/0xe4
>>  el0t_64_sync+0x198/0x19c
>>
>>
>> Reverting $subject on top of linux-next fixes this issue.
> Thanks for the report Marek!
>
> Acquiring tx_global_lock in netdev_watchdog_up() appears unnecessary anyway
> because the critical state (timer and refcount tracker) is already
> protected by dev->watchdog_lock.
>
> Could you try this patch?

This fixes the observed issue. Thanks! Feel free to add:

Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>


> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index 3f1c510df850dbdbaf10d483547c7b1f3a5d5482..ef2b4bf51564173751c74fefe17e3913ed2fa056
> 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -594,9 +594,8 @@ void netdev_watchdog_up(struct net_device *dev)
>                 return;
>         if (dev->watchdog_timeo <= 0)
>                 dev->watchdog_timeo = 5*HZ;
> -       spin_lock_bh(&dev->tx_global_lock);
>
> -       spin_lock(&dev->watchdog_lock);
> +       spin_lock_bh(&dev->watchdog_lock);
>         if (!mod_timer(&dev->watchdog_timer,
>                        round_jiffies(jiffies + dev->watchdog_timeo))) {
>                 if (!dev->watchdog_ref_held) {
> @@ -605,9 +604,7 @@ void netdev_watchdog_up(struct net_device *dev)
>                         dev->watchdog_ref_held = true;
>                 }
>         }
> -       spin_unlock(&dev->watchdog_lock);
> -
> -       spin_unlock_bh(&dev->tx_global_lock);
> +       spin_unlock_bh(&dev->watchdog_lock);
>  }
>  EXPORT_SYMBOL_GPL(netdev_watchdog_up);
>
Best regards
-- 
Marek Szyprowski, PhD
Samsung R&D Institute Poland


^ permalink raw reply

* [PATCH v3 3/3] drm/xe/xe_ras: Add error-event support for CRI
From: Riana Tauro @ 2026-06-22 10:17 UTC (permalink / raw)
  To: intel-xe, dri-devel, netdev
  Cc: aravind.iddamsetty, anshuman.gupta, rodrigo.vivi, joonas.lahtinen,
	kuba, simona.vetter, airlied, pratik.bari, joshua.santosh.ranjan,
	ashwin.kumar.kulkarni, shubham.kumar, ravi.kishore.koppuravuri,
	raag.jadav, maarten.lankhorst, mallesh.koujalagi, soham.purkait,
	Riana Tauro
In-Reply-To: <20260622101716.3313496-5-riana.tauro@intel.com>

Add error-event support for Correctable errors in CRI.
error-event is reported to  userspace for all errors that crossed
threshold on receiving an interrupt for correctable errors.

Signed-off-by: Riana Tauro <riana.tauro@intel.com>
---
 drivers/gpu/drm/xe/xe_ras.c | 53 +++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c
index 44f4e1a3455b..acf3207aa2fd 100644
--- a/drivers/gpu/drm/xe/xe_ras.c
+++ b/drivers/gpu/drm/xe/xe_ras.c
@@ -77,6 +77,18 @@ static u8 drm_to_xe_ras_severity(u8 severity)
 	}
 }
 
+static u8 xe_to_drm_ras_severity(u8 severity)
+{
+	switch (severity) {
+	case XE_RAS_SEV_CORRECTABLE:
+		return DRM_XE_RAS_ERR_SEV_CORRECTABLE;
+	case XE_RAS_SEV_UNCORRECTABLE:
+		return DRM_XE_RAS_ERR_SEV_UNCORRECTABLE;
+	default:
+		return DRM_XE_RAS_ERR_SEV_MAX;
+	}
+}
+
 static u8 drm_to_xe_ras_component(u8 component)
 {
 	switch (component) {
@@ -95,6 +107,24 @@ static u8 drm_to_xe_ras_component(u8 component)
 	}
 }
 
+static u8 xe_to_drm_ras_component(u8 component)
+{
+	switch (component) {
+	case XE_RAS_COMP_DEVICE_MEMORY:
+		return DRM_XE_RAS_ERR_COMP_DEVICE_MEMORY;
+	case XE_RAS_COMP_CORE_COMPUTE:
+		return DRM_XE_RAS_ERR_COMP_CORE_COMPUTE;
+	case XE_RAS_COMP_PCIE:
+		return DRM_XE_RAS_ERR_COMP_PCIE;
+	case XE_RAS_COMP_FABRIC:
+		return DRM_XE_RAS_ERR_COMP_FABRIC;
+	case XE_RAS_COMP_SOC_INTERNAL:
+		return DRM_XE_RAS_ERR_COMP_SOC_INTERNAL;
+	default:
+		return DRM_XE_RAS_ERR_COMP_MAX;
+	}
+}
+
 static int ras_status_to_errno(u32 status)
 {
 	switch (status) {
@@ -131,6 +161,27 @@ static inline const char *comp_to_str(u8 component)
 	return xe_ras_components[component];
 }
 
+static void ras_send_error_event(struct xe_device *xe, u8 severity, u8 component)
+{
+	u8 drm_severity, drm_component;
+	u32 value;
+	int ret;
+
+	drm_severity = xe_to_drm_ras_severity(severity);
+	if (drm_severity == DRM_XE_RAS_ERR_SEV_MAX)
+		return;
+
+	drm_component = xe_to_drm_ras_component(component);
+	if (drm_component == DRM_XE_RAS_ERR_COMP_MAX)
+		return;
+
+	ret = xe_ras_get_counter(xe, severity, component, &value);
+	if (ret)
+		return;
+
+	xe_drm_ras_event(xe, drm_component, drm_severity, value, GFP_KERNEL);
+}
+
 void xe_ras_counter_threshold_crossed(struct xe_device *xe,
 				      struct xe_sysctrl_event_response *response)
 {
@@ -152,6 +203,8 @@ void xe_ras_counter_threshold_crossed(struct xe_device *xe,
 		severity = errors[id].common.severity;
 		component = errors[id].common.component;
 
+		ras_send_error_event(xe, severity, component);
+
 		xe_warn(xe, "[RAS]: %s %s detected\n",
 			comp_to_str(component), sev_to_str(severity));
 	}
-- 
2.47.1


^ permalink raw reply related

* [PATCH v3 2/3] drm/xe/xe_drm_ras: Add error-event support for PVC
From: Riana Tauro @ 2026-06-22 10:17 UTC (permalink / raw)
  To: intel-xe, dri-devel, netdev
  Cc: aravind.iddamsetty, anshuman.gupta, rodrigo.vivi, joonas.lahtinen,
	kuba, simona.vetter, airlied, pratik.bari, joshua.santosh.ranjan,
	ashwin.kumar.kulkarni, shubham.kumar, ravi.kishore.koppuravuri,
	raag.jadav, maarten.lankhorst, mallesh.koujalagi, soham.purkait,
	Riana Tauro
In-Reply-To: <20260622101716.3313496-5-riana.tauro@intel.com>

Report drm_ras error event to userspace when an error occurs.
Add support for core-compute and SoC errors in PVC.

$ sudo ynl --family drm_ras --output-json --subscribe error-report

{
    "name": "error-event",
     "msg": {
         "device-name": "0000:03:00.0",
         "node-id": 1,
         "node-name": "uncorrectable-errors",
         "error-id": 1,
         "error-name": "core-compute",
         "error-value": 1
     }
}

Signed-off-by: Riana Tauro <riana.tauro@intel.com>
Reviewed-by: Raag Jadav <raag.jadav@intel.com>
---
v2: use ynl (Raag)
    use value as function parameter
    move error event call to hw_error_source_handler 

v3: add has_drm_ras check
---
 drivers/gpu/drm/xe/xe_drm_ras.c  | 30 ++++++++++++++++++++++++++++++
 drivers/gpu/drm/xe/xe_drm_ras.h  |  3 +++
 drivers/gpu/drm/xe/xe_hw_error.c |  5 ++++-
 3 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/xe_drm_ras.c b/drivers/gpu/drm/xe/xe_drm_ras.c
index 7937d8ba0ed9..36afdfb5e412 100644
--- a/drivers/gpu/drm/xe/xe_drm_ras.c
+++ b/drivers/gpu/drm/xe/xe_drm_ras.c
@@ -185,6 +185,36 @@ static int register_nodes(struct xe_device *xe)
 	return ret;
 }
 
+/**
+ * xe_drm_ras_event() - Report drm-ras error event to userspace
+ * @xe: xe device structure
+ * @component: error component (see &enum drm_xe_ras_error_component)
+ * @severity: error severity (see &enum drm_xe_ras_error_severity)
+ * @value: value of error counter
+ * @flags: flags for allocation
+ *
+ * Report an error-event to userspace.
+ */
+void xe_drm_ras_event(struct xe_device *xe, u32 component, u32 severity, u32 value, gfp_t flags)
+{
+	struct xe_drm_ras *ras = &xe->ras;
+	struct xe_drm_ras_counter *info = ras->info[severity];
+	struct drm_ras_node *node = &ras->node[severity];
+	int ret;
+
+	/* Event is supported only if drm_ras is enabled */
+	if (!xe->info.has_drm_ras)
+		return;
+
+	if (!info || !info[component].name)
+		return;
+
+	ret = drm_ras_nl_error_event(node, component, info[component].name, value, flags);
+	if (ret)
+		drm_err(&xe->drm, "RAS error-event failed: %d for %s %s\n", ret,
+			info[component].name, error_severity[severity]);
+}
+
 /**
  * xe_drm_ras_init() - Initialize DRM RAS
  * @xe: xe device instance
diff --git a/drivers/gpu/drm/xe/xe_drm_ras.h b/drivers/gpu/drm/xe/xe_drm_ras.h
index 365c70e93e82..2a694bf69478 100644
--- a/drivers/gpu/drm/xe/xe_drm_ras.h
+++ b/drivers/gpu/drm/xe/xe_drm_ras.h
@@ -5,11 +5,14 @@
 #ifndef _XE_DRM_RAS_H_
 #define _XE_DRM_RAS_H_
 
+#include <linux/types.h>
+
 struct xe_device;
 
 #define for_each_error_severity(i)	\
 	for (i = 0; i < DRM_XE_RAS_ERR_SEV_MAX; i++)
 
 int xe_drm_ras_init(struct xe_device *xe);
+void xe_drm_ras_event(struct xe_device *xe, u32 component, u32 severity, u32 value, gfp_t flags);
 
 #endif
diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index 4a4b363fc844..a833cecc74ec 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -432,7 +432,7 @@ static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_er
 	struct xe_drm_ras *ras = &xe->ras;
 	struct xe_drm_ras_counter *info = ras->info[severity];
 	unsigned long flags, err_src;
-	u32 err_bit;
+	u32 err_bit, value;
 
 	if (!IS_DGFX(xe))
 		return;
@@ -495,6 +495,9 @@ static void hw_error_source_handler(struct xe_tile *tile, const enum hardware_er
 			gt_hw_error_handler(tile, hw_err, error_id);
 		if (err_bit == XE_SOC_ERROR)
 			soc_hw_error_handler(tile, hw_err, error_id);
+
+		value = atomic_read(&info[error_id].counter);
+		xe_drm_ras_event(xe, error_id, severity, value, GFP_ATOMIC);
 	}
 
 clear_reg:
-- 
2.47.1


^ permalink raw reply related

* [PATCH v3 1/3] drm/drm_ras: Add drm_ras netlink error event
From: Riana Tauro @ 2026-06-22 10:17 UTC (permalink / raw)
  To: intel-xe, dri-devel, netdev
  Cc: aravind.iddamsetty, anshuman.gupta, rodrigo.vivi, joonas.lahtinen,
	kuba, simona.vetter, airlied, pratik.bari, joshua.santosh.ranjan,
	ashwin.kumar.kulkarni, shubham.kumar, ravi.kishore.koppuravuri,
	raag.jadav, maarten.lankhorst, mallesh.koujalagi, soham.purkait,
	Riana Tauro, Zack McKevitt, Lijo Lazar, Hawking Zhang,
	David S. Miller, Paolo Abeni, Eric Dumazet
In-Reply-To: <20260622101716.3313496-5-riana.tauro@intel.com>

Define a new netlink event 'error-event' and a new multicast group
'error-report' in drm_ras. Each event contains device name, node and
error information to identify the error triggering the event.

Add drm_ras_nl_error_event() to trigger an event from the driver.
Userspace must subscribe to 'error-report' to receive 'error-event'
notifications.

Usage:

$ sudo ynl --family drm_ras --subscribe error-report

Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Zack McKevitt <zachary.mckevitt@oss.qualcomm.com>
Cc: Lijo Lazar <lijo.lazar@amd.com>
Cc: Hawking Zhang <Hawking.Zhang@amd.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Riana Tauro <riana.tauro@intel.com>
Reviewed-by: Raag Jadav <raag.jadav@intel.com>
---
v2: remove redundant initialization
    remove unnecessary space
    use ynl in commit message and doc (Raag)
    simplify doc for error-event attrs

v3: rename error-notify to error-report
    Replace notify with report across the file (Raag)
---
 Documentation/gpu/drm-ras.rst            | 21 ++++++
 Documentation/netlink/specs/drm_ras.yaml | 48 +++++++++++++
 drivers/gpu/drm/drm_ras.c                | 87 ++++++++++++++++++++++++
 drivers/gpu/drm/drm_ras_nl.c             |  6 ++
 drivers/gpu/drm/drm_ras_nl.h             |  4 ++
 include/drm/drm_ras.h                    |  5 ++
 include/uapi/drm/drm_ras.h               | 15 ++++
 7 files changed, 186 insertions(+)

diff --git a/Documentation/gpu/drm-ras.rst b/Documentation/gpu/drm-ras.rst
index 83c21853b74b..406e4c49bac1 100644
--- a/Documentation/gpu/drm-ras.rst
+++ b/Documentation/gpu/drm-ras.rst
@@ -56,6 +56,7 @@ User space tools can:
   ``node-id`` and ``error-id`` as parameters.
 * Clear specific error counters with the ``clear-error-counter`` command, using both
   ``node-id`` and ``error-id`` as parameters.
+* Subscribe to the ``error-report`` multicast group to receive ``error-event``.
 
 YAML-based Interface
 --------------------
@@ -111,3 +112,23 @@ Example: Clear an error counter for a given node
 
     sudo ynl --family drm_ras --do clear-error-counter --json '{"node-id":0, "error-id":1}'
     None
+
+Example: Subscribe to ``error-report`` multicast group
+
+.. code-block:: bash
+
+    sudo ynl --family drm_ras --output-json --subscribe error-report
+
+.. code-block:: json
+
+    {
+        "name": "error-event",
+        "msg": {
+            "device-name": "0000:03:00.0",
+            "node-id": 1,
+            "node-name": "uncorrectable-errors",
+            "error-id": 1,
+            "error-name": "error_name1",
+            "error-value": 1
+        }
+    }
diff --git a/Documentation/netlink/specs/drm_ras.yaml b/Documentation/netlink/specs/drm_ras.yaml
index e113056f8c01..8aed3d4515e5 100644
--- a/Documentation/netlink/specs/drm_ras.yaml
+++ b/Documentation/netlink/specs/drm_ras.yaml
@@ -69,6 +69,33 @@ attribute-sets:
         name: error-value
         type: u32
         doc: Current value of the requested error counter.
+  -
+    name: error-event-attrs
+    attributes:
+      -
+        name: device-name
+        type: string
+        doc: Device (PCI BDF, UUID) that reported the error.
+      -
+        name: node-id
+        type: u32
+        doc: ID of the node that reported the error.
+      -
+        name: node-name
+        type: string
+        doc: Name of the node that reported the error.
+      -
+        name: error-id
+        type: u32
+        doc: ID of the error counter.
+      -
+        name: error-name
+        type: string
+        doc: Name of the error.
+      -
+        name: error-value
+        type: u32
+        doc: Current value of the error counter.
 
 operations:
   list:
@@ -124,3 +151,24 @@ operations:
       do:
         request:
           attributes: *id-attrs
+    -
+      name: error-event
+      doc: >-
+           Report an error event to userspace.
+           The event includes the device, node and error information
+           of the error that triggered the event.
+      attribute-set: error-event-attrs
+      mcgrp: error-report
+      event:
+        attributes:
+          - device-name
+          - node-id
+          - node-name
+          - error-id
+          - error-name
+          - error-value
+
+mcast-groups:
+  list:
+    -
+      name: error-report
diff --git a/drivers/gpu/drm/drm_ras.c b/drivers/gpu/drm/drm_ras.c
index d6eab29a1394..77f912a4d101 100644
--- a/drivers/gpu/drm/drm_ras.c
+++ b/drivers/gpu/drm/drm_ras.c
@@ -41,6 +41,11 @@
  *    Userspace must provide Node ID, Error ID.
  *    Clears specific error counter of a node if supported.
  *
+ * 4. ERROR_REPORT: Subscribe to this multicast group to receive error events
+ *
+ * 5. ERROR_EVENT: Report an error event to userspace. The event contains device, node
+ *    and error information that triggered the event.
+ *
  * Node registration:
  *
  * - drm_ras_node_register(): Registers a new node and assigns
@@ -186,6 +191,34 @@ static int msg_reply_value(struct sk_buff *msg, u32 error_id,
 			   value);
 }
 
+static int msg_put_error_event_attrs(struct sk_buff *msg, struct drm_ras_node *node,
+				     u32 error_id, const char *error_name, u32 value)
+{
+	int ret;
+
+	ret = nla_put_string(msg, DRM_RAS_A_ERROR_EVENT_ATTRS_DEVICE_NAME, node->device_name);
+	if (ret)
+		return ret;
+
+	ret = nla_put_u32(msg, DRM_RAS_A_ERROR_EVENT_ATTRS_NODE_ID, node->id);
+	if (ret)
+		return ret;
+
+	ret = nla_put_string(msg, DRM_RAS_A_ERROR_EVENT_ATTRS_NODE_NAME, node->node_name);
+	if (ret)
+		return ret;
+
+	ret = nla_put_u32(msg, DRM_RAS_A_ERROR_EVENT_ATTRS_ERROR_ID, error_id);
+	if (ret)
+		return ret;
+
+	ret = nla_put_string(msg, DRM_RAS_A_ERROR_EVENT_ATTRS_ERROR_NAME, error_name);
+	if (ret)
+		return ret;
+
+	return nla_put_u32(msg, DRM_RAS_A_ERROR_EVENT_ATTRS_ERROR_VALUE, value);
+}
+
 static int doit_reply_value(struct genl_info *info, u32 node_id,
 			    u32 error_id)
 {
@@ -222,6 +255,60 @@ static int doit_reply_value(struct genl_info *info, u32 node_id,
 	return genlmsg_reply(msg, info);
 }
 
+/**
+ * drm_ras_nl_error_event() - Report an error event
+ * @node: Node structure
+ * @error_id: ID of the error
+ * @error_name: Name of the error
+ * @value: Value associated with the error
+ * @flags: GFP flags for memory allocation
+ *
+ * Report an error-event to userspace using the error-report multicast group.
+ *
+ * Return: 0 on success, or negative errno on failure.
+ */
+int drm_ras_nl_error_event(struct drm_ras_node *node, u32 error_id, const char *error_name,
+			   u32 value, gfp_t flags)
+{
+	struct genl_info info;
+	struct sk_buff *msg;
+	struct nlattr *hdr;
+	int ret;
+
+	if (!error_name)
+		return -EINVAL;
+
+	if (!genl_has_listeners(&drm_ras_nl_family, &init_net, DRM_RAS_NLGRP_ERROR_REPORT))
+		return 0;
+
+	genl_info_init_ntf(&info, &drm_ras_nl_family, DRM_RAS_CMD_ERROR_EVENT);
+
+	msg = genlmsg_new(NLMSG_GOODSIZE, flags);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = genlmsg_iput(msg, &info);
+	if (!hdr) {
+		ret = -EMSGSIZE;
+		goto free_msg;
+	}
+
+	ret = msg_put_error_event_attrs(msg, node, error_id, error_name, value);
+	if (ret)
+		goto cancel_msg;
+
+	genlmsg_end(msg, hdr);
+	genlmsg_multicast(&drm_ras_nl_family, msg, 0, DRM_RAS_NLGRP_ERROR_REPORT, flags);
+	return 0;
+
+cancel_msg:
+	genlmsg_cancel(msg, hdr);
+free_msg:
+	nlmsg_free(msg);
+	return ret;
+}
+EXPORT_SYMBOL(drm_ras_nl_error_event);
+
 /**
  * drm_ras_nl_get_error_counter_dumpit() - Dump all Error Counters
  * @skb: Netlink message buffer
diff --git a/drivers/gpu/drm/drm_ras_nl.c b/drivers/gpu/drm/drm_ras_nl.c
index dea1c1b2494e..9d3123cc9f9c 100644
--- a/drivers/gpu/drm/drm_ras_nl.c
+++ b/drivers/gpu/drm/drm_ras_nl.c
@@ -58,6 +58,10 @@ static const struct genl_split_ops drm_ras_nl_ops[] = {
 	},
 };
 
+static const struct genl_multicast_group drm_ras_nl_mcgrps[] = {
+	[DRM_RAS_NLGRP_ERROR_REPORT] = { "error-report", },
+};
+
 struct genl_family drm_ras_nl_family __ro_after_init = {
 	.name		= DRM_RAS_FAMILY_NAME,
 	.version	= DRM_RAS_FAMILY_VERSION,
@@ -66,4 +70,6 @@ struct genl_family drm_ras_nl_family __ro_after_init = {
 	.module		= THIS_MODULE,
 	.split_ops	= drm_ras_nl_ops,
 	.n_split_ops	= ARRAY_SIZE(drm_ras_nl_ops),
+	.mcgrps		= drm_ras_nl_mcgrps,
+	.n_mcgrps	= ARRAY_SIZE(drm_ras_nl_mcgrps),
 };
diff --git a/drivers/gpu/drm/drm_ras_nl.h b/drivers/gpu/drm/drm_ras_nl.h
index a398643572a5..03ec275aca92 100644
--- a/drivers/gpu/drm/drm_ras_nl.h
+++ b/drivers/gpu/drm/drm_ras_nl.h
@@ -21,6 +21,10 @@ int drm_ras_nl_get_error_counter_dumpit(struct sk_buff *skb,
 int drm_ras_nl_clear_error_counter_doit(struct sk_buff *skb,
 					struct genl_info *info);
 
+enum {
+	DRM_RAS_NLGRP_ERROR_REPORT,
+};
+
 extern struct genl_family drm_ras_nl_family;
 
 #endif /* _LINUX_DRM_RAS_GEN_H */
diff --git a/include/drm/drm_ras.h b/include/drm/drm_ras.h
index f2a787bc4f64..d4a275efdbb0 100644
--- a/include/drm/drm_ras.h
+++ b/include/drm/drm_ras.h
@@ -78,9 +78,14 @@ struct drm_device;
 #if IS_ENABLED(CONFIG_DRM_RAS)
 int drm_ras_node_register(struct drm_ras_node *node);
 void drm_ras_node_unregister(struct drm_ras_node *node);
+int drm_ras_nl_error_event(struct drm_ras_node *node, u32 error_id, const char *error_name,
+			   u32 value, gfp_t flags);
 #else
 static inline int drm_ras_node_register(struct drm_ras_node *node) { return 0; }
 static inline void drm_ras_node_unregister(struct drm_ras_node *node) { }
+static inline int drm_ras_nl_error_event(struct drm_ras_node *node, u32 error_id,
+					 const char *error_name, u32 value, gfp_t flags)
+{ return 0; }
 #endif
 
 #endif
diff --git a/include/uapi/drm/drm_ras.h b/include/uapi/drm/drm_ras.h
index 218a3ee86805..eab8231aa87c 100644
--- a/include/uapi/drm/drm_ras.h
+++ b/include/uapi/drm/drm_ras.h
@@ -38,13 +38,28 @@ enum {
 	DRM_RAS_A_ERROR_COUNTER_ATTRS_MAX = (__DRM_RAS_A_ERROR_COUNTER_ATTRS_MAX - 1)
 };
 
+enum {
+	DRM_RAS_A_ERROR_EVENT_ATTRS_DEVICE_NAME = 1,
+	DRM_RAS_A_ERROR_EVENT_ATTRS_NODE_ID,
+	DRM_RAS_A_ERROR_EVENT_ATTRS_NODE_NAME,
+	DRM_RAS_A_ERROR_EVENT_ATTRS_ERROR_ID,
+	DRM_RAS_A_ERROR_EVENT_ATTRS_ERROR_NAME,
+	DRM_RAS_A_ERROR_EVENT_ATTRS_ERROR_VALUE,
+
+	__DRM_RAS_A_ERROR_EVENT_ATTRS_MAX,
+	DRM_RAS_A_ERROR_EVENT_ATTRS_MAX = (__DRM_RAS_A_ERROR_EVENT_ATTRS_MAX - 1)
+};
+
 enum {
 	DRM_RAS_CMD_LIST_NODES = 1,
 	DRM_RAS_CMD_GET_ERROR_COUNTER,
 	DRM_RAS_CMD_CLEAR_ERROR_COUNTER,
+	DRM_RAS_CMD_ERROR_EVENT,
 
 	__DRM_RAS_CMD_MAX,
 	DRM_RAS_CMD_MAX = (__DRM_RAS_CMD_MAX - 1)
 };
 
+#define DRM_RAS_MCGRP_ERROR_REPORT	"error-report"
+
 #endif /* _UAPI_LINUX_DRM_RAS_H */
-- 
2.47.1


^ permalink raw reply related

* [PATCH v3 0/3] Add drm_ras netlink error event support
From: Riana Tauro @ 2026-06-22 10:17 UTC (permalink / raw)
  To: intel-xe, dri-devel, netdev
  Cc: aravind.iddamsetty, anshuman.gupta, rodrigo.vivi, joonas.lahtinen,
	kuba, simona.vetter, airlied, pratik.bari, joshua.santosh.ranjan,
	ashwin.kumar.kulkarni, shubham.kumar, ravi.kishore.koppuravuri,
	raag.jadav, maarten.lankhorst, mallesh.koujalagi, soham.purkait,
	Riana Tauro

Define a new netlink event 'error-event' and a new multicast group
'error-report' in drm_ras. Each event contains device name, node and
error information to identify the error triggering the event.

Add drm_ras_nl_error_event() to trigger an event from the driver.
Wire this support to xe drm_ras to notify userspace whenever a GT or
SoC error occurs in PVC. Also add support for correctable errors in
CRI.

$ sudo ynl --family drm_ras --output-json --subscribe error-report

{
    "name": "error-event",
     "msg": {
         "device-name": "0000:03:00.0",
         "node-id": 1,
         "node-name": "uncorrectable-errors",
         "error-id": 1,
         "error-name": "core-compute",
         "error-value": 1
     }
}

Rev2: use ynl in document and commit message
      fix cosmetic review comments
      simplify caller 

Rev3: replace error-event with error-report
      had has_drm_ras check 
      add support for correctable errors in CRI

Riana Tauro (3):
  drm/drm_ras: Add drm_ras netlink error event
  drm/xe/xe_drm_ras: Add error-event support for PVC
  drm/xe/xe_ras: Add error-event support for Crescent Island

 Documentation/gpu/drm-ras.rst            | 21 ++++++
 Documentation/netlink/specs/drm_ras.yaml | 48 +++++++++++++
 drivers/gpu/drm/drm_ras.c                | 87 ++++++++++++++++++++++++
 drivers/gpu/drm/drm_ras_nl.c             |  6 ++
 drivers/gpu/drm/drm_ras_nl.h             |  4 ++
 drivers/gpu/drm/xe/xe_drm_ras.c          | 30 ++++++++
 drivers/gpu/drm/xe/xe_drm_ras.h          |  3 +
 drivers/gpu/drm/xe/xe_hw_error.c         |  5 +-
 drivers/gpu/drm/xe/xe_ras.c              | 53 +++++++++++++++
 include/drm/drm_ras.h                    |  5 ++
 include/uapi/drm/drm_ras.h               | 15 ++++
 11 files changed, 276 insertions(+), 1 deletion(-)

-- 
2.47.1


^ permalink raw reply

* AW: AW: AW: AW: [PATCH net] net: usb: lan78xx: restore VLAN filter table after device reset
From: Sven Schuchmann @ 2026-06-22 10:07 UTC (permalink / raw)
  To: Nicolai Buchwitz
  Cc: Thangaraj Samynathan, Rengarajan Sundararajan,
	UNGLinuxDriver@microchip.com, Woojung.Huh@microchip.com,
	Andrew Lunn, David S . Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, netdev@vger.kernel.org, linux-usb@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <f76711d2f45c527f9ce0f5d288631bc6@tipi-net.de>

Hello Nicolai,

On 19.6.2026 16:01, Nicolai Buchwitz wrote:
> Hi Sven
> 
> On 19.6.2026 15:31, Sven Schuchmann wrote:
> > Hello Nicolai,
> >
> > looks good from my point of view
> > (Calling the lan78xx_write_vlan_table() from
> > lan78xx_mac_link_up() and from lan78xx_reset()).
> 
> Thanks.

Just to be clear I used this patch which is looking good:

---
 drivers/net/usb/lan78xx.c | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index a5132f2f9..a2db38650 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -1571,6 +1571,7 @@ static void lan78xx_set_multicast(struct net_device *netdev)
 }
 
 static void lan78xx_rx_urb_submit_all(struct lan78xx_net *dev);
+static int lan78xx_write_vlan_table(struct lan78xx_net *dev);
 
 static int lan78xx_mac_reset(struct lan78xx_net *dev)
 {
@@ -2528,6 +2529,10 @@ static void lan78xx_mac_link_up(struct phylink_config *config,
 	if (ret < 0)
 		goto link_up_fail;
 
+	ret = lan78xx_write_vlan_table(dev);
+	if (ret < 0)
+		goto link_up_fail;
+
 	netif_start_queue(net);
 
 	return;
@@ -3081,14 +3086,20 @@ static int lan78xx_set_features(struct net_device *netdev,
 	return lan78xx_write_reg(dev, RFE_CTL, pdata->rfe_ctl);
 }
 
+static int lan78xx_write_vlan_table(struct lan78xx_net *dev)
+{
+	struct lan78xx_priv *pdata = (struct lan78xx_priv *)(dev->data[0]);
+
+	return lan78xx_dataport_write(dev, DP_SEL_RSEL_VLAN_DA_, 0,
+				      DP_SEL_VHF_VLAN_LEN, pdata->vlan_table);
+}
+
 static void lan78xx_deferred_vlan_write(struct work_struct *param)
 {
 	struct lan78xx_priv *pdata =
 			container_of(param, struct lan78xx_priv, set_vlan);
-	struct lan78xx_net *dev = pdata->dev;
 
-	lan78xx_dataport_write(dev, DP_SEL_RSEL_VLAN_DA_, 0,
-			       DP_SEL_VHF_VLAN_LEN, pdata->vlan_table);
+	lan78xx_write_vlan_table(pdata->dev);
 }
 
 static int lan78xx_vlan_rx_add_vid(struct net_device *netdev,
@@ -3378,6 +3389,15 @@ static int lan78xx_reset(struct lan78xx_net *dev)
 
 	lan78xx_set_multicast(dev->net);
 
+	/* The chip reset above also clears the VLAN filter table held in the
+	 * shared VLAN/DA hash RAM. The network stack does not re-add VLANs
+	 * after a silent device reset (e.g. on reset_resume after USB
+	 * autosuspend), so restore the table from our shadow copy here.
+	 */
+	ret = lan78xx_write_vlan_table(dev);
+	if (ret < 0)
+		return ret;
+
 	/* reset PHY */
 	ret = lan78xx_read_reg(dev, PMT_CTL, &buf);
 	if (ret < 0)
-- 

> 
> > But I investigated a little more and it seems the hash table
> > (which is right behind the vlan table in the controllers memory)
> > also gets cleared. I wrote some random data into this table and have
> > seen that it gets also cleared. I think this needs to be fixed too.
> 
> Something like
> 
> static int lan78xx_write_mchash_table(struct lan78xx_net *dev)
> {
>         struct lan78xx_priv *pdata = (struct lan78xx_priv
> *)(dev->data[0]);
> 
>         return lan78xx_dataport_write(dev, DP_SEL_RSEL_VLAN_DA_,
>                                       DP_SEL_VHF_VLAN_LEN,
>                                       DP_SEL_VHF_HASH_LEN,
> pdata->mchash_table); // from lan78xx_deferred_multicast_write)
> }
> 
> with callers in lan78xx_deferred_multicast_write() and
> lan78xx_mac_link_up(), should
> do the trick?

I used this one which is also looking good:
---
 drivers/net/usb/lan78xx.c | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index d449c1950fd3..6d7d349816a6 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -1466,6 +1466,8 @@ static inline u32 lan78xx_hash(char addr[ETH_ALEN])
 	return (ether_crc(ETH_ALEN, addr) >> 23) & 0x1ff;
 }
 
+static int lan78xx_write_mchash_table(struct lan78xx_net *dev);
+
 static void lan78xx_deferred_multicast_write(struct work_struct *param)
 {
 	struct lan78xx_priv *pdata =
@@ -1476,9 +1478,7 @@ static void lan78xx_deferred_multicast_write(struct work_struct *param)
 	netif_dbg(dev, drv, dev->net, "deferred multicast write 0x%08x\n",
 		  pdata->rfe_ctl);
 
-	ret = lan78xx_dataport_write(dev, DP_SEL_RSEL_VLAN_DA_,
-				     DP_SEL_VHF_VLAN_LEN,
-				     DP_SEL_VHF_HASH_LEN, pdata->mchash_table);
+	ret = lan78xx_write_mchash_table(dev);
 	if (ret < 0)
 		goto multicast_write_done;
 
@@ -2533,6 +2533,10 @@ static void lan78xx_mac_link_up(struct phylink_config *config,
 	if (ret < 0)
 		goto link_up_fail;
 
+	ret = lan78xx_write_mchash_table(dev);
+	if (ret < 0)
+		goto link_up_fail;
+
 	netif_start_queue(net);
 
 	return;
@@ -3094,6 +3098,16 @@ static int lan78xx_write_vlan_table(struct lan78xx_net *dev)
 				      DP_SEL_VHF_VLAN_LEN, pdata->vlan_table);
 }
 
+static int lan78xx_write_mchash_table(struct lan78xx_net *dev)
+{
+	struct lan78xx_priv *pdata = (struct lan78xx_priv *)(dev->data[0]);
+
+	return lan78xx_dataport_write(dev, DP_SEL_RSEL_VLAN_DA_,
+				      DP_SEL_VHF_VLAN_LEN,
+				      DP_SEL_VHF_HASH_LEN,
+				      pdata->mchash_table);
+}
+
 static void lan78xx_deferred_vlan_write(struct work_struct *param)
 {
 	struct lan78xx_priv *pdata =
@@ -3398,6 +3412,10 @@ static int lan78xx_reset(struct lan78xx_net *dev)
 	if (ret < 0)
 		return ret;
 
+	ret = lan78xx_write_mchash_table(dev);
+	if (ret < 0)
+		return ret;
+
 	/* reset PHY */
 	ret = lan78xx_read_reg(dev, PMT_CTL, &buf);
 	if (ret < 0)
-- 

> 
> >
> > In the Datasheet from the LAN7801 I can read:
> > "After a reset event, the RFE will automatically initialize the
> > contents of the VHF to 0h."
> > Where VHF also refers to the hash table.
> > But I still do not understand what reset is happening when I just
> > unplug the network cable....
> 
> I suspect it is triggered from the PHY:
> 
> 8.10 (MAC Reset Watchdog Timer):
> "A portion of the MAC operates on clocks generated by the Ethernet PHY
> [...] PHY Reset
> (PHY_RST) results in resetting the portion of the MAC operating on the
> PHY receive and
> transmit clocks."
> 
> So which PHY are you using?

I am using a DP83TC812R from TI. There is currently no driver available
so I ported this one
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/net/phy/dp83tg720.c
which is working fine (maybe I will also publish a patch for this).

The strange thing is that the MAC Reset Watchdog Timer seems 
to occur "silently" so that nor the mac or the phy driver know
about this reset.

But never the less. The two patches fixed my problem and
I think they should be mainline. 

Regards,

   Sven

^ permalink raw reply related

* [PATCH v2 net-next] sctp: use sctp_auth_shkey_release() in error path for consistency
From: Wentao Liang @ 2026-06-22 10:02 UTC (permalink / raw)
  To: Marcelo Ricardo Leitner, Xin Long, David S . Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, linux-sctp, netdev, linux-kernel, Wentao Liang

Use the proper refcount-aware helper sctp_auth_shkey_release() instead
of kfree() when freeing cur_key in the error path of sctp_auth_set_key().
While both are equivalent in the current code, using the helper maintains
abstraction consistency and prevents potential issues if the code is
reordered in the future.

Signed-off-by: Wentao Liang <vulab@iscas.ac.cn>
---
 net/sctp/auth.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index be9782760f50..84708f87392f 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -753,7 +753,7 @@ int sctp_auth_set_key(struct sctp_endpoint *ep,
 	/* Create a new key data based on the info passed in */
 	key = sctp_auth_create_key(auth_key->sca_keylength, GFP_KERNEL);
 	if (!key) {
-		kfree(cur_key);
+		sctp_auth_shkey_release(cur_key);
 		return -ENOMEM;
 	}
 
-- 
2.39.5 (Apple Git-154)


^ permalink raw reply related

* Re: [PATCH 0/2] Add bpf_sock_read_xattr() kfunc to read socket xattrs
From: Christian Brauner @ 2026-06-22 10:02 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Alexei Starovoitov, Daniel Borkmann, Christian Brauner
  Cc: Alexander Viro, Jan Kara, Simon Horman, Kuniyuki Iwashima,
	Willem de Bruijn, linux-fsdevel, netdev, bpf, Andrii Nakryiko,
	Martin KaFai Lau, Eduard Zingerman, Kumar Kartikeya Dwivedi,
	Song Liu, Yonghong Song, Jiri Olsa
In-Reply-To: <20260617-work-bpf-sock-xattr-v1-0-a1276f7c9da3@kernel.org>

On Wed, 17 Jun 2026 13:18:26 +0200, Christian Brauner wrote:
> Add bpf_sock_read_xattr() kfunc to read socket xattrs
> 
> In c8db08110cbe ("Merge tag 'vfs-7.1-rc1.xattr' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs")
> we added support for extended attributes for sockets. This comes in two
> flavors: sockfs and non-sockfs/filesystem sockets. Filesystem sockets
> are actual filesystem objects so reading xattrs must use dedicated fs
> helpers such as bpf_get_dentry_xattr() and bpf_get_file_xattr(). Those
> are inherently sleeping operations. Sockfs sockets on the other hand
> don't need to use sleeping operations as the underlying data structure
> is lockless. In addition, retrieval of sockfs extended attributes often
> happens from LSM hooks that only provide struct socket and it's
> completely nonsensical to grab a reference to a file, then force a
> sleeping operation to retrieve the xattr and drop the reference. We know
> that the sockfs file cannot go away while the LSM hook runs.
> 
> [...]

Applied to the vfs-7.3.kfunc branch of the vfs/vfs.git tree.
Patches in the vfs-7.3.kfunc branch should appear in linux-next soon.

Please report any outstanding bugs that were missed during review in a
new review to the original patch series allowing us to drop it.

It's encouraged to provide Acked-bys and Reviewed-bys even though the
patch has now been applied. If possible patch trailers will be updated.

Note that commit hashes shown below are subject to change due to rebase,
trailer updates or similar. If in doubt, please check the listed branch.

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git
branch: vfs-7.3.kfunc

[1/2] fs: Add bpf_sock_read_xattr() kfunc to read socket xattrs
      https://git.kernel.org/vfs/vfs/c/f80386e3838e
[2/2] selftests/bpf: Add test for bpf_sock_read_xattr() kfunc
      https://git.kernel.org/vfs/vfs/c/99a63a6aff40


^ permalink raw reply

* [PATCH net v2] net: airoha: Add retry mechanism to airoha_qdma_set_trtcm_param()
From: Lorenzo Bianconi @ 2026-06-22  9:35 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Lorenzo Bianconi
  Cc: Leto Liu, linux-arm-kernel, linux-mediatek, netdev, Brown Huang

From: Brown Huang <brown.huang@airoha.com>

CPU accesses QDMA via the bus. When multiple modules are using the bus
simultaneously, CPU access to QDMA may encounter bus timeouts and fails,
resulting in QDMA configuration failures and potentially causing packet
transmission issues. In order to mitigate the issue, introduce a retry
mechanism to airoha_qdma_set_trtcm_param routine in order to ensure the
configuration is correctly applied to the hardware.

Fixes: ef1ca9271313b ("net: airoha: Add sched HTB offload support")
Signed-off-by: Brown Huang <brown.huang@airoha.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
Changes in v2:
- Wait for write configuration to be completed before running
  airoha_qdma_get_trtcm_param() in airoha_qdma_set_trtcm_param().
- Link to v1: https://lore.kernel.org/r/20260608-airoha_qdma_set_trtcm_param-retry-fix-v1-1-f07704f0d8c5@kernel.org
---
 drivers/net/ethernet/airoha/airoha_eth.c | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c
index 3370c3df7c10..bb5c0599a4ee 100644
--- a/drivers/net/ethernet/airoha/airoha_eth.c
+++ b/drivers/net/ethernet/airoha/airoha_eth.c
@@ -2673,14 +2673,30 @@ static int airoha_qdma_set_trtcm_param(struct airoha_qdma *qdma, int channel,
 		     FIELD_PREP(TRTCM_METER_GROUP_MASK, group) |
 		     FIELD_PREP(TRTCM_PARAM_INDEX_MASK, idx) |
 		     FIELD_PREP(TRTCM_PARAM_RATE_TYPE_MASK, mode);
+	int i;
 
-	airoha_qdma_wr(qdma, REG_TRTCM_DATA_LOW(addr), val);
-	airoha_qdma_wr(qdma, REG_TRTCM_CFG_PARAM(addr), config);
+	for (i = 0; i < 100; i++) {
+		u32 data;
 
-	return read_poll_timeout(airoha_qdma_rr, val,
-				 val & TRTCM_PARAM_RW_DONE_MASK,
-				 USEC_PER_MSEC, 10 * USEC_PER_MSEC, true,
-				 qdma, REG_TRTCM_CFG_PARAM(addr));
+		airoha_qdma_wr(qdma, REG_TRTCM_DATA_LOW(addr), val);
+		wmb();
+		airoha_qdma_wr(qdma, REG_TRTCM_CFG_PARAM(addr), config);
+
+		if (read_poll_timeout(airoha_qdma_rr, data,
+				      data & TRTCM_PARAM_RW_DONE_MASK,
+				      USEC_PER_MSEC, 10 * USEC_PER_MSEC,
+				      true, qdma, REG_TRTCM_CFG_PARAM(addr)))
+			return -ETIMEDOUT;
+
+		if (airoha_qdma_get_trtcm_param(qdma, channel, addr, param,
+						mode, &data, NULL))
+			continue;
+
+		if (data == val)
+			return 0;
+	}
+
+	return -EBUSY;
 }
 
 static int airoha_qdma_set_trtcm_config(struct airoha_qdma *qdma, int channel,

---
base-commit: d07d80b6a129a44538cda1549b7acf95154fb197
change-id: 20260605-airoha_qdma_set_trtcm_param-retry-fix-a9d2956b9b2f

Best regards,
-- 
Lorenzo Bianconi <lorenzo@kernel.org>


^ permalink raw reply related

* Re: [PATCH net v2] net: wwan: iosm: bound device offsets in the MUX downlink decoder
From: Loic Poulain @ 2026-06-22  9:24 UTC (permalink / raw)
  To: Maoyi Xie
  Cc: Sergey Ryazanov, Johannes Berg, Andrew Lunn, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, netdev, linux-kernel,
	stable
In-Reply-To: <178196118045.462404.11069139160448641355@maoyixie.com>

On Sat, Jun 20, 2026 at 3:13 PM Maoyi Xie <maoyixie.tju@gmail.com> wrote:
>
> mux_dl_adb_decode() walks a chain of aggregated datagram tables using
> offsets and lengths taken from the modem. first_table_index,
> next_table_index, table_length, datagram_index and datagram_length are
> all device supplied le values. Only first_table_index was checked, and
> only for being non zero. The decoder then formed adth = block +
> adth_index and read the table header and the datagram entries with no
> bound against the received skb. A modem that reports an index or a
> length past the downlink buffer makes the decoder read out of bounds.
>
> The buffer is IPC_MEM_MAX_DL_MUX_LITE_BUF_SIZE and skb->len is at most
> that, so skb->len is the real limit, but none of these in band offsets
> were checked against it.
>
> Validate every device offset and length against skb->len before use.
> The block header must fit. Each table header, on entry and after every
> next_table_index, must lie inside the skb. The datagram table must fit.
> Each datagram index and length must stay inside the skb. The header
> padding must not exceed the datagram length so the receive length does
> not wrap.
>
> This was reproduced under KASAN as a slab out of bounds read on a normal
> downlink receive once the iosm net device is up.
>
> Fixes: 1f52d7b62285 ("net: wwan: iosm: Enable M.2 7360 WWAN card support")
> Suggested-by: Loic Poulain <loic.poulain@oss.qualcomm.com>
> Cc: stable@vger.kernel.org
> Signed-off-by: Maoyi Xie <maoyixie.tju@gmail.com>

Reviewed-by: Loic Poulain <loic.poulain@oss.qualcomm.com>


> ---
> Changes in v2:
> - mux_dl_process_dg now uses intermediate native endian locals dg_index
>   and dg_len so the bound checks read cleaner and avoid the repeated
>   le32_to_cpu conversions, per Loic Poulain's review. No functional
>   change.
>
> Link to v1: https://lore.kernel.org/all/178185979029.4044562.9993615975949055530@maoyixie.com/
>
>  drivers/net/wwan/iosm/iosm_ipc_mux_codec.c | 33 ++++++++++++++++------
>  1 file changed, 24 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c b/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c
> index bff46f7ca59f..ff9a4bc52f29 100644
> --- a/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c
> +++ b/drivers/net/wwan/iosm/iosm_ipc_mux_codec.c
> @@ -553,19 +553,21 @@ static int mux_dl_process_dg(struct iosm_mux *ipc_mux, struct mux_adbh *adbh,
>         u32 packet_offset, i, rc, dg_len;
>
>         for (i = 0; i < nr_of_dg; i++, dg++) {
> -               if (le32_to_cpu(dg->datagram_index)
> -                               < sizeof(struct mux_adbh))
> +               u32 dg_index = le32_to_cpu(dg->datagram_index);
> +
> +               dg_len = le16_to_cpu(dg->datagram_length);
> +
> +               if (dg_index < sizeof(struct mux_adbh))
>                         goto dg_error;
>
> -               /* Is the packet inside of the ADB */
> -               if (le32_to_cpu(dg->datagram_index) >=
> -                                       le32_to_cpu(adbh->block_length)) {
> +               /* Is the packet inside of the ADB and the received skb ? */
> +               if (dg_index >= le32_to_cpu(adbh->block_length) ||
> +                   dg_index >= skb->len ||
> +                   dg_len > skb->len - dg_index ||
> +                   dl_head_pad_len >= dg_len) {
>                         goto dg_error;
>                 } else {
> -                       packet_offset =
> -                               le32_to_cpu(dg->datagram_index) +
> -                               dl_head_pad_len;
> -                       dg_len = le16_to_cpu(dg->datagram_length);
> +                       packet_offset = dg_index + dl_head_pad_len;
>                         /* Pass the packet to the netif layer. */
>                         rc = ipc_mux_net_receive(ipc_mux, if_id, ipc_mux->wwan,
>                                                  packet_offset,
> @@ -595,6 +597,10 @@ static void mux_dl_adb_decode(struct iosm_mux *ipc_mux,
>         block = skb->data;
>         adbh = (struct mux_adbh *)block;
>
> +       /* The block header itself must fit in the received skb. */
> +       if (skb->len < sizeof(struct mux_adbh))
> +               goto adb_decode_err;
> +
>         /* Process the aggregated datagram tables. */
>         adth_index = le32_to_cpu(adbh->first_table_index);
>
> @@ -606,6 +612,11 @@ static void mux_dl_adb_decode(struct iosm_mux *ipc_mux,
>
>         /* Loop through mixed session tables. */
>         while (adth_index) {
> +               /* The table header must lie within the received skb. */
> +               if (adth_index < sizeof(struct mux_adbh) ||
> +                   adth_index > skb->len - sizeof(struct mux_adth))
> +                       goto adb_decode_err;
> +
>                 /* Get the reference to the table header. */
>                 adth = (struct mux_adth *)(block + adth_index);
>
> @@ -629,6 +640,10 @@ static void mux_dl_adb_decode(struct iosm_mux *ipc_mux,
>                 if (le16_to_cpu(adth->table_length) < sizeof(struct mux_adth))
>                         goto adb_decode_err;
>
> +               /* The whole datagram table must fit in the received skb. */
> +               if (le16_to_cpu(adth->table_length) > skb->len - adth_index)
> +                       goto adb_decode_err;
> +
>                 /* Calculate the number of datagrams. */
>                 nr_of_dg = (le16_to_cpu(adth->table_length) -
>                                         sizeof(struct mux_adth)) /
> --
> 2.34.1
>

^ permalink raw reply

* [PATCH net-next 6/6] selftests: net: add kselftest for IEEE 802.1CB FRER tc action
From: Xiaoliang Yang @ 2026-06-22  9:21 UTC (permalink / raw)
  To: netdev, linux-kernel, linux-kselftest
  Cc: davem, edumazet, kuba, pabeni, jhs, jiri, horms, shuah,
	vladimir.oltean, vinicius.gomes, fejes, xiaoliang.yang_1
In-Reply-To: <20260622092118.6846-1-xiaoliang.yang_1@nxp.com>

Add frer_test.sh, a TAP-format kselftest script covering the FRER
(IEEE 802.1CB Frame Replication and Elimination for Reliability)
tc action (act_frer).

Tests 1-4 use a bond-based two-namespace topology:

  ns_talker
  +---------------------------+
  | bond0 (IP_SRC, balance-rr)|
  |   slave: veth_a0 (frer push + mirror to veth_b0)|
  |   slave: veth_b0 (frer push + mirror to veth_a0)|
  +-------+---------------+--+
          |               |
     veth_a0         veth_b0
          |               |
     veth_a1         veth_b1
          |               |
  +-------+---------------+--+
  | bond1 (IP_DST, balance-rr)|
  |   slave: veth_a1 (frer recover ingress)          |
  |   slave: veth_b1 (frer recover ingress)          |
  +---------------------------+
  ns_listener

  IP_SRC is assigned to bond0; IP_DST is assigned to bond1.  FRER push
  is configured on both veth_a0 and veth_b0 egress with cross-mirroring
  so every frame sent by either bond slave carries an R-TAG and a
  mirrored copy reaches the peer slave.  Tests 1-4 exercise shared and
  individual recover modes on the listener side.

Test 5 uses a self-contained single-path (no bond) topology:

  ns_p2p_src                        ns_p2p_dst
  +----------------------+          +----------------------+
  | frer_p2p_a0 (IP_P2P_SRC)| <---> | frer_p2p_a1 (IP_P2P_DST)|
  | egress: frer push     |          | ingress: frer recover |
  +----------------------+          +----------------------+

Test 6 uses a four-namespace relay topology:

  ns_talker -- bridge0 (br_r0) -+- path A -+- bridge1 (br_r1) -- ns_listener
                                 \- path B -/

  bridge0 acts as sequence generator (frer push + replicate to both
  redundant paths); bridge1 acts as eliminator (frer shared recover with
  tag-pop on both ingress ports).

Six functional test cases are included:

  1. push verify              - confirm that the frer push action inserts
                                an R-TAG (EtherType 0xF1C1) on egress;
                                tcpdump on both veth_a1 and veth_b1 must
                                capture at least one R-TAG frame each.

  2. shared recover e2e       - veth_a1 and veth_b1 share one recover
                                action; the action passes exactly one copy
                                and discards the duplicate; verified via
                                ping success, tcpdump frame count on bond1,
                                and tc stats (passed >= PING_COUNT,
                                discarded >= PING_COUNT).

  3. individual recover       - veth_a1 and veth_b1 use independent recover
                                actions so both copies are passed without
                                cross-port deduplication; verified via
                                per-slave tcpdump and tc stats
                                (discarded = 0 on each port).

  4. no tag-pop               - shared recover without tag-pop leaves the
                                R-TAG on passed frames; verified by
                                capturing EtherType 0xF1C1 (expect >= 1)
                                and plain ICMP (expect 0) on bond1.

  5. simple point-to-point    - single-path push + individual recover (with
                                tag-pop) end-to-end ping test; no bond.

  6. relay e2e                - four-namespace bridge relay topology; bridge0
                                pushes R-TAG and replicates to two paths;
                                bridge1 recovers (shared, tag-pop) and
                                forwards deduplicated frames to listener;
                                verified via ping success, tcpdump frame
                                count on listener, and bridge1 tc stats.

The script conforms to the kselftest framework (TAP output, KSFT_PASS /
KSFT_FAIL / KSFT_SKIP exit codes).  It loads kselftest/lib.sh when
available and falls back to a minimal inline implementation otherwise.
All tests are skipped gracefully when act_frer is not available in the
running kernel.

Signed-off-by: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
---
 tools/testing/selftests/net/Makefile     |    1 +
 tools/testing/selftests/net/frer_test.sh | 1013 ++++++++++++++++++++++
 2 files changed, 1014 insertions(+)
 create mode 100755 tools/testing/selftests/net/frer_test.sh

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 6a190a525a39..67b896611f08 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -38,6 +38,7 @@ TEST_PROGS := \
 	fib_rule_tests.sh \
 	fib_tests.sh \
 	fin_ack_lat.sh \
+	frer_test.sh \
 	fq_band_pktlimit.sh \
 	gre_gso.sh \
 	gre_ipv6_lladdr.sh \
diff --git a/tools/testing/selftests/net/frer_test.sh b/tools/testing/selftests/net/frer_test.sh
new file mode 100755
index 000000000000..ecd88952f495
--- /dev/null
+++ b/tools/testing/selftests/net/frer_test.sh
@@ -0,0 +1,1013 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright 2026 NXP
+#
+# frer_test.sh - IEEE 802.1CB FRER tc action kselftest
+#
+# Topology for tests 1-4:
+#
+#   ns_talker  bond0 (veth_a0 + veth_b0)  <--->  bond1 (veth_a1 + veth_b1)  ns_listener
+#
+#   IP_SRC assigned to bond0;  IP_DST assigned to bond1
+#
+#   bond mode: balance-rr (round-robin), so frames are distributed across
+#              both slaves.  FRER push is configured on both veth_a0 and
+#              veth_b0 egress with cross-mirror so every frame sent by either
+#              slave carries an R-TAG and a mirrored copy reaches the peer.
+#   FRER recover: veth_a1/veth_b1 ingress, shared or individual recover per test
+#
+#   Ping runs from bond0 to bond1; tcpdump captures on bond1 (or on individual
+#   slave interfaces for tests where both copies must be observable).
+#
+# Test 5: simple point-to-point, self-contained topology (no bond).
+# Test 6: relay system, self-contained topology.
+#
+# All namespaces, veth pairs, bond interfaces, tc rules and addresses are
+# created and destroyed within this script.  External dependencies:
+#   - kernel with CONFIG_NET_ACT_FRER and CONFIG_BONDING
+#   - iproute2 tc with frer action support
+#   - tcpdump, ping
+#   - root privileges
+
+# ----------------------------------------------------------------------------
+# kselftest library: TAP output + exit-code constants
+# ----------------------------------------------------------------------------
+ksft_lib="${KSFT_LIB:-$(dirname "$0")/../kselftest/lib.sh}"
+if [ -f "$ksft_lib" ]; then
+	# shellcheck source=/dev/null
+	. "$ksft_lib"
+else
+	# Minimal fallback when run outside the kselftest tree
+	KSFT_PASS=0
+	KSFT_FAIL=1
+	KSFT_SKIP=4
+	_ksft_count=0
+	_ksft_pass=0
+	_ksft_fail=0
+	_ksft_skip=0
+
+	ksft_print_header() { echo "TAP version 13"; }
+	ksft_set_plan()     { echo "1..$1"; }
+	ksft_test_result_pass() {
+		_ksft_count=$((_ksft_count + 1)); _ksft_pass=$((_ksft_pass + 1))
+		echo "ok $_ksft_count - $*"
+	}
+	ksft_test_result_fail() {
+		_ksft_count=$((_ksft_count + 1)); _ksft_fail=$((_ksft_fail + 1))
+		echo "not ok $_ksft_count - $*"
+	}
+	ksft_test_result_skip() {
+		_ksft_count=$((_ksft_count + 1)); _ksft_skip=$((_ksft_skip + 1))
+		echo "ok $_ksft_count - $* # SKIP"
+	}
+	ksft_print_cnts() {
+		echo "# Totals: pass=$_ksft_pass fail=$_ksft_fail skip=$_ksft_skip"
+	}
+	ksft_exit_pass()     { exit $KSFT_PASS; }
+	ksft_exit_fail()     { exit $KSFT_FAIL; }
+	ksft_exit_fail_msg() { echo "# FATAL: $*" >&2; exit $KSFT_FAIL; }
+fi
+
+# ----------------------------------------------------------------------------
+# Configuration (override via environment)
+# ----------------------------------------------------------------------------
+TC="${TC:-tc}"
+PING="${PING:-ping}"
+TCPDUMP="${TCPDUMP:-tcpdump}"
+PING_COUNT="${PING_COUNT:-5}"
+PING_TIMEOUT="${PING_TIMEOUT:-2}"
+SKIP_MODPROBE="${SKIP_MODPROBE:-0}"
+
+# Bond topology interfaces (tests 1-4)
+readonly VETH_A0="frer_a0"
+readonly VETH_A1="frer_a1"
+readonly VETH_B0="frer_b0"
+readonly VETH_B1="frer_b1"
+readonly BOND0="frer_bond0"
+readonly BOND1="frer_bond1"
+
+readonly NS_TALKER="frer_ns_talker"
+readonly NS_LISTENER="frer_ns_listener"
+
+readonly IP_SRC="10.0.0.1"
+readonly IP_DST="10.0.0.2"
+
+# Point-to-point topology interfaces (test 5)
+readonly P2P_NS_SRC="frer_p2p_src"
+readonly P2P_NS_DST="frer_p2p_dst"
+readonly P2P_VETH_A0="frer_p2p_a0"
+readonly P2P_VETH_A1="frer_p2p_a1"
+readonly IP_P2P_SRC="10.0.1.1"
+readonly IP_P2P_DST="10.0.1.2"
+
+# Relay topology interfaces (test 6)
+#
+#   ns_talker (talker_eth.100) -- talker_eth/br0_uplink -- bridge0 (br_r0)
+#                                         |-- br0_swp0/br1_swp0 --\
+#                                         \-- br0_swp1/br1_swp1 --+--\
+#		bridge1 (br_r1) -- br1_downlink/listener_eth -- ns_listener
+#
+# bridge0 acts as sequence generator (frer push + replicate to both paths).
+# bridge1 acts as eliminator (frer recover, shared, tag-pop).
+readonly R_NS_TALKER="frer_r_talker"
+readonly R_NS_BRIDGE0="frer_r_bridge0"
+readonly R_NS_BRIDGE1="frer_r_bridge1"
+readonly R_NS_LISTENER="frer_r_listener"
+readonly R_TALKER_ETH="r_tlk_eth"       # talker-side physical port
+readonly R_BR0_UPLINK="r_br0_uplink"    # bridge0 uplink facing talker
+readonly R_BR0_SWP0="r_br0_swp0"        # bridge0 redundant path port 0
+readonly R_BR0_SWP1="r_br0_swp1"        # bridge0 redundant path port 1
+readonly R_BR1_SWP0="r_br1_swp0"        # bridge1 redundant path port 0
+readonly R_BR1_SWP1="r_br1_swp1"        # bridge1 redundant path port 1
+readonly R_BR1_DOWNLINK="r_br1_dwnlnk"  # bridge1 downlink facing listener
+readonly R_LISTENER_ETH="r_lst_eth"     # listener-side physical port
+readonly R_BR0="br_r0"
+readonly R_BR1="br_r1"
+readonly R_VLAN=100
+readonly R_IP_TALKER="10.1.0.1"
+readonly R_IP_LISTENER="10.1.0.2"
+
+# FRER action index constants
+readonly IDX_PUSH=1
+readonly IDX_SHARED_RCVY=10
+readonly IDX_INDV_RCVY_A=20
+readonly IDX_INDV_RCVY_B=21
+readonly IDX_NO_POP=30
+readonly IDX_P2P_RCVY=40
+readonly IDX_RELAY_PUSH=50
+readonly IDX_RELAY_RCVY=60
+
+readonly NUM_TESTS=6
+
+# ----------------------------------------------------------------------------
+# Prerequisite check
+# ----------------------------------------------------------------------------
+check_prerequisites()
+{
+	local missing=0
+
+	[ "$(id -u)" -eq 0 ] || { echo "# Must be run as root" >&2; missing=1; }
+
+	for cmd in ip "$TC" "$TCPDUMP" "$PING"; do
+		command -v "$cmd" >/dev/null 2>&1 || {
+			echo "# Missing command: $cmd" >&2
+			missing=1
+		}
+	done
+
+	if [ "$missing" -ne 0 ]; then
+		ksft_set_plan "$NUM_TESTS"
+		for i in $(seq 1 "$NUM_TESTS"); do
+			ksft_test_result_skip "prerequisites not met (test $i)"
+		done
+		ksft_print_cnts
+		exit "$KSFT_SKIP"
+	fi
+}
+
+load_module()
+{
+	[ "$SKIP_MODPROBE" = "1" ] && return
+	if ! modprobe act_frer 2>/dev/null; then
+		echo "# modprobe act_frer failed - may be built-in or unavailable" >&2
+	fi
+	if ! modprobe bonding 2>/dev/null; then
+		echo "# modprobe bonding failed - may be built-in or unavailable" >&2
+	fi
+}
+
+check_frer_action()
+{
+	ip netns exec "$NS_TALKER" \
+		$TC actions add action frer push index 999 2>/dev/null || return 1
+	ip netns exec "$NS_TALKER" \
+		$TC actions del action frer index 999 2>/dev/null || true
+	return 0
+}
+
+# ----------------------------------------------------------------------------
+# Bond topology setup / teardown (used by tests 1-4)
+# ----------------------------------------------------------------------------
+setup_topology()
+{
+	for n in "$NS_TALKER" "$NS_LISTENER"; do
+		ip netns add "$n"
+	done
+
+	ip link add "$VETH_A0" type veth peer name "$VETH_A1"
+	ip link set "$VETH_A0" netns "$NS_TALKER"
+	ip link set "$VETH_A1" netns "$NS_LISTENER"
+
+	ip link add "$VETH_B0" type veth peer name "$VETH_B1"
+	ip link set "$VETH_B0" netns "$NS_TALKER"
+	ip link set "$VETH_B1" netns "$NS_LISTENER"
+
+	# ns_talker: create bond0 (balance-rr), frames round-robin across both slaves.
+	ip netns exec "$NS_TALKER" ip link set lo up
+	ip netns exec "$NS_TALKER" ip link add "$BOND0" type bond mode balance-rr miimon 100
+	ip netns exec "$NS_TALKER" ip link set "$VETH_A0" master "$BOND0"
+	ip netns exec "$NS_TALKER" ip link set "$VETH_B0" master "$BOND0"
+	ip netns exec "$NS_TALKER" ip link set "$VETH_A0" up
+	ip netns exec "$NS_TALKER" ip link set "$VETH_B0" up
+	ip netns exec "$NS_TALKER" ip link set "$BOND0" up
+	ip netns exec "$NS_TALKER" ip addr add "${IP_SRC}/24" dev "$BOND0"
+
+	# ns_listener: create bond1 (balance-rr).
+	ip netns exec "$NS_LISTENER" ip link set lo up
+	ip netns exec "$NS_LISTENER" ip link add "$BOND1" type bond mode balance-rr miimon 100
+	ip netns exec "$NS_LISTENER" ip link set "$VETH_A1" master "$BOND1"
+	ip netns exec "$NS_LISTENER" ip link set "$VETH_B1" master "$BOND1"
+	ip netns exec "$NS_LISTENER" ip link set "$VETH_A1" up
+	ip netns exec "$NS_LISTENER" ip link set "$VETH_B1" up
+	ip netns exec "$NS_LISTENER" ip link set "$BOND1" up
+	ip netns exec "$NS_LISTENER" ip addr add "${IP_DST}/24" dev "$BOND1"
+
+	# Static ARP so L2 forwarding works without ARP broadcasts.
+	# With balance-rr both slaves share the bond MAC.
+	local mac_bond0 mac_bond1
+	mac_bond0=$(ip netns exec "$NS_TALKER"   cat /sys/class/net/"$BOND0"/address)
+	mac_bond1=$(ip netns exec "$NS_LISTENER" cat /sys/class/net/"$BOND1"/address)
+	ip netns exec "$NS_TALKER"   ip neigh add "$IP_DST" lladdr "$mac_bond1" dev "$BOND0"
+	ip netns exec "$NS_LISTENER" ip neigh add "$IP_SRC" lladdr "$mac_bond0" dev "$BOND1"
+}
+
+cleanup()
+{
+	for n in "$NS_TALKER" "$NS_LISTENER" \
+		"$P2P_NS_SRC" "$P2P_NS_DST" \
+		"$R_NS_TALKER" "$R_NS_BRIDGE0" "$R_NS_BRIDGE1" "$R_NS_LISTENER"; do
+		ip netns del "$n" 2>/dev/null || true
+	done
+}
+trap cleanup EXIT
+
+# ----------------------------------------------------------------------------
+# TC rule helpers
+# ----------------------------------------------------------------------------
+
+# Push on both veth_a0 and veth_b0 egress using the same shared frer push
+# action (IDX_PUSH).  Each slave also mirrors to the other so that every
+# outgoing frame is replicated onto both paths regardless of which slave the
+# bond currently selects.  This prevents packet loss during bond link changes.
+setup_push_mirror()
+{
+	ip netns exec "$NS_TALKER" $TC qdisc add dev "$VETH_A0" clsact
+	ip netns exec "$NS_TALKER" $TC filter add dev "$VETH_A0" egress \
+		protocol ip flower skip_hw \
+		action frer push index $IDX_PUSH \
+		action mirred egress mirror dev "$VETH_B0"
+
+	ip netns exec "$NS_TALKER" $TC qdisc add dev "$VETH_B0" clsact
+	ip netns exec "$NS_TALKER" $TC filter add dev "$VETH_B0" egress \
+		protocol ip flower skip_hw \
+		action frer push index $IDX_PUSH \
+		action mirred egress mirror dev "$VETH_A0"
+}
+
+teardown_tc()
+{
+	for dev in "$VETH_A0" "$VETH_B0"; do
+		ip netns exec "$NS_TALKER" $TC qdisc del dev "$dev" clsact \
+			2>/dev/null || true
+	done
+	for dev in "$VETH_A1" "$VETH_B1"; do
+		ip netns exec "$NS_LISTENER" $TC qdisc del dev "$dev" clsact \
+			2>/dev/null || true
+	done
+	ip netns exec "$NS_TALKER"   $TC actions flush action frer 2>/dev/null || true
+	ip netns exec "$NS_LISTENER" $TC actions flush action frer 2>/dev/null || true
+}
+
+# ----------------------------------------------------------------------------
+# Packet-capture helpers
+#
+# capture_start_on NS IFACE PCAP [BPF_FILTER]
+#   Starts tcpdump in namespace NS on IFACE, writing to PCAP.
+#   Stores PID in _CAP_PID.
+#
+# capture_stop
+#   Waits for tcpdump (stored in _CAP_PID) to finish.
+#
+# capture_count_on NS PCAP
+#   Prints the number of captured packets.
+#
+# Convenience wrappers capture_start / capture_count target bond1 in
+# NS_LISTENER (the primary observation point for tests 2 and 4).
+# ----------------------------------------------------------------------------
+_CAP_PID=""
+
+capture_start_on()
+{
+	local ns="$1" iface="$2" pcap="$3" filter="${4:-}"
+
+	if [ -n "$filter" ]; then
+		ip netns exec "$ns" timeout 4 \
+			$TCPDUMP -i "$iface" -w "$pcap" \
+			--immediate-mode -Z root -y EN10MB \
+			$filter >/dev/null 2>&1 &
+	else
+		ip netns exec "$ns" timeout 4 \
+			$TCPDUMP -i "$iface" -w "$pcap" \
+			--immediate-mode -Z root -y EN10MB \
+			>/dev/null 2>&1 &
+	fi
+	_CAP_PID=$!
+
+	# Wait until tcpdump opens a packet socket (max ~2.5 s).
+	local tries=0
+	while [ $tries -lt 50 ]; do
+		ip netns exec "$ns" grep -q "$iface" /proc/net/packet 2>/dev/null && break
+		sleep 0.05
+		tries=$((tries + 1))
+	done
+}
+
+capture_stop()
+{
+	[ -n "$_CAP_PID" ] || return 0
+	wait "$_CAP_PID" 2>/dev/null || true
+	_CAP_PID=""
+}
+
+capture_count_on()
+{
+	local ns="$1" pcap="$2"
+	ip netns exec "$ns" \
+		$TCPDUMP -r "$pcap" --no-promiscuous-mode 2>/dev/null \
+		| grep -c "^[0-9]" || true
+}
+
+# Convenience wrappers: default to bond1 in NS_LISTENER
+capture_start() { capture_start_on "$NS_LISTENER" "$BOND1" "$@"; }
+capture_count() { capture_count_on "$NS_LISTENER" "$1"; }
+
+# ----------------------------------------------------------------------------
+# Ping helper
+# ----------------------------------------------------------------------------
+do_ping()
+{
+	local rc=0
+	ip netns exec "$NS_TALKER" \
+		$PING -c "$PING_COUNT" -W "$PING_TIMEOUT" -i 0.2 -q \
+		"$IP_DST" >/dev/null 2>&1 || rc=$?
+	return $rc
+}
+
+# ----------------------------------------------------------------------------
+# tc statistics parser
+# ----------------------------------------------------------------------------
+tc_stat()
+{
+	local dump="$1" field="$2"
+	echo "$dump" | awk -F"${field}=" 'NF>1{split($2,a," ");print a[1];exit}' || echo "0"
+}
+
+# ----------------------------------------------------------------------------
+# TEST 1: PUSH VERIFY (bond topology)
+#
+# Only push is configured on the talker side; no recover on the listener.
+# The push action on veth_a0 egress inserts an R-TAG and mirrors a copy to
+# veth_b0, so both listener slaves (veth_a1 and veth_b1) receive a frame
+# with EtherType 0xF1C1.  Captures run sequentially on each slave to verify
+# that both paths carry R-TAG frames.
+#
+# Pass criteria:
+#   - veth_a1 captures >= 1 R-TAG frame
+#   - veth_b1 captures >= 1 R-TAG frame
+# ----------------------------------------------------------------------------
+test_push_verify_bond()
+{
+	local pcap_a pcap_b cap_a cap_b
+	local result="pass"
+
+	setup_push_mirror
+
+	# Capture 1: R-TAG frames on veth_a1 (path A)
+	pcap_a=$(mktemp /tmp/frer_bond_push_a_XXXXXX.pcap)
+	capture_start_on "$NS_LISTENER" "$VETH_A1" "$pcap_a" "ether proto 0xf1c1"
+	ip netns exec "$NS_TALKER" \
+		$PING -c 3 -W 1 -i 0.2 -q "$IP_DST" >/dev/null 2>&1 || true
+	capture_stop
+	cap_a=$(capture_count_on "$NS_LISTENER" "$pcap_a")
+	rm -f "$pcap_a"
+
+	# Capture 2: R-TAG frames on veth_b1 (path B, mirrored copy)
+	pcap_b=$(mktemp /tmp/frer_bond_push_b_XXXXXX.pcap)
+	capture_start_on "$NS_LISTENER" "$VETH_B1" "$pcap_b" "ether proto 0xf1c1"
+	ip netns exec "$NS_TALKER" \
+		$PING -c 3 -W 1 -i 0.2 -q "$IP_DST" >/dev/null 2>&1 || true
+	capture_stop
+	cap_b=$(capture_count_on "$NS_LISTENER" "$pcap_b")
+	rm -f "$pcap_b"
+
+	teardown_tc
+
+	echo "# bond push verify: veth_a1 R-TAG=$cap_a veth_b1 R-TAG=$cap_b"
+
+	[ "$cap_a" -ge 1 ] || result="fail"
+	[ "$cap_b" -ge 1 ] || result="fail"
+
+	if [ "$result" = "pass" ]; then
+		ksft_test_result_pass \
+			"bond push verify: R-TAG on both paths (a1=$cap_a b1=$cap_b)"
+	else
+		ksft_test_result_fail \
+			"bond push verify: expected R-TAG on both paths (a1=$cap_a b1=$cap_b)"
+	fi
+}
+
+# ----------------------------------------------------------------------------
+# TEST 2: SHARED RECOVER E2E (bond topology)
+#
+# veth_a1 and veth_b1 ingress share one recover action (idx=10) with tag-pop.
+# The listener receives two R-TAG copies per request; the shared recover passes
+# exactly one and discards the other.  The recovered plain ICMP reaches bond1's
+# IP stack and a reply is sent, making ping succeed.
+#
+# Pass criteria:
+#   - ping succeeds (rc=0)
+#   - tcpdump on bond1 captures exactly PING_COUNT ICMP echo-request frames
+#     (filter is restricted to type=8 to exclude echo replies, which would
+#     double the count since bond1 also originates the reply packets)
+#   - tc stats on veth_a1: passed >= PING_COUNT, discarded >= PING_COUNT
+# ----------------------------------------------------------------------------
+test_shared_recover_bond()
+{
+	local pcap cap_count ping_rc=0
+	local dump_a
+	local total_passed total_discarded tagless
+	local result="pass"
+
+	setup_push_mirror
+
+	# veth_a1 ingress: create shared recover action with tag-pop
+	ip netns exec "$NS_LISTENER" $TC qdisc add dev "$VETH_A1" clsact
+	ip netns exec "$NS_LISTENER" $TC filter add dev "$VETH_A1" ingress \
+		protocol all flower skip_hw \
+		action frer recover alg vector history-length 16 \
+			reset-time 2000 tag-pop index $IDX_SHARED_RCVY
+
+	# veth_b1 ingress: bind to the same shared action by index
+	ip netns exec "$NS_LISTENER" $TC qdisc add dev "$VETH_B1" clsact
+	ip netns exec "$NS_LISTENER" $TC filter add dev "$VETH_B1" ingress \
+		protocol all flower skip_hw \
+		action frer recover index $IDX_SHARED_RCVY
+
+	pcap=$(mktemp /tmp/frer_bond_shared_XXXXXX.pcap)
+	capture_start "$pcap" "icmp[icmptype] == icmp-echo"
+
+	do_ping || ping_rc=$?
+
+	capture_stop
+
+	cap_count=$(capture_count "$pcap")
+	rm -f "$pcap"
+
+	dump_a=$(ip netns exec "$NS_LISTENER" \
+		$TC -s filter show dev "$VETH_A1" ingress 2>/dev/null)
+
+	teardown_tc
+
+	total_passed=$(tc_stat    "$dump_a" "passed")
+	total_discarded=$(tc_stat "$dump_a" "discarded")
+	tagless=$(tc_stat         "$dump_a" "tagless")
+	total_discarded=$((total_discarded - tagless))
+
+	echo "# bond shared recover: ping_rc=$ping_rc cap=$cap_count" \
+		"passed=$total_passed discarded=$total_discarded"
+
+	[ "$ping_rc"         -eq 0 ]            || result="fail"
+	[ "$cap_count"       -eq "$PING_COUNT" ] || result="fail"
+	[ "$total_passed"    -ge "$PING_COUNT" ] || result="fail"
+	[ "$total_discarded" -ge "$PING_COUNT" ] || result="fail"
+
+	if [ "$result" = "pass" ]; then
+		ksft_test_result_pass \
+			"bond shared recover: ping OK, cap=$cap_count" \
+			"passed=$total_passed discarded=$total_discarded"
+	else
+		ksft_test_result_fail \
+			"bond shared recover: ping_rc=$ping_rc cap=$cap_count" \
+			"passed=$total_passed discarded=$total_discarded" \
+			"(expected ping OK, cap=$PING_COUNT," \
+			"passed>=$PING_COUNT, discarded>=$PING_COUNT)"
+	fi
+}
+
+# ----------------------------------------------------------------------------
+# TEST 3: INDIVIDUAL RECOVER (bond topology)
+#
+# veth_a1 and veth_b1 use independent recover actions (idx=20 and idx=21).
+# Each port maintains its own sequence history so both copies of every frame
+# are passed (no cross-port deduplication).  With active-backup bond1, only
+# the active slave's (veth_a1) recovered frame reaches bond1's IP stack, so
+# ping succeeds.  The absence of deduplication is verified via per-slave
+# tcpdump (each slave should capture PING_COUNT ICMP frames) and tc stats.
+#
+# Pass criteria:
+#   - ping succeeds
+#   - veth_a1 captures PING_COUNT ICMP frames (passed, not discarded)
+#   - veth_b1 captures PING_COUNT ICMP frames (passed independently)
+#   - tc stats: veth_a1 passed=PING_COUNT discarded=0
+#               veth_b1 passed=PING_COUNT discarded=0
+# ----------------------------------------------------------------------------
+test_individual_recover_bond()
+{
+	local pcap_a pcap_b cap_a cap_b ping_rc=0
+	local dump_a dump_b
+	local passed_a discarded_a passed_b discarded_b tagless_a tagless_b
+	local result="pass"
+
+	setup_push_mirror
+
+	# veth_a1 ingress: individual recover idx=20 (independent state)
+	ip netns exec "$NS_LISTENER" $TC qdisc add dev "$VETH_A1" clsact
+	ip netns exec "$NS_LISTENER" $TC filter add dev "$VETH_A1" ingress \
+		protocol all flower skip_hw \
+		action frer recover individual alg vector history-length 16 \
+			reset-time 2000 tag-pop index $IDX_INDV_RCVY_A
+
+	# veth_b1 ingress: individual recover idx=21 (separate independent state)
+	ip netns exec "$NS_LISTENER" $TC qdisc add dev "$VETH_B1" clsact
+	ip netns exec "$NS_LISTENER" $TC filter add dev "$VETH_B1" ingress \
+		protocol all flower skip_hw \
+		action frer recover individual alg vector history-length 16 \
+			reset-time 2000 tag-pop index $IDX_INDV_RCVY_B
+
+	# Per-slave capture A: verify veth_a1 passes frames; also use this run
+	# for the overall ping_rc check (do_ping targets bond0->bond1).
+	pcap_a=$(mktemp /tmp/frer_bond_indv_a_XXXXXX.pcap)
+	capture_start_on "$NS_LISTENER" "$VETH_A1" "$pcap_a" "icmp"
+	do_ping || ping_rc=$?
+	capture_stop
+	cap_a=$(capture_count_on "$NS_LISTENER" "$pcap_a")
+	rm -f "$pcap_a"
+
+	# Per-slave capture B: verify veth_b1 also passes frames (balance-rr
+	# distributes egress across both slaves, so both paths carry traffic).
+	pcap_b=$(mktemp /tmp/frer_bond_indv_b_XXXXXX.pcap)
+	capture_start_on "$NS_LISTENER" "$VETH_B1" "$pcap_b" "icmp"
+	do_ping || true
+	capture_stop
+	cap_b=$(capture_count_on "$NS_LISTENER" "$pcap_b")
+	rm -f "$pcap_b"
+
+	dump_a=$(ip netns exec "$NS_LISTENER" \
+		$TC -s filter show dev "$VETH_A1" ingress 2>/dev/null)
+	dump_b=$(ip netns exec "$NS_LISTENER" \
+		$TC -s filter show dev "$VETH_B1" ingress 2>/dev/null)
+
+	teardown_tc
+
+	passed_a=$(tc_stat    "$dump_a" "passed")
+	discarded_a=$(tc_stat "$dump_a" "discarded")
+	tagless_a=$(tc_stat   "$dump_a" "tagless")
+	passed_b=$(tc_stat    "$dump_b" "passed")
+	discarded_b=$(tc_stat "$dump_b" "discarded")
+	tagless_b=$(tc_stat   "$dump_b" "tagless")
+	discarded_a=$((discarded_a - tagless_a))
+	discarded_b=$((discarded_b - tagless_b))
+
+	echo "# bond individual recover: ping_rc=$ping_rc" \
+		"a1: cap=$cap_a passed=$passed_a discarded=$discarded_a" \
+		"b1: cap=$cap_b passed=$passed_b discarded=$discarded_b"
+
+	[ "$ping_rc"   -eq 0 ]            || result="fail"
+	[ "$cap_a"     -ge "$PING_COUNT" ] || result="fail"
+	[ "$cap_b"     -ge "$PING_COUNT" ] || result="fail"
+	[ "$passed_a"  -ge "$PING_COUNT" ] || result="fail"
+	[ "$passed_b"  -ge "$PING_COUNT" ] || result="fail"
+	[ "$discarded_a" -eq 0 ]           || result="fail"
+	[ "$discarded_b" -eq 0 ]           || result="fail"
+
+	if [ "$result" = "pass" ]; then
+		ksft_test_result_pass \
+			"bond individual recover: ping OK" \
+			"a1: cap=$cap_a passed=$passed_a/0" \
+			"b1: cap=$cap_b passed=$passed_b/0"
+	else
+		ksft_test_result_fail \
+			"bond individual recover: ping_rc=$ping_rc" \
+			"a1: cap=$cap_a passed=$passed_a discarded=$discarded_a" \
+			"b1: cap=$cap_b passed=$passed_b discarded=$discarded_b"
+	fi
+}
+
+# ----------------------------------------------------------------------------
+# TEST 4: NO TAG-POP (bond topology)
+#
+# Shared recover runs without tag-pop; passed frames still carry the R-TAG
+# when they reach bond1.
+#
+# Pass criteria:
+#   - tcpdump on bond1 with "ether proto 0xf1c1" captures >= 1 R-TAG frame
+#   - tcpdump on bond1 with "icmp" captures 0 frames (outer EtherType is
+#     0xF1C1, not 0x0800, so plain-IP ICMP filter does not match)
+# ----------------------------------------------------------------------------
+test_no_tag_pop_bond()
+{
+	local pcap_rtag pcap_icmp rtag_count icmp_count
+	local result="pass"
+
+	setup_push_mirror
+
+	# veth_a1 ingress: shared recover WITHOUT tag-pop
+	ip netns exec "$NS_LISTENER" $TC qdisc add dev "$VETH_A1" clsact
+	ip netns exec "$NS_LISTENER" $TC filter add dev "$VETH_A1" ingress \
+		protocol all flower skip_hw \
+		action frer recover alg vector history-length 16 \
+			reset-time 2000 index $IDX_NO_POP
+
+	# veth_b1 ingress: bind to the same shared action
+	ip netns exec "$NS_LISTENER" $TC qdisc add dev "$VETH_B1" clsact
+	ip netns exec "$NS_LISTENER" $TC filter add dev "$VETH_B1" ingress \
+		protocol all flower skip_hw \
+		action frer recover index $IDX_NO_POP
+
+	# Capture 1: frames with R-TAG EtherType on bond1 (expect >= 1)
+	pcap_rtag=$(mktemp /tmp/frer_bond_nopop_rtag_XXXXXX.pcap)
+	capture_start "$pcap_rtag" "ether proto 0xf1c1"
+	ip netns exec "$NS_TALKER" \
+		$PING -c 3 -W 1 -i 0.2 -q "$IP_DST" >/dev/null 2>&1 || true
+	capture_stop
+	rtag_count=$(capture_count "$pcap_rtag")
+	rm -f "$pcap_rtag"
+
+	# Capture 2: plain ICMP frames on bond1 (expect 0)
+	pcap_icmp=$(mktemp /tmp/frer_bond_nopop_icmp_XXXXXX.pcap)
+	capture_start "$pcap_icmp" "icmp"
+	ip netns exec "$NS_TALKER" \
+		$PING -c 3 -W 1 -i 0.2 -q "$IP_DST" >/dev/null 2>&1 || true
+	capture_stop
+	icmp_count=$(capture_count "$pcap_icmp")
+	rm -f "$pcap_icmp"
+
+	teardown_tc
+
+	echo "# bond no tag-pop: rtag=$rtag_count (expected >=1) icmp=$icmp_count (expected 0)"
+
+	[ "$rtag_count" -ge 1 ] || result="fail"
+	[ "$icmp_count" -eq 0 ] || result="fail"
+
+	if [ "$result" = "pass" ]; then
+		ksft_test_result_pass \
+			"bond no tag-pop: R-TAG present on bond1 " \
+			"(rtag=$rtag_count), ICMP absent (icmp=$icmp_count)"
+	else
+		ksft_test_result_fail \
+			"bond no tag-pop: rtag=$rtag_count icmp=$icmp_count " \
+			"(expected rtag>=1 icmp=0)"
+	fi
+}
+
+# ----------------------------------------------------------------------------
+# TEST 5: SIMPLE POINT-TO-POINT (no bond)
+#
+# Self-contained single-path topology: push on p2p_a0 egress, individual
+# recover (with tag-pop) on p2p_a1 ingress.  IP is assigned directly to the
+# veth interfaces (no bond).
+#
+# Pass criteria:
+#   - ping succeeds (rc=0)
+#   - veth_a1 recover stats: passed >= PING_COUNT, discarded = 0
+# ----------------------------------------------------------------------------
+test_simple_point_to_point()
+{
+	local ping_rc=0
+	local dump_a1 passed discarded
+	local result="pass"
+
+	# Create self-contained p2p namespaces
+	ip netns add "$P2P_NS_SRC"
+	ip netns add "$P2P_NS_DST"
+
+	ip link add "$P2P_VETH_A0" type veth peer name "$P2P_VETH_A1"
+	ip link set "$P2P_VETH_A0" netns "$P2P_NS_SRC"
+	ip link set "$P2P_VETH_A1" netns "$P2P_NS_DST"
+
+	ip netns exec "$P2P_NS_SRC" ip link set lo up
+	ip netns exec "$P2P_NS_SRC" ip link set "$P2P_VETH_A0" up
+	ip netns exec "$P2P_NS_SRC" ip addr add "${IP_P2P_SRC}/24" dev "$P2P_VETH_A0"
+
+	ip netns exec "$P2P_NS_DST" ip link set lo up
+	ip netns exec "$P2P_NS_DST" ip link set "$P2P_VETH_A1" up
+	ip netns exec "$P2P_NS_DST" ip addr add "${IP_P2P_DST}/24" dev "$P2P_VETH_A1"
+
+	local mac_a0 mac_a1
+	mac_a0=$(ip netns exec "$P2P_NS_SRC" cat /sys/class/net/"$P2P_VETH_A0"/address)
+	mac_a1=$(ip netns exec "$P2P_NS_DST" cat /sys/class/net/"$P2P_VETH_A1"/address)
+	ip netns exec "$P2P_NS_SRC" ip neigh add "$IP_P2P_DST" lladdr "$mac_a1" dev "$P2P_VETH_A0"
+	ip netns exec "$P2P_NS_DST" ip neigh add "$IP_P2P_SRC" lladdr "$mac_a0" dev "$P2P_VETH_A1"
+
+	# veth_a0 egress: push R-TAG
+	ip netns exec "$P2P_NS_SRC" $TC qdisc add dev "$P2P_VETH_A0" clsact
+	ip netns exec "$P2P_NS_SRC" $TC filter add dev "$P2P_VETH_A0" egress \
+		protocol ip flower skip_hw \
+		action frer push index $IDX_PUSH
+
+	# veth_a1 ingress: individual recover with tag-pop
+	ip netns exec "$P2P_NS_DST" $TC qdisc add dev "$P2P_VETH_A1" clsact
+	ip netns exec "$P2P_NS_DST" $TC filter add dev "$P2P_VETH_A1" ingress \
+		protocol all flower skip_hw \
+		action frer recover individual alg vector history-length 16 \
+			reset-time 2000 tag-pop index $IDX_P2P_RCVY
+
+	ip netns exec "$P2P_NS_SRC" \
+		$PING -c "$PING_COUNT" -W "$PING_TIMEOUT" -i 0.2 -q \
+		"$IP_P2P_DST" >/dev/null 2>&1 || ping_rc=$?
+
+	dump_a1=$(ip netns exec "$P2P_NS_DST" \
+		$TC -s filter show dev "$P2P_VETH_A1" ingress 2>/dev/null)
+
+	# Teardown p2p topology
+	for dev in "$P2P_VETH_A0"; do
+		ip netns exec "$P2P_NS_SRC" $TC qdisc del dev "$dev" clsact \
+			2>/dev/null || true
+	done
+	for dev in "$P2P_VETH_A1"; do
+		ip netns exec "$P2P_NS_DST" $TC qdisc del dev "$dev" clsact \
+			2>/dev/null || true
+	done
+	ip netns exec "$P2P_NS_SRC" $TC actions flush action frer 2>/dev/null || true
+	ip netns exec "$P2P_NS_DST" $TC actions flush action frer 2>/dev/null || true
+	ip netns del "$P2P_NS_SRC" 2>/dev/null || true
+	ip netns del "$P2P_NS_DST" 2>/dev/null || true
+
+	passed=$(tc_stat    "$dump_a1" "passed")
+	discarded=$(tc_stat "$dump_a1" "discarded")
+	local tagless
+	tagless=$(tc_stat   "$dump_a1" "tagless")
+	discarded=$((discarded - tagless))
+
+	echo "# p2p: ping_rc=$ping_rc passed=$passed discarded=$discarded"
+
+	[ "$ping_rc"   -eq 0 ]            || result="fail"
+	[ "$passed"    -ge "$PING_COUNT" ] || result="fail"
+	[ "$discarded" -eq 0 ]            || result="fail"
+
+	if [ "$result" = "pass" ]; then
+		ksft_test_result_pass \
+			"simple p2p: ping OK, passed=$passed discarded=$discarded"
+	else
+		ksft_test_result_fail \
+			"simple p2p: ping_rc=$ping_rc passed=$passed discarded=$discarded"
+	fi
+}
+
+# ----------------------------------------------------------------------------
+# TEST 6: RELAY E2E (self-contained, no bond)
+#
+# Talker sends VLAN-100 frames into bridge0 (sequence generator).  Bridge0
+# pushes an R-TAG and replicates to two redundant paths.  Bridge1 (eliminator)
+# recovers (shared, tag-pop) on both paths and forwards the deduplicated frame
+# to the listener.
+#
+# Topology:
+#   ns_talker (talker_eth.100) -- talker_eth/br0_uplink
+#       -- bridge0 (br_r0) -+- br0_swp0/br1_swp0 -+
+#                            \- br0_swp1/br1_swp1 -+
+#       -- bridge1 (br_r1) -- br1_downlink/listener_eth -- ns_listener
+#
+# FRER rules:
+#   bridge0 / br0_uplink ingress  : push idx=50, redirect br0_swp0, mirror br0_swp1
+#   bridge1 / br1_swp0 ingress    : recover (shared, tag-pop) idx=60, redirect br1_downlink
+#   bridge1 / br1_swp1 ingress    : recover idx=60 (bind same), redirect br1_downlink
+#   bridge1 / br1_downlink ingress: redirect br1_swp0 (reply path, bypass FDB)
+#
+# Pass criteria:
+#   - ping from ns_talker to ns_listener succeeds (rc=0)
+#   - tcpdump on listener captures exactly PING_COUNT ICMP echo-request frames
+#   - br1_swp0 tc stats: passed >= PING_COUNT, discarded >= PING_COUNT
+# ----------------------------------------------------------------------------
+teardown_relay_tc()
+{
+	for dev in "$R_BR0_UPLINK"; do
+		ip netns exec "$R_NS_BRIDGE0" $TC qdisc del dev "$dev" clsact \
+			2>/dev/null || true
+	done
+	for dev in "$R_BR1_SWP0" "$R_BR1_SWP1" "$R_BR1_DOWNLINK"; do
+		ip netns exec "$R_NS_BRIDGE1" $TC qdisc del dev "$dev" clsact \
+			2>/dev/null || true
+	done
+	ip netns exec "$R_NS_BRIDGE0" $TC actions flush action frer 2>/dev/null || true
+	ip netns exec "$R_NS_BRIDGE1" $TC actions flush action frer 2>/dev/null || true
+}
+
+test_relay_e2e()
+{
+	local ping_rc=0
+	local dump_r1swp0
+	local total_passed total_discarded
+	local result="pass"
+	local ns
+
+	for ns in "$R_NS_TALKER" "$R_NS_BRIDGE0" "$R_NS_BRIDGE1" "$R_NS_LISTENER"; do
+		ip netns add "$ns" || {
+			echo "# relay e2e: failed to create netns $ns" >&2
+			ksft_test_result_skip "relay e2e: netns setup failed"
+			return
+		}
+	done
+
+	ip link add "$R_TALKER_ETH"   type veth peer name "$R_BR0_UPLINK"
+	ip link add "$R_BR0_SWP0"     type veth peer name "$R_BR1_SWP0"
+	ip link add "$R_BR0_SWP1"     type veth peer name "$R_BR1_SWP1"
+	ip link add "$R_BR1_DOWNLINK" type veth peer name "$R_LISTENER_ETH"
+
+	ip link set "$R_TALKER_ETH"   netns "$R_NS_TALKER"
+	ip link set "$R_BR0_UPLINK"   netns "$R_NS_BRIDGE0"
+	ip link set "$R_BR0_SWP0"     netns "$R_NS_BRIDGE0"
+	ip link set "$R_BR0_SWP1"     netns "$R_NS_BRIDGE0"
+	ip link set "$R_BR1_SWP0"     netns "$R_NS_BRIDGE1"
+	ip link set "$R_BR1_SWP1"     netns "$R_NS_BRIDGE1"
+	ip link set "$R_BR1_DOWNLINK" netns "$R_NS_BRIDGE1"
+	ip link set "$R_LISTENER_ETH" netns "$R_NS_LISTENER"
+
+	local ns_dev
+	for ns_dev in \
+		"$R_NS_TALKER:$R_TALKER_ETH" \
+		"$R_NS_BRIDGE0:$R_BR0_UPLINK" "$R_NS_BRIDGE0:$R_BR0_SWP0" \
+		"$R_NS_BRIDGE0:$R_BR0_SWP1" \
+		"$R_NS_BRIDGE1:$R_BR1_SWP0" "$R_NS_BRIDGE1:$R_BR1_SWP1" \
+		"$R_NS_BRIDGE1:$R_BR1_DOWNLINK" \
+		"$R_NS_LISTENER:$R_LISTENER_ETH"; do
+		local _ns="${ns_dev%%:*}"
+		local _dev="${ns_dev##*:}"
+		ip netns exec "$_ns" ip link set lo up
+		ip netns exec "$_ns" ip link set "$_dev" up
+	done
+
+	# bridge0: sequence generator, VLAN filtering
+	ip netns exec "$R_NS_BRIDGE0" ip link add name "$R_BR0" type bridge vlan_filtering 1
+	ip netns exec "$R_NS_BRIDGE0" ip link set "$R_BR0" up
+	ip netns exec "$R_NS_BRIDGE0" ip link set "$R_BR0_UPLINK" master "$R_BR0"
+	ip netns exec "$R_NS_BRIDGE0" ip link set "$R_BR0_SWP0" master "$R_BR0"
+	ip netns exec "$R_NS_BRIDGE0" ip link set "$R_BR0_SWP1" master "$R_BR0"
+
+	ip netns exec "$R_NS_BRIDGE0" bridge vlan add dev "$R_BR0_UPLINK" vid "$R_VLAN"
+	ip netns exec "$R_NS_BRIDGE0" bridge vlan add dev "$R_BR0_SWP0" vid "$R_VLAN"
+	ip netns exec "$R_NS_BRIDGE0" bridge vlan del dev "$R_BR0_SWP1" vid 1
+	ip netns exec "$R_NS_BRIDGE0" bridge vlan add dev "$R_BR0_SWP1" \
+		vid "$R_VLAN" pvid untagged
+	ip netns exec "$R_NS_BRIDGE0" bridge link set dev "$R_BR0_SWP0" learning off
+	ip netns exec "$R_NS_BRIDGE0" bridge link set dev "$R_BR0_SWP1" learning off
+	ip netns exec "$R_NS_BRIDGE0" bridge vlan set dev "$R_BR0_SWP0" vid "$R_VLAN" noflood
+	ip netns exec "$R_NS_BRIDGE0" bridge vlan set dev "$R_BR0_SWP1" vid "$R_VLAN" noflood
+
+	# bridge1: eliminator, VLAN filtering
+	ip netns exec "$R_NS_BRIDGE1" ip link add name "$R_BR1" type bridge vlan_filtering 1
+	ip netns exec "$R_NS_BRIDGE1" ip link set "$R_BR1" up
+	ip netns exec "$R_NS_BRIDGE1" ip link set "$R_BR1_SWP0" master "$R_BR1"
+	ip netns exec "$R_NS_BRIDGE1" ip link set "$R_BR1_SWP1" master "$R_BR1"
+	ip netns exec "$R_NS_BRIDGE1" ip link set "$R_BR1_DOWNLINK" master "$R_BR1"
+
+	ip netns exec "$R_NS_BRIDGE1" bridge vlan add dev "$R_BR1_SWP0" vid "$R_VLAN"
+	ip netns exec "$R_NS_BRIDGE1" bridge vlan del dev "$R_BR1_SWP1" vid 1
+	ip netns exec "$R_NS_BRIDGE1" bridge vlan add dev "$R_BR1_SWP1" \
+		vid "$R_VLAN" pvid untagged
+	ip netns exec "$R_NS_BRIDGE1" bridge vlan add dev "$R_BR1_DOWNLINK" vid "$R_VLAN"
+	ip netns exec "$R_NS_BRIDGE1" bridge link set dev "$R_BR1_SWP0" learning off
+	ip netns exec "$R_NS_BRIDGE1" bridge link set dev "$R_BR1_SWP1" learning off
+	ip netns exec "$R_NS_BRIDGE1" bridge vlan set dev "$R_BR1_SWP0" vid "$R_VLAN" noflood
+	ip netns exec "$R_NS_BRIDGE1" bridge vlan set dev "$R_BR1_SWP1" vid "$R_VLAN" noflood
+
+	# ns_talker: VLAN sub-interface
+	ip netns exec "$R_NS_TALKER" ip link add link "$R_TALKER_ETH" \
+		name "${R_TALKER_ETH}.${R_VLAN}" type vlan id "$R_VLAN"
+	ip netns exec "$R_NS_TALKER" ip link set "${R_TALKER_ETH}.${R_VLAN}" up
+	ip netns exec "$R_NS_TALKER" ip addr add "${R_IP_TALKER}/24" \
+		dev "${R_TALKER_ETH}.${R_VLAN}"
+
+	# ns_listener: VLAN sub-interface
+	ip netns exec "$R_NS_LISTENER" ip link add link "$R_LISTENER_ETH" \
+		name "${R_LISTENER_ETH}.${R_VLAN}" type vlan id "$R_VLAN"
+	ip netns exec "$R_NS_LISTENER" ip link set "${R_LISTENER_ETH}.${R_VLAN}" up
+	ip netns exec "$R_NS_LISTENER" ip addr add "${R_IP_LISTENER}/24" \
+		dev "${R_LISTENER_ETH}.${R_VLAN}"
+
+	# Static ARP (VLAN 100 flooding is disabled)
+	local mac_talker mac_listener
+	mac_talker=$(ip netns exec "$R_NS_TALKER" \
+		cat /sys/class/net/"${R_TALKER_ETH}.${R_VLAN}"/address)
+	mac_listener=$(ip netns exec "$R_NS_LISTENER" \
+		cat /sys/class/net/"${R_LISTENER_ETH}.${R_VLAN}"/address)
+	ip netns exec "$R_NS_TALKER"   ip neigh add "$R_IP_LISTENER" \
+		lladdr "$mac_listener" dev "${R_TALKER_ETH}.${R_VLAN}"
+	ip netns exec "$R_NS_LISTENER" ip neigh add "$R_IP_TALKER" \
+		lladdr "$mac_talker"   dev "${R_LISTENER_ETH}.${R_VLAN}"
+
+	# bridge0 / br0_uplink ingress: push R-TAG then replicate to both redundant paths.
+	# mirror must come before redirect because redirect is a terminating action.
+	ip netns exec "$R_NS_BRIDGE0" $TC qdisc add dev "$R_BR0_UPLINK" clsact
+	ip netns exec "$R_NS_BRIDGE0" $TC filter add dev "$R_BR0_UPLINK" ingress \
+		protocol 802.1Q flower skip_hw vlan_id "$R_VLAN" \
+		action frer push index $IDX_RELAY_PUSH \
+		action mirred egress mirror  dev "$R_BR0_SWP1" \
+		action mirred egress redirect dev "$R_BR0_SWP0"
+
+	# bridge1 / br1_swp0 ingress: create shared recover action (tag-pop)
+	ip netns exec "$R_NS_BRIDGE1" $TC qdisc add dev "$R_BR1_SWP0" clsact
+	ip netns exec "$R_NS_BRIDGE1" $TC filter add dev "$R_BR1_SWP0" ingress \
+		protocol all flower skip_hw \
+		action frer recover alg vector history-length 16 \
+			reset-time 2000 tag-pop index $IDX_RELAY_RCVY \
+		action mirred egress redirect dev "$R_BR1_DOWNLINK"
+
+	# bridge1 / br1_swp1 ingress: bind to the same shared recover action
+	ip netns exec "$R_NS_BRIDGE1" $TC qdisc add dev "$R_BR1_SWP1" clsact
+	ip netns exec "$R_NS_BRIDGE1" $TC filter add dev "$R_BR1_SWP1" ingress \
+		protocol all flower skip_hw \
+		action frer recover index $IDX_RELAY_RCVY \
+		action mirred egress redirect dev "$R_BR1_DOWNLINK"
+
+	# bridge1 / br1_downlink ingress: redirect VLAN 100 replies directly to br1_swp0
+	ip netns exec "$R_NS_BRIDGE1" $TC qdisc add dev "$R_BR1_DOWNLINK" clsact
+	ip netns exec "$R_NS_BRIDGE1" $TC filter add dev "$R_BR1_DOWNLINK" ingress \
+		protocol 802.1Q flower skip_hw vlan_id "$R_VLAN" \
+		action mirred egress redirect dev "$R_BR1_SWP0"
+
+	# Capture ICMP echo-requests on listener_eth.VLAN to verify exactly
+	# PING_COUNT deduplicated frames reach the listener after recovery.
+	local pcap cap_count
+	pcap=$(mktemp /tmp/frer_relay_XXXXXX.pcap)
+	capture_start_on "$R_NS_LISTENER" "${R_LISTENER_ETH}.${R_VLAN}" \
+		"$pcap" "icmp[icmptype] == icmp-echo"
+
+	ip netns exec "$R_NS_TALKER" \
+		$PING -c "$PING_COUNT" -W "$PING_TIMEOUT" -i 0.2 -q \
+		"$R_IP_LISTENER" >/dev/null 2>&1 || ping_rc=$?
+
+	capture_stop
+	cap_count=$(capture_count_on "$R_NS_LISTENER" "$pcap")
+	rm -f "$pcap"
+
+	dump_br1_swp0=$(ip netns exec "$R_NS_BRIDGE1" \
+		$TC -s filter show dev "$R_BR1_SWP0" ingress 2>/dev/null)
+
+	teardown_relay_tc
+	for ns in "$R_NS_TALKER" "$R_NS_BRIDGE0" "$R_NS_BRIDGE1" "$R_NS_LISTENER"; do
+		ip netns del "$ns" 2>/dev/null || true
+	done
+
+	total_passed=$(tc_stat    "$dump_br1_swp0" "passed")
+	total_discarded=$(tc_stat "$dump_br1_swp0" "discarded")
+	local tagless
+	tagless=$(tc_stat         "$dump_br1_swp0" "tagless")
+	total_discarded=$((total_discarded - tagless))
+
+	echo "# relay e2e: ping_rc=$ping_rc cap=$cap_count" \
+		"passed=$total_passed discarded=$total_discarded"
+
+	[ "$ping_rc"         -eq 0 ]            || result="fail"
+	[ "$cap_count"       -eq "$PING_COUNT" ] || result="fail"
+	[ "$total_passed"    -ge "$PING_COUNT" ] || result="fail"
+	[ "$total_discarded" -ge "$PING_COUNT" ] || result="fail"
+
+	if [ "$result" = "pass" ]; then
+		ksft_test_result_pass \
+			"relay e2e: ping OK, cap=$cap_count " \
+			"passed=$total_passed discarded=$total_discarded"
+	else
+		ksft_test_result_fail \
+			"relay e2e: ping_rc=$ping_rc cap=$cap_count " \
+			"passed=$total_passed discarded=$total_discarded" \
+			"(expected ping OK, cap=$PING_COUNT," \
+			"passed>=$PING_COUNT, discarded>=$PING_COUNT)"
+	fi
+}
+
+# ----------------------------------------------------------------------------
+# Main
+# ----------------------------------------------------------------------------
+main()
+{
+	ksft_print_header
+	check_prerequisites
+	load_module
+	setup_topology
+
+	if ! check_frer_action; then
+		ksft_set_plan "$NUM_TESTS"
+		for i in $(seq 1 "$NUM_TESTS"); do
+			ksft_test_result_skip \
+				"frer action not available in this kernel (test $i)"
+		done
+		ksft_print_cnts
+		exit "$KSFT_SKIP"
+	fi
+
+	ksft_set_plan "$NUM_TESTS"
+
+	test_push_verify_bond        # TEST 1: push on a0/b0, no recover, R-TAG on both paths
+	test_shared_recover_bond     # TEST 2: shared recover, dedup, ping succeeds
+	test_individual_recover_bond # TEST 3: individual recover, no dedup, double frames
+	test_no_tag_pop_bond         # TEST 4: shared recover without tag-pop, R-TAG preserved
+	test_simple_point_to_point   # TEST 5: single-path p2p, no bond
+	test_relay_e2e               # TEST 6: relay bridge topology
+
+	ksft_print_cnts
+
+	[ "$_ksft_fail" -eq 0 ] && ksft_exit_pass || ksft_exit_fail
+}
+
+main "$@"
-- 
2.17.1


^ permalink raw reply related

* [PATCH net-next 3/6] uapi: tc_act: add tc_frer UAPI header
From: Xiaoliang Yang @ 2026-06-22  9:21 UTC (permalink / raw)
  To: netdev, linux-kernel, linux-kselftest
  Cc: davem, edumazet, kuba, pabeni, jhs, jiri, horms, shuah,
	vladimir.oltean, vinicius.gomes, fejes, xiaoliang.yang_1
In-Reply-To: <20260622092118.6846-1-xiaoliang.yang_1@nxp.com>

Define the netlink attribute layout and enumerations for the FRER tc
action (IEEE 802.1CB Frame Replication and Elimination for Reliability).

The action is split into two functional sub-commands selected by the
TCA_FRER_FUNC attribute:

  TCA_FRER_FUNC_PUSH    - Egress: sequence number generation and R-TAG
                          insertion. The action inserts an R-TAG with
                          the current sequence number into the frame
                          before passing it on. When chained with
                          "action mirred egress mirror", the mirrored
                          copy already carries the R-TAG, so all
                          replicated frames on different egress paths
                          carry the same sequence number without any
                          additional shared state.

  TCA_FRER_FUNC_RECOVER - Ingress: duplicate detection and elimination.
                          Multiple ingress filters can share the same
                          recovery state by referencing the same action
                          index, implementing Sequence Recovery across
                          ports (IEEE 802.1CB Section 7.4.2).
                          When TCA_FRER_RCVY_INDIVIDUAL flag is set,
                          the action uses private per-action state
                          (Individual Recovery, Section 7.5).

Statistics attributes map directly to the managed objects defined in
IEEE 802.1CB Table 10-1.

Signed-off-by: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
---
 include/uapi/linux/tc_act/tc_frer.h | 89 +++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 include/uapi/linux/tc_act/tc_frer.h

diff --git a/include/uapi/linux/tc_act/tc_frer.h b/include/uapi/linux/tc_act/tc_frer.h
new file mode 100644
index 000000000000..241e90827e26
--- /dev/null
+++ b/include/uapi/linux/tc_act/tc_frer.h
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/* Copyright 2026 NXP */
+
+#ifndef __LINUX_TC_FRER_H
+#define __LINUX_TC_FRER_H
+
+#include <linux/pkt_cls.h>
+
+/* Base parameters passed in TCA_FRER_PARMS */
+struct tc_frer {
+	tc_gen;
+};
+
+/**
+ * enum TCA_FRER_* - netlink attributes for the FRER tc action
+ *
+ * @TCA_FRER_FUNC:             Functional sub-command (tc_frer_func).
+ *                             Mandatory.
+ * @TCA_FRER_TAG_TYPE:         Redundancy tag type (tc_frer_tag_type).
+ *                             Mandatory.
+ *
+ * Push-specific attributes (TCA_FRER_FUNC_PUSH):
+ * Recover-specific attributes (TCA_FRER_FUNC_RECOVER):
+ * @TCA_FRER_RCVY_INDIVIDUAL:  Flag. Force Individual Recovery.
+ * @TCA_FRER_RCVY_ALG:         u8. Recovery algorithm (tc_frer_rcvy_alg).
+ * @TCA_FRER_RCVY_HISTORY_LEN: u8. SequenceHistory window size (1-32).
+ *                             Maps to frerSeqRcvyHistoryLength.
+ * @TCA_FRER_RCVY_RESET_MSEC:  u32. Reset timer in milliseconds.
+ *                             0 disables the timer.
+ *                             Maps to frerSeqRcvyResetMSec.
+ * @TCA_FRER_RCVY_TAKE_NO_SEQ: Flag. Accept frames without a redundancy
+ *                             tag and pass them unconditionally.
+ *                             Maps to frerSeqRcvyTakeNoSeq.
+ * @TCA_FRER_RCVY_TAG_POP:     Flag. Remove the redundancy tag from
+ *                             frames that pass the recovery function.
+ *
+ * Read-only statistics (filled on dump, IEEE 802.1CB Table 10-1):
+ * @TCA_FRER_STATS_TAGLESS_PKTS:       frerCpsSeqRcvyTaglessPackets
+ * @TCA_FRER_STATS_OUT_OF_ORDER_PKTS:  frerCpsSeqRcvyOutOfOrderPackets
+ * @TCA_FRER_STATS_ROGUE_PKTS:         frerCpsSeqRcvyRoguePackets
+ * @TCA_FRER_STATS_LOST_PKTS:          frerCpsSeqRcvyLostPackets
+ * @TCA_FRER_STATS_RESETS:             frerCpsSeqRcvyResets
+ * @TCA_FRER_STATS_PASSED_PKTS:        frerCpsSeqRcvyPassedPackets
+ * @TCA_FRER_STATS_DISCARDED_PKTS:     frerCpsSeqRcvyDiscardedPackets
+ * @TCA_FRER_STATS_SEQGEN_PKTS:        frerCpsSeqGenPackets
+ */
+enum {
+	TCA_FRER_UNSPEC,
+	TCA_FRER_TM,                       /* struct tcf_t */
+	TCA_FRER_PARMS,                    /* struct tc_frer */
+	TCA_FRER_PAD,
+	TCA_FRER_FUNC,                     /* u8: tc_frer_func */
+	TCA_FRER_TAG_TYPE,                 /* u8: tc_frer_tag_type */
+	TCA_FRER_RCVY_INDIVIDUAL,          /* NLA_FLAG */
+	TCA_FRER_RCVY_ALG,                 /* u8: tc_frer_rcvy_alg */
+	TCA_FRER_RCVY_HISTORY_LEN,         /* u8: 1-32 */
+	TCA_FRER_RCVY_RESET_MSEC,          /* u32 */
+	TCA_FRER_RCVY_TAKE_NO_SEQ,         /* NLA_FLAG */
+	TCA_FRER_RCVY_TAG_POP,             /* NLA_FLAG */
+	TCA_FRER_STATS_TAGLESS_PKTS,       /* u64 */
+	TCA_FRER_STATS_OUT_OF_ORDER_PKTS,  /* u64 */
+	TCA_FRER_STATS_ROGUE_PKTS,         /* u64 */
+	TCA_FRER_STATS_LOST_PKTS,          /* u64 */
+	TCA_FRER_STATS_RESETS,             /* u64 */
+	TCA_FRER_STATS_PASSED_PKTS,        /* u64 */
+	TCA_FRER_STATS_DISCARDED_PKTS,     /* u64 */
+	TCA_FRER_STATS_SEQGEN_PKTS,        /* u64 */
+	__TCA_FRER_MAX,
+};
+
+#define TCA_FRER_MAX (__TCA_FRER_MAX - 1)
+
+enum tc_frer_func {
+	TCA_FRER_FUNC_PUSH    = 1,
+	TCA_FRER_FUNC_RECOVER = 2,
+};
+
+enum tc_frer_tag_type {
+	TCA_FRER_TAG_RTAG = 1,
+	TCA_FRER_TAG_HSR,
+	TCA_FRER_TAG_PRP,
+};
+
+enum tc_frer_rcvy_alg {
+	TCA_FRER_RCVY_VECTOR_ALG = 0,  /* IEEE 802.1CB 7.4.3.4 */
+	TCA_FRER_RCVY_MATCH_ALG  = 1,  /* IEEE 802.1CB 7.4.3.5 */
+};
+
+#endif /* __LINUX_TC_FRER_H */
-- 
2.17.1


^ permalink raw reply related

* [PATCH net-next 5/6] selftest: add tc-testing JSON test cases for act_frer
From: Xiaoliang Yang @ 2026-06-22  9:21 UTC (permalink / raw)
  To: netdev, linux-kernel, linux-kselftest
  Cc: davem, edumazet, kuba, pabeni, jhs, jiri, horms, shuah,
	vladimir.oltean, vinicius.gomes, fejes, xiaoliang.yang_1
In-Reply-To: <20260622092118.6846-1-xiaoliang.yang_1@nxp.com>

Add a tc-testing JSON file covering the FRER (IEEE 802.1CB Frame
Replication and Elimination for Reliability) tc action (act_frer).

The test suite contains 32 test cases and exercises:

 - Creating push and recover actions with default and explicit parameters
   (tag-type, alg vector/match, history-length, reset-time, tag-pop,
   individual, take-no-seq)
 - Boundary values for history-length (1 and 32) and reset-time (0)
 - Combining multiple flags (frer_0011, frer_0012)
 - Statistics output format for push (SeqGen) and recover (passed,
   discarded, tagless, out-of-order, rogue, lost, resets)
 - Replace and delete operations
 - Flush all actions
 - Duplicate-index failure (expExitCode 255)
 - Control actions (continue, pipe) placed after the index token
 - Binding push and recover actions to egress/ingress clsact filters
 - Sharing a recover action across two filters and verifying the
   reference count increments
 - not_in_hw flag present in show output

Signed-off-by: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
---
 .../tc-testing/tc-tests/actions/frer.json     | 785 ++++++++++++++++++
 1 file changed, 785 insertions(+)
 create mode 100644 tools/testing/selftests/tc-testing/tc-tests/actions/frer.json

diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/frer.json b/tools/testing/selftests/tc-testing/tc-tests/actions/frer.json
new file mode 100644
index 000000000000..d5be6ae156f7
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/frer.json
@@ -0,0 +1,785 @@
+[
+  {
+    "id": "frer_0001",
+    "name": "Create frer push action with default parameters",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer push index 1",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 1",
+    "matchPattern": "action order [0-9]+: frer push tag-type rtag index 1",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0002",
+    "name": "Create frer push action with explicit tag-type rtag",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer push tag-type rtag index 2",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 2",
+    "matchPattern": "action order [0-9]+: frer push tag-type rtag index 2",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0003",
+    "name": "Create frer recover action with default parameters",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover index 10",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 10",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 10 alg vector history-length [0-9]+ reset-time [0-9]+",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0004",
+    "name": "Create frer recover action with vector algorithm explicit",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover alg vector index 11",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 11",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 11 alg vector",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0005",
+    "name": "Create frer recover action with match algorithm",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover alg match index 12",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 12",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 12 alg match",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0006",
+    "name": "Create frer recover action with history-length 16",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover alg vector history-length 16 index 13",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 13",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 13 alg vector history-length 16",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0007",
+    "name": "Create frer recover action with reset-time 2000",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover alg vector reset-time 2000 index 14",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 14",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 14 alg vector history-length [0-9]+ reset-time 2000",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0008",
+    "name": "Create frer recover action with tag-pop flag",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover tag-pop index 15",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 15",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 15.*tag-pop",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0009",
+    "name": "Create frer recover action with individual flag",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover individual index 16",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 16",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 16.*individual",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0010",
+    "name": "Create frer recover action with take-no-seq flag",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover take-no-seq index 17",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 17",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 17.*take-no-seq",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0011",
+    "name": "Create frer recover action with all parameters combined",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover alg vector history-length 16 reset-time 1000 tag-pop individual index 20",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 20",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 20 individual alg vector history-length 16 reset-time 1000 tag-pop",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0012",
+    "name": "Create frer recover action with match alg and all flags",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover alg match take-no-seq tag-pop individual index 21",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 21",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 21 individual alg match history-length [0-9]+ reset-time [0-9]+ tag-pop take-no-seq",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0013",
+    "name": "Show frer push action SeqGen statistics (zero after create)",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "$TC actions add action frer push index 1"
+    ],
+    "cmdUnderTest": "$TC -s actions show action frer index 1",
+    "expExitCode": "0",
+    "verifyCmd": "$TC -s actions show action frer index 1",
+    "matchPattern": "SeqGen packets: 0",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0014",
+    "name": "Show frer recover action Statistics line (zero after create)",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "$TC actions add action frer recover alg vector history-length 16 reset-time 1000 tag-pop index 10"
+    ],
+    "cmdUnderTest": "$TC -s actions show action frer index 10",
+    "expExitCode": "0",
+    "verifyCmd": "$TC -s actions show action frer index 10",
+    "matchPattern": "Statistics: passed=0 discarded=0 tagless=0 out-of-order=0 rogue=0 lost=0 resets=0",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0015",
+    "name": "Show frer recover action Statistics fields present",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "$TC actions add action frer recover index 10"
+    ],
+    "cmdUnderTest": "$TC -s actions show action frer index 10",
+    "expExitCode": "0",
+    "verifyCmd": "$TC -s actions show action frer index 10",
+    "matchPattern": "Statistics: passed=[0-9]+ discarded=[0-9]+ tagless=[0-9]+ out-of-order=[0-9]+ rogue=[0-9]+ lost=[0-9]+ resets=[0-9]+",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0016",
+    "name": "Replace frer push action (same index)",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "$TC actions add action frer push index 1"
+    ],
+    "cmdUnderTest": "$TC actions replace action frer push index 1",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 1",
+    "matchPattern": "action order [0-9]+: frer push tag-type rtag index 1",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0017",
+    "name": "Replace frer recover action changing algorithm from vector to match",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "$TC actions add action frer recover alg vector index 10"
+    ],
+    "cmdUnderTest": "$TC actions replace action frer recover alg match index 10",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 10",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 10 alg match",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0018",
+    "name": "Delete frer push action by index",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "$TC actions add action frer push index 1"
+    ],
+    "cmdUnderTest": "$TC actions del action frer index 1",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer",
+    "matchPattern": "frer push tag-type rtag index 1",
+    "matchCount": "0",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0019",
+    "name": "Flush all frer actions",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "$TC actions add action frer push index 1",
+      "$TC actions add action frer recover index 10",
+      "$TC actions add action frer recover index 11"
+    ],
+    "cmdUnderTest": "$TC actions flush action frer",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer",
+    "matchPattern": "action order [0-9]+: frer",
+    "matchCount": "0",
+    "teardown": [
+      "$TC actions flush action frer 2>/dev/null || true"
+    ]
+  },
+  {
+    "id": "frer_0020",
+    "name": "Add duplicate frer action index fails without replace flag",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "$TC actions add action frer push index 1"
+    ],
+    "cmdUnderTest": "$TC actions add action frer push index 1",
+    "expExitCode": "255",
+    "verifyCmd": "$TC actions show action frer index 1",
+    "matchPattern": "action order [0-9]+: frer push tag-type rtag index 1",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0021",
+    "name": "Create frer push action with continue control action",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer push index 1 continue",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 1",
+    "matchPattern": "action order [0-9]+: frer push tag-type rtag index 1.*control continue",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0022",
+    "name": "Create frer recover action with pipe control action",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover index 10 pipe",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 10",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 10.*control pipe",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0023",
+    "name": "Create frer recover action history-length minimum boundary (1)",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover alg vector history-length 1 index 30",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 30",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 30 alg vector history-length 1",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0024",
+    "name": "Create frer recover action history-length maximum boundary (32)",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover alg vector history-length 32 index 31",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 31",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 31 alg vector history-length 32",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0025",
+    "name": "Create frer recover action with reset-time 0 (timer disabled)",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover alg vector reset-time 0 index 32",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 32",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 32 alg vector history-length [0-9]+ reset-time 0",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0026",
+    "name": "List all frer actions shows correct count",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "$TC actions add action frer push index 1",
+      "$TC actions add action frer recover alg vector index 10",
+      "$TC actions add action frer recover alg match tag-pop index 11"
+    ],
+    "cmdUnderTest": "$TC actions show action frer",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer",
+    "matchPattern": "action order [0-9]+: frer",
+    "matchCount": "3",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0027",
+    "name": "Bind frer push action to egress clsact filter",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "ip link del frer_dummy 2>/dev/null || true",
+      "ip link add frer_dummy type dummy",
+      "ip link set frer_dummy up",
+      "$TC qdisc add dev frer_dummy clsact"
+    ],
+    "cmdUnderTest": "$TC filter add dev frer_dummy egress protocol ip flower skip_hw action frer push index 1",
+    "expExitCode": "0",
+    "verifyCmd": "$TC filter show dev frer_dummy egress",
+    "matchPattern": "frer push tag-type rtag index 1",
+    "matchCount": "1",
+    "teardown": [
+      "$TC qdisc del dev frer_dummy clsact",
+      "$TC actions flush action frer",
+      "ip link del frer_dummy"
+    ]
+  },
+  {
+    "id": "frer_0028",
+    "name": "Bind frer recover action to ingress clsact filter",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "ip link del frer_dummy 2>/dev/null || true",
+      "ip link add frer_dummy type dummy",
+      "ip link set frer_dummy up",
+      "$TC qdisc add dev frer_dummy clsact"
+    ],
+    "cmdUnderTest": "$TC filter add dev frer_dummy ingress protocol all flower skip_hw action frer recover alg vector history-length 16 reset-time 1000 tag-pop index 10",
+    "expExitCode": "0",
+    "verifyCmd": "$TC filter show dev frer_dummy ingress",
+    "matchPattern": "frer recover tag-type rtag index 10 alg vector history-length 16 reset-time 1000",
+    "matchCount": "1",
+    "teardown": [
+      "$TC qdisc del dev frer_dummy clsact",
+      "$TC actions flush action frer",
+      "ip link del frer_dummy"
+    ]
+  },
+  {
+    "id": "frer_0029",
+    "name": "Share frer recover action across two ingress filters (refcount check)",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "ip link del frer_a 2>/dev/null || true",
+      "ip link del frer_b 2>/dev/null || true",
+      "ip link add frer_a type dummy",
+      "ip link add frer_b type dummy",
+      "ip link set frer_a up",
+      "ip link set frer_b up",
+      "$TC qdisc add dev frer_a clsact",
+      "$TC qdisc add dev frer_b clsact",
+      "$TC filter add dev frer_a ingress protocol all flower skip_hw action frer recover alg vector history-length 16 tag-pop index 10"
+    ],
+    "cmdUnderTest": "$TC filter add dev frer_b ingress protocol all flower skip_hw action frer recover index 10",
+    "expExitCode": "0",
+    "verifyCmd": "$TC -s actions show action frer index 10",
+    "matchPattern": "ref [2-9][0-9]*",
+    "matchCount": "1",
+    "teardown": [
+      "$TC qdisc del dev frer_a clsact",
+      "$TC qdisc del dev frer_b clsact",
+      "$TC actions flush action frer",
+      "ip link del frer_a",
+      "ip link del frer_b"
+    ]
+  },
+  {
+    "id": "frer_0030",
+    "name": "frer push action refcount increments when bound to filter",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "ip link del frer_dummy 2>/dev/null || true",
+      "ip link add frer_dummy type dummy",
+      "ip link set frer_dummy up",
+      "$TC qdisc add dev frer_dummy clsact",
+      "$TC actions add action frer push index 1"
+    ],
+    "cmdUnderTest": "$TC filter add dev frer_dummy egress protocol ip flower skip_hw action frer push index 1",
+    "expExitCode": "0",
+    "verifyCmd": "$TC -s actions show action frer index 1",
+    "matchPattern": "ref [2-9][0-9]*",
+    "matchCount": "1",
+    "teardown": [
+      "$TC qdisc del dev frer_dummy clsact",
+      "$TC actions flush action frer",
+      "ip link del frer_dummy"
+    ]
+  },
+  {
+    "id": "frer_0031",
+    "name": "frer push output shows not_in_hw flag",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "$TC actions add action frer push index 1"
+    ],
+    "cmdUnderTest": "$TC actions show action frer index 1",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 1",
+    "matchPattern": "not_in_hw",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0032",
+    "name": "frer recover output shows not_in_hw flag",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "$TC actions add action frer recover index 10"
+    ],
+    "cmdUnderTest": "$TC actions show action frer index 10",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 10",
+    "matchPattern": "not_in_hw",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  }
+]
-- 
2.17.1


^ permalink raw reply related

* [PATCH net-next 2/6] uapi: pkt_cls: add TCA_ID_FRER action identifier
From: Xiaoliang Yang @ 2026-06-22  9:21 UTC (permalink / raw)
  To: netdev, linux-kernel, linux-kselftest
  Cc: davem, edumazet, kuba, pabeni, jhs, jiri, horms, shuah,
	vladimir.oltean, vinicius.gomes, fejes, xiaoliang.yang_1
In-Reply-To: <20260622092118.6846-1-xiaoliang.yang_1@nxp.com>

Register TCA_ID_FRER in the global tc action ID enum so that the FRER
tc action can be identified uniquely among all tc actions.

Signed-off-by: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
---
 include/uapi/linux/pkt_cls.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 28d94b11d1aa..9b87f0455110 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -139,6 +139,7 @@ enum tca_id {
 	TCA_ID_MPLS,
 	TCA_ID_CT,
 	TCA_ID_GATE,
+	TCA_ID_FRER,
 	/* other actions go here */
 	__TCA_ID_MAX = 255
 };
-- 
2.17.1


^ permalink raw reply related

* [PATCH net-next 4/6] net: sched: act_frer: add FRER tc action
From: Xiaoliang Yang @ 2026-06-22  9:21 UTC (permalink / raw)
  To: netdev, linux-kernel, linux-kselftest
  Cc: davem, edumazet, kuba, pabeni, jhs, jiri, horms, shuah,
	vladimir.oltean, vinicius.gomes, fejes, xiaoliang.yang_1
In-Reply-To: <20260622092118.6846-1-xiaoliang.yang_1@nxp.com>

Introduce the FRER tc action for IEEE 802.1CB.  This patch adds the
module skeleton, the shared sequence-generator infrastructure, the
TCA_FRER_FUNC_PUSH data path, and the TCA_FRER_FUNC_RECOVER data path.

Sequence generation (IEEE 802.1CB Section 7.4.1):
  Each push action embeds a struct frer_seqgen directly in tcf_frer,
  protected by a per-action spinlock.  The sequence counter wraps at
  65536 (16-bit R-TAG field).  When a Talker chains "action frer push"
  with "action mirred egress mirror", both the primary and the mirrored
  frame carry the same R-TAG because mirred copies the already-modified
  skb.  No changes to act_mirred are required (Split function,
  Section 7.7).

Sequence Recovery vs. Individual Recovery (IEEE 802.1CB Section 7.5):

  Sequence Recovery (cross-port deduplication):
    Multiple ingress filters on different ports share one recover
    action by referencing the same action index.  They all operate on
    the same struct frer_rcvy embedded in that tcf_frer instance and
    protected by a spinlock.  A frame arriving on any port is checked
    against the shared sequence history; the first copy passes and all
    later copies with the same sequence number are discarded.

  Individual Recovery (per-port independent deduplication):
    Each action uses its own frer_rcvy embedded directly in tcf_frer.
    Selected when the user sets the "individual" flag.

Recovery algorithms:
  Vector (7.4.3.4, default): 32-bit history bit-vector, handles
    out-of-order delivery within the window.
  Match (7.4.3.5): remembers only the last accepted sequence number.

Reset timer:
  An hrtimer fires after frerSeqRcvyResetMSec ms of inactivity.
  CLOCK_MONOTONIC is used throughout.  The reset runs in a workqueue
  to avoid holding the spinlock in the hrtimer callback.

R-TAG wire format (IEEE 802.1CB 7.8, EtherType 0xF1C1):
  [Dst MAC 6B][Src MAC 6B][Optional 802.1Q tag 4B][0xF1C1 2B]
  [Reserved 2B][Sequence Number 2B][Encapsulated EtherType 2B][Payload]

Signed-off-by: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
---
 include/net/flow_offload.h   |  11 +
 include/net/tc_act/tc_frer.h |  71 +++
 net/sched/Kconfig            |  16 +
 net/sched/Makefile           |   1 +
 net/sched/act_frer.c         | 835 +++++++++++++++++++++++++++++++++++
 5 files changed, 934 insertions(+)
 create mode 100644 include/net/tc_act/tc_frer.h
 create mode 100644 net/sched/act_frer.c

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 70a02ee14308..8d97a5f293e6 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -184,6 +184,7 @@ enum flow_action_id {
 	FLOW_ACTION_VLAN_PUSH_ETH,
 	FLOW_ACTION_VLAN_POP_ETH,
 	FLOW_ACTION_CONTINUE,
+	FLOW_ACTION_FRER,
 	NUM_FLOW_ACTIONS,
 };
 
@@ -329,6 +330,16 @@ struct flow_action_entry {
 		struct {				/* FLOW_ACTION_PPPOE_PUSH */
 			u16		sid;
 		} pppoe;
+		struct {                                /* FLOW_ACTION_FRER */
+			u8		func;
+			u8		tag_type;
+			bool		individual;
+			u8		rcvy_alg;
+			u8		rcvy_history_len;
+			u32		rcvy_reset_msec;
+			bool		tag_pop;
+			bool		take_no_seq;
+		} frer;
 	};
 	struct flow_action_cookie *user_cookie; /* user defined action cookie */
 };
diff --git a/include/net/tc_act/tc_frer.h b/include/net/tc_act/tc_frer.h
new file mode 100644
index 000000000000..5f6f8ca70813
--- /dev/null
+++ b/include/net/tc_act/tc_frer.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Copyright 2026 NXP */
+
+#ifndef __NET_TC_FRER_H
+#define __NET_TC_FRER_H
+
+#include <net/act_api.h>
+#include <linux/tc_act/tc_frer.h>
+
+/**
+ * struct frer_seqgen - sequence number generator state (embedded in tcf_frer)
+ */
+struct frer_seqgen {
+	u32		gen_seq_num;
+	u64		seq_space;	/* 1 << 16 */
+	spinlock_t	lock;		/* protects frer_seqgen state */
+	u64		stats_pkts;	/* frerCpsSeqGenPackets */
+};
+
+/**
+ * struct frer_rcvy - sequence recovery state (embedded in tcf_frer)
+ *
+ */
+struct frer_rcvy {
+	u8		alg;
+	u8		history_len;	/* 1-32 */
+	u32		reset_msec;
+	u64		seq_space;
+	u32		rcvy_seq_num;
+	u32		seq_history;
+	bool		take_any;
+	bool		take_no_seq;
+	struct hrtimer	hrtimer;
+	spinlock_t	lock;		/* protects frer_rcvy state */
+	/* statistics */
+	u64		stats_tagless_pkts;
+	u64		stats_out_of_order_pkts;
+	u64		stats_rogue_pkts;
+	u64		stats_lost_pkts;
+	u64		stats_resets;
+	u64		stats_passed_pkts;
+	u64		stats_discarded_pkts;
+};
+
+/**
+ * struct tcf_frer - per tc_action FRER private data
+ */
+struct tcf_frer {
+	struct tc_action	common;
+	u8			func;
+	u8			tag_type;
+	bool			tag_pop;
+	bool			individual;	/* Individual Recovery flag */
+	/* push path */
+	struct frer_seqgen	seqgen;
+	/* recover path */
+	struct frer_rcvy	rcvy;
+};
+
+#define to_frer(a) ((struct tcf_frer *)(a))
+
+static inline bool is_tcf_frer(const struct tc_action *a)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	if (a->ops && a->ops->id == TCA_ID_FRER)
+		return true;
+#endif
+	return false;
+}
+
+#endif /* __NET_TC_FRER_H */
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 6ddff028b81a..7ca79b3eb5b3 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -939,6 +939,22 @@ config NET_ACT_GATE
 	  To compile this code as a module, choose M here: the
 	  module will be called act_gate.
 
+config NET_ACT_FRER
+	tristate "IEEE 802.1CB FRER tc action"
+	depends on NET_CLS_ACT
+	help
+	  Say Y here to enable the IEEE 802.1CB FRER tc action.  The action
+	  implements the Sequence Generation Function (egress R-TAG insertion
+	  with shared per-stream sequence counter) and the Sequence Recovery
+	  Function (ingress duplicate detection and elimination) described in
+	  IEEE 802.1CB-2017.
+
+	  Both Sequence Recovery (cross-port shared state via rcvy-id) and
+	  Individual Recovery (per-port independent state) are supported.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_frer.
+
 config NET_IFE_SKBMARK
 	tristate "Support to encoding decoding skb mark on IFE action"
 	depends on NET_ACT_IFE
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 5078ea84e6ad..d9f60434e7d7 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_NET_IFE_SKBTCINDEX)	+= act_meta_skbtcindex.o
 obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o
 obj-$(CONFIG_NET_ACT_CT)	+= act_ct.o
 obj-$(CONFIG_NET_ACT_GATE)	+= act_gate.o
+obj-$(CONFIG_NET_ACT_FRER)	+= act_frer.o
 obj-$(CONFIG_NET_SCH_FIFO)	+= sch_fifo.o
 obj-$(CONFIG_NET_SCH_HTB)	+= sch_htb.o
 obj-$(CONFIG_NET_SCH_HFSC)	+= sch_hfsc.o
diff --git a/net/sched/act_frer.c b/net/sched/act_frer.c
new file mode 100644
index 000000000000..7b6db643788d
--- /dev/null
+++ b/net/sched/act_frer.c
@@ -0,0 +1,835 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Copyright 2026 NXP */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/if_vlan.h>
+#include <linux/hrtimer.h>
+#include <linux/workqueue.h>
+#include <net/act_api.h>
+#include <net/netlink.h>
+#include <net/pkt_cls.h>
+#include <net/tc_act/tc_frer.h>
+
+/* ------------------------------------------------------------------ */
+/* R-TAG wire structures (IEEE 802.1CB 7.8)                          */
+/* ------------------------------------------------------------------ */
+
+struct r_tag {
+	__be16 reserved;
+	__be16 sequence_nr;
+	__be16 encap_proto;
+} __packed;
+
+static struct tc_action_ops act_frer_ops;
+
+/* ------------------------------------------------------------------ */
+/* Recovery reset machinery                                            */
+/* ------------------------------------------------------------------ */
+
+struct frer_rcvy_work {
+	struct work_struct	work;
+	struct frer_rcvy	*rcvy;
+};
+
+static void frer_rcvy_reset(struct frer_rcvy *rcvy)
+{
+	if (rcvy->alg == TCA_FRER_RCVY_VECTOR_ALG) {
+		rcvy->rcvy_seq_num = (u32)(rcvy->seq_space - 1);
+		rcvy->seq_history  = 0;
+	}
+	rcvy->take_any = true;
+	rcvy->stats_resets++;
+}
+
+static void frer_rcvy_reset_work_fn(struct work_struct *work)
+{
+	struct frer_rcvy_work *rw =
+		container_of(work, struct frer_rcvy_work, work);
+	struct frer_rcvy *rcvy = rw->rcvy;
+
+	spin_lock_bh(&rcvy->lock);
+	frer_rcvy_reset(rcvy);
+	spin_unlock_bh(&rcvy->lock);
+	kfree(rw);
+}
+
+static enum hrtimer_restart frer_rcvy_hrtimer_fn(struct hrtimer *timer)
+{
+	struct frer_rcvy *rcvy =
+		container_of(timer, struct frer_rcvy, hrtimer);
+	struct frer_rcvy_work *rw;
+
+	/* Allocate in GFP_ATOMIC context; if it fails the state is not
+	 * reset this cycle - the next frame will attempt again.
+	 */
+	rw = kmalloc_obj(*rw);
+	if (rw) {
+		INIT_WORK(&rw->work, frer_rcvy_reset_work_fn);
+		rw->rcvy = rcvy;
+		schedule_work(&rw->work);
+	}
+	return HRTIMER_NORESTART;
+}
+
+static void frer_rcvy_timer_restart(struct frer_rcvy *rcvy)
+{
+	if (rcvy->reset_msec)
+		hrtimer_start(&rcvy->hrtimer,
+			      ms_to_ktime(rcvy->reset_msec),
+			      HRTIMER_MODE_REL_SOFT);
+}
+
+static void frer_rcvy_init_state(struct frer_rcvy *rcvy, u8 alg,
+				 u8 history_len, u32 reset_msec,
+				 bool take_no_seq)
+{
+	rcvy->alg          = alg;
+	rcvy->history_len  = history_len;
+	rcvy->reset_msec   = reset_msec;
+	rcvy->seq_space    = 1 << 16;
+	rcvy->take_no_seq  = take_no_seq;
+	rcvy->take_any     = true;
+	rcvy->rcvy_seq_num = (u32)(rcvy->seq_space - 1);
+	rcvy->seq_history  = 0;
+	spin_lock_init(&rcvy->lock);
+	hrtimer_setup(&rcvy->hrtimer, frer_rcvy_hrtimer_fn, CLOCK_MONOTONIC,
+		      HRTIMER_MODE_REL_SOFT);
+}
+
+/* ------------------------------------------------------------------ */
+/* R-TAG helpers                                                       */
+/* ------------------------------------------------------------------ */
+
+static int frer_rtag_push(struct sk_buff *skb, u16 seq_num)
+{
+	unsigned char *new_mac_header;
+	unsigned int data_offset;
+	unsigned int head_len;
+	struct vlan_ethhdr *vh;
+	struct ethhdr *eh;
+	struct r_tag *rtag;
+	__be16 *proto_ptr;
+	__be16 saved_proto;
+
+	if (!skb_mac_header_was_set(skb))
+		return -EINVAL;
+
+	data_offset = skb->data - skb_mac_header(skb);
+
+	if (skb_cow_head(skb, data_offset + sizeof(*rtag)))
+		return -ENOMEM;
+
+	if (data_offset > 0)
+		skb_push(skb, data_offset);
+
+	eh = eth_hdr(skb);
+	if (eth_type_vlan(eh->h_proto)) {
+		if (!pskb_may_pull(skb, sizeof(*vh)))
+			return -EINVAL;
+		eh = eth_hdr(skb);
+		vh = (struct vlan_ethhdr *)eh;
+		proto_ptr = &vh->h_vlan_encapsulated_proto;
+		head_len = sizeof(*vh);
+	} else {
+		if (!pskb_may_pull(skb, sizeof(*eh)))
+			return -EINVAL;
+		eh = eth_hdr(skb);
+		proto_ptr = &eh->h_proto;
+		head_len = sizeof(*eh);
+	}
+
+	saved_proto = *proto_ptr;
+	*proto_ptr = htons(ETH_P_RTAG);
+
+	skb_push(skb, sizeof(*rtag));
+	skb_reset_mac_header(skb);
+
+	new_mac_header = skb_mac_header(skb);
+	memmove(new_mac_header, (unsigned char *)eh, head_len);
+
+	skb->protocol = htons(ETH_P_RTAG);
+	skb_set_network_header(skb, head_len);
+	if (data_offset > 0)
+		skb_pull(skb, data_offset);
+
+	/* Write R-TAG after the Ethernet / VLAN header */
+	rtag = (struct r_tag *)(new_mac_header + head_len);
+	rtag->reserved    = 0;
+	rtag->sequence_nr = htons(seq_num);
+	rtag->encap_proto = saved_proto;
+
+	return 0;
+}
+
+static void frer_rtag_pop(struct sk_buff *skb)
+{
+	unsigned char *new_mac_header;
+	unsigned int data_offset;
+	unsigned int head_len;
+	struct vlan_ethhdr *vh;
+	struct ethhdr *eh;
+	struct r_tag *rtag;
+	__be16 *proto_ptr;
+
+	data_offset = skb->data - skb_mac_header(skb);
+	if (data_offset > 0)
+		skb_push(skb, data_offset);
+
+	eh = eth_hdr(skb);
+	if (eth_type_vlan(eh->h_proto)) {
+		vh = (struct vlan_ethhdr *)eh;
+		proto_ptr = &vh->h_vlan_encapsulated_proto;
+		head_len = sizeof(*vh);
+	} else {
+		proto_ptr = &eh->h_proto;
+		head_len = sizeof(*eh);
+	}
+
+	if (*proto_ptr != htons(ETH_P_RTAG))
+		return;
+
+	rtag = (struct r_tag *)((unsigned char *)eh + head_len);
+	*proto_ptr = rtag->encap_proto;
+
+	skb->protocol = rtag->encap_proto;
+
+	skb_postpull_rcsum(skb, rtag, sizeof(struct r_tag));
+	skb_pull(skb, sizeof(*rtag));
+	skb_reset_mac_header(skb);
+
+	new_mac_header = skb_mac_header(skb);
+	memmove(new_mac_header, (unsigned char *)eh, head_len);
+
+	skb_set_network_header(skb, head_len);
+	if (data_offset > 0)
+		skb_pull(skb, data_offset);
+}
+
+static int frer_rtag_decode(struct sk_buff *skb, int *seq)
+{
+	unsigned int data_offset;
+	struct vlan_ethhdr *vh;
+	unsigned int head_len;
+	struct ethhdr *eh;
+	struct r_tag *rtag;
+	__be16 *proto_ptr;
+
+	if (!skb_mac_header_was_set(skb))
+		return -EINVAL;
+
+	data_offset = skb->data - skb_mac_header(skb);
+
+	if (skb_cow_head(skb, data_offset))
+		return -ENOMEM;
+
+	if (data_offset > 0)
+		skb_push(skb, data_offset);
+
+	eh = eth_hdr(skb);
+	if (eth_type_vlan(eh->h_proto)) {
+		if (!pskb_may_pull(skb, sizeof(*vh) + sizeof(*rtag)))
+			return -EINVAL;
+		eh = eth_hdr(skb);
+		vh = (struct vlan_ethhdr *)eh;
+		proto_ptr = &vh->h_vlan_encapsulated_proto;
+		head_len = sizeof(*vh);
+	} else {
+		if (!pskb_may_pull(skb, sizeof(*eh) + sizeof(*rtag)))
+			return -EINVAL;
+		eh = eth_hdr(skb);
+		proto_ptr = &eh->h_proto;
+		head_len = sizeof(*eh);
+	}
+
+	if (data_offset > 0)
+		skb_pull(skb, data_offset);
+
+	if (*proto_ptr != htons(ETH_P_RTAG)) {
+		*seq = -1;
+		return 0;
+	}
+
+	rtag = (struct r_tag *)((unsigned char *)eh + head_len);
+
+	*seq = (int)ntohs(rtag->sequence_nr);
+
+	return 0;
+}
+
+/* ------------------------------------------------------------------ */
+/* Recovery algorithms (called with rcvy->lock held)                  */
+/* ------------------------------------------------------------------ */
+
+/* Returns true = pass frame, false = discard frame.
+ * @individual: when true, restart the reset timer even on discarded frames
+ *   (rogue/duplicate), as required for Individual Recovery (IEEE 802.1CB 7.5).
+ */
+static bool frer_vector_alg(struct frer_rcvy *rcvy, int seq, bool individual)
+{
+	int delta;
+	bool restart_timer = false;
+	bool pass;
+
+	if (seq < 0) {
+		/* No R-TAG present */
+		rcvy->stats_tagless_pkts++;
+		if (rcvy->take_no_seq) {
+			restart_timer = true;
+			pass = true;
+		} else {
+			pass = false;
+		}
+		goto out;
+	}
+
+	if (rcvy->take_any) {
+		/* First frame after reset: accept unconditionally */
+		rcvy->take_any     = false;
+		rcvy->rcvy_seq_num = (u32)seq;
+		rcvy->seq_history  = BIT(0);
+		restart_timer = true;
+		pass = true;
+		goto out;
+	}
+
+	delta = (seq - (int)rcvy->rcvy_seq_num) &
+		(int)(rcvy->seq_space - 1);
+	/* Map delta > seq_space/2 to negative (signed wrap) */
+	if ((u32)delta & (u32)(rcvy->seq_space / 2))
+		delta -= (int)rcvy->seq_space;
+
+	if (delta >= (int)rcvy->history_len ||
+	    delta <= -(int)rcvy->history_len) {
+		/* Packet is out-of-range (rogue). */
+		rcvy->stats_rogue_pkts++;
+		if (individual)
+			restart_timer = true;
+		pass = false;
+		goto out;
+	}
+
+	if (delta <= 0) {
+		/* Packet is old: check whether already seen. */
+		if (rcvy->seq_history & BIT(-delta)) {
+			if (individual)
+				restart_timer = true;
+			/* Already received */
+			pass = false;
+		} else {
+			/* Out-of-order but not yet seen */
+			rcvy->seq_history |= BIT(-delta);
+			rcvy->stats_out_of_order_pkts++;
+			restart_timer = true;
+			pass = true;
+		}
+		goto out;
+	}
+
+	/* delta > 0: frame is newer than expected */
+	if (delta != 1)
+		rcvy->stats_out_of_order_pkts++;
+
+	/* Shift history forward, counting any gaps as lost */
+	while (--delta) {
+		if (!(rcvy->seq_history & BIT(rcvy->history_len - 1)))
+			rcvy->stats_lost_pkts++;
+		rcvy->seq_history <<= 1;
+	}
+	if (!(rcvy->seq_history & BIT(rcvy->history_len - 1)))
+		rcvy->stats_lost_pkts++;
+	rcvy->seq_history = (rcvy->seq_history << 1) | BIT(0);
+	rcvy->rcvy_seq_num = (u32)seq;
+	restart_timer = true;
+	pass = true;
+
+out:
+	if (restart_timer)
+		frer_rcvy_timer_restart(rcvy);
+	return pass;
+}
+
+static bool frer_match_alg(struct frer_rcvy *rcvy, int seq, bool individual)
+{
+	if (seq < 0) {
+		/* No R-TAG: Match alg cannot deduplicate, always pass. */
+		rcvy->stats_tagless_pkts++;
+		return true;
+	}
+
+	if (rcvy->take_any) {
+		rcvy->take_any     = false;
+		rcvy->rcvy_seq_num = (u32)seq;
+		frer_rcvy_timer_restart(rcvy);
+		return true;
+	}
+
+	if ((u32)seq == rcvy->rcvy_seq_num) {
+		/* Duplicate */
+		if (individual)
+			frer_rcvy_timer_restart(rcvy);
+		return false;
+	}
+
+	/* New sequence number: accept and update */
+	if ((u32)seq != ((rcvy->rcvy_seq_num + 1) % rcvy->seq_space))
+		rcvy->stats_out_of_order_pkts++;
+	rcvy->rcvy_seq_num = (u32)seq;
+	frer_rcvy_timer_restart(rcvy);
+	return true;
+}
+
+/* ------------------------------------------------------------------ */
+/* Netlink policy                                                      */
+/* ------------------------------------------------------------------ */
+
+static const struct nla_policy frer_policy[TCA_FRER_MAX + 1] = {
+	[TCA_FRER_PARMS]            = NLA_POLICY_EXACT_LEN(sizeof(struct tc_frer)),
+	[TCA_FRER_FUNC]             = { .type = NLA_U8 },
+	[TCA_FRER_TAG_TYPE]         = { .type = NLA_U8 },
+	[TCA_FRER_RCVY_INDIVIDUAL]  = { .type = NLA_FLAG },
+	[TCA_FRER_RCVY_ALG]         = { .type = NLA_U8 },
+	[TCA_FRER_RCVY_HISTORY_LEN] = NLA_POLICY_RANGE(NLA_U8, 1, 32),
+	[TCA_FRER_RCVY_RESET_MSEC]  = { .type = NLA_U32 },
+	[TCA_FRER_RCVY_TAKE_NO_SEQ] = { .type = NLA_FLAG },
+	[TCA_FRER_RCVY_TAG_POP]     = { .type = NLA_FLAG },
+};
+
+/* ------------------------------------------------------------------ */
+/* Action init                                                         */
+/* ------------------------------------------------------------------ */
+
+static int tcf_frer_init(struct net *net, struct nlattr *nla,
+			 struct nlattr *est, struct tc_action **a,
+			 struct tcf_proto *tp, u32 flags,
+			 struct netlink_ext_ack *extack)
+{
+	struct tc_action_net *tn = net_generic(net, act_frer_ops.net_id);
+	bool bind = flags & TCA_ACT_FLAGS_BIND;
+	struct nlattr *tb[TCA_FRER_MAX + 1];
+	struct tcf_chain *goto_ch = NULL;
+	struct tcf_frer *f;
+	struct tc_frer *parm;
+	bool exists = false;
+	int ret = 0, err, index;
+	u8 func, tag_type;
+
+	if (!nla) {
+		NL_SET_ERR_MSG_MOD(extack, "frer: attributes required");
+		return -EINVAL;
+	}
+
+	err = nla_parse_nested(tb, TCA_FRER_MAX, nla, frer_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_FRER_PARMS]) {
+		NL_SET_ERR_MSG_MOD(extack, "frer: TCA_FRER_PARMS missing");
+		return -EINVAL;
+	}
+	if (!tb[TCA_FRER_FUNC]) {
+		NL_SET_ERR_MSG_MOD(extack, "frer: TCA_FRER_FUNC missing");
+		return -EINVAL;
+	}
+	if (!tb[TCA_FRER_TAG_TYPE]) {
+		NL_SET_ERR_MSG_MOD(extack, "frer: TCA_FRER_TAG_TYPE missing");
+		return -EINVAL;
+	}
+
+	func     = nla_get_u8(tb[TCA_FRER_FUNC]);
+	tag_type = nla_get_u8(tb[TCA_FRER_TAG_TYPE]);
+
+	if (func != TCA_FRER_FUNC_PUSH && func != TCA_FRER_FUNC_RECOVER) {
+		NL_SET_ERR_MSG_MOD(extack, "frer: unknown func");
+		return -EINVAL;
+	}
+	if (tag_type != TCA_FRER_TAG_RTAG) {
+		NL_SET_ERR_MSG_MOD(extack, "frer: only rtag supported");
+		return -EOPNOTSUPP;
+	}
+
+	parm  = nla_data(tb[TCA_FRER_PARMS]);
+	index = parm->index;
+
+	err = tcf_idr_check_alloc(tn, &index, a, bind);
+	if (err < 0)
+		return err;
+	exists = err;
+
+	if (exists && bind)
+		return ACT_P_BOUND;
+
+	if (!exists) {
+		ret = tcf_idr_create_from_flags(tn, index, est, a,
+						&act_frer_ops, bind, flags);
+		if (ret) {
+			tcf_idr_cleanup(tn, index);
+			return ret;
+		}
+		ret = ACT_P_CREATED;
+	} else if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
+		tcf_idr_release(*a, bind);
+		return -EEXIST;
+	}
+
+	err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+	if (err < 0)
+		goto release_idr;
+
+	f = to_frer(*a);
+
+	spin_lock_bh(&f->tcf_lock);
+	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+	f->func     = func;
+	f->tag_type = tag_type;
+	f->tag_pop  = !!tb[TCA_FRER_RCVY_TAG_POP];
+
+	if (func == TCA_FRER_FUNC_PUSH) {
+		if (ret == ACT_P_CREATED) {
+			spin_lock_init(&f->seqgen.lock);
+			f->seqgen.seq_space = 1 << 16;
+		}
+		/* gen_seq_num starts at 0 on creation; preserved on replace */
+	} else {
+		u8 alg = tb[TCA_FRER_RCVY_ALG] ?
+			 nla_get_u8(tb[TCA_FRER_RCVY_ALG]) :
+			 TCA_FRER_RCVY_VECTOR_ALG;
+		u8 history_len = tb[TCA_FRER_RCVY_HISTORY_LEN] ?
+				 nla_get_u8(tb[TCA_FRER_RCVY_HISTORY_LEN]) : 32;
+		u32 reset_msec = tb[TCA_FRER_RCVY_RESET_MSEC] ?
+				 nla_get_u32(tb[TCA_FRER_RCVY_RESET_MSEC]) : 0;
+		bool take_no_seq = !!tb[TCA_FRER_RCVY_TAKE_NO_SEQ];
+
+		if (alg != TCA_FRER_RCVY_VECTOR_ALG &&
+		    alg != TCA_FRER_RCVY_MATCH_ALG) {
+			spin_unlock_bh(&f->tcf_lock);
+			NL_SET_ERR_MSG_MOD(extack, "frer: unknown recovery algorithm");
+			err = -EINVAL;
+			goto release_idr;
+		}
+
+		f->individual = !!tb[TCA_FRER_RCVY_INDIVIDUAL];
+
+		/* Cancel any running reset timer before re-initialising. */
+		if (ret != ACT_P_CREATED && f->rcvy.reset_msec) {
+			spin_unlock_bh(&f->tcf_lock);
+			hrtimer_cancel(&f->rcvy.hrtimer);
+			spin_lock_bh(&f->tcf_lock);
+		}
+
+		frer_rcvy_init_state(&f->rcvy, alg, history_len,
+				     reset_msec, take_no_seq);
+	}
+
+	spin_unlock_bh(&f->tcf_lock);
+
+	if (goto_ch)
+		tcf_chain_put_by_act(goto_ch);
+
+	return ret;
+
+release_idr:
+	tcf_idr_release(*a, bind);
+	return err;
+}
+
+/* ------------------------------------------------------------------ */
+/* Data path                                                           */
+/* ------------------------------------------------------------------ */
+
+static int tcf_frer_act(struct sk_buff *skb, const struct tc_action *a,
+			struct tcf_result *res)
+{
+	struct tcf_frer *f = to_frer(a);
+	int retval;
+
+	tcf_lastuse_update(&f->tcf_tm);
+	tcf_action_update_bstats(&f->common, skb);
+	retval = READ_ONCE(f->tcf_action);
+
+	if (f->func == TCA_FRER_FUNC_PUSH) {
+		struct frer_seqgen *sg = &f->seqgen;
+		u16 seq;
+
+		spin_lock(&sg->lock);
+		seq = (u16)sg->gen_seq_num;
+		if (++sg->gen_seq_num >= sg->seq_space)
+			sg->gen_seq_num = 0;
+		sg->stats_pkts++;
+		spin_unlock(&sg->lock);
+
+		if (frer_rtag_push(skb, seq) < 0) {
+			tcf_action_inc_drop_qstats(&f->common);
+			return TC_ACT_SHOT;
+		}
+	} else {
+		struct frer_rcvy *rcvy = &f->rcvy;
+		bool pass;
+		int seq;
+
+		if (frer_rtag_decode(skb, &seq) < 0) {
+			tcf_action_inc_drop_qstats(&f->common);
+			return TC_ACT_SHOT;
+		}
+
+		spin_lock(&rcvy->lock);
+		if (rcvy->alg == TCA_FRER_RCVY_VECTOR_ALG)
+			pass = frer_vector_alg(rcvy, seq, f->individual);
+		else
+			pass = frer_match_alg(rcvy, seq, f->individual);
+
+		if (pass) {
+			rcvy->stats_passed_pkts++;
+			spin_unlock(&rcvy->lock);
+			if (f->tag_pop)
+				frer_rtag_pop(skb);
+			return retval;
+		}
+
+		rcvy->stats_discarded_pkts++;
+		spin_unlock(&rcvy->lock);
+		return TC_ACT_SHOT;
+	}
+
+	return retval;
+}
+
+/* ------------------------------------------------------------------ */
+/* Dump                                                                */
+/* ------------------------------------------------------------------ */
+
+static int tcf_frer_dump(struct sk_buff *skb, struct tc_action *a,
+			 int bind, int ref)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_frer *f = to_frer(a);
+	struct tc_frer opt = {
+		.index   = f->tcf_index,
+		.refcnt  = refcount_read(&f->tcf_refcnt) - ref,
+		.bindcnt = atomic_read(&f->tcf_bindcnt) - bind,
+	};
+	struct tcf_t t;
+
+	spin_lock_bh(&f->tcf_lock);
+	opt.action = f->tcf_action;
+
+	if (nla_put(skb, TCA_FRER_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+	if (nla_put_u8(skb, TCA_FRER_FUNC, f->func))
+		goto nla_put_failure;
+	if (nla_put_u8(skb, TCA_FRER_TAG_TYPE, f->tag_type))
+		goto nla_put_failure;
+	if (f->tag_pop && nla_put_flag(skb, TCA_FRER_RCVY_TAG_POP))
+		goto nla_put_failure;
+
+	if (f->func == TCA_FRER_FUNC_PUSH) {
+		spin_lock(&f->seqgen.lock);
+		if (nla_put_u64_64bit(skb, TCA_FRER_STATS_SEQGEN_PKTS,
+				      f->seqgen.stats_pkts, TCA_FRER_PAD)) {
+			spin_unlock(&f->seqgen.lock);
+			goto nla_put_failure;
+		}
+		spin_unlock(&f->seqgen.lock);
+	} else {
+		u64 tagless, ooo, rogue, lost, resets, passed, discarded;
+		struct frer_rcvy *rcvy = &f->rcvy;
+
+		spin_lock(&rcvy->lock);
+		tagless    = rcvy->stats_tagless_pkts;
+		ooo        = rcvy->stats_out_of_order_pkts;
+		rogue      = rcvy->stats_rogue_pkts;
+		lost       = rcvy->stats_lost_pkts;
+		resets     = rcvy->stats_resets;
+		passed     = rcvy->stats_passed_pkts;
+		discarded  = rcvy->stats_discarded_pkts;
+		spin_unlock(&rcvy->lock);
+
+		if (f->individual && nla_put_flag(skb, TCA_FRER_RCVY_INDIVIDUAL))
+			goto nla_put_failure;
+		if (nla_put_u8(skb, TCA_FRER_RCVY_ALG, rcvy->alg))
+			goto nla_put_failure;
+		if (nla_put_u8(skb, TCA_FRER_RCVY_HISTORY_LEN, rcvy->history_len))
+			goto nla_put_failure;
+		if (nla_put_u32(skb, TCA_FRER_RCVY_RESET_MSEC, rcvy->reset_msec))
+			goto nla_put_failure;
+		if (rcvy->take_no_seq && nla_put_flag(skb, TCA_FRER_RCVY_TAKE_NO_SEQ))
+			goto nla_put_failure;
+		if (nla_put_u64_64bit(skb, TCA_FRER_STATS_TAGLESS_PKTS,
+				      tagless, TCA_FRER_PAD))
+			goto nla_put_failure;
+		if (nla_put_u64_64bit(skb, TCA_FRER_STATS_OUT_OF_ORDER_PKTS,
+				      ooo, TCA_FRER_PAD))
+			goto nla_put_failure;
+		if (nla_put_u64_64bit(skb, TCA_FRER_STATS_ROGUE_PKTS,
+				      rogue, TCA_FRER_PAD))
+			goto nla_put_failure;
+		if (nla_put_u64_64bit(skb, TCA_FRER_STATS_LOST_PKTS,
+				      lost, TCA_FRER_PAD))
+			goto nla_put_failure;
+		if (nla_put_u64_64bit(skb, TCA_FRER_STATS_RESETS,
+				      resets, TCA_FRER_PAD))
+			goto nla_put_failure;
+		if (nla_put_u64_64bit(skb, TCA_FRER_STATS_PASSED_PKTS,
+				      passed, TCA_FRER_PAD))
+			goto nla_put_failure;
+		if (nla_put_u64_64bit(skb, TCA_FRER_STATS_DISCARDED_PKTS,
+				      discarded, TCA_FRER_PAD))
+			goto nla_put_failure;
+	}
+
+	tcf_tm_dump(&t, &f->tcf_tm);
+	if (nla_put_64bit(skb, TCA_FRER_TM, sizeof(t), &t, TCA_FRER_PAD))
+		goto nla_put_failure;
+
+	spin_unlock_bh(&f->tcf_lock);
+	return skb->len;
+
+nla_put_failure:
+	spin_unlock_bh(&f->tcf_lock);
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+/* ------------------------------------------------------------------ */
+/* Cleanup                                                             */
+/* ------------------------------------------------------------------ */
+
+static void tcf_frer_cleanup(struct tc_action *a)
+{
+	struct tcf_frer *f = to_frer(a);
+
+	if (f->func == TCA_FRER_FUNC_RECOVER)
+		hrtimer_cancel(&f->rcvy.hrtimer);
+}
+
+/* ------------------------------------------------------------------ */
+/* Walker / search / stats / fill-size / offload                      */
+/* ------------------------------------------------------------------ */
+
+static int tcf_frer_walker(struct net *net, struct sk_buff *skb,
+			   struct netlink_callback *cb, int type,
+			   const struct tc_action_ops *ops,
+			   struct netlink_ext_ack *extack)
+{
+	struct tc_action_net *tn = net_generic(net, act_frer_ops.net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
+}
+
+static void tcf_frer_stats_update(struct tc_action *a, u64 bytes, u64 packets,
+				  u64 drops, u64 lastuse, bool hw)
+{
+	struct tcf_frer *f = to_frer(a);
+	struct tcf_t *tm = &f->tcf_tm;
+
+	tcf_action_update_stats(a, bytes, packets, drops, hw);
+	tm->lastuse = max_t(u64, tm->lastuse, lastuse);
+}
+
+static size_t tcf_frer_get_fill_size(const struct tc_action *act)
+{
+	return nla_total_size(sizeof(struct tc_frer)) /* TCA_FRER_PARMS */
+		+ nla_total_size(sizeof(u8)) /* TCA_FRER_FUNC */
+		+ nla_total_size(sizeof(u8)) /* TCA_FRER_TAG_TYPE */
+		+ nla_total_size(0) /* TCA_FRER_RCVY_TAG_POP (flag) */
+		+ nla_total_size(0) /* TCA_FRER_RCVY_INDIVIDUAL (flag) */
+		+ nla_total_size(sizeof(u8)) /* TCA_FRER_RCVY_ALG */
+		+ nla_total_size(sizeof(u8)) /* TCA_FRER_RCVY_HISTORY_LEN */
+		+ nla_total_size(sizeof(u32)) /* TCA_FRER_RCVY_RESET_MSEC */
+		+ nla_total_size(0) /* TCA_FRER_RCVY_TAKE_NO_SEQ (flag) */
+		+ nla_total_size_64bit(sizeof(u64)) /* TCA_FRER_STATS_TAGLESS_PKTS */
+		+ nla_total_size_64bit(sizeof(u64)) /* TCA_FRER_STATS_OUT_OF_ORDER_PKTS */
+		+ nla_total_size_64bit(sizeof(u64)) /* TCA_FRER_STATS_ROGUE_PKTS */
+		+ nla_total_size_64bit(sizeof(u64)) /* TCA_FRER_STATS_LOST_PKTS */
+		+ nla_total_size_64bit(sizeof(u64)) /* TCA_FRER_STATS_RESETS */
+		+ nla_total_size_64bit(sizeof(u64)) /* TCA_FRER_STATS_PASSED_PKTS */
+		+ nla_total_size_64bit(sizeof(u64)) /* TCA_FRER_STATS_DISCARDED_PKTS */
+		+ nla_total_size_64bit(sizeof(struct tcf_t)); /* TCA_FRER_TM */
+}
+
+static int tcf_frer_offload_act_setup(struct tc_action *act, void *entry_data,
+				      u32 *index_inc, bool bind,
+				      struct netlink_ext_ack *extack)
+{
+	if (bind) {
+		struct flow_action_entry *entry = entry_data;
+		struct tcf_frer *f = to_frer(act);
+
+		entry->id            = FLOW_ACTION_FRER;
+		entry->frer.func     = f->func;
+		entry->frer.tag_type = f->tag_type;
+		entry->frer.tag_pop  = f->tag_pop;
+
+		if (f->func != TCA_FRER_FUNC_PUSH) {
+			entry->frer.individual       = f->individual;
+			entry->frer.rcvy_alg         = f->rcvy.alg;
+			entry->frer.rcvy_history_len = f->rcvy.history_len;
+			entry->frer.rcvy_reset_msec  = f->rcvy.reset_msec;
+			entry->frer.take_no_seq      = f->rcvy.take_no_seq;
+		}
+		*index_inc = 1;
+	} else {
+		struct flow_offload_action *fl_action = entry_data;
+
+		fl_action->id = FLOW_ACTION_FRER;
+	}
+	return 0;
+}
+
+/* ------------------------------------------------------------------ */
+/* Module glue                                                         */
+/* ------------------------------------------------------------------ */
+
+static struct tc_action_ops act_frer_ops = {
+	.kind		    = "frer",
+	.id		    = TCA_ID_FRER,
+	.owner		    = THIS_MODULE,
+	.act		    = tcf_frer_act,
+	.init		    = tcf_frer_init,
+	.cleanup	    = tcf_frer_cleanup,
+	.dump		    = tcf_frer_dump,
+	.walk		    = tcf_frer_walker,
+	.stats_update	    = tcf_frer_stats_update,
+	.get_fill_size	    = tcf_frer_get_fill_size,
+	.offload_act_setup  = tcf_frer_offload_act_setup,
+	.size		    = sizeof(struct tcf_frer),
+};
+
+static __net_init int frer_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, act_frer_ops.net_id);
+
+	return tc_action_net_init(net, tn, &act_frer_ops);
+}
+
+static void __net_exit frer_exit_net(struct list_head *net_list)
+{
+	tc_action_net_exit(net_list, act_frer_ops.net_id);
+}
+
+static struct pernet_operations frer_net_ops = {
+	.init       = frer_init_net,
+	.exit_batch = frer_exit_net,
+	.id         = &act_frer_ops.net_id,
+	.size       = sizeof(struct tc_action_net),
+};
+
+static int __init frer_init_module(void)
+{
+	return tcf_register_action(&act_frer_ops, &frer_net_ops);
+}
+
+static void __exit frer_cleanup_module(void)
+{
+	tcf_unregister_action(&act_frer_ops, &frer_net_ops);
+}
+
+module_init(frer_init_module);
+module_exit(frer_cleanup_module);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IEEE 802.1CB FRER tc action");
-- 
2.17.1


^ permalink raw reply related

* [PATCH net-next 1/6] uapi: if_ether: add ETH_P_RTAG for IEEE 802.1CB R-TAG
From: Xiaoliang Yang @ 2026-06-22  9:21 UTC (permalink / raw)
  To: netdev, linux-kernel, linux-kselftest
  Cc: davem, edumazet, kuba, pabeni, jhs, jiri, horms, shuah,
	vladimir.oltean, vinicius.gomes, fejes, xiaoliang.yang_1
In-Reply-To: <20260622092118.6846-1-xiaoliang.yang_1@nxp.com>

The IEEE 802.1CB-2017 standard defines the Redundancy Tag (R-TAG) with
EtherType 0xF1C1. Add ETH_P_RTAG to the kernel's EtherType definitions
so that it can be used by tc classifiers (e.g. cls_flower) and the FRER
tc action for stream identification on the ingress path.

Signed-off-by: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
---
 include/uapi/linux/if_ether.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index fb5efc8e06cc..2d909078cde1 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -122,6 +122,7 @@
 #define ETH_P_DSA_8021Q	0xDADB		/* Fake VLAN Header for DSA [ NOT AN OFFICIALLY REGISTERED ID ] */
 #define ETH_P_DSA_A5PSW	0xE001		/* A5PSW Tag Value [ NOT AN OFFICIALLY REGISTERED ID ] */
 #define ETH_P_IFE	0xED3E		/* ForCES inter-FE LFB type */
+#define ETH_P_RTAG	0xF1C1		/* Redundancy Tag (IEEE 802.1CB) */
 #define ETH_P_AF_IUCV   0xFBFB		/* IBM af_iucv [ NOT AN OFFICIALLY REGISTERED ID ] */
 #define ETH_P_NXP_NETC  0xFD3A		/* NXP NETC DSA [ NOT AN OFFICIALLY REGISTERED ID ] */
 
-- 
2.17.1


^ permalink raw reply related

* [PATCH net-next 0/6] tc: introduce FRER action (IEEE 802.1CB)
From: Xiaoliang Yang @ 2026-06-22  9:21 UTC (permalink / raw)
  To: netdev, linux-kernel, linux-kselftest
  Cc: davem, edumazet, kuba, pabeni, jhs, jiri, horms, shuah,
	vladimir.oltean, vinicius.gomes, fejes, xiaoliang.yang_1

This series introduces a new TC action implementing
Frame Replication and Elimination for Reliability (FRER)
as defined in IEEE 802.1CB.

The FRER action enables:
- Frame replication (push)
- Sequence numbering via R-TAG
- Frame elimination based on sequence recovery

Patch overview:
 1. Add ETH_P_RTAG definition
 2. Introduce TCA_ID_FRER
 3. Add tc_frer uAPI
 4. Implement act_frer kernel module
 5. Add tc-testing selftest JSON coverage
 6. Add kselftest integration test

The implementation currently focuses on software datapath.  Hardware
offload is exposed through the flow offload API (FLOW_ACTION_FRER);
driver-side support for specific hardware will be submitted separately.

Usage scenarios:

=== Scenario 1a: Talker End - single port (no replication) ===

  The simplest case: a single egress path.  The frer push action
  inserts an R-TAG on the egress of the physical interface.  No
  mirror or virtual interface is needed.

    CPU
     |
    eth0 egress clsact:
         action frer push index 1  <- insert R-TAG seq=N
         |
        eth0
    [R-TAG seq=N | payload]
      Path A --> network

  Configuration:

    tc qdisc add dev eth0 clsact
    tc filter add dev eth0 egress protocol ip flower skip_hw \
        action frer push index 1

=== Scenario 1b: Talker End - dual port replication via bond + cross-mirror ===

  A bond interface (balance-rr) aggregates both physical ports.  The
  frer push action is placed on each slave's egress; each slave also
  mirrors every outgoing frame to the other slave.  This cross-mirror
  ensures that every frame transmitted by the bond (regardless of which
  slave the round-robin selects) carries an R-TAG and reaches both
  physical links.  If one link goes down, the bond continues on the
  remaining slave without any traffic interruption.

    CPU (socket on bond0)
         |
        bond0 (balance-rr)
        /          \
     eth0            eth1
    egress clsact:   egress clsact:
    action frer push index 1   action frer push index 1
    action mirred egress       action mirred egress
        mirror dev eth1            mirror dev eth0
         |                              |
        eth0                          eth1
    [R-TAG seq=N | payload]   [R-TAG seq=N | payload]
      Path A --> network         Path B --> network

  Configuration:

    ip link add bond0 type bond mode balance-rr miimon 100
    ip link set eth0 master bond0
    ip link set eth1 master bond0
    ip link set eth0 up
    ip link set eth1 up
    ip link set bond0 up
    ip addr add 192.0.2.1/24 dev bond0

    tc qdisc add dev eth0 clsact
    tc filter add dev eth0 egress protocol ip flower skip_hw \
        action frer push index 1 \
        action mirred egress mirror dev eth1

    tc qdisc add dev eth1 clsact
    tc filter add dev eth1 egress protocol ip flower skip_hw \
        action frer push index 1 \
        action mirred egress mirror dev eth0

=== Scenario 2: Listener End - shared sequence recovery via bond ===

  Both physical ports are bonded (balance-rr).  Each port's ingress
  references the same recover action by index.  The first copy of each
  sequence number passes (R-TAG stripped by tag-pop) and is delivered
  directly to the bond's IP stack; the duplicate is discarded.  No
  separate convergence interface is needed because the bond already
  provides a single IP address over both slaves.

    eth0 (Path A in)            eth1 (Path B in)
    [R-TAG seq=N | payload]     [R-TAG seq=N | payload]
          |                           |
    ingress clsact              ingress clsact
    flower: match stream        flower: match stream
    action frer recover   <-->  action frer recover
        index 10 (shared,           index 10 (shared,
        tag-pop, spinlock           same action object)
        protected)
          |                           |
          +-----------+---------------+
                      |
                   bond0 (IP_DST) ----> IP stack / CPU
                              [payload, R-TAG removed by tag-pop]

  Configuration:

    ip link add bond0 type bond mode balance-rr miimon 100
    ip link set eth0 master bond0
    ip link set eth1 master bond0
    ip link set eth0 up
    ip link set eth1 up
    ip link set bond0 up
    ip addr add 192.0.2.2/24 dev bond0

    tc qdisc add dev eth0 clsact
    tc filter add dev eth0 ingress protocol all flower skip_hw \
        action frer recover alg vector history-length 16 \
            reset-time 2000 tag-pop index 10

    tc qdisc add dev eth1 clsact
    tc filter add dev eth1 ingress protocol all flower skip_hw \
        action frer recover index 10

=== Scenario 3a: Relay System - ingress sequence recovery ===

  A relay node receives redundant streams on two ingress ports and
  eliminates duplicates before forwarding.  The two ingress ports
  share the same recover action by index.  The surviving frame is
  redirected to an egress port and forwarded to the next segment.

    upstream
      
    Path A --> swp0 (ingress)   Path B --> swp1 (ingress)
                 |                          |
           ingress clsact        ingress clsact
           flower: match stream  flower: match stream
           action frer recover   action frer recover
               index 10          index 10 (shared)
           action mirred         action mirred
               redirect              redirect
               dev swp2              dev swp2
                 |                     |
                 +----------+----------+
                            |
                         swp2 --> downstream

  Configuration:

    tc qdisc add dev swp0 clsact
    tc filter add dev swp0 ingress protocol all flower skip_hw \
        action frer recover alg vector history-length 16 \
            reset-time 2000 tag-pop index 10 \
        action mirred egress redirect dev swp2

    tc qdisc add dev swp1 clsact
    tc filter add dev swp1 ingress protocol all flower skip_hw \
        action frer recover index 10 \
        action mirred egress redirect dev swp2

=== Scenario 3b: Relay System - ingress frame replication (push) ===

  A relay node receives frames from a talker on swp0 ingress, inserts
  an R-TAG, and replicates them onto two egress ports towards the next
  network segment.  FDB learning and flooding are disabled on all relay
  ports; MAC forwarding entries are configured statically to prevent
  duplicate frames from looping through the bridge.

    upstream
         |
        swp0 ingress clsact:
         action frer push index 1         <- insert new R-TAG seq=M
         action mirred egress mirror dev swp2 <- copy to Path B'
         action mirred egress redirect dev swp1 <- to Path A'
         |                                      |
        swp1                                  swp2
    [R-TAG seq=M | payload]           [R-TAG seq=M | payload]
      Path A' --> downstream            Path B' --> downstream

  Configuration:

    tc qdisc add dev swp0 clsact
    tc filter add dev swp0 ingress protocol ip flower skip_hw \
        action frer push index 1 \
        action mirred egress mirror dev swp2 \
        action mirred egress redirect dev swp1

    # Disable FDB learning and flooding on all relay ports to prevent
    # duplicate frames from looping back through the bridge.
    bridge link set dev swp0 learning off flood off
    bridge link set dev swp1 learning off flood off
    bridge link set dev swp2 learning off flood off
    bridge fdb add DST_MAC dev swp1 master static
    bridge fdb add DST_MAC dev swp2 master static

Known limitations:

  1. Only R-TAG (EtherType 0xF1C1, IEEE 802.1CB Section 7.8) is
     currently supported as the redundancy tag type.  HSR
     (IEC 62439-3) and PRP (IEC 62439-3) tag formats are defined in
     the UAPI (TCA_FRER_TAG_HSR, TCA_FRER_TAG_PRP) but not yet
     implemented; attempts to use them are rejected with -EOPNOTSUPP.
     Support for HSR and PRP tags will be added in a follow-up series.

Changes since RFC (https://lkml.org/lkml/2021/9/28/535):

  1. The frer action can now be attached to either ingress or egress
     clsact.  For talker-end frame replication the action is placed on
     the egress of the outgoing interface. For relay-system replication
     the action is placed on the ingress of the receiving interface,
     followed by mirred redirect to the egress ports.

  2. Reset timer reworked following Vinicius Costa Gomes' review.

  3. Vector recovery algorithm corrected following Ferenc Fejes' review.

  4. A bond is used on the end system to aggregate two device interfaces.
     addressing Vladimir’s comment that TC-FRER is not applicable to end
     systems. See Scenario 1b(talker end) and Scenario 2(listener end).
     The kselftest script (frer_test.sh) test this on TEST 2.

  5. Added detailed usage scenario descriptions with ASCII topology
     diagrams.  Added tc-testing JSON test cases (32 cases) and a
     TAP-format kselftest script (frer_test.sh) with five end-to-end
     functional tests and one relay bridge topology test.

Xiaoliang Yang (6):
  uapi: if_ether: add ETH_P_RTAG for IEEE 802.1CB R-TAG
  uapi: pkt_cls: add TCA_ID_FRER action identifier
  uapi: tc_act: add tc_frer UAPI header
  net: sched: act_frer: add FRER tc action
  selftest: add tc-testing JSON test cases for act_frer
  selftests: net: add kselftest for IEEE 802.1CB FRER tc action

 include/net/flow_offload.h                    |   11 +
 include/net/tc_act/tc_frer.h                  |   71 ++
 include/uapi/linux/if_ether.h                 |    1 +
 include/uapi/linux/pkt_cls.h                  |    1 +
 include/uapi/linux/tc_act/tc_frer.h           |   89 ++
 net/sched/Kconfig                             |   16 +
 net/sched/Makefile                            |    1 +
 net/sched/act_frer.c                          |  835 ++++++++++++++
 tools/testing/selftests/net/Makefile          |    1 +
 tools/testing/selftests/net/frer_test.sh      | 1013 +++++++++++++++++
 .../tc-testing/tc-tests/actions/frer.json     |  785 +++++++++++++
 11 files changed, 2824 insertions(+)
 create mode 100644 include/net/tc_act/tc_frer.h
 create mode 100644 include/uapi/linux/tc_act/tc_frer.h
 create mode 100644 net/sched/act_frer.c
 create mode 100755 tools/testing/selftests/net/frer_test.sh
 create mode 100644 tools/testing/selftests/tc-testing/tc-tests/actions/frer.json

-- 
2.17.1


^ permalink raw reply

* Re: [PATCH v4 1/3] dt-bindings: net: add Realtek RTL8125 PCIe Ethernet
From: Krzysztof Kozlowski @ 2026-06-22  9:08 UTC (permalink / raw)
  To: Heiner Kallweit
  Cc: ricardo, nic_swsd, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Rob Herring, Krzysztof Kozlowski,
	Conor Dooley, Heiko Stuebner, Sebastian Reichel, netdev,
	devicetree, linux-kernel, linux-arm-kernel, linux-rockchip
In-Reply-To: <876a38f8-75ea-4b32-bb65-216cb3adb436@gmail.com>

On Wed, Jun 17, 2026 at 06:43:42PM +0200, Heiner Kallweit wrote:
> On 17.06.2026 14:58, Ricardo Pardini via B4 Relay wrote:
> > From: Ricardo Pardini <ricardo@pardini.net>
> > 
> > Add a binding for fixed/soldered Realtek RTL8125 PCIe Ethernet
> > controller.
> > 
> > The "pciVVVV,DDDD" compatibles are the Open Firmware PCI Bus Binding
> > spelling, auto-derived from PCI-SIG vendor/device IDs, but they still
> > need a binding when used in a board DT - analogous to "usbVVVV,PPPP"

Ricardo,

No, they do not need. They are already documented, they already have a
binding, see: dtschema/schemas/pci/pci-device.yaml


> > compatibles documented in their own bindings (e.g. microchip,lan95xx)
> > so board DTs attaching properties (fixed MAC, nvmem cell, ...) to
> > these PCI function nodes can be validated.
> > 
> > Suggested-by: Sebastian Reichel <sebastian.reichel@collabora.com>
> > Signed-off-by: Ricardo Pardini <ricardo@pardini.net>
> > ---
> >  .../devicetree/bindings/net/realtek,rtl8125.yaml   | 43 ++++++++++++++++++++++
> >  MAINTAINERS                                        |  1 +
> >  2 files changed, 44 insertions(+)
> > 
> > diff --git a/Documentation/devicetree/bindings/net/realtek,rtl8125.yaml b/Documentation/devicetree/bindings/net/realtek,rtl8125.yaml
> > new file mode 100644
> > index 0000000000000..eee13fbc1e6a6
> > --- /dev/null
> > +++ b/Documentation/devicetree/bindings/net/realtek,rtl8125.yaml
> > @@ -0,0 +1,43 @@
> > +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> > +%YAML 1.2
> > +---
> > +$id: http://devicetree.org/schemas/net/realtek,rtl8125.yaml#
> > +$schema: http://devicetree.org/meta-schemas/core.yaml#
> > +
> > +title: Realtek RTL8125 2.5 Gigabit PCIe Ethernet Controller
> > +
> > +maintainers:
> > +  - Heiner Kallweit <hkallweit1@gmail.com>
> > +
> > +description:
> > +  The Realtek RTL8125 is a 2.5GBASE-T Ethernet controller with a PCIe host
> > +  interface.
> > +
> > +allOf:
> > +  - $ref: ethernet-controller.yaml#
> > +
> > +properties:
> > +  compatible:
> > +    const: pci10ec,8125
> 
> IIRC we came to the conclusion that the compatible string isn't used in the
> relevant code path. Then why add it here? Is there an alignment on this?

Heiner, it is used - in the DTS.

> If it should be added here, then an explaining comment would be helpful.

Commit msg should explain that.  The compatible is used, so it
must be documented and in fact already is, so you need to specify them
ONLY if device nodes have some other properties, like being an ethernet
controller.

I assume that this is the case here, although that should be mentioned
in the commit msg.

Best regards,
Krzysztof


^ permalink raw reply

* [PATCH v2 1/2] net: fman: fix clk reference leak in read_dts_node()
From: ZhaoJinming @ 2026-06-22  9:05 UTC (permalink / raw)
  To: horms
  Cc: andrew+netdev, davem, edumazet, kuba, linux-kernel, madalin.bucur,
	netdev, pabeni, sean.anderson, zhaojinming
In-Reply-To: <20260619121328.922138-3-horms@kernel.org>

of_clk_get() returns a reference that must be released with clk_put()
when the clock is no longer needed. The current code never calls
clk_put(clk), leaking the reference on both the success path and the
clk_rate == 0 error path.

Add clk_put(clk) after the clock rate is consumed on the success path,
and jump to a new clk_put label on the error path to properly release
the clock reference.

Signed-off-by: ZhaoJinming <zhaojinming@uniontech.com>
---
 drivers/net/ethernet/freescale/fman/fman.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/fman/fman.c b/drivers/net/ethernet/freescale/fman/fman.c
index 013273a2de32..31b0081bdf91 100644
--- a/drivers/net/ethernet/freescale/fman/fman.c
+++ b/drivers/net/ethernet/freescale/fman/fman.c
@@ -2736,11 +2736,13 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
 		err = -EINVAL;
 		dev_err(&of_dev->dev, "%s: Failed to determine FM%d clock rate\n",
 			__func__, fman->dts_params.id);
-		goto fman_node_put;
+		goto clk_put;
 	}
 	/* Rounding to MHz */
 	fman->dts_params.clk_freq = DIV_ROUND_UP(clk_rate, 1000000);
 
+	clk_put(clk);
+
 	err = of_property_read_u32_array(fm_node, "fsl,qman-channel-range",
 					 &range[0], 2);
 	if (err) {
@@ -2818,6 +2820,8 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
 
 fman_node_put:
 	of_node_put(fm_node);
+clk_put:
+	clk_put(clk);
 fman_free:
 	kfree(fman);
 	return ERR_PTR(err);
-- 
2.20.1


^ permalink raw reply related

* [PATCH v2 2/2] net: fman: use devm_kzalloc() for fman and rely on devres
From: ZhaoJinming @ 2026-06-22  9:05 UTC (permalink / raw)
  To: horms
  Cc: andrew+netdev, davem, edumazet, kuba, linux-kernel, madalin.bucur,
	netdev, pabeni, sean.anderson, zhaojinming
In-Reply-To: <20260622090505.2418478-1-zhaojinming@uniontech.com>

The driver now allocates the top-level struct fman with devm_kzalloc()
so that its lifetime is bound to the device and resources are released
automatically by the driver core on probe failure or device removal.

Remove the explicit kfree(fman) from the error paths in fman_config()
and read_dts_node() to avoid double-free/use-after-free and to follow
the devm_ allocation convention.

After of_find_matching_node() consumes fm_node's reference via
of_node_put(from), the post-muram error paths no longer need to clean
up fm_node, so replace goto fman_free with direct return ERR_PTR(err).

This change complements the existing use of devm_* resources (irq,
ioremap, etc.) and simplifies the error handling paths.

Signed-off-by: ZhaoJinming <zhaojinming@uniontech.com>
---
 drivers/net/ethernet/freescale/fman/fman.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fman/fman.c b/drivers/net/ethernet/freescale/fman/fman.c
index 31b0081bdf91..23b938afe17a 100644
--- a/drivers/net/ethernet/freescale/fman/fman.c
+++ b/drivers/net/ethernet/freescale/fman/fman.c
@@ -1793,8 +1793,6 @@ static int fman_config(struct fman *fman)
 	kfree(fman->cfg);
 err_fm_drv:
 	kfree(fman->state);
-err_fm_state:
-	kfree(fman);
 	return -EINVAL;
 }
 
@@ -2697,7 +2695,7 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
 	struct clk *clk;
 	u32 clk_rate;
 
-	fman = kzalloc_obj(*fman);
+	fman = devm_kzalloc(&of_dev->dev, sizeof(*fman), GFP_KERNEL);
 	if (!fman)
 		return ERR_PTR(-ENOMEM);
 
@@ -2759,7 +2757,7 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
 		err = -EINVAL;
 		dev_err(&of_dev->dev, "%s: could not find MURAM node\n",
 			__func__);
-		goto fman_free;
+		return ERR_PTR(err);
 	}
 
 	err = of_address_to_resource(muram_node, 0,
@@ -2768,7 +2766,7 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
 		of_node_put(muram_node);
 		dev_err(&of_dev->dev, "%s: of_address_to_resource() = %d\n",
 			__func__, err);
-		goto fman_free;
+		return ERR_PTR(err);
 	}
 
 	of_node_put(muram_node);
@@ -2778,7 +2776,7 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
 	if (err < 0) {
 		dev_err(&of_dev->dev, "%s: irq %d allocation failed (error = %d)\n",
 			__func__, irq, err);
-		goto fman_free;
+		return ERR_PTR(err);
 	}
 
 	if (fman->dts_params.err_irq != 0) {
@@ -2788,7 +2786,7 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
 		if (err < 0) {
 			dev_err(&of_dev->dev, "%s: irq %d allocation failed (error = %d)\n",
 				__func__, fman->dts_params.err_irq, err);
-			goto fman_free;
+			return ERR_PTR(err);
 		}
 	}
 
@@ -2796,7 +2794,7 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
 	if (IS_ERR(base_addr)) {
 		err = PTR_ERR(base_addr);
 		dev_err(&of_dev->dev, "%s: devm_ioremap() failed\n", __func__);
-		goto fman_free;
+		return ERR_PTR(err);
 	}
 
 	fman->dts_params.base_addr = base_addr;
@@ -2808,7 +2806,7 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
 	if (err) {
 		dev_err(&of_dev->dev, "%s: of_platform_populate() failed\n",
 			__func__);
-		goto fman_free;
+		return ERR_PTR(err);
 	}
 
 #ifdef CONFIG_DPAA_ERRATUM_A050385
@@ -2822,8 +2820,6 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
 	of_node_put(fm_node);
 clk_put:
 	clk_put(clk);
-fman_free:
-	kfree(fman);
 	return ERR_PTR(err);
 }
 
-- 
2.20.1


^ permalink raw reply related

* Re: [RFC v2] Enabling CONFIG_NTP_PPS for NOHZ by adding ntp_error to system_time_snapshot
From: David Woodhouse @ 2026-06-22  9:04 UTC (permalink / raw)
  To: Thomas Gleixner, John Stultz, Stephen Boyd, Miroslav Lichvar,
	Richard Cochran, linux-kernel, netdev
  Cc: Rodolfo Giometti, Alexander Gordeev
In-Reply-To: <3b10d2e91b18f49d8a3e6226b08ac8cd9cb49aa6.camel@infradead.org>

[-- Attachment #1: Type: text/plain, Size: 9710 bytes --]

On Sun, 2026-06-21 at 23:30 +0100, David Woodhouse wrote:
> Open question: *how* should this be exposed? It's all very well putting
> it into ktime_get_snapshot_id() like this, and we could easily make an
> argument that pps_get_ts() should just add it unconditionally, because
> *not* doing so makes no sense.

Hm, I'm leaning towards adding it unconditionally in
ktime_get_snapshot_id() and get_device_system_crosststamp(), and not
adding the extra field to the system_time_snapshot at all...

From: David Woodhouse <dwmw@amazon.co.uk>
Date: Fri, 19 Jun 2026 00:00:29 +0100
Subject: [PATCH] timekeeping: Apply extrapolated ntp_error to clock snapshots

The time reported in ::systime of a system_time_snapshot is known to be
slightly inaccurate because of the way that the reported realtime clock
sawtooths around the *intended* time series, limited by the integer mult
value used to calculate the inter-tick times, and designed to ensure
smoothness and monotonicity for its consumers.

It is particularly inaccurate in a tickless kernel, where ntp_err_mult
is not adjusted on each tick, allowing the reported clock to diverge
from the intended time for a large number of ticks before re-converging.

This appears to be the reason why CONFIG_NTP_PPS is not enabled on
tickless kernels — because at that scale of precision, the realtime
snapshot at the time of the pulse bears little relation to the time the
kernel *actually* believes it to be, thus introducing random errors into
the PPS phase correction.

It would be better for callers of get_device_system_crosststamp() and
ktime_get_snapshot_id() to receive the *accurate* time, not the
sanitized version provided to gettimeofday().

Compute the deviation in snapshot_ntp_error() and add it to the returned
::systime so the snapshot lands on the ideal line. It sums four terms in
ns << NTP_SCALE_SHIFT before converting to signed ns:

  - tk->ntp_error, the deviation as of the last update;
  - (cycle_delta * ntp_err_frac), the fractional-mult drift accrued
    since then (cycle_delta is at most a tick on a tickful kernel, but
    many ticks' worth under NO_HZ);
  - (cycle_delta * ntp_err_mult), subtracting the applied +1 mult dither
    over the same span;
  - the sub-nanosecond fraction dropped when the read was truncated to
    whole ns (low shift bits, exact despite the multiply overflowing).

The helper uses the timekeeper selected for the requested clock id, so
all NTP-disciplined clocks are corrected, including the AUX clocks (each
has its own NTP instance); only CLOCK_MONOTONIC_RAW is undisciplined and
gets no correction. The residual is then a single clocksource cycle, the
same bound as a tickful kernel.

Note that this *unconditionally* changes the ::systime returned by all
snapshot and cross timestamp consumers (PTP SYS_OFFSET_PRECISE/EXTENDED,
etc.): it is now the ideal NTP-disciplined time rather than the raw
accumulated clock.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Assisted-by: Kiro:claude-opus-4.8
---
 include/linux/timekeeper_internal.h |  6 +++
 kernel/time/timekeeping.c           | 71 +++++++++++++++++++++++++++--
 2 files changed, 73 insertions(+), 4 deletions(-)

diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 5dc7f8bf2740..b487e7d925fe 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -97,6 +97,11 @@ struct tk_read_base {
  * @ntp_error_shift:		Shift conversion between clock shifted nano seconds and
  *				ntp shifted nano seconds.
  * @ntp_err_mult:		Multiplication factor for scaled math conversion
+ * @ntp_err_frac:		Fractional part of the per-cycle NTP-ideal mult that the
+ *				integer @mult truncates, as a fraction of 2^32 in
+ *				clock-shifted nanoseconds per cycle. Used to
+ *				extrapolate @ntp_error to an arbitrary cycle count in
+ *				the lockless snapshot readers (ktime_get_snapshot_id).
  * @cs_tick_adj:		Per-second adjustment handed to NTP via ntp_clear()
  *				accounting for the difference between the nominal
  *				NTP interval and the real time taken by the
@@ -187,6 +192,7 @@ struct timekeeper {
 	s64			ntp_error;
 	u32			ntp_error_shift;
 	u32			ntp_err_mult;
+	u64			ntp_err_frac;
 	s64			cs_tick_adj;
 	u32			skip_second_overflow;
 	s64			skew_delta;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index de07ef65da32..56f4a22d13d7 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -422,6 +422,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
 	tk->tkr_mono.mult = clock->mult;
 	tk->tkr_raw.mult = clock->mult;
 	tk->ntp_err_mult = 0;
+	tk->ntp_err_frac = 0;
 	tk->skip_second_overflow = 0;
 	tk->skew_delta = 0;
 
@@ -1226,6 +1227,51 @@ static inline u64 tk_clock_read_snapshot(const struct tk_read_base *tkr,
 	return clock->read(clock);
 }
 
+/*
+ * snapshot_ntp_error - record how far a snapshot's ::systime is from the
+ * ideal NTP-disciplined time at @now, in signed nanoseconds, so a caller
+ * can land exactly on the ideal line by adding it to ::systime.
+ *
+ * The value is summed in ns << NTP_SCALE_SHIFT from four parts:
+ *
+ *  - tk->ntp_error, the deviation accumulated as of the last timekeeping
+ *    update (tkr_mono.cycle_last);
+ *  - (cycle_delta * ntp_err_frac), the fractional-mult drift accrued over
+ *    the cycles read since then -- at most a tick on a tickful kernel, but
+ *    potentially many ticks' worth under NO_HZ;
+ *  - (cycle_delta * ntp_err_mult), subtracting the applied +1 mult dither
+ *    over the same span;
+ *  - the sub-nanosecond fraction that ::systime dropped when the read was
+ *    truncated to whole ns (the low @shift bits, exact even though the
+ *    multiply overflows).
+ *
+ * CLOCK_MONOTONIC_RAW is not NTP-disciplined and carries no error. Every
+ * other clock id uses its own timekeeper @tk -- including the AUX clocks,
+ * which each have their own NTP instance.
+ */
+static s64 snapshot_ntp_error(const struct timekeeper *tk, clockid_t clock_id,
+			      u64 now)
+{
+	u64 cycle_delta;
+	u32 nes;
+	s64 tmp, err;
+
+	if (clock_id == CLOCK_MONOTONIC_RAW)
+		return 0;
+
+	cycle_delta = (now - tk->tkr_mono.cycle_last) & tk->tkr_mono.mask;
+	nes = tk->ntp_error_shift;
+
+	err = tk->ntp_error;
+	err += ((s64)mul_u64_u64_shr(cycle_delta, tk->ntp_err_frac, 32) -
+		(s64)(cycle_delta * tk->ntp_err_mult)) << nes;
+
+	tmp = (s64)(cycle_delta * tk->tkr_mono.mult + tk->tkr_mono.xtime_nsec);
+	tmp &= (1ULL << tk->tkr_mono.shift) - 1;
+	err += tmp << nes;
+
+	return (err + (1LL << (NTP_SCALE_SHIFT - 1))) >> NTP_SCALE_SHIFT;
+}
 
 /**
  * ktime_get_snapshot_id -  Simultaneously snapshot a given clock ID with
@@ -1238,6 +1284,7 @@ void ktime_get_snapshot_id(clockid_t clock_id, struct system_time_snapshot *syst
 {
 	ktime_t base_raw, base_sys, offs_sys, *offs, offs_zero = 0;
 	u64 nsec_raw, nsec_sys, now;
+	s64 ntp_error;
 	struct timekeeper *tk;
 	struct tk_data *tkd;
 	unsigned int seq;
@@ -1300,10 +1347,12 @@ void ktime_get_snapshot_id(clockid_t clock_id, struct system_time_snapshot *syst
 
 		nsec_sys = timekeeping_cycles_to_ns(&tk->tkr_mono, now);
 		nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now);
+
+		ntp_error = snapshot_ntp_error(tk, clock_id, now);
 	} while (read_seqcount_retry(&tkd->seq, seq));
 
 	systime_snapshot->cycles = now;
-	systime_snapshot->systime = ktime_add_ns(base_sys, offs_sys + nsec_sys);
+	systime_snapshot->systime = ktime_add_ns(base_sys, offs_sys + nsec_sys) + ntp_error;
 	systime_snapshot->monoraw = ktime_add_ns(base_raw, nsec_raw);
 
 	/*
@@ -1552,6 +1601,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
 	unsigned int seq, clock_was_set_seq = 0;
 	ktime_t base_sys, base_raw, *offs;
 	u64 nsec_sys, nsec_raw;
+	s64 ntp_error;
 	u8 cs_was_changed_seq;
 	bool do_interp;
 	struct timekeeper *tk;
@@ -1617,9 +1667,10 @@ int get_device_system_crosststamp(int (*get_time_fn)
 
 		nsec_sys = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles);
 		nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles);
+		ntp_error = snapshot_ntp_error(tk, xtstamp->clock_id, cycles);
 	} while (read_seqcount_retry(&tkd->seq, seq));
 
-	xtstamp->sys_systime = ktime_add_ns(base_sys, nsec_sys);
+	xtstamp->sys_systime = ktime_add_ns(base_sys, nsec_sys) + ntp_error;
 	xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw);
 
 	/*
@@ -2447,6 +2498,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
 {
 	u64 ntp_tl = ntp_tick_length(tk->id);
 	s64 skew = ntp_get_skew_delta(tk->id);
+	u64 dividend;
 	u32 mult;
 
 	/*
@@ -2467,8 +2519,19 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
 		 * scale it back up to the full per-tick rate for the mult bias.
 		 */
 		skew *= NTP_INTERVAL_FREQ;
-		mult = div64_u64((tk->ntp_tick + skew) >> tk->ntp_error_shift,
-				 tk->cycle_interval);
+		dividend = (tk->ntp_tick + skew) >> tk->ntp_error_shift;
+		mult = div64_u64(dividend, tk->cycle_interval);
+		/*
+		 * Stash the fractional part of the per-cycle ideal mult that
+		 * the integer @mult discards, scaled by 2^32, in clock-shifted
+		 * ns per cycle. The lockless snapshot readers use it to
+		 * extrapolate @ntp_error forward over the cycles accumulated
+		 * since the last tick (which on a NO_HZ kernel may be many
+		 * ticks' worth).
+		 */
+		tk->ntp_err_frac = div64_u64((dividend - (u64)mult *
+					      tk->cycle_interval) << 32,
+					     tk->cycle_interval);
 	}
 
 	/*
-- 
2.43.0


[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5069 bytes --]

^ permalink raw reply related

* Re: [PATCH net] net/mlx5e: Use sender devcom for MPV master-up
From: Tariq Toukan @ 2026-06-22  9:01 UTC (permalink / raw)
  To: Manjunath Patil, Saeed Mahameed, Tariq Toukan, Mark Bloch,
	Leon Romanovsky, netdev
  Cc: Andrew Lunn, David S . Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Patrisious Haddad, linux-rdma, linux-kernel, stable
In-Reply-To: <20260610173915.4053423-1-manjunath.b.patil@oracle.com>



On 10/06/2026 20:39, Manjunath Patil wrote:
> After PCIe DPC recovery, mlx5 reloads the affected functions and
> replays multiport affiliation events. In the reported failure, the
> first relevant device error was:
> 
>    pcieport 0000:10:01.1: DPC: containment event
>    pcieport 0000:10:01.1: PCIe Bus Error: severity=Uncorrected (Fatal)
>    pcieport 0000:10:01.1:    [ 5] SDES                   (First)
> 
> mlx5 recovered the PCI functions and resumed 0000:11:00.1. During
> that resume, RDMA multiport binding replayed
> MLX5_DRIVER_EVENT_AFFILIATION_DONE and mlx5e sent
> MPV_DEVCOM_MASTER_UP. The host then panicked with:
> 
>    BUG: kernel NULL pointer dereference, address: 0000000000000010
>    RIP: mlx5_devcom_comp_set_ready+0x5/0x40 [mlx5_core]
>    RDI: 0000000000000000
> 
> Call trace included:
> 
>    mlx5_devcom_comp_set_ready
>    mlx5e_devcom_event_mpv
>    mlx5_devcom_send_event
>    mlx5_ib_bind_slave_port
>    mlx5r_mp_probe
>    mlx5_pci_resume
> 
> MPV devcom registration publishes mlx5e private data to the component
> peer list before mlx5e_devcom_init_mpv() stores the returned component
> device in priv->devcom. A concurrent master-up event can therefore
> reach a peer whose private data is visible but whose priv->devcom
> backpointer is still NULL.
> 
> MPV_DEVCOM_MASTER_UP already carries the sender/master mlx5e private
> data as event_data. The ready bit is stored on the shared devcom
> component, not on an individual peer. Use the sender devcom when
> marking the MPV component ready.
> 
> This preserves the readiness transition while avoiding a NULL
> dereference of the peer devcom pointer during affiliation replay after
> PCI error recovery.
> 
> Fixes: bf11485f8419 ("net/mlx5: Register mlx5e priv to devcom in MPV mode")
> Assisted-by: Codex:gpt-5
> Signed-off-by: Manjunath Patil <manjunath.b.patil@oracle.com>
> Cc: stable@vger.kernel.org # 6.7+
> ---

Thanks for your patch and sorry for the late response.

>   drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 7 +++++--
>   1 file changed, 5 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> index 8f2b3abe0092..f7ff20b97e8c 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> @@ -211,11 +211,14 @@ static void mlx5e_disable_async_events(struct mlx5e_priv *priv)
>   
>   static int mlx5e_devcom_event_mpv(int event, void *my_data, void *event_data)
>   {
> -	struct mlx5e_priv *slave_priv = my_data;
> +	struct mlx5e_priv *master_priv = event_data;
>   

makes sense.

>   	switch (event) {
>   	case MPV_DEVCOM_MASTER_UP:
> -		mlx5_devcom_comp_set_ready(slave_priv->devcom, true);
> +		if (!master_priv || !master_priv->devcom)
> +			return -EINVAL;

is this currently possible? or just being defensive?
if this return is unreachable I'd drop it.

> +
> +		mlx5_devcom_comp_set_ready(master_priv->devcom, true);
>   		break;
>   	case MPV_DEVCOM_MASTER_DOWN:
>   		/* no need for comp set ready false since we unregister after


^ permalink raw reply

* Re: [PATCH v3 net] net: watchdog: fix refcount tracking races
From: Eric Dumazet @ 2026-06-22  8:59 UTC (permalink / raw)
  To: Marek Szyprowski
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
	netdev, eric.dumazet, syzbot+381d82bbf0253710b35d,
	syzbot+3479efbc2821cb2a79f2
In-Reply-To: <a443376e-5187-4268-93b3-58047ef113a8@samsung.com>

On Wed, Jun 17, 2026 at 3:48 AM Marek Szyprowski
<m.szyprowski@samsung.com> wrote:
>
> Dear All,
>
> On 11.06.2026 17:27, Eric Dumazet wrote:
> > Blamed commit converted the untracked dev_hold()/dev_put() calls
> > in the watchdog code to use the tracked dev_hold_track()/dev_put_track()
> > (which were later renamed/interfaced to netdev_hold() and netdev_put()).
> >
> > By introducing dev->watchdog_dev_tracker to store the
> > reference tracking information without adding synchronization
> > between netdev_watchdog_up() and dev_watchdog(), it enabled the
> > race condition where this pointer could be overwritten or freed
> > concurrently, leading to the list corruption crash syzbot reported:
> >
> > list_del corruption, ffff888114a18c00->next is NULL
> >  kernel BUG at lib/list_debug.c:52 !
> > Oops: invalid opcode: 0000 [#1] SMP KASAN PTI
> > CPU: 1 UID: 0 PID: 91 Comm: kworker/u8:5 Not tainted syzkaller #0 PREEMPT(lazy)
> > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/09/2026
> > Workqueue: events_unbound linkwatch_event
> >  RIP: 0010:__list_del_entry_valid_or_report.cold+0x22/0x2a lib/list_debug.c:52
> > Call Trace:
> >  <TASK>
> >   __list_del_entry_valid include/linux/list.h:132 [inline]
> >   __list_del_entry include/linux/list.h:246 [inline]
> >   list_move_tail include/linux/list.h:341 [inline]
> >   ref_tracker_free+0x1a7/0x6c0 lib/ref_tracker.c:329
> >   netdev_tracker_free include/linux/netdevice.h:4491 [inline]
> >   netdev_put include/linux/netdevice.h:4508 [inline]
> >   netdev_put include/linux/netdevice.h:4504 [inline]
> >   netdev_watchdog_down net/sched/sch_generic.c:600 [inline]
> >   dev_deactivate_many+0x28c/0xfe0 net/sched/sch_generic.c:1363
> >   dev_deactivate+0x109/0x1d0 net/sched/sch_generic.c:1397
> >   linkwatch_do_dev net/core/link_watch.c:184 [inline]
> >   linkwatch_do_dev+0xd3/0x120 net/core/link_watch.c:166
> >   __linkwatch_run_queue+0x3a5/0x810 net/core/link_watch.c:240
> >   linkwatch_event+0x8f/0xc0 net/core/link_watch.c:314
> >   process_one_work+0xa0e/0x1980 kernel/workqueue.c:3314
> >   process_scheduled_works kernel/workqueue.c:3397 [inline]
> >   worker_thread+0x5ef/0xe50 kernel/workqueue.c:3478
> >   kthread+0x370/0x450 kernel/kthread.c:436
> >   ret_from_fork+0x69a/0xc80 arch/x86/kernel/process.c:158
> >   ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
> >
> > This patch has three coordinated parts:
> >
> > 1) Add dev->watchdog_lock and dev->watchdog_ref_held to serialize watchdog operations.
> >
> > 2) Remove netdev_watchdog_up() call from netif_carrier_on():
> >    This ensures netdev_watchdog_up() is only called from process/BH context
> >    (via linkwatch workqueue dev_activate()), allowing us to use
> >    spin_lock_bh() for synchronization.
> >
> > 3) Synchronize watchdog up and watchdog timer:
> >    Protect netdev_watchdog_up() with tx_global_lock and watchdog_lock.
> >    Only allocate a new tracker in netdev_watchdog_up() if one is
> >    not already present.
> >    In dev_watchdog(), ensure we don't release the tracker if the
> >    timer was rescheduled either by dev_watchdog() itself or concurrently
> >    by netdev_watchdog_up().
> >
> > Fixes: f12bf6f3f942 ("net: watchdog: add net device refcount tracker")
> > Reported-by: syzbot+381d82bbf0253710b35d@syzkaller.appspotmail.com
> > Closes: https://lore.kernel.org/netdev/6a26b751.c25708ab.1b19ef.0013.GAE@google.com/T/#u
> > Tested-by: syzbot+3479efbc2821cb2a79f2@syzkaller.appspotmail.com
> > Signed-off-by: Eric Dumazet <edumazet@google.com>
> This patch landed recently in linux-next as commit 8eed5519e496 ("net: watchdog:
> fix refcount tracking races"). In my tests I found that it causes the following
> deadlock during system suspend/resume on QEmu's ARM64bit 'virt' machine:
>
> root@target:~# time rtcwake -s10 -mmem
> rtcwake: assuming RTC uses UTC ...
> rtcwake: wakeup from "mem" using /dev/rtc0 at Wed Jun 17 10:46:12 2026
> PM: suspend entry (s2idle)
> Filesystems sync: 0.055 seconds
> Freezing user space processes
> Freezing user space processes completed (elapsed 0.006 seconds)
> OOM killer disabled.
> Freezing remaining freezable tasks
> Freezing remaining freezable tasks completed (elapsed 0.003 seconds)
>
> ============================================
> WARNING: possible recursive locking detected
> 7.1.0-rc7+ #13003 Not tainted
> --------------------------------------------
> rtcwake/254 is trying to acquire lock:
> ffff000006de64e8 (&dev->tx_global_lock){+.-.}-{3:3}, at: netdev_watchdog_up+0x40/0x108
>
> but task is already holding lock:
> ffff000006de64e8 (&dev->tx_global_lock){+.-.}-{3:3}, at: netif_tx_lock+0x1c/0x34
>
> other info that might help us debug this:
>  Possible unsafe locking scenario:
>
>        CPU0
>        ----
>   lock(&dev->tx_global_lock);
>   lock(&dev->tx_global_lock);
>
>  *** DEADLOCK ***
>
>  May be due to missing lock nesting notation
>
> 6 locks held by rtcwake/254:
>  #0: ffff0000071ab3e8 (sb_writers#5){.+.+}-{0:0}, at: vfs_write+0x1ec/0x35c
>  #1: ffff00000d22c480 (&of->mutex#2){+.+.}-{4:4}, at: kernfs_fop_write_iter+0xf0/0x1c4
>  #2: ffff0000049162c8 (kn->active#61){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x100/0x1c4
>  #3: ffffaa79533c03b0 (system_transition_mutex){+.+.}-{4:4}, at: pm_suspend+0x98/0x608
>  #4: ffff000005e3a138 (&dev->mutex){....}-{4:4}, at: device_resume+0xb4/0x254
>  #5: ffff000006de64e8 (&dev->tx_global_lock){+.-.}-{3:3}, at: netif_tx_lock+0x1c/0x34
>
> stack backtrace:
> CPU: 1 UID: 0 PID: 254 Comm: rtcwake Not tainted 7.1.0-rc7+ #13003 PREEMPT
> Hardware name: linux,dummy-virt (DT)
> Call trace:
>  show_stack+0x18/0x24 (C)
>  dump_stack_lvl+0x90/0xd0
>  dump_stack+0x18/0x24
>  print_deadlock_bug+0x260/0x350
>  __lock_acquire+0x11b8/0x225c
>  lock_acquire+0x1c4/0x3f0
>  _raw_spin_lock_bh+0x50/0x68
>  netdev_watchdog_up+0x40/0x108
>  netif_device_attach+0x9c/0xb0
>  virtnet_restore+0x100/0x21c
>  virtio_device_restore_priv+0x11c/0x1d0
>  virtio_device_restore+0x14/0x20
>  virtio_mmio_restore+0x34/0x40
>  platform_pm_resume+0x2c/0x68
>  dpm_run_callback+0xa0/0x240
>  device_resume+0x120/0x254
>  dpm_resume+0x1f8/0x2ec
>  dpm_resume_end+0x18/0x34
>  suspend_devices_and_enter+0x1d0/0x990
>  pm_suspend+0x1ec/0x608
>  state_store+0x8c/0x110
>  kobj_attr_store+0x18/0x2c
>  sysfs_kf_write+0x50/0x7c
>  kernfs_fop_write_iter+0x130/0x1c4
>  vfs_write+0x2b8/0x35c
>  ksys_write+0x6c/0x104
>  __arm64_sys_write+0x1c/0x28
>  invoke_syscall+0x54/0x110
>  el0_svc_common.constprop.0+0x40/0xe8
>  do_el0_svc+0x20/0x2c
>  el0_svc+0x54/0x338
>  el0t_64_sync_handler+0xa0/0xe4
>  el0t_64_sync+0x198/0x19c
>
>
> Reverting $subject on top of linux-next fixes this issue.

Thanks for the report Marek!

Acquiring tx_global_lock in netdev_watchdog_up() appears unnecessary anyway
because the critical state (timer and refcount tracker) is already
protected by dev->watchdog_lock.

Could you try this patch?

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 3f1c510df850dbdbaf10d483547c7b1f3a5d5482..ef2b4bf51564173751c74fefe17e3913ed2fa056
100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -594,9 +594,8 @@ void netdev_watchdog_up(struct net_device *dev)
                return;
        if (dev->watchdog_timeo <= 0)
                dev->watchdog_timeo = 5*HZ;
-       spin_lock_bh(&dev->tx_global_lock);

-       spin_lock(&dev->watchdog_lock);
+       spin_lock_bh(&dev->watchdog_lock);
        if (!mod_timer(&dev->watchdog_timer,
                       round_jiffies(jiffies + dev->watchdog_timeo))) {
                if (!dev->watchdog_ref_held) {
@@ -605,9 +604,7 @@ void netdev_watchdog_up(struct net_device *dev)
                        dev->watchdog_ref_held = true;
                }
        }
-       spin_unlock(&dev->watchdog_lock);
-
-       spin_unlock_bh(&dev->tx_global_lock);
+       spin_unlock_bh(&dev->watchdog_lock);
 }
 EXPORT_SYMBOL_GPL(netdev_watchdog_up);

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox