Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 5/6] selftest: add tc-testing JSON test cases for act_frer
From: Xiaoliang Yang @ 2026-06-22  9:21 UTC (permalink / raw)
  To: netdev, linux-kernel, linux-kselftest
  Cc: davem, edumazet, kuba, pabeni, jhs, jiri, horms, shuah,
	vladimir.oltean, vinicius.gomes, fejes, xiaoliang.yang_1
In-Reply-To: <20260622092118.6846-1-xiaoliang.yang_1@nxp.com>

Add a tc-testing JSON file covering the FRER (IEEE 802.1CB Frame
Replication and Elimination for Reliability) tc action (act_frer).

The test suite contains 32 test cases and exercises:

 - Creating push and recover actions with default and explicit parameters
   (tag-type, alg vector/match, history-length, reset-time, tag-pop,
   individual, take-no-seq)
 - Boundary values for history-length (1 and 32) and reset-time (0)
 - Combining multiple flags (frer_0011, frer_0012)
 - Statistics output format for push (SeqGen) and recover (passed,
   discarded, tagless, out-of-order, rogue, lost, resets)
 - Replace and delete operations
 - Flush all actions
 - Duplicate-index failure (expExitCode 255)
 - Control actions (continue, pipe) placed after the index token
 - Binding push and recover actions to egress/ingress clsact filters
 - Sharing a recover action across two filters and verifying the
   reference count increments
 - not_in_hw flag present in show output

Signed-off-by: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
---
 .../tc-testing/tc-tests/actions/frer.json     | 785 ++++++++++++++++++
 1 file changed, 785 insertions(+)
 create mode 100644 tools/testing/selftests/tc-testing/tc-tests/actions/frer.json

diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/frer.json b/tools/testing/selftests/tc-testing/tc-tests/actions/frer.json
new file mode 100644
index 000000000000..d5be6ae156f7
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/frer.json
@@ -0,0 +1,785 @@
+[
+  {
+    "id": "frer_0001",
+    "name": "Create frer push action with default parameters",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer push index 1",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 1",
+    "matchPattern": "action order [0-9]+: frer push tag-type rtag index 1",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0002",
+    "name": "Create frer push action with explicit tag-type rtag",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer push tag-type rtag index 2",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 2",
+    "matchPattern": "action order [0-9]+: frer push tag-type rtag index 2",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0003",
+    "name": "Create frer recover action with default parameters",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover index 10",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 10",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 10 alg vector history-length [0-9]+ reset-time [0-9]+",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0004",
+    "name": "Create frer recover action with vector algorithm explicit",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover alg vector index 11",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 11",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 11 alg vector",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0005",
+    "name": "Create frer recover action with match algorithm",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover alg match index 12",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 12",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 12 alg match",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0006",
+    "name": "Create frer recover action with history-length 16",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover alg vector history-length 16 index 13",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 13",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 13 alg vector history-length 16",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0007",
+    "name": "Create frer recover action with reset-time 2000",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover alg vector reset-time 2000 index 14",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 14",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 14 alg vector history-length [0-9]+ reset-time 2000",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0008",
+    "name": "Create frer recover action with tag-pop flag",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover tag-pop index 15",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 15",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 15.*tag-pop",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0009",
+    "name": "Create frer recover action with individual flag",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover individual index 16",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 16",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 16.*individual",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0010",
+    "name": "Create frer recover action with take-no-seq flag",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover take-no-seq index 17",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 17",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 17.*take-no-seq",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0011",
+    "name": "Create frer recover action with vector alg, history-length, reset-time, tag-pop and individual",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover alg vector history-length 16 reset-time 1000 tag-pop individual index 20",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 20",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 20 individual alg vector history-length 16 reset-time 1000 tag-pop",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0012",
+    "name": "Create frer recover action with match alg and all flags",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover alg match take-no-seq tag-pop individual index 21",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 21",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 21 individual alg match history-length [0-9]+ reset-time [0-9]+ tag-pop take-no-seq",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0013",
+    "name": "Show frer push action SeqGen statistics (zero after create)",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "$TC actions add action frer push index 1"
+    ],
+    "cmdUnderTest": "$TC -s actions show action frer index 1",
+    "expExitCode": "0",
+    "verifyCmd": "$TC -s actions show action frer index 1",
+    "matchPattern": "SeqGen packets: 0",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0014",
+    "name": "Show frer recover action Statistics line (zero after create)",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "$TC actions add action frer recover alg vector history-length 16 reset-time 1000 tag-pop index 10"
+    ],
+    "cmdUnderTest": "$TC -s actions show action frer index 10",
+    "expExitCode": "0",
+    "verifyCmd": "$TC -s actions show action frer index 10",
+    "matchPattern": "Statistics: passed=0 discarded=0 tagless=0 out-of-order=0 rogue=0 lost=0 resets=0",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0015",
+    "name": "Show frer recover action Statistics fields present",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "$TC actions add action frer recover index 10"
+    ],
+    "cmdUnderTest": "$TC -s actions show action frer index 10",
+    "expExitCode": "0",
+    "verifyCmd": "$TC -s actions show action frer index 10",
+    "matchPattern": "Statistics: passed=[0-9]+ discarded=[0-9]+ tagless=[0-9]+ out-of-order=[0-9]+ rogue=[0-9]+ lost=[0-9]+ resets=[0-9]+",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0016",
+    "name": "Replace frer push action (same index)",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "$TC actions add action frer push index 1"
+    ],
+    "cmdUnderTest": "$TC actions replace action frer push index 1",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 1",
+    "matchPattern": "action order [0-9]+: frer push tag-type rtag index 1",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0017",
+    "name": "Replace frer recover action changing algorithm from vector to match",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "$TC actions add action frer recover alg vector index 10"
+    ],
+    "cmdUnderTest": "$TC actions replace action frer recover alg match index 10",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 10",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 10 alg match",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0018",
+    "name": "Delete frer push action by index",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "$TC actions add action frer push index 1"
+    ],
+    "cmdUnderTest": "$TC actions del action frer index 1",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer",
+    "matchPattern": "frer push tag-type rtag index 1",
+    "matchCount": "0",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0019",
+    "name": "Flush all frer actions",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "$TC actions add action frer push index 1",
+      "$TC actions add action frer recover index 10",
+      "$TC actions add action frer recover index 11"
+    ],
+    "cmdUnderTest": "$TC actions flush action frer",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer",
+    "matchPattern": "action order [0-9]+: frer",
+    "matchCount": "0",
+    "teardown": [
+      "$TC actions flush action frer 2>/dev/null || true"
+    ]
+  },
+  {
+    "id": "frer_0020",
+    "name": "Add duplicate frer action index fails without replace flag",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "$TC actions add action frer push index 1"
+    ],
+    "cmdUnderTest": "$TC actions add action frer push index 1",
+    "expExitCode": "255",
+    "verifyCmd": "$TC actions show action frer index 1",
+    "matchPattern": "action order [0-9]+: frer push tag-type rtag index 1",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0021",
+    "name": "Create frer push action with continue control action",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer push index 1 continue",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 1",
+    "matchPattern": "action order [0-9]+: frer push tag-type rtag index 1.*control continue",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0022",
+    "name": "Create frer recover action with pipe control action",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover index 10 pipe",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 10",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 10.*control pipe",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0023",
+    "name": "Create frer recover action history-length minimum boundary (1)",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover alg vector history-length 1 index 30",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 30",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 30 alg vector history-length 1",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0024",
+    "name": "Create frer recover action history-length maximum boundary (32)",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover alg vector history-length 32 index 31",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 31",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 31 alg vector history-length 32",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0025",
+    "name": "Create frer recover action with reset-time 0 (timer disabled)",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ]
+    ],
+    "cmdUnderTest": "$TC actions add action frer recover alg vector reset-time 0 index 32",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 32",
+    "matchPattern": "action order [0-9]+: frer recover tag-type rtag index 32 alg vector history-length [0-9]+ reset-time 0",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0026",
+    "name": "List all frer actions shows correct count",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "$TC actions add action frer push index 1",
+      "$TC actions add action frer recover alg vector index 10",
+      "$TC actions add action frer recover alg match tag-pop index 11"
+    ],
+    "cmdUnderTest": "$TC actions show action frer",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer",
+    "matchPattern": "action order [0-9]+: frer",
+    "matchCount": "3",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0027",
+    "name": "Bind frer push action to egress clsact filter",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "ip link del frer_dummy 2>/dev/null || true",
+      "ip link add frer_dummy type dummy",
+      "ip link set frer_dummy up",
+      "$TC qdisc add dev frer_dummy clsact"
+    ],
+    "cmdUnderTest": "$TC filter add dev frer_dummy egress protocol ip flower skip_hw action frer push index 1",
+    "expExitCode": "0",
+    "verifyCmd": "$TC filter show dev frer_dummy egress",
+    "matchPattern": "frer push tag-type rtag index 1",
+    "matchCount": "1",
+    "teardown": [
+      "$TC qdisc del dev frer_dummy clsact",
+      "$TC actions flush action frer",
+      "ip link del frer_dummy"
+    ]
+  },
+  {
+    "id": "frer_0028",
+    "name": "Bind frer recover action to ingress clsact filter",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "ip link del frer_dummy 2>/dev/null || true",
+      "ip link add frer_dummy type dummy",
+      "ip link set frer_dummy up",
+      "$TC qdisc add dev frer_dummy clsact"
+    ],
+    "cmdUnderTest": "$TC filter add dev frer_dummy ingress protocol all flower skip_hw action frer recover alg vector history-length 16 reset-time 1000 tag-pop index 10",
+    "expExitCode": "0",
+    "verifyCmd": "$TC filter show dev frer_dummy ingress",
+    "matchPattern": "frer recover tag-type rtag index 10 alg vector history-length 16 reset-time 1000",
+    "matchCount": "1",
+    "teardown": [
+      "$TC qdisc del dev frer_dummy clsact",
+      "$TC actions flush action frer",
+      "ip link del frer_dummy"
+    ]
+  },
+  {
+    "id": "frer_0029",
+    "name": "Share frer recover action across two ingress filters (refcount check)",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "ip link del frer_a 2>/dev/null || true",
+      "ip link del frer_b 2>/dev/null || true",
+      "ip link add frer_a type dummy",
+      "ip link add frer_b type dummy",
+      "ip link set frer_a up",
+      "ip link set frer_b up",
+      "$TC qdisc add dev frer_a clsact",
+      "$TC qdisc add dev frer_b clsact",
+      "$TC filter add dev frer_a ingress protocol all flower skip_hw action frer recover alg vector history-length 16 tag-pop index 10"
+    ],
+    "cmdUnderTest": "$TC filter add dev frer_b ingress protocol all flower skip_hw action frer recover index 10",
+    "expExitCode": "0",
+    "verifyCmd": "$TC -s actions show action frer index 10",
+    "matchPattern": "ref [2-9][0-9]*",
+    "matchCount": "1",
+    "teardown": [
+      "$TC qdisc del dev frer_a clsact",
+      "$TC qdisc del dev frer_b clsact",
+      "$TC actions flush action frer",
+      "ip link del frer_a",
+      "ip link del frer_b"
+    ]
+  },
+  {
+    "id": "frer_0030",
+    "name": "frer push action refcount increments when bound to filter",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "ip link del frer_dummy 2>/dev/null || true",
+      "ip link add frer_dummy type dummy",
+      "ip link set frer_dummy up",
+      "$TC qdisc add dev frer_dummy clsact",
+      "$TC actions add action frer push index 1"
+    ],
+    "cmdUnderTest": "$TC filter add dev frer_dummy egress protocol ip flower skip_hw action frer push index 1",
+    "expExitCode": "0",
+    "verifyCmd": "$TC -s actions show action frer index 1",
+    "matchPattern": "ref [2-9][0-9]*",
+    "matchCount": "1",
+    "teardown": [
+      "$TC qdisc del dev frer_dummy clsact",
+      "$TC actions flush action frer",
+      "ip link del frer_dummy"
+    ]
+  },
+  {
+    "id": "frer_0031",
+    "name": "frer push output shows not_in_hw flag",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "$TC actions add action frer push index 1"
+    ],
+    "cmdUnderTest": "$TC actions show action frer index 1",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 1",
+    "matchPattern": "not_in_hw",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  },
+  {
+    "id": "frer_0032",
+    "name": "frer recover output shows not_in_hw flag",
+    "category": [
+      "actions",
+      "frer"
+    ],
+    "setup": [
+      [
+        "modprobe act_frer",
+        0,
+        1
+      ],
+      "$TC actions add action frer recover index 10"
+    ],
+    "cmdUnderTest": "$TC actions show action frer index 10",
+    "expExitCode": "0",
+    "verifyCmd": "$TC actions show action frer index 10",
+    "matchPattern": "not_in_hw",
+    "matchCount": "1",
+    "teardown": [
+      "$TC actions flush action frer"
+    ]
+  }
+]
-- 
2.17.1


^ permalink raw reply related

* [PATCH net-next 2/6] uapi: pkt_cls: add TCA_ID_FRER action identifier
From: Xiaoliang Yang @ 2026-06-22  9:21 UTC (permalink / raw)
  To: netdev, linux-kernel, linux-kselftest
  Cc: davem, edumazet, kuba, pabeni, jhs, jiri, horms, shuah,
	vladimir.oltean, vinicius.gomes, fejes, xiaoliang.yang_1
In-Reply-To: <20260622092118.6846-1-xiaoliang.yang_1@nxp.com>

Register TCA_ID_FRER in the global tc action ID enum so that the FRER
tc action can be identified uniquely among all tc actions.

Signed-off-by: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
---
 include/uapi/linux/pkt_cls.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 28d94b11d1aa..9b87f0455110 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -139,6 +139,7 @@ enum tca_id {
 	TCA_ID_MPLS,
 	TCA_ID_CT,
 	TCA_ID_GATE,
+	TCA_ID_FRER,
 	/* other actions go here */
 	__TCA_ID_MAX = 255
 };
-- 
2.17.1


^ permalink raw reply related

* [PATCH net-next 4/6] net: sched: act_frer: add FRER tc action
From: Xiaoliang Yang @ 2026-06-22  9:21 UTC (permalink / raw)
  To: netdev, linux-kernel, linux-kselftest
  Cc: davem, edumazet, kuba, pabeni, jhs, jiri, horms, shuah,
	vladimir.oltean, vinicius.gomes, fejes, xiaoliang.yang_1
In-Reply-To: <20260622092118.6846-1-xiaoliang.yang_1@nxp.com>

Introduce the FRER tc action for IEEE 802.1CB.  This patch adds the
module skeleton, the shared sequence-generator infrastructure, the
TCA_FRER_FUNC_PUSH data path, and the TCA_FRER_FUNC_RECOVER data path.

Sequence generation (IEEE 802.1CB Section 7.4.1):
  Each push action embeds a struct frer_seqgen directly in tcf_frer,
  protected by a per-action spinlock.  The sequence counter wraps at
  65536 (16-bit R-TAG field).  When a Talker chains "action frer push"
  with "action mirred egress mirror", both the primary and the mirrored
  frame carry the same R-TAG because mirred copies the already-modified
  skb.  No changes to act_mirred are required (Split function,
  Section 7.7).

Sequence Recovery vs. Individual Recovery (IEEE 802.1CB Section 7.5):

  Sequence Recovery (cross-port deduplication):
    Multiple ingress filters on different ports share one recover
    action by referencing the same action index.  They all operate on
    the same struct frer_rcvy embedded in that tcf_frer instance and
    protected by a spinlock.  A frame arriving on any port is checked
    against the shared sequence history; the first copy passes and all
    later copies with the same sequence number are discarded.

  Individual Recovery (per-port independent deduplication):
    Each action uses its own frer_rcvy embedded directly in tcf_frer.
    Selected when the user sets the "individual" flag.

Recovery algorithms:
  Vector (7.4.3.4, default): 32-bit history bit-vector, handles
    out-of-order delivery within the window.
  Match (7.4.3.5): remembers only the last accepted sequence number.

Reset timer:
  An hrtimer fires after frerSeqRcvyResetMSec ms of inactivity.
  CLOCK_MONOTONIC is used throughout.  The reset runs in a workqueue
  to avoid holding the spinlock in the hrtimer callback.

R-TAG wire format (IEEE 802.1CB 7.8, EtherType 0xF1C1):
  [Dst MAC 6B][Src MAC 6B][Optional 802.1Q tag 4B][0xF1C1 2B]
  [Reserved 2B][Sequence Number 2B][Encapsulated EtherType 2B][Payload]

Signed-off-by: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
---
 include/net/flow_offload.h   |  11 +
 include/net/tc_act/tc_frer.h |  71 +++
 net/sched/Kconfig            |  16 +
 net/sched/Makefile           |   1 +
 net/sched/act_frer.c         | 835 +++++++++++++++++++++++++++++++++++
 5 files changed, 934 insertions(+)
 create mode 100644 include/net/tc_act/tc_frer.h
 create mode 100644 net/sched/act_frer.c

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 70a02ee14308..8d97a5f293e6 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -184,6 +184,7 @@ enum flow_action_id {
 	FLOW_ACTION_VLAN_PUSH_ETH,
 	FLOW_ACTION_VLAN_POP_ETH,
 	FLOW_ACTION_CONTINUE,
+	FLOW_ACTION_FRER,
 	NUM_FLOW_ACTIONS,
 };
 
@@ -329,6 +330,16 @@ struct flow_action_entry {
 		struct {				/* FLOW_ACTION_PPPOE_PUSH */
 			u16		sid;
 		} pppoe;
+		struct {                                /* FLOW_ACTION_FRER */
+			u8		func;
+			u8		tag_type;
+			bool		individual;
+			u8		rcvy_alg;
+			u8		rcvy_history_len;
+			u32		rcvy_reset_msec;
+			bool		tag_pop;
+			bool		take_no_seq;
+		} frer;
 	};
 	struct flow_action_cookie *user_cookie; /* user defined action cookie */
 };
diff --git a/include/net/tc_act/tc_frer.h b/include/net/tc_act/tc_frer.h
new file mode 100644
index 000000000000..5f6f8ca70813
--- /dev/null
+++ b/include/net/tc_act/tc_frer.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Copyright 2026 NXP */
+
+#ifndef __NET_TC_FRER_H
+#define __NET_TC_FRER_H
+
+#include <net/act_api.h>
+#include <linux/tc_act/tc_frer.h>
+
+/**
+ * struct frer_seqgen - sequence number generator state (embedded in tcf_frer)
+ */
+struct frer_seqgen {
+	u32		gen_seq_num;	/* sequence counter; presumably wraps at seq_space - confirm in push path */
+	u64		seq_space;	/* size of the sequence number space: 1 << 16 (16-bit R-TAG field) */
+	spinlock_t	lock;		/* protects frer_seqgen state */
+	u64		stats_pkts;	/* frerCpsSeqGenPackets */
+};
+
+/**
+ * struct frer_rcvy - sequence recovery state (embedded in tcf_frer)
+ * One instance per action; filters bound to the same action index share it.
+ */
+struct frer_rcvy {
+	u8		alg;		/* recovery algorithm, e.g. TCA_FRER_RCVY_VECTOR_ALG */
+	u8		history_len;	/* history window length, 1-32 */
+	u32		reset_msec;	/* reset timeout in milliseconds */
+	u64		seq_space;	/* sequence number space; rcvy_seq_num resets to seq_space - 1 */
+	u32		rcvy_seq_num;	/* set to seq_space - 1 on reset (vector alg only) */
+	u32		seq_history;	/* history bit-vector; cleared on reset (vector alg only) */
+	bool		take_any;	/* set on every reset; presumably "accept the next frame" - confirm */
+	bool		take_no_seq;
+	struct hrtimer	hrtimer;	/* reset timer; callback is frer_rcvy_hrtimer_fn() */
+	spinlock_t	lock;		/* protects frer_rcvy state */
+	/* statistics */
+	u64		stats_tagless_pkts;
+	u64		stats_out_of_order_pkts;
+	u64		stats_rogue_pkts;
+	u64		stats_lost_pkts;
+	u64		stats_resets;	/* incremented by frer_rcvy_reset() */
+	u64		stats_passed_pkts;
+	u64		stats_discarded_pkts;
+};
+
+/**
+ * struct tcf_frer - per tc_action FRER private data
+ */
+struct tcf_frer {
+	struct tc_action	common;	/* first member: to_frer() is a plain pointer cast */
+	u8			func;	/* presumably TCA_FRER_FUNC_PUSH or TCA_FRER_FUNC_RECOVER - confirm */
+	u8			tag_type;
+	bool			tag_pop;
+	bool			individual;	/* Individual Recovery flag */
+	/* push path */
+	struct frer_seqgen	seqgen;
+	/* recover path */
+	struct frer_rcvy	rcvy;
+};
+
+#define to_frer(a) ((struct tcf_frer *)(a))
+
+static inline bool is_tcf_frer(const struct tc_action *a)
+{
+#ifdef CONFIG_NET_CLS_ACT
+	if (a->ops && a->ops->id == TCA_ID_FRER)	/* ops is NULL-checked before its id is read */
+		return true;
+#endif
+	return false;	/* not a FRER action, or CONFIG_NET_CLS_ACT is disabled */
+}
+
+#endif /* __NET_TC_FRER_H */
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 6ddff028b81a..7ca79b3eb5b3 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -939,6 +939,22 @@ config NET_ACT_GATE
 	  To compile this code as a module, choose M here: the
 	  module will be called act_gate.
 
+config NET_ACT_FRER
+	tristate "IEEE 802.1CB FRER tc action"
+	depends on NET_CLS_ACT
+	help
+	  Say Y here to enable the IEEE 802.1CB FRER tc action.  The action
+	  implements the Sequence Generation Function (egress R-TAG insertion
+	  with shared per-stream sequence counter) and the Sequence Recovery
+	  Function (ingress duplicate detection and elimination) described in
+	  IEEE 802.1CB-2017.
+
+	  Both Sequence Recovery (cross-port state shared via one action index) and
+	  Individual Recovery (per-port independent state) are supported.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_frer.
+
 config NET_IFE_SKBMARK
 	tristate "Support to encoding decoding skb mark on IFE action"
 	depends on NET_ACT_IFE
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 5078ea84e6ad..d9f60434e7d7 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_NET_IFE_SKBTCINDEX)	+= act_meta_skbtcindex.o
 obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o
 obj-$(CONFIG_NET_ACT_CT)	+= act_ct.o
 obj-$(CONFIG_NET_ACT_GATE)	+= act_gate.o
+obj-$(CONFIG_NET_ACT_FRER)	+= act_frer.o
 obj-$(CONFIG_NET_SCH_FIFO)	+= sch_fifo.o
 obj-$(CONFIG_NET_SCH_HTB)	+= sch_htb.o
 obj-$(CONFIG_NET_SCH_HFSC)	+= sch_hfsc.o
diff --git a/net/sched/act_frer.c b/net/sched/act_frer.c
new file mode 100644
index 000000000000..7b6db643788d
--- /dev/null
+++ b/net/sched/act_frer.c
@@ -0,0 +1,835 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Copyright 2026 NXP */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/if_vlan.h>
+#include <linux/hrtimer.h>
+#include <linux/workqueue.h>
+#include <net/act_api.h>
+#include <net/netlink.h>
+#include <net/pkt_cls.h>
+#include <net/tc_act/tc_frer.h>
+
+/* ------------------------------------------------------------------ */
+/* R-TAG wire structures (IEEE 802.1CB 7.8)                          */
+/* ------------------------------------------------------------------ */
+
+/* R-TAG as it appears in the frame, directly after the Ethernet (or VLAN)
+ * header whose EtherType field holds ETH_P_RTAG.  All fields are big-endian.
+ */
+struct r_tag {
+	__be16 reserved;	/* written as zero by frer_rtag_push() */
+	__be16 sequence_nr;	/* per-stream sequence number */
+	__be16 encap_proto;	/* EtherType that the R-TAG displaced */
+} __packed;
+
+static struct tc_action_ops act_frer_ops;
+
+/* ------------------------------------------------------------------ */
+/* Recovery reset machinery                                            */
+/* ------------------------------------------------------------------ */
+
+/* Put the recovery state back into its initial "accept the next tagged
+ * frame" condition and account for the reset.  Caller must hold rcvy->lock.
+ *
+ * Only the vector algorithm keeps a history window, so only it has its
+ * reference sequence number and history cleared.
+ */
+static void frer_rcvy_reset(struct frer_rcvy *rcvy)
+{
+	if (rcvy->alg == TCA_FRER_RCVY_VECTOR_ALG) {
+		rcvy->rcvy_seq_num = (u32)(rcvy->seq_space - 1);
+		rcvy->seq_history  = 0;
+	}
+	rcvy->take_any = true;
+	rcvy->stats_resets++;
+}
+
+/* Reset-timer expiry: the timer was not re-armed for reset_msec, so the
+ * recovery state is reset.
+ *
+ * The timer is set up and armed in HRTIMER_MODE_REL_SOFT, so this callback
+ * runs in softirq context.  Every other user of rcvy->lock (datapath, dump,
+ * init) takes it with bottom halves disabled, so the lock can be taken
+ * directly here and the reset performed in place.
+ *
+ * Doing the reset here rather than from a separately allocated work item
+ * means there is no allocation in atomic context, no reset lost on
+ * allocation failure, and nothing still referencing @rcvy once
+ * hrtimer_cancel() has returned.
+ */
+static enum hrtimer_restart frer_rcvy_hrtimer_fn(struct hrtimer *timer)
+{
+	struct frer_rcvy *rcvy =
+		container_of(timer, struct frer_rcvy, hrtimer);
+
+	spin_lock(&rcvy->lock);
+	frer_rcvy_reset(rcvy);
+	spin_unlock(&rcvy->lock);
+
+	return HRTIMER_NORESTART;
+}
+
+/* (Re)arm the recovery reset timer so that it expires reset_msec from now.
+ * A reset_msec of zero means no timeout is configured; nothing is armed.
+ */
+static void frer_rcvy_timer_restart(struct frer_rcvy *rcvy)
+{
+	ktime_t timeout;
+
+	if (!rcvy->reset_msec)
+		return;
+
+	timeout = ms_to_ktime(rcvy->reset_msec);
+	hrtimer_start(&rcvy->hrtimer, timeout, HRTIMER_MODE_REL_SOFT);
+}
+
+/* Initialise a recovery context: set up its lock and reset timer, store the
+ * requested configuration and start in the "take any" state with an empty
+ * history.  The sequence number space is fixed at 2^16.
+ */
+static void frer_rcvy_init_state(struct frer_rcvy *rcvy, u8 alg,
+				 u8 history_len, u32 reset_msec,
+				 bool take_no_seq)
+{
+	spin_lock_init(&rcvy->lock);
+	hrtimer_setup(&rcvy->hrtimer, frer_rcvy_hrtimer_fn, CLOCK_MONOTONIC,
+		      HRTIMER_MODE_REL_SOFT);
+
+	/* Configuration */
+	rcvy->seq_space   = 1 << 16;
+	rcvy->alg         = alg;
+	rcvy->history_len = history_len;
+	rcvy->reset_msec  = reset_msec;
+	rcvy->take_no_seq = take_no_seq;
+
+	/* Run-time state */
+	rcvy->take_any     = true;
+	rcvy->seq_history  = 0;
+	rcvy->rcvy_seq_num = (u32)(rcvy->seq_space - 1);
+}
+
+/* ------------------------------------------------------------------ */
+/* R-TAG helpers                                                       */
+/* ------------------------------------------------------------------ */
+
+/* Insert an R-TAG carrying @seq_num directly after the Ethernet header, or
+ * after the in-band VLAN header when the frame has one.
+ *
+ * The EtherType field that precedes the insertion point is rewritten to
+ * ETH_P_RTAG and its previous value is stored in the R-TAG's encap_proto.
+ * skb->data keeps its distance from the (moved) MAC header.
+ *
+ * Returns 0 on success, -EINVAL if the MAC header is not set or the header
+ * cannot be pulled into the linear area, -ENOMEM if headroom cannot be made
+ * available.  On the pskb_may_pull() failure paths skb->data is left at the
+ * MAC header; the only caller in this file drops the frame in that case.
+ */
+static int frer_rtag_push(struct sk_buff *skb, u16 seq_num)
+{
+	unsigned char *new_mac_header;
+	unsigned int data_offset;
+	unsigned int head_len;
+	struct vlan_ethhdr *vh;
+	struct ethhdr *eh;
+	struct r_tag *rtag;
+	__be16 *proto_ptr;
+	__be16 saved_proto;
+
+	if (!skb_mac_header_was_set(skb))
+		return -EINVAL;
+
+	/* Bytes between the MAC header and skb->data (0 if data is at MAC) */
+	data_offset = skb->data - skb_mac_header(skb);
+
+	/* Need room in front of the MAC header for the tag, and a private
+	 * header since it is about to be modified.
+	 */
+	if (skb_cow_head(skb, data_offset + sizeof(*rtag)))
+		return -ENOMEM;
+
+	/* Work with skb->data at the MAC header; undone before returning 0 */
+	if (data_offset > 0)
+		skb_push(skb, data_offset);
+
+	eh = eth_hdr(skb);
+	if (eth_type_vlan(eh->h_proto)) {
+		if (!pskb_may_pull(skb, sizeof(*vh)))
+			return -EINVAL;
+		/* Re-read: pskb_may_pull() may have changed the head */
+		eh = eth_hdr(skb);
+		vh = (struct vlan_ethhdr *)eh;
+		proto_ptr = &vh->h_vlan_encapsulated_proto;
+		head_len = sizeof(*vh);
+	} else {
+		if (!pskb_may_pull(skb, sizeof(*eh)))
+			return -EINVAL;
+		eh = eth_hdr(skb);
+		proto_ptr = &eh->h_proto;
+		head_len = sizeof(*eh);
+	}
+
+	/* Swap the EtherType in place before the header is moved */
+	saved_proto = *proto_ptr;
+	*proto_ptr = htons(ETH_P_RTAG);
+
+	/* Grow the frame by one R-TAG and slide the L2 header to the front,
+	 * which opens a gap of sizeof(*rtag) right behind it.
+	 */
+	skb_push(skb, sizeof(*rtag));
+	skb_reset_mac_header(skb);
+
+	new_mac_header = skb_mac_header(skb);
+	memmove(new_mac_header, (unsigned char *)eh, head_len);
+
+	/* NOTE(review): skb->protocol is set to ETH_P_RTAG in the VLAN branch
+	 * as well, although the outer EtherType is still the VLAN one there -
+	 * confirm this is intended.
+	 */
+	skb->protocol = htons(ETH_P_RTAG);
+	skb_set_network_header(skb, head_len);
+	if (data_offset > 0)
+		skb_pull(skb, data_offset);
+
+	/* Write R-TAG after the Ethernet / VLAN header */
+	rtag = (struct r_tag *)(new_mac_header + head_len);
+	rtag->reserved    = 0;
+	rtag->sequence_nr = htons(seq_num);
+	rtag->encap_proto = saved_proto;
+
+	return 0;
+}
+
+/* Strip the R-TAG that follows the Ethernet (or in-band VLAN) header and
+ * restore the EtherType it had displaced.  Frames without an R-TAG are left
+ * untouched.
+ *
+ * The only caller in this file runs frer_rtag_decode() on the same skb
+ * first, which is what makes the header private and guarantees that the L2
+ * header plus R-TAG are in the linear area; this function does not repeat
+ * those checks.
+ *
+ * skb->data is at the same distance from the MAC header on return as on
+ * entry, for tagged and untagged frames alike.
+ */
+static void frer_rtag_pop(struct sk_buff *skb)
+{
+	unsigned char *new_mac_header;
+	unsigned int data_offset;
+	unsigned int head_len;
+	struct vlan_ethhdr *vh;
+	struct ethhdr *eh;
+	struct r_tag *rtag;
+	__be16 *proto_ptr;
+
+	/* Work with skb->data at the MAC header; undone at "out" */
+	data_offset = skb->data - skb_mac_header(skb);
+	if (data_offset > 0)
+		skb_push(skb, data_offset);
+
+	eh = eth_hdr(skb);
+	if (eth_type_vlan(eh->h_proto)) {
+		vh = (struct vlan_ethhdr *)eh;
+		proto_ptr = &vh->h_vlan_encapsulated_proto;
+		head_len = sizeof(*vh);
+	} else {
+		proto_ptr = &eh->h_proto;
+		head_len = sizeof(*eh);
+	}
+
+	/* Untagged frame (e.g. one passed by take-no-seq): nothing to strip,
+	 * but skb->data must still be moved back to where the caller had it.
+	 */
+	if (*proto_ptr != htons(ETH_P_RTAG))
+		goto out;
+
+	rtag = (struct r_tag *)((unsigned char *)eh + head_len);
+	*proto_ptr = rtag->encap_proto;
+
+	skb->protocol = rtag->encap_proto;
+
+	/* Shrink the frame by one R-TAG from the front, then slide the L2
+	 * header forward so that it overwrites the tag.
+	 */
+	skb_postpull_rcsum(skb, rtag, sizeof(struct r_tag));
+	skb_pull(skb, sizeof(*rtag));
+	skb_reset_mac_header(skb);
+
+	new_mac_header = skb_mac_header(skb);
+	memmove(new_mac_header, (unsigned char *)eh, head_len);
+
+	skb_set_network_header(skb, head_len);
+out:
+	if (data_offset > 0)
+		skb_pull(skb, data_offset);
+}
+
+/* Extract the R-TAG sequence number of a frame.
+ *
+ * On success returns 0 and stores in @seq either the sequence number
+ * (0..65535) or -1 when the frame carries no R-TAG after its Ethernet /
+ * VLAN header.  Returns -EINVAL if the MAC header is not set or the L2
+ * header plus an R-TAG's worth of bytes cannot be pulled into the linear
+ * area, -ENOMEM if skb_cow_head() fails.
+ *
+ * On the pskb_may_pull() failure paths skb->data is left at the MAC header;
+ * the only caller in this file drops the frame in that case.
+ */
+static int frer_rtag_decode(struct sk_buff *skb, int *seq)
+{
+	unsigned int data_offset;
+	struct vlan_ethhdr *vh;
+	unsigned int head_len;
+	struct ethhdr *eh;
+	struct r_tag *rtag;
+	__be16 *proto_ptr;
+
+	if (!skb_mac_header_was_set(skb))
+		return -EINVAL;
+
+	/* Bytes between the MAC header and skb->data (0 if data is at MAC) */
+	data_offset = skb->data - skb_mac_header(skb);
+
+	if (skb_cow_head(skb, data_offset))
+		return -ENOMEM;
+
+	/* Temporarily expose the MAC header so pskb_may_pull() covers it */
+	if (data_offset > 0)
+		skb_push(skb, data_offset);
+
+	eh = eth_hdr(skb);
+	if (eth_type_vlan(eh->h_proto)) {
+		if (!pskb_may_pull(skb, sizeof(*vh) + sizeof(*rtag)))
+			return -EINVAL;
+		/* Re-read: pskb_may_pull() may have changed the head */
+		eh = eth_hdr(skb);
+		vh = (struct vlan_ethhdr *)eh;
+		proto_ptr = &vh->h_vlan_encapsulated_proto;
+		head_len = sizeof(*vh);
+	} else {
+		if (!pskb_may_pull(skb, sizeof(*eh) + sizeof(*rtag)))
+			return -EINVAL;
+		eh = eth_hdr(skb);
+		proto_ptr = &eh->h_proto;
+		head_len = sizeof(*eh);
+	}
+
+	if (data_offset > 0)
+		skb_pull(skb, data_offset);
+
+	/* No R-TAG: report that with a negative sequence number */
+	if (*proto_ptr != htons(ETH_P_RTAG)) {
+		*seq = -1;
+		return 0;
+	}
+
+	rtag = (struct r_tag *)((unsigned char *)eh + head_len);
+
+	*seq = (int)ntohs(rtag->sequence_nr);
+
+	return 0;
+}
+
+/* ------------------------------------------------------------------ */
+/* Recovery algorithms (called with rcvy->lock held)                  */
+/* ------------------------------------------------------------------ */
+
+/* Vector recovery: accept each sequence number at most once within a window
+ * of history_len numbers around the most recent accepted one.
+ *
+ * Bit n of seq_history records whether sequence number (rcvy_seq_num - n)
+ * has been seen.  @seq is negative for a frame without R-TAG.
+ *
+ * Returns true = pass frame, false = discard frame.
+ * @individual: when true, restart the reset timer even on discarded frames
+ *   (rogue/duplicate), as required for Individual Recovery (IEEE 802.1CB 7.5).
+ */
+static bool frer_vector_alg(struct frer_rcvy *rcvy, int seq, bool individual)
+{
+	int delta;
+	bool restart_timer = false;
+	bool pass;
+
+	if (seq < 0) {
+		/* No R-TAG present */
+		rcvy->stats_tagless_pkts++;
+		if (rcvy->take_no_seq) {
+			restart_timer = true;
+			pass = true;
+		} else {
+			pass = false;
+		}
+		goto out;
+	}
+
+	if (rcvy->take_any) {
+		/* First frame after reset: accept unconditionally */
+		rcvy->take_any     = false;
+		rcvy->rcvy_seq_num = (u32)seq;
+		rcvy->seq_history  = BIT(0);
+		restart_timer = true;
+		pass = true;
+		goto out;
+	}
+
+	/* Distance from the reference number, modulo the sequence space */
+	delta = (seq - (int)rcvy->rcvy_seq_num) &
+		(int)(rcvy->seq_space - 1);
+	/* Map delta > seq_space/2 to negative (signed wrap) */
+	if ((u32)delta & (u32)(rcvy->seq_space / 2))
+		delta -= (int)rcvy->seq_space;
+
+	if (delta >= (int)rcvy->history_len ||
+	    delta <= -(int)rcvy->history_len) {
+		/* Packet is out-of-range (rogue). */
+		rcvy->stats_rogue_pkts++;
+		if (individual)
+			restart_timer = true;
+		pass = false;
+		goto out;
+	}
+
+	if (delta <= 0) {
+		/* Packet is old: check whether already seen. */
+		if (rcvy->seq_history & BIT(-delta)) {
+			if (individual)
+				restart_timer = true;
+			/* Already received */
+			pass = false;
+		} else {
+			/* Out-of-order but not yet seen */
+			rcvy->seq_history |= BIT(-delta);
+			rcvy->stats_out_of_order_pkts++;
+			restart_timer = true;
+			pass = true;
+		}
+		goto out;
+	}
+
+	/* delta > 0: frame is newer than expected */
+	if (delta != 1)
+		rcvy->stats_out_of_order_pkts++;
+
+	/* Shift history forward, counting any gaps as lost.  The bit about to
+	 * leave the window (history_len - 1) is checked before each shift; a
+	 * clear bit means that sequence number was never received.
+	 */
+	while (--delta) {
+		if (!(rcvy->seq_history & BIT(rcvy->history_len - 1)))
+			rcvy->stats_lost_pkts++;
+		rcvy->seq_history <<= 1;
+	}
+	/* Final shift makes room for, and records, the new frame itself */
+	if (!(rcvy->seq_history & BIT(rcvy->history_len - 1)))
+		rcvy->stats_lost_pkts++;
+	rcvy->seq_history = (rcvy->seq_history << 1) | BIT(0);
+	rcvy->rcvy_seq_num = (u32)seq;
+	restart_timer = true;
+	pass = true;
+
+out:
+	if (restart_timer)
+		frer_rcvy_timer_restart(rcvy);
+	return pass;
+}
+
+/* Match recovery: discard a frame only when its sequence number equals that
+ * of the previously accepted frame.  @seq is negative for a frame without
+ * R-TAG.  Called with rcvy->lock held.
+ *
+ * Returns true = pass frame, false = discard frame.
+ * @individual: when true, a discarded duplicate still restarts the reset
+ *   timer.
+ */
+static bool frer_match_alg(struct frer_rcvy *rcvy, int seq, bool individual)
+{
+	u32 rx_seq;
+
+	if (seq < 0) {
+		/* No R-TAG: Match alg cannot deduplicate, always pass. */
+		rcvy->stats_tagless_pkts++;
+		return true;
+	}
+
+	rx_seq = (u32)seq;
+
+	/* Duplicate of the last accepted frame (never so right after reset) */
+	if (!rcvy->take_any && rx_seq == rcvy->rcvy_seq_num) {
+		if (individual)
+			frer_rcvy_timer_restart(rcvy);
+		return false;
+	}
+
+	/* Accepted.  The first frame after a reset cannot be out of order;
+	 * any later one is unless it is the direct successor.
+	 */
+	if (rcvy->take_any)
+		rcvy->take_any = false;
+	else if (rx_seq != ((rcvy->rcvy_seq_num + 1) % rcvy->seq_space))
+		rcvy->stats_out_of_order_pkts++;
+
+	rcvy->rcvy_seq_num = rx_seq;
+	frer_rcvy_timer_restart(rcvy);
+	return true;
+}
+
+/* ------------------------------------------------------------------ */
+/* Netlink policy                                                      */
+/* ------------------------------------------------------------------ */
+
+/* Validation policy for the nested TCA_FRER_* attributes accepted by
+ * tcf_frer_init().  The TCA_FRER_RCVY_* flags are presence-only; the history
+ * length is range-checked here (1..32), while FUNC, TAG_TYPE and RCVY_ALG
+ * values are checked in tcf_frer_init().
+ */
+static const struct nla_policy frer_policy[TCA_FRER_MAX + 1] = {
+	[TCA_FRER_PARMS]            = NLA_POLICY_EXACT_LEN(sizeof(struct tc_frer)),
+	[TCA_FRER_FUNC]             = { .type = NLA_U8 },
+	[TCA_FRER_TAG_TYPE]         = { .type = NLA_U8 },
+	[TCA_FRER_RCVY_INDIVIDUAL]  = { .type = NLA_FLAG },
+	[TCA_FRER_RCVY_ALG]         = { .type = NLA_U8 },
+	[TCA_FRER_RCVY_HISTORY_LEN] = NLA_POLICY_RANGE(NLA_U8, 1, 32),
+	[TCA_FRER_RCVY_RESET_MSEC]  = { .type = NLA_U32 },
+	[TCA_FRER_RCVY_TAKE_NO_SEQ] = { .type = NLA_FLAG },
+	[TCA_FRER_RCVY_TAG_POP]     = { .type = NLA_FLAG },
+};
+
+/* ------------------------------------------------------------------ */
+/* Action init                                                         */
+/* ------------------------------------------------------------------ */
+
+/* Load a new configuration into a recovery context whose lock and timer are
+ * already set up, and restart recovery from the "take any" state with an
+ * empty history.  Statistics are left untouched.
+ *
+ * Must be called with bottom halves disabled; takes rcvy->lock, so a
+ * concurrent datapath sees the old or the new configuration, never a mix.
+ */
+static void frer_rcvy_update_state(struct frer_rcvy *rcvy, u8 alg,
+				   u8 history_len, u32 reset_msec,
+				   bool take_no_seq)
+{
+	spin_lock(&rcvy->lock);
+	rcvy->alg          = alg;
+	rcvy->history_len  = history_len;
+	rcvy->reset_msec   = reset_msec;
+	rcvy->take_no_seq  = take_no_seq;
+	rcvy->take_any     = true;
+	rcvy->rcvy_seq_num = (u32)(rcvy->seq_space - 1);
+	rcvy->seq_history  = 0;
+	spin_unlock(&rcvy->lock);
+}
+
+/* Create a FRER action or replace the configuration of an existing one.
+ *
+ * @nla carries the nested TCA_FRER_* attributes; PARMS, FUNC and TAG_TYPE
+ * are mandatory.  For a recover action the algorithm defaults to vector,
+ * the history length to 32 and the reset time to 0 (no timeout).
+ *
+ * All attribute validation happens before the action is looked up or
+ * modified, so a rejected request never leaves a half-updated action.
+ *
+ * Returns ACT_P_CREATED for a new action, ACT_P_BOUND when binding to an
+ * existing one, 0 on a successful replace, or a negative errno (-EINVAL for
+ * missing/invalid attributes, -EOPNOTSUPP for an unsupported tag type,
+ * -EEXIST if the index exists and replace was not requested).
+ */
+static int tcf_frer_init(struct net *net, struct nlattr *nla,
+			 struct nlattr *est, struct tc_action **a,
+			 struct tcf_proto *tp, u32 flags,
+			 struct netlink_ext_ack *extack)
+{
+	struct tc_action_net *tn = net_generic(net, act_frer_ops.net_id);
+	bool bind = flags & TCA_ACT_FLAGS_BIND;
+	struct nlattr *tb[TCA_FRER_MAX + 1];
+	struct tcf_chain *goto_ch = NULL;
+	u8 alg = TCA_FRER_RCVY_VECTOR_ALG;
+	bool take_no_seq = false;
+	u8 history_len = 32;
+	u32 reset_msec = 0;
+	struct tc_frer *parm;
+	struct tcf_frer *f;
+	u8 func, tag_type;
+	int ret = 0, err;
+	bool exists;
+	u32 index;
+
+	if (!nla) {
+		NL_SET_ERR_MSG_MOD(extack, "frer: attributes required");
+		return -EINVAL;
+	}
+
+	err = nla_parse_nested(tb, TCA_FRER_MAX, nla, frer_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_FRER_PARMS]) {
+		NL_SET_ERR_MSG_MOD(extack, "frer: TCA_FRER_PARMS missing");
+		return -EINVAL;
+	}
+	if (!tb[TCA_FRER_FUNC]) {
+		NL_SET_ERR_MSG_MOD(extack, "frer: TCA_FRER_FUNC missing");
+		return -EINVAL;
+	}
+	if (!tb[TCA_FRER_TAG_TYPE]) {
+		NL_SET_ERR_MSG_MOD(extack, "frer: TCA_FRER_TAG_TYPE missing");
+		return -EINVAL;
+	}
+
+	func     = nla_get_u8(tb[TCA_FRER_FUNC]);
+	tag_type = nla_get_u8(tb[TCA_FRER_TAG_TYPE]);
+
+	if (func != TCA_FRER_FUNC_PUSH && func != TCA_FRER_FUNC_RECOVER) {
+		NL_SET_ERR_MSG_MOD(extack, "frer: unknown func");
+		return -EINVAL;
+	}
+	if (tag_type != TCA_FRER_TAG_RTAG) {
+		NL_SET_ERR_MSG_MOD(extack, "frer: only rtag supported");
+		return -EOPNOTSUPP;
+	}
+
+	if (func == TCA_FRER_FUNC_RECOVER) {
+		if (tb[TCA_FRER_RCVY_ALG])
+			alg = nla_get_u8(tb[TCA_FRER_RCVY_ALG]);
+		if (tb[TCA_FRER_RCVY_HISTORY_LEN])
+			history_len = nla_get_u8(tb[TCA_FRER_RCVY_HISTORY_LEN]);
+		if (tb[TCA_FRER_RCVY_RESET_MSEC])
+			reset_msec = nla_get_u32(tb[TCA_FRER_RCVY_RESET_MSEC]);
+		take_no_seq = !!tb[TCA_FRER_RCVY_TAKE_NO_SEQ];
+
+		if (alg != TCA_FRER_RCVY_VECTOR_ALG &&
+		    alg != TCA_FRER_RCVY_MATCH_ALG) {
+			NL_SET_ERR_MSG_MOD(extack, "frer: unknown recovery algorithm");
+			return -EINVAL;
+		}
+	}
+
+	parm  = nla_data(tb[TCA_FRER_PARMS]);
+	index = parm->index;
+
+	err = tcf_idr_check_alloc(tn, &index, a, bind);
+	if (err < 0)
+		return err;
+	exists = err;
+
+	if (exists && bind)
+		return ACT_P_BOUND;
+
+	if (!exists) {
+		ret = tcf_idr_create_from_flags(tn, index, est, a,
+						&act_frer_ops, bind, flags);
+		if (ret) {
+			tcf_idr_cleanup(tn, index);
+			return ret;
+		}
+		ret = ACT_P_CREATED;
+	} else if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
+		tcf_idr_release(*a, bind);
+		return -EEXIST;
+	}
+
+	f = to_frer(*a);
+
+	if (ret == ACT_P_CREATED) {
+		/* Set up the locks and the timer of both paths exactly once,
+		 * while nobody else can see the action.  They are then valid
+		 * whatever func the action has now or is replaced with later,
+		 * and are never re-initialised under a running datapath.
+		 * gen_seq_num starts at 0 on creation; preserved on replace.
+		 */
+		spin_lock_init(&f->seqgen.lock);
+		f->seqgen.seq_space = 1 << 16;
+		frer_rcvy_init_state(&f->rcvy, alg, history_len,
+				     reset_msec, take_no_seq);
+	}
+
+	err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
+	if (err < 0)
+		goto release_idr;
+
+	if (ret != ACT_P_CREATED) {
+		/* Quiesce the reset timer before changing the configuration:
+		 * clearing reset_msec under rcvy->lock stops the datapath
+		 * from re-arming it, after which the cancel is final.
+		 */
+		spin_lock_bh(&f->rcvy.lock);
+		f->rcvy.reset_msec = 0;
+		spin_unlock_bh(&f->rcvy.lock);
+		hrtimer_cancel(&f->rcvy.hrtimer);
+	}
+
+	spin_lock_bh(&f->tcf_lock);
+	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+	f->func     = func;
+	f->tag_type = tag_type;
+	f->tag_pop  = !!tb[TCA_FRER_RCVY_TAG_POP];
+
+	if (func == TCA_FRER_FUNC_RECOVER) {
+		f->individual = !!tb[TCA_FRER_RCVY_INDIVIDUAL];
+		frer_rcvy_update_state(&f->rcvy, alg, history_len,
+				       reset_msec, take_no_seq);
+	}
+
+	spin_unlock_bh(&f->tcf_lock);
+
+	/* Drop the reference on the chain that was swapped out, if any */
+	if (goto_ch)
+		tcf_chain_put_by_act(goto_ch);
+
+	return ret;
+
+release_idr:
+	tcf_idr_release(*a, bind);
+	return err;
+}
+
+/* ------------------------------------------------------------------ */
+/* Data path                                                           */
+/* ------------------------------------------------------------------ */
+
+/* Per-packet handler.
+ *
+ * Push: take the next sequence number from the generator (wrapping at
+ * seq_space) and insert an R-TAG carrying it.
+ * Recover: decode the R-TAG, run the configured recovery algorithm, and for
+ * accepted frames optionally strip the tag.
+ *
+ * Returns the configured control action for frames that go on, TC_ACT_SHOT
+ * for frames that are eliminated or could not be processed.
+ */
+static int tcf_frer_act(struct sk_buff *skb, const struct tc_action *a,
+			struct tcf_result *res)
+{
+	struct tcf_frer *f = to_frer(a);
+	struct frer_seqgen *sg;
+	struct frer_rcvy *rcvy;
+	bool accepted;
+	int verdict;
+	u16 tx_seq;
+	int rx_seq;
+
+	tcf_lastuse_update(&f->tcf_tm);
+	tcf_action_update_bstats(&f->common, skb);
+	verdict = READ_ONCE(f->tcf_action);
+
+	if (f->func != TCA_FRER_FUNC_PUSH)
+		goto recover;
+
+	/* Sequence generation */
+	sg = &f->seqgen;
+	spin_lock(&sg->lock);
+	tx_seq = (u16)sg->gen_seq_num;
+	if (++sg->gen_seq_num >= sg->seq_space)
+		sg->gen_seq_num = 0;
+	sg->stats_pkts++;
+	spin_unlock(&sg->lock);
+
+	if (frer_rtag_push(skb, tx_seq) < 0)
+		goto drop;
+
+	return verdict;
+
+recover:
+	/* Sequence recovery */
+	rcvy = &f->rcvy;
+	if (frer_rtag_decode(skb, &rx_seq) < 0)
+		goto drop;
+
+	spin_lock(&rcvy->lock);
+	accepted = rcvy->alg == TCA_FRER_RCVY_VECTOR_ALG ?
+		   frer_vector_alg(rcvy, rx_seq, f->individual) :
+		   frer_match_alg(rcvy, rx_seq, f->individual);
+	if (accepted)
+		rcvy->stats_passed_pkts++;
+	else
+		rcvy->stats_discarded_pkts++;
+	spin_unlock(&rcvy->lock);
+
+	/* Eliminated frames are counted in the recovery stats only */
+	if (!accepted)
+		return TC_ACT_SHOT;
+
+	if (f->tag_pop)
+		frer_rtag_pop(skb);
+	return verdict;
+
+drop:
+	tcf_action_inc_drop_qstats(&f->common);
+	return TC_ACT_SHOT;
+}
+
+/* ------------------------------------------------------------------ */
+/* Dump                                                                */
+/* ------------------------------------------------------------------ */
+
+/* Dump the action's configuration and FRER statistics as netlink attributes.
+ *
+ * Push actions report the sequence-generator packet count; recover actions
+ * report their recovery parameters and the recovery counters.  Flag
+ * attributes are emitted only when set.
+ *
+ * Returns skb->len on success; on attribute overflow the message is trimmed
+ * back to its length on entry and -1 is returned.
+ */
+static int tcf_frer_dump(struct sk_buff *skb, struct tc_action *a,
+			 int bind, int ref)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_frer *f = to_frer(a);
+	struct tc_frer opt = {
+		.index   = f->tcf_index,
+		.refcnt  = refcount_read(&f->tcf_refcnt) - ref,
+		.bindcnt = atomic_read(&f->tcf_bindcnt) - bind,
+	};
+	struct tcf_t t;
+
+	/* tcf_lock is held across the whole dump, including failure paths */
+	spin_lock_bh(&f->tcf_lock);
+	opt.action = f->tcf_action;
+
+	if (nla_put(skb, TCA_FRER_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+	if (nla_put_u8(skb, TCA_FRER_FUNC, f->func))
+		goto nla_put_failure;
+	if (nla_put_u8(skb, TCA_FRER_TAG_TYPE, f->tag_type))
+		goto nla_put_failure;
+	if (f->tag_pop && nla_put_flag(skb, TCA_FRER_RCVY_TAG_POP))
+		goto nla_put_failure;
+
+	if (f->func == TCA_FRER_FUNC_PUSH) {
+		spin_lock(&f->seqgen.lock);
+		if (nla_put_u64_64bit(skb, TCA_FRER_STATS_SEQGEN_PKTS,
+				      f->seqgen.stats_pkts, TCA_FRER_PAD)) {
+			spin_unlock(&f->seqgen.lock);
+			goto nla_put_failure;
+		}
+		spin_unlock(&f->seqgen.lock);
+	} else {
+		u64 tagless, ooo, rogue, lost, resets, passed, discarded;
+		struct frer_rcvy *rcvy = &f->rcvy;
+
+		/* Snapshot all counters under rcvy->lock so that they are
+		 * mutually consistent, then emit them without the lock.
+		 */
+		spin_lock(&rcvy->lock);
+		tagless    = rcvy->stats_tagless_pkts;
+		ooo        = rcvy->stats_out_of_order_pkts;
+		rogue      = rcvy->stats_rogue_pkts;
+		lost       = rcvy->stats_lost_pkts;
+		resets     = rcvy->stats_resets;
+		passed     = rcvy->stats_passed_pkts;
+		discarded  = rcvy->stats_discarded_pkts;
+		spin_unlock(&rcvy->lock);
+
+		if (f->individual && nla_put_flag(skb, TCA_FRER_RCVY_INDIVIDUAL))
+			goto nla_put_failure;
+		if (nla_put_u8(skb, TCA_FRER_RCVY_ALG, rcvy->alg))
+			goto nla_put_failure;
+		if (nla_put_u8(skb, TCA_FRER_RCVY_HISTORY_LEN, rcvy->history_len))
+			goto nla_put_failure;
+		if (nla_put_u32(skb, TCA_FRER_RCVY_RESET_MSEC, rcvy->reset_msec))
+			goto nla_put_failure;
+		if (rcvy->take_no_seq && nla_put_flag(skb, TCA_FRER_RCVY_TAKE_NO_SEQ))
+			goto nla_put_failure;
+		if (nla_put_u64_64bit(skb, TCA_FRER_STATS_TAGLESS_PKTS,
+				      tagless, TCA_FRER_PAD))
+			goto nla_put_failure;
+		if (nla_put_u64_64bit(skb, TCA_FRER_STATS_OUT_OF_ORDER_PKTS,
+				      ooo, TCA_FRER_PAD))
+			goto nla_put_failure;
+		if (nla_put_u64_64bit(skb, TCA_FRER_STATS_ROGUE_PKTS,
+				      rogue, TCA_FRER_PAD))
+			goto nla_put_failure;
+		if (nla_put_u64_64bit(skb, TCA_FRER_STATS_LOST_PKTS,
+				      lost, TCA_FRER_PAD))
+			goto nla_put_failure;
+		if (nla_put_u64_64bit(skb, TCA_FRER_STATS_RESETS,
+				      resets, TCA_FRER_PAD))
+			goto nla_put_failure;
+		if (nla_put_u64_64bit(skb, TCA_FRER_STATS_PASSED_PKTS,
+				      passed, TCA_FRER_PAD))
+			goto nla_put_failure;
+		if (nla_put_u64_64bit(skb, TCA_FRER_STATS_DISCARDED_PKTS,
+				      discarded, TCA_FRER_PAD))
+			goto nla_put_failure;
+	}
+
+	tcf_tm_dump(&t, &f->tcf_tm);
+	if (nla_put_64bit(skb, TCA_FRER_TM, sizeof(t), &t, TCA_FRER_PAD))
+		goto nla_put_failure;
+
+	spin_unlock_bh(&f->tcf_lock);
+	return skb->len;
+
+nla_put_failure:
+	spin_unlock_bh(&f->tcf_lock);
+	/* Discard everything this function added to the message */
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+/* ------------------------------------------------------------------ */
+/* Cleanup                                                             */
+/* ------------------------------------------------------------------ */
+
+/* Action teardown: a recover action may still have its reset timer armed,
+ * so stop it (waiting for a running callback) before the action goes away.
+ * Push actions have nothing to release here.
+ */
+static void tcf_frer_cleanup(struct tc_action *a)
+{
+	struct tcf_frer *f = to_frer(a);
+
+	if (f->func != TCA_FRER_FUNC_RECOVER)
+		return;
+
+	hrtimer_cancel(&f->rcvy.hrtimer);
+}
+
+/* ------------------------------------------------------------------ */
+/* Walker / search / stats / fill-size / offload                      */
+/* ------------------------------------------------------------------ */
+
+/* Walk the FRER actions of @net by handing this action type's per-netns
+ * table to the generic walker.
+ */
+static int tcf_frer_walker(struct net *net, struct sk_buff *skb,
+			   struct netlink_callback *cb, int type,
+			   const struct tc_action_ops *ops,
+			   struct netlink_ext_ack *extack)
+{
+	return tcf_generic_walker(net_generic(net, act_frer_ops.net_id),
+				  skb, cb, type, ops, extack);
+}
+
+/* Fold externally reported counters into the action's statistics and move
+ * its last-use timestamp forward if @lastuse is more recent.
+ */
+static void tcf_frer_stats_update(struct tc_action *a, u64 bytes, u64 packets,
+				  u64 drops, u64 lastuse, bool hw)
+{
+	struct tcf_frer *f = to_frer(a);
+
+	tcf_action_update_stats(a, bytes, packets, drops, hw);
+	f->tcf_tm.lastuse = max_t(u64, f->tcf_tm.lastuse, lastuse);
+}
+
+/* Upper bound on the attribute space tcf_frer_dump() may need.
+ *
+ * Every attribute the dump can emit is listed, for push and recover alike,
+ * so the result is valid whichever function the action is configured for.
+ */
+static size_t tcf_frer_get_fill_size(const struct tc_action *act)
+{
+	return nla_total_size(sizeof(struct tc_frer)) /* TCA_FRER_PARMS */
+		+ nla_total_size(sizeof(u8)) /* TCA_FRER_FUNC */
+		+ nla_total_size(sizeof(u8)) /* TCA_FRER_TAG_TYPE */
+		+ nla_total_size(0) /* TCA_FRER_RCVY_TAG_POP (flag) */
+		+ nla_total_size(0) /* TCA_FRER_RCVY_INDIVIDUAL (flag) */
+		+ nla_total_size(sizeof(u8)) /* TCA_FRER_RCVY_ALG */
+		+ nla_total_size(sizeof(u8)) /* TCA_FRER_RCVY_HISTORY_LEN */
+		+ nla_total_size(sizeof(u32)) /* TCA_FRER_RCVY_RESET_MSEC */
+		+ nla_total_size(0) /* TCA_FRER_RCVY_TAKE_NO_SEQ (flag) */
+		+ nla_total_size_64bit(sizeof(u64)) /* TCA_FRER_STATS_SEQGEN_PKTS */
+		+ nla_total_size_64bit(sizeof(u64)) /* TCA_FRER_STATS_TAGLESS_PKTS */
+		+ nla_total_size_64bit(sizeof(u64)) /* TCA_FRER_STATS_OUT_OF_ORDER_PKTS */
+		+ nla_total_size_64bit(sizeof(u64)) /* TCA_FRER_STATS_ROGUE_PKTS */
+		+ nla_total_size_64bit(sizeof(u64)) /* TCA_FRER_STATS_LOST_PKTS */
+		+ nla_total_size_64bit(sizeof(u64)) /* TCA_FRER_STATS_RESETS */
+		+ nla_total_size_64bit(sizeof(u64)) /* TCA_FRER_STATS_PASSED_PKTS */
+		+ nla_total_size_64bit(sizeof(u64)) /* TCA_FRER_STATS_DISCARDED_PKTS */
+		+ nla_total_size_64bit(sizeof(struct tcf_t)); /* TCA_FRER_TM */
+}
+
+/* Describe this action to the flow offload layer.
+ *
+ * Without @bind only the action id is filled in, @entry_data being a
+ * struct flow_offload_action.  With @bind, @entry_data is a
+ * struct flow_action_entry that receives the full FRER configuration
+ * (recovery parameters only for a non-push action) and one entry is
+ * reported as consumed through @index_inc.
+ */
+static int tcf_frer_offload_act_setup(struct tc_action *act, void *entry_data,
+				      u32 *index_inc, bool bind,
+				      struct netlink_ext_ack *extack)
+{
+	struct flow_action_entry *entry;
+	struct tcf_frer *f;
+
+	if (!bind) {
+		struct flow_offload_action *fl_action = entry_data;
+
+		fl_action->id = FLOW_ACTION_FRER;
+		return 0;
+	}
+
+	entry = entry_data;
+	f = to_frer(act);
+
+	entry->id            = FLOW_ACTION_FRER;
+	entry->frer.func     = f->func;
+	entry->frer.tag_type = f->tag_type;
+	entry->frer.tag_pop  = f->tag_pop;
+
+	if (f->func != TCA_FRER_FUNC_PUSH) {
+		entry->frer.individual       = f->individual;
+		entry->frer.rcvy_alg         = f->rcvy.alg;
+		entry->frer.rcvy_history_len = f->rcvy.history_len;
+		entry->frer.rcvy_reset_msec  = f->rcvy.reset_msec;
+		entry->frer.take_no_seq      = f->rcvy.take_no_seq;
+	}
+
+	*index_inc = 1;
+	return 0;
+}
+
+/* ------------------------------------------------------------------ */
+/* Module glue                                                         */
+/* ------------------------------------------------------------------ */
+
+/* Operations table registered for action kind "frer".  It is forward
+ * declared near the top of the file because tcf_frer_init() and the
+ * per-netns helpers need its net_id.
+ */
+static struct tc_action_ops act_frer_ops = {
+	.kind		    = "frer",
+	.id		    = TCA_ID_FRER,
+	.owner		    = THIS_MODULE,
+	.act		    = tcf_frer_act,
+	.init		    = tcf_frer_init,
+	.cleanup	    = tcf_frer_cleanup,
+	.dump		    = tcf_frer_dump,
+	.walk		    = tcf_frer_walker,
+	.stats_update	    = tcf_frer_stats_update,
+	.get_fill_size	    = tcf_frer_get_fill_size,
+	.offload_act_setup  = tcf_frer_offload_act_setup,
+	.size		    = sizeof(struct tcf_frer),
+};
+
+/* Per-netns setup: initialise this namespace's table of FRER actions. */
+static __net_init int frer_init_net(struct net *net)
+{
+	return tc_action_net_init(net,
+				  net_generic(net, act_frer_ops.net_id),
+				  &act_frer_ops);
+}
+
+/* Batched per-netns teardown for every namespace on @net_list. */
+static void __net_exit frer_exit_net(struct list_head *net_list)
+{
+	tc_action_net_exit(net_list, act_frer_ops.net_id);
+}
+
+/* Per-network-namespace hooks.  .id points at act_frer_ops.net_id, the same
+ * id the rest of this file passes to net_generic(); .size is the size of the
+ * per-namespace struct tc_action_net.
+ */
+static struct pernet_operations frer_net_ops = {
+	.init       = frer_init_net,
+	.exit_batch = frer_exit_net,
+	.id         = &act_frer_ops.net_id,
+	.size       = sizeof(struct tc_action_net),
+};
+
+/* Module load: register the "frer" action together with its pernet ops. */
+static int __init frer_init_module(void)
+{
+	return tcf_register_action(&act_frer_ops, &frer_net_ops);
+}
+
+/* Module unload: unregister what frer_init_module() registered. */
+static void __exit frer_cleanup_module(void)
+{
+	tcf_unregister_action(&act_frer_ops, &frer_net_ops);
+}
+
+module_init(frer_init_module);
+module_exit(frer_cleanup_module);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IEEE 802.1CB FRER tc action");
-- 
2.17.1


^ permalink raw reply related

* [PATCH net-next 1/6] uapi: if_ether: add ETH_P_RTAG for IEEE 802.1CB R-TAG
From: Xiaoliang Yang @ 2026-06-22  9:21 UTC (permalink / raw)
  To: netdev, linux-kernel, linux-kselftest
  Cc: davem, edumazet, kuba, pabeni, jhs, jiri, horms, shuah,
	vladimir.oltean, vinicius.gomes, fejes, xiaoliang.yang_1
In-Reply-To: <20260622092118.6846-1-xiaoliang.yang_1@nxp.com>

The IEEE 802.1CB-2017 standard defines the Redundancy Tag (R-TAG) with
EtherType 0xF1C1. Add ETH_P_RTAG to the kernel's EtherType definitions
so that it can be used by tc classifiers (e.g. cls_flower) and the FRER
tc action for stream identification on the ingress path.

Signed-off-by: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
---
 include/uapi/linux/if_ether.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index fb5efc8e06cc..2d909078cde1 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -122,6 +122,7 @@
 #define ETH_P_DSA_8021Q	0xDADB		/* Fake VLAN Header for DSA [ NOT AN OFFICIALLY REGISTERED ID ] */
 #define ETH_P_DSA_A5PSW	0xE001		/* A5PSW Tag Value [ NOT AN OFFICIALLY REGISTERED ID ] */
 #define ETH_P_IFE	0xED3E		/* ForCES inter-FE LFB type */
+#define ETH_P_RTAG	0xF1C1		/* Redundancy Tag (IEEE 802.1CB) */
 #define ETH_P_AF_IUCV   0xFBFB		/* IBM af_iucv [ NOT AN OFFICIALLY REGISTERED ID ] */
 #define ETH_P_NXP_NETC  0xFD3A		/* NXP NETC DSA [ NOT AN OFFICIALLY REGISTERED ID ] */
 
-- 
2.17.1


^ permalink raw reply related

* [PATCH net-next 0/6] tc: introduce FRER action (IEEE 802.1CB)
From: Xiaoliang Yang @ 2026-06-22  9:21 UTC (permalink / raw)
  To: netdev, linux-kernel, linux-kselftest
  Cc: davem, edumazet, kuba, pabeni, jhs, jiri, horms, shuah,
	vladimir.oltean, vinicius.gomes, fejes, xiaoliang.yang_1

This series introduces a new TC action implementing
Frame Replication and Elimination for Reliability (FRER)
as defined in IEEE 802.1CB.

The FRER action enables:
- Frame replication (push)
- Sequence numbering via R-TAG
- Frame elimination based on sequence recovery

Patch overview:
 1. Add ETH_P_RTAG definition
 2. Introduce TCA_ID_FRER
 3. Add tc_frer uAPI
 4. Implement act_frer kernel module
 5. Add tc-testing selftest JSON coverage
 6. Add kselftest integration test

The implementation currently focuses on the software datapath.  Hardware
offload is exposed through the flow offload API (FLOW_ACTION_FRER);
driver-side support for specific hardware will be submitted separately.

Usage scenarios:

=== Scenario 1a: Talker End - single port (no replication) ===

  The simplest case: a single egress path.  The frer push action
  inserts an R-TAG on the egress of the physical interface.  No
  mirror or virtual interface is needed.

    CPU
     |
    eth0 egress clsact:
         action frer push index 1  <- insert R-TAG seq=N
         |
        eth0
    [R-TAG seq=N | payload]
      Path A --> network

  Configuration:

    tc qdisc add dev eth0 clsact
    tc filter add dev eth0 egress protocol ip flower skip_hw \
        action frer push index 1

=== Scenario 1b: Talker End - dual port replication via bond + cross-mirror ===

  A bond interface (balance-rr) aggregates both physical ports.  The
  frer push action is placed on each slave's egress; each slave also
  mirrors every outgoing frame to the other slave.  This cross-mirror
  ensures that every frame transmitted by the bond (regardless of which
  slave the round-robin selects) carries an R-TAG and reaches both
  physical links.  If one link goes down, the bond continues on the
  remaining slave without any traffic interruption.

    CPU (socket on bond0)
         |
        bond0 (balance-rr)
        /          \
     eth0            eth1
    egress clsact:   egress clsact:
    action frer push index 1   action frer push index 1
    action mirred egress       action mirred egress
        mirror dev eth1            mirror dev eth0
         |                              |
        eth0                          eth1
    [R-TAG seq=N | payload]   [R-TAG seq=N | payload]
      Path A --> network         Path B --> network

  Configuration:

    ip link add bond0 type bond mode balance-rr miimon 100
    ip link set eth0 master bond0
    ip link set eth1 master bond0
    ip link set eth0 up
    ip link set eth1 up
    ip link set bond0 up
    ip addr add 192.0.2.1/24 dev bond0

    tc qdisc add dev eth0 clsact
    tc filter add dev eth0 egress protocol ip flower skip_hw \
        action frer push index 1 \
        action mirred egress mirror dev eth1

    tc qdisc add dev eth1 clsact
    tc filter add dev eth1 egress protocol ip flower skip_hw \
        action frer push index 1 \
        action mirred egress mirror dev eth0

=== Scenario 2: Listener End - shared sequence recovery via bond ===

  Both physical ports are bonded (balance-rr).  Each port's ingress
  references the same recover action by index.  The first copy of each
  sequence number passes (R-TAG stripped by tag-pop) and is delivered
  directly to the bond's IP stack; the duplicate is discarded.  No
  separate convergence interface is needed because the bond already
  provides a single IP address over both slaves.

    eth0 (Path A in)            eth1 (Path B in)
    [R-TAG seq=N | payload]     [R-TAG seq=N | payload]
          |                           |
    ingress clsact              ingress clsact
    flower: match stream        flower: match stream
    action frer recover   <-->  action frer recover
        index 10 (shared,           index 10 (shared,
        tag-pop, spinlock           same action object)
        protected)
          |                           |
          +-----------+---------------+
                      |
                   bond0 (IP_DST) ----> IP stack / CPU
                              [payload, R-TAG removed by tag-pop]

  Configuration:

    ip link add bond0 type bond mode balance-rr miimon 100
    ip link set eth0 master bond0
    ip link set eth1 master bond0
    ip link set eth0 up
    ip link set eth1 up
    ip link set bond0 up
    ip addr add 192.0.2.2/24 dev bond0

    tc qdisc add dev eth0 clsact
    tc filter add dev eth0 ingress protocol all flower skip_hw \
        action frer recover alg vector history-length 16 \
            reset-time 2000 tag-pop index 10

    tc qdisc add dev eth1 clsact
    tc filter add dev eth1 ingress protocol all flower skip_hw \
        action frer recover index 10

=== Scenario 3a: Relay System - ingress sequence recovery ===

  A relay node receives redundant streams on two ingress ports and
  eliminates duplicates before forwarding.  The two ingress ports
  share the same recover action by index.  The surviving frame is
  redirected to an egress port and forwarded to the next segment.

    upstream
      
    Path A --> swp0 (ingress)   Path B --> swp1 (ingress)
                 |                          |
           ingress clsact        ingress clsact
           flower: match stream  flower: match stream
           action frer recover   action frer recover
               index 10          index 10 (shared)
           action mirred         action mirred
               redirect              redirect
               dev swp2              dev swp2
                 |                     |
                 +----------+----------+
                            |
                         swp2 --> downstream

  Configuration:

    tc qdisc add dev swp0 clsact
    tc filter add dev swp0 ingress protocol all flower skip_hw \
        action frer recover alg vector history-length 16 \
            reset-time 2000 tag-pop index 10 \
        action mirred egress redirect dev swp2

    tc qdisc add dev swp1 clsact
    tc filter add dev swp1 ingress protocol all flower skip_hw \
        action frer recover index 10 \
        action mirred egress redirect dev swp2

=== Scenario 3b: Relay System - ingress frame replication (push) ===

  A relay node receives frames from a talker on swp0 ingress, inserts
  an R-TAG, and replicates them onto two egress ports towards the next
  network segment.  FDB learning and flooding are disabled on all relay
  ports; MAC forwarding entries are configured statically to prevent
  duplicate frames from looping through the bridge.

    upstream
         |
        swp0 ingress clsact:
         action frer push index 1         <- insert new R-TAG seq=M
         action mirred egress mirror dev swp2 <- copy to Path B'
         action mirred egress redirect dev swp1 <- to Path A'
         |                                      |
        swp1                                  swp2
    [R-TAG seq=M | payload]           [R-TAG seq=M | payload]
      Path A' --> downstream            Path B' --> downstream

  Configuration:

    tc qdisc add dev swp0 clsact
    tc filter add dev swp0 ingress protocol ip flower skip_hw \
        action frer push index 1 \
        action mirred egress mirror dev swp2 \
        action mirred egress redirect dev swp1

    # Disable FDB learning and flooding on all relay ports to prevent
    # duplicate frames from looping back through the bridge.
    bridge link set dev swp0 learning off flood off
    bridge link set dev swp1 learning off flood off
    bridge link set dev swp2 learning off flood off
    bridge fdb add DST_MAC dev swp1 master static
    bridge fdb add DST_MAC dev swp2 master static

Known limitations:

  1. Only R-TAG (EtherType 0xF1C1, IEEE 802.1CB Section 7.8) is
     currently supported as the redundancy tag type.  HSR
     (IEC 62439-3) and PRP (IEC 62439-3) tag formats are defined in
     the UAPI (TCA_FRER_TAG_HSR, TCA_FRER_TAG_PRP) but not yet
     implemented; attempts to use them are rejected with -EOPNOTSUPP.
     Support for HSR and PRP tags will be added in a follow-up series.

Changes since RFC (https://lkml.org/lkml/2021/9/28/535):

  1. The frer action can now be attached to either ingress or egress
     clsact.  For talker-end frame replication the action is placed on
     the egress of the outgoing interface. For relay-system replication
     the action is placed on the ingress of the receiving interface,
     followed by mirred redirect to the egress ports.

  2. Reset timer reworked following Vinicius Costa Gomes' review.

  3. Vector recovery algorithm corrected following Ferenc Fejes' review.

  4. A bond is used on the end system to aggregate two device interfaces,
     addressing Vladimir’s comment that TC-FRER is not applicable to end
     systems. See Scenario 1b (talker end) and Scenario 2 (listener end).
     The kselftest script (frer_test.sh) tests this in TEST 2.

  5. Added detailed usage scenario descriptions with ASCII topology
     diagrams.  Added tc-testing JSON test cases (32 cases) and a
     TAP-format kselftest script (frer_test.sh) with five end-to-end
     functional tests and one relay bridge topology test.

Xiaoliang Yang (6):
  uapi: if_ether: add ETH_P_RTAG for IEEE 802.1CB R-TAG
  uapi: pkt_cls: add TCA_ID_FRER action identifier
  uapi: tc_act: add tc_frer UAPI header
  net: sched: act_frer: add FRER tc action
  selftest: add tc-testing JSON test cases for act_frer
  selftests: net: add kselftest for IEEE 802.1CB FRER tc action

 include/net/flow_offload.h                    |   11 +
 include/net/tc_act/tc_frer.h                  |   71 ++
 include/uapi/linux/if_ether.h                 |    1 +
 include/uapi/linux/pkt_cls.h                  |    1 +
 include/uapi/linux/tc_act/tc_frer.h           |   89 ++
 net/sched/Kconfig                             |   16 +
 net/sched/Makefile                            |    1 +
 net/sched/act_frer.c                          |  835 ++++++++++++++
 tools/testing/selftests/net/Makefile          |    1 +
 tools/testing/selftests/net/frer_test.sh      | 1013 +++++++++++++++++
 .../tc-testing/tc-tests/actions/frer.json     |  785 +++++++++++++
 11 files changed, 2824 insertions(+)
 create mode 100644 include/net/tc_act/tc_frer.h
 create mode 100644 include/uapi/linux/tc_act/tc_frer.h
 create mode 100644 net/sched/act_frer.c
 create mode 100755 tools/testing/selftests/net/frer_test.sh
 create mode 100644 tools/testing/selftests/tc-testing/tc-tests/actions/frer.json

-- 
2.17.1


^ permalink raw reply

* Re: [PATCH v4 1/3] dt-bindings: net: add Realtek RTL8125 PCIe Ethernet
From: Krzysztof Kozlowski @ 2026-06-22  9:08 UTC (permalink / raw)
  To: Heiner Kallweit
  Cc: ricardo, nic_swsd, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Rob Herring, Krzysztof Kozlowski,
	Conor Dooley, Heiko Stuebner, Sebastian Reichel, netdev,
	devicetree, linux-kernel, linux-arm-kernel, linux-rockchip
In-Reply-To: <876a38f8-75ea-4b32-bb65-216cb3adb436@gmail.com>

On Wed, Jun 17, 2026 at 06:43:42PM +0200, Heiner Kallweit wrote:
> On 17.06.2026 14:58, Ricardo Pardini via B4 Relay wrote:
> > From: Ricardo Pardini <ricardo@pardini.net>
> > 
> > Add a binding for fixed/soldered Realtek RTL8125 PCIe Ethernet
> > controller.
> > 
> > The "pciVVVV,DDDD" compatibles are the Open Firmware PCI Bus Binding
> > spelling, auto-derived from PCI-SIG vendor/device IDs, but they still
> > need a binding when used in a board DT - analogous to "usbVVVV,PPPP"

Ricardo,

No, they do not need one. They are already documented and already have a
binding; see: dtschema/schemas/pci/pci-device.yaml


> > compatibles documented in their own bindings (e.g. microchip,lan95xx)
> > so board DTs attaching properties (fixed MAC, nvmem cell, ...) to
> > these PCI function nodes can be validated.
> > 
> > Suggested-by: Sebastian Reichel <sebastian.reichel@collabora.com>
> > Signed-off-by: Ricardo Pardini <ricardo@pardini.net>
> > ---
> >  .../devicetree/bindings/net/realtek,rtl8125.yaml   | 43 ++++++++++++++++++++++
> >  MAINTAINERS                                        |  1 +
> >  2 files changed, 44 insertions(+)
> > 
> > diff --git a/Documentation/devicetree/bindings/net/realtek,rtl8125.yaml b/Documentation/devicetree/bindings/net/realtek,rtl8125.yaml
> > new file mode 100644
> > index 0000000000000..eee13fbc1e6a6
> > --- /dev/null
> > +++ b/Documentation/devicetree/bindings/net/realtek,rtl8125.yaml
> > @@ -0,0 +1,43 @@
> > +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> > +%YAML 1.2
> > +---
> > +$id: http://devicetree.org/schemas/net/realtek,rtl8125.yaml#
> > +$schema: http://devicetree.org/meta-schemas/core.yaml#
> > +
> > +title: Realtek RTL8125 2.5 Gigabit PCIe Ethernet Controller
> > +
> > +maintainers:
> > +  - Heiner Kallweit <hkallweit1@gmail.com>
> > +
> > +description:
> > +  The Realtek RTL8125 is a 2.5GBASE-T Ethernet controller with a PCIe host
> > +  interface.
> > +
> > +allOf:
> > +  - $ref: ethernet-controller.yaml#
> > +
> > +properties:
> > +  compatible:
> > +    const: pci10ec,8125
> 
> IIRC we came to the conclusion that the compatible string isn't used in the
> relevant code path. Then why add it here? Is there an alignment on this?

Heiner, it is used - in the DTS.

> If it should be added here, then an explaining comment would be helpful.

Commit msg should explain that.  The compatible is used, so it
must be documented and in fact already is, so you need to specify it
ONLY if the device nodes have some other properties, like being an ethernet
controller.

I assume that this is the case here, although that should be mentioned
in the commit msg.

Best regards,
Krzysztof


^ permalink raw reply

* [PATCH v2 1/2] net: fman: fix clk reference leak in read_dts_node()
From: ZhaoJinming @ 2026-06-22  9:05 UTC (permalink / raw)
  To: horms
  Cc: andrew+netdev, davem, edumazet, kuba, linux-kernel, madalin.bucur,
	netdev, pabeni, sean.anderson, zhaojinming
In-Reply-To: <20260619121328.922138-3-horms@kernel.org>

of_clk_get() returns a reference that must be released with clk_put()
when the clock is no longer needed. The current code never calls
clk_put(clk), leaking the reference on both the success path and the
clk_rate == 0 error path.

Add clk_put(clk) after the clock rate is consumed on the success path,
and jump to a new clk_put label on the error path to properly release
the clock reference.

Signed-off-by: ZhaoJinming <zhaojinming@uniontech.com>
---
 drivers/net/ethernet/freescale/fman/fman.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/fman/fman.c b/drivers/net/ethernet/freescale/fman/fman.c
index 013273a2de32..31b0081bdf91 100644
--- a/drivers/net/ethernet/freescale/fman/fman.c
+++ b/drivers/net/ethernet/freescale/fman/fman.c
@@ -2736,11 +2736,13 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
 		err = -EINVAL;
 		dev_err(&of_dev->dev, "%s: Failed to determine FM%d clock rate\n",
 			__func__, fman->dts_params.id);
-		goto fman_node_put;
+		goto clk_put;
 	}
 	/* Rounding to MHz */
 	fman->dts_params.clk_freq = DIV_ROUND_UP(clk_rate, 1000000);
 
+	clk_put(clk);
+
 	err = of_property_read_u32_array(fm_node, "fsl,qman-channel-range",
 					 &range[0], 2);
 	if (err) {
@@ -2818,6 +2820,8 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
 
 fman_node_put:
 	of_node_put(fm_node);
+clk_put:
+	clk_put(clk);
 fman_free:
 	kfree(fman);
 	return ERR_PTR(err);
-- 
2.20.1


^ permalink raw reply related

* [PATCH v2 2/2] net: fman: use devm_kzalloc() for fman and rely on devres
From: ZhaoJinming @ 2026-06-22  9:05 UTC (permalink / raw)
  To: horms
  Cc: andrew+netdev, davem, edumazet, kuba, linux-kernel, madalin.bucur,
	netdev, pabeni, sean.anderson, zhaojinming
In-Reply-To: <20260622090505.2418478-1-zhaojinming@uniontech.com>

The driver now allocates the top-level struct fman with devm_kzalloc()
so that its lifetime is bound to the device and resources are released
automatically by the driver core on probe failure or device removal.

Remove the explicit kfree(fman) from the error paths in fman_config()
and read_dts_node() to avoid double-free/use-after-free and to follow
the devm_ allocation convention.

After of_find_matching_node() consumes fm_node's reference via
of_node_put(from), the post-muram error paths no longer need to clean
up fm_node, so replace goto fman_free with direct return ERR_PTR(err).

This change complements the existing use of devm_* resources (irq,
ioremap, etc.) and simplifies the error handling paths.

Signed-off-by: ZhaoJinming <zhaojinming@uniontech.com>
---
 drivers/net/ethernet/freescale/fman/fman.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fman/fman.c b/drivers/net/ethernet/freescale/fman/fman.c
index 31b0081bdf91..23b938afe17a 100644
--- a/drivers/net/ethernet/freescale/fman/fman.c
+++ b/drivers/net/ethernet/freescale/fman/fman.c
@@ -1793,8 +1793,6 @@ static int fman_config(struct fman *fman)
 	kfree(fman->cfg);
 err_fm_drv:
 	kfree(fman->state);
-err_fm_state:
-	kfree(fman);
 	return -EINVAL;
 }
 
@@ -2697,7 +2695,7 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
 	struct clk *clk;
 	u32 clk_rate;
 
-	fman = kzalloc_obj(*fman);
+	fman = devm_kzalloc(&of_dev->dev, sizeof(*fman), GFP_KERNEL);
 	if (!fman)
 		return ERR_PTR(-ENOMEM);
 
@@ -2759,7 +2757,7 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
 		err = -EINVAL;
 		dev_err(&of_dev->dev, "%s: could not find MURAM node\n",
 			__func__);
-		goto fman_free;
+		return ERR_PTR(err);
 	}
 
 	err = of_address_to_resource(muram_node, 0,
@@ -2768,7 +2766,7 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
 		of_node_put(muram_node);
 		dev_err(&of_dev->dev, "%s: of_address_to_resource() = %d\n",
 			__func__, err);
-		goto fman_free;
+		return ERR_PTR(err);
 	}
 
 	of_node_put(muram_node);
@@ -2778,7 +2776,7 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
 	if (err < 0) {
 		dev_err(&of_dev->dev, "%s: irq %d allocation failed (error = %d)\n",
 			__func__, irq, err);
-		goto fman_free;
+		return ERR_PTR(err);
 	}
 
 	if (fman->dts_params.err_irq != 0) {
@@ -2788,7 +2786,7 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
 		if (err < 0) {
 			dev_err(&of_dev->dev, "%s: irq %d allocation failed (error = %d)\n",
 				__func__, fman->dts_params.err_irq, err);
-			goto fman_free;
+			return ERR_PTR(err);
 		}
 	}
 
@@ -2796,7 +2794,7 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
 	if (IS_ERR(base_addr)) {
 		err = PTR_ERR(base_addr);
 		dev_err(&of_dev->dev, "%s: devm_ioremap() failed\n", __func__);
-		goto fman_free;
+		return ERR_PTR(err);
 	}
 
 	fman->dts_params.base_addr = base_addr;
@@ -2808,7 +2806,7 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
 	if (err) {
 		dev_err(&of_dev->dev, "%s: of_platform_populate() failed\n",
 			__func__);
-		goto fman_free;
+		return ERR_PTR(err);
 	}
 
 #ifdef CONFIG_DPAA_ERRATUM_A050385
@@ -2822,8 +2820,6 @@ static struct fman *read_dts_node(struct platform_device *of_dev)
 	of_node_put(fm_node);
 clk_put:
 	clk_put(clk);
-fman_free:
-	kfree(fman);
 	return ERR_PTR(err);
 }
 
-- 
2.20.1


^ permalink raw reply related

* Re: [RFC v2] Enabling CONFIG_NTP_PPS for NOHZ by adding ntp_error to system_time_snapshot
From: David Woodhouse @ 2026-06-22  9:04 UTC (permalink / raw)
  To: Thomas Gleixner, John Stultz, Stephen Boyd, Miroslav Lichvar,
	Richard Cochran, linux-kernel, netdev
  Cc: Rodolfo Giometti, Alexander Gordeev
In-Reply-To: <3b10d2e91b18f49d8a3e6226b08ac8cd9cb49aa6.camel@infradead.org>

[-- Attachment #1: Type: text/plain, Size: 9710 bytes --]

On Sun, 2026-06-21 at 23:30 +0100, David Woodhouse wrote:
> Open question: *how* should this be exposed? It's all very well putting
> it into ktime_get_snapshot_id() like this, and we could easily make an
> argument that pps_get_ts() should just add it unconditionally, because
> *not* doing so makes no sense.

Hm, I'm leaning towards adding it unconditionally in
ktime_get_snapshot_id() and get_device_system_crosststamp(), and not
adding the extra field to the system_time_snapshot at all...

From: David Woodhouse <dwmw@amazon.co.uk>
Date: Fri, 19 Jun 2026 00:00:29 +0100
Subject: [PATCH] timekeeping: Apply extrapolated ntp_error to clock snapshots

The time reported in ::systime of a system_time_snapshot is known to be
slightly inaccurate because of the way that the reported realtime clock
sawtooths around the *intended* time series, limited by the integer mult
value used to calculate the inter-tick times, and designed to ensure
smoothness and monotonicity for its consumers.

It is particularly inaccurate in a tickless kernel, where ntp_err_mult
is not adjusted on each tick, allowing the reported clock to diverge
from the intended time for a large number of ticks before re-converging.

This appears to be the reason why CONFIG_NTP_PPS is not enabled on
tickless kernels — because at that scale of precision, the realtime
snapshot at the time of the pulse bears little relation to the time the
kernel *actually* believes it to be, thus introducing random errors into
the PPS phase correction.

It would be better for callers of get_device_system_crosststamp() and
ktime_get_snapshot_id() to receive the *accurate* time, not the
sanitized version provided to gettimeofday().

Compute the deviation in snapshot_ntp_error() and add it to the returned
::systime so the snapshot lands on the ideal line. It sums four terms in
ns << NTP_SCALE_SHIFT before converting to signed ns:

  - tk->ntp_error, the deviation as of the last update;
  - (cycle_delta * ntp_err_frac), the fractional-mult drift accrued
    since then (cycle_delta is at most a tick on a tickful kernel, but
    many ticks' worth under NO_HZ);
  - (cycle_delta * ntp_err_mult), subtracting the applied +1 mult dither
    over the same span;
  - the sub-nanosecond fraction dropped when the read was truncated to
    whole ns (low shift bits, exact despite the multiply overflowing).

The helper uses the timekeeper selected for the requested clock id, so
all NTP-disciplined clocks are corrected, including the AUX clocks (each
has its own NTP instance); only CLOCK_MONOTONIC_RAW is undisciplined and
gets no correction. The residual is then a single clocksource cycle, the
same bound as a tickful kernel.

Note that this *unconditionally* changes the ::systime returned by all
snapshot and cross timestamp consumers (PTP SYS_OFFSET_PRECISE/EXTENDED,
etc.): it is now the ideal NTP-disciplined time rather than the raw
accumulated clock.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Assisted-by: Kiro:claude-opus-4.8
---
 include/linux/timekeeper_internal.h |  6 +++
 kernel/time/timekeeping.c           | 71 +++++++++++++++++++++++++++--
 2 files changed, 73 insertions(+), 4 deletions(-)

diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 5dc7f8bf2740..b487e7d925fe 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -97,6 +97,11 @@ struct tk_read_base {
  * @ntp_error_shift:		Shift conversion between clock shifted nano seconds and
  *				ntp shifted nano seconds.
  * @ntp_err_mult:		Multiplication factor for scaled math conversion
+ * @ntp_err_frac:		Fractional part of the per-cycle NTP-ideal mult that the
+ *				integer @mult truncates, as a fraction of 2^32 in
+ *				clock-shifted nanoseconds per cycle. Used to
+ *				extrapolate @ntp_error to an arbitrary cycle count in
+ *				the lockless snapshot readers (ktime_get_snapshot_id).
  * @cs_tick_adj:		Per-second adjustment handed to NTP via ntp_clear()
  *				accounting for the difference between the nominal
  *				NTP interval and the real time taken by the
@@ -187,6 +192,7 @@ struct timekeeper {
 	s64			ntp_error;
 	u32			ntp_error_shift;
 	u32			ntp_err_mult;
+	u64			ntp_err_frac;
 	s64			cs_tick_adj;
 	u32			skip_second_overflow;
 	s64			skew_delta;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index de07ef65da32..56f4a22d13d7 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -422,6 +422,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
 	tk->tkr_mono.mult = clock->mult;
 	tk->tkr_raw.mult = clock->mult;
 	tk->ntp_err_mult = 0;
+	tk->ntp_err_frac = 0;
 	tk->skip_second_overflow = 0;
 	tk->skew_delta = 0;
 
@@ -1226,6 +1227,51 @@ static inline u64 tk_clock_read_snapshot(const struct tk_read_base *tkr,
 	return clock->read(clock);
 }
 
+/*
+ * snapshot_ntp_error - record how far a snapshot's ::systime is from the
+ * ideal NTP-disciplined time at @now, in signed nanoseconds, so a caller
+ * can land exactly on the ideal line by adding it to ::systime.
+ *
+ * The value is summed in ns << NTP_SCALE_SHIFT from four parts:
+ *
+ *  - tk->ntp_error, the deviation accumulated as of the last timekeeping
+ *    update (tkr_mono.cycle_last);
+ *  - (cycle_delta * ntp_err_frac), the fractional-mult drift accrued over
+ *    the cycles read since then -- at most a tick on a tickful kernel, but
+ *    potentially many ticks' worth under NO_HZ;
+ *  - (cycle_delta * ntp_err_mult), subtracting the applied +1 mult dither
+ *    over the same span;
+ *  - the sub-nanosecond fraction that ::systime dropped when the read was
+ *    truncated to whole ns (the low @shift bits, exact even though the
+ *    multiply overflows).
+ *
+ * CLOCK_MONOTONIC_RAW is not NTP-disciplined and carries no error. Every
+ * other clock id uses its own timekeeper @tk -- including the AUX clocks,
+ * which each have their own NTP instance.
+ */
+static s64 snapshot_ntp_error(const struct timekeeper *tk, clockid_t clock_id,
+			      u64 now)
+{
+	u64 cycle_delta;
+	u32 nes;
+	s64 tmp, err;
+
+	if (clock_id == CLOCK_MONOTONIC_RAW)
+		return 0;
+
+	cycle_delta = (now - tk->tkr_mono.cycle_last) & tk->tkr_mono.mask;
+	nes = tk->ntp_error_shift;
+
+	err = tk->ntp_error;
+	err += ((s64)mul_u64_u64_shr(cycle_delta, tk->ntp_err_frac, 32) -
+		(s64)(cycle_delta * tk->ntp_err_mult)) << nes;
+
+	tmp = (s64)(cycle_delta * tk->tkr_mono.mult + tk->tkr_mono.xtime_nsec);
+	tmp &= (1ULL << tk->tkr_mono.shift) - 1;
+	err += tmp << nes;
+
+	return (err + (1LL << (NTP_SCALE_SHIFT - 1))) >> NTP_SCALE_SHIFT;
+}
 
 /**
  * ktime_get_snapshot_id -  Simultaneously snapshot a given clock ID with
@@ -1238,6 +1284,7 @@ void ktime_get_snapshot_id(clockid_t clock_id, struct system_time_snapshot *syst
 {
 	ktime_t base_raw, base_sys, offs_sys, *offs, offs_zero = 0;
 	u64 nsec_raw, nsec_sys, now;
+	s64 ntp_error;
 	struct timekeeper *tk;
 	struct tk_data *tkd;
 	unsigned int seq;
@@ -1300,10 +1347,12 @@ void ktime_get_snapshot_id(clockid_t clock_id, struct system_time_snapshot *syst
 
 		nsec_sys = timekeeping_cycles_to_ns(&tk->tkr_mono, now);
 		nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now);
+
+		ntp_error = snapshot_ntp_error(tk, clock_id, now);
 	} while (read_seqcount_retry(&tkd->seq, seq));
 
 	systime_snapshot->cycles = now;
-	systime_snapshot->systime = ktime_add_ns(base_sys, offs_sys + nsec_sys);
+	systime_snapshot->systime = ktime_add_ns(base_sys, offs_sys + nsec_sys) + ntp_error;
 	systime_snapshot->monoraw = ktime_add_ns(base_raw, nsec_raw);
 
 	/*
@@ -1552,6 +1601,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
 	unsigned int seq, clock_was_set_seq = 0;
 	ktime_t base_sys, base_raw, *offs;
 	u64 nsec_sys, nsec_raw;
+	s64 ntp_error;
 	u8 cs_was_changed_seq;
 	bool do_interp;
 	struct timekeeper *tk;
@@ -1617,9 +1667,10 @@ int get_device_system_crosststamp(int (*get_time_fn)
 
 		nsec_sys = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles);
 		nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles);
+		ntp_error = snapshot_ntp_error(tk, xtstamp->clock_id, cycles);
 	} while (read_seqcount_retry(&tkd->seq, seq));
 
-	xtstamp->sys_systime = ktime_add_ns(base_sys, nsec_sys);
+	xtstamp->sys_systime = ktime_add_ns(base_sys, nsec_sys) + ntp_error;
 	xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw);
 
 	/*
@@ -2447,6 +2498,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
 {
 	u64 ntp_tl = ntp_tick_length(tk->id);
 	s64 skew = ntp_get_skew_delta(tk->id);
+	u64 dividend;
 	u32 mult;
 
 	/*
@@ -2467,8 +2519,19 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
 		 * scale it back up to the full per-tick rate for the mult bias.
 		 */
 		skew *= NTP_INTERVAL_FREQ;
-		mult = div64_u64((tk->ntp_tick + skew) >> tk->ntp_error_shift,
-				 tk->cycle_interval);
+		dividend = (tk->ntp_tick + skew) >> tk->ntp_error_shift;
+		mult = div64_u64(dividend, tk->cycle_interval);
+		/*
+		 * Stash the fractional part of the per-cycle ideal mult that
+		 * the integer @mult discards, scaled by 2^32, in clock-shifted
+		 * ns per cycle. The lockless snapshot readers use it to
+		 * extrapolate @ntp_error forward over the cycles accumulated
+		 * since the last tick (which on a NO_HZ kernel may be many
+		 * ticks' worth).
+		 */
+		tk->ntp_err_frac = div64_u64((dividend - (u64)mult *
+					      tk->cycle_interval) << 32,
+					     tk->cycle_interval);
 	}
 
 	/*
-- 
2.43.0


[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5069 bytes --]

^ permalink raw reply related

* Re: [PATCH net] net/mlx5e: Use sender devcom for MPV master-up
From: Tariq Toukan @ 2026-06-22  9:01 UTC (permalink / raw)
  To: Manjunath Patil, Saeed Mahameed, Tariq Toukan, Mark Bloch,
	Leon Romanovsky, netdev
  Cc: Andrew Lunn, David S . Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Patrisious Haddad, linux-rdma, linux-kernel, stable
In-Reply-To: <20260610173915.4053423-1-manjunath.b.patil@oracle.com>



On 10/06/2026 20:39, Manjunath Patil wrote:
> After PCIe DPC recovery, mlx5 reloads the affected functions and
> replays multiport affiliation events. In the reported failure, the
> first relevant device error was:
> 
>    pcieport 0000:10:01.1: DPC: containment event
>    pcieport 0000:10:01.1: PCIe Bus Error: severity=Uncorrected (Fatal)
>    pcieport 0000:10:01.1:    [ 5] SDES                   (First)
> 
> mlx5 recovered the PCI functions and resumed 0000:11:00.1. During
> that resume, RDMA multiport binding replayed
> MLX5_DRIVER_EVENT_AFFILIATION_DONE and mlx5e sent
> MPV_DEVCOM_MASTER_UP. The host then panicked with:
> 
>    BUG: kernel NULL pointer dereference, address: 0000000000000010
>    RIP: mlx5_devcom_comp_set_ready+0x5/0x40 [mlx5_core]
>    RDI: 0000000000000000
> 
> Call trace included:
> 
>    mlx5_devcom_comp_set_ready
>    mlx5e_devcom_event_mpv
>    mlx5_devcom_send_event
>    mlx5_ib_bind_slave_port
>    mlx5r_mp_probe
>    mlx5_pci_resume
> 
> MPV devcom registration publishes mlx5e private data to the component
> peer list before mlx5e_devcom_init_mpv() stores the returned component
> device in priv->devcom. A concurrent master-up event can therefore
> reach a peer whose private data is visible but whose priv->devcom
> backpointer is still NULL.
> 
> MPV_DEVCOM_MASTER_UP already carries the sender/master mlx5e private
> data as event_data. The ready bit is stored on the shared devcom
> component, not on an individual peer. Use the sender devcom when
> marking the MPV component ready.
> 
> This preserves the readiness transition while avoiding a NULL
> dereference of the peer devcom pointer during affiliation replay after
> PCI error recovery.
> 
> Fixes: bf11485f8419 ("net/mlx5: Register mlx5e priv to devcom in MPV mode")
> Assisted-by: Codex:gpt-5
> Signed-off-by: Manjunath Patil <manjunath.b.patil@oracle.com>
> Cc: stable@vger.kernel.org # 6.7+
> ---

Thanks for your patch and sorry for the late response.

>   drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 7 +++++--
>   1 file changed, 5 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> index 8f2b3abe0092..f7ff20b97e8c 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> @@ -211,11 +211,14 @@ static void mlx5e_disable_async_events(struct mlx5e_priv *priv)
>   
>   static int mlx5e_devcom_event_mpv(int event, void *my_data, void *event_data)
>   {
> -	struct mlx5e_priv *slave_priv = my_data;
> +	struct mlx5e_priv *master_priv = event_data;
>   

makes sense.

>   	switch (event) {
>   	case MPV_DEVCOM_MASTER_UP:
> -		mlx5_devcom_comp_set_ready(slave_priv->devcom, true);
> +		if (!master_priv || !master_priv->devcom)
> +			return -EINVAL;

is this currently possible? or just being defensive?
if this return is unreachable I'd drop it.

> +
> +		mlx5_devcom_comp_set_ready(master_priv->devcom, true);
>   		break;
>   	case MPV_DEVCOM_MASTER_DOWN:
>   		/* no need for comp set ready false since we unregister after


^ permalink raw reply

* Re: [PATCH v3 net] net: watchdog: fix refcount tracking races
From: Eric Dumazet @ 2026-06-22  8:59 UTC (permalink / raw)
  To: Marek Szyprowski
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
	netdev, eric.dumazet, syzbot+381d82bbf0253710b35d,
	syzbot+3479efbc2821cb2a79f2
In-Reply-To: <a443376e-5187-4268-93b3-58047ef113a8@samsung.com>

On Wed, Jun 17, 2026 at 3:48 AM Marek Szyprowski
<m.szyprowski@samsung.com> wrote:
>
> Dear All,
>
> On 11.06.2026 17:27, Eric Dumazet wrote:
> > Blamed commit converted the untracked dev_hold()/dev_put() calls
> > in the watchdog code to use the tracked dev_hold_track()/dev_put_track()
> > (which were later renamed/interfaced to netdev_hold() and netdev_put()).
> >
> > By introducing dev->watchdog_dev_tracker to store the
> > reference tracking information without adding synchronization
> > between netdev_watchdog_up() and dev_watchdog(), it enabled the
> > race condition where this pointer could be overwritten or freed
> > concurrently, leading to the list corruption crash syzbot reported:
> >
> > list_del corruption, ffff888114a18c00->next is NULL
> >  kernel BUG at lib/list_debug.c:52 !
> > Oops: invalid opcode: 0000 [#1] SMP KASAN PTI
> > CPU: 1 UID: 0 PID: 91 Comm: kworker/u8:5 Not tainted syzkaller #0 PREEMPT(lazy)
> > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/09/2026
> > Workqueue: events_unbound linkwatch_event
> >  RIP: 0010:__list_del_entry_valid_or_report.cold+0x22/0x2a lib/list_debug.c:52
> > Call Trace:
> >  <TASK>
> >   __list_del_entry_valid include/linux/list.h:132 [inline]
> >   __list_del_entry include/linux/list.h:246 [inline]
> >   list_move_tail include/linux/list.h:341 [inline]
> >   ref_tracker_free+0x1a7/0x6c0 lib/ref_tracker.c:329
> >   netdev_tracker_free include/linux/netdevice.h:4491 [inline]
> >   netdev_put include/linux/netdevice.h:4508 [inline]
> >   netdev_put include/linux/netdevice.h:4504 [inline]
> >   netdev_watchdog_down net/sched/sch_generic.c:600 [inline]
> >   dev_deactivate_many+0x28c/0xfe0 net/sched/sch_generic.c:1363
> >   dev_deactivate+0x109/0x1d0 net/sched/sch_generic.c:1397
> >   linkwatch_do_dev net/core/link_watch.c:184 [inline]
> >   linkwatch_do_dev+0xd3/0x120 net/core/link_watch.c:166
> >   __linkwatch_run_queue+0x3a5/0x810 net/core/link_watch.c:240
> >   linkwatch_event+0x8f/0xc0 net/core/link_watch.c:314
> >   process_one_work+0xa0e/0x1980 kernel/workqueue.c:3314
> >   process_scheduled_works kernel/workqueue.c:3397 [inline]
> >   worker_thread+0x5ef/0xe50 kernel/workqueue.c:3478
> >   kthread+0x370/0x450 kernel/kthread.c:436
> >   ret_from_fork+0x69a/0xc80 arch/x86/kernel/process.c:158
> >   ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
> >
> > This patch has three coordinated parts:
> >
> > 1) Add dev->watchdog_lock and dev->watchdog_ref_held to serialize watchdog operations.
> >
> > 2) Remove netdev_watchdog_up() call from netif_carrier_on():
> >    This ensures netdev_watchdog_up() is only called from process/BH context
> >    (via linkwatch workqueue dev_activate()), allowing us to use
> >    spin_lock_bh() for synchronization.
> >
> > 3) Synchronize watchdog up and watchdog timer:
> >    Protect netdev_watchdog_up() with tx_global_lock and watchdog_lock.
> >    Only allocate a new tracker in netdev_watchdog_up() if one is
> >    not already present.
> >    In dev_watchdog(), ensure we don't release the tracker if the
> >    timer was rescheduled either by dev_watchdog() itself or concurrently
> >    by netdev_watchdog_up().
> >
> > Fixes: f12bf6f3f942 ("net: watchdog: add net device refcount tracker")
> > Reported-by: syzbot+381d82bbf0253710b35d@syzkaller.appspotmail.com
> > Closes: https://lore.kernel.org/netdev/6a26b751.c25708ab.1b19ef.0013.GAE@google.com/T/#u
> > Tested-by: syzbot+3479efbc2821cb2a79f2@syzkaller.appspotmail.com
> > Signed-off-by: Eric Dumazet <edumazet@google.com>
> This patch landed recently in linux-next as commit 8eed5519e496 ("net: watchdog:
> fix refcount tracking races"). In my tests I found that it causes the following
> deadlock during system suspend/resume on QEmu's ARM64bit 'virt' machine:
>
> root@target:~# time rtcwake -s10 -mmem
> rtcwake: assuming RTC uses UTC ...
> rtcwake: wakeup from "mem" using /dev/rtc0 at Wed Jun 17 10:46:12 2026
> PM: suspend entry (s2idle)
> Filesystems sync: 0.055 seconds
> Freezing user space processes
> Freezing user space processes completed (elapsed 0.006 seconds)
> OOM killer disabled.
> Freezing remaining freezable tasks
> Freezing remaining freezable tasks completed (elapsed 0.003 seconds)
>
> ============================================
> WARNING: possible recursive locking detected
> 7.1.0-rc7+ #13003 Not tainted
> --------------------------------------------
> rtcwake/254 is trying to acquire lock:
> ffff000006de64e8 (&dev->tx_global_lock){+.-.}-{3:3}, at: netdev_watchdog_up+0x40/0x108
>
> but task is already holding lock:
> ffff000006de64e8 (&dev->tx_global_lock){+.-.}-{3:3}, at: netif_tx_lock+0x1c/0x34
>
> other info that might help us debug this:
>  Possible unsafe locking scenario:
>
>        CPU0
>        ----
>   lock(&dev->tx_global_lock);
>   lock(&dev->tx_global_lock);
>
>  *** DEADLOCK ***
>
>  May be due to missing lock nesting notation
>
> 6 locks held by rtcwake/254:
>  #0: ffff0000071ab3e8 (sb_writers#5){.+.+}-{0:0}, at: vfs_write+0x1ec/0x35c
>  #1: ffff00000d22c480 (&of->mutex#2){+.+.}-{4:4}, at: kernfs_fop_write_iter+0xf0/0x1c4
>  #2: ffff0000049162c8 (kn->active#61){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x100/0x1c4
>  #3: ffffaa79533c03b0 (system_transition_mutex){+.+.}-{4:4}, at: pm_suspend+0x98/0x608
>  #4: ffff000005e3a138 (&dev->mutex){....}-{4:4}, at: device_resume+0xb4/0x254
>  #5: ffff000006de64e8 (&dev->tx_global_lock){+.-.}-{3:3}, at: netif_tx_lock+0x1c/0x34
>
> stack backtrace:
> CPU: 1 UID: 0 PID: 254 Comm: rtcwake Not tainted 7.1.0-rc7+ #13003 PREEMPT
> Hardware name: linux,dummy-virt (DT)
> Call trace:
>  show_stack+0x18/0x24 (C)
>  dump_stack_lvl+0x90/0xd0
>  dump_stack+0x18/0x24
>  print_deadlock_bug+0x260/0x350
>  __lock_acquire+0x11b8/0x225c
>  lock_acquire+0x1c4/0x3f0
>  _raw_spin_lock_bh+0x50/0x68
>  netdev_watchdog_up+0x40/0x108
>  netif_device_attach+0x9c/0xb0
>  virtnet_restore+0x100/0x21c
>  virtio_device_restore_priv+0x11c/0x1d0
>  virtio_device_restore+0x14/0x20
>  virtio_mmio_restore+0x34/0x40
>  platform_pm_resume+0x2c/0x68
>  dpm_run_callback+0xa0/0x240
>  device_resume+0x120/0x254
>  dpm_resume+0x1f8/0x2ec
>  dpm_resume_end+0x18/0x34
>  suspend_devices_and_enter+0x1d0/0x990
>  pm_suspend+0x1ec/0x608
>  state_store+0x8c/0x110
>  kobj_attr_store+0x18/0x2c
>  sysfs_kf_write+0x50/0x7c
>  kernfs_fop_write_iter+0x130/0x1c4
>  vfs_write+0x2b8/0x35c
>  ksys_write+0x6c/0x104
>  __arm64_sys_write+0x1c/0x28
>  invoke_syscall+0x54/0x110
>  el0_svc_common.constprop.0+0x40/0xe8
>  do_el0_svc+0x20/0x2c
>  el0_svc+0x54/0x338
>  el0t_64_sync_handler+0xa0/0xe4
>  el0t_64_sync+0x198/0x19c
>
>
> Reverting $subject on top of linux-next fixes this issue.

Thanks for the report Marek!

Acquiring tx_global_lock in netdev_watchdog_up() appears unnecessary anyway
because the critical state (timer and refcount tracker) is already
protected by dev->watchdog_lock.

Could you try this patch?

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 3f1c510df850dbdbaf10d483547c7b1f3a5d5482..ef2b4bf51564173751c74fefe17e3913ed2fa056 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -594,9 +594,8 @@ void netdev_watchdog_up(struct net_device *dev)
                return;
        if (dev->watchdog_timeo <= 0)
                dev->watchdog_timeo = 5*HZ;
-       spin_lock_bh(&dev->tx_global_lock);

-       spin_lock(&dev->watchdog_lock);
+       spin_lock_bh(&dev->watchdog_lock);
        if (!mod_timer(&dev->watchdog_timer,
                       round_jiffies(jiffies + dev->watchdog_timeo))) {
                if (!dev->watchdog_ref_held) {
@@ -605,9 +604,7 @@ void netdev_watchdog_up(struct net_device *dev)
                        dev->watchdog_ref_held = true;
                }
        }
-       spin_unlock(&dev->watchdog_lock);
-
-       spin_unlock_bh(&dev->tx_global_lock);
+       spin_unlock_bh(&dev->watchdog_lock);
 }
 EXPORT_SYMBOL_GPL(netdev_watchdog_up);

^ permalink raw reply

* Re: [PATCH net v2] amt: don't read the IP source address from a reallocated skb header
From: Taehee Yoo @ 2026-06-22  8:58 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Michael Bommarito, David S . Miller, Paolo Abeni, Eric Dumazet,
	Andrew Lunn, netdev, linux-kernel
In-Reply-To: <20260621150011.33c2fe80@kernel.org>

On Mon, Jun 22, 2026 at 7:00 AM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Wed, 17 Jun 2026 08:34:43 -0400 Michael Bommarito wrote:
> > amt_update_handler() caches iph = ip_hdr(skb) and then calls
> > pskb_may_pull(). pskb_may_pull() can reallocate the skb head: the new
> > head is allocated and the old one is freed. The cached iph is not
> > refreshed, so the following tunnel lookup reads iph->saddr from the
> > freed head. On an AMT relay this lookup runs for every incoming
> > membership update, before the update's nonce and response MAC are
> > validated.
> >
> > The sibling handlers amt_multicast_data_handler() and
> > amt_membership_query_handler() re-read ip_hdr() after the pull and are
> > not affected; only amt_update_handler() keeps the pre-pull pointer.
>
> Sashikos point out a bunch more of these in AMT:
> https://sashiko.dev/#/patchset/20260617123443.3586930-1-michael.bommarito@gmail.com
> https://netdev-ai.bots.linux.dev/sashiko/#/patchset/20260617123443.3586930-1-michael.bommarito@gmail.com
>
> Let's fix them all with one patch?

Agreed.
Michael, could you please fix the remaining ones Sashiko flagged?

Thanks a lot!
Taehee Yoo

> --
> pw-bot: cr

^ permalink raw reply

* Re: [patch V2 18/25] timekeeping: Prepare for cross timestamps on arbitrary clock IDs
From: David Woodhouse @ 2026-06-22  8:55 UTC (permalink / raw)
  To: Thomas Gleixner, LKML
  Cc: Miroslav Lichvar, John Stultz, Stephen Boyd, Anna-Maria Behnsen,
	Frederic Weisbecker, thomas.weissschuh, Arthur Kiyanovski,
	Rodolfo Giometti, Vincent Donnefort, Marc Zyngier, Oliver Upton,
	kvmarm, Oliver Upton, Richard Cochran, netdev, Takashi Iwai,
	Miri Korenblit, Johannes Berg, Jacob Keller, Tony Nguyen,
	Saeed Mahameed, Peter Hilber, Michael S. Tsirkin, virtualization,
	linux-wireless, linux-sound, Vadim Fedorenko
In-Reply-To: <20260529195557.846634842@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 890 bytes --]

On Fri, 2026-05-29 at 22:01 +0200, Thomas Gleixner wrote:
> From: Thomas Gleixner <tglx@kernel.org>
> 
> PTP device system crosstime stamps support only CLOCK_REALTIME, which is
> meaningless for AUX clocks. The PTP core hands in the clock ID already, so
> prepare the core code to honor it.
> 
>  - Add a new sys_systime field to struct system_device_crosststamp which
>    aliases the sys_realtime field. Once all users are converted
>    sys_realtime can be removed.
> 
>  - Prepare get_device_system_crosststamp() and the related code for it by
>    switching to sys_systime and providing the initial changes to utilize
>    different time keepers.
> 
> No functional change intended.

We ended up with ktime_get_snapshot_id() also supporting CLOCK_BOOTTIME
and CLOCK_MONOTONIC_RAW, but not get_device_system_crosststamp().
Should we make that consistent?

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5069 bytes --]

^ permalink raw reply

* RE: [Intel-wired-lan] [PATCH 1/2] igc: Wait for MAC passthrough after reset
From: Loktionov, Aleksandr @ 2026-06-22  8:54 UTC (permalink / raw)
  To: kao, acelan, Ruinskiy, Dima
  Cc: Nguyen, Anthony L, Kitszel, Przemyslaw, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	intel-wired-lan@lists.osuosl.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <ajiHH-RaHUjgraMh@acelan-Precision-5480>



> -----Original Message-----
> From: Chia-Lin Kao (AceLan) <acelan.kao@canonical.com>
> Sent: Monday, June 22, 2026 3:58 AM
> To: Ruinskiy, Dima <dima.ruinskiy@intel.com>
> Cc: Loktionov, Aleksandr <aleksandr.loktionov@intel.com>; Nguyen,
> Anthony L <anthony.l.nguyen@intel.com>; Kitszel, Przemyslaw
> <przemyslaw.kitszel@intel.com>; Andrew Lunn <andrew+netdev@lunn.ch>;
> David S. Miller <davem@davemloft.net>; Eric Dumazet
> <edumazet@google.com>; Jakub Kicinski <kuba@kernel.org>; Paolo Abeni
> <pabeni@redhat.com>; intel-wired-lan@lists.osuosl.org;
> netdev@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [Intel-wired-lan] [PATCH 1/2] igc: Wait for MAC
> passthrough after reset
> 
> On Thu, Jun 18, 2026 at 11:51:35AM +0300, Ruinskiy, Dima wrote:
> > On 18/06/2026 10:55, Loktionov, Aleksandr wrote:
> > >
> > >
> > > > -----Original Message-----
> > > > From: Intel-wired-lan <intel-wired-lan-bounces@osuosl.org> On
> > > > Behalf Of Chia-Lin Kao (AceLan) via Intel-wired-lan
> > > > Sent: Thursday, June 18, 2026 9:33 AM
> > > > To: Nguyen, Anthony L <anthony.l.nguyen@intel.com>; Kitszel,
> > > > Przemyslaw <przemyslaw.kitszel@intel.com>
> > > > Cc: Andrew Lunn <andrew+netdev@lunn.ch>; David S. Miller
> > > > <davem@davemloft.net>; Eric Dumazet <edumazet@google.com>; Jakub
> > > > Kicinski <kuba@kernel.org>; Paolo Abeni <pabeni@redhat.com>;
> > > > > intel-wired-lan@lists.osuosl.org; netdev@vger.kernel.org;
> > > > > linux-kernel@vger.kernel.org
> > > > Subject: [Intel-wired-lan] [PATCH 1/2] igc: Wait for MAC
> > > > passthrough after reset
> > > >
> > > > > Some systems support MAC passthrough for dock Ethernet
> > > > > controllers by having firmware rewrite the receive address
> > > > > registers after the controller reset completes.
> > > > >
> > > > > igc resets the controller before reading RAL0/RAH0, so that
> > > > > reset can restore the controller native MAC address
> > > > > temporarily. If the driver reads the registers immediately, it
> > > > > can race the firmware rewrite and keep the native dock MAC
> > > > > instead of the host passthrough MAC.
> > > > >
> > > > > For LMVP devices, poll RAL0/RAH0 after reset and before reading
> > > > > the MAC address. Stop once the address registers change to
> > > > > another valid Ethernet address, allowing firmware a bounded
> > > > > window to complete the passthrough update.
> > > >
> Hi Aleksandr and Dima,
> 
> Let me answer your questions below.
> 
> > > Good day, Chia-Lin
> > >
> > > It'd be great if you could share more details on how to
> > > reproduce the issue.
> > >
> > > What exact hardware setup is affected (dock model, NIC, system)?
> We've observed this issue for a long time. We first encountered it
> on Lenovo's P15 Gen 2 (type 20YQ, 20YR) laptops (ThinkPad) in 2021
> and added a 600ms delay.
> Recently, we encountered the same issue on Dell, too, and then
> increased the delay to 1000ms.
> And now, the issue occurs again.
> 
> https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1942999
> https://lore.kernel.org/lkml/20210702045120.22855-2-aaron.ma@canonical.com/
> https://bugs.launchpad.net/ubuntu/+source/linux-oem-6.17/+bug/2143197
> 
> > > Which firmware/BIOS version?
> It isn't limited to a single firmware or BIOS version, nor to a
> single hardware model or brand.
> 
> > > How often does the race trigger?
> It may happen when re-plugging the dock cable.
> With the mainline kernel, it's easy to reproduce the issue by
> re-plugging the dock cable.
> 
> > > Do you have a way to reliably reproduce it?
> Yes, I can find some machines to reproduce the issue reliably.
> 
> > >
> > > Also, what is the observed behavior vs. expected behavior? For
> > > example, which MAC address is seen and which one should be used?
> Here are the debugging logs; fc:4c:ea:ae:a1:e3 is the MAC address of
> the machine, and c4:d6:d3:83:75:d1 is the MAC of the dock.
> 
> It gets the correct passthrough MAC address after bootup and the first
> re-plug at 40s, and fails to update the MAC address in time after
> a couple of re-plugs.
> 
> [    0.689873] igc 0000:70:00.0: MAC debug before reset_hw:
> RAL0=0xaeea4cfc RAH0=0x8000e3a1 RAR0=fc:4c:ea:ae:a1:e3 valid=1
> [    0.755187] igc 0000:70:00.0: MAC debug after reset_hw:
> RAL0=0x83d3d6c4 RAH0=0x8000d175 RAR0=c4:d6:d3:83:75:d1 valid=1
> [    0.755576] igc 0000:70:00.0: MAC debug:
> eth_platform_get_mac_address ret=-19, reading RAR0/NVM fallback
> [    0.755582] igc 0000:70:00.0: MAC debug: read_mac_addr ret=0
> addr=fc:4c:ea:ae:a1:e3 perm_addr=fc:4c:ea:ae:a1:e3
> [    4.687730] igc 0000:70:00.0: MAC debug firmware: fwnode=<none>
> props(mac=0 local=0 address=0) fwnode_ret=-19
> fwnode_mac=00:00:00:00:00:00 device_ret=-2
> device_mac=00:00:00:00:00:00 is_tbt=0 external=0 hotplug_bridge=0
> [    4.687739] igc 0000:70:00.0: MAC debug before reset_hw:
> RAL0=0xaeea4cfc RAH0=0x8000e3a1 RAR0=fc:4c:ea:ae:a1:e3 valid=1
> [    4.748545] igc 0000:70:00.0: MAC debug after reset_hw:
> RAL0=0x83d3d6c4 RAH0=0x8000d175 RAR0=c4:d6:d3:83:75:d1 valid=1
> [    4.748937] igc 0000:70:00.0: MAC debug:
> eth_platform_get_mac_address ret=-19, reading RAR0/NVM fallback
> [    4.748944] igc 0000:70:00.0: MAC debug: read_mac_addr ret=0
> addr=fc:4c:ea:ae:a1:e3 perm_addr=fc:4c:ea:ae:a1:e3
> [   40.892715] igc 0000:70:00.0: MAC debug firmware: fwnode=<none>
> props(mac=0 local=0 address=0) fwnode_ret=-19
> fwnode_mac=00:00:00:00:00:00 device_ret=-2
> device_mac=00:00:00:00:00:00 is_tbt=0 external=0 hotplug_bridge=0
> [   40.892724] igc 0000:70:00.0: MAC debug before reset_hw:
> RAL0=0x83d3d6c4 RAH0=0x8000d175 RAR0=c4:d6:d3:83:75:d1 valid=1
> [   40.953524] igc 0000:70:00.0: MAC debug after reset_hw:
> RAL0=0x83d3d6c4 RAH0=0x8000d175 RAR0=c4:d6:d3:83:75:d1 valid=1
> [   40.953933] igc 0000:70:00.0: MAC debug:
> eth_platform_get_mac_address ret=-19, reading RAR0/NVM fallback
> [   40.953941] igc 0000:70:00.0: MAC debug: read_mac_addr ret=0
> addr=c4:d6:d3:83:75:d1 perm_addr=c4:d6:d3:83:75:d1
> ...
> [  307.387282] igc 0000:70:00.0: MAC poll change at 700ms:
> RAL0=0xaeea4cfc RAH0=0x8000e3a1 RAR0=fc:4c:ea:ae:a1:e3 valid=1
> prev=c4:d6:d3:83:75:d1
> [  328.826084] igc 0000:38:00.0: MAC poll change at 1000ms:
> RAL0=0xaeea4cfc RAH0=0x8000e3a1 RAR0=fc:4c:ea:ae:a1:e3 valid=1
> prev=c4:d6:d3:83:75:d1
> [  429.070519] igc 0000:38:00.0: MAC poll change at 1100ms:
> RAL0=0xaeea4cfc RAH0=0x8000e3a1 RAR0=fc:4c:ea:ae:a1:e3 valid=1
> prev=c4:d6:d3:83:75:d1
> [  466.509571] igc 0000:70:00.0: MAC poll change at 1000ms:
> RAL0=0xaeea4cfc RAH0=0x8000e3a1 RAR0=fc:4c:ea:ae:a1:e3 valid=1
> prev=c4:d6:d3:83:75:d1
> 

Please include this info in the commit message, so users can grep for the error and find the fix.
Exact bash commands for reproduction can also help administrators to decide whether they need to patch their OS.

Thank you

...


^ permalink raw reply

* Re: [PATCH v3 1/7] list: Add mutable iterator variants
From: Christian König @ 2026-06-22  8:51 UTC (permalink / raw)
  To: Kaitao Cheng, Andrew Morton, David Hildenbrand, Jens Axboe,
	Tejun Heo, Alexander Viro, Christian Brauner, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Johannes Weiner, Peter Zijlstra,
	Ingo Molnar, Arnaldo Carvalho de Melo, Namhyung Kim,
	Thomas Gleixner, Juri Lelli, Vincent Guittot, Paul Moore,
	Andy Shevchenko, Paul E. McKenney, Shakeel Butt
  Cc: David Howells, Simona Vetter, Randy Dunlap, Luca Ceresoli,
	Philipp Stanner, linux-block, linux-kernel, cgroups,
	linux-ntfs-dev, linux-fsdevel, io-uring, audit, bpf, netdev,
	dri-devel, linux-perf-users, linux-trace-kernel, kexec,
	live-patching, linux-modules, linux-crypto, linux-pm, rcu,
	sched-ext, linux-mm, virtualization, damon, llvm, Kaitao Cheng
In-Reply-To: <20260622040533.29824-2-kaitao.cheng@linux.dev>

On 6/22/26 06:05, Kaitao Cheng wrote:
> From: Kaitao Cheng <chengkaitao@kylinos.cn>
> 
> The list_for_each*_safe() helpers are used when the loop body may
> remove the current entry.  Their API exposes the temporary cursor at
> every call site, even though most users only need it for the iterator
> implementation and never reference it in the loop body.
> 
> Add *_mutable() variants for list and hlist iteration.  The new helpers
> support both forms: callers may keep passing an explicit temporary cursor
> when they need to inspect or reset it, or omit it and let the helper use
> a unique internal cursor.

That sounds like a bad idea to me. The macro should really do one job, and do it as well as it can.

> This makes call sites that only mutate the list through the current entry
> less noisy, while keeping the existing *_safe() helpers available for
> compatibility.

This can be perfectly used for code which really needs the separate variable for the next entry.

Regards,
Christian.


> 
> Signed-off-by: Kaitao Cheng <chengkaitao@kylinos.cn>
> ---
>  include/linux/list.h | 269 +++++++++++++++++++++++++++++++++++++------
>  1 file changed, 231 insertions(+), 38 deletions(-)
> 
> diff --git a/include/linux/list.h b/include/linux/list.h
> index 09d979976b3b..1081def7cea9 100644
> --- a/include/linux/list.h
> +++ b/include/linux/list.h
> @@ -7,6 +7,7 @@
>  #include <linux/stddef.h>
>  #include <linux/poison.h>
>  #include <linux/const.h>
> +#include <linux/args.h>
>  
>  #include <asm/barrier.h>
>  
> @@ -763,28 +764,72 @@ static inline void list_splice_tail_init(struct list_head *list,
>  #define list_for_each_prev(pos, head) \
>  	for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev)
>  
> -/**
> - * list_for_each_safe - iterate over a list safe against removal of list entry
> - * @pos:	the &struct list_head to use as a loop cursor.
> - * @n:		another &struct list_head to use as temporary storage
> - * @head:	the head for your list.
> +/*
> + * list_for_each_safe is an old interface, use list_for_each_mutable instead.
>   */
>  #define list_for_each_safe(pos, n, head) \
>  	for (pos = (head)->next, n = pos->next; \
>  	     !list_is_head(pos, (head)); \
>  	     pos = n, n = pos->next)
>  
> +#define __list_for_each_mutable_internal(pos, tmp, head)		\
> +	for (typeof(pos) tmp = (pos = (head)->next)->next;		\
> +	     !list_is_head(pos, (head));				\
> +	     pos = tmp, tmp = pos->next)
> +
> +#define __list_for_each_mutable1(pos, head)				\
> +	__list_for_each_mutable_internal(pos, __UNIQUE_ID(next), head)
> +
> +#define __list_for_each_mutable2(pos, next, head)			\
> +	list_for_each_safe(pos, next, head)
> +
>  /**
> - * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry
> + * list_for_each_mutable - iterate over a list safe against entry removal
>   * @pos:	the &struct list_head to use as a loop cursor.
> - * @n:		another &struct list_head to use as temporary storage
> - * @head:	the head for your list.
> + * @...:	either (head) or (next, head)
> + *
> + * next:	another &struct list_head to use as optional temporary storage.
> + *		The temporary cursor is internal unless explicitly supplied by
> + *		the caller.
> + * head:	the head for your list.
> + */
> +#define list_for_each_mutable(pos, ...)					\
> +	CONCATENATE(__list_for_each_mutable, COUNT_ARGS(__VA_ARGS__))	\
> +		(pos, __VA_ARGS__)
> +
> +/*
> + * list_for_each_prev_safe is an old interface, use list_for_each_prev_mutable instead.
>   */
>  #define list_for_each_prev_safe(pos, n, head) \
>  	for (pos = (head)->prev, n = pos->prev; \
>  	     !list_is_head(pos, (head)); \
>  	     pos = n, n = pos->prev)
>  
> +#define __list_for_each_prev_mutable_internal(pos, tmp, head)		\
> +	for (typeof(pos) tmp = (pos = (head)->prev)->prev;		\
> +	     !list_is_head(pos, (head));				\
> +	     pos = tmp, tmp = pos->prev)
> +
> +#define __list_for_each_prev_mutable1(pos, head)			\
> +	__list_for_each_prev_mutable_internal(pos, __UNIQUE_ID(prev), head)
> +
> +#define __list_for_each_prev_mutable2(pos, prev, head)			\
> +	list_for_each_prev_safe(pos, prev, head)
> +
> +/**
> + * list_for_each_prev_mutable - iterate over a list backwards safe against entry removal
> + * @pos:	the &struct list_head to use as a loop cursor.
> + * @...:	either (head) or (prev, head)
> + *
> + * prev:	another &struct list_head to use as optional temporary storage.
> + *		The temporary cursor is internal unless explicitly supplied by
> + *		the caller.
> + * head:	the head for your list.
> + */
> +#define list_for_each_prev_mutable(pos, ...)				\
> +	CONCATENATE(__list_for_each_prev_mutable, COUNT_ARGS(__VA_ARGS__)) \
> +		(pos, __VA_ARGS__)
> +
>  /**
>   * list_count_nodes - count nodes in the list
>   * @head:	the head for your list.
> @@ -895,12 +940,8 @@ static inline size_t list_count_nodes(struct list_head *head)
>  	for (; !list_entry_is_head(pos, head, member);			\
>  	     pos = list_prev_entry(pos, member))
>  
> -/**
> - * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
> - * @pos:	the type * to use as a loop cursor.
> - * @n:		another type * to use as temporary storage
> - * @head:	the head for your list.
> - * @member:	the name of the list_head within the struct.
> +/*
> + * list_for_each_entry_safe is an old interface, use list_for_each_entry_mutable instead.
>   */
>  #define list_for_each_entry_safe(pos, n, head, member)			\
>  	for (pos = list_first_entry(head, typeof(*pos), member),	\
> @@ -908,15 +949,36 @@ static inline size_t list_count_nodes(struct list_head *head)
>  	     !list_entry_is_head(pos, head, member); 			\
>  	     pos = n, n = list_next_entry(n, member))
>  
> +#define __list_for_each_entry_mutable_internal(pos, tmp, head, member)	\
> +	for (typeof(pos) tmp = list_next_entry(pos =			\
> +		list_first_entry(head, typeof(*pos), member), member);	\
> +	     !list_entry_is_head(pos, head, member);			\
> +	     pos = tmp, tmp = list_next_entry(tmp, member))
> +
> +#define __list_for_each_entry_mutable2(pos, head, member)		\
> +	__list_for_each_entry_mutable_internal(pos, __UNIQUE_ID(next), head, member)
> +
> +#define __list_for_each_entry_mutable3(pos, next, head, member)		\
> +	list_for_each_entry_safe(pos, next, head, member)
> +
>  /**
> - * list_for_each_entry_safe_continue - continue list iteration safe against removal
> + * list_for_each_entry_mutable - iterate over a list safe against entry removal
>   * @pos:	the type * to use as a loop cursor.
> - * @n:		another type * to use as temporary storage
> - * @head:	the head for your list.
> - * @member:	the name of the list_head within the struct.
> + * @...:	either (head, member) or (next, head, member)
>   *
> - * Iterate over list of given type, continuing after current point,
> - * safe against removal of list entry.
> + * next:	another type * to use as optional temporary storage. The
> + *		temporary cursor is internal unless explicitly supplied by the
> + *		caller.
> + * head:	the head for your list.
> + * member:	the name of the list_head within the struct.
> + */
> +#define list_for_each_entry_mutable(pos, ...)				\
> +	CONCATENATE(__list_for_each_entry_mutable, COUNT_ARGS(__VA_ARGS__)) \
> +		(pos, __VA_ARGS__)
> +
> +/*
> + * list_for_each_entry_safe_continue is an old interface,
> + * use list_for_each_entry_mutable_continue instead.
>   */
>  #define list_for_each_entry_safe_continue(pos, n, head, member) 		\
>  	for (pos = list_next_entry(pos, member), 				\
> @@ -924,30 +986,79 @@ static inline size_t list_count_nodes(struct list_head *head)
>  	     !list_entry_is_head(pos, head, member);				\
>  	     pos = n, n = list_next_entry(n, member))
>  
> +#define __list_for_each_entry_mutable_continue_internal(pos, tmp, head, member) \
> +	for (typeof(pos) tmp = list_next_entry(pos =			\
> +		list_next_entry(pos, member), member);			\
> +	     !list_entry_is_head(pos, head, member);			\
> +	     pos = tmp, tmp = list_next_entry(tmp, member))
> +
> +#define __list_for_each_entry_mutable_continue2(pos, head, member)	\
> +	__list_for_each_entry_mutable_continue_internal(pos,		\
> +		__UNIQUE_ID(next), head, member)
> +
> +#define __list_for_each_entry_mutable_continue3(pos, next, head, member) \
> +	list_for_each_entry_safe_continue(pos, next, head, member)
> +
>  /**
> - * list_for_each_entry_safe_from - iterate over list from current point safe against removal
> + * list_for_each_entry_mutable_continue - continue list iteration safe against removal
>   * @pos:	the type * to use as a loop cursor.
> - * @n:		another type * to use as temporary storage
> - * @head:	the head for your list.
> - * @member:	the name of the list_head within the struct.
> + * @...:	either (head, member) or (next, head, member)
>   *
> - * Iterate over list of given type from current point, safe against
> - * removal of list entry.
> + * next:	another type * to use as optional temporary storage. The
> + *		temporary cursor is internal unless explicitly supplied by the
> + *		caller.
> + * head:	the head for your list.
> + * member:	the name of the list_head within the struct.
> + *
> + * Iterate over list of given type, continuing after current point,
> + * safe against removal of list entry.
> + */
> +#define list_for_each_entry_mutable_continue(pos, ...)			\
> +	CONCATENATE(__list_for_each_entry_mutable_continue,		\
> +		COUNT_ARGS(__VA_ARGS__))(pos, __VA_ARGS__)
> +
> +/*
> + * list_for_each_entry_safe_from is an old interface,
> + * use list_for_each_entry_mutable_from instead.
>   */
>  #define list_for_each_entry_safe_from(pos, n, head, member) 			\
>  	for (n = list_next_entry(pos, member);					\
>  	     !list_entry_is_head(pos, head, member);				\
>  	     pos = n, n = list_next_entry(n, member))
>  
> +#define __list_for_each_entry_mutable_from_internal(pos, tmp, head, member) \
> +	for (typeof(pos) tmp = list_next_entry(pos, member);		\
> +	     !list_entry_is_head(pos, head, member);			\
> +	     pos = tmp, tmp = list_next_entry(tmp, member))
> +
> +#define __list_for_each_entry_mutable_from2(pos, head, member)		\
> +	__list_for_each_entry_mutable_from_internal(pos,		\
> +		__UNIQUE_ID(next), head, member)
> +
> +#define __list_for_each_entry_mutable_from3(pos, next, head, member)	\
> +	list_for_each_entry_safe_from(pos, next, head, member)
> +
>  /**
> - * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal
> + * list_for_each_entry_mutable_from - iterate over list from current point safe against removal
>   * @pos:	the type * to use as a loop cursor.
> - * @n:		another type * to use as temporary storage
> - * @head:	the head for your list.
> - * @member:	the name of the list_head within the struct.
> + * @...:	either (head, member) or (next, head, member)
>   *
> - * Iterate backwards over list of given type, safe against removal
> - * of list entry.
> + * next:	another type * to use as optional temporary storage. The
> + *		temporary cursor is internal unless explicitly supplied by the
> + *		caller.
> + * head:	the head for your list.
> + * member:	the name of the list_head within the struct.
> + *
> + * Iterate over list of given type from current point, safe against
> + * removal of list entry.
> + */
> +#define list_for_each_entry_mutable_from(pos, ...)			\
> +	CONCATENATE(__list_for_each_entry_mutable_from,			\
> +		COUNT_ARGS(__VA_ARGS__))(pos, __VA_ARGS__)
> +
> +/*
> + * list_for_each_entry_safe_reverse is an old interface,
> + * use list_for_each_entry_mutable_reverse instead.
>   */
>  #define list_for_each_entry_safe_reverse(pos, n, head, member)		\
>  	for (pos = list_last_entry(head, typeof(*pos), member),		\
> @@ -955,6 +1066,37 @@ static inline size_t list_count_nodes(struct list_head *head)
>  	     !list_entry_is_head(pos, head, member); 			\
>  	     pos = n, n = list_prev_entry(n, member))
>  
> +#define __list_for_each_entry_mutable_reverse_internal(pos, tmp, head, member) \
> +	for (typeof(pos) tmp = list_prev_entry(pos =			\
> +		list_last_entry(head, typeof(*pos), member), member);	\
> +	     !list_entry_is_head(pos, head, member);			\
> +	     pos = tmp, tmp = list_prev_entry(tmp, member))
> +
> +#define __list_for_each_entry_mutable_reverse2(pos, head, member)	\
> +	__list_for_each_entry_mutable_reverse_internal(pos,		\
> +		__UNIQUE_ID(prev), head, member)
> +
> +#define __list_for_each_entry_mutable_reverse3(pos, prev, head, member)	\
> +	list_for_each_entry_safe_reverse(pos, prev, head, member)
> +
> +/**
> + * list_for_each_entry_mutable_reverse - iterate backwards over list safe against removal
> + * @pos:	the type * to use as a loop cursor.
> + * @...:	either (head, member) or (prev, head, member)
> + *
> + * prev:	another type * to use as optional temporary storage. The
> + *		temporary cursor is internal unless explicitly supplied by the
> + *		caller.
> + * head:	the head for your list.
> + * member:	the name of the list_head within the struct.
> + *
> + * Iterate backwards over list of given type, safe against removal
> + * of list entry.
> + */
> +#define list_for_each_entry_mutable_reverse(pos, ...)			\
> +	CONCATENATE(__list_for_each_entry_mutable_reverse,		\
> +		COUNT_ARGS(__VA_ARGS__))(pos, __VA_ARGS__)
> +
>  /**
>   * list_safe_reset_next - reset a stale list_for_each_entry_safe loop
>   * @pos:	the loop cursor used in the list_for_each_entry_safe loop
> @@ -1189,6 +1331,31 @@ static inline void hlist_splice_init(struct hlist_head *from,
>  	for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \
>  	     pos = n)
>  
> +#define __hlist_for_each_mutable_internal(pos, tmp, head)		\
> +	for (typeof(pos) tmp = (pos = (head)->first) ? pos->next : NULL; \
> +	     pos;							\
> +	     pos = tmp, tmp = pos ? pos->next : NULL)
> +
> +#define __hlist_for_each_mutable1(pos, head)				\
> +	__hlist_for_each_mutable_internal(pos, __UNIQUE_ID(next), head)
> +
> +#define __hlist_for_each_mutable2(pos, next, head)			\
> +	hlist_for_each_safe(pos, next, head)
> +
> +/**
> + * hlist_for_each_mutable - iterate over a hlist safe against entry removal
> + * @pos:	the &struct hlist_node to use as a loop cursor.
> + * @...:	either (head) or (next, head)
> + *
> + * next:	another &struct hlist_node to use as optional temporary storage.
> + *		The temporary cursor is internal unless explicitly supplied by
> + *		the caller.
> + * head:	the head for your hlist.
> + */
> +#define hlist_for_each_mutable(pos, ...)				\
> +	CONCATENATE(__hlist_for_each_mutable, COUNT_ARGS(__VA_ARGS__))	\
> +		(pos, __VA_ARGS__)
> +
>  #define hlist_entry_safe(ptr, type, member) \
>  	({ typeof(ptr) ____ptr = (ptr); \
>  	   ____ptr ? hlist_entry(____ptr, type, member) : NULL; \
> @@ -1224,18 +1391,44 @@ static inline void hlist_splice_init(struct hlist_head *from,
>  	for (; pos;							\
>  	     pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
>  
> -/**
> - * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
> - * @pos:	the type * to use as a loop cursor.
> - * @n:		a &struct hlist_node to use as temporary storage
> - * @head:	the head for your list.
> - * @member:	the name of the hlist_node within the struct.
> +/*
> + * hlist_for_each_entry_safe is an old interface, use hlist_for_each_entry_mutable instead.
>   */
>  #define hlist_for_each_entry_safe(pos, n, head, member) 		\
>  	for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\
>  	     pos && ({ n = pos->member.next; 1; });			\
>  	     pos = hlist_entry_safe(n, typeof(*pos), member))
>  
> +#define __hlist_for_each_entry_mutable_internal(pos, tmp, head, member)	\
> +	for (struct hlist_node *tmp = (pos =				\
> +		hlist_entry_safe((head)->first, typeof(*pos), member)) ? \
> +		pos->member.next : NULL;				\
> +	     pos;							\
> +	     pos = hlist_entry_safe((tmp), typeof(*pos), member),	\
> +		tmp = pos ? pos->member.next : NULL)
> +
> +#define __hlist_for_each_entry_mutable2(pos, head, member)		\
> +	__hlist_for_each_entry_mutable_internal(pos,			\
> +		__UNIQUE_ID(next), head, member)
> +
> +#define __hlist_for_each_entry_mutable3(pos, next, head, member)	\
> +	hlist_for_each_entry_safe(pos, next, head, member)
> +
> +/**
> + * hlist_for_each_entry_mutable - iterate over hlist safe against entry removal
> + * @pos:	the type * to use as a loop cursor.
> + * @...:	either (head, member) or (next, head, member)
> + *
> + * next:	a &struct hlist_node to use as optional temporary storage. The
> + *		temporary cursor is internal unless explicitly supplied by the
> + *		caller.
> + * head:	the head for your hlist.
> + * member:	the name of the hlist_node within the struct.
> + */
> +#define hlist_for_each_entry_mutable(pos, ...)				\
> +	CONCATENATE(__hlist_for_each_entry_mutable,			\
> +		COUNT_ARGS(__VA_ARGS__))(pos, __VA_ARGS__)
> +
>  /**
>   * hlist_count_nodes - count nodes in the hlist
>   * @head:	the head for your hlist.


^ permalink raw reply

* Re: [PATCH v3 1/7] list: Add mutable iterator variants
From: David Laight @ 2026-06-22  8:42 UTC (permalink / raw)
  To: Kaitao Cheng
  Cc: Andrew Morton, David Hildenbrand, Jens Axboe, Tejun Heo,
	Alexander Viro, Christian Brauner, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Johannes Weiner, Peter Zijlstra,
	Ingo Molnar, Arnaldo Carvalho de Melo, Namhyung Kim,
	Thomas Gleixner, Juri Lelli, Vincent Guittot, Paul Moore,
	Andy Shevchenko, Paul E. McKenney, Shakeel Butt,
	Christian König, David Howells, Simona Vetter, Randy Dunlap,
	Luca Ceresoli, Philipp Stanner, linux-block, linux-kernel,
	cgroups, linux-ntfs-dev, linux-fsdevel, io-uring, audit, bpf,
	netdev, dri-devel, linux-perf-users, linux-trace-kernel, kexec,
	live-patching, linux-modules, linux-crypto, linux-pm, rcu,
	sched-ext, linux-mm, virtualization, damon, llvm, Kaitao Cheng
In-Reply-To: <20260622040533.29824-2-kaitao.cheng@linux.dev>

On Mon, 22 Jun 2026 12:05:31 +0800
Kaitao Cheng <kaitao.cheng@linux.dev> wrote:

> From: Kaitao Cheng <chengkaitao@kylinos.cn>
> 
> The list_for_each*_safe() helpers are used when the loop body may
> remove the current entry.  Their API exposes the temporary cursor at
> every call site, even though most users only need it for the iterator
> implementation and never reference it in the loop body.
> 
> Add *_mutable() variants for list and hlist iteration.  The new helpers
> support both forms: callers may keep passing an explicit temporary cursor
> when they need to inspect or reset it, or omit it and let the helper use
> a unique internal cursor.

I'm not really sure 'mutable' means anything either.
It is possible to make it valid for the loop body (or even other threads)
to delete arbitrary list items - but that needs significant extra overhead.

It might be worth doing something that doesn't need the extra variable,
but there is little point doing all the churn just to rename things.

> 
> This makes call sites that only mutate the list through the current entry
> less noisy, while keeping the existing *_safe() helpers available for
> compatibility.
> 
> Signed-off-by: Kaitao Cheng <chengkaitao@kylinos.cn>
> ---
>  include/linux/list.h | 269 +++++++++++++++++++++++++++++++++++++------
>  1 file changed, 231 insertions(+), 38 deletions(-)
> 
> diff --git a/include/linux/list.h b/include/linux/list.h
> index 09d979976b3b..1081def7cea9 100644
> --- a/include/linux/list.h
> +++ b/include/linux/list.h
> @@ -7,6 +7,7 @@
>  #include <linux/stddef.h>
>  #include <linux/poison.h>
>  #include <linux/const.h>
> +#include <linux/args.h>
>  
>  #include <asm/barrier.h>
>  
> @@ -763,28 +764,72 @@ static inline void list_splice_tail_init(struct list_head *list,
>  #define list_for_each_prev(pos, head) \
>  	for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev)
>  
> -/**
> - * list_for_each_safe - iterate over a list safe against removal of list entry
> - * @pos:	the &struct list_head to use as a loop cursor.
> - * @n:		another &struct list_head to use as temporary storage
> - * @head:	the head for your list.
> +/*
> + * list_for_each_safe is an old interface, use list_for_each_mutable instead.
>   */
>  #define list_for_each_safe(pos, n, head) \
>  	for (pos = (head)->next, n = pos->next; \
>  	     !list_is_head(pos, (head)); \
>  	     pos = n, n = pos->next)
>  
> +#define __list_for_each_mutable_internal(pos, tmp, head)		\
> +	for (typeof(pos) tmp = (pos = (head)->next)->next;		\

Use auto

> +	     !list_is_head(pos, (head));				\
> +	     pos = tmp, tmp = pos->next)
> +
> +#define __list_for_each_mutable1(pos, head)				\
> +	__list_for_each_mutable_internal(pos, __UNIQUE_ID(next), head)
> +
> +#define __list_for_each_mutable2(pos, next, head)			\
> +	list_for_each_safe(pos, next, head)
> +
>  /**
> - * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry
> + * list_for_each_mutable - iterate over a list safe against entry removal
>   * @pos:	the &struct list_head to use as a loop cursor.
> - * @n:		another &struct list_head to use as temporary storage
> - * @head:	the head for your list.
> + * @...:	either (head) or (next, head)
> + *
> + * next:	another &struct list_head to use as optional temporary storage.
> + *		The temporary cursor is internal unless explicitly supplied by
> + *		the caller.
> + * head:	the head for your list.
> + */
> +#define list_for_each_mutable(pos, ...)					\
> +	CONCATENATE(__list_for_each_mutable, COUNT_ARGS(__VA_ARGS__))	\
> +		(pos, __VA_ARGS__)

The variable argument count logic really just slows down compilation.
Maybe there aren't enough copies of this code to make that significant.
But just because you can do it doesn't mean it is a good idea.
I'm also not sure it really adds anything to the readability.

And, if you are going to make the middle argument optional there is
no need to change the macro name.

	David



^ permalink raw reply

* Re: [PATCH 1/2] Protect skb pointer used by two different kernel instances
From: Eric Dumazet @ 2026-06-22  8:38 UTC (permalink / raw)
  To: Selvamani.Rajagopal
  Cc: Parthiban Veerasooran, Andrew Lunn, Piergiorgio Beruto,
	David S. Miller, Jakub Kicinski, Paolo Abeni, netdev,
	linux-kernel, Andrew Lunn
In-Reply-To: <20260621-fix-race-condition-and-crash-v1-1-87e290d9357f@onsemi.com>

On Sun, Jun 21, 2026 at 9:33 PM Selvamani Rajagopal via B4 Relay
<devnull+Selvamani.Rajagopal.onsemi.com@kernel.org> wrote:
>
> From: Selvamani Rajagopal <Selvamani.Rajagopal@onsemi.com>
>
> Threaded IRQ uses waiting_tx_skb. Transmit path also uses
> this pointer without any mutual exclusion protection. As a
> result, it might leak an skb buffer, particularly when the threaded IRQ
> runs in the middle of the transmit path, near skb_linearize.
>
> Fixes: b542d13fab0f ("net: ethernet: oa_tc6: Interrupt is active low, level triggered.")
> Signed-off-by: Selvamani Rajagopal <Selvamani.Rajagopal@onsemi.com>
> ---

OK but please use "net: ethernet: oa_tc6:" prefix in the patch title.

^ permalink raw reply

* [PATCH net V3 3/3] net/mlx5e: Fix publication race for priv->channel_stats[]
From: Tariq Toukan @ 2026-06-22  8:36 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	netdev, Paolo Abeni
  Cc: Cosmin Ratiu, Eran Ben Elisha, Feng Liu, Haiyang Zhang,
	Lama Kayal, Leon Romanovsky, linux-kernel, linux-rdma, Mark Bloch,
	Nimrod Oren, Saeed Mahameed, Tariq Toukan, Gal Pressman,
	Alexei Lazar, Simon Horman, Carolina Jubran, Kees Cook,
	Eran Ben Elisha, Saeed Mahameed
In-Reply-To: <20260622083646.593220-1-tariqt@nvidia.com>

From: Feng Liu <feliu@nvidia.com>

mlx5e_channel_stats_alloc() publishes a new entry to
priv->channel_stats[] and then increments priv->stats_nch as a
publication token, but neither store carries any memory barrier:

	priv->channel_stats[ix] = kvzalloc_node(...);
	if (!priv->channel_stats[ix])
		return -ENOMEM;
	priv->stats_nch++;

Concurrent readers compute the loop bound from priv->stats_nch and
then dereference priv->channel_stats[i] using plain accesses, e.g.

	for (i = 0; i < priv->stats_nch; i++) {
		struct mlx5e_channel_stats *cs = priv->channel_stats[i];
		... cs->rq.packets ...
	}

On weakly-ordered architectures (ARM, PowerPC, RISC-V) the writes to
channel_stats[ix] and stats_nch may become visible to other CPUs out
of program order. A reader can observe stats_nch == N while still
seeing channel_stats[N-1] == NULL, leading to a NULL pointer
dereference in the channel_stats loop.

This has been observed in production on BlueField-3 DPUs (arm64),
where ovs-vswitchd queries netdev statistics over netlink during NIC
bringup, racing mlx5e_open_channel() -> mlx5e_channel_stats_alloc()
on another CPU:

  Unable to handle kernel NULL pointer dereference at virtual address 0x840
  Hardware name: BlueField-3 DPU
  pc : mlx5e_fold_sw_stats64+0x30/0x180 [mlx5_core]
  Call trace:
   mlx5e_fold_sw_stats64+0x30/0x180 [mlx5_core]
   dev_get_stats+0x50/0xc0
   ovs_vport_get_stats+0x38/0xac [openvswitch]
   ovs_vport_cmd_fill_info+0x194/0x290 [openvswitch]
   ovs_vport_cmd_get+0xbc/0x10c [openvswitch]
   genl_family_rcv_msg_doit+0xd0/0x160
   genl_rcv_msg+0xec/0x1f0
   netlink_rcv_skb+0x64/0x130
   genl_rcv+0x40/0x60
   netlink_unicast+0x2fc/0x370
   netlink_sendmsg+0x1dc/0x454
   ...
   __arm64_sys_sendmsg+0x2c/0x40

Add mlx5e_stats_nch_write() and mlx5e_stats_nch_read() helpers in en.h
that wrap the smp_store_release()/smp_load_acquire() pair on stats_nch.
The release/acquire pair establishes the contract:

  stats_nch == N  =>  channel_stats[0..N-1] are visible and non-NULL.

Publish the stats_nch increment via mlx5e_stats_nch_write() in the
writer (mlx5e_channel_stats_alloc()), and read stats_nch via
mlx5e_stats_nch_read() in all readers: mlx5e RX/TX queue stats,
mlx5e_get_base_stats(), ethtool channels stats, IPoIB stats, the
sw_stats fold and the HV VHCA stats agent.

Fixes: fa691d0c9c08 ("net/mlx5e: Allocate per-channel stats dynamically at first usage")
Signed-off-by: Feng Liu <feliu@nvidia.com>
Reviewed-by: Eran Ben Elisha <eranbe@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Nimrod Oren <noren@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       | 12 ++++++++++++
 .../ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c | 10 ++++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 14 ++++++++------
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c |  9 +++++----
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c  |  3 ++-
 5 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 2270e2e550dd..d507289096c2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -987,6 +987,18 @@ struct mlx5e_priv {
 	struct ethtool_fec_hist_range *fec_ranges;
 };
 
+static inline u16 mlx5e_stats_nch_read(const struct mlx5e_priv *priv)
+{
+	/* Pairs with smp_store_release in mlx5e_stats_nch_write(). */
+	return smp_load_acquire(&priv->stats_nch);
+}
+
+static inline void mlx5e_stats_nch_write(struct mlx5e_priv *priv, u16 n)
+{
+	/* Pairs with smp_load_acquire in mlx5e_stats_nch_read(). */
+	smp_store_release(&priv->stats_nch, n);
+}
+
 struct mlx5e_dev {
 	struct net_device *netdev;
 	struct devlink_port dl_port;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
index 2e495442a547..9747d7736d37 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
@@ -33,9 +33,10 @@ mlx5e_hv_vhca_fill_ring_stats(struct mlx5e_priv *priv, int ch,
 static void mlx5e_hv_vhca_fill_stats(struct mlx5e_priv *priv, void *data,
 				     int buf_len)
 {
+	u16 nch = mlx5e_stats_nch_read(priv);
 	int ch, i = 0;
 
-	for (ch = 0; ch < priv->stats_nch; ch++) {
+	for (ch = 0; ch < nch; ch++) {
 		void *buf = data + i;
 
 		if (WARN_ON_ONCE(buf +
@@ -50,8 +51,9 @@ static void mlx5e_hv_vhca_fill_stats(struct mlx5e_priv *priv, void *data,
 
 static int mlx5e_hv_vhca_stats_buf_size(struct mlx5e_priv *priv)
 {
-	return (sizeof(struct mlx5e_hv_vhca_per_ring_stats) *
-		priv->stats_nch);
+	u16 nch = mlx5e_stats_nch_read(priv);
+
+	return sizeof(struct mlx5e_hv_vhca_per_ring_stats) * nch;
 }
 
 static int mlx5e_hv_vhca_stats_buf_max_size(struct mlx5e_priv *priv)
@@ -106,7 +108,7 @@ static void mlx5e_hv_vhca_stats_control(struct mlx5_hv_vhca_agent *agent,
 	sagent = &priv->stats_agent;
 
 	block->version = MLX5_HV_VHCA_STATS_VERSION;
-	block->rings   = priv->stats_nch;
+	block->rings   = mlx5e_stats_nch_read(priv);
 
 	if (!block->command) {
 		cancel_delayed_work_sync(&priv->stats_agent.work);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 775f0c6e55c9..aa8610cedaa8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2773,7 +2773,7 @@ static int mlx5e_channel_stats_alloc(struct mlx5e_priv *priv, int ix, int cpu)
 						GFP_KERNEL, cpu_to_node(cpu));
 	if (!priv->channel_stats[ix])
 		return -ENOMEM;
-	priv->stats_nch++;
+	mlx5e_stats_nch_write(priv, priv->stats_nch + 1);
 
 	return 0;
 }
@@ -4040,9 +4040,10 @@ static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type,
 
 void mlx5e_fold_sw_stats64(struct mlx5e_priv *priv, struct rtnl_link_stats64 *s)
 {
+	u16 nch = mlx5e_stats_nch_read(priv);
 	int i;
 
-	for (i = 0; i < priv->stats_nch; i++) {
+	for (i = 0; i < nch; i++) {
 		struct mlx5e_channel_stats *channel_stats = priv->channel_stats[i];
 		struct mlx5e_rq_stats *xskrq_stats = &channel_stats->xskrq;
 		struct mlx5e_rq_stats *rq_stats = &channel_stats->rq;
@@ -5488,7 +5489,7 @@ static void mlx5e_get_queue_stats_rx(struct net_device *dev, int i,
 	struct mlx5e_rq_stats *xskrq_stats;
 	struct mlx5e_rq_stats *rq_stats;
 
-	if (mlx5e_is_uplink_rep(priv) || !priv->stats_nch)
+	if (mlx5e_is_uplink_rep(priv) || !mlx5e_stats_nch_read(priv))
 		return;
 
 	channel_stats = priv->channel_stats[i];
@@ -5512,7 +5513,7 @@ static void mlx5e_get_queue_stats_tx(struct net_device *dev, int i,
 	struct mlx5e_priv *priv = netdev_priv(dev);
 	struct mlx5e_sq_stats *sq_stats;
 
-	if (!priv->stats_nch)
+	if (!mlx5e_stats_nch_read(priv))
 		return;
 
 	/* no special case needed for ptp htb etc since txq2sq_stats is kept up
@@ -5538,6 +5539,7 @@ static void mlx5e_get_base_stats(struct net_device *dev,
 				 struct netdev_queue_stats_tx *tx)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
+	u16 nch = mlx5e_stats_nch_read(priv);
 	struct mlx5e_ptp *ptp_channel;
 	int i, tc;
 
@@ -5549,7 +5551,7 @@ static void mlx5e_get_base_stats(struct net_device *dev,
 		rx->hw_gro_wire_packets = 0;
 		rx->hw_gro_wire_bytes = 0;
 
-		for (i = priv->channels.params.num_channels; i < priv->stats_nch; i++) {
+		for (i = priv->channels.params.num_channels; i < nch; i++) {
 			struct netdev_queue_stats_rx rx_i = {0};
 
 			mlx5e_get_queue_stats_rx(dev, i, &rx_i);
@@ -5585,7 +5587,7 @@ static void mlx5e_get_base_stats(struct net_device *dev,
 	tx->stop = 0;
 	tx->wake = 0;
 
-	for (i = 0; i < priv->stats_nch; i++) {
+	for (i = 0; i < nch; i++) {
 		struct mlx5e_channel_stats *channel_stats = priv->channel_stats[i];
 
 		/* handle two cases:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index 7f33261ba655..de38b60806c2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -515,6 +515,7 @@ static void mlx5e_stats_update_stats_rq_page_pool(struct mlx5e_channel *c)
 static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(sw)
 {
 	struct mlx5e_sw_stats *s = &priv->stats.sw;
+	u16 nch = mlx5e_stats_nch_read(priv);
 	int i;
 
 	memset(s, 0, sizeof(*s));
@@ -522,7 +523,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(sw)
 	for (i = 0; i < priv->channels.num; i++) /* for active channels only */
 		mlx5e_stats_update_stats_rq_page_pool(priv->channels.c[i]);
 
-	for (i = 0; i < priv->stats_nch; i++) {
+	for (i = 0; i < nch; i++) {
 		struct mlx5e_channel_stats *channel_stats =
 			priv->channel_stats[i];
 
@@ -2614,7 +2615,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(ptp) { return; }
 
 static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(channels)
 {
-	int max_nch = priv->stats_nch;
+	int max_nch = mlx5e_stats_nch_read(priv);
 
 	return (NUM_RQ_STATS * max_nch) +
 	       (NUM_CH_STATS * max_nch) +
@@ -2627,8 +2628,8 @@ static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(channels)
 
 static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(channels)
 {
+	int max_nch = mlx5e_stats_nch_read(priv);
 	bool is_xsk = priv->xsk.ever_used;
-	int max_nch = priv->stats_nch;
 	int i, j, tc;
 
 	for (i = 0; i < max_nch; i++)
@@ -2660,8 +2661,8 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(channels)
 
 static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(channels)
 {
+	int max_nch = mlx5e_stats_nch_read(priv);
 	bool is_xsk = priv->xsk.ever_used;
-	int max_nch = priv->stats_nch;
 	int i, j, tc;
 
 	for (i = 0; i < max_nch; i++)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
index 0a6003fe60e9..674bed721e63 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
@@ -135,10 +135,11 @@ void mlx5i_cleanup(struct mlx5e_priv *priv)
 
 static void mlx5i_grp_sw_update_stats(struct mlx5e_priv *priv)
 {
+	u16 nch = mlx5e_stats_nch_read(priv);
 	struct rtnl_link_stats64 s = {};
 	int i, j;
 
-	for (i = 0; i < priv->stats_nch; i++) {
+	for (i = 0; i < nch; i++) {
 		struct mlx5e_channel_stats *channel_stats;
 		struct mlx5e_rq_stats *rq_stats;
 
-- 
2.44.0


^ permalink raw reply related

* [PATCH net V3 2/3] net/mlx5e: Fix HV VHCA stats agent registration race
From: Tariq Toukan @ 2026-06-22  8:36 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	netdev, Paolo Abeni
  Cc: Cosmin Ratiu, Eran Ben Elisha, Feng Liu, Haiyang Zhang,
	Lama Kayal, Leon Romanovsky, linux-kernel, linux-rdma, Mark Bloch,
	Nimrod Oren, Saeed Mahameed, Tariq Toukan, Gal Pressman,
	Alexei Lazar, Simon Horman, Carolina Jubran, Kees Cook,
	Eran Ben Elisha, Saeed Mahameed
In-Reply-To: <20260622083646.593220-1-tariqt@nvidia.com>

From: Feng Liu <feliu@nvidia.com>

mlx5e_hv_vhca_stats_create() registers the stats agent through
mlx5_hv_vhca_agent_create(). The helper publishes the agent in
hv_vhca->agents[type] under agents_lock and immediately schedules an
asynchronous control invalidation on the HV VHCA workqueue before
returning to mlx5e.

The asynchronous invalidation invokes the control agent's invalidate
callback, which reads the hypervisor control block and forwards the
command to mlx5e_hv_vhca_stats_control(). That callback may either:

  - call cancel_delayed_work_sync(&priv->stats_agent.work), or
  - call queue_delayed_work(priv->wq, &sagent->work, sagent->delay).

However, the delayed_work and priv->stats_agent.agent are only
initialized after mlx5_hv_vhca_agent_create() returns to mlx5e:

    agent = mlx5_hv_vhca_agent_create(...);   /* publish + invalidate */
    ...
    priv->stats_agent.agent = agent;          /* too late */
    INIT_DELAYED_WORK(&priv->stats_agent.work, ...); /* too late */

If the asynchronous control path runs before the two assignments
above, it can:

  - Operate on an uninitialized delayed_work whose timer.function is
    NULL. queue_delayed_work() calls add_timer() unconditionally, so
    when the timer expires the timer softirq invokes a NULL function
    pointer.
  - Re-initialize the timer later through INIT_DELAYED_WORK() while
    the timer is already enqueued in the timer wheel, corrupting the
    hlist (entry.pprev cleared while the previous bucket node still
    points at this entry).
  - When the worker eventually runs, mlx5e_hv_vhca_stats_work() reads
    sagent->agent (NULL) and dereferences it inside
    mlx5_hv_vhca_agent_write().

Fix this by:

  - Initializing priv->stats_agent.work before invoking
    mlx5_hv_vhca_agent_create(), so the work is always in a valid
    state when the control callback observes it.
  - Adding a struct mlx5_hv_vhca_agent **ctx_update out-parameter
    to mlx5_hv_vhca_agent_create(). The helper writes the agent
    pointer to *ctx_update before publishing into hv_vhca->agents[]
    and triggering the agents_update flow, so any callback
    subsequently invoked from that flow already sees a valid
    priv->stats_agent.agent. This avoids having the control
    callback participate in agent initialization.

While at it, clear priv->stats_agent.{agent,buf} after teardown and
on the agent_create() failure path. Without this, an enable/disable
cycle hitting an early-return in create can lead to a UAF or
double-destroy of stale pointers from the previous cycle.

Fixes: cef35af34d6d ("net/mlx5e: Add mlx5e HV VHCA stats agent")
Signed-off-by: Feng Liu <feliu@nvidia.com>
Reviewed-by: Eran Ben Elisha <eranbe@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 .../mellanox/mlx5/core/en/hv_vhca_stats.c     | 22 ++++++++++++-------
 .../ethernet/mellanox/mlx5/core/lib/hv_vhca.c |  8 +++++--
 .../ethernet/mellanox/mlx5/core/lib/hv_vhca.h |  6 +++--
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
index 06cbd49d4e98..2e495442a547 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
@@ -73,7 +73,7 @@ static void mlx5e_hv_vhca_stats_work(struct work_struct *work)
 	sagent = container_of(dwork, struct mlx5e_hv_vhca_stats_agent, work);
 	priv = container_of(sagent, struct mlx5e_priv, stats_agent);
 	buf_len = mlx5e_hv_vhca_stats_buf_size(priv);
-	agent = sagent->agent;
+	agent = READ_ONCE(sagent->agent);
 	buf = sagent->buf;
 
 	memset(buf, 0, buf_len);
@@ -135,11 +135,14 @@ void mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv)
 	if (!priv->stats_agent.buf)
 		return;
 
+	INIT_DELAYED_WORK(&priv->stats_agent.work, mlx5e_hv_vhca_stats_work);
+
 	agent = mlx5_hv_vhca_agent_create(priv->mdev->hv_vhca,
 					  MLX5_HV_VHCA_AGENT_STATS,
 					  mlx5e_hv_vhca_stats_control, NULL,
 					  mlx5e_hv_vhca_stats_cleanup,
-					  priv);
+					  priv,
+					  &priv->stats_agent.agent);
 
 	if (IS_ERR_OR_NULL(agent)) {
 		if (IS_ERR(agent))
@@ -148,18 +151,21 @@ void mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv)
 				    agent);
 
 		kvfree(priv->stats_agent.buf);
-		return;
+		priv->stats_agent.buf = NULL;
 	}
-
-	priv->stats_agent.agent = agent;
-	INIT_DELAYED_WORK(&priv->stats_agent.work, mlx5e_hv_vhca_stats_work);
 }
 
 void mlx5e_hv_vhca_stats_destroy(struct mlx5e_priv *priv)
 {
-	if (IS_ERR_OR_NULL(priv->stats_agent.agent))
+	struct mlx5_hv_vhca_agent *agent;
+
+	agent = READ_ONCE(priv->stats_agent.agent);
+	if (IS_ERR_OR_NULL(agent))
 		return;
 
-	mlx5_hv_vhca_agent_destroy(priv->stats_agent.agent);
+	mlx5_hv_vhca_agent_destroy(agent);
 	kvfree(priv->stats_agent.buf);
+
+	WRITE_ONCE(priv->stats_agent.agent, NULL);
+	priv->stats_agent.buf = NULL;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
index d6dc7bce855e..305752dab7bd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.c
@@ -190,7 +190,7 @@ mlx5_hv_vhca_control_agent_create(struct mlx5_hv_vhca *hv_vhca)
 	return mlx5_hv_vhca_agent_create(hv_vhca, MLX5_HV_VHCA_AGENT_CONTROL,
 					 NULL,
 					 mlx5_hv_vhca_control_agent_invalidate,
-					 NULL, NULL);
+					 NULL, NULL, NULL);
 }
 
 static void mlx5_hv_vhca_control_agent_destroy(struct mlx5_hv_vhca_agent *agent)
@@ -256,7 +256,8 @@ mlx5_hv_vhca_agent_create(struct mlx5_hv_vhca *hv_vhca,
 			  void (*invalidate)(struct mlx5_hv_vhca_agent*,
 					     u64 block_mask),
 			  void (*cleaup)(struct mlx5_hv_vhca_agent *agent),
-			  void *priv)
+			  void *priv,
+			  struct mlx5_hv_vhca_agent **ctx_update)
 {
 	struct mlx5_hv_vhca_agent *agent;
 
@@ -284,6 +285,9 @@ mlx5_hv_vhca_agent_create(struct mlx5_hv_vhca *hv_vhca,
 	agent->invalidate = invalidate;
 	agent->cleanup   = cleaup;
 
+	if (ctx_update)
+		WRITE_ONCE(*ctx_update, agent);
+
 	mutex_lock(&hv_vhca->agents_lock);
 	hv_vhca->agents[type] = agent;
 	mutex_unlock(&hv_vhca->agents_lock);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
index f240ffe5116c..8b3974cf0ee4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h
@@ -43,7 +43,8 @@ mlx5_hv_vhca_agent_create(struct mlx5_hv_vhca *hv_vhca,
 			  void (*invalidate)(struct mlx5_hv_vhca_agent*,
 					     u64 block_mask),
 			  void (*cleanup)(struct mlx5_hv_vhca_agent *agent),
-			  void *context);
+			  void *context,
+			  struct mlx5_hv_vhca_agent **ctx_update);
 
 void mlx5_hv_vhca_agent_destroy(struct mlx5_hv_vhca_agent *agent);
 int mlx5_hv_vhca_agent_write(struct mlx5_hv_vhca_agent *agent,
@@ -84,7 +85,8 @@ mlx5_hv_vhca_agent_create(struct mlx5_hv_vhca *hv_vhca,
 			  void (*invalidate)(struct mlx5_hv_vhca_agent*,
 					     u64 block_mask),
 			  void (*cleanup)(struct mlx5_hv_vhca_agent *agent),
-			  void *context)
+			  void *context,
+			  struct mlx5_hv_vhca_agent **ctx_update)
 {
 	return NULL;
 }
-- 
2.44.0


^ permalink raw reply related

* Re: [PATCH v3 0/7] Prepare mutable list iterators to cache cursor state
From: Jani Nikula @ 2026-06-22  8:37 UTC (permalink / raw)
  To: Kaitao Cheng, Andrew Morton, David Hildenbrand, Jens Axboe,
	Tejun Heo, Alexander Viro, Christian Brauner, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Johannes Weiner, Peter Zijlstra,
	Ingo Molnar, Arnaldo Carvalho de Melo, Namhyung Kim,
	Thomas Gleixner, Juri Lelli, Vincent Guittot, Paul Moore,
	Andy Shevchenko, Paul E. McKenney, Shakeel Butt,
	Christian König
  Cc: David Howells, Simona Vetter, Randy Dunlap, Luca Ceresoli,
	Philipp Stanner, linux-block, linux-kernel, cgroups,
	linux-ntfs-dev, linux-fsdevel, io-uring, audit, bpf, netdev,
	dri-devel, linux-perf-users, linux-trace-kernel, kexec,
	live-patching, linux-modules, linux-crypto, linux-pm, rcu,
	sched-ext, linux-mm, virtualization, damon, llvm, chengkaitao
In-Reply-To: <20260622040533.29824-1-kaitao.cheng@linux.dev>

On Mon, 22 Jun 2026, Kaitao Cheng <kaitao.cheng@linux.dev> wrote:
> Add *_mutable() iterator variants for list, hlist and llist.  The new
> helpers are variadic and support both forms.  In the common case, the
> caller omits the temporary cursor and the macro creates a unique internal
> cursor with typeof(pos) and __UNIQUE_ID().  If a loop really needs an
> explicit temporary cursor, the caller can still pass it and the helper
> keeps the existing *_safe() behaviour.
>
> For example, a call site may use the shorter form:
>
>   list_for_each_entry_mutable(pos, head, member)
>
> or keep the explicit temporary cursor form:
>
>   list_for_each_entry_mutable(pos, tmp, head, member)

I'm unconvinced it's a good idea to allow two forms with macro trickery,
*especially* when it's not the last argument you can omit. I think it's
a footgun.

IMO stick with the first form only, and there'll always be the _safe
variant that can be used when the temp pointer is needed.


BR,
Jani.


-- 
Jani Nikula, Intel

^ permalink raw reply

* [PATCH net V3 1/3] net/mlx5e: Fix HV VHCA stats zero-sized buffer allocation
From: Tariq Toukan @ 2026-06-22  8:36 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	netdev, Paolo Abeni
  Cc: Cosmin Ratiu, Eran Ben Elisha, Feng Liu, Haiyang Zhang,
	Lama Kayal, Leon Romanovsky, linux-kernel, linux-rdma, Mark Bloch,
	Nimrod Oren, Saeed Mahameed, Tariq Toukan, Gal Pressman,
	Alexei Lazar, Simon Horman, Carolina Jubran, Kees Cook,
	Eran Ben Elisha, Saeed Mahameed
In-Reply-To: <20260622083646.593220-1-tariqt@nvidia.com>

From: Feng Liu <feliu@nvidia.com>

mlx5e_hv_vhca_stats_create() is called from mlx5e_nic_enable(),
before mlx5e_open(). At that point priv->stats_nch is still zero,
because it is only ever incremented in mlx5e_channel_stats_alloc(),
which is reached only from mlx5e_open_channel().

mlx5e_hv_vhca_stats_buf_size() therefore returns 0, and
kvzalloc(0, GFP_KERNEL) returns ZERO_SIZE_PTR ((void *)16) rather
than NULL. The "if (!buf)" guard does not catch this, and
mlx5e_hv_vhca_stats_create() completes "successfully" with
priv->stats_agent.buf set to ZERO_SIZE_PTR.

Once channels are opened (priv->stats_nch > 0) and the hypervisor
enables stats reporting, mlx5e_hv_vhca_stats_work() recomputes
buf_len using the new non-zero stats_nch and calls
memset(buf, 0, buf_len) on ZERO_SIZE_PTR, faulting at address 0x10.

Allocate the buffer based on priv->max_nch, which is set in
mlx5e_priv_init() and is the upper bound on stats_nch:

  - Add a separate helper mlx5e_hv_vhca_stats_buf_max_size() that
    returns sizeof(per_ring_stats) * max(max_nch, stats_nch), and
    use it for the kvzalloc() in mlx5e_hv_vhca_stats_create().
  - Keep mlx5e_hv_vhca_stats_buf_size() (which returns based on
    stats_nch) for the worker's active payload size, so the wire
    format (block->rings = stats_nch) and the amount of data filled
    by mlx5e_hv_vhca_fill_stats() are unchanged.

The max(max_nch, stats_nch) guard handles the rare case where
mlx5e_attach_netdev() recomputes max_nch downward across a
detach/resume cycle while priv->stats_nch persists (mlx5e_detach_netdev
does not call mlx5e_priv_cleanup, so stats_nch is only reset when
the netdev is destroyed). Without the guard, the worker could compute
buf_len from stats_nch and overrun the smaller buffer allocated based
on the reduced max_nch.

This mirrors the existing mlx5e pattern of preallocating arrays of
size max_nch (e.g. priv->channel_stats) and lazily populating
entries up to stats_nch on demand.

Fixes: fa691d0c9c08 ("net/mlx5e: Allocate per-channel stats dynamically at first usage")
Signed-off-by: Feng Liu <feliu@nvidia.com>
Reviewed-by: Eran Ben Elisha <eranbe@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 .../net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c    | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
index 195863b2c013..06cbd49d4e98 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/hv_vhca_stats.c
@@ -54,6 +54,12 @@ static int mlx5e_hv_vhca_stats_buf_size(struct mlx5e_priv *priv)
 		priv->stats_nch);
 }

+static int mlx5e_hv_vhca_stats_buf_max_size(struct mlx5e_priv *priv)
+{
+	return (sizeof(struct mlx5e_hv_vhca_per_ring_stats) *
+		max(priv->max_nch, priv->stats_nch));
+}
+
 static void mlx5e_hv_vhca_stats_work(struct work_struct *work)
 {
 	struct mlx5e_hv_vhca_stats_agent *sagent;
@@ -122,7 +128,7 @@ static void mlx5e_hv_vhca_stats_cleanup(struct mlx5_hv_vhca_agent *agent)

 void mlx5e_hv_vhca_stats_create(struct mlx5e_priv *priv)
 {
-	int buf_len = mlx5e_hv_vhca_stats_buf_size(priv);
+	int buf_len = mlx5e_hv_vhca_stats_buf_max_size(priv);
 	struct mlx5_hv_vhca_agent *agent;

 	priv->stats_agent.buf = kvzalloc(buf_len, GFP_KERNEL);
-- 
2.44.0

^ permalink raw reply related

* [PATCH net V3 0/3] net/mlx5e: Fix crashes in dynamic per-channel stats and HV VHCA agent
From: Tariq Toukan @ 2026-06-22  8:36 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	netdev, Paolo Abeni
  Cc: Cosmin Ratiu, Eran Ben Elisha, Feng Liu, Haiyang Zhang,
	Lama Kayal, Leon Romanovsky, linux-kernel, linux-rdma, Mark Bloch,
	Nimrod Oren, Saeed Mahameed, Tariq Toukan, Gal Pressman,
	Alexei Lazar, Simon Horman, Carolina Jubran, Kees Cook,
	Eran Ben Elisha, Saeed Mahameed

Hi,

Since per-channel stats were converted to be allocated and published
lazily at first channel open in commit fa691d0c9c08 ("net/mlx5e:
Allocate per-channel stats dynamically at first usage"),
priv->channel_stats[] and priv->stats_nch are filled in
incrementally during interface bring-up. This opened a window in
which the various stats readers - most of them reachable from
userspace via netlink/netdev stats queries - can race with
mlx5e_open_channel() on another CPU and observe partially
initialized state. The HV VHCA stats agent, which is created
before the channels are opened, hits related problems of its own.

This series by Feng fixes the resulting crashes.

Regards,
Tariq

V3:
- Rebase on current net.

V2:
https://lore.kernel.org/all/20260617140127.573117-1-tariqt@nvidia.com/

Feng Liu (3):
  net/mlx5e: Fix HV VHCA stats zero-sized buffer allocation
  net/mlx5e: Fix HV VHCA stats agent registration race
  net/mlx5e: Fix publication race for priv->channel_stats[]

 drivers/net/ethernet/mellanox/mlx5/core/en.h  | 12 ++++++
 .../mellanox/mlx5/core/en/hv_vhca_stats.c     | 38 +++++++++++++------
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 14 ++++---
 .../ethernet/mellanox/mlx5/core/en_stats.c    |  9 +++--
 .../ethernet/mellanox/mlx5/core/ipoib/ipoib.c |  3 +-
 .../ethernet/mellanox/mlx5/core/lib/hv_vhca.c |  8 +++-
 .../ethernet/mellanox/mlx5/core/lib/hv_vhca.h |  6 ++-
 7 files changed, 63 insertions(+), 27 deletions(-)

base-commit: d07d80b6a129a44538cda1549b7acf95154fb197
-- 
2.44.0

^ permalink raw reply

* Re: [RFC net-next 08/15] ipxlat: add translation engine and dispatch core
From: Beniamino Galvani @ 2026-06-22  8:32 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: Ralf Lici, netdev, Daniel Gröber, Antonio Quartulli,
	Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, linux-kernel
In-Reply-To: <87a4tab1vs.fsf@toke.dk>

On Thu, Jun 04, 2026 at 08:23:51PM +0200, Toke Høiland-Jørgensen wrote:
> Ralf Lici <ralf@mandelbit.com> writes:
> 
> > This commit introduces the core start_xmit processing flow: validate,
> > select action, translate, and forward. It centralizes action resolution
> > in the dispatch layer and keeps per-direction translation logic separate
> > from device glue. The result is a single data-path entry point with
> > explicit control over drop/forward/emit behavior.
> >
> > Signed-off-by: Ralf Lici <ralf@mandelbit.com>
> 
> This is very cool! Going quickly through the series, this seems like
> thorough work that will be cool to have available in the kernel, so
> thanks for doing this! I'll be quite happy to retire my barebones
> BPF-based implementation once this lands :)

Hi,

speaking as a maintainer of NetworkManager, I would also like to see
this feature in the kernel!

In NetworkManager currently we are using a BPF program [1] to
implement the CLAT, but that approach comes with limitations: for
example, we can't fragment v4->v6 packets if needed, and it's not
possible to recompute checksums in certain cases (e.g. for v4->v6 UDP
packets with zero checksum, and for fragmented ICMP). systemd-networkd
is also adding CLAT support via BPF [2], with a fallback to userspace
for the cases that can't be handled in kernel.

It would be very useful to have a native in-kernel CLAT that solves
the limitations of BPF-based solutions, and can be used by different
tools without having to re-implement everything from scratch.

Beniamino

[1] https://gitlab.freedesktop.org/NetworkManager/NetworkManager/-/blob/1.57.2-dev/src/core/bpf/clat.bpf.c
[2] https://github.com/systemd/systemd/pull/41412

^ permalink raw reply

* Re: [PATCH 1/2] fs: Add bpf_sock_read_xattr() kfunc to read socket xattrs
From: Christian Brauner @ 2026-06-22  8:21 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Christian Brauner, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Alexei Starovoitov, Daniel Borkmann, Alexander Viro,
	Jan Kara, Simon Horman, Kuniyuki Iwashima, Willem de Bruijn,
	linux-fsdevel, netdev, bpf, Andrii Nakryiko, Martin KaFai Lau,
	Eduard Zingerman, Kumar Kartikeya Dwivedi, Song Liu,
	Yonghong Song, Jiri Olsa
In-Reply-To: <DJDJX62AS415.2BVILN08QK149@gmail.com>

> lgtm.
> How do you want to route it? Through the vfs tree for the next merge window?

Yes, thank you for looking!


^ permalink raw reply

* Re: [PATCH net,v2 00/14] Netfilter fixes for net
From: Pablo Neira Ayuso @ 2026-06-22  8:16 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
In-Reply-To: <20260620222738.112506-1-pablo@netfilter.org>

Hi,

Sashiko reports two issues, one in:

- netfilter: flowtable: fix offloaded ct timeout never being extended
  which is real for net/sched/act_ct.c, this was a preexisting issue,
  we can follow up on it.

- netfilter: nf_conntrack_expect: use conntrack GC to reap expectations
  I already planned to follow up on this and a few more subtle issues
  (including one related patch I have withdrawn because it is
   incomplete).

Please apply, thanks.

On Sun, Jun 21, 2026 at 12:27:24AM +0200, Pablo Neira Ayuso wrote:
> This is v2, dropping two patches that need a bit more work,
> uncovered by sashiko. I have revisited the wording of this cover
> letter to refine it.
> 
> -o-
> 
> Hi,
>  
> The following patchset contains Netfilter fixes for net. This batches
> fixes for real crashes with trivial/correctness fixes. There is too
> a rework of the conntrack expectation timeout strategy to deal with
> a possible race when removing an expectation.
>  
> 1) Fix the incorrect flowtable timeout extension for entries in
>    hw offload, from Adrian Bente. This is correcting a defect in
>    the functionality, no crash.
>  
> 2) Hold reference to device under the fake dst in br_netfilter,
>    from Haoze Xie. This is fixing a possible UaF if the device
>    is removed while packet is sitting in nfqueue.
>  
> 3) Reject template conntrack in xt_cluster, otherwise access to
>    uninitialized conntrack fields is possible, leading to WARN_ON
>    due to unset layer 3 protocol. From Wyatt Feng.
>  
> 4) Make sure the IPv6 tunnel header is in the linear skb data
>    area before pulling. While at it remove incomplete NEXTHDR_DEST
>    support. From Lorenzo Bianconi. This possibly leads to a crash
>    if IPv4 header is not in the linear area.
>  
> 5) Use test_bit_acquire in ipset hash set to avoid reordering
>    of subsequent memory access. This is addressing a LLM related
>    report, no crash has been observed. From Jozsef Kadlecsik.
>  
> 6) Use test_bit_acquire in ipset bitmap set too, for the same
>    reason as in the previous patch, from Jozsef Kadlecsik.
>  
> 7) Call kfree_rcu() after rcu_assign_pointer() to address a
>    possible UaF if kfree_rcu() runs immediately, which to my
>    understanding never happens. Never observed in practice,
>    reported by LLM. Also from Jozsef Kadlecsik.
> 
> 8) Use disable_delayed_work_sync() instead of cancel_delayed_work_sync()
>    to avoid that ipset GC handler re-queues work as reported by LLM.
>    From Jozsef Kadlecsik. This is for correctness.
>  
> 9) Restore the check in nft_payload for exceeding payload offset
>     over 2^16. From Florian Westphal. This fixes a silent truncation,
>     not a big deal, but better be assertive and reject it.
>  
> 10) Validate NFT_META_BRI_IIFHWADDR can only run from bridge
>     prerouting. From Florian Westphal. Harmless but it could allow
>     to read bytes from skb->cb.
>  
> 11) Zero out destination hardware address during the flowtable
>     path setup, also from Florian. This is a correctness fix, LLM
>     points out that a possible infoleak can happen, but the topology to achieve
>     it is not clear.
> 
> 12) Skip IPv4 options if present when building the IPV4 reject reply.
>     Otherwise bytes in the IPv4 options header can be sent back to
>     origin where the ICMP header is being expected. Again from
>     Florian Westphal.
>  
> 13) Replace timer API for expectation by GC worker approach. This
>     is implicitly fixing a race between nf_ct_remove_expectations()
>     which might fail to remove the expectation due to timer_del()
>     returning false because timer has expired and callback is
>     being run concurrently. This fix is addressing a crash that has
>     been already reported with a reproducer.
> 
> 14) Check if br_vlan_get_pvid_rcu() fails, otherwise possible stack
>     infoleak of 4-bytes. From Florian Westphal.
> 
> Please, pull these changes from:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf.git nf-26-06-21
> 
> Thanks.
> 
> ----------------------------------------------------------------
> 
> The following changes since commit 96e7f9122aae0ed000ee321f324b812a447906d9:
> 
>   eth: fbnic: take netif_addr_lock_bh() around rx mode address programming (2026-06-18 18:36:26 -0700)
> 
> are available in the Git repository at:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf.git tags/nf-26-06-21
> 
> for you to fetch changes up to 27dd2997746d54ebc079bb13161cc1bdd401d4a6:
> 
>   netfilter: nft_meta_bridge: fix NFT_META_BRI_IIFPVID stack leak (2026-06-21 00:18:37 +0200)
> 
> ----------------------------------------------------------------
> netfilter pull request 26-06-21
> 
> ----------------------------------------------------------------
> Adrian Bente (1):
>       netfilter: flowtable: fix offloaded ct timeout never being extended
> 
> Florian Westphal (5):
>       netfilter: nft_payload: reject offsets exceeding 65535 bytes
>       netfilter: nft_meta_bridge: add validate callback for get operations
>       netfilter: nft_flow_offload: zero device address for non-ether case
>       netfilter: nf_reject: skip iphdr options when looking for icmp header
>       netfilter: nft_meta_bridge: fix NFT_META_BRI_IIFPVID stack leak
> 
> Haoze Xie (1):
>       netfilter: nf_queue: pin bridge device while NFQUEUE holds fake dst
> 
> Jozsef Kadlecsik (4):
>       netfilter: ipset: Don't use test_bit() in lockless RCU readers in hash types
>       netfilter: ipset: Don't use test_bit() in lockless RCU readers in bitmap types
>       netfilter: ipset: fix order of kfree_rcu() and rcu_assign_pointer()
>       netfilter: ipset: make sure gc is properly stopped
> 
> Lorenzo Bianconi (1):
>       netfilter: flowtable: fix and simplify IP6IP6 tunnel handling
> 
> Pablo Neira Ayuso (1):
>       netfilter: nf_conntrack_expect: use conntrack GC to reap expectations
> 
> Wyatt Feng (1):
>       netfilter: xt_cluster: reject template conntracks in hash match
> 
>  include/net/netfilter/nf_conntrack_expect.h        |  16 ++-
>  include/net/netfilter/nf_queue.h                   |   1 +
>  include/net/netfilter/nft_meta.h                   |   2 +
>  include/uapi/linux/netfilter/nf_conntrack_common.h |   1 +
>  net/bridge/netfilter/nft_meta_bridge.c             |  23 +++-
>  net/ipv4/netfilter/nf_reject_ipv4.c                |   2 +-
>  net/ipv6/ip6_tunnel.c                              |   7 +
>  net/netfilter/ipset/ip_set_bitmap_gen.h            |   4 +-
>  net/netfilter/ipset/ip_set_bitmap_ip.c             |   2 +-
>  net/netfilter/ipset/ip_set_bitmap_ipmac.c          |   2 +-
>  net/netfilter/ipset/ip_set_bitmap_port.c           |   2 +-
>  net/netfilter/ipset/ip_set_core.c                  |   4 +-
>  net/netfilter/ipset/ip_set_hash_gen.h              |  12 +-
>  net/netfilter/nf_conntrack_core.c                  |  33 ++++-
>  net/netfilter/nf_conntrack_expect.c                | 145 ++++++++++-----------
>  net/netfilter/nf_conntrack_h323_main.c             |   4 +-
>  net/netfilter/nf_conntrack_helper.c                |  10 +-
>  net/netfilter/nf_conntrack_netlink.c               |  22 ++--
>  net/netfilter/nf_conntrack_sip.c                   |  13 +-
>  net/netfilter/nf_flow_table_core.c                 |  13 +-
>  net/netfilter/nf_flow_table_ip.c                   |  80 +++---------
>  net/netfilter/nf_flow_table_path.c                 |   4 +-
>  net/netfilter/nf_queue.c                           |  14 ++
>  net/netfilter/nfnetlink_queue.c                    |   3 +
>  net/netfilter/nft_ct.c                             |   3 +-
>  net/netfilter/nft_meta.c                           |   5 +-
>  net/netfilter/nft_payload.c                        |  16 ++-
>  net/netfilter/xt_cluster.c                         |   2 +-
>  .../selftests/net/netfilter/nft_flowtable.sh       |   8 +-
>  29 files changed, 254 insertions(+), 199 deletions(-)
> 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox