Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v1 07/11] drm/xe/ras: Introduce correctable error handling
From: Raag Jadav @ 2026-04-17 21:16 UTC (permalink / raw)
  To: intel-xe, dri-devel, netdev
  Cc: simona.vetter, airlied, kuba, lijo.lazar, Hawking.Zhang, davem,
	pabeni, edumazet, maarten, zachary.mckevitt, rodrigo.vivi,
	riana.tauro, michal.wajdeczko, matthew.d.roper,
	umesh.nerlige.ramappa, mallesh.koujalagi, soham.purkait,
	anoop.c.vijay, aravind.iddamsetty, Raag Jadav
In-Reply-To: <20260417211730.837345-1-raag.jadav@intel.com>

Add initial support for correctable error handling, which is serviced
using a system controller event. Currently we only log the errors in
dmesg, but this serves as a foundation for the RAS infrastructure and
will be further extended to facilitate other RAS features.

Signed-off-by: Raag Jadav <raag.jadav@intel.com>
Reviewed-by: Mallesh Koujalagi <mallesh.koujalagi@intel.com>
---
 drivers/gpu/drm/xe/Makefile           |  1 +
 drivers/gpu/drm/xe/xe_ras.c           | 92 +++++++++++++++++++++++++++
 drivers/gpu/drm/xe/xe_ras.h           | 15 +++++
 drivers/gpu/drm/xe/xe_ras_types.h     | 73 +++++++++++++++++++++
 drivers/gpu/drm/xe/xe_sysctrl_event.c |  3 +-
 5 files changed, 183 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/xe/xe_ras.c
 create mode 100644 drivers/gpu/drm/xe/xe_ras.h
 create mode 100644 drivers/gpu/drm/xe/xe_ras_types.h

diff --git a/drivers/gpu/drm/xe/Makefile b/drivers/gpu/drm/xe/Makefile
index 1c863b711ae9..22f17bd1082d 100644
--- a/drivers/gpu/drm/xe/Makefile
+++ b/drivers/gpu/drm/xe/Makefile
@@ -114,6 +114,7 @@ xe-y += xe_bb.o \
 	xe_pxp_submit.o \
 	xe_query.o \
 	xe_range_fence.o \
+	xe_ras.o \
 	xe_reg_sr.o \
 	xe_reg_whitelist.o \
 	xe_ring_ops.o \
diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c
new file mode 100644
index 000000000000..08e91348c459
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_ras.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2026 Intel Corporation
+ */
+
+#include "xe_printk.h"
+#include "xe_ras.h"
+#include "xe_ras_types.h"
+#include "xe_sysctrl.h"
+#include "xe_sysctrl_event_types.h"
+
+/* Severity of detected errors  */
+enum xe_ras_severity {
+	XE_RAS_SEV_NOT_SUPPORTED = 0,
+	XE_RAS_SEV_CORRECTABLE,
+	XE_RAS_SEV_UNCORRECTABLE,
+	XE_RAS_SEV_INFORMATIONAL,
+	XE_RAS_SEV_MAX
+};
+
+/* Major IP blocks/components where errors can originate */
+enum xe_ras_component {
+	XE_RAS_COMP_NOT_SUPPORTED = 0,
+	XE_RAS_COMP_DEVICE_MEMORY,
+	XE_RAS_COMP_CORE_COMPUTE,
+	XE_RAS_COMP_RESERVED,
+	XE_RAS_COMP_PCIE,
+	XE_RAS_COMP_FABRIC,
+	XE_RAS_COMP_SOC_INTERNAL,
+	XE_RAS_COMP_MAX
+};
+
+static const char *const xe_ras_severities[] = {
+	[XE_RAS_SEV_NOT_SUPPORTED]		= "Not Supported",
+	[XE_RAS_SEV_CORRECTABLE]		= "Correctable Error",
+	[XE_RAS_SEV_UNCORRECTABLE]		= "Uncorrectable Error",
+	[XE_RAS_SEV_INFORMATIONAL]		= "Informational Error",
+};
+static_assert(ARRAY_SIZE(xe_ras_severities) == XE_RAS_SEV_MAX);
+
+static const char *const xe_ras_components[] = {
+	[XE_RAS_COMP_NOT_SUPPORTED]		= "Not Supported",
+	[XE_RAS_COMP_DEVICE_MEMORY]		= "Device Memory",
+	[XE_RAS_COMP_CORE_COMPUTE]		= "Core Compute",
+	[XE_RAS_COMP_RESERVED]			= "Reserved",
+	[XE_RAS_COMP_PCIE]			= "PCIe",
+	[XE_RAS_COMP_FABRIC]			= "Fabric",
+	[XE_RAS_COMP_SOC_INTERNAL]		= "SoC Internal",
+};
+static_assert(ARRAY_SIZE(xe_ras_components) == XE_RAS_COMP_MAX);
+
+static inline const char *sev_to_str(u8 sev)
+{
+	if (sev >= XE_RAS_SEV_MAX)
+		sev = XE_RAS_SEV_NOT_SUPPORTED;
+
+	return xe_ras_severities[sev];
+}
+
+static inline const char *comp_to_str(u8 comp)
+{
+	if (comp >= XE_RAS_COMP_MAX)
+		comp = XE_RAS_COMP_NOT_SUPPORTED;
+
+	return xe_ras_components[comp];
+}
+
+void xe_ras_counter_threshold_crossed(struct xe_device *xe,
+				      struct xe_sysctrl_event_response *response)
+{
+	struct xe_ras_threshold_crossed *pending = (void *)&response->data;
+	struct xe_ras_error_class *errors = pending->counters;
+	u32 counter_id, ncounters = pending->ncounters;
+
+	if (!ncounters || ncounters > XE_RAS_NUM_COUNTERS) {
+		xe_err(xe, "sysctrl: unexpected counter threshold crossed %u\n", ncounters);
+		return;
+	}
+
+	BUILD_BUG_ON(sizeof(response->data) < sizeof(*pending));
+	xe_warn(xe, "[RAS]: counter threshold crossed, %u new errors\n", ncounters);
+
+	for (counter_id = 0; counter_id < ncounters; counter_id++) {
+		u8 severity, component;
+
+		severity = errors[counter_id].common.severity;
+		component = errors[counter_id].common.component;
+
+		xe_warn(xe, "[RAS]: %s %s detected\n",
+			comp_to_str(component), sev_to_str(severity));
+	}
+}
diff --git a/drivers/gpu/drm/xe/xe_ras.h b/drivers/gpu/drm/xe/xe_ras.h
new file mode 100644
index 000000000000..ea90593b62dc
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_ras.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2026 Intel Corporation
+ */
+
+#ifndef _XE_RAS_H_
+#define _XE_RAS_H_
+
+struct xe_device;
+struct xe_sysctrl_event_response;
+
+void xe_ras_counter_threshold_crossed(struct xe_device *xe,
+				      struct xe_sysctrl_event_response *response);
+
+#endif
diff --git a/drivers/gpu/drm/xe/xe_ras_types.h b/drivers/gpu/drm/xe/xe_ras_types.h
new file mode 100644
index 000000000000..4e63c67f806a
--- /dev/null
+++ b/drivers/gpu/drm/xe/xe_ras_types.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2026 Intel Corporation
+ */
+
+#ifndef _XE_RAS_TYPES_H_
+#define _XE_RAS_TYPES_H_
+
+#include <linux/types.h>
+
+#define XE_RAS_NUM_COUNTERS			16
+
+/**
+ * struct xe_ras_error_common - Error fields that are common across all products
+ */
+struct xe_ras_error_common {
+	/** @severity: Error severity */
+	u8 severity;
+	/** @component: IP block where error originated */
+	u8 component;
+} __packed;
+
+/**
+ * struct xe_ras_error_unit - Error unit information
+ */
+struct xe_ras_error_unit {
+	/** @tile: Tile identifier */
+	u8 tile;
+	/** @instance: Instance identifier specific to IP */
+	u32 instance;
+} __packed;
+
+/**
+ * struct xe_ras_error_cause - Error cause information
+ */
+struct xe_ras_error_cause {
+	/** @cause: Cause/checker */
+	u32 cause;
+	/** @reserved: For future use */
+	u8 reserved;
+} __packed;
+
+/**
+ * struct xe_ras_error_product - Error fields that are specific to the product
+ */
+struct xe_ras_error_product {
+	/** @unit: Unit within IP block */
+	struct xe_ras_error_unit unit;
+	/** @cause: Cause/checker */
+	struct xe_ras_error_cause cause;
+} __packed;
+
+/**
+ * struct xe_ras_error_class - Combines common and product-specific parts
+ */
+struct xe_ras_error_class {
+	/** @common: Common error type and component */
+	struct xe_ras_error_common common;
+	/** @product: Product-specific unit and cause */
+	struct xe_ras_error_product product;
+} __packed;
+
+/**
+ * struct xe_ras_threshold_crossed - Data for threshold crossed event
+ */
+struct xe_ras_threshold_crossed {
+	/** @ncounters: Number of error counters that crossed thresholds */
+	u32 ncounters;
+	/** @counters: Array of error counters that crossed threshold */
+	struct xe_ras_error_class counters[XE_RAS_NUM_COUNTERS];
+} __packed;
+
+#endif
diff --git a/drivers/gpu/drm/xe/xe_sysctrl_event.c b/drivers/gpu/drm/xe/xe_sysctrl_event.c
index 74163e0bafe2..e96af8be07a2 100644
--- a/drivers/gpu/drm/xe/xe_sysctrl_event.c
+++ b/drivers/gpu/drm/xe/xe_sysctrl_event.c
@@ -6,6 +6,7 @@
 #include "xe_device.h"
 #include "xe_irq.h"
 #include "xe_printk.h"
+#include "xe_ras.h"
 #include "xe_sysctrl.h"
 #include "xe_sysctrl_event_types.h"
 #include "xe_sysctrl_mailbox.h"
@@ -35,7 +36,7 @@ static void get_pending_event(struct xe_sysctrl *sc, struct xe_sysctrl_mailbox_c
 		}
 
 		if (response->event == XE_SYSCTRL_EVENT_THRESHOLD_CROSSED)
-			xe_warn(xe, "[RAS]: counter threshold crossed\n");
+			xe_ras_counter_threshold_crossed(xe, response);
 		else
 			xe_err(xe, "sysctrl: unexpected event %#x\n", response->event);
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH v1 08/11] drm/xe/ras: Get error threshold support
From: Raag Jadav @ 2026-04-17 21:16 UTC (permalink / raw)
  To: intel-xe, dri-devel, netdev
  Cc: simona.vetter, airlied, kuba, lijo.lazar, Hawking.Zhang, davem,
	pabeni, edumazet, maarten, zachary.mckevitt, rodrigo.vivi,
	riana.tauro, michal.wajdeczko, matthew.d.roper,
	umesh.nerlige.ramappa, mallesh.koujalagi, soham.purkait,
	anoop.c.vijay, aravind.iddamsetty, Raag Jadav
In-Reply-To: <20260417211730.837345-1-raag.jadav@intel.com>

The system controller allows programming a per-error threshold value,
which it uses to raise error events to the driver. Get it using a
mailbox command so that it can be exposed to the user.

Signed-off-by: Raag Jadav <raag.jadav@intel.com>
---
 drivers/gpu/drm/xe/xe_ras.c                   | 73 +++++++++++++++++++
 drivers/gpu/drm/xe/xe_ras.h                   |  3 +
 drivers/gpu/drm/xe/xe_ras_types.h             | 22 ++++++
 drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h |  2 +
 4 files changed, 100 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c
index 08e91348c459..3e93f838aa4a 100644
--- a/drivers/gpu/drm/xe/xe_ras.c
+++ b/drivers/gpu/drm/xe/xe_ras.c
@@ -3,11 +3,14 @@
  * Copyright © 2026 Intel Corporation
  */
 
+#include "xe_pm.h"
 #include "xe_printk.h"
 #include "xe_ras.h"
 #include "xe_ras_types.h"
 #include "xe_sysctrl.h"
 #include "xe_sysctrl_event_types.h"
+#include "xe_sysctrl_mailbox.h"
+#include "xe_sysctrl_mailbox_types.h"
 
 /* Severity of detected errors  */
 enum xe_ras_severity {
@@ -49,6 +52,23 @@ static const char *const xe_ras_components[] = {
 };
 static_assert(ARRAY_SIZE(xe_ras_components) == XE_RAS_COMP_MAX);
 
+/* uAPI mapping */
+static const int drm_to_xe_ras_components[] = {
+	[DRM_XE_RAS_ERR_COMP_CORE_COMPUTE]	= XE_RAS_COMP_CORE_COMPUTE,
+	[DRM_XE_RAS_ERR_COMP_SOC_INTERNAL]	= XE_RAS_COMP_SOC_INTERNAL,
+	[DRM_XE_RAS_ERR_COMP_DEVICE_MEMORY]	= XE_RAS_COMP_DEVICE_MEMORY,
+	[DRM_XE_RAS_ERR_COMP_PCIE]		= XE_RAS_COMP_PCIE,
+	[DRM_XE_RAS_ERR_COMP_FABRIC]		= XE_RAS_COMP_FABRIC
+};
+static_assert(ARRAY_SIZE(drm_to_xe_ras_components) == DRM_XE_RAS_ERR_COMP_MAX);
+
+/* uAPI mapping */
+static const int drm_to_xe_ras_severities[] = {
+	[DRM_XE_RAS_ERR_SEV_CORRECTABLE]	= XE_RAS_SEV_CORRECTABLE,
+	[DRM_XE_RAS_ERR_SEV_UNCORRECTABLE]	= XE_RAS_SEV_UNCORRECTABLE
+};
+static_assert(ARRAY_SIZE(drm_to_xe_ras_severities) == DRM_XE_RAS_ERR_SEV_MAX);
+
 static inline const char *sev_to_str(u8 sev)
 {
 	if (sev >= XE_RAS_SEV_MAX)
@@ -90,3 +110,56 @@ void xe_ras_counter_threshold_crossed(struct xe_device *xe,
 			comp_to_str(component), sev_to_str(severity));
 	}
 }
+
+static void ras_command_prepare(struct xe_sysctrl_mailbox_command *command,
+				void *request, size_t request_len, void *response,
+				size_t response_len, u8 hdr_cmd)
+{
+	struct xe_sysctrl_app_msg_hdr header = {};
+
+	header.data = REG_FIELD_PREP(APP_HDR_GROUP_ID_MASK, XE_SYSCTRL_GROUP_GFSP) |
+		      REG_FIELD_PREP(APP_HDR_COMMAND_MASK, hdr_cmd);
+
+	command->header = header;
+	command->data_in = request;
+	command->data_in_len = request_len;
+	command->data_out = response;
+	command->data_out_len = response_len;
+}
+
+int xe_ras_get_threshold(struct xe_device *xe, u32 severity, u32 component, u32 *threshold)
+{
+	struct xe_ras_get_threshold_response response = {};
+	struct xe_ras_get_threshold_request request = {};
+	struct xe_sysctrl_mailbox_command command = {};
+	struct xe_ras_error_class counter = {};
+	size_t len;
+	int ret;
+
+	counter.common.severity = drm_to_xe_ras_severities[severity];
+	counter.common.component = drm_to_xe_ras_components[component];
+	request.counter = counter;
+
+	ras_command_prepare(&command, &request, sizeof(request), &response,
+			    sizeof(response), XE_SYSCTRL_CMD_GET_THRESHOLD);
+
+	guard(xe_pm_runtime)(xe);
+	ret = xe_sysctrl_send_command(&xe->sc, &command, &len);
+	if (ret) {
+		xe_err(xe, "sysctrl: failed to get threshold %d\n", ret);
+		return ret;
+	}
+
+	if (len != sizeof(response)) {
+		xe_err(xe, "sysctrl: unexpected get threshold response length %zu (expected %zu)\n",
+		       len, sizeof(response));
+		return -EIO;
+	}
+
+	counter = response.counter;
+	*threshold = response.threshold;
+
+	xe_dbg(xe, "[RAS]: Get threshold %u for %s %s\n", response.threshold,
+	       comp_to_str(counter.common.component), sev_to_str(counter.common.severity));
+	return 0;
+}
diff --git a/drivers/gpu/drm/xe/xe_ras.h b/drivers/gpu/drm/xe/xe_ras.h
index ea90593b62dc..982bbe61461e 100644
--- a/drivers/gpu/drm/xe/xe_ras.h
+++ b/drivers/gpu/drm/xe/xe_ras.h
@@ -6,10 +6,13 @@
 #ifndef _XE_RAS_H_
 #define _XE_RAS_H_
 
+#include <linux/types.h>
+
 struct xe_device;
 struct xe_sysctrl_event_response;
 
 void xe_ras_counter_threshold_crossed(struct xe_device *xe,
 				      struct xe_sysctrl_event_response *response);
+int xe_ras_get_threshold(struct xe_device *xe, u32 severity, u32 component, u32 *threshold);
 
 #endif
diff --git a/drivers/gpu/drm/xe/xe_ras_types.h b/drivers/gpu/drm/xe/xe_ras_types.h
index 4e63c67f806a..d5da93d65cf5 100644
--- a/drivers/gpu/drm/xe/xe_ras_types.h
+++ b/drivers/gpu/drm/xe/xe_ras_types.h
@@ -70,4 +70,26 @@ struct xe_ras_threshold_crossed {
 	struct xe_ras_error_class counters[XE_RAS_NUM_COUNTERS];
 } __packed;
 
+/**
+ * struct xe_ras_get_threshold_request - Request structure for get threshold
+ */
+struct xe_ras_get_threshold_request {
+	/** @counter: Counter to get threshold for */
+	struct xe_ras_error_class counter;
+	/** @reserved: Reserved for future use */
+	u32 reserved;
+} __packed;
+
+/**
+ * struct xe_ras_get_threshold_response - Response structure for get threshold
+ */
+struct xe_ras_get_threshold_response {
+	/** @counter: Counter id */
+	struct xe_ras_error_class counter;
+	/** @threshold: Threshold value */
+	u32 threshold;
+	/** @reserved: Reserved for future use */
+	u32 reserved[4];
+} __packed;
+
 #endif
diff --git a/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h b/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h
index 84d7c647e743..a1b71218deca 100644
--- a/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h
+++ b/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h
@@ -22,9 +22,11 @@ enum xe_sysctrl_group {
 /**
  * enum xe_sysctrl_gfsp_cmd - Commands supported by GFSP group
  *
+ * @XE_SYSCTRL_CMD_GET_THRESHOLD: Retrieve error threshold
  * @XE_SYSCTRL_CMD_GET_PENDING_EVENT: Retrieve pending event
  */
 enum xe_sysctrl_gfsp_cmd {
+	XE_SYSCTRL_CMD_GET_THRESHOLD		= 0x05,
 	XE_SYSCTRL_CMD_GET_PENDING_EVENT	= 0x07,
 };
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH v1 09/11] drm/xe/ras: Set error threshold support
From: Raag Jadav @ 2026-04-17 21:16 UTC (permalink / raw)
  To: intel-xe, dri-devel, netdev
  Cc: simona.vetter, airlied, kuba, lijo.lazar, Hawking.Zhang, davem,
	pabeni, edumazet, maarten, zachary.mckevitt, rodrigo.vivi,
	riana.tauro, michal.wajdeczko, matthew.d.roper,
	umesh.nerlige.ramappa, mallesh.koujalagi, soham.purkait,
	anoop.c.vijay, aravind.iddamsetty, Raag Jadav
In-Reply-To: <20260417211730.837345-1-raag.jadav@intel.com>

The system controller allows programming a per-error threshold value,
which it uses to raise error events to the driver. Set it using a
mailbox command so that it can be programmed by the user.

Signed-off-by: Raag Jadav <raag.jadav@intel.com>
---
 drivers/gpu/drm/xe/xe_ras.c                   | 42 +++++++++++++++++++
 drivers/gpu/drm/xe/xe_ras.h                   |  1 +
 drivers/gpu/drm/xe/xe_ras_types.h             | 28 +++++++++++++
 drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h |  2 +
 4 files changed, 73 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_ras.c b/drivers/gpu/drm/xe/xe_ras.c
index 3e93f838aa4a..26e063166c5f 100644
--- a/drivers/gpu/drm/xe/xe_ras.c
+++ b/drivers/gpu/drm/xe/xe_ras.c
@@ -163,3 +163,45 @@ int xe_ras_get_threshold(struct xe_device *xe, u32 severity, u32 component, u32
 	       comp_to_str(counter.common.component), sev_to_str(counter.common.severity));
 	return 0;
 }
+
+int xe_ras_set_threshold(struct xe_device *xe, u32 severity, u32 component, u32 threshold)
+{
+	struct xe_ras_set_threshold_response response = {};
+	struct xe_ras_set_threshold_request request = {};
+	struct xe_sysctrl_mailbox_command command = {};
+	struct xe_ras_error_class counter = {};
+	size_t len;
+	int ret;
+
+	counter.common.severity = drm_to_xe_ras_severities[severity];
+	counter.common.component = drm_to_xe_ras_components[component];
+	request.counter = counter;
+	request.threshold = threshold;
+
+	ras_command_prepare(&command, &request, sizeof(request), &response,
+			    sizeof(response), XE_SYSCTRL_CMD_SET_THRESHOLD);
+
+	guard(xe_pm_runtime)(xe);
+	ret = xe_sysctrl_send_command(&xe->sc, &command, &len);
+	if (ret) {
+		xe_err(xe, "sysctrl: failed to set threshold %d\n", ret);
+		return ret;
+	}
+
+	if (len != sizeof(response)) {
+		xe_err(xe, "sysctrl: unexpected set threshold response length %zu (expected %zu)\n",
+		       len, sizeof(response));
+		return -EIO;
+	}
+
+	if (response.status) {
+		xe_err(xe, "sysctrl: set threshold operation failed %#x\n", response.status);
+		return -EIO;
+	}
+
+	counter = response.counter;
+
+	xe_dbg(xe, "[RAS]: Set threshold %u for %s %s\n", response.threshold,
+	       comp_to_str(counter.common.component), sev_to_str(counter.common.severity));
+	return 0;
+}
diff --git a/drivers/gpu/drm/xe/xe_ras.h b/drivers/gpu/drm/xe/xe_ras.h
index 982bbe61461e..d1f71b1de723 100644
--- a/drivers/gpu/drm/xe/xe_ras.h
+++ b/drivers/gpu/drm/xe/xe_ras.h
@@ -14,5 +14,6 @@ struct xe_sysctrl_event_response;
 void xe_ras_counter_threshold_crossed(struct xe_device *xe,
 				      struct xe_sysctrl_event_response *response);
 int xe_ras_get_threshold(struct xe_device *xe, u32 severity, u32 component, u32 *threshold);
+int xe_ras_set_threshold(struct xe_device *xe, u32 severity, u32 component, u32 threshold);
 
 #endif
diff --git a/drivers/gpu/drm/xe/xe_ras_types.h b/drivers/gpu/drm/xe/xe_ras_types.h
index d5da93d65cf5..d7e4a02a661d 100644
--- a/drivers/gpu/drm/xe/xe_ras_types.h
+++ b/drivers/gpu/drm/xe/xe_ras_types.h
@@ -92,4 +92,32 @@ struct xe_ras_get_threshold_response {
 	u32 reserved[4];
 } __packed;
 
+/**
+ * struct xe_ras_set_threshold_request - Request structure for set threshold
+ */
+struct xe_ras_set_threshold_request {
+	/** @counter: Counter to set threshold for */
+	struct xe_ras_error_class counter;
+	/** @threshold: Threshold value to set */
+	u32 threshold;
+	/** @reserved: Reserved for future use */
+	u32 reserved;
+} __packed;
+
+/**
+ * struct xe_ras_set_threshold_response - Response structure for set threshold
+ */
+struct xe_ras_set_threshold_response {
+	/** @counter: Counter id */
+	struct xe_ras_error_class counter;
+	/** @threshold_old: Old threshold value */
+	u32 threshold_old;
+	/** @threshold: New threshold value */
+	u32 threshold;
+	/** @status: Set threshold operation status */
+	u32 status;
+	/** @reserved: Reserved for future use */
+	u32 reserved[2];
+} __packed;
+
 #endif
diff --git a/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h b/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h
index a1b71218deca..b865768e903b 100644
--- a/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h
+++ b/drivers/gpu/drm/xe/xe_sysctrl_mailbox_types.h
@@ -23,10 +23,12 @@ enum xe_sysctrl_group {
  * enum xe_sysctrl_gfsp_cmd - Commands supported by GFSP group
  *
  * @XE_SYSCTRL_CMD_GET_THRESHOLD: Retrieve error threshold
+ * @XE_SYSCTRL_CMD_SET_THRESHOLD: Set error threshold
  * @XE_SYSCTRL_CMD_GET_PENDING_EVENT: Retrieve pending event
  */
 enum xe_sysctrl_gfsp_cmd {
 	XE_SYSCTRL_CMD_GET_THRESHOLD		= 0x05,
+	XE_SYSCTRL_CMD_SET_THRESHOLD		= 0x06,
 	XE_SYSCTRL_CMD_GET_PENDING_EVENT	= 0x07,
 };
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH v1 10/11] drm/xe/drm_ras: Wire up error threshold callbacks
From: Raag Jadav @ 2026-04-17 21:16 UTC (permalink / raw)
  To: intel-xe, dri-devel, netdev
  Cc: simona.vetter, airlied, kuba, lijo.lazar, Hawking.Zhang, davem,
	pabeni, edumazet, maarten, zachary.mckevitt, rodrigo.vivi,
	riana.tauro, michal.wajdeczko, matthew.d.roper,
	umesh.nerlige.ramappa, mallesh.koujalagi, soham.purkait,
	anoop.c.vijay, aravind.iddamsetty, Raag Jadav
In-Reply-To: <20260417211730.837345-1-raag.jadav@intel.com>

Now that we have get/set error threshold support in the xe driver, wire
them up to drm_ras so that the user can make use of both functionalities.

$ sudo ynl --family drm_ras --do get-error-threshold --json \
  '{"node-id":0, "error-id":2}'
{'error-id': 2, 'error-name': 'soc-internal', 'error-threshold': 0}

$ sudo ynl --family drm_ras --do set-error-threshold --json \
  '{"node-id":0, "error-id":2, "error-threshold":8}'
None

Signed-off-by: Raag Jadav <raag.jadav@intel.com>
---
 drivers/gpu/drm/xe/xe_drm_ras.c | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_drm_ras.c b/drivers/gpu/drm/xe/xe_drm_ras.c
index e07dc23a155e..824dabd5c29e 100644
--- a/drivers/gpu/drm/xe/xe_drm_ras.c
+++ b/drivers/gpu/drm/xe/xe_drm_ras.c
@@ -11,6 +11,7 @@
 
 #include "xe_device_types.h"
 #include "xe_drm_ras.h"
+#include "xe_ras.h"
 
 static const char * const error_components[] = DRM_XE_RAS_ERROR_COMPONENT_NAMES;
 static const char * const error_severity[] = DRM_XE_RAS_ERROR_SEVERITY_NAMES;
@@ -47,6 +48,27 @@ static int query_correctable_error_counter(struct drm_ras_node *ep, u32 error_id
 	return hw_query_error_counter(info, error_id, name, val);
 }
 
+static int query_correctable_error_threshold(struct drm_ras_node *ep, u32 error_id,
+					     const char **name, u32 *val)
+{
+	struct xe_device *xe = ep->priv;
+
+	if (!xe->info.has_sysctrl)
+		return -EOPNOTSUPP;
+
+	return xe_ras_get_threshold(xe, DRM_XE_RAS_ERR_SEV_CORRECTABLE, error_id, val);
+}
+
+static int set_correctable_error_threshold(struct drm_ras_node *ep, u32 error_id, u32 val)
+{
+	struct xe_device *xe = ep->priv;
+
+	if (!xe->info.has_sysctrl)
+		return -EOPNOTSUPP;
+
+	return xe_ras_set_threshold(xe, DRM_XE_RAS_ERR_SEV_CORRECTABLE, error_id, val);
+}
+
 static struct xe_drm_ras_counter *allocate_and_copy_counters(struct xe_device *xe)
 {
 	struct xe_drm_ras_counter *counter;
@@ -92,10 +114,13 @@ static int assign_node_params(struct xe_device *xe, struct drm_ras_node *node,
 	if (IS_ERR(ras->info[severity]))
 		return PTR_ERR(ras->info[severity]);
 
-	if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE)
+	if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE) {
 		node->query_error_counter = query_correctable_error_counter;
-	else
+		node->query_error_threshold = query_correctable_error_threshold;
+		node->set_error_threshold = set_correctable_error_threshold;
+	} else {
 		node->query_error_counter = query_uncorrectable_error_counter;
+	}
 
 	return 0;
 }
-- 
2.43.0


^ permalink raw reply related

* [PATCH v1 11/11] drm/xe/ras: Add flag for Xe RAS
From: Raag Jadav @ 2026-04-17 21:16 UTC (permalink / raw)
  To: intel-xe, dri-devel, netdev
  Cc: simona.vetter, airlied, kuba, lijo.lazar, Hawking.Zhang, davem,
	pabeni, edumazet, maarten, zachary.mckevitt, rodrigo.vivi,
	riana.tauro, michal.wajdeczko, matthew.d.roper,
	umesh.nerlige.ramappa, mallesh.koujalagi, soham.purkait,
	anoop.c.vijay, aravind.iddamsetty, Raag Jadav
In-Reply-To: <20260417211730.837345-1-raag.jadav@intel.com>

From: Riana Tauro <riana.tauro@intel.com>

Add a flag for RAS. If enabled, the Xe driver registers with
drm_ras and exposes the supported counters.

Currently this is enabled for PVC and CRI.

Signed-off-by: Riana Tauro <riana.tauro@intel.com>
---
 drivers/gpu/drm/xe/xe_device_types.h | 2 ++
 drivers/gpu/drm/xe/xe_hw_error.c     | 2 +-
 drivers/gpu/drm/xe/xe_pci.c          | 3 +++
 drivers/gpu/drm/xe/xe_pci_types.h    | 1 +
 4 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
index 31df9debcbb0..7a8afd06e6b8 100644
--- a/drivers/gpu/drm/xe/xe_device_types.h
+++ b/drivers/gpu/drm/xe/xe_device_types.h
@@ -191,6 +191,8 @@ struct xe_device {
 		u8 has_ctx_tlb_inval:1;
 		/** @info.has_range_tlb_inval: Has range based TLB invalidations */
 		u8 has_range_tlb_inval:1;
+		/** @info.has_ras: Device supports RAS (Reliability, Availability, Serviceability) */
+		u8 has_ras:1;
 		/** @info.has_soc_remapper_sysctrl: Has SoC remapper system controller */
 		u8 has_soc_remapper_sysctrl:1;
 		/** @info.has_soc_remapper_telem: Has SoC remapper telemetry support */
diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
index 2a31b430570e..3ab0fceb151f 100644
--- a/drivers/gpu/drm/xe/xe_hw_error.c
+++ b/drivers/gpu/drm/xe/xe_hw_error.c
@@ -520,7 +520,7 @@ void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
 
 static int hw_error_info_init(struct xe_device *xe)
 {
-	if (xe->info.platform != XE_PVC)
+	if (!xe->info.has_ras)
 		return 0;
 
 	return xe_drm_ras_init(xe);
diff --git a/drivers/gpu/drm/xe/xe_pci.c b/drivers/gpu/drm/xe/xe_pci.c
index 278c2860a4f6..10ff207affa9 100644
--- a/drivers/gpu/drm/xe/xe_pci.c
+++ b/drivers/gpu/drm/xe/xe_pci.c
@@ -365,6 +365,7 @@ static const __maybe_unused struct xe_device_desc pvc_desc = {
 	.vm_max_level = 4,
 	.vram_flags = XE_VRAM_FLAGS_NEED64K,
 	.has_mbx_power_limits = false,
+	.has_ras = true,
 };
 
 static const struct xe_device_desc mtl_desc = {
@@ -472,6 +473,7 @@ static const struct xe_device_desc cri_desc = {
 	.require_force_probe = true,
 	.va_bits = 57,
 	.vm_max_level = 4,
+	.has_ras = true,
 };
 
 static const struct xe_device_desc nvlp_desc = {
@@ -761,6 +763,7 @@ static int xe_info_init_early(struct xe_device *xe,
 	xe->info.has_page_reclaim_hw_assist = desc->has_page_reclaim_hw_assist;
 	xe->info.has_pre_prod_wa = desc->has_pre_prod_wa;
 	xe->info.has_pxp = desc->has_pxp;
+	xe->info.has_ras = desc->has_ras;
 	xe->info.has_soc_remapper_sysctrl = desc->has_soc_remapper_sysctrl;
 	xe->info.has_soc_remapper_telem = desc->has_soc_remapper_telem;
 	xe->info.has_sriov = xe_configfs_primary_gt_allowed(to_pci_dev(xe->drm.dev)) &&
diff --git a/drivers/gpu/drm/xe/xe_pci_types.h b/drivers/gpu/drm/xe/xe_pci_types.h
index 5b85e2c24b7b..70a9d4995cbd 100644
--- a/drivers/gpu/drm/xe/xe_pci_types.h
+++ b/drivers/gpu/drm/xe/xe_pci_types.h
@@ -54,6 +54,7 @@ struct xe_device_desc {
 	u8 has_pre_prod_wa:1;
 	u8 has_page_reclaim_hw_assist:1;
 	u8 has_pxp:1;
+	u8 has_ras:1;
 	u8 has_soc_remapper_sysctrl:1;
 	u8 has_soc_remapper_telem:1;
 	u8 has_sriov:1;
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH v6 3/3] dts: s32g: Add GPR syscon region
From: Jared Kangas @ 2026-04-17 21:36 UTC (permalink / raw)
  To: Dan Carpenter
  Cc: Chester Lin, Matthias Brugger, Ghennadi Procopciuc,
	NXP S32 Linux Team, Frank Li, Sascha Hauer,
	Pengutronix Kernel Team, Fabio Estevam, Rob Herring,
	Krzysztof Kozlowski, Conor Dooley, linux-arm-kernel, imx,
	devicetree, linux-kernel, linaro-s32, netdev
In-Reply-To: <0e922537c02d1c47734142090f98eb78e921ed34.1769764941.git.dan.carpenter@linaro.org>

Hi Dan,

On Fri, Jan 30, 2026 at 04:19:52PM +0300, Dan Carpenter wrote:
> Add the GPR syscon region for the s32 chipset.
> 
> Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
> ---
>
> [snip]
>
> diff --git a/arch/arm64/boot/dts/freescale/s32g3.dtsi b/arch/arm64/boot/dts/freescale/s32g3.dtsi
> index e314f3c7d61d..be03db737384 100644
> --- a/arch/arm64/boot/dts/freescale/s32g3.dtsi
> +++ b/arch/arm64/boot/dts/freescale/s32g3.dtsi
> @@ -383,6 +383,11 @@ usdhc0-200mhz-grp4 {
>  			};
>  		};
>  
> +		gpr: syscon@4007c000 {
> +			compatible = "nxp,s32g3-gpr", "syscon";
> +			reg = <0x4007c000 0x3000>;
> +		};
> +
>  		ocotp: nvmem@400a4000 {
>  			compatible = "nxp,s32g3-ocotp", "nxp,s32g2-ocotp";
>  			reg = <0x400a4000 0x400>;
> @@ -808,6 +813,7 @@ gmac0: ethernet@4033c000 {
>  			compatible = "nxp,s32g2-dwmac";
>  			reg = <0x4033c000 0x2000>, /* gmac IP */
>  			      <0x4007c004 0x4>;    /* GMAC_0_CTRL_STS */
> +			nxp,phy-sel = <&gpr 0x4>;
>  			interrupt-parent = <&gic>;
>  			interrupts = <GIC_SPI 57 IRQ_TYPE_LEVEL_HIGH>;
>  			interrupt-names = "macirq";

I gave this a test on an S32G-VNP-RDB3 and didn't see any issues on the
dwmac-s32 side, but this appears to trigger a panic when reading the new
debugfs regmap/*/registers file for the syscon node:

    # grep 4007c000 /proc/vmallocinfo
    0xffff800083da8000-0xffff800083dac000   16384 ioremap_prot+0x74/0xe0 phys=0x000000004007c000 ioremap
    # cat /sys/kernel/debug/regmap/dummy-syscon@0x000000004007c000/registers
    Internal error: synchronous external abort: 0000000096000210 [#1]  SMP
    [...]
    CPU: 0 UID: 0 PID: 4344 Comm: cat Tainted: G   M        E  X   ------  ---  6.12.0+ #226 PREEMPT_RT
    Tainted: [M]=MACHINE_CHECK, [E]=UNSIGNED_MODULE, [X]=AUX
    [...]
    pc : regmap_mmio_read32le+0x44/0xa0
    lr : regmap_mmio_read32le+0x44/0xa0
    [...]
    x23: ffff00080c080000 x22: ffff000802ac4c00 x21: ffff800087b13c9c
    x20: ffff800080a46494 x19: ffff800083da810c x18: 0000000000000004
    [...]
    x5 : ffff800080a46448 x4 : ffff800083da8000 x3 : ffff800080a46494
    x2 : ffff800080a47230 x1 : ffff800083da810c x0 : 0000000000000020
    Call trace:
     regmap_mmio_read32le+0x44/0xa0 (P)
     regmap_mmio_read+0x4c/0x80
     [...]
    Code: 52800400 8b214093 aa1303e1 97f4caf0 (b9400275)
    ---[ end trace 0000000000000000 ]---
    Kernel panic - not syncing: synchronous external abort: Fatal exception

Running this through decodecode gives:

    All code
    ========
       0:   52800400        mov     w0, #0x20                       // #32
       4:   8b214093        add     x19, x4, w1, uxtw
       8:   aa1303e1        mov     x1, x19
       c:   97f4caf0        bl      0xffffffffffd32bcc
      10:*  b9400275        ldr     w21, [x19]              <-- trapping instruction

    Code starting with the faulting instruction
    ===========================================
       0:   b9400275        ldr     w21, [x19]

x19's offset from the base address in /proc/vmallocinfo is 0x10c, which
points to a bad read at physical address 0x4007c10c; I also confirmed
that the preceding memory reads back without issues:

    # head -c 990 /sys/kernel/debug/regmap/dummy-syscon@0x000000004007c000/registers | tail -1
    0104: 00000000
    # head -c 1005 /sys/kernel/debug/regmap/dummy-syscon@0x000000004007c000/registers | tail -1
    0108: 00000000
    # head -c 1020 /sys/kernel/debug/regmap/dummy-syscon@0x000000004007c000/registers | tail -1
    <panic>

Best,
Jared


^ permalink raw reply

* Re: [PATCH] tcp: fix orphan count order in __tcp_close()
From: Ruben Kelevra @ 2026-04-17 21:36 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: netdev, ncardwell, kuniyu, davem, dsahern, kuba, pabeni, horms
In-Reply-To: <CANn89i+zy=YttsDesBzvtSrkdY+==L_L6j4O7EFjYKEggcXbGA@mail.gmail.com>

On Fri, Apr 17, 2026 at 10:54 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Fri, Apr 17, 2026 at 1:25 PM RubenKelevra <rubenkelevra@gmail.com> wrote:
> >
> > __tcp_close() calls sock_orphan(sk) first and drains the backlog with
> > __release_sock(sk), which might call tcp_done() which decrements the
> > tcp_orphan_count. After which we will increment tcp_orphan_count again.
> >
> > Since tcp_orphan_count is an unsigned int, we underflow to uint_max if we
> > started with a 0 - at least on all current supported platforms.
> >
> > I could not locate a direct user of this value, and in
> > tcp_orphan_count_sum() this underflow is contained by adding the unsigned
> > int value into a signed int sum, causing it to behave like -1 on current
> > supported platforms and then get clamped by max(n, 0) to 0.
> >
> > The impact therefore is currently limited to e.g. tcp_too_many_orphans()
> > checking an artificially low value, if the cached sum is refreshed within
> > this timeframe.
> >
> > This fix mirrors the previous fix I found while investigating: commit
> > 75c2d9077c63 ("[TCP]: Fix sock_orphan dead lock")
> >
> > Later commit eb4dea585304 ("net: Fix percpu counters deadlock") moved the
> > increment down for old percpu_counter reasons. commit 19757cebf0c5 ("tcp:
> > switch orphan_count to bare per-cpu counters") changed orphan accounting to
> > plain per-cpu counters, so that old reason no longer applies the same way
> > now.
>
> I find this patch rather confusing. Have you used an LLM to generate it?
>
> You are mentioning old patches that are not relevant (bh disable/enable).
>
> Given we advise only increasing tcp_max_orphans value (default being
> 262144 on modern hosts),
> your patch has really no effect.

Hey Eric,

no. But I'm not a native speaker. I'm sorry if my explanation did not
meet your or the project's communication quality.

The point I was trying to make is that currently we underflow and
overflow an unsigned and signed int in case we start with 0 in
tcp_orphan_count. And while it currently has no real effect on real
world applications, as we use a 100ms cached value anyway, I just
wanted to clean up this "old" approach which was apparently chosen for
historical reasons to circumvent bugs, not because it was the intended
implementation.

That's what my list of old commits about the history was meant to reference.

Best regards,

Ruben

On Fri, Apr 17, 2026 at 10:54 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Fri, Apr 17, 2026 at 1:25 PM RubenKelevra <rubenkelevra@gmail.com> wrote:
> >
> > __tcp_close() calls sock_orphan(sk) first and drains the backlog with
> > __release_sock(sk), which might call tcp_done() which decrements the
> > tcp_orphan_count. After which we will increment tcp_orphan_count again.
> >
> > Since tcp_orphan_count is an unsigned int, we underflow to uint_max if we
> > started with a 0 - at least on all current supported platforms.
> >
> > I could not locate a direct user of this value, and in
> > tcp_orphan_count_sum() this underflow is contained by adding the unsigned
> > int value into a signed int sum, causing it to behave like -1 on current
> > supported platforms and then get clamped by max(n, 0) to 0.
> >
> > The impact therefore is currently limited to e.g. tcp_too_many_orphans()
> > checking an artificially low value, if the cached sum is refreshed within
> > this timeframe.
> >
> > This fix mirrors the previous fix I found while investigating: commit
> > 75c2d9077c63 ("[TCP]: Fix sock_orphan dead lock")
> >
> > Later commit eb4dea585304 ("net: Fix percpu counters deadlock") moved the
> > increment down for old percpu_counter reasons. commit 19757cebf0c5 ("tcp:
> > switch orphan_count to bare per-cpu counters") changed orphan accounting to
> > plain per-cpu counters, so that old reason no longer applies the same way
> > now.
>
> I find this patch rather confusing. Have you used an LLM to generate it?
>
> You are mentioning old patches that are not relevant (bh disable/enable).
>
> Given we advise only increasing tcp_max_orphans value (default being
> 262144 on modern hosts),
> your patch has really no effect.

^ permalink raw reply

* Re: [PATCH] tcp: fix orphan count order in __tcp_close()
From: Eric Dumazet @ 2026-04-17 21:43 UTC (permalink / raw)
  To: Ruben Kelevra
  Cc: netdev, ncardwell, kuniyu, davem, dsahern, kuba, pabeni, horms
In-Reply-To: <CAGHX7-OsuADyxGr-QoRRspuXYm_3aS590Ez4zvGsjyPsSKLJsg@mail.gmail.com>

On Fri, Apr 17, 2026 at 2:36 PM Ruben Kelevra <rubenkelevra@gmail.com> wrote:
>
> Hey Eric,
>
> no. But I'm not a native speaker. I'm sorry if my explanation did not
> meet your or the project's communication quality.
>
> The point I was trying to make is that currently we underflow and
> overflow an unsigned and signed int in case we start with 0 in
> tcp_orphan_count. And while it currently has no real effect on real
> world applications, as we use a 100ms cached value anyway, I just
> wanted to clean up this "old" approach which was apparently chosen for
> historical reasons to circumvent bugs, not because it was the intended
> implementation.
>
> That's what my list of old commits about the history was meant to reference.

We are using per-cpu counters, folded every 100ms, so the precise
value is irrelevant,
as long as it is balanced (does not increase to infinity)

Thanks.

^ permalink raw reply

* Re: [PATCH net] ipv6: Implement limits on extension header parsing
From: Eric Dumazet @ 2026-04-17 21:45 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: kuba, dsahern, tom, willemdebruijn.kernel, idosch, pabeni, netdev
In-Reply-To: <20260417171831.687053-1-daniel@iogearbox.net>

On Fri, Apr 17, 2026 at 10:18 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> ipv6_{skip_exthdr,find_hdr}() and ip6_tnl_parse_tlv_enc_lim() iterate
> over IPv6 extension headers until they find a non-extension-header
> protocol or run out of packet data. The loops have no iteration counter,
> relying solely on the packet length to bound them. For a crafted packet
> with 8-byte extension headers filling a 64KB jumbogram, this means a
> worst case of up to ~8k iterations with a skb_header_pointer call each.
> ipv6_skip_exthdr(), for example, is used where it parses the inner
> quoted packet inside an incoming ICMPv6 error:
>
>   - icmpv6_rcv
>     - checksum validation
>     - case ICMPV6_DEST_UNREACH
>       - icmpv6_notify
>         - pskb_may_pull()       <- pull inner IPv6 header
>         - ipv6_skip_exthdr()    <- iterates here
>         - pskb_may_pull()
>         - ipprot->err_handler() <- sk lookup (matching sk not required)
>
> The per-iteration cost of ipv6_skip_exthdr itself is generally light,
> but skb_header_pointer becomes more costly on reassembled packets: the
> first ~1KB of the inner packet are in the skb's linear area, but the
> remaining ~63KB are in the frag_list where skb_copy_bits is needed to
> read data.
>
> Add a configurable limit via a new sysctl net.ipv6.max_ext_hdrs_number
> (default 32, minimum 1). All three extension header walking functions
> are bound by this limit. The sysctl is in line with commit 47d3d7ac656a
> ("ipv6: Implement limits on Hop-by-Hop and Destination options"). The
> init_net is used since plumbing a struct net * through all helpers
> would touch a lot of callsites.
>
> There's an ongoing IETF draft-ietf-6man-eh-limits-18 that states that
> 8 extension headers before the transport header is the baseline which
> routers MUST handle; section 7 details also why limits are needed.
>
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>

Reviewed-by: Eric Dumazet <edumazet@google.com>

Thanks!

^ permalink raw reply

* [PATCH net] ipv6: Apply max_dst_opts_cnt to ip6_tnl_parse_tlv_enc_lim
From: Daniel Borkmann @ 2026-04-17 22:03 UTC (permalink / raw)
  To: kuba; +Cc: edumazet, dsahern, tom, willemdebruijn.kernel, idosch, pabeni,
	netdev

Commit 47d3d7ac656a ("ipv6: Implement limits on Hop-by-Hop and
Destination options") added net.ipv6.max_{hbh,dst}_opts_{cnt,len}
and applied them in ip6_parse_tlv(), the generic TLV walker
invoked from ipv6_destopt_rcv() and ipv6_parse_hopopts().

ip6_tnl_parse_tlv_enc_lim() does not go through ip6_parse_tlv();
it has its own hand-rolled TLV scanner inside its NEXTHDR_DEST
branch which looks for IPV6_TLV_TNL_ENCAP_LIMIT. That inner
loop is bounded only by optlen, which can be up to 2048 bytes.
Stuffing the Destination Options header with 2046 Pad1 (type=0)
entries advances the scanner a single byte at a time, yielding
~2000 TLV iterations per extension header.

Reuse max_dst_opts_cnt to bound the TLV iterations, matching
the semantics from 47d3d7ac656a.

Fixes: 47d3d7ac656a ("ipv6: Implement limits on Hop-by-Hop and Destination options")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/ipv6/ip6_tunnel.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 907c6a2af331..0ab76f93c136 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -430,11 +430,16 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
 				break;
 		}
 		if (nexthdr == NEXTHDR_DEST) {
+			int tlv_max = READ_ONCE(init_net.ipv6.sysctl.max_dst_opts_cnt);
+			int tlv_cnt = 0;
 			u16 i = 2;

 			while (1) {
 				struct ipv6_tlv_tnl_enc_lim *tel;

+				if (unlikely(tlv_cnt++ >= tlv_max))
+					break;
+
 				/* No more room for encapsulation limit */
 				if (i + sizeof(*tel) > optlen)
 					break;
-- 
2.43.0

^ permalink raw reply related

* [BUG] net: tcp: SO_LINGER with l_linger=0 leaks memory when closing sockets with pending send data
From: Ahmed, Aaron @ 2026-04-18  0:19 UTC (permalink / raw)
  To: stable@vger.kernel.org, netdev@vger.kernel.org
  Cc: ncardwell@google.com, edumazet@google.com, kuniyu@google.com

Hi,

We have identified a TCP memory leak issue on Amazon Linux with kernel versions 5.15.168 through 6.18.20 that occurs when closing sockets with SO_LINGER set to l_onoff=1, l_linger=0, on servers handling many persistent connections with full write buffers.

Overview:

The issue was discovered on a public-facing non-blocking TCP server that maintains many persistent connections and streams data to clients. When a client cannot read fast enough, the TCP write socket buffer on the server side fills up and send() returns EAGAIN. At that point, the server application disconnects the slow client by setting SO_LINGER to l_onoff=1, l_linger=0 and calling close(). This is intended to immediately reset the connection and release all associated kernel resources. However, while the socket disappears from netstat and sockstat (TCP inuse drops), the write buffer memory is not properly reclaimed. /proc/net/sockstat shows TCP mem pages accumulating with no owning sockets, causing the leaked memory to grow past the tcp_mem limits. Setting SO_LINGER to l_onoff=1, l_linger=1 instead does not leak. With l_linger=1, the connection goes through FIN_WAIT1 → FIN_WAIT2 → CLOSE (confirmed with BPF tcpstates), and all memory is freed properly. With l_linger=0, the connection transitions directly from ESTABLISHED → CLOSE via RST, bypassing the FIN states entirely.

Reproducer:
```
/* tcp_linger_memleak.c - SO_LINGER(0) TCP memory leak reproducer
 *
 * Build:  gcc -O2 -o tcp_linger_memleak tcp_linger_memleak.c
 * Run:    sudo sysctl -w net.core.wmem_max=4194304
 *         sudo sysctl -w net.ipv4.tcp_rmem="4096 8192 16384"
 *         ./tcp_linger_memleak
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <netinet/in.h>
	
#define NUM_CONNS 5000
#define PORT      6666

/*
 * Print every line of the file at `path` that begins with `prefix`,
 * each one tagged with `label`.
 *
 * If the file cannot be opened (for example because the process has run
 * out of file descriptors), report the error on stderr and return instead
 * of handing a NULL stream to fgets()/fclose().
 */
static void print_matching_lines(const char *label, const char *path,
                                 const char *prefix) {
    char line[256];
    size_t prefix_len = strlen(prefix);
    FILE *f = fopen(path, "r");

    if (!f) {
        perror(path);
        return;
    }
    while (fgets(line, sizeof(line), f)) {
        if (strncmp(line, prefix, prefix_len) == 0)
            printf("%s: %s", label, line);
    }
    fclose(f);
}

/*
 * Print a memory snapshot tagged with `label`: the "MemAvailable:" line
 * from /proc/meminfo followed by the "TCP:" line from /proc/net/sockstat.
 * A file that cannot be opened is reported and skipped.
 */
static void print_mem(const char *label) {
    print_matching_lines(label, "/proc/meminfo", "MemAvailable:");
    print_matching_lines(label, "/proc/net/sockstat", "TCP:");
}

/*
 * Reproducer driver.
 *
 * Sequence: listen on 127.0.0.1:PORT, fork a child that opens NUM_CONNS
 * connections and never reads, accept them all, stop the child, then for
 * each accepted socket send until send() stops returning a positive value,
 * set SO_LINGER to {l_onoff = 1, l_linger = 0} and close it. print_mem()
 * is called before the fill/close loop and again two seconds after it.
 *
 * Return values of socket(), bind(), listen(), fork(), connect() and
 * accept() are not checked. Parent and child each open NUM_CONNS sockets,
 * so the open-file limit must be above NUM_CONNS for the run to be
 * meaningful.
 */
int main(void) {
    struct sockaddr_in addr = {
        .sin_family = AF_INET,
        .sin_port = htons(PORT),
        .sin_addr.s_addr = htonl(INADDR_LOOPBACK)
    };
    int opt = 1;
    /* Ignore SIGPIPE process-wide; send() below also passes MSG_NOSIGNAL */
    signal(SIGPIPE, SIG_IGN);

    /* Listening socket on the loopback address, backlog of NUM_CONNS */
    int lsn = socket(AF_INET, SOCK_STREAM, 0);
    setsockopt(lsn, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
    bind(lsn, (struct sockaddr *)&addr, sizeof(addr));
    listen(lsn, NUM_CONNS);

    /* Fork client: connect N times, never read */
    pid_t child = fork();
    if (child == 0) {
        int fds[NUM_CONNS];
        for (int i = 0; i < NUM_CONNS; i++) {
            fds[i] = socket(AF_INET, SOCK_STREAM, 0);
            connect(fds[i], (struct sockaddr *)&addr, sizeof(addr));
        }
        pause(); /* sit forever, never read */
        _exit(0);
    }

    /* Accept all connections */
    int clients[NUM_CONNS];
    for (int i = 0; i < NUM_CONNS; i++)
        clients[i] = accept(lsn, NULL, NULL);

    /* Freeze client so it stops reading */
    kill(child, SIGSTOP);
    printf("=== %d connections established, client frozen ===\n", NUM_CONNS);
    print_mem("BEFORE");

    /* Fill buffers and close with SO_LINGER(1,0) */
    char buf[2048];
    memset(buf, 'A', sizeof(buf));
    for (int i = 0; i < NUM_CONNS; i++) {
        /* Non-blocking mode so send() returns instead of waiting for space */
        int flags = fcntl(clients[i], F_GETFL, 0);
        fcntl(clients[i], F_SETFL, flags | O_NONBLOCK);
        /* Empty loop body: keep sending until send() returns <= 0 */
        while (send(clients[i], buf, sizeof(buf), MSG_NOSIGNAL) > 0);
        struct linger lg = { .l_onoff = 1, .l_linger = 0 };
        setsockopt(clients[i], SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
        close(clients[i]);
    }

    /* Wait two seconds before taking the second snapshot */
    sleep(2);
    printf("\n=== All sockets closed with SO_LINGER(1,0) ===\n");
    print_mem("AFTER");
    /* Tear down: kill and reap the stopped child, close the listener */
    kill(child, SIGKILL);
    waitpid(child, NULL, 0);
    close(lsn);
    return 0;
}
```
Output (Tested on 6.18.20):
```
=== 5000 connections established, client frozen ===
BEFORE: MemAvailable:   95491288 kB
BEFORE: TCP: inuse 10005 orphan 0 tw 5 alloc 10006 mem 0

=== All sockets closed with SO_LINGER(1,0) ===
AFTER: MemAvailable:   95321800 kB
AFTER: TCP: inuse 5 orphan 0 tw 5 alloc 5006 mem 8300
```

Thanks,
Aaron Ahmed



^ permalink raw reply

* Re: [PATCH for-7.1-fixes 1/2] rhashtable: add no_sync_grow option
From: Herbert Xu @ 2026-04-18  0:44 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Thomas Graf, David Vernet, Andrea Righi, Changwoo Min,
	Emil Tsalapatis, linux-crypto, sched-ext, linux-kernel,
	Florian Westphal, netdev
In-Reply-To: <aeJe8oIyYUi-NtCQ@slm.duckdns.org>

On Fri, Apr 17, 2026 at 06:25:22AM -1000, Tejun Heo wrote:
>
> That'd be great but looking at the commit, I'm not sure it reliably avoids
> allocation in the synchronous path.

If insecure_elasticity is set it should skip the slow path
altogether and just do the insertion unconditionally.  So
there will be no kmallocs at all.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [BUG] net: tcp: SO_LINGER with l_linger=0 leaks memory when closing sockets with pending send data
From: Kuniyuki Iwashima @ 2026-04-18  0:44 UTC (permalink / raw)
  To: Ahmed, Aaron
  Cc: stable@vger.kernel.org, netdev@vger.kernel.org,
	ncardwell@google.com, edumazet@google.com
In-Reply-To: <48BADABE-4DFB-4DAD-8248-E94D8F5238D2@amazon.com>

Hi Aaron :)

Thanks for the report.

On Fri, Apr 17, 2026 at 5:20 PM Ahmed, Aaron <aarnahmd@amazon.com> wrote:
>
> Hi,
>
> We have identified a TCP memory leak issue on Amazon Linux with kernel versions 5.15.168 through 6.18.20 that occurs when closing sockets with SO_LINGER set to l_onoff=1, l_linger=0, on servers handling many persistent connections with full write buffers.
>
> Overview:
>
> The issue was discovered on a public-facing non-blocking TCP server that maintains many persistent connections and streams data to clients. When a client cannot read fast enough, the TCP write socket buffer on the server side fills up and send() returns EAGAIN. At that point, the server application disconnects the slow client by setting SO_LINGER to l_onoff=1, l_linger=0 and calling close(). This is intended to immediately reset the connection and release all associated kernel resources. However, while the socket disappears from netstat and sockstat (TCP inuse drops), the write buffer memory is not properly reclaimed. /proc/net/sockstat shows TCP mem pages accumulating with no owning sockets, causing the leaked memory to grow past the tcp_mem limits. Setting SO_LINGER to l_onoff=1, l_linger=1 instead does not leak. With l_linger=1, the connection goes through FIN_WAIT1 → FIN_WAIT2 → CLOSE (confirmed with BPF tcpstates), and all memory is freed properly. With l_linger=0, the connection transitions directly from ESTABLISHED → CLOSE via RST, bypassing the FIN states entirely.
>
> Reproducer:
> ```
> /* tcp_linger_memleak.c - SO_LINGER(0) TCP memory leak reproducer
>  *
>  * Build:  gcc -O2 -o tcp_linger_memleak tcp_linger_memleak.c
>  * Run:    sudo sysctl -w net.core.wmem_max=4194304
>  *         sudo sysctl -w net.ipv4.tcp_rmem="4096 8192 16384"
>  *         ./tcp_linger_memleak
>  */
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <unistd.h>
> #include <errno.h>
> #include <fcntl.h>
> #include <signal.h>
> #include <sys/socket.h>
> #include <sys/wait.h>
> #include <netinet/in.h>
>
> #define NUM_CONNS 5000
> #define PORT      6666
>
> static void print_mem(const char *label) {
>     FILE *f;
>     char line[256];
>     f = fopen("/proc/meminfo", "r");
>     while (fgets(line, sizeof(line), f))
>         if (strncmp(line, "MemAvailable:", 13) == 0)
>             printf("%s: %s", label, line);
>     fclose(f);
>     f = fopen("/proc/net/sockstat", "r");
>     while (fgets(line, sizeof(line), f))
>         if (strncmp(line, "TCP:", 4) == 0)
>             printf("%s: %s", label, line);
>     fclose(f);
> }
>
> int main(void) {
>     struct sockaddr_in addr = {
>         .sin_family = AF_INET,
>         .sin_port = htons(PORT),
>         .sin_addr.s_addr = htonl(INADDR_LOOPBACK)
>     };
>     int opt = 1;
>     signal(SIGPIPE, SIG_IGN);
>
>     int lsn = socket(AF_INET, SOCK_STREAM, 0);
>     setsockopt(lsn, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
>     bind(lsn, (struct sockaddr *)&addr, sizeof(addr));
>     listen(lsn, NUM_CONNS);
>
>     /* Fork client: connect N times, never read */
>     pid_t child = fork();
>     if (child == 0) {
>         int fds[NUM_CONNS];
>         for (int i = 0; i < NUM_CONNS; i++) {
>             fds[i] = socket(AF_INET, SOCK_STREAM, 0);
>             connect(fds[i], (struct sockaddr *)&addr, sizeof(addr));
>         }
>         pause(); /* sit forever, never read */
>         _exit(0);
>     }
>
>     /* Accept all connections */
>     int clients[NUM_CONNS];
>     for (int i = 0; i < NUM_CONNS; i++)
>         clients[i] = accept(lsn, NULL, NULL);
>
>     /* Freeze client so it stops reading */
>     kill(child, SIGSTOP);
>     printf("=== %d connections established, client frozen ===\n", NUM_CONNS);
>     print_mem("BEFORE");
>
>     /* Fill buffers and close with SO_LINGER(1,0) */
>     char buf[2048];
>     memset(buf, 'A', sizeof(buf));
>     for (int i = 0; i < NUM_CONNS; i++) {
>         int flags = fcntl(clients[i], F_GETFL, 0);
>         fcntl(clients[i], F_SETFL, flags | O_NONBLOCK);
>         while (send(clients[i], buf, sizeof(buf), MSG_NOSIGNAL) > 0);
>         struct linger lg = { .l_onoff = 1, .l_linger = 0 };
>         setsockopt(clients[i], SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
>         close(clients[i]);
>     }
>
>     sleep(2);
>     printf("\n=== All sockets closed with SO_LINGER(1,0) ===\n");
>     print_mem("AFTER");
>     kill(child, SIGKILL);
>     waitpid(child, NULL, 0);
>     close(lsn);
>     return 0;
> }
> ```
> Output (Tested on 6.18.20):
> ```
> === 5000 connections established, client frozen ===
> BEFORE: MemAvailable:   95491288 kB
> BEFORE: TCP: inuse 10005 orphan 0 tw 5 alloc 10006 mem 0
>
> === All sockets closed with SO_LINGER(1,0) ===
> AFTER: MemAvailable:   95321800 kB
> AFTER: TCP: inuse 5 orphan 0 tw 5 alloc 5006 mem 8300
> ```

Unfortunately, it dies immediately on my end.

=== 5000 connections established, client frozen ===
Segmentation fault         (core dumped) ./linux/tcp_linger


Did you see actual memory leak with kmemleak or is it
just the tcp_mem counter that is really leaked ?

# echo clear > /sys/kernel/debug/kmemleak
~ run repro ~
# echo scan > /sys/kernel/debug/kmemleak

^ permalink raw reply

* Re: [PATCH for-7.1-fixes 1/2] rhashtable: add no_sync_grow option
From: Tejun Heo @ 2026-04-18  0:52 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Thomas Graf, David Vernet, Andrea Righi, Changwoo Min,
	Emil Tsalapatis, linux-crypto, sched-ext, linux-kernel,
	Florian Westphal, netdev
In-Reply-To: <aeLT8eB_xfzLxqbI@gondor.apana.org.au>

Hello,

On Sat, Apr 18, 2026 at 08:44:33AM +0800, Herbert Xu wrote:
> On Fri, Apr 17, 2026 at 06:25:22AM -1000, Tejun Heo wrote:
> >
> > That'd be great but looking at the commit, I'm not sure it reliably avoids
> > allocation in the synchronous path.
> 
> If insecure_elasticity is set it should skip the slow path
> altogether and just do the insertion unconditionally.  So
> there will be no kmallocs at all.

I see. Thanks, that should work. How should we go about reverting the
removal?

Thanks.

-- 
tejun

^ permalink raw reply

* Re: [PATCH for-7.1-fixes 1/2] rhashtable: add no_sync_grow option
From: Herbert Xu @ 2026-04-18  0:53 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Thomas Graf, David Vernet, Andrea Righi, Changwoo Min,
	Emil Tsalapatis, linux-crypto, sched-ext, linux-kernel,
	Florian Westphal, netdev
In-Reply-To: <aeLV6aDhM0-S4oQ1@slm.duckdns.org>

On Fri, Apr 17, 2026 at 02:52:57PM -1000, Tejun Heo wrote:
>
> I see. Thanks, that should work. How should we go about reverting the
> removal?

I'll work on that today and then you can include it in your
two-patch series.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [BUG] net: tcp: SO_LINGER with l_linger=0 leaks memory when closing sockets with pending send data
From: Kuniyuki Iwashima @ 2026-04-18  1:06 UTC (permalink / raw)
  To: Ahmed, Aaron
  Cc: stable@vger.kernel.org, netdev@vger.kernel.org,
	ncardwell@google.com, edumazet@google.com
In-Reply-To: <CAAVpQUCfMsWBpPpywbwBLRCdHUqWqFBoDK=17dwDkG6T0dQxzw@mail.gmail.com>

On Fri, Apr 17, 2026 at 5:44 PM Kuniyuki Iwashima <kuniyu@google.com> wrote:
>
> Hi Aaron :)
>
> Thanks for the report.
>
> On Fri, Apr 17, 2026 at 5:20 PM Ahmed, Aaron <aarnahmd@amazon.com> wrote:
> >
> > Hi,
> >
> > We have identified a TCP memory leak issue on Amazon Linux with kernel versions 5.15.168 through 6.18.20 that occurs when closing sockets with SO_LINGER set to l_onoff=1, l_linger=0, on servers handling many persistent connections with full write buffers.
> >
> > Overview:
> >
> > The issue was discovered on a public-facing non-blocking TCP server that maintains many persistent connections and streams data to clients. When a client cannot read fast enough, the TCP write socket buffer on the server side fills up and send() returns EAGAIN. At that point, the server application disconnects the slow client by setting SO_LINGER to l_onoff=1, l_linger=0 and calling close(). This is intended to immediately reset the connection and release all associated kernel resources. However, while the socket disappears from netstat and sockstat (TCP inuse drops), the write buffer memory is not properly reclaimed. /proc/net/sockstat shows TCP mem pages accumulating with no owning sockets, causing the leaked memory to grow past the tcp_mem limits. Setting SO_LINGER to l_onoff=1, l_linger=1 instead does not leak. With l_linger=1, the connection goes through FIN_WAIT1 → FIN_WAIT2 → CLOSE (confirmed with BPF tcpstates), and all memory is freed properly. With l_linger=0, the connection transitions directly from ESTABLISHED → CLOSE via RST, bypassing the FIN states entirely.
> >
> > Reproducer:
> > ```
> > /* tcp_linger_memleak.c - SO_LINGER(0) TCP memory leak reproducer
> >  *
> >  * Build:  gcc -O2 -o tcp_linger_memleak tcp_linger_memleak.c
> >  * Run:    sudo sysctl -w net.core.wmem_max=4194304
> >  *         sudo sysctl -w net.ipv4.tcp_rmem="4096 8192 16384"
> >  *         ./tcp_linger_memleak
> >  */
> > #include <stdio.h>
> > #include <stdlib.h>
> > #include <string.h>
> > #include <unistd.h>
> > #include <errno.h>
> > #include <fcntl.h>
> > #include <signal.h>
> > #include <sys/socket.h>
> > #include <sys/wait.h>
> > #include <netinet/in.h>
> >
> > #define NUM_CONNS 5000
> > #define PORT      6666
> >
> > static void print_mem(const char *label) {
> >     FILE *f;
> >     char line[256];
> >     f = fopen("/proc/meminfo", "r");
> >     while (fgets(line, sizeof(line), f))
> >         if (strncmp(line, "MemAvailable:", 13) == 0)
> >             printf("%s: %s", label, line);
> >     fclose(f);
> >     f = fopen("/proc/net/sockstat", "r");
> >     while (fgets(line, sizeof(line), f))
> >         if (strncmp(line, "TCP:", 4) == 0)
> >             printf("%s: %s", label, line);
> >     fclose(f);
> > }
> >
> > int main(void) {
> >     struct sockaddr_in addr = {
> >         .sin_family = AF_INET,
> >         .sin_port = htons(PORT),
> >         .sin_addr.s_addr = htonl(INADDR_LOOPBACK)
> >     };
> >     int opt = 1;
> >     signal(SIGPIPE, SIG_IGN);
> >
> >     int lsn = socket(AF_INET, SOCK_STREAM, 0);
> >     setsockopt(lsn, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
> >     bind(lsn, (struct sockaddr *)&addr, sizeof(addr));
> >     listen(lsn, NUM_CONNS);
> >
> >     /* Fork client: connect N times, never read */
> >     pid_t child = fork();
> >     if (child == 0) {
> >         int fds[NUM_CONNS];
> >         for (int i = 0; i < NUM_CONNS; i++) {
> >             fds[i] = socket(AF_INET, SOCK_STREAM, 0);
> >             connect(fds[i], (struct sockaddr *)&addr, sizeof(addr));
> >         }
> >         pause(); /* sit forever, never read */
> >         _exit(0);
> >     }
> >
> >     /* Accept all connections */
> >     int clients[NUM_CONNS];
> >     for (int i = 0; i < NUM_CONNS; i++)
> >         clients[i] = accept(lsn, NULL, NULL);
> >
> >     /* Freeze client so it stops reading */
> >     kill(child, SIGSTOP);
> >     printf("=== %d connections established, client frozen ===\n", NUM_CONNS);
> >     print_mem("BEFORE");
> >
> >     /* Fill buffers and close with SO_LINGER(1,0) */
> >     char buf[2048];
> >     memset(buf, 'A', sizeof(buf));
> >     for (int i = 0; i < NUM_CONNS; i++) {
> >         int flags = fcntl(clients[i], F_GETFL, 0);
> >         fcntl(clients[i], F_SETFL, flags | O_NONBLOCK);
> >         while (send(clients[i], buf, sizeof(buf), MSG_NOSIGNAL) > 0);
> >         struct linger lg = { .l_onoff = 1, .l_linger = 0 };
> >         setsockopt(clients[i], SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
> >         close(clients[i]);
> >     }
> >
> >     sleep(2);
> >     printf("\n=== All sockets closed with SO_LINGER(1,0) ===\n");
> >     print_mem("AFTER");
> >     kill(child, SIGKILL);
> >     waitpid(child, NULL, 0);
> >     close(lsn);
> >     return 0;
> > }
> > ```
> > Output (Tested on 6.18.20):
> > ```
> > === 5000 connections established, client frozen ===
> > BEFORE: MemAvailable:   95491288 kB
> > BEFORE: TCP: inuse 10005 orphan 0 tw 5 alloc 10006 mem 0
> >
> > === All sockets closed with SO_LINGER(1,0) ===
> > AFTER: MemAvailable:   95321800 kB
> > AFTER: TCP: inuse 5 orphan 0 tw 5 alloc 5006 mem 8300
> > ```
>
> Unfortunately, it dies immediately on my end.
>
> === 5000 connections established, client frozen ===
> Segmentation fault         (core dumped) ./linux/tcp_linger

This was due to small ulimit -n and fopen() returned
NULL being passed to fgets().

But I don't see any leak of memory nor counter after
the repro.

Note that the tcp_mem counter could be cached in
per-cpu counters, see proto_memory_pcpu_drain() etc.

---8<---
[root@fedora ~]# unshare -n
[root@fedora ~]# ip link set lo up
[root@fedora ~]# echo clear > /sys/kernel/debug/kmemleak
[root@fedora ~]# ulimit -n 100000 && ./linux/tcp_linger
=== 5000 connections established, client frozen ===
BEFORE: MemAvailable:   54683048 kB
BEFORE: TCP: inuse 10001 orphan 0 tw 0 alloc 10008 mem 0
=== All sockets closed with SO_LINGER(1,0) ===
AFTER: MemAvailable:   54616304 kB
AFTER: TCP: inuse 1 orphan 0 tw 0 alloc 5008 mem 3842
[root@fedora ~]# cat /proc/net/sockstat
sockets: used 0
TCP: inuse 0 orphan 0 tw 0 alloc 7 mem 0
UDP: inuse 0 mem 0
RAW: inuse 0
FRAG: inuse 0 memory 0
[root@fedora ~]# cat /proc/meminfo | grep Available
MemAvailable:   54732456 kB
[root@fedora ~]# echo scan > /sys/kernel/debug/kmemleak
[root@fedora ~]#
---8<---

^ permalink raw reply

* [PATCH] rhashtable: Restore insecure_elasticity toggle
From: Herbert Xu @ 2026-04-18  1:38 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Thomas Graf, David Vernet, Andrea Righi, Changwoo Min,
	Emil Tsalapatis, linux-crypto, sched-ext, linux-kernel,
	Florian Westphal, netdev, NeilBrown
In-Reply-To: <aeLWH_HgSHF4buiJ@gondor.apana.org.au>

Some users of rhashtable cannot handle insertion failures, and
are happy to accept the consequences of a hash table having
very long chains.

Restore the insecure_elasticity toggle for these users.  In
addition to disabling the chain length checks, this also removes
the emergency resize that would otherwise occur when the hash
table occupancy hits 100% (an async resize is still scheduled
at 75%).

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
index 015c8298bebc..72082428d6c6 100644
--- a/include/linux/rhashtable-types.h
+++ b/include/linux/rhashtable-types.h
@@ -49,6 +49,7 @@ typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
  * @head_offset: Offset of rhash_head in struct to be hashed
  * @max_size: Maximum size while expanding
  * @min_size: Minimum size while shrinking
+ * @insecure_elasticity: Set to true to disable chain length checks
  * @automatic_shrinking: Enable automatic shrinking of tables
  * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
  * @obj_hashfn: Function to hash object
@@ -61,6 +62,7 @@ struct rhashtable_params {
 	u16			head_offset;
 	unsigned int		max_size;
 	u16			min_size;
+	bool			insecure_elasticity;
 	bool			automatic_shrinking;
 	rht_hashfn_t		hashfn;
 	rht_obj_hashfn_t	obj_hashfn;
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 0480509a6339..c793849d3f61 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -821,14 +821,15 @@ static __always_inline void *__rhashtable_insert_fast(
 		goto out;
 	}
 
-	if (elasticity <= 0)
+	if (elasticity <= 0 && !params->insecure_elasticity)
 		goto slow_path;
 
 	data = ERR_PTR(-E2BIG);
 	if (unlikely(rht_grow_above_max(ht, tbl)))
 		goto out_unlock;
 
-	if (unlikely(rht_grow_above_100(ht, tbl)))
+	if (unlikely(rht_grow_above_100(ht, tbl)) &&
+	    !params->insecure_elasticity)
 		goto slow_path;
 
 	/* Inserting at head of list makes unlocking free. */
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 6074ed5f66f3..b60d55e5b19b 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -538,7 +538,7 @@ static void *rhashtable_lookup_one(struct rhashtable *ht,
 		return NULL;
 	}
 
-	if (elasticity <= 0)
+	if (elasticity <= 0 && !ht->p->insecure_elasticity)
 		return ERR_PTR(-EAGAIN);
 
 	return ERR_PTR(-ENOENT);
@@ -568,7 +568,8 @@ static struct bucket_table *rhashtable_insert_one(
 	if (unlikely(rht_grow_above_max(ht, tbl)))
 		return ERR_PTR(-E2BIG);
 
-	if (unlikely(rht_grow_above_100(ht, tbl)))
+	if (unlikely(rht_grow_above_100(ht, tbl)) &&
+	    !ht->p->insecure_elasticity)
 		return ERR_PTR(-EAGAIN);
 
 	head = rht_ptr(bkt, tbl, hash);
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related

* [v2 PATCH] rhashtable: Restore insecure_elasticity toggle
From: Herbert Xu @ 2026-04-18  1:41 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Thomas Graf, David Vernet, Andrea Righi, Changwoo Min,
	Emil Tsalapatis, linux-crypto, sched-ext, linux-kernel,
	Florian Westphal, netdev, NeilBrown
In-Reply-To: <aeLgjAeJuidWNy3N@gondor.apana.org.au>

This one actually compiles.
---8<---
Some users of rhashtable cannot handle insertion failures, and
are happy to accept the consequences of a hash table that has
very long chains.

Restore the insecure_elasticity toggle for these users.  In
addition to disabling the chain length checks, this also removes
the emergency resize that would otherwise occur when the hash
table occupancy hits 100% (an async resize is still scheduled
at 75%).

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
index 015c8298bebc..72082428d6c6 100644
--- a/include/linux/rhashtable-types.h
+++ b/include/linux/rhashtable-types.h
@@ -49,6 +49,7 @@ typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg,
  * @head_offset: Offset of rhash_head in struct to be hashed
  * @max_size: Maximum size while expanding
  * @min_size: Minimum size while shrinking
+ * @insecure_elasticity: Set to true to disable chain length checks
  * @automatic_shrinking: Enable automatic shrinking of tables
  * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash)
  * @obj_hashfn: Function to hash object
@@ -61,6 +62,7 @@ struct rhashtable_params {
 	u16			head_offset;
 	unsigned int		max_size;
 	u16			min_size;
+	bool			insecure_elasticity;
 	bool			automatic_shrinking;
 	rht_hashfn_t		hashfn;
 	rht_obj_hashfn_t	obj_hashfn;
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 0480509a6339..7def3f0f556b 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -821,14 +821,15 @@ static __always_inline void *__rhashtable_insert_fast(
 		goto out;
 	}
 
-	if (elasticity <= 0)
+	if (elasticity <= 0 && !params.insecure_elasticity)
 		goto slow_path;
 
 	data = ERR_PTR(-E2BIG);
 	if (unlikely(rht_grow_above_max(ht, tbl)))
 		goto out_unlock;
 
-	if (unlikely(rht_grow_above_100(ht, tbl)))
+	if (unlikely(rht_grow_above_100(ht, tbl)) &&
+	    !params.insecure_elasticity)
 		goto slow_path;
 
 	/* Inserting at head of list makes unlocking free. */
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 6074ed5f66f3..fb2b7bc137ba 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -538,7 +538,7 @@ static void *rhashtable_lookup_one(struct rhashtable *ht,
 		return NULL;
 	}
 
-	if (elasticity <= 0)
+	if (elasticity <= 0 && !ht->p.insecure_elasticity)
 		return ERR_PTR(-EAGAIN);
 
 	return ERR_PTR(-ENOENT);
@@ -568,7 +568,8 @@ static struct bucket_table *rhashtable_insert_one(
 	if (unlikely(rht_grow_above_max(ht, tbl)))
 		return ERR_PTR(-E2BIG);
 
-	if (unlikely(rht_grow_above_100(ht, tbl)))
+	if (unlikely(rht_grow_above_100(ht, tbl)) &&
+	    !ht->p.insecure_elasticity)
 		return ERR_PTR(-EAGAIN);
 
 	head = rht_ptr(bkt, tbl, hash);
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related

* [PATCH net-next] r8169: report per-queue statistics through netdev qstats
From: Gustavo Arantes @ 2026-04-18  2:12 UTC (permalink / raw)
  To: Heiner Kallweit, nic_swsd
  Cc: Andrew Lunn, David S . Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, netdev, linux-kernel

r8169 maintains synchronized per-CPU software counters for packet and byte
accounting, but does not expose them through the netdev qstats interface.

Add netdev_stat_ops callbacks and report the existing software counters
through queue 0 for both Rx and Tx. Provide zero base stats so device-scope
qstats report the packet and byte counters as supported and match the
existing RTNL statistics.

Signed-off-by: Gustavo Arantes <dev.gustavoa@gmail.com>
---
 drivers/net/ethernet/realtek/r8169_main.c | 70 +++++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index 791277e750ba..9d833b446383 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -5175,6 +5175,75 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	pm_runtime_put_noidle(&pdev->dev);
 }
 
+static void rtl8169_fetch_sw_stats(struct net_device *dev,
+				   struct netdev_queue_stats_rx *rx,
+				   struct netdev_queue_stats_tx *tx)
+{
+	const struct pcpu_sw_netstats *stats;
+	unsigned int start;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+
+		stats = per_cpu_ptr(dev->tstats, cpu);
+		do {
+			start = u64_stats_fetch_begin(&stats->syncp);
+			rx_packets = u64_stats_read(&stats->rx_packets);
+			rx_bytes = u64_stats_read(&stats->rx_bytes);
+			tx_packets = u64_stats_read(&stats->tx_packets);
+			tx_bytes = u64_stats_read(&stats->tx_bytes);
+		} while (u64_stats_fetch_retry(&stats->syncp, start));
+
+		rx->packets += rx_packets;
+		rx->bytes += rx_bytes;
+		tx->packets += tx_packets;
+		tx->bytes += tx_bytes;
+	}
+}
+
+static void rtl8169_get_queue_stats_rx(struct net_device *dev, int idx,
+				       struct netdev_queue_stats_rx *rx)
+{
+	struct netdev_queue_stats_tx tx = {};
+
+	if (idx)
+		return;
+
+	rx->packets = 0;
+	rx->bytes = 0;
+	rtl8169_fetch_sw_stats(dev, rx, &tx);
+}
+
+static void rtl8169_get_queue_stats_tx(struct net_device *dev, int idx,
+				       struct netdev_queue_stats_tx *tx)
+{
+	struct netdev_queue_stats_rx rx = {};
+
+	if (idx)
+		return;
+
+	tx->packets = 0;
+	tx->bytes = 0;
+	rtl8169_fetch_sw_stats(dev, &rx, tx);
+}
+
+static void rtl8169_get_base_stats(struct net_device *dev,
+				   struct netdev_queue_stats_rx *rx,
+				   struct netdev_queue_stats_tx *tx)
+{
+	rx->packets = 0;
+	rx->bytes = 0;
+	tx->packets = 0;
+	tx->bytes = 0;
+}
+
+static const struct netdev_stat_ops rtl8169_stat_ops = {
+	.get_queue_stats_rx	= rtl8169_get_queue_stats_rx,
+	.get_queue_stats_tx	= rtl8169_get_queue_stats_tx,
+	.get_base_stats		= rtl8169_get_base_stats,
+};
+
 static void rtl8169_net_suspend(struct rtl8169_private *tp)
 {
 	netif_device_detach(tp->dev);
@@ -5615,6 +5684,7 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	SET_NETDEV_DEV(dev, &pdev->dev);
 	dev->netdev_ops = &rtl_netdev_ops;
+	dev->stat_ops = &rtl8169_stat_ops;
 	tp = netdev_priv(dev);
 	tp->dev = dev;
 	tp->pci_dev = pdev;
-- 
2.51.2


^ permalink raw reply related

* Re: [PATCH bpf v3 2/2] selftests/bpf: Test TCP_NODELAY in TCP hdr opt callbacks
From: KaFai Wan @ 2026-04-18  2:19 UTC (permalink / raw)
  To: Martin KaFai Lau
  Cc: daniel, john.fastabend, sdf, ast, andrii, eddyz87, memxor, song,
	yonghong.song, jolsa, davem, edumazet, kuba, pabeni, horms, shuah,
	jiayuan.chen, bpf, netdev, linux-kernel, linux-kselftest
In-Reply-To: <2026417162132.9MRI.martin.lau@linux.dev>

On Fri, 2026-04-17 at 09:25 -0700, Martin KaFai Lau wrote:
> On Fri, Apr 17, 2026 at 05:20:35PM +0800, KaFai Wan wrote:
> > diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
> > b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
> > index 56685fc03c7e..7b9dbbb84316 100644
> > --- a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
> > +++ b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
> > @@ -461,7 +461,7 @@ static void misc(void)
> >  	const unsigned int nr_data = 2;
> >  	struct bpf_link *link;
> >  	struct sk_fds sk_fds;
> > -	int i, ret;
> > +	int i, ret, true_val = 1;
> >  
> >  	lport_linum_map_fd = bpf_map__fd(misc_skel->maps.lport_linum_map);
> >  
> > @@ -477,6 +477,10 @@ static void misc(void)
> >  		return;
> >  	}
> >  
> > +	ret = setsockopt(sk_fds.active_fd, SOL_TCP, TCP_NODELAY, &true_val, sizeof(true_val));
> 
> Same comment as in v2. Why this setsockopt is needed?

Sorry, I missed this. It's from the review of v1: my first version would break the setsockopt()
syscall and other CBs besides HDR_OPT_LEN/WRITE_HDR_OPT. So in the test I check setsockopt() and
bpf_setsockopt() in PASSIVE_ESTABLISHED_CB to make sure patch#1 does not break user space and other
CBs.

> The setsockopt in userspace is unnecessary. 

Is bpf_setsockopt() in PASSIVE_ESTABLISHED_CB also unnecessary? I'll respin if they are unnecessary.

> In the future,
> we may need to understand why it is needed here in the first place.

Okay, I'll remember that. Thanks for the review and guidance.

-- 
Thanks,
KaFai

^ permalink raw reply

* [PATCH net v3 0/2] bnge fixes
From: Vikas Gupta @ 2026-04-18  2:34 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, andrew+netdev, horms
  Cc: netdev, linux-kernel, vsrama-krishna.nemani, bhargava.marreddy,
	rajashekar.hudumula, ajit.khaparde, dharmender.garg,
	rahul-rg.gupta, Vikas Gupta

Hi,
 This series fixes two issues.

Patch-1: 
    Due to a wrong HWRM sequence, the driver does not get the correct
    information regarding resources and capabilities.
    The patch fixes the initial HWRM sequence.
Patch-2:
    Remove the unsupported backing store type initialization, which is
    not supported in Thor Ultra devices.

Thanks,
Vikas

v2->v3:
  Addressed Jakub Kicinski's comments.
https://lore.kernel.org/netdev/CAHLZf_uARgZzoTPnnPjxRu5AGeHEOw3yyTEbNHYP3brfwuW0Sw@mail.gmail.com/

v1->v2: 
   Include Fixes tags.


Vikas Gupta (2):
  bnge: fix initial HWRM sequence
  bnge: remove unsupported backing store type

 .../net/ethernet/broadcom/bnge/bnge_core.c    | 30 ++++++++++++++-----
 .../net/ethernet/broadcom/bnge/bnge_rmem.c    | 16 ----------
 2 files changed, 22 insertions(+), 24 deletions(-)

-- 
2.47.1


^ permalink raw reply

* [PATCH net v3 1/2] bnge: fix initial HWRM sequence
From: Vikas Gupta @ 2026-04-18  2:34 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, andrew+netdev, horms
  Cc: netdev, linux-kernel, vsrama-krishna.nemani, bhargava.marreddy,
	rajashekar.hudumula, ajit.khaparde, dharmender.garg,
	rahul-rg.gupta, Vikas Gupta
In-Reply-To: <20260418023438.1597876-1-vikas.gupta@broadcom.com>

Firmware may not advertise correct resources if backing store is not
enabled before resource information is queried.
Fix the initial sequence of HWRMs so that the driver gets capabilities
and resource information correctly.

Fixes: 3fa9e977a0cd ("bng_en: Initialize default configuration")
Signed-off-by: Vikas Gupta <vikas.gupta@broadcom.com>
Reviewed-by: Rahul Gupta <rahul-rg.gupta@broadcom.com>
---
 .../net/ethernet/broadcom/bnge/bnge_core.c    | 30 ++++++++++++++-----
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_core.c b/drivers/net/ethernet/broadcom/bnge/bnge_core.c
index 1c14c5fe8d61..68b74eb2c3a2 100644
--- a/drivers/net/ethernet/broadcom/bnge/bnge_core.c
+++ b/drivers/net/ethernet/broadcom/bnge/bnge_core.c
@@ -74,6 +74,13 @@ static int bnge_func_qcaps(struct bnge_dev *bd)
 		return rc;
 	}
 
+	return 0;
+}
+
+static int bnge_func_qrcaps_qcfg(struct bnge_dev *bd)
+{
+	int rc;
+
 	rc = bnge_hwrm_func_resc_qcaps(bd);
 	if (rc) {
 		dev_err(bd->dev, "query resc caps failure rc: %d\n", rc);
@@ -133,23 +140,28 @@ static int bnge_fw_register_dev(struct bnge_dev *bd)
 
 	bnge_hwrm_fw_set_time(bd);
 
-	rc =  bnge_hwrm_func_drv_rgtr(bd);
+	/* Get the resources and configuration from firmware */
+	rc = bnge_func_qcaps(bd);
 	if (rc) {
-		dev_err(bd->dev, "Failed to rgtr with firmware rc: %d\n", rc);
+		dev_err(bd->dev, "Failed querying caps rc: %d\n", rc);
 		return rc;
 	}
 
 	rc = bnge_alloc_ctx_mem(bd);
 	if (rc) {
 		dev_err(bd->dev, "Failed to allocate ctx mem rc: %d\n", rc);
-		goto err_func_unrgtr;
+		goto err_free_ctx_mem;
 	}
 
-	/* Get the resources and configuration from firmware */
-	rc = bnge_func_qcaps(bd);
+	rc = bnge_hwrm_func_drv_rgtr(bd);
 	if (rc) {
-		dev_err(bd->dev, "Failed initial configuration rc: %d\n", rc);
-		rc = -ENODEV;
+		dev_err(bd->dev, "Failed to rgtr with firmware rc: %d\n", rc);
+		goto err_free_ctx_mem;
+	}
+
+	rc = bnge_func_qrcaps_qcfg(bd);
+	if (rc) {
+		dev_err(bd->dev, "Failed querying resources rc: %d\n", rc);
 		goto err_func_unrgtr;
 	}
 
@@ -158,7 +170,9 @@ static int bnge_fw_register_dev(struct bnge_dev *bd)
 	return 0;
 
 err_func_unrgtr:
-	bnge_fw_unregister_dev(bd);
+	bnge_hwrm_func_drv_unrgtr(bd);
+err_free_ctx_mem:
+	bnge_free_ctx_mem(bd);
 	return rc;
 }
 
-- 
2.47.1


^ permalink raw reply related

* [PATCH net v3 2/2] bnge: remove unsupported backing store type
From: Vikas Gupta @ 2026-04-18  2:34 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, andrew+netdev, horms
  Cc: netdev, linux-kernel, vsrama-krishna.nemani, bhargava.marreddy,
	rajashekar.hudumula, ajit.khaparde, dharmender.garg,
	rahul-rg.gupta, Vikas Gupta
In-Reply-To: <20260418023438.1597876-1-vikas.gupta@broadcom.com>

The backing store type, BNGE_CTX_MRAV, is not applicable in Thor Ultra
devices. Remove it from the backing store configuration, as the firmware
will not populate entities in this backing store type, due to which the
driver load fails.

Fixes: 29c5b358f385 ("bng_en: Add backing store support")
Signed-off-by: Vikas Gupta <vikas.gupta@broadcom.com>
Reviewed-by: Dharmender Garg <dharmender.garg@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnge/bnge_rmem.c | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_rmem.c b/drivers/net/ethernet/broadcom/bnge/bnge_rmem.c
index 94f15e08a88c..b066ee887a09 100644
--- a/drivers/net/ethernet/broadcom/bnge/bnge_rmem.c
+++ b/drivers/net/ethernet/broadcom/bnge/bnge_rmem.c
@@ -324,7 +324,6 @@ int bnge_alloc_ctx_mem(struct bnge_dev *bd)
 	u32 l2_qps, qp1_qps, max_qps;
 	u32 ena, entries_sp, entries;
 	u32 srqs, max_srqs, min;
-	u32 num_mr, num_ah;
 	u32 extra_srqs = 0;
 	u32 extra_qps = 0;
 	u32 fast_qpmd_qps;
@@ -390,21 +389,6 @@ int bnge_alloc_ctx_mem(struct bnge_dev *bd)
 	if (!bnge_is_roce_en(bd))
 		goto skip_rdma;
 
-	ctxm = &ctx->ctx_arr[BNGE_CTX_MRAV];
-	/* 128K extra is needed to accommodate static AH context
-	 * allocation by f/w.
-	 */
-	num_mr = min_t(u32, ctxm->max_entries / 2, 1024 * 256);
-	num_ah = min_t(u32, num_mr, 1024 * 128);
-	ctxm->split_entry_cnt = BNGE_CTX_MRAV_AV_SPLIT_ENTRY + 1;
-	if (!ctxm->mrav_av_entries || ctxm->mrav_av_entries > num_ah)
-		ctxm->mrav_av_entries = num_ah;
-
-	rc = bnge_setup_ctxm_pg_tbls(bd, ctxm, num_mr + num_ah, 2);
-	if (rc)
-		return rc;
-	ena |= FUNC_BACKING_STORE_CFG_REQ_ENABLES_MRAV;
-
 	ctxm = &ctx->ctx_arr[BNGE_CTX_TIM];
 	rc = bnge_setup_ctxm_pg_tbls(bd, ctxm, l2_qps + qp1_qps + extra_qps, 1);
 	if (rc)
-- 
2.47.1


^ permalink raw reply related

* [PATCH v1 net] tcp: Disable usec TS for SYN Cookie.
From: Kuniyuki Iwashima @ 2026-04-18  2:49 UTC (permalink / raw)
  To: Eric Dumazet, Neal Cardwell, David S. Miller, Jakub Kicinski,
	Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev

cookie_tcp_reqsk_alloc() sets tcp_rsk(req)->req_usec_ts to false
unconditionally.

If want_cookie is true in tcp_conn_request(), we should not set
tcp_rsk(req)->req_usec_ts.

Let's not call dst_tcp_usec_ts() for SYN Cookie.

Fixes: 614e8316aa4c ("tcp: add support for usec resolution in TCP TS values")
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 net/ipv4/syncookies.c | 3 ---
 net/ipv4/tcp_input.c  | 3 ++-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index b5f0a65c6786..f5cd9e325d01 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -76,12 +76,9 @@ u64 cookie_init_timestamp(struct request_sock *req, u64 now)
 	if (ts > ts_now)
 		ts -= (1UL << TSBITS);
 
-	if (tcp_rsk(req)->req_usec_ts)
-		return ts * NSEC_PER_USEC;
 	return ts * NSEC_PER_MSEC;
 }
 
-
 static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
 				   __be16 dport, __u32 sseq, __u32 data)
 {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index cba89733d121..8bf202b95c68 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -7720,7 +7720,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		st = af_ops->init_seq_and_ts_off(net, skb);
 
 	if (tmp_opt.tstamp_ok) {
-		tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst);
+		if (!want_cookie)
+			tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst);
 		tcp_rsk(req)->ts_off = st.ts_off;
 	}
 	if (!want_cookie && !isn) {
-- 
2.54.0.rc1.513.gad8abe7a5a-goog


^ permalink raw reply related

* [PATCH v8 net 0/6] netem: bug fixes
From: Stephen Hemminger @ 2026-04-18  3:19 UTC (permalink / raw)
  To: netdev; +Cc: jiri, jhs, horms, Stephen Hemminger

These bugs were found when doing AI-assisted review of sch_netem.c
during investigation of the packet duplication recursion problem
addressed in Jamal's series.

The fixes cover:

 - probability gaps in the 4-state Markov loss model
 - queue limit not accounting for reordered packets
 - PRNG reseeded on every tc change, breaking reproducibility
 - slot configuration not validated (inverted ranges, negative
   delays, negative limits)
 - slot delay arithmetic overflow for ranges above ~2.1 seconds
 - negative latency and jitter wrapping to huge time_to_send
   values via u64 arithmetic

v8 - added check for negative TCA_NETEM_LATENCY64 and TCA_NETEM_JITTER64
   - extended slot validation to cover dist_delay, dist_jitter,
     max_packets and max_bytes

Stephen Hemminger (6):
  net/sched: netem: fix probability gaps in 4-state loss model
  net/sched: netem: fix queue limit check to include reordered packets
  net/sched: netem: only reseed PRNG when seed is explicitly provided
  net/sched: netem: validate slot configuration
  net/sched: netem: fix slot delay calculation overflow
  net/sched: netem: check for negative latency and jitter

 net/sched/sch_netem.c | 76 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 64 insertions(+), 12 deletions(-)

-- 
2.53.0


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox