Linux Documentation
 help / color / mirror / Atom feed
* [RFC V2 1/3] lib/vsprintf: Add support for pgtable entries
From: Anshuman Khandual @ 2026-06-10  4:35 UTC (permalink / raw)
  To: linux-mm
  Cc: Anshuman Khandual, Andy Shevchenko, Rasmus Villemoes,
	Sergey Senozhatsky, Petr Mladek, Steven Rostedt, Jonathan Corbet,
	Andrew Morton, David Hildenbrand, linux-kernel, linux-doc,
	David Hildenbrand, Lorenzo Stoakes, Andy Whitcroft
In-Reply-To: <20260610043545.3725735-1-anshuman.khandual@arm.com>

Add some print formats for pgtable entries at any pgtable level. These new
formats are %pp[g|4|u|m|t][d|e], i.e. %ppgd, %pp4d, %ppud, %ppmd, and %ppte.
These currently support both 32 bit and 64 bit pgtable entries that can be
extended up to 128 bit when required.

Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Hildenbrand <david@kernel.org>
Cc: Lorenzo Stoakes <ljs@kernel.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-doc@vger.kernel.org

 Documentation/core-api/printk-formats.rst | 19 ++++++++
 lib/vsprintf.c                            | 58 +++++++++++++++++++++++
 scripts/checkpatch.pl                     |  2 +-
 3 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst
index c0b1b6089307..e69f91a9dd9d 100644
--- a/Documentation/core-api/printk-formats.rst
+++ b/Documentation/core-api/printk-formats.rst
@@ -696,6 +696,25 @@ Rust
 Only intended to be used from Rust code to format ``core::fmt::Arguments``.
 Do *not* use it from C.
 
+Page Table Entry
+----------------
+
+::
+
+        %p[pgd|p4d|pud|pmd|pte]
+
+Print page table entry at any level.
+
+Passed by reference.
+
+Examples for a 64 bit page table entry, given &(u64)0xc0ffee::
+
+        %ppte   0x0000000000c0ffee
+        %ppmd   0x0000000000c0ffee
+        %ppud   0x0000000000c0ffee
+        %pp4d   0x0000000000c0ffee
+        %ppgd   0x0000000000c0ffee
+
 Thanks
 ======
 
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 9f359b31c8d1..d4ad3048a4db 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -856,6 +856,59 @@ static char *default_pointer(char *buf, char *end, const void *ptr,
 	return ptr_to_id(buf, end, ptr, spec);
 }
 
+static char *pxd_pointer(char *buf, char *end, const void *ptr,
+			 struct printf_spec spec, const char *fmt)
+{
+	if (check_pointer(&buf, end, ptr, spec))
+		return buf;
+
+	if (fmt[1] == 't' && fmt[2] == 'e') {
+		pte_t *pte = (pte_t *)ptr;
+
+		static_assert(sizeof(pte_t) == 4 ||
+			      sizeof(pte_t) == 8,
+			      "pte_t size must be 4 or 8 bytes");
+		return special_hex_number(buf, end, pte_val(ptep_get(pte)), sizeof(pte_t));
+	}
+
+	if (fmt[1] == 'm' && fmt[2] == 'd') {
+		pmd_t *pmd = (pmd_t *)ptr;
+
+		static_assert(sizeof(pmd_t) == 4 ||
+			      sizeof(pmd_t) == 8,
+			      "pmd_t size must be 4 or 8 bytes");
+		return special_hex_number(buf, end, pmd_val(pmdp_get(pmd)), sizeof(pmd_t));
+	}
+
+	if (fmt[1] == 'u' && fmt[2] == 'd') {
+		pud_t *pud = (pud_t *)ptr;
+
+		static_assert(sizeof(pud_t) == 4 ||
+			      sizeof(pud_t) == 8,
+			      "pud_t size must be 4 or 8 bytes");
+		return special_hex_number(buf, end, pud_val(pudp_get(pud)), sizeof(pud_t));
+	}
+
+	if (fmt[1] == '4' && fmt[2] == 'd') {
+		p4d_t *p4d = (p4d_t *)ptr;
+
+		static_assert(sizeof(p4d_t) == 4 ||
+			      sizeof(p4d_t) == 8,
+			      "p4d_t size must be 4 or 8 bytes");
+		return special_hex_number(buf, end, p4d_val(p4dp_get(p4d)), sizeof(p4d_t));
+	}
+
+	if (fmt[1] == 'g' && fmt[2] == 'd') {
+		pgd_t *pgd = (pgd_t *)ptr;
+
+		static_assert(sizeof(pgd_t) == 4 ||
+			      sizeof(pgd_t) == 8,
+			      "pgd_t size must be 4 or 8 bytes");
+		return special_hex_number(buf, end, pgd_val(pgdp_get(pgd)), sizeof(pgd_t));
+	}
+	return default_pointer(buf, end, ptr, spec);
+}
+
 int kptr_restrict __read_mostly;
 
 static noinline_for_stack
@@ -2506,6 +2559,9 @@ early_param("no_hash_pointers", no_hash_pointers_enable);
  *		Without an option prints the full name of the node
  *		f full name
  *		P node name, including a possible unit address
+ * - 'p[g|4|u|m|t][d|e]' For a page table entry, this prints its
+ *			  contents in a hexadecimal format
+ *
  * - 'x' For printing the address unmodified. Equivalent to "%lx".
  *       Please read the documentation (path below) before using!
  * - '[ku]s' For a BPF/tracing related format specifier, e.g. used out of
@@ -2615,6 +2671,8 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
 		default:
 			return error_string(buf, end, "(einval)", spec);
 		}
+	case 'p':
+		return pxd_pointer(buf, end, ptr, spec, fmt);
 	default:
 		return default_pointer(buf, end, ptr, spec);
 	}
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 0492d6afc9a1..f68955858e29 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -6975,7 +6975,7 @@ sub process {
 				my $fmt = get_quoted_string($lines[$count - 1], raw_line($count, 0));
 				$fmt =~ s/%%//g;
 
-				while ($fmt =~ /(\%[\*\d\.]*p(\w)(\w*))/g) {
+				while ($fmt =~ /(\%[\*\d\.]*p(\w)(\w*)(pte|pmd|pud|p4d|pgd))/g) {
 					$specifier = $1;
 					$extension = $2;
 					$qualifier = $3;
-- 
2.30.2


^ permalink raw reply related

* configurable block error injection v4
From: Christoph Hellwig @ 2026-06-10  5:08 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Jonathan Corbet, Damien Le Moal, Hannes Reinecke, Keith Busch,
	linux-block, linux-doc

Hi all,

this series adds a new configurable block error injection facility.
We already have a few ways to inject block errors, but unfortunately most
of them are either not very useful or hard to use, or both:

 - The fail_make_request failure injection point can't distinguish
   different commands, different ranges in the file and can only inject
   plain I/O errors.
 - the should_fail_bio 'dynamic' failure injection has all the same issues
   as fail_make_request
 - dm-error can only fail all commands in the table using BLK_STS_IOERR
   and requires setting up a new block device
 - dm-flakey and dm-dust allow all kinds of configurability, but still
   don't have good error selection, no good support for non-read/write
   commands and are limited to the dm table alignment requirements,
   which for zoned devices enforces setting them up for an entire zone.
   They also once again require setting up a stacked block device,
   which is really annoying in harnesses like xfstests

This series adds a new debugfs-based block layer error injection
that allows configuring which operations and ranges the injection
applies to, and what status to return.  It also allows configuring a
failure ratio similar to the xfs errortag injection.

Changes since v3:
 - use a static branch to guard the new condition
 - split out a new header so that jump_label.h doesn't get pulled into
   blk.h
 - more checking for impossible conditions in blk_status_to_tag
 - more spelling fixes

Changes since v2:
 - improve the documentation a bit
 - fix a spelling mistake in a comment

Changes since v1:
 - drop the should_fail_bio removal and cleanup depending on it, as it's
   used by eBPF programs and thus a hidden UABI.
 - as a result split the code out to its own Kconfig symbol
 - various error handling fixes pointed out by Keith
 - documentation spelling fixes pointed out by Randy

Diffstat:
 Documentation/block/error-injection.rst |   59 ++++++
 Documentation/block/index.rst           |    1 
 block/Kconfig                           |    8 
 block/Makefile                          |    1 
 block/blk-core.c                        |   87 ++++++--
 block/blk-sysfs.c                       |    5 
 block/blk.h                             |    3 
 block/error-injection.c                 |  314 ++++++++++++++++++++++++++++++++
 block/error-injection.h                 |   21 ++
 block/genhd.c                           |    4 
 include/linux/blkdev.h                  |    6 
 11 files changed, 489 insertions(+), 20 deletions(-)

^ permalink raw reply

* [PATCH 1/4] block: add a macro to initialize the status table
From: Christoph Hellwig @ 2026-06-10  5:08 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Jonathan Corbet, Damien Le Moal, Hannes Reinecke, Keith Busch,
	linux-block, linux-doc, Bart Van Assche, Hannes Reinecke
In-Reply-To: <20260610051015.1906799-1-hch@lst.de>

Prepare for adding a new value to the error table by adding a macro
to fill it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@kernel.org>
---
 block/blk-core.c | 45 +++++++++++++++++++++++++--------------------
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 1c637db79e59..43121a9f99f0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -132,39 +132,44 @@ inline const char *blk_op_str(enum req_op op)
 }
 EXPORT_SYMBOL_GPL(blk_op_str);
 
+#define ENT(_tag, _errno, _desc)	\
+[BLK_STS_##_tag] = {				\
+	.errno		= _errno,		\
+	.name		= _desc,		\
+}
 static const struct {
 	int		errno;
 	const char	*name;
 } blk_errors[] = {
-	[BLK_STS_OK]		= { 0,		"" },
-	[BLK_STS_NOTSUPP]	= { -EOPNOTSUPP, "operation not supported" },
-	[BLK_STS_TIMEOUT]	= { -ETIMEDOUT,	"timeout" },
-	[BLK_STS_NOSPC]		= { -ENOSPC,	"critical space allocation" },
-	[BLK_STS_TRANSPORT]	= { -ENOLINK,	"recoverable transport" },
-	[BLK_STS_TARGET]	= { -EREMOTEIO,	"critical target" },
-	[BLK_STS_RESV_CONFLICT]	= { -EBADE,	"reservation conflict" },
-	[BLK_STS_MEDIUM]	= { -ENODATA,	"critical medium" },
-	[BLK_STS_PROTECTION]	= { -EILSEQ,	"protection" },
-	[BLK_STS_RESOURCE]	= { -ENOMEM,	"kernel resource" },
-	[BLK_STS_DEV_RESOURCE]	= { -EBUSY,	"device resource" },
-	[BLK_STS_AGAIN]		= { -EAGAIN,	"nonblocking retry" },
-	[BLK_STS_OFFLINE]	= { -ENODEV,	"device offline" },
+	ENT(OK,			0,		""),
+	ENT(NOTSUPP,		-EOPNOTSUPP,	"operation not supported"),
+	ENT(TIMEOUT,		-ETIMEDOUT,	"timeout"),
+	ENT(NOSPC,		-ENOSPC,	"critical space allocation"),
+	ENT(TRANSPORT,		-ENOLINK,	"recoverable transport"),
+	ENT(TARGET,		-EREMOTEIO,	"critical target"),
+	ENT(RESV_CONFLICT,	-EBADE,		"reservation conflict"),
+	ENT(MEDIUM,		-ENODATA,	"critical medium"),
+	ENT(PROTECTION,		-EILSEQ,	"protection"),
+	ENT(RESOURCE,		-ENOMEM,	"kernel resource"),
+	ENT(DEV_RESOURCE,	-EBUSY,		"device resource"),
+	ENT(AGAIN,		-EAGAIN,	"nonblocking retry"),
+	ENT(OFFLINE,		-ENODEV,	"device offline"),
 
 	/* device mapper special case, should not leak out: */
-	[BLK_STS_DM_REQUEUE]	= { -EREMCHG, "dm internal retry" },
+	ENT(DM_REQUEUE,		-EREMCHG,	"dm internal retry"),
 
 	/* zone device specific errors */
-	[BLK_STS_ZONE_OPEN_RESOURCE]	= { -ETOOMANYREFS, "open zones exceeded" },
-	[BLK_STS_ZONE_ACTIVE_RESOURCE]	= { -EOVERFLOW, "active zones exceeded" },
+	ENT(ZONE_OPEN_RESOURCE, -ETOOMANYREFS,	"open zones exceeded"),
+	ENT(ZONE_ACTIVE_RESOURCE, -EOVERFLOW,	"active zones exceeded"),
 
 	/* Command duration limit device-side timeout */
-	[BLK_STS_DURATION_LIMIT]	= { -ETIME, "duration limit exceeded" },
-
-	[BLK_STS_INVAL]		= { -EINVAL,	"invalid" },
+	ENT(DURATION_LIMIT,	-ETIME,		"duration limit exceeded"),
+	ENT(INVAL,		-EINVAL,	"invalid"),
 
 	/* everything else not covered above: */
-	[BLK_STS_IOERR]		= { -EIO,	"I/O" },
+	ENT(IOERR,		-EIO,		"I/O"),
 };
+#undef ENT
 
 blk_status_t errno_to_blk_status(int errno)
 {
-- 
2.53.0


^ permalink raw reply related

* [PATCH 2/4] block: add a "tag" for block status codes
From: Christoph Hellwig @ 2026-06-10  5:08 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Jonathan Corbet, Damien Le Moal, Hannes Reinecke, Keith Busch,
	linux-block, linux-doc, Hannes Reinecke
In-Reply-To: <20260610051015.1906799-1-hch@lst.de>

The full name of the status codes is not good for user interfaces as it
can contain white spaces.  Add the name of the status code without the
BLK_STS_ prefix as a tag so that it can be used for user interfaces.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@kernel.org>
---
 block/blk-core.c | 28 ++++++++++++++++++++++++++++
 block/blk.h      |  2 ++
 2 files changed, 30 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index 43121a9f99f0..842b5c6f2fb4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -135,10 +135,12 @@ EXPORT_SYMBOL_GPL(blk_op_str);
 #define ENT(_tag, _errno, _desc)	\
 [BLK_STS_##_tag] = {				\
 	.errno		= _errno,		\
+	.tag		= __stringify(_tag),	\
 	.name		= _desc,		\
 }
 static const struct {
 	int		errno;
+	const char	*tag;
 	const char	*name;
 } blk_errors[] = {
 	ENT(OK,			0,		""),
@@ -203,6 +205,32 @@ const char *blk_status_to_str(blk_status_t status)
 	return blk_errors[idx].name;
 }
 
+const char *blk_status_to_tag(blk_status_t status)
+{
+	int idx = (__force int)status;
+
+	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors) || !blk_errors[idx].tag))
+		return "<null>";
+	return blk_errors[idx].tag;
+}
+
+blk_status_t tag_to_blk_status(const char *tag)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(blk_errors); i++) {
+		if (blk_errors[i].tag &&
+		    !strcmp(blk_errors[i].tag, tag))
+			return (__force blk_status_t)i;
+	}
+
+	/*
+	 * Return BLK_STS_OK for mismatches as this function is intended to
+	 * parse error status values.
+	 */
+	return BLK_STS_OK;
+}
+
 /**
  * blk_sync_queue - cancel any pending callbacks on a queue
  * @q: the queue
diff --git a/block/blk.h b/block/blk.h
index 7fdfb9012ce1..3ab2cdd6ed12 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -51,6 +51,8 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
 void blk_free_flush_queue(struct blk_flush_queue *q);
 
 const char *blk_status_to_str(blk_status_t status);
+const char *blk_status_to_tag(blk_status_t status);
+blk_status_t tag_to_blk_status(const char *tag);
 
 bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
 bool blk_queue_start_drain(struct request_queue *q);
-- 
2.53.0


^ permalink raw reply related

* [PATCH 3/4] block: add a str_to_blk_op helper
From: Christoph Hellwig @ 2026-06-10  5:08 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Jonathan Corbet, Damien Le Moal, Hannes Reinecke, Keith Busch,
	linux-block, linux-doc, Hannes Reinecke
In-Reply-To: <20260610051015.1906799-1-hch@lst.de>

Add a helper to find the REQ_OP_XYZ constant from the "XYZ" string.
This will be used for the error injection debugfs interface.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@kernel.org>
---
 block/blk-core.c | 10 ++++++++++
 block/blk.h      |  1 +
 2 files changed, 11 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index 842b5c6f2fb4..beaab7a71fba 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -132,6 +132,16 @@ inline const char *blk_op_str(enum req_op op)
 }
 EXPORT_SYMBOL_GPL(blk_op_str);
 
+enum req_op str_to_blk_op(const char *op)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(blk_op_name); i++)
+		if (blk_op_name[i] && !strcmp(blk_op_name[i], op))
+			return (enum req_op)i;
+	return REQ_OP_LAST;
+}
+
 #define ENT(_tag, _errno, _desc)	\
 [BLK_STS_##_tag] = {				\
 	.errno		= _errno,		\
diff --git a/block/blk.h b/block/blk.h
index 3ab2cdd6ed12..507ab34a6e90 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -53,6 +53,7 @@ void blk_free_flush_queue(struct blk_flush_queue *q);
 const char *blk_status_to_str(blk_status_t status);
 const char *blk_status_to_tag(blk_status_t status);
 blk_status_t tag_to_blk_status(const char *tag);
+enum req_op str_to_blk_op(const char *op);
 
 bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
 bool blk_queue_start_drain(struct request_queue *q);
-- 
2.53.0


^ permalink raw reply related

* [PATCH 4/4] block: add configurable error injection
From: Christoph Hellwig @ 2026-06-10  5:08 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Jonathan Corbet, Damien Le Moal, Hannes Reinecke, Keith Busch,
	linux-block, linux-doc, Hannes Reinecke
In-Reply-To: <20260610051015.1906799-1-hch@lst.de>

Add a new block error injection interface that allows injecting specific
status codes for specific ranges.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@kernel.org>
---
 Documentation/block/error-injection.rst |  59 +++++
 Documentation/block/index.rst           |   1 +
 block/Kconfig                           |   8 +
 block/Makefile                          |   1 +
 block/blk-core.c                        |   4 +
 block/blk-sysfs.c                       |   5 +
 block/error-injection.c                 | 314 ++++++++++++++++++++++++
 block/error-injection.h                 |  21 ++
 block/genhd.c                           |   4 +
 include/linux/blkdev.h                  |   6 +
 10 files changed, 423 insertions(+)
 create mode 100644 Documentation/block/error-injection.rst
 create mode 100644 block/error-injection.c
 create mode 100644 block/error-injection.h

diff --git a/Documentation/block/error-injection.rst b/Documentation/block/error-injection.rst
new file mode 100644
index 000000000000..81f31af82e65
--- /dev/null
+++ b/Documentation/block/error-injection.rst
@@ -0,0 +1,59 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============================
+Configurable Error Injection
+============================
+
+Overview
+--------
+
+Configurable error injection allows injecting specific block layer status codes
+for sector ranges of a block device.  Errors can be injected unconditionally, or
+with a given probability.
+
+To use configurable error injection, CONFIG_BLK_ERROR_INJECTION must be enabled.
+
+The only interface is the error_injection debugfs file, which is created for
+each registered gendisk.  Writes to this file are used to create or delete rules
+and reads return a list of the current error injection sites.
+
+Options
+-------
+
+The following options specify the operations:
+
+===================	=======================================================
+add			add a new rule
+removeall		remove all existing rules
+===================	=======================================================
+
+The following options specify the details of the rule for the add operation:
+
+===================	=======================================================
+op=<string>		block layer operation this rule applies to.  This uses
+			the XYZ for each REQ_OP_XYZ operation, e.g. READ, WRITE
+			or DISCARD. Mandatory.
+status=<string>		Status to return.  This uses XYZ for each BLK_STS_XYZ
+			code, e.g. IOERR or MEDIUM. Mandatory.
+start=<number>		First block layer sector the rule applies to.
+			Optional, defaults to 0.
+nr_sectors=<number>	Number of sectors this rule applies to.
+			Optional, defaults to the remainder of the device.
+chance=<number>		Only return a failure with a likelihood of 1/chance.
+			Optional, defaults to 1 (always).
+===================	=======================================================
+
+Example
+-------
+
+Return BLK_STS_IOERR for one in 10 reads of sector 0 of /dev/nvme0n1:
+
+	$ echo 'add,op=READ,start=0,status=IOERR,chance=10' > /sys/kernel/debug/block/nvme0n1/error_injection
+
+Return BLK_STS_MEDIUM for every write to /dev/nvme0n1:
+
+	$ echo 'add,op=WRITE,start=0,status=MEDIUM' > /sys/kernel/debug/block/nvme0n1/error_injection
+
+Remove all rules for /dev/nvme0n1:
+
+	$ echo 'removeall' > /sys/kernel/debug/block/nvme0n1/error_injection
diff --git a/Documentation/block/index.rst b/Documentation/block/index.rst
index 9fea696f9daa..bfa1bbd31ddf 100644
--- a/Documentation/block/index.rst
+++ b/Documentation/block/index.rst
@@ -22,3 +22,4 @@ Block
    switching-sched
    writeback_cache_control
    ublk
+   error-injection
diff --git a/block/Kconfig b/block/Kconfig
index 15027963472d..70e4a66d941f 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -221,6 +221,14 @@ config BLOCK_HOLDER_DEPRECATED
 config BLK_MQ_STACKING
 	bool
 
+config BLK_ERROR_INJECTION
+	bool "Enable block layer error injection"
+	select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL
+	help
+	  Enable inserting arbitrary block errors through a debugfs interface.
+
+	  See Documentation/block/error-injection.rst for details.
+
 source "block/Kconfig.iosched"
 
 endif # BLOCK
diff --git a/block/Makefile b/block/Makefile
index 54130faacc21..e7bd320e3d69 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -13,6 +13,7 @@ obj-y		:= bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
 			genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
 			disk-events.o blk-ia-ranges.o early-lookup.o
 
+obj-$(CONFIG_BLK_ERROR_INJECTION) += error-injection.o
 obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o
 obj-$(CONFIG_BLK_DEV_BSGLIB)	+= bsg-lib.o
 obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o
diff --git a/block/blk-core.c b/block/blk-core.c
index beaab7a71fba..73a41df98c9a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -50,6 +50,7 @@
 #include "blk-cgroup.h"
 #include "blk-throttle.h"
 #include "blk-ioprio.h"
+#include "error-injection.h"
 
 struct dentry *blk_debugfs_root;
 
@@ -767,6 +768,9 @@ static void __submit_bio_noacct_mq(struct bio *bio)
 
 void submit_bio_noacct_nocheck(struct bio *bio, bool split)
 {
+	if (unlikely(blk_error_inject(bio)))
+		return;
+
 	blk_cgroup_bio_start(bio);
 
 	if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index f22c1f253eb3..520972676ab4 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -19,6 +19,7 @@
 #include "blk-wbt.h"
 #include "blk-cgroup.h"
 #include "blk-throttle.h"
+#include "error-injection.h"
 
 struct queue_sysfs_entry {
 	struct attribute attr;
@@ -933,6 +934,8 @@ static void blk_debugfs_remove(struct gendisk *disk)
 
 	blk_debugfs_lock_nomemsave(q);
 	blk_trace_shutdown(q);
+	if (IS_ENABLED(CONFIG_BLK_ERROR_INJECTION))
+		blk_error_injection_exit(disk);
 	debugfs_remove_recursive(q->debugfs_dir);
 	q->debugfs_dir = NULL;
 	q->sched_debugfs_dir = NULL;
@@ -963,6 +966,8 @@ int blk_register_queue(struct gendisk *disk)
 
 	memflags = blk_debugfs_lock(q);
 	q->debugfs_dir = debugfs_create_dir(disk->disk_name, blk_debugfs_root);
+	if (IS_ENABLED(CONFIG_BLK_ERROR_INJECTION))
+		blk_error_injection_init(disk);
 	if (queue_is_mq(q))
 		blk_mq_debugfs_register(q);
 	blk_debugfs_unlock(q, memflags);
diff --git a/block/error-injection.c b/block/error-injection.c
new file mode 100644
index 000000000000..7f7f0d3327bc
--- /dev/null
+++ b/block/error-injection.c
@@ -0,0 +1,314 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2026 Christoph Hellwig.
+ */
+#include <linux/debugfs.h>
+#include <linux/blkdev.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+#include "blk.h"
+#include "error-injection.h"
+
+struct blk_error_inject {
+	struct list_head		entry;
+	sector_t			start;
+	sector_t			end;
+	enum req_op			op;
+	blk_status_t			status;
+
+	/* only inject every 1 / chance times */
+	unsigned int			chance;
+};
+
+DEFINE_STATIC_KEY_FALSE(blk_error_injection_enabled);
+
+bool __blk_error_inject(struct bio *bio)
+{
+	struct gendisk *disk = bio->bi_bdev->bd_disk;
+	struct blk_error_inject *inj;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(inj, &disk->error_injection_list, entry) {
+		if (bio->bi_iter.bi_sector <= inj->end &&
+		    bio_end_sector(bio) > inj->start &&
+		    bio_op(bio) == inj->op) {
+			blk_status_t status = inj->status;
+
+			if (inj->chance > 1 &&
+			    (get_random_u32() % inj->chance) != 0)
+				continue;
+
+			pr_info_ratelimited("%pg: injecting %s error for %s at sector %llu:%u\n",
+					disk->part0,
+					blk_status_to_str(status),
+					blk_op_str(inj->op),
+					bio->bi_iter.bi_sector,
+					bio_sectors(bio));
+			rcu_read_unlock();
+			bio_endio_status(bio, status);
+			return true;
+		}
+	}
+	rcu_read_unlock();
+	return false;
+}
+
+static int error_inject_add(struct gendisk *disk, enum req_op op,
+		sector_t start, u64 nr_sectors, blk_status_t status,
+		unsigned int chance)
+{
+	struct blk_error_inject *inj;
+	int error = -EINVAL;
+
+	if (op == REQ_OP_LAST)
+		return -EINVAL;
+	if (status == BLK_STS_OK)
+		return -EINVAL;
+
+	inj = kzalloc_obj(*inj);
+	if (!inj)
+		return -ENOMEM;
+
+	if (nr_sectors) {
+		if (U64_MAX - nr_sectors < start)
+			goto out_free_inj;
+		inj->end = start + nr_sectors - 1;
+	} else {
+		inj->end = U64_MAX;
+	}
+
+	inj->op = op;
+	inj->start = start;
+	inj->status = status;
+	inj->chance = chance;
+
+	pr_debug_ratelimited("%pg: adding %s injection for %s at sector %llu:%llu\n",
+			disk->part0, blk_status_to_str(status),
+			blk_op_str(op),
+			start, nr_sectors);
+
+	/*
+	 * Add to the front of the list so that newer entries can partially
+	 * override other entries.  This also intentionally allows duplicate
+	 * entries as there is no real reason to reject them.
+	 */
+	mutex_lock(&disk->error_injection_lock);
+	if (!disk_live(disk)) {
+		mutex_unlock(&disk->error_injection_lock);
+		error = -ENODEV;
+		goto out_free_inj;
+	}
+	if (list_empty(&disk->error_injection_list))
+		static_branch_inc(&blk_error_injection_enabled);
+	list_add_rcu(&inj->entry, &disk->error_injection_list);
+	set_bit(GD_ERROR_INJECT, &disk->state);
+	mutex_unlock(&disk->error_injection_lock);
+	return 0;
+
+out_free_inj:
+	kfree(inj);
+	return error;
+}
+
+/* Drop every rule; the static key ref is put when the list becomes empty. */
+static void error_inject_removeall(struct gendisk *disk)
+{
+	struct blk_error_inject *inj;
+
+	mutex_lock(&disk->error_injection_lock);
+	clear_bit(GD_ERROR_INJECT, &disk->state);
+	while ((inj = list_first_entry_or_null(&disk->error_injection_list,
+			struct blk_error_inject, entry))) {
+		list_del_rcu(&inj->entry);
+		if (list_empty(&disk->error_injection_list))
+			static_branch_dec(&blk_error_injection_enabled);
+		mutex_unlock(&disk->error_injection_lock);
+		kfree_rcu_mightsleep(inj);
+		mutex_lock(&disk->error_injection_lock);
+	}
+	mutex_unlock(&disk->error_injection_lock);
+}
+
+enum options {
+	Opt_add			= (1u << 0),
+	Opt_removeall		= (1u << 1),
+
+	Opt_op			= (1u << 16),
+	Opt_start		= (1u << 17),
+	Opt_nr_sectors		= (1u << 18),
+	Opt_status		= (1u << 19),
+	Opt_chance		= (1u << 20),
+
+	Opt_invalid,
+};
+
+static const match_table_t opt_tokens = {
+	{ Opt_add,			"add",			},
+	{ Opt_removeall,		"removeall",		},
+	{ Opt_op,			"op=%s",		},
+	{ Opt_start,			"start=%u"		},
+	{ Opt_nr_sectors,		"nr_sectors=%u"		},
+	{ Opt_status,			"status=%s"		},
+	{ Opt_chance,			"chance=%u"		},
+	{ Opt_invalid,			NULL,			},
+};
+
+static int match_op(substring_t *args, enum req_op *op)
+{
+	const char *tag;
+
+	tag = match_strdup(args);
+	if (!tag)
+		return -ENOMEM;
+	*op = str_to_blk_op(tag);
+	if (*op == REQ_OP_LAST)
+		pr_warn("invalid op '%s'\n", tag);
+	kfree(tag);
+	return 0;
+}
+
+static int match_status(substring_t *args, blk_status_t *status)
+{
+	const char *tag;
+
+	tag = match_strdup(args);
+	if (!tag)
+		return -ENOMEM;
+	*status = tag_to_blk_status(tag);
+	if (!*status)
+		pr_warn("invalid status '%s'\n", tag);
+	kfree(tag);
+	return 0;
+}
+
+static ssize_t blk_error_injection_parse_options(struct gendisk *disk,
+		char *options)
+{
+	enum { Unset, Add, Removeall } action = Unset;
+	unsigned int option_mask = 0, chance = 1;
+	enum req_op op = REQ_OP_LAST;
+	u64 start = 0, nr_sectors = 0;
+	blk_status_t status = BLK_STS_OK;
+	substring_t args[MAX_OPT_ARGS];
+	char *p;
+
+	while ((p = strsep(&options, ",\n")) != NULL) {
+		int error = 0;
+		ssize_t token;
+
+		if (!*p)
+			continue;
+		token = match_token(p, opt_tokens, args);
+		option_mask |= token;
+		switch (token) {
+		case Opt_add:
+			if (action != Unset)
+				return -EINVAL;
+			action = Add;
+			break;
+		case Opt_removeall:
+			if (action != Unset)
+				return -EINVAL;
+			action = Removeall;
+			break;
+		case Opt_op:
+			error = match_op(args, &op);
+			break;
+		case Opt_start:
+			error = match_u64(args, &start);
+			break;
+		case Opt_nr_sectors:
+			error = match_u64(args, &nr_sectors);
+			break;
+		case Opt_status:
+			error = match_status(args, &status);
+			break;
+		case Opt_chance:
+			error = match_uint(args, &chance);
+			if (!error && chance == 0)
+				error = -EINVAL;
+			break;
+		default:
+			pr_warn("unknown parameter or missing value '%s'\n", p);
+			error = -EINVAL;
+		}
+		if (error)
+			return error;
+	}
+
+	switch (action) {
+	case Add:
+		return error_inject_add(disk, op, start, nr_sectors, status,
+				chance);
+	case Removeall:
+		if (option_mask & ~Opt_removeall)
+			return -EINVAL;
+		error_inject_removeall(disk);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static ssize_t blk_error_injection_write(struct file *file,
+		const char __user *ubuf, size_t count, loff_t *pos)
+{
+	struct gendisk *disk = file_inode(file)->i_private;
+	char *options;
+	int error;
+
+	options = memdup_user_nul(ubuf, count);
+	if (IS_ERR(options))
+		return PTR_ERR(options);
+	error = blk_error_injection_parse_options(disk, options);
+	kfree(options);
+
+	if (error)
+		return error;
+	return count;
+}
+
+static int blk_error_injection_show(struct seq_file *s, void *private)
+{
+	struct gendisk *disk = s->private;
+	struct blk_error_inject *inj;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(inj, &disk->error_injection_list, entry) {
+		seq_printf(s, "%llu:%llu status=%s,chance=%u",
+			inj->start, inj->end,
+			blk_status_to_tag(inj->status), inj->chance);
+		seq_putc(s, '\n');
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
+static int blk_error_injection_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, blk_error_injection_show, inode->i_private);
+}
+
+static int blk_error_injection_release(struct inode *inode, struct file *file)
+{
+	return single_release(inode, file);
+}
+
+static const struct file_operations blk_error_injection_fops = {
+	.owner		= THIS_MODULE,
+	.write		= blk_error_injection_write,
+	.read		= seq_read,
+	.open		= blk_error_injection_open,
+	.release	= blk_error_injection_release,
+};
+
+void blk_error_injection_init(struct gendisk *disk)
+{
+	debugfs_create_file("error_injection", 0600, disk->queue->debugfs_dir,
+			disk, &blk_error_injection_fops);
+}
+
+void blk_error_injection_exit(struct gendisk *disk)
+{
+	error_inject_removeall(disk);
+}
diff --git a/block/error-injection.h b/block/error-injection.h
new file mode 100644
index 000000000000..9821d773abab
--- /dev/null
+++ b/block/error-injection.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BLK_ERROR_INJECTION_H
+#define _BLK_ERROR_INJECTION_H 1
+
+#include <linux/jump_label.h>
+
+DECLARE_STATIC_KEY_FALSE(blk_error_injection_enabled);
+
+void blk_error_injection_init(struct gendisk *disk);
+void blk_error_injection_exit(struct gendisk *disk);
+bool __blk_error_inject(struct bio *bio);
+static inline bool blk_error_inject(struct bio *bio)
+{
+	if (IS_ENABLED(CONFIG_BLK_ERROR_INJECTION) &&
+	    static_branch_unlikely(&blk_error_injection_enabled) &&
+	    test_bit(GD_ERROR_INJECT, &bio->bi_bdev->bd_disk->state))
+		return __blk_error_inject(bio);
+	return false;
+}
+
+#endif /* _BLK_ERROR_INJECTION_H */
diff --git a/block/genhd.c b/block/genhd.c
index 7d6854fd28e9..f84b6a355b57 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1485,6 +1485,10 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
 	lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0);
 #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
 	INIT_LIST_HEAD(&disk->slave_bdevs);
+#endif
+#ifdef CONFIG_BLK_ERROR_INJECTION
+	mutex_init(&disk->error_injection_lock);
+	INIT_LIST_HEAD(&disk->error_injection_list);
 #endif
 	mutex_init(&disk->rqos_state_mutex);
 	kobject_init(&disk->queue_kobj, &blk_queue_ktype);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 57e84d59a642..5070851cf924 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -176,6 +176,7 @@ struct gendisk {
 #define GD_SUPPRESS_PART_SCAN		5
 #define GD_OWNS_QUEUE			6
 #define GD_ZONE_APPEND_USED		7
+#define GD_ERROR_INJECT			8
 
 	struct mutex open_mutex;	/* open/close mutex */
 	unsigned open_partitions;	/* number of open partitions */
@@ -227,6 +228,11 @@ struct gendisk {
 	 */
 	struct blk_independent_access_ranges *ia_ranges;
 
+#ifdef CONFIG_BLK_ERROR_INJECTION
+	struct mutex		error_injection_lock;
+	struct list_head	error_injection_list;
+#endif
+
 	struct mutex rqos_state_mutex;	/* rqos state change mutex */
 };
 
-- 
2.53.0


^ permalink raw reply related

* [PATCH v5] Docs/{admin-guide,mm}/damon: fix DAMON documentation details
From: Doehyun Baek @ 2026-06-10  5:39 UTC (permalink / raw)
  To: SeongJae Park, Andrew Morton
  Cc: Doehyun Baek, David Hildenbrand, Lorenzo Stoakes, Liam R. Howlett,
	Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, Michal Hocko,
	Jonathan Corbet, Shuah Khan, damon, linux-mm, linux-doc,
	linux-kernel

Fix minor DAMON documentation issues.  Correct the sysfs scheme file name
apply_interval_us, the DAMON_STAT module count, a malformed reference, a
misplaced label indentation, and a few typos.

Signed-off-by: Doehyun Baek <doehyunbaek@gmail.com>
---
Changes from v4:
- Rebased on mm-new.
- Sent the English documentation fixes as a standalone patch.
- Dropped the Chinese translation patch from this submission.

 Documentation/admin-guide/mm/damon/usage.rst |  8 ++++----
 Documentation/mm/damon/design.rst            | 12 ++++++------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst
index 011296f1e7c2..b2649ea011f9 100644
--- a/Documentation/admin-guide/mm/damon/usage.rst
+++ b/Documentation/admin-guide/mm/damon/usage.rst
@@ -246,7 +246,7 @@ writing to and reading from the files.
 Under ``nr_regions`` directory, two files for the lower-bound and upper-bound
 of DAMON's monitoring regions (``min`` and ``max``, respectively), which
 controls the monitoring overhead, exist.  You can set and get the values by
-writing to and rading from the files.
+writing to and reading from the files.
 
 For more details about the intervals and monitoring regions range, please refer
 to the Design document (:doc:`/mm/damon/design`).
@@ -264,7 +264,7 @@ Please refer to  the :ref:`design document of the feature
 <damon_design_monitoring_intervals_autotuning>` for the internal of the tuning
 mechanism.  Reading and writing the four files under ``intervals_goal``
 directory shows and updates the tuning parameters that described in the
-:ref:design doc <damon_design_monitoring_intervals_autotuning>` with the same
+:ref:`design doc <damon_design_monitoring_intervals_autotuning>` with the same
 names.  The tuning starts with the user-set ``sample_us`` and ``aggr_us``.  The
 tuning-applied current values of the two intervals can be read from the
 ``sample_us`` and ``aggr_us`` files after writing ``update_tuned_intervals`` to
@@ -377,7 +377,7 @@ schemes/<N>/
 In each scheme directory, nine directories (``access_pattern``, ``quotas``,
 ``watermarks``, ``core_filters``, ``ops_filters``, ``filters``, ``dests``,
 ``stats``, and ``tried_regions``) and three files (``action``, ``target_nid``
-and ``apply_interval``) exist.
+and ``apply_interval_us``) exist.
 
 The ``action`` file is for setting and getting the scheme's :ref:`action
 <damon_design_damos_action>`.  The keywords that can be written to and read
@@ -743,7 +743,7 @@ counter).  Finally the tenth field (``X``) shows the ``age`` of the region
 (refer to :ref:`design <damon_design_age_tracking>` for more details of the
 counter).
 
-If the event was ``damon:damos_beofre_apply``, the ``perf script`` output would
+If the event was ``damon:damos_before_apply``, the ``perf script`` output would
 be somewhat like below::
 
     kdamond.0 47293 [000] 80801.060214: damon:damos_before_apply: ctx_idx=0 scheme_idx=0 target_idx=0 nr_regions=11 121932607488-135128711168: 0 136
diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst
index 2da7ca0d3d17..c16a3bb288d0 100644
--- a/Documentation/mm/damon/design.rst
+++ b/Documentation/mm/damon/design.rst
@@ -86,7 +86,7 @@ To know how user-space can do the configuration via :ref:`DAMON sysfs interface
 documentation.
 
 
- .. _damon_design_vaddr_target_regions_construction:
+.. _damon_design_vaddr_target_regions_construction:
 
 VMA-based Target Address Range Construction
 -------------------------------------------
@@ -930,11 +930,11 @@ control parameters for the usage would also need to be optimized for the
 purpose.
 
 To support such cases, yet more DAMON API user kernel modules that provide more
-simple and optimized user space interfaces are available.  Currently, two
-modules for proactive reclamation and LRU lists manipulation are provided.  For
-more detail, please read the usage documents for those
-(:doc:`/admin-guide/mm/damon/stat`, :doc:`/admin-guide/mm/damon/reclaim` and
-:doc:`/admin-guide/mm/damon/lru_sort`).
+simple and optimized user space interfaces are available.  Currently, three
+modules for access monitoring statistics, proactive reclamation, and LRU lists
+manipulation are provided.  For more detail, please read the usage documents for
+those (:doc:`/admin-guide/mm/damon/stat`, :doc:`/admin-guide/mm/damon/reclaim`
+and :doc:`/admin-guide/mm/damon/lru_sort`).
 
 .. _damon_design_special_purpose_modules_exclusivity:
 

base-commit: ce70d5abbf4f9930a07eddb06f40a0ea3494e33a
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH v3 2/3] Documentation: security-bugs: explain what is and is not a security bug
From: Greg KH @ 2026-06-10  6:10 UTC (permalink / raw)
  To: Askar Safin
  Cc: w, corbet, leon, linux-doc, linux-kernel, security, skhan,
	workflows
In-Reply-To: <CAPnZJGAKHu4rR8+W67KRQYVwRqi3x2Y+iWwhG7a2bY7oEawhfg@mail.gmail.com>

On Wed, Jun 10, 2026 at 04:03:43AM +0300, Askar Safin wrote:
> Thank you for answer!
> 
> On Tue, Jun 9, 2026 at 11:44 AM Greg KH <gregkh@linuxfoundation.org> wrote:
> > > - If unprivileged user prevents privileged user from suspending
> > > system, is this security bug?
> >
> > Physical access of suspending a machine feels like an odd threat model
> > to be worried about :)
> 
> I think you didn't understand me here. I meant the following situation:
> unprivileged user without physical access was somehow able
> to prevent privileged user with physical access from suspending
> or hibernating the system.

If you can find a bug like this, sure, we'll be glad to review the fix
for it.  As for it being a "security" issue, that will depend on the
specific case as "can not suspend" doesn't seem to fit the definition of
"vulnerability" to me.

thanks,

greg k-h

^ permalink raw reply

* [PATCH net-next v09 0/5] net: hinic3: PF initialization
From: Fan Gong @ 2026-06-10  6:59 UTC (permalink / raw)
  To: Fan Gong, Wu Di, Teng Peisen, netdev, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Andrew Lunn, Ioana Ciornei, Mohsin Bashir
  Cc: linux-kernel, linux-doc, luosifu, Xin Guo, Zhou Shuai, Wu Like,
	Shi Jing, Zheng Jiezhen, Maxime Chevallier

This is [3/3] part of hinic3 Ethernet driver second submission.
With this patch hinic3 becomes a complete Ethernet driver with
pf and vf.

Add 20 ethtool ops for information of queue, rss, coalesce and eth data.
Add MTU size validation
Config netdev watchdog timeout.
Remove unneeded coalesce parameters.

Changes:

PATCH 03 V01: https://lore.kernel.org/netdev/cover.1773387649.git.zhuyikai1@h-partners.com/
* Add rmon/pause/phy/mac/ctrl stats (Ioana Ciornei)

PATCH 03 V02: https://lore.kernel.org/netdev/cover.1774684571.git.zhuyikai1@h-partners.com/
* Modify "return -EINVAL" indentation problem (AI review)
* Use le16_to_cpu for rss_indir pair.out->buf (AI review)
* Use u32 instead of int in coalesce_limits to avoid overflow (AI review)
* Remove redundant u64_stats_update_begin/end when reading stats without
  concurrent reader (AI review)
* Modify nic_dev->stats.syncp logic (AI review)
* Complete rxq/txq stats fields in hinic3_rx/txq_get_stats (AI review)
* Remove statistics values in rtnl_link_stats64 from ethtool statistics
  values (AI review)
* Add channel_cfg_lock & channel_res_lock to protect resources access (AI review)
* Remove OutOfRangeLengthField, FrameToolong and InRangeLengthErrors (Ioana Ciornei)
* Remove redundant mtu commit (Maxime Chevallier)

PATCH 03 V03: https://lore.kernel.org/netdev/cover.1774940117.git.zhuyikai1@h-partners.com/
* Change unnedd to unneeded (AI review)
* Remove packets,bytes,errors and dropped in hinic3_rx/tx_queue_stats (AI review)
* Remove duplicated entries in hinic3_port_stats[] (AI review)
* change stats_info.head.status to ps->head.status (AI review)

PATCH 03 V04: https://lore.kernel.org/netdev/cover.1775618797.git.zhuyikai1@h-partners.com/
* Remove restore_drop_sge in hinic3_rx_queue_stats (AI review)
* Remove hinic3_nic_stats (AI review)
* Use old_q_param to store old config and use it in error handling (Mohsin Bashir)
* Add netdev_info to inform the user that depth is trimmed (Mohsin Bashir)
* Remove const in hinic3_get_qp_stats_strings parameters (Mohsin Bashir)
* Change EOPNOTSUPP to ERANGE in is_coalesce_exceed_limit (Mohsin Bashir)
* Update nic_dev->rss_type after hinic3_set_rss_type (Mohsin Bashir)
* Modify MGMT_STATUS_CMD_UNSUPPORTED to EOPNOTSUPP for complying with the
  error code specifications (Mohsin Bashir)

PATCH 03 V05: https://lore.kernel.org/netdev/cover.1775711066.git.zhuyikai1@h-partners.com/
* Clear HINIC3_CHANGE_RES_INVALID bit in error handling (AI review)
* Use low >= high to avoid low=high in is_coalesce_legal (AI review)
* As tx and rx share interrupts, we only use ETHTOOL_COALESCE_RX_USECS for
user setting to avoid user misunderstanding. So we do not add
ETHTOOL_COALESCE_TX_USECS. (Mohsin Bashir & AI review)

PATCH 03 V06: https://lore.kernel.org/netdev/cover.1779867397.git.zhuyikai1@h-partners.com/
* Remove redundant rx_jumbo_pending and rx_mini_pending judgement (Jakub Kicinski)
* Remove redundant max tx_pending judgement when .get_ringparam already got the
   max value (Jakub Kicinski)
* Use extack instead of netdev_err/netdev_info/netdev_warning (Jakub Kicinski)
* Remove HINIC3_CHANNEL_RES_VALID and only use HINIC3_CHANGE_RES_INVALID
  bit (Jakub Kicinski)
* Avoid dereferencing freed pointers in hinic3_change_channel_settings error
  handling (Jakub Kicinski)
* Modify hinic3_open_channel (Jakub Kicinski)

PATCH 03 V07: https://lore.kernel.org/netdev/cover.1779940072.git.zhuyikai1@h-partners.com/
* Remove the trailing '\n' in NL_SET_ERR_MSG* (Jakub Kicinski)

PATCH 03 V08: https://lore.kernel.org/netdev/cover.1780907605.git.wudi234@huawei.com/
* Update patch commit to be more accurate for change information (AI review)
* Modify <net/devlink.h> to <linux/netlink.h> (AI review)
* Use sq/rq_depth instead of rx/txqs[0].q_depth (AI review)
* hinic3_change_channel_settings() runs under change_res_mutex
  to protect dynamic channel updates (AI review)
* hinic3_close() serializes close and channel reconfiguration paths (AI review)
* Remove useless u64_stats_init in hinic3_get_drv_queue_stats() (AI review)
* hinic3_get_drv_queue_stats() fills rxq stats at correct idx
  when txq is null (AI review)
* Remove unnecessary semicolons after the closing bracket of
  static inline functions (AI review)
* Disable preemption in hinic3_rx_fill_buffers() (AI review)
* Add spinlock to protect per-queue coalesce parameters from
  concurrent access (AI review)
* Reject static coalesce config when adaptive RX coalesce is
  enabled (AI review)
* Validate interface state and queue ID before modifying coalesce
  parameters (AI review)
* Differentiate get_coalesce output for adaptive/static modes and
  zero-fill the struct (AI review)
* Return -ERANGE for invalid coalesce frame limits and simplify
  error message (AI review)
* Remove unused watchdog timeout definition and assignment (AI review)
* Return -EOPNOTSUPP when RSS type query is unsupported by firmware (AI review)
* Reject L4 hash bits for pure L3 flow types (AI review)
* Split RSS type conversion to correctly handle L3-only flows (AI review)
* Reprogram RSS indir table and HW parameters after channel count
  changes (AI review)
* Propagate exact error code from channel parameter validation (AI review)
* Update SW RSS indir copy only after HW programming succeeds (AI review)
* Program RSS hash type to HW before updating SW state to prevent
  inconsistency (AI review)

PATCH 03 V09:
* Remove useless messages in hinic3_set_ringparam() (Jakub Kicinski)
* Solve the big-endian and little-endian issue (Jakub Kicinski)

Fan Gong (5):
  hinic3: Add ethtool queue ops
  hinic3: Add ethtool statistic ops
  hinic3: Add ethtool coalesce ops
  hinic3: Add ethtool rss ops
  hinic3: Remove unneeded coalesce parameters

 .../ethernet/huawei/hinic3/hinic3_ethtool.c   | 834 +++++++++++++++++-
 .../ethernet/huawei/hinic3/hinic3_hw_intf.h   |  11 +
 .../net/ethernet/huawei/hinic3/hinic3_irq.c   |  14 +-
 .../net/ethernet/huawei/hinic3/hinic3_main.c  |   7 +
 .../huawei/hinic3/hinic3_mgmt_interface.h     |  39 +
 .../huawei/hinic3/hinic3_netdev_ops.c         | 104 ++-
 .../ethernet/huawei/hinic3/hinic3_nic_cfg.c   |  64 ++
 .../ethernet/huawei/hinic3/hinic3_nic_cfg.h   | 109 +++
 .../ethernet/huawei/hinic3/hinic3_nic_dev.h   |  11 +
 .../ethernet/huawei/hinic3/hinic3_nic_io.c    |   4 +-
 .../ethernet/huawei/hinic3/hinic3_nic_io.h    |   8 +-
 .../net/ethernet/huawei/hinic3/hinic3_rss.c   | 539 ++++++++++-
 .../net/ethernet/huawei/hinic3/hinic3_rss.h   |  19 +
 .../net/ethernet/huawei/hinic3/hinic3_rx.c    |  72 +-
 .../net/ethernet/huawei/hinic3/hinic3_rx.h    |  18 +-
 .../net/ethernet/huawei/hinic3/hinic3_tx.c    |  62 +-
 .../net/ethernet/huawei/hinic3/hinic3_tx.h    |   2 +
 17 files changed, 1883 insertions(+), 34 deletions(-)


base-commit: 903db046d5579bef0ea699eae4b279dd6455fc9f
-- 
2.43.0


^ permalink raw reply

* [PATCH net-next v09 1/5] hinic3: Add ethtool queue ops
From: Fan Gong @ 2026-06-10  6:59 UTC (permalink / raw)
  To: Fan Gong, Wu Di, Teng Peisen, netdev, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Andrew Lunn, Ioana Ciornei, Mohsin Bashir
  Cc: linux-kernel, linux-doc, luosifu, Xin Guo, Zhou Shuai, Wu Like,
	Shi Jing, Zheng Jiezhen, Maxime Chevallier
In-Reply-To: <cover.1781062575.git.wudi234@huawei.com>

  Implement the following ethtool callback functions:
.get_ringparam
.set_ringparam

  These callbacks allow users to utilize ethtool for detailed
queue depth configuration and monitoring.

Co-developed-by: Wu Di <wudi234@huawei.com>
Signed-off-by: Wu Di <wudi234@huawei.com>
Co-developed-by: Teng Peisen <tengpeisen@huawei.com>
Signed-off-by: Teng Peisen <tengpeisen@huawei.com>
Signed-off-by: Fan Gong <gongfan1@huawei.com>
---
 .../ethernet/huawei/hinic3/hinic3_ethtool.c   |  93 ++++++++++++++++
 .../net/ethernet/huawei/hinic3/hinic3_irq.c   |   5 +-
 .../net/ethernet/huawei/hinic3/hinic3_main.c  |   6 +
 .../huawei/hinic3/hinic3_netdev_ops.c         | 104 ++++++++++++++++--
 .../ethernet/huawei/hinic3/hinic3_nic_dev.h   |   9 ++
 .../ethernet/huawei/hinic3/hinic3_nic_io.c    |   4 +-
 .../ethernet/huawei/hinic3/hinic3_nic_io.h    |   8 +-
 .../net/ethernet/huawei/hinic3/hinic3_rx.c    |   2 +-
 8 files changed, 217 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
index 90fc16288de9..be9992a235f7 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
@@ -9,6 +9,7 @@
 #include <linux/errno.h>
 #include <linux/etherdevice.h>
 #include <linux/netdevice.h>
+#include <linux/netlink.h>
 #include <linux/ethtool.h>
 
 #include "hinic3_lld.h"
@@ -409,6 +410,96 @@ hinic3_get_link_ksettings(struct net_device *netdev,
 	return 0;
 }
 
+static void hinic3_get_ringparam(struct net_device *netdev,
+				 struct ethtool_ringparam *ring,
+				 struct kernel_ethtool_ringparam *kernel_ring,
+				 struct netlink_ext_ack *extack)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+
+	ring->rx_max_pending = HINIC3_MAX_RX_QUEUE_DEPTH;
+	ring->tx_max_pending = HINIC3_MAX_TX_QUEUE_DEPTH;
+	ring->rx_pending = nic_dev->q_params.rq_depth; /* current Rx depth */
+	ring->tx_pending = nic_dev->q_params.sq_depth; /* current Tx depth */
+}
+
+static void hinic3_update_qp_depth(struct net_device *netdev,
+				   u32 sq_depth, u32 rq_depth)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	u16 i;
+
+	nic_dev->q_params.sq_depth = sq_depth;
+	nic_dev->q_params.rq_depth = rq_depth;
+	for (i = 0; i < nic_dev->max_qps; i++) {
+		nic_dev->txqs[i].q_depth = sq_depth;
+		nic_dev->txqs[i].q_mask = sq_depth - 1;
+		nic_dev->rxqs[i].q_depth = rq_depth;
+		nic_dev->rxqs[i].q_mask = rq_depth - 1;
+	}
+}
+
+static int hinic3_check_ringparam_valid(struct net_device *netdev,
+					const struct ethtool_ringparam *ring,
+					struct netlink_ext_ack *extack)
+{
+	if (ring->tx_pending < HINIC3_MIN_QUEUE_DEPTH ||
+	    ring->rx_pending < HINIC3_MIN_QUEUE_DEPTH) {
+		NL_SET_ERR_MSG_FMT_MOD(extack,
+				       "Queue depth out of range tx[%d-%d] rx[%d-%d]",
+				       HINIC3_MIN_QUEUE_DEPTH,
+				       HINIC3_MAX_TX_QUEUE_DEPTH,
+				       HINIC3_MIN_QUEUE_DEPTH,
+				       HINIC3_MAX_RX_QUEUE_DEPTH);
+
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int hinic3_set_ringparam(struct net_device *netdev,
+				struct ethtool_ringparam *ring,
+				struct kernel_ethtool_ringparam *kernel_ring,
+				struct netlink_ext_ack *extack)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct hinic3_dyna_txrxq_params q_params = {};
+	u32 new_sq_depth, new_rq_depth;
+	int err;
+
+	err = hinic3_check_ringparam_valid(netdev, ring, extack);
+	if (err)
+		return err;
+
+	new_sq_depth = 1U << ilog2(ring->tx_pending);
+	new_rq_depth = 1U << ilog2(ring->rx_pending);
+	if (new_sq_depth == nic_dev->q_params.sq_depth &&
+	    new_rq_depth == nic_dev->q_params.rq_depth)
+		return 0;
+
+	if (new_sq_depth != ring->tx_pending ||
+	    new_rq_depth != ring->rx_pending)
+		NL_SET_ERR_MSG_FMT_MOD(extack,
+				       "Requested Tx/Rx ring depth %u/%u trimmed to %u/%u",
+				       ring->tx_pending, ring->rx_pending,
+				       new_sq_depth, new_rq_depth);
+
+	if (!netif_running(netdev)) {
+		hinic3_update_qp_depth(netdev, new_sq_depth, new_rq_depth);
+	} else {
+		q_params = nic_dev->q_params;
+		q_params.sq_depth = new_sq_depth;
+		q_params.rq_depth = new_rq_depth;
+
+		err = hinic3_change_channel_settings(netdev, &q_params);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 static const struct ethtool_ops hinic3_ethtool_ops = {
 	.supported_coalesce_params      = ETHTOOL_COALESCE_USECS |
 					  ETHTOOL_COALESCE_PKT_RATE_RX_USECS,
@@ -417,6 +508,8 @@ static const struct ethtool_ops hinic3_ethtool_ops = {
 	.get_msglevel                   = hinic3_get_msglevel,
 	.set_msglevel                   = hinic3_set_msglevel,
 	.get_link                       = ethtool_op_get_link,
+	.get_ringparam                  = hinic3_get_ringparam,
+	.set_ringparam                  = hinic3_set_ringparam,
 };
 
 void hinic3_set_ethtool_ops(struct net_device *netdev)
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c b/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
index e7d6c2033b45..bc4d879f9be4 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
@@ -137,7 +137,8 @@ static int hinic3_set_interrupt_moder(struct net_device *netdev, u16 q_id,
 	struct hinic3_interrupt_info info = {};
 	int err;
 
-	if (q_id >= nic_dev->q_params.num_qps)
+	if (q_id >= nic_dev->q_params.num_qps ||
+	    !mutex_trylock(&nic_dev->change_res_mutex))
 		return 0;
 
 	info.interrupt_coalesc_set = 1;
@@ -156,6 +157,8 @@ static int hinic3_set_interrupt_moder(struct net_device *netdev, u16 q_id,
 		nic_dev->rxqs[q_id].last_pending_limit = pending_limit;
 	}
 
+	mutex_unlock(&nic_dev->change_res_mutex);
+
 	return err;
 }
 
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_main.c b/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
index 0a888fe4c975..c87624a5e5dc 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
@@ -179,6 +179,7 @@ static int hinic3_sw_init(struct net_device *netdev)
 	int err;
 
 	mutex_init(&nic_dev->port_state_mutex);
+	mutex_init(&nic_dev->change_res_mutex);
 
 	nic_dev->q_params.sq_depth = HINIC3_SQ_DEPTH;
 	nic_dev->q_params.rq_depth = HINIC3_RQ_DEPTH;
@@ -315,6 +316,9 @@ static void hinic3_link_status_change(struct net_device *netdev,
 {
 	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
 
+	if (!mutex_trylock(&nic_dev->change_res_mutex))
+		return;
+
 	if (link_status_up) {
 		if (netif_carrier_ok(netdev))
 			return;
@@ -330,6 +334,8 @@ static void hinic3_link_status_change(struct net_device *netdev,
 		netif_carrier_off(netdev);
 		netdev_dbg(netdev, "Link is down\n");
 	}
+
+	mutex_unlock(&nic_dev->change_res_mutex);
 }
 
 static void hinic3_port_module_event_handler(struct net_device *netdev,
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c b/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c
index da73811641a9..047214cfc753 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c
@@ -288,7 +288,8 @@ static void hinic3_free_channel_resources(struct net_device *netdev,
 	hinic3_free_qps(nic_dev, qp_params);
 }
 
-static int hinic3_open_channel(struct net_device *netdev)
+static int hinic3_prepare_channel(struct net_device *netdev,
+				  struct hinic3_dyna_txrxq_params *qp_params)
 {
 	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
 	int err;
@@ -299,16 +300,28 @@ static int hinic3_open_channel(struct net_device *netdev)
 		return err;
 	}
 
-	err = hinic3_configure_txrxqs(netdev, &nic_dev->q_params);
+	err = hinic3_configure_txrxqs(netdev, qp_params);
 	if (err) {
 		netdev_err(netdev, "Failed to configure txrxqs\n");
 		goto err_free_qp_ctxts;
 	}
 
+	return 0;
+
+err_free_qp_ctxts:
+	hinic3_free_qp_ctxts(nic_dev);
+
+	return err;
+}
+
+static int hinic3_open_channel(struct net_device *netdev)
+{
+	int err;
+
 	err = hinic3_qps_irq_init(netdev);
 	if (err) {
 		netdev_err(netdev, "Failed to init txrxq irq\n");
-		goto err_free_qp_ctxts;
+		return err;
 	}
 
 	err = hinic3_configure(netdev);
@@ -321,8 +334,6 @@ static int hinic3_open_channel(struct net_device *netdev)
 
 err_uninit_qps_irq:
 	hinic3_qps_irq_uninit(netdev);
-err_free_qp_ctxts:
-	hinic3_free_qp_ctxts(nic_dev);
 
 	return err;
 }
@@ -428,6 +439,74 @@ static void hinic3_vport_down(struct net_device *netdev)
 	}
 }
 
+int
+hinic3_change_channel_settings(struct net_device *netdev,
+			       struct hinic3_dyna_txrxq_params *trxq_params)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct hinic3_dyna_txrxq_params cur_trxq_params = {};
+	struct hinic3_dyna_qp_params new_qp_params = {};
+	struct hinic3_dyna_qp_params cur_qp_params = {};
+	int err;
+
+	cur_trxq_params = nic_dev->q_params;
+
+	hinic3_config_num_qps(netdev, trxq_params);
+
+	err = hinic3_alloc_channel_resources(netdev, &new_qp_params,
+					     trxq_params);
+	if (err) {
+		netdev_err(netdev, "Failed to alloc channel resources\n");
+		return err;
+	}
+
+	mutex_lock(&nic_dev->change_res_mutex);
+	hinic3_vport_down(netdev);
+	hinic3_close_channel(netdev);
+	hinic3_get_cur_qps(nic_dev, &cur_qp_params);
+
+	hinic3_init_qps(nic_dev, &new_qp_params);
+
+	err = hinic3_prepare_channel(netdev, trxq_params);
+	if (err)
+		goto err_uninit_qps;
+
+	if (nic_dev->num_qp_irq > trxq_params->num_qps)
+		hinic3_qp_irq_change(netdev, trxq_params->num_qps);
+
+	nic_dev->q_params = *trxq_params;
+
+	err = hinic3_open_channel(netdev);
+	if (err)
+		goto err_qp_irq_reset;
+
+	err = hinic3_vport_up(netdev);
+	if (err)
+		goto err_close_channel;
+
+	hinic3_free_channel_resources(netdev, &cur_qp_params, &cur_trxq_params);
+
+	mutex_unlock(&nic_dev->change_res_mutex);
+
+	return 0;
+
+err_close_channel:
+	hinic3_close_channel(netdev);
+err_qp_irq_reset:
+	nic_dev->q_params = cur_trxq_params;
+
+	if (trxq_params->num_qps > cur_trxq_params.num_qps)
+		hinic3_qp_irq_change(netdev, cur_trxq_params.num_qps);
+	hinic3_free_qp_ctxts(nic_dev);
+err_uninit_qps:
+	hinic3_get_cur_qps(nic_dev, &new_qp_params);
+	hinic3_free_channel_resources(netdev, &new_qp_params, trxq_params);
+	hinic3_free_channel_resources(netdev, &cur_qp_params, &cur_trxq_params);
+	mutex_unlock(&nic_dev->change_res_mutex);
+
+	return err;
+}
+
 static int hinic3_open(struct net_device *netdev)
 {
 	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
@@ -458,6 +537,10 @@ static int hinic3_open(struct net_device *netdev)
 
 	hinic3_init_qps(nic_dev, &qp_params);
 
+	err = hinic3_prepare_channel(netdev, &nic_dev->q_params);
+	if (err)
+		goto err_uninit_qps;
+
 	err = hinic3_open_channel(netdev);
 	if (err)
 		goto err_uninit_qps;
@@ -473,7 +556,7 @@ static int hinic3_open(struct net_device *netdev)
 err_close_channel:
 	hinic3_close_channel(netdev);
 err_uninit_qps:
-	hinic3_uninit_qps(nic_dev, &qp_params);
+	hinic3_get_cur_qps(nic_dev, &qp_params);
 	hinic3_free_channel_resources(netdev, &qp_params, &nic_dev->q_params);
 err_destroy_num_qps:
 	hinic3_destroy_num_qps(netdev);
@@ -493,10 +576,15 @@ static int hinic3_close(struct net_device *netdev)
 		return 0;
 	}
 
+	mutex_lock(&nic_dev->change_res_mutex);
 	hinic3_vport_down(netdev);
 	hinic3_close_channel(netdev);
-	hinic3_uninit_qps(nic_dev, &qp_params);
-	hinic3_free_channel_resources(netdev, &qp_params, &nic_dev->q_params);
+	hinic3_get_cur_qps(nic_dev, &qp_params);
+	hinic3_free_channel_resources(netdev, &qp_params,
+				      &nic_dev->q_params);
+	hinic3_free_nicio_res(nic_dev);
+	hinic3_destroy_num_qps(netdev);
+	mutex_unlock(&nic_dev->change_res_mutex);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h
index 9502293ff710..005b2c01a988 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h
@@ -10,6 +10,9 @@
 #include "hinic3_hw_cfg.h"
 #include "hinic3_hwdev.h"
 #include "hinic3_mgmt_interface.h"
+#include "hinic3_nic_io.h"
+#include "hinic3_tx.h"
+#include "hinic3_rx.h"
 
 #define HINIC3_VLAN_BITMAP_BYTE_SIZE(nic_dev)  (sizeof(*(nic_dev)->vlan_bitmap))
 #define HINIC3_VLAN_BITMAP_SIZE(nic_dev)  \
@@ -129,6 +132,8 @@ struct hinic3_nic_dev {
 	struct work_struct              rx_mode_work;
 	/* lock for enable/disable port */
 	struct mutex                    port_state_mutex;
+	/* mutex to serialize channel/resource changes */
+	struct mutex                    change_res_mutex;
 
 	struct list_head                uc_filter_list;
 	struct list_head                mc_filter_list;
@@ -143,6 +148,10 @@ struct hinic3_nic_dev {
 
 void hinic3_set_netdev_ops(struct net_device *netdev);
 int hinic3_set_hw_features(struct net_device *netdev);
+int
+hinic3_change_channel_settings(struct net_device *netdev,
+			       struct hinic3_dyna_txrxq_params *trxq_params);
+
 int hinic3_qps_irq_init(struct net_device *netdev);
 void hinic3_qps_irq_uninit(struct net_device *netdev);
 
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.c b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.c
index 87e736adba02..0e7a0ccfba98 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.c
@@ -484,8 +484,8 @@ void hinic3_init_qps(struct hinic3_nic_dev *nic_dev,
 	}
 }
 
-void hinic3_uninit_qps(struct hinic3_nic_dev *nic_dev,
-		       struct hinic3_dyna_qp_params *qp_params)
+void hinic3_get_cur_qps(struct hinic3_nic_dev *nic_dev,
+			struct hinic3_dyna_qp_params *qp_params)
 {
 	struct hinic3_nic_io *nic_io = nic_dev->nic_io;
 
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.h b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.h
index 12eefabcf1db..571b34d63950 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.h
@@ -14,6 +14,10 @@ struct hinic3_nic_dev;
 #define HINIC3_RQ_WQEBB_SHIFT      3
 #define HINIC3_SQ_WQEBB_SIZE       BIT(HINIC3_SQ_WQEBB_SHIFT)
 
+#define HINIC3_MAX_TX_QUEUE_DEPTH  65536
+#define HINIC3_MAX_RX_QUEUE_DEPTH  16384
+#define HINIC3_MIN_QUEUE_DEPTH     128
+
 /* ******************** RQ_CTRL ******************** */
 enum hinic3_rq_wqe_type {
 	HINIC3_NORMAL_RQ_WQE = 1,
@@ -136,8 +140,8 @@ void hinic3_free_qps(struct hinic3_nic_dev *nic_dev,
 		     struct hinic3_dyna_qp_params *qp_params);
 void hinic3_init_qps(struct hinic3_nic_dev *nic_dev,
 		     struct hinic3_dyna_qp_params *qp_params);
-void hinic3_uninit_qps(struct hinic3_nic_dev *nic_dev,
-		       struct hinic3_dyna_qp_params *qp_params);
+void hinic3_get_cur_qps(struct hinic3_nic_dev *nic_dev,
+			struct hinic3_dyna_qp_params *qp_params);
 
 int hinic3_init_qp_ctxts(struct hinic3_nic_dev *nic_dev);
 void hinic3_free_qp_ctxts(struct hinic3_nic_dev *nic_dev);
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c
index 309ab5901379..b5b601469517 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c
@@ -541,7 +541,7 @@ int hinic3_configure_rxqs(struct net_device *netdev, u16 num_rq,
 		rq_associate_cqes(rxq);
 
 		pkts = hinic3_rx_fill_buffers(rxq);
-		if (!pkts) {
+		if (pkts < rxq->q_depth - 1) {
 			netdev_err(netdev, "Failed to fill Rx buffer\n");
 			return -ENOMEM;
 		}
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v09 2/5] hinic3: Add ethtool statistic ops
From: Fan Gong @ 2026-06-10  6:59 UTC (permalink / raw)
  To: Fan Gong, Wu Di, Teng Peisen, netdev, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Andrew Lunn, Ioana Ciornei, Mohsin Bashir
  Cc: linux-kernel, linux-doc, luosifu, Xin Guo, Zhou Shuai, Wu Like,
	Shi Jing, Zheng Jiezhen, Maxime Chevallier
In-Reply-To: <cover.1781062575.git.wudi234@huawei.com>

  Add PF/VF statistics functions in TX and RX processing.
  Implement the following ethtool callback functions:
.get_sset_count
.get_ethtool_stats
.get_strings
.get_eth_phy_stats
.get_eth_mac_stats
.get_eth_ctrl_stats
.get_rmon_stats
.get_pause_stats

  These callbacks allow users to utilize ethtool for detailed
TX and RX netdev stats monitoring.

  Add mgmt_msg_params_init_in_out() to support management
commands that require separate input and output buffers. This is
needed for retrieving the expanded PF/VF MAC statistics from HW,
which no longer fit in the input buffer used by the existing path.

  Remove unused stats "restore_drop_sge" in struct hinic3_rxq_stats.

Co-developed-by: Wu Di <wudi234@huawei.com>
Signed-off-by: Wu Di <wudi234@huawei.com>
Co-developed-by: Teng Peisen <tengpeisen@huawei.com>
Signed-off-by: Teng Peisen <tengpeisen@huawei.com>
Signed-off-by: Fan Gong <gongfan1@huawei.com>
---
 .../ethernet/huawei/hinic3/hinic3_ethtool.c   | 483 ++++++++++++++++++
 .../ethernet/huawei/hinic3/hinic3_hw_intf.h   |  11 +
 .../huawei/hinic3/hinic3_mgmt_interface.h     |  37 ++
 .../ethernet/huawei/hinic3/hinic3_nic_cfg.c   |  64 +++
 .../ethernet/huawei/hinic3/hinic3_nic_cfg.h   | 109 ++++
 .../net/ethernet/huawei/hinic3/hinic3_rx.c    |  70 ++-
 .../net/ethernet/huawei/hinic3/hinic3_rx.h    |  15 +-
 .../net/ethernet/huawei/hinic3/hinic3_tx.c    |  62 ++-
 .../net/ethernet/huawei/hinic3/hinic3_tx.h    |   2 +
 9 files changed, 845 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
index be9992a235f7..66ca4303bb3c 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
@@ -500,6 +500,481 @@ static int hinic3_set_ringparam(struct net_device *netdev,
 	return 0;
 }
 
+struct hinic3_stats {
+	char name[ETH_GSTRING_LEN];
+	u32  size;
+	int  offset;
+};
+
+#define HINIC3_RXQ_STAT(_stat_item) { \
+	.name   = "rxq%d_"#_stat_item, \
+	.size   = sizeof_field(struct hinic3_rxq_stats, _stat_item), \
+	.offset = offsetof(struct hinic3_rxq_stats, _stat_item) \
+}
+
+#define HINIC3_TXQ_STAT(_stat_item) { \
+	.name   = "txq%d_"#_stat_item, \
+	.size   = sizeof_field(struct hinic3_txq_stats, _stat_item), \
+	.offset = offsetof(struct hinic3_txq_stats, _stat_item) \
+}
+
+static struct hinic3_stats hinic3_rx_queue_stats[] = {
+	HINIC3_RXQ_STAT(csum_errors),
+	HINIC3_RXQ_STAT(other_errors),
+	HINIC3_RXQ_STAT(rx_buf_empty),
+	HINIC3_RXQ_STAT(alloc_skb_err),
+	HINIC3_RXQ_STAT(alloc_rx_buf_err),
+};
+
+static struct hinic3_stats hinic3_tx_queue_stats[] = {
+	HINIC3_TXQ_STAT(busy),
+	HINIC3_TXQ_STAT(skb_pad_err),
+	HINIC3_TXQ_STAT(frag_len_overflow),
+	HINIC3_TXQ_STAT(offload_cow_skb_err),
+	HINIC3_TXQ_STAT(map_frag_err),
+	HINIC3_TXQ_STAT(unknown_tunnel_pkt),
+	HINIC3_TXQ_STAT(frag_size_err),
+};
+
+#define HINIC3_FUNC_STAT(_stat_item) {	\
+	.name   = #_stat_item, \
+	.size   = sizeof_field(struct l2nic_vport_stats, _stat_item), \
+	.offset = offsetof(struct l2nic_vport_stats, _stat_item) \
+}
+
+static struct hinic3_stats hinic3_function_stats[] = {
+	HINIC3_FUNC_STAT(tx_unicast_pkts_vport),
+	HINIC3_FUNC_STAT(tx_unicast_bytes_vport),
+	HINIC3_FUNC_STAT(tx_multicast_pkts_vport),
+	HINIC3_FUNC_STAT(tx_multicast_bytes_vport),
+	HINIC3_FUNC_STAT(tx_broadcast_pkts_vport),
+	HINIC3_FUNC_STAT(tx_broadcast_bytes_vport),
+
+	HINIC3_FUNC_STAT(rx_unicast_pkts_vport),
+	HINIC3_FUNC_STAT(rx_unicast_bytes_vport),
+	HINIC3_FUNC_STAT(rx_multicast_pkts_vport),
+	HINIC3_FUNC_STAT(rx_multicast_bytes_vport),
+	HINIC3_FUNC_STAT(rx_broadcast_pkts_vport),
+	HINIC3_FUNC_STAT(rx_broadcast_bytes_vport),
+
+	HINIC3_FUNC_STAT(tx_discard_vport),
+	HINIC3_FUNC_STAT(rx_discard_vport),
+	HINIC3_FUNC_STAT(tx_err_vport),
+	HINIC3_FUNC_STAT(rx_err_vport),
+};
+
+#define HINIC3_PORT_STAT(_stat_item) { \
+	.name   = #_stat_item, \
+	.size   = sizeof_field(struct mag_cmd_port_stats, _stat_item), \
+	.offset = offsetof(struct mag_cmd_port_stats, _stat_item) \
+}
+
+static struct hinic3_stats hinic3_port_stats[] = {
+	HINIC3_PORT_STAT(mac_tx_fragment_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_undersize_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_undermin_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_1519_max_bad_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_1519_max_good_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_oversize_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_jabber_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_bad_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_bad_oct_num),
+	HINIC3_PORT_STAT(mac_tx_good_oct_num),
+	HINIC3_PORT_STAT(mac_tx_total_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_uni_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_pfc_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_pfc_pri0_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_pfc_pri1_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_pfc_pri2_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_pfc_pri3_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_pfc_pri4_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_pfc_pri5_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_pfc_pri6_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_pfc_pri7_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_err_all_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_from_app_good_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_from_app_bad_pkt_num),
+
+	HINIC3_PORT_STAT(mac_rx_undermin_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_1519_max_bad_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_1519_max_good_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_bad_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_bad_oct_num),
+	HINIC3_PORT_STAT(mac_rx_good_oct_num),
+	HINIC3_PORT_STAT(mac_rx_total_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_uni_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_pfc_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_pfc_pri0_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_pfc_pri1_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_pfc_pri2_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_pfc_pri3_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_pfc_pri4_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_pfc_pri5_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_pfc_pri6_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_pfc_pri7_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_send_app_good_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_send_app_bad_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_unfilter_pkt_num),
+};
+
+static int hinic3_get_sset_count(struct net_device *netdev, int sset)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	int count, q_num;
+
+	switch (sset) {
+	case ETH_SS_STATS:
+		q_num = nic_dev->q_params.num_qps;
+		count = ARRAY_SIZE(hinic3_function_stats) +
+			(ARRAY_SIZE(hinic3_tx_queue_stats) +
+			 ARRAY_SIZE(hinic3_rx_queue_stats)) *
+			q_num;
+
+		if (!HINIC3_IS_VF(nic_dev->hwdev))
+			count += ARRAY_SIZE(hinic3_port_stats);
+
+		return count;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static u64 get_val_of_ptr(u32 size, const void *ptr)
+{
+	u64 ret = size == sizeof(u64) ? *(u64 *)ptr :
+		  size == sizeof(u32) ? *(u32 *)ptr :
+		  size == sizeof(u16) ? *(u16 *)ptr :
+		  *(u8 *)ptr;
+
+	return ret;
+}
+
+static void hinic3_get_drv_queue_stats(struct net_device *netdev, u64 *data)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct hinic3_txq_stats txq_stats = {};
+	struct hinic3_rxq_stats rxq_stats = {};
+	u16 i = 0, j, qid;
+	char *p;
+
+	for (qid = 0; qid < nic_dev->q_params.num_qps; qid++) {
+		if (!nic_dev->txqs)
+			break;
+
+		hinic3_txq_get_stats(&nic_dev->txqs[qid], &txq_stats);
+		for (j = 0; j < ARRAY_SIZE(hinic3_tx_queue_stats); j++, i++) {
+			p = (char *)&txq_stats +
+			    hinic3_tx_queue_stats[j].offset;
+			data[i] = get_val_of_ptr(hinic3_tx_queue_stats[j].size,
+						 p);
+		}
+	}
+
+	i = nic_dev->q_params.num_qps * ARRAY_SIZE(hinic3_tx_queue_stats);
+	for (qid = 0; qid < nic_dev->q_params.num_qps; qid++) {
+		if (!nic_dev->rxqs)
+			break;
+
+		hinic3_rxq_get_stats(&nic_dev->rxqs[qid], &rxq_stats);
+		for (j = 0; j < ARRAY_SIZE(hinic3_rx_queue_stats); j++, i++) {
+			p = (char *)&rxq_stats +
+			    hinic3_rx_queue_stats[j].offset;
+			data[i] = get_val_of_ptr(hinic3_rx_queue_stats[j].size,
+						 p);
+		}
+	}
+}
+
+static u16 hinic3_get_ethtool_port_stats(struct net_device *netdev, u64 *data)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct mag_cmd_port_stats *ps;
+	u16 i = 0, j;
+	char *p;
+	int err;
+
+	ps = kmalloc_obj(*ps);
+	if (!ps)
+		goto err_zero_stats;
+
+	err = hinic3_get_phy_port_stats(nic_dev->hwdev, ps);
+	if (err) {
+		kfree(ps);
+		netdev_err(netdev, "Failed to get port stats from fw\n");
+		goto err_zero_stats;
+	}
+
+	for (j = 0; j < ARRAY_SIZE(hinic3_port_stats); j++, i++) {
+		p = (char *)ps + hinic3_port_stats[j].offset;
+		data[i] = get_val_of_ptr(hinic3_port_stats[j].size, p);
+	}
+
+	kfree(ps);
+
+	return i;
+
+err_zero_stats:
+	memset(&data[i], 0, ARRAY_SIZE(hinic3_port_stats) * sizeof(*data));
+
+	return i + ARRAY_SIZE(hinic3_port_stats);
+}
+
+static void hinic3_get_ethtool_stats(struct net_device *netdev,
+				     struct ethtool_stats *stats, u64 *data)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct l2nic_vport_stats vport_stats = {};
+	u16 i = 0, j;
+	char *p;
+	int err;
+
+	err = hinic3_get_vport_stats(nic_dev->hwdev,
+				     hinic3_global_func_id(nic_dev->hwdev),
+				     &vport_stats);
+	if (err)
+		netdev_err(netdev, "Failed to get function stats from fw\n");
+
+	for (j = 0; j < ARRAY_SIZE(hinic3_function_stats); j++, i++) {
+		p = (char *)&vport_stats + hinic3_function_stats[j].offset;
+		data[i] = get_val_of_ptr(hinic3_function_stats[j].size, p);
+	}
+
+	if (!HINIC3_IS_VF(nic_dev->hwdev))
+		i += hinic3_get_ethtool_port_stats(netdev, data + i);
+
+	hinic3_get_drv_queue_stats(netdev, data + i);
+}
+
+static u16 hinic3_get_hw_stats_strings(struct net_device *netdev, char *p)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	u16 i, cnt = 0;
+
+	for (i = 0; i < ARRAY_SIZE(hinic3_function_stats); i++) {
+		memcpy(p, hinic3_function_stats[i].name, ETH_GSTRING_LEN);
+		p += ETH_GSTRING_LEN;
+		cnt++;
+	}
+
+	if (!HINIC3_IS_VF(nic_dev->hwdev)) {
+		for (i = 0; i < ARRAY_SIZE(hinic3_port_stats); i++) {
+			memcpy(p, hinic3_port_stats[i].name, ETH_GSTRING_LEN);
+			p += ETH_GSTRING_LEN;
+			cnt++;
+		}
+	}
+
+	return cnt;
+}
+
+static void hinic3_get_qp_stats_strings(struct net_device *netdev, char *p)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	u8 *data = p;
+	u16 i, j;
+
+	for (i = 0; i < nic_dev->q_params.num_qps; i++) {
+		for (j = 0; j < ARRAY_SIZE(hinic3_tx_queue_stats); j++)
+			ethtool_sprintf(&data,
+					hinic3_tx_queue_stats[j].name, i);
+	}
+
+	for (i = 0; i < nic_dev->q_params.num_qps; i++) {
+		for (j = 0; j < ARRAY_SIZE(hinic3_rx_queue_stats); j++)
+			ethtool_sprintf(&data,
+					hinic3_rx_queue_stats[j].name, i);
+	}
+}
+
+static void hinic3_get_strings(struct net_device *netdev,
+			       u32 stringset, u8 *data)
+{
+	char *p = (char *)data;
+	u16 offset;
+
+	switch (stringset) {
+	case ETH_SS_STATS:
+		offset = hinic3_get_hw_stats_strings(netdev, p);
+		hinic3_get_qp_stats_strings(netdev,
+					    p + offset * ETH_GSTRING_LEN);
+
+		return;
+	default:
+		netdev_err(netdev, "Invalid string set %u.\n", stringset);
+		return;
+	}
+}
+
+static void hinic3_get_eth_phy_stats(struct net_device *netdev,
+				     struct ethtool_eth_phy_stats *phy_stats)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct mag_cmd_port_stats *ps;
+	int err;
+
+	ps = kmalloc_obj(*ps);
+	if (!ps)
+		return;
+
+	err = hinic3_get_phy_port_stats(nic_dev->hwdev, ps);
+	if (err) {
+		kfree(ps);
+		netdev_err(netdev, "Failed to get eth phy stats from fw\n");
+		return;
+	}
+
+	phy_stats->SymbolErrorDuringCarrier = ps->mac_rx_sym_err_pkt_num;
+
+	kfree(ps);
+}
+
+static void hinic3_get_eth_mac_stats(struct net_device *netdev,
+				     struct ethtool_eth_mac_stats *mac_stats)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct mag_cmd_port_stats *ps;
+	int err;
+
+	ps = kmalloc_obj(*ps);
+	if (!ps)
+		return;
+
+	err = hinic3_get_phy_port_stats(nic_dev->hwdev, ps);
+	if (err) {
+		kfree(ps);
+		netdev_err(netdev, "Failed to get eth mac stats from fw\n");
+		return;
+	}
+
+	mac_stats->FramesTransmittedOK = ps->mac_tx_good_pkt_num;
+	mac_stats->FramesReceivedOK = ps->mac_rx_good_pkt_num;
+	mac_stats->FrameCheckSequenceErrors = ps->mac_rx_fcs_err_pkt_num;
+	mac_stats->OctetsTransmittedOK = ps->mac_tx_total_oct_num;
+	mac_stats->OctetsReceivedOK = ps->mac_rx_total_oct_num;
+	mac_stats->MulticastFramesXmittedOK = ps->mac_tx_multi_pkt_num;
+	mac_stats->BroadcastFramesXmittedOK = ps->mac_tx_broad_pkt_num;
+	mac_stats->MulticastFramesReceivedOK = ps->mac_rx_multi_pkt_num;
+	mac_stats->BroadcastFramesReceivedOK = ps->mac_rx_broad_pkt_num;
+
+	kfree(ps);
+}
+
+static void hinic3_get_eth_ctrl_stats(struct net_device *netdev,
+				      struct ethtool_eth_ctrl_stats *ctrl_stats)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct mag_cmd_port_stats *ps;
+	int err;
+
+	ps = kmalloc_obj(*ps);
+	if (!ps)
+		return;
+
+	err = hinic3_get_phy_port_stats(nic_dev->hwdev, ps);
+	if (err) {
+		kfree(ps);
+		netdev_err(netdev, "Failed to get eth ctrl stats from fw\n");
+		return;
+	}
+
+	ctrl_stats->MACControlFramesTransmitted = ps->mac_tx_control_pkt_num;
+	ctrl_stats->MACControlFramesReceived = ps->mac_rx_control_pkt_num;
+
+	kfree(ps);
+}
+
+static const struct ethtool_rmon_hist_range hinic3_rmon_ranges[] = {
+	{     0,    64 },
+	{    65,   127 },
+	{   128,   255 },
+	{   256,   511 },
+	{   512,  1023 },
+	{  1024,  1518 },
+	{  1519,  2047 },
+	{  2048,  4095 },
+	{  4096,  8191 },
+	{  8192,  9216 },
+	{  9217, 12287 },
+	{}
+};
+
+static void hinic3_get_rmon_stats(struct net_device *netdev,
+				  struct ethtool_rmon_stats *rmon_stats,
+				  const struct ethtool_rmon_hist_range **ranges)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct mag_cmd_port_stats *ps;
+	int err;
+
+	ps = kmalloc_obj(*ps);
+	if (!ps)
+		return;
+
+	err = hinic3_get_phy_port_stats(nic_dev->hwdev, ps);
+	if (err) {
+		kfree(ps);
+		netdev_err(netdev, "Failed to get eth rmon stats from fw\n");
+		return;
+	}
+
+	rmon_stats->undersize_pkts	= ps->mac_rx_undersize_pkt_num;
+	rmon_stats->oversize_pkts	= ps->mac_rx_oversize_pkt_num;
+	rmon_stats->fragments		= ps->mac_rx_fragment_pkt_num;
+	rmon_stats->jabbers		= ps->mac_rx_jabber_pkt_num;
+
+	rmon_stats->hist[0]		= ps->mac_rx_64_oct_pkt_num;
+	rmon_stats->hist[1]		= ps->mac_rx_65_127_oct_pkt_num;
+	rmon_stats->hist[2]		= ps->mac_rx_128_255_oct_pkt_num;
+	rmon_stats->hist[3]		= ps->mac_rx_256_511_oct_pkt_num;
+	rmon_stats->hist[4]		= ps->mac_rx_512_1023_oct_pkt_num;
+	rmon_stats->hist[5]		= ps->mac_rx_1024_1518_oct_pkt_num;
+	rmon_stats->hist[6]		= ps->mac_rx_1519_2047_oct_pkt_num;
+	rmon_stats->hist[7]		= ps->mac_rx_2048_4095_oct_pkt_num;
+	rmon_stats->hist[8]		= ps->mac_rx_4096_8191_oct_pkt_num;
+	rmon_stats->hist[9]		= ps->mac_rx_8192_9216_oct_pkt_num;
+	rmon_stats->hist[10]		= ps->mac_rx_9217_12287_oct_pkt_num;
+
+	rmon_stats->hist_tx[0]		= ps->mac_tx_64_oct_pkt_num;
+	rmon_stats->hist_tx[1]		= ps->mac_tx_65_127_oct_pkt_num;
+	rmon_stats->hist_tx[2]		= ps->mac_tx_128_255_oct_pkt_num;
+	rmon_stats->hist_tx[3]		= ps->mac_tx_256_511_oct_pkt_num;
+	rmon_stats->hist_tx[4]		= ps->mac_tx_512_1023_oct_pkt_num;
+	rmon_stats->hist_tx[5]		= ps->mac_tx_1024_1518_oct_pkt_num;
+	rmon_stats->hist_tx[6]		= ps->mac_tx_1519_2047_oct_pkt_num;
+	rmon_stats->hist_tx[7]		= ps->mac_tx_2048_4095_oct_pkt_num;
+	rmon_stats->hist_tx[8]		= ps->mac_tx_4096_8191_oct_pkt_num;
+	rmon_stats->hist_tx[9]		= ps->mac_tx_8192_9216_oct_pkt_num;
+	rmon_stats->hist_tx[10]		= ps->mac_tx_9217_12287_oct_pkt_num;
+
+	*ranges = hinic3_rmon_ranges;
+
+	kfree(ps);
+}
+
+static void hinic3_get_pause_stats(struct net_device *netdev,
+				   struct ethtool_pause_stats *pause_stats)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct mag_cmd_port_stats *ps;
+	int err;
+
+	ps = kmalloc_obj(*ps);
+	if (!ps)
+		return;
+
+	err = hinic3_get_phy_port_stats(nic_dev->hwdev, ps);
+	if (err) {
+		kfree(ps);
+		netdev_err(netdev, "Failed to get eth pause stats from fw\n");
+		return;
+	}
+
+	pause_stats->tx_pause_frames = ps->mac_tx_pause_num;
+	pause_stats->rx_pause_frames = ps->mac_rx_pause_num;
+
+	kfree(ps);
+}
+
 static const struct ethtool_ops hinic3_ethtool_ops = {
 	.supported_coalesce_params      = ETHTOOL_COALESCE_USECS |
 					  ETHTOOL_COALESCE_PKT_RATE_RX_USECS,
@@ -510,6 +985,14 @@ static const struct ethtool_ops hinic3_ethtool_ops = {
 	.get_link                       = ethtool_op_get_link,
 	.get_ringparam                  = hinic3_get_ringparam,
 	.set_ringparam                  = hinic3_set_ringparam,
+	.get_sset_count                 = hinic3_get_sset_count,
+	.get_ethtool_stats              = hinic3_get_ethtool_stats,
+	.get_strings                    = hinic3_get_strings,
+	.get_eth_phy_stats              = hinic3_get_eth_phy_stats,
+	.get_eth_mac_stats              = hinic3_get_eth_mac_stats,
+	.get_eth_ctrl_stats             = hinic3_get_eth_ctrl_stats,
+	.get_rmon_stats                 = hinic3_get_rmon_stats,
+	.get_pause_stats                = hinic3_get_pause_stats,
 };
 
 void hinic3_set_ethtool_ops(struct net_device *netdev)
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_hw_intf.h b/drivers/net/ethernet/huawei/hinic3/hinic3_hw_intf.h
index cfc9daa3034f..6b0f486ba590 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_hw_intf.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_hw_intf.h
@@ -53,6 +53,17 @@ static inline void mgmt_msg_params_init_default(struct mgmt_msg_params *msg_para
 	msg_params->timeout_ms = 0;
 }
 
+static inline void
+mgmt_msg_params_init_in_out(struct mgmt_msg_params *msg_params, void *in_buf,
+			    void *out_buf, u32 in_buf_size, u32 out_buf_size)
+{
+	msg_params->buf_in = in_buf;
+	msg_params->buf_out = out_buf;
+	msg_params->in_size = in_buf_size;
+	msg_params->expected_out_size = out_buf_size;
+	msg_params->timeout_ms = 0;
+}
+
 enum cfg_cmd {
 	CFG_CMD_GET_DEV_CAP = 0,
 };
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_mgmt_interface.h b/drivers/net/ethernet/huawei/hinic3/hinic3_mgmt_interface.h
index c5bca3c4af96..76c691f82703 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_mgmt_interface.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_mgmt_interface.h
@@ -143,6 +143,41 @@ struct l2nic_cmd_set_dcb_state {
 	u8                   rsvd[7];
 };
 
+struct l2nic_port_stats_info {
+	struct mgmt_msg_head msg_head;
+	u16                  func_id;
+	u16                  rsvd1;
+};
+
+struct l2nic_vport_stats {
+	u64 tx_unicast_pkts_vport;
+	u64 tx_unicast_bytes_vport;
+	u64 tx_multicast_pkts_vport;
+	u64 tx_multicast_bytes_vport;
+	u64 tx_broadcast_pkts_vport;
+	u64 tx_broadcast_bytes_vport;
+
+	u64 rx_unicast_pkts_vport;
+	u64 rx_unicast_bytes_vport;
+	u64 rx_multicast_pkts_vport;
+	u64 rx_multicast_bytes_vport;
+	u64 rx_broadcast_pkts_vport;
+	u64 rx_broadcast_bytes_vport;
+
+	u64 tx_discard_vport;
+	u64 rx_discard_vport;
+	u64 tx_err_vport;
+	u64 rx_err_vport;
+};
+
+struct l2nic_cmd_vport_stats {
+	struct mgmt_msg_head     msg_head;
+	u32                      stats_size;
+	u32                      rsvd1;
+	struct l2nic_vport_stats stats;
+	u64                      rsvd2[6];
+};
+
 struct l2nic_cmd_lro_config {
 	struct mgmt_msg_head msg_head;
 	u16                  func_id;
@@ -234,6 +269,7 @@ enum l2nic_cmd {
 	L2NIC_CMD_SET_VPORT_ENABLE    = 6,
 	L2NIC_CMD_SET_RX_MODE         = 7,
 	L2NIC_CMD_SET_SQ_CI_ATTR      = 8,
+	L2NIC_CMD_GET_VPORT_STAT      = 9,
 	L2NIC_CMD_CLEAR_QP_RESOURCE   = 11,
 	L2NIC_CMD_CFG_RX_LRO          = 13,
 	L2NIC_CMD_CFG_LRO_TIMER       = 14,
@@ -272,6 +308,7 @@ enum mag_cmd {
 	MAG_CMD_SET_PORT_ENABLE = 6,
 	MAG_CMD_GET_LINK_STATUS = 7,
 
+	MAG_CMD_GET_PORT_STAT   = 151,
 	MAG_CMD_GET_PORT_INFO   = 153,
 };
 
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.c b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.c
index de5a7984d2cb..1b14dc824ce1 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.c
@@ -639,6 +639,42 @@ int hinic3_get_link_status(struct hinic3_hwdev *hwdev, bool *link_status_up)
 	return 0;
 }
 
+int hinic3_get_phy_port_stats(struct hinic3_hwdev *hwdev,
+			      struct mag_cmd_port_stats *stats)
+{
+	struct mag_cmd_port_stats_info stats_info = {};
+	struct mag_cmd_get_port_stat *ps;
+	struct mgmt_msg_params msg_params = {};
+	int err;
+
+	ps = kzalloc_obj(*ps);
+	if (!ps)
+		return -ENOMEM;
+
+	stats_info.port_id = hinic3_physical_port_id(hwdev);
+
+	mgmt_msg_params_init_in_out(&msg_params, &stats_info, ps,
+				    sizeof(stats_info), sizeof(*ps));
+
+	err = hinic3_send_mbox_to_mgmt(hwdev, MGMT_MOD_HILINK,
+				       MAG_CMD_GET_PORT_STAT, &msg_params);
+
+	if (err || ps->head.status) {
+		dev_err(hwdev->dev,
+			"Failed to get port statistics, err: %d, status: 0x%x\n",
+			err, ps->head.status);
+		err = -EFAULT;
+		goto out;
+	}
+
+	memcpy(stats, &ps->counter, sizeof(*stats));
+
+out:
+	kfree(ps);
+
+	return err;
+}
+
 int hinic3_get_port_info(struct hinic3_hwdev *hwdev,
 			 struct hinic3_nic_port_info *port_info)
 {
@@ -738,3 +774,31 @@ int hinic3_get_pause_info(struct hinic3_nic_dev *nic_dev,
 	return hinic3_cfg_hw_pause(nic_dev->hwdev, MGMT_MSG_CMD_OP_GET,
 				   nic_pause);
 }
+
+int hinic3_get_vport_stats(struct hinic3_hwdev *hwdev, u16 func_id,
+			   struct l2nic_vport_stats *stats)
+{
+	struct l2nic_cmd_vport_stats vport_stats = {};
+	struct l2nic_port_stats_info stats_info = {};
+	struct mgmt_msg_params msg_params = {};
+	int err;
+
+	stats_info.func_id = func_id;
+
+	mgmt_msg_params_init_in_out(&msg_params, &stats_info, &vport_stats,
+				    sizeof(stats_info), sizeof(vport_stats));
+
+	err = hinic3_send_mbox_to_mgmt(hwdev, MGMT_MOD_L2NIC,
+				       L2NIC_CMD_GET_VPORT_STAT, &msg_params);
+
+	if (err || vport_stats.msg_head.status) {
+		dev_err(hwdev->dev,
+			"Failed to get function statistics, err: %d, status: 0x%x\n",
+			err, vport_stats.msg_head.status);
+		return -EFAULT;
+	}
+
+	memcpy(stats, &vport_stats.stats, sizeof(*stats));
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.h b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.h
index 5d52202a8d4e..80573c121539 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.h
@@ -129,6 +129,110 @@ struct mag_cmd_get_xsfp_present {
 	u8                   rsvd[2];
 };
 
+struct mag_cmd_port_stats {
+	u64 mac_tx_fragment_pkt_num;
+	u64 mac_tx_undersize_pkt_num;
+	u64 mac_tx_undermin_pkt_num;
+	u64 mac_tx_64_oct_pkt_num;
+	u64 mac_tx_65_127_oct_pkt_num;
+	u64 mac_tx_128_255_oct_pkt_num;
+	u64 mac_tx_256_511_oct_pkt_num;
+	u64 mac_tx_512_1023_oct_pkt_num;
+	u64 mac_tx_1024_1518_oct_pkt_num;
+	u64 mac_tx_1519_2047_oct_pkt_num;
+	u64 mac_tx_2048_4095_oct_pkt_num;
+	u64 mac_tx_4096_8191_oct_pkt_num;
+	u64 mac_tx_8192_9216_oct_pkt_num;
+	u64 mac_tx_9217_12287_oct_pkt_num;
+	u64 mac_tx_12288_16383_oct_pkt_num;
+	u64 mac_tx_1519_max_bad_pkt_num;
+	u64 mac_tx_1519_max_good_pkt_num;
+	u64 mac_tx_oversize_pkt_num;
+	u64 mac_tx_jabber_pkt_num;
+	u64 mac_tx_bad_pkt_num;
+	u64 mac_tx_bad_oct_num;
+	u64 mac_tx_good_pkt_num;
+	u64 mac_tx_good_oct_num;
+	u64 mac_tx_total_pkt_num;
+	u64 mac_tx_total_oct_num;
+	u64 mac_tx_uni_pkt_num;
+	u64 mac_tx_multi_pkt_num;
+	u64 mac_tx_broad_pkt_num;
+	u64 mac_tx_pause_num;
+	u64 mac_tx_pfc_pkt_num;
+	u64 mac_tx_pfc_pri0_pkt_num;
+	u64 mac_tx_pfc_pri1_pkt_num;
+	u64 mac_tx_pfc_pri2_pkt_num;
+	u64 mac_tx_pfc_pri3_pkt_num;
+	u64 mac_tx_pfc_pri4_pkt_num;
+	u64 mac_tx_pfc_pri5_pkt_num;
+	u64 mac_tx_pfc_pri6_pkt_num;
+	u64 mac_tx_pfc_pri7_pkt_num;
+	u64 mac_tx_control_pkt_num;
+	u64 mac_tx_err_all_pkt_num;
+	u64 mac_tx_from_app_good_pkt_num;
+	u64 mac_tx_from_app_bad_pkt_num;
+
+	u64 mac_rx_fragment_pkt_num;
+	u64 mac_rx_undersize_pkt_num;
+	u64 mac_rx_undermin_pkt_num;
+	u64 mac_rx_64_oct_pkt_num;
+	u64 mac_rx_65_127_oct_pkt_num;
+	u64 mac_rx_128_255_oct_pkt_num;
+	u64 mac_rx_256_511_oct_pkt_num;
+	u64 mac_rx_512_1023_oct_pkt_num;
+	u64 mac_rx_1024_1518_oct_pkt_num;
+	u64 mac_rx_1519_2047_oct_pkt_num;
+	u64 mac_rx_2048_4095_oct_pkt_num;
+	u64 mac_rx_4096_8191_oct_pkt_num;
+	u64 mac_rx_8192_9216_oct_pkt_num;
+	u64 mac_rx_9217_12287_oct_pkt_num;
+	u64 mac_rx_12288_16383_oct_pkt_num;
+	u64 mac_rx_1519_max_bad_pkt_num;
+	u64 mac_rx_1519_max_good_pkt_num;
+	u64 mac_rx_oversize_pkt_num;
+	u64 mac_rx_jabber_pkt_num;
+	u64 mac_rx_bad_pkt_num;
+	u64 mac_rx_bad_oct_num;
+	u64 mac_rx_good_pkt_num;
+	u64 mac_rx_good_oct_num;
+	u64 mac_rx_total_pkt_num;
+	u64 mac_rx_total_oct_num;
+	u64 mac_rx_uni_pkt_num;
+	u64 mac_rx_multi_pkt_num;
+	u64 mac_rx_broad_pkt_num;
+	u64 mac_rx_pause_num;
+	u64 mac_rx_pfc_pkt_num;
+	u64 mac_rx_pfc_pri0_pkt_num;
+	u64 mac_rx_pfc_pri1_pkt_num;
+	u64 mac_rx_pfc_pri2_pkt_num;
+	u64 mac_rx_pfc_pri3_pkt_num;
+	u64 mac_rx_pfc_pri4_pkt_num;
+	u64 mac_rx_pfc_pri5_pkt_num;
+	u64 mac_rx_pfc_pri6_pkt_num;
+	u64 mac_rx_pfc_pri7_pkt_num;
+	u64 mac_rx_control_pkt_num;
+	u64 mac_rx_sym_err_pkt_num;
+	u64 mac_rx_fcs_err_pkt_num;
+	u64 mac_rx_send_app_good_pkt_num;
+	u64 mac_rx_send_app_bad_pkt_num;
+	u64 mac_rx_unfilter_pkt_num;
+};
+
+struct mag_cmd_port_stats_info {
+	struct mgmt_msg_head head;
+
+	u8                   port_id;
+	u8                   rsvd0[3];
+};
+
+struct mag_cmd_get_port_stat {
+	struct mgmt_msg_head      head;
+
+	struct mag_cmd_port_stats counter;
+	u64                       rsvd1[15];
+};
+
 enum link_err_type {
 	LINK_ERR_MODULE_UNRECOGENIZED,
 	LINK_ERR_NUM,
@@ -209,6 +313,11 @@ int hinic3_get_port_info(struct hinic3_hwdev *hwdev,
 			 struct hinic3_nic_port_info *port_info);
 int hinic3_set_vport_enable(struct hinic3_hwdev *hwdev, u16 func_id,
 			    bool enable);
+int hinic3_get_phy_port_stats(struct hinic3_hwdev *hwdev,
+			      struct mag_cmd_port_stats *stats);
+int hinic3_get_vport_stats(struct hinic3_hwdev *hwdev, u16 func_id,
+			   struct l2nic_vport_stats *stats);
+
 int hinic3_add_vlan(struct hinic3_hwdev *hwdev, u16 vlan_id, u16 func_id);
 int hinic3_del_vlan(struct hinic3_hwdev *hwdev, u16 vlan_id, u16 func_id);
 
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c
index b5b601469517..a332f814e625 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c
@@ -29,7 +29,7 @@
 #define HINIC3_LRO_PKT_HDR_LEN_IPV4     66
 #define HINIC3_LRO_PKT_HDR_LEN_IPV6     86
 #define HINIC3_LRO_PKT_HDR_LEN(cqe) \
-	(RQ_CQE_OFFOLAD_TYPE_GET((cqe)->offload_type, IP_TYPE) == \
+	(RQ_CQE_OFFOLAD_TYPE_GET(le32_to_cpu((cqe)->offload_type), IP_TYPE) == \
 	 HINIC3_RX_IPV6_PKT ? HINIC3_LRO_PKT_HDR_LEN_IPV6 : \
 	 HINIC3_LRO_PKT_HDR_LEN_IPV4)
 
@@ -46,7 +46,6 @@ static void hinic3_rxq_clean_stats(struct hinic3_rxq_stats *rxq_stats)
 
 	rxq_stats->alloc_skb_err = 0;
 	rxq_stats->alloc_rx_buf_err = 0;
-	rxq_stats->restore_drop_sge = 0;
 	u64_stats_update_end(&rxq_stats->syncp);
 }
 
@@ -155,8 +154,14 @@ static u32 hinic3_rx_fill_buffers(struct hinic3_rxq *rxq)
 
 		err = rx_alloc_mapped_page(rxq->page_pool, rx_info,
 					   rxq->buf_len);
-		if (unlikely(err))
+		if (unlikely(err)) {
+			preempt_disable();
+			u64_stats_update_begin(&rxq->rxq_stats.syncp);
+			rxq->rxq_stats.alloc_rx_buf_err++;
+			u64_stats_update_end(&rxq->rxq_stats.syncp);
+			preempt_enable();
 			break;
+		}
 
 		dma_addr = page_pool_get_dma_addr(rx_info->page) +
 			rx_info->page_offset;
@@ -170,6 +175,12 @@ static u32 hinic3_rx_fill_buffers(struct hinic3_rxq *rxq)
 				rxq->next_to_update << HINIC3_NORMAL_RQ_WQE);
 		rxq->delta -= i;
 		rxq->next_to_alloc = rxq->next_to_update;
+	} else if (free_wqebbs == rxq->q_depth - 1) {
+		preempt_disable();
+		u64_stats_update_begin(&rxq->rxq_stats.syncp);
+		rxq->rxq_stats.rx_buf_empty++;
+		u64_stats_update_end(&rxq->rxq_stats.syncp);
+		preempt_enable();
 	}
 
 	return i;
@@ -330,11 +341,23 @@ static void hinic3_rx_csum(struct hinic3_rxq *rxq, u32 offload_type,
 	struct net_device *netdev = rxq->netdev;
 	bool l2_tunnel;
 
+	if (unlikely(csum_err == HINIC3_RX_CSUM_IPSU_OTHER_ERR)) {
+		u64_stats_update_begin(&rxq->rxq_stats.syncp);
+		rxq->rxq_stats.other_errors++;
+		u64_stats_update_end(&rxq->rxq_stats.syncp);
+	}
+
 	if (!(netdev->features & NETIF_F_RXCSUM))
 		return;
 
 	if (unlikely(csum_err)) {
 		/* pkt type is recognized by HW, and csum is wrong */
+		if (!(csum_err & (HINIC3_RX_CSUM_HW_CHECK_NONE |
+				  HINIC3_RX_CSUM_IPSU_OTHER_ERR))) {
+			u64_stats_update_begin(&rxq->rxq_stats.syncp);
+			rxq->rxq_stats.csum_errors++;
+			u64_stats_update_end(&rxq->rxq_stats.syncp);
+		}
 		skb->ip_summed = CHECKSUM_NONE;
 		return;
 	}
@@ -387,8 +410,12 @@ static int recv_one_pkt(struct hinic3_rxq *rxq, struct hinic3_rq_cqe *rx_cqe,
 	u16 num_lro;
 
 	skb = hinic3_fetch_rx_buffer(rxq, pkt_len);
-	if (unlikely(!skb))
+	if (unlikely(!skb)) {
+		u64_stats_update_begin(&rxq->rxq_stats.syncp);
+		rxq->rxq_stats.alloc_skb_err++;
+		u64_stats_update_end(&rxq->rxq_stats.syncp);
 		return -ENOMEM;
+	}
 
 	/* place header in linear portion of buffer */
 	if (skb_is_nonlinear(skb))
@@ -550,11 +577,29 @@ int hinic3_configure_rxqs(struct net_device *netdev, u16 num_rq,
 	return 0;
 }
 
+void hinic3_rxq_get_stats(struct hinic3_rxq *rxq,
+			  struct hinic3_rxq_stats *stats)
+{
+	struct hinic3_rxq_stats *rxq_stats = &rxq->rxq_stats;
+	unsigned int start;
+
+	do {
+		start = u64_stats_fetch_begin(&rxq_stats->syncp);
+		stats->csum_errors = rxq_stats->csum_errors;
+		stats->other_errors = rxq_stats->other_errors;
+		stats->rx_buf_empty = rxq_stats->rx_buf_empty;
+		stats->alloc_skb_err = rxq_stats->alloc_skb_err;
+		stats->alloc_rx_buf_err = rxq_stats->alloc_rx_buf_err;
+	} while (u64_stats_fetch_retry(&rxq_stats->syncp, start));
+}
+
 int hinic3_rx_poll(struct hinic3_rxq *rxq, int budget)
 {
 	struct hinic3_nic_dev *nic_dev = netdev_priv(rxq->netdev);
+	u32 ip_type, offload_type, pkt_hdr_len;
 	u32 sw_ci, status, pkt_len, vlan_len;
 	struct hinic3_rq_cqe *rx_cqe;
+	u64 rx_bytes = 0;
 	u32 num_wqe = 0;
 	int nr_pkts = 0;
 	u16 num_lro;
@@ -574,10 +619,20 @@ int hinic3_rx_poll(struct hinic3_rxq *rxq, int budget)
 		if (recv_one_pkt(rxq, rx_cqe, pkt_len, vlan_len, status))
 			break;
 
+		rx_bytes += pkt_len;
 		nr_pkts++;
 		num_lro = RQ_CQE_STATUS_GET(status, NUM_LRO);
-		if (num_lro)
+		if (num_lro) {
+			offload_type = le32_to_cpu(rx_cqe->offload_type);
+			ip_type = RQ_CQE_OFFOLAD_TYPE_GET(offload_type,
+							  IP_TYPE);
+			pkt_hdr_len = ip_type == HINIC3_RX_IPV6_PKT ?
+				      HINIC3_LRO_PKT_HDR_LEN_IPV6 :
+				      HINIC3_LRO_PKT_HDR_LEN_IPV4;
+
+			rx_bytes += (num_lro - 1) * pkt_hdr_len;
 			num_wqe += hinic3_get_sge_num(rxq, pkt_len);
+		}
 
 		rx_cqe->status = 0;
 
@@ -588,5 +643,10 @@ int hinic3_rx_poll(struct hinic3_rxq *rxq, int budget)
 	if (rxq->delta >= HINIC3_RX_BUFFER_WRITE)
 		hinic3_rx_fill_buffers(rxq);
 
+	u64_stats_update_begin(&rxq->rxq_stats.syncp);
+	rxq->rxq_stats.packets += (u64)nr_pkts;
+	rxq->rxq_stats.bytes += rx_bytes;
+	u64_stats_update_end(&rxq->rxq_stats.syncp);
+
 	return nr_pkts;
 }
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h
index 06d1b3299e7c..c11d080408a7 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h
@@ -8,6 +8,17 @@
 #include <linux/dim.h>
 #include <linux/netdevice.h>
 
+/* rx cqe checksum err */
+#define HINIC3_RX_CSUM_IP_CSUM_ERR      BIT(0)
+#define HINIC3_RX_CSUM_TCP_CSUM_ERR     BIT(1)
+#define HINIC3_RX_CSUM_UDP_CSUM_ERR     BIT(2)
+#define HINIC3_RX_CSUM_IGMP_CSUM_ERR    BIT(3)
+#define HINIC3_RX_CSUM_ICMPV4_CSUM_ERR  BIT(4)
+#define HINIC3_RX_CSUM_ICMPV6_CSUM_ERR  BIT(5)
+#define HINIC3_RX_CSUM_SCTP_CRC_ERR     BIT(6)
+#define HINIC3_RX_CSUM_HW_CHECK_NONE    BIT(7)
+#define HINIC3_RX_CSUM_IPSU_OTHER_ERR   BIT(8)
+
 #define RQ_CQE_OFFOLAD_TYPE_PKT_TYPE_MASK           GENMASK(4, 0)
 #define RQ_CQE_OFFOLAD_TYPE_IP_TYPE_MASK            GENMASK(6, 5)
 #define RQ_CQE_OFFOLAD_TYPE_TUNNEL_PKT_FORMAT_MASK  GENMASK(11, 8)
@@ -39,7 +50,6 @@ struct hinic3_rxq_stats {
 	u64                   rx_buf_empty;
 	u64                   alloc_skb_err;
 	u64                   alloc_rx_buf_err;
-	u64                   restore_drop_sge;
 	struct u64_stats_sync syncp;
 };
 
@@ -123,6 +133,9 @@ void hinic3_free_rxqs_res(struct net_device *netdev, u16 num_rq,
 			  u32 rq_depth, struct hinic3_dyna_rxq_res *rxqs_res);
 int hinic3_configure_rxqs(struct net_device *netdev, u16 num_rq,
 			  u32 rq_depth, struct hinic3_dyna_rxq_res *rxqs_res);
+
+void hinic3_rxq_get_stats(struct hinic3_rxq *rxq,
+			  struct hinic3_rxq_stats *stats);
 int hinic3_rx_poll(struct hinic3_rxq *rxq, int budget);
 
 #endif
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_tx.c b/drivers/net/ethernet/huawei/hinic3/hinic3_tx.c
index 9306bf0020ca..019ea4d03c19 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_tx.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_tx.c
@@ -97,8 +97,12 @@ static int hinic3_tx_map_skb(struct net_device *netdev, struct sk_buff *skb,
 
 	dma_info[0].dma = dma_map_single(&pdev->dev, skb->data,
 					 skb_headlen(skb), DMA_TO_DEVICE);
-	if (dma_mapping_error(&pdev->dev, dma_info[0].dma))
+	if (dma_mapping_error(&pdev->dev, dma_info[0].dma)) {
+		u64_stats_update_begin(&txq->txq_stats.syncp);
+		txq->txq_stats.map_frag_err++;
+		u64_stats_update_end(&txq->txq_stats.syncp);
 		return -EFAULT;
+	}
 
 	dma_info[0].len = skb_headlen(skb);
 
@@ -117,6 +121,9 @@ static int hinic3_tx_map_skb(struct net_device *netdev, struct sk_buff *skb,
 						     skb_frag_size(frag),
 						     DMA_TO_DEVICE);
 		if (dma_mapping_error(&pdev->dev, dma_info[idx].dma)) {
+			u64_stats_update_begin(&txq->txq_stats.syncp);
+			txq->txq_stats.map_frag_err++;
+			u64_stats_update_end(&txq->txq_stats.syncp);
 			err = -EFAULT;
 			goto err_unmap_page;
 		}
@@ -260,6 +267,9 @@ static int hinic3_tx_csum(struct hinic3_txq *txq, struct hinic3_sq_task *task,
 		if (l4_proto != IPPROTO_UDP ||
 		    ((struct udphdr *)skb_transport_header(skb))->dest !=
 		    VXLAN_OFFLOAD_PORT_LE) {
+			u64_stats_update_begin(&txq->txq_stats.syncp);
+			txq->txq_stats.unknown_tunnel_pkt++;
+			u64_stats_update_end(&txq->txq_stats.syncp);
 			/* Unsupported tunnel packet, disable csum offload */
 			skb_checksum_help(skb);
 			return 0;
@@ -433,6 +443,27 @@ static u32 hinic3_tx_offload(struct sk_buff *skb, struct hinic3_sq_task *task,
 	return offload;
 }
 
+static void hinic3_get_pkt_stats(struct hinic3_txq *txq, struct sk_buff *skb)
+{
+	u32 hdr_len, tx_bytes;
+	unsigned short pkts;
+
+	if (skb_is_gso(skb)) {
+		hdr_len = (skb_shinfo(skb)->gso_segs - 1) *
+			  skb_tcp_all_headers(skb);
+		tx_bytes = skb->len + hdr_len;
+		pkts = skb_shinfo(skb)->gso_segs;
+	} else {
+		tx_bytes = skb->len > ETH_ZLEN ? skb->len : ETH_ZLEN;
+		pkts = 1;
+	}
+
+	u64_stats_update_begin(&txq->txq_stats.syncp);
+	txq->txq_stats.bytes += tx_bytes;
+	txq->txq_stats.packets += pkts;
+	u64_stats_update_end(&txq->txq_stats.syncp);
+}
+
 static u16 hinic3_get_and_update_sq_owner(struct hinic3_io_queue *sq,
 					  u16 curr_pi, u16 wqebb_cnt)
 {
@@ -539,8 +570,12 @@ static netdev_tx_t hinic3_send_one_skb(struct sk_buff *skb,
 	int err;
 
 	if (unlikely(skb->len < MIN_SKB_LEN)) {
-		if (skb_pad(skb, MIN_SKB_LEN - skb->len))
+		if (skb_pad(skb, MIN_SKB_LEN - skb->len)) {
+			u64_stats_update_begin(&txq->txq_stats.syncp);
+			txq->txq_stats.skb_pad_err++;
+			u64_stats_update_end(&txq->txq_stats.syncp);
 			goto err_out;
+		}
 
 		skb->len = MIN_SKB_LEN;
 	}
@@ -595,6 +630,7 @@ static netdev_tx_t hinic3_send_one_skb(struct sk_buff *skb,
 				  txq->tx_stop_thrs,
 				  txq->tx_start_thrs);
 
+	hinic3_get_pkt_stats(txq, skb);
 	hinic3_prepare_sq_ctrl(&wqe_combo, queue_info, num_sge, owner);
 	hinic3_write_db(txq->sq, 0, DB_CFLAG_DP_SQ,
 			hinic3_get_sq_local_pi(txq->sq));
@@ -604,6 +640,10 @@ static netdev_tx_t hinic3_send_one_skb(struct sk_buff *skb,
 err_drop_pkt:
 	dev_kfree_skb_any(skb);
 err_out:
+	u64_stats_update_begin(&txq->txq_stats.syncp);
+	txq->txq_stats.dropped++;
+	u64_stats_update_end(&txq->txq_stats.syncp);
+
 	return NETDEV_TX_OK;
 }
 
@@ -754,6 +794,24 @@ int hinic3_configure_txqs(struct net_device *netdev, u16 num_sq,
 	return 0;
 }
 
+void hinic3_txq_get_stats(struct hinic3_txq *txq,
+			  struct hinic3_txq_stats *stats)
+{
+	struct hinic3_txq_stats *txq_stats = &txq->txq_stats;
+	unsigned int start;
+
+	do {
+		start = u64_stats_fetch_begin(&txq_stats->syncp);
+		stats->busy = txq_stats->busy;
+		stats->skb_pad_err = txq_stats->skb_pad_err;
+		stats->frag_len_overflow = txq_stats->frag_len_overflow;
+		stats->offload_cow_skb_err = txq_stats->offload_cow_skb_err;
+		stats->map_frag_err = txq_stats->map_frag_err;
+		stats->unknown_tunnel_pkt = txq_stats->unknown_tunnel_pkt;
+		stats->frag_size_err = txq_stats->frag_size_err;
+	} while (u64_stats_fetch_retry(&txq_stats->syncp, start));
+}
+
 bool hinic3_tx_poll(struct hinic3_txq *txq, int budget)
 {
 	struct net_device *netdev = txq->netdev;
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_tx.h b/drivers/net/ethernet/huawei/hinic3/hinic3_tx.h
index 00194f2a1bcc..0a21c423618f 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_tx.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_tx.h
@@ -157,6 +157,8 @@ int hinic3_configure_txqs(struct net_device *netdev, u16 num_sq,
 			  u32 sq_depth, struct hinic3_dyna_txq_res *txqs_res);
 
 netdev_tx_t hinic3_xmit_frame(struct sk_buff *skb, struct net_device *netdev);
+void hinic3_txq_get_stats(struct hinic3_txq *txq,
+			  struct hinic3_txq_stats *stats);
 bool hinic3_tx_poll(struct hinic3_txq *txq, int budget);
 void hinic3_flush_txqs(struct net_device *netdev);
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v09 3/5] hinic3: Add ethtool coalesce ops
From: Fan Gong @ 2026-06-10  6:59 UTC (permalink / raw)
  To: Fan Gong, Wu Di, Teng Peisen, netdev, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Andrew Lunn, Ioana Ciornei, Mohsin Bashir
  Cc: linux-kernel, linux-doc, luosifu, Xin Guo, Zhou Shuai, Wu Like,
	Shi Jing, Zheng Jiezhen, Maxime Chevallier
In-Reply-To: <cover.1781062575.git.wudi234@huawei.com>

  Implement the following ethtool callback functions:
.get_coalesce
.set_coalesce

  These callbacks allow users to utilize ethtool for detailed
RX coalesce configuration and monitoring.

Co-developed-by: Wu Di <wudi234@huawei.com>
Signed-off-by: Wu Di <wudi234@huawei.com>
Co-developed-by: Teng Peisen <tengpeisen@huawei.com>
Signed-off-by: Teng Peisen <tengpeisen@huawei.com>
Signed-off-by: Fan Gong <gongfan1@huawei.com>
---
 .../ethernet/huawei/hinic3/hinic3_ethtool.c   | 249 +++++++++++++++++-
 .../net/ethernet/huawei/hinic3/hinic3_irq.c   |   3 +
 .../net/ethernet/huawei/hinic3/hinic3_main.c  |   1 +
 .../ethernet/huawei/hinic3/hinic3_nic_dev.h   |   2 +
 4 files changed, 253 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
index 66ca4303bb3c..11c8eb0f5d2a 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
@@ -18,6 +18,11 @@
 #include "hinic3_nic_cfg.h"
 
 #define HINIC3_MGMT_VERSION_MAX_LEN     32
+/* Coalesce units: pending limit in frames, timer in microseconds */
+#define COALESCE_PENDING_LIMIT_UNIT     8
+#define COALESCE_TIMER_CFG_UNIT         5
+#define COALESCE_MAX_PENDING_LIMIT      (255 * COALESCE_PENDING_LIMIT_UNIT)
+#define COALESCE_MAX_TIMER_CFG          (255 * COALESCE_TIMER_CFG_UNIT)
 
 static void hinic3_get_drvinfo(struct net_device *netdev,
 			       struct ethtool_drvinfo *info)
@@ -975,9 +980,247 @@ static void hinic3_get_pause_stats(struct net_device *netdev,
 	kfree(ps);
 }
 
+static int hinic3_set_queue_coalesce(struct net_device *netdev, u16 q_id,
+				     struct hinic3_intr_coal_info *coal,
+				     struct netlink_ext_ack *extack)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct hinic3_intr_coal_info *intr_coal;
+	struct hinic3_interrupt_info info = {};
+	int err;
+
+	if (nic_dev->adaptive_rx_coal) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Static coalesce not allowed in adaptive RX mode");
+		return -EINVAL;
+	}
+
+	if (!test_bit(HINIC3_INTF_UP, &nic_dev->flags) ||
+	    q_id >= nic_dev->q_params.num_qps)
+		return 0;
+
+	spin_lock(&nic_dev->coal_lock);
+
+	intr_coal = &nic_dev->intr_coalesce[q_id];
+
+	intr_coal->coalesce_timer_cfg = coal->coalesce_timer_cfg;
+	intr_coal->pending_limit = coal->pending_limit;
+	intr_coal->rx_pending_limit_low = coal->rx_pending_limit_low;
+	intr_coal->rx_pending_limit_high = coal->rx_pending_limit_high;
+	spin_unlock(&nic_dev->coal_lock);
+
+	info.msix_index = nic_dev->q_params.irq_cfg[q_id].msix_entry_idx;
+	info.interrupt_coalesc_set = 1;
+	info.coalesc_timer_cfg = intr_coal->coalesce_timer_cfg;
+	info.pending_limit = intr_coal->pending_limit;
+	info.resend_timer_cfg = intr_coal->resend_timer_cfg;
+	err = hinic3_set_interrupt_cfg(nic_dev->hwdev, info);
+	if (err) {
+		NL_SET_ERR_MSG_FMT_MOD(extack,
+				       "Failed to set queue%u coalesce",
+				       q_id);
+		return err;
+	}
+
+	return 0;
+}
+
+static int is_coalesce_exceed_limit(const struct ethtool_coalesce *coal,
+				    struct netlink_ext_ack *extack)
+{
+	const struct {
+		const char *name;
+		u32 value;
+		u32 limit;
+	} coalesce_limits[] = {
+		{"rx_coalesce_usecs",
+		 coal->rx_coalesce_usecs,
+		 COALESCE_MAX_TIMER_CFG},
+		{"rx_max_coalesced_frames",
+		 coal->rx_max_coalesced_frames,
+		 COALESCE_MAX_PENDING_LIMIT},
+		{"rx_max_coalesced_frames_low",
+		 coal->rx_max_coalesced_frames_low,
+		 COALESCE_MAX_PENDING_LIMIT},
+		{"rx_max_coalesced_frames_high",
+		 coal->rx_max_coalesced_frames_high,
+		 COALESCE_MAX_PENDING_LIMIT},
+	};
+
+	for (int i = 0; i < ARRAY_SIZE(coalesce_limits); i++) {
+		if (coalesce_limits[i].value > coalesce_limits[i].limit) {
+			NL_SET_ERR_MSG_FMT_MOD(extack,
+					       "%s out of range %d-%d",
+					       coalesce_limits[i].name,
+					       0,
+					       coalesce_limits[i].limit);
+			return -ERANGE;
+		}
+	}
+	return 0;
+}
+
+static int is_coalesce_legal(const struct ethtool_coalesce *coal,
+			     struct netlink_ext_ack *extack)
+{
+	int err;
+
+	err = is_coalesce_exceed_limit(coal, extack);
+	if (err)
+		return err;
+
+	if (coal->rx_max_coalesced_frames_low >
+	    coal->rx_max_coalesced_frames_high) {
+		NL_SET_ERR_MSG_FMT_MOD(extack,
+				       "invalid coalesce frame high %u, low %u",
+				       coal->rx_max_coalesced_frames_high,
+				       coal->rx_max_coalesced_frames_low);
+		return -ERANGE;
+	}
+
+	return 0;
+}
+
+static void check_coalesce_align(struct net_device *netdev,
+				 u32 item, u32 unit, const char *str)
+{
+	if (item % unit)
+		netdev_warn(netdev, "%s in %d units, change to %u\n",
+			    str, unit, item - item % unit);
+}
+
+#define CHECK_COALESCE_ALIGN(member, unit) \
+	check_coalesce_align(netdev, member, unit, #member)
+
+static void check_coalesce_changed(struct net_device *netdev,
+				   u32 item, u32 unit, u32 ori_val,
+				   const char *obj_str, const char *str)
+{
+	if ((item / unit) != ori_val)
+		netdev_dbg(netdev, "Change %s from %d to %u %s\n",
+			   str, ori_val * unit, item - item % unit, obj_str);
+}
+
+#define CHECK_COALESCE_CHANGED(member, unit, ori_val, obj_str) \
+	check_coalesce_changed(netdev, member, unit, ori_val, obj_str, #member)
+
+static int hinic3_set_hw_coal_param(struct net_device *netdev,
+				    struct hinic3_intr_coal_info *intr_coal,
+				    struct netlink_ext_ack *extack)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	int err;
+	u16 i;
+
+	for (i = 0; i < nic_dev->max_qps; i++) {
+		err = hinic3_set_queue_coalesce(netdev, i, intr_coal, extack);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int hinic3_get_coalesce(struct net_device *netdev,
+			       struct ethtool_coalesce *coal,
+			       struct kernel_ethtool_coalesce *kernel_coal,
+			       struct netlink_ext_ack *extack)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct hinic3_intr_coal_info *interrupt_info;
+
+	memset(coal, 0, sizeof(*coal));
+
+	interrupt_info = &nic_dev->intr_coalesce[0];
+
+	coal->use_adaptive_rx_coalesce = nic_dev->adaptive_rx_coal;
+
+	if (nic_dev->adaptive_rx_coal) {
+		coal->rx_max_coalesced_frames_low =
+			interrupt_info->rx_pending_limit_low *
+			COALESCE_PENDING_LIMIT_UNIT;
+		coal->rx_max_coalesced_frames_high =
+			interrupt_info->rx_pending_limit_high *
+			COALESCE_PENDING_LIMIT_UNIT;
+	} else {
+		/* TX/RX uses the same interrupt.
+		 * So we only declare RX ethtool_coalesce parameters.
+		 */
+		coal->rx_coalesce_usecs = interrupt_info->coalesce_timer_cfg *
+					  COALESCE_TIMER_CFG_UNIT;
+		coal->rx_max_coalesced_frames = interrupt_info->pending_limit *
+						COALESCE_PENDING_LIMIT_UNIT;
+	}
+
+	return 0;
+}
+
+static int hinic3_set_coalesce(struct net_device *netdev,
+			       struct ethtool_coalesce *coal,
+			       struct kernel_ethtool_coalesce *kernel_coal,
+			       struct netlink_ext_ack *extack)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct hinic3_intr_coal_info *ori_intr_coal;
+	struct hinic3_intr_coal_info intr_coal = {};
+	const char *obj_str = "for netdev";
+	int err;
+
+	err = is_coalesce_legal(coal, extack);
+	if (err)
+		return err;
+
+	CHECK_COALESCE_ALIGN(coal->rx_coalesce_usecs, COALESCE_TIMER_CFG_UNIT);
+	CHECK_COALESCE_ALIGN(coal->rx_max_coalesced_frames,
+			     COALESCE_PENDING_LIMIT_UNIT);
+	CHECK_COALESCE_ALIGN(coal->rx_max_coalesced_frames_high,
+			     COALESCE_PENDING_LIMIT_UNIT);
+	CHECK_COALESCE_ALIGN(coal->rx_max_coalesced_frames_low,
+			     COALESCE_PENDING_LIMIT_UNIT);
+
+	ori_intr_coal = &nic_dev->intr_coalesce[0];
+
+	CHECK_COALESCE_CHANGED(coal->rx_coalesce_usecs, COALESCE_TIMER_CFG_UNIT,
+			       ori_intr_coal->coalesce_timer_cfg, obj_str);
+	CHECK_COALESCE_CHANGED(coal->rx_max_coalesced_frames,
+			       COALESCE_PENDING_LIMIT_UNIT,
+			       ori_intr_coal->pending_limit, obj_str);
+	CHECK_COALESCE_CHANGED(coal->rx_max_coalesced_frames_high,
+			       COALESCE_PENDING_LIMIT_UNIT,
+			       ori_intr_coal->rx_pending_limit_high, obj_str);
+	CHECK_COALESCE_CHANGED(coal->rx_max_coalesced_frames_low,
+			       COALESCE_PENDING_LIMIT_UNIT,
+			       ori_intr_coal->rx_pending_limit_low, obj_str);
+
+	intr_coal.coalesce_timer_cfg =
+		(u8)(coal->rx_coalesce_usecs / COALESCE_TIMER_CFG_UNIT);
+	intr_coal.pending_limit = (u8)(coal->rx_max_coalesced_frames /
+				      COALESCE_PENDING_LIMIT_UNIT);
+	/* NOTE(review): queue update below rejects adaptive mode - confirm */
+	nic_dev->adaptive_rx_coal = coal->use_adaptive_rx_coalesce;
+
+	intr_coal.rx_pending_limit_high =
+		(u8)(coal->rx_max_coalesced_frames_high /
+		     COALESCE_PENDING_LIMIT_UNIT);
+
+	intr_coal.rx_pending_limit_low =
+		(u8)(coal->rx_max_coalesced_frames_low /
+		     COALESCE_PENDING_LIMIT_UNIT);
+
+	/* coalesce timer or pending set to zero will disable coalesce */
+	if (!nic_dev->adaptive_rx_coal &&
+	    (!intr_coal.coalesce_timer_cfg || !intr_coal.pending_limit))
+		NL_SET_ERR_MSG_MOD(extack, "Coalesce will be disabled");
+
+	return hinic3_set_hw_coal_param(netdev, &intr_coal, extack);
+}
+
 static const struct ethtool_ops hinic3_ethtool_ops = {
-	.supported_coalesce_params      = ETHTOOL_COALESCE_USECS |
-					  ETHTOOL_COALESCE_PKT_RATE_RX_USECS,
+	.supported_coalesce_params      = ETHTOOL_COALESCE_RX_USECS |
+					  ETHTOOL_COALESCE_RX_MAX_FRAMES |
+					  ETHTOOL_COALESCE_USE_ADAPTIVE_RX |
+					  ETHTOOL_COALESCE_RX_MAX_FRAMES_LOW |
+					  ETHTOOL_COALESCE_RX_MAX_FRAMES_HIGH,
 	.get_link_ksettings             = hinic3_get_link_ksettings,
 	.get_drvinfo                    = hinic3_get_drvinfo,
 	.get_msglevel                   = hinic3_get_msglevel,
@@ -993,6 +1236,8 @@ static const struct ethtool_ops hinic3_ethtool_ops = {
 	.get_eth_ctrl_stats             = hinic3_get_eth_ctrl_stats,
 	.get_rmon_stats                 = hinic3_get_rmon_stats,
 	.get_pause_stats                = hinic3_get_pause_stats,
+	.get_coalesce                   = hinic3_get_coalesce,
+	.set_coalesce                   = hinic3_set_coalesce,
 };
 
 void hinic3_set_ethtool_ops(struct net_device *netdev)
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c b/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
index bc4d879f9be4..b7cd5f2f53a7 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
@@ -173,9 +173,12 @@ static void hinic3_update_queue_coal(struct net_device *netdev, u16 q_id,
 
 	q_coal = &nic_dev->intr_coalesce[q_id];
 	coalesc_timer_cfg = (u8)coal_timer;
+
+	spin_lock(&nic_dev->coal_lock);
 	pending_limit = clamp_t(u8, coal_pkts >> HINIC3_COAL_PKT_SHIFT,
 				q_coal->rx_pending_limit_low,
 				q_coal->rx_pending_limit_high);
+	spin_unlock(&nic_dev->coal_lock);
 
 	hinic3_set_interrupt_moder(nic_dev->netdev, q_id,
 				   coalesc_timer_cfg, pending_limit);
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_main.c b/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
index c87624a5e5dc..b4821c8042b1 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
@@ -180,6 +180,7 @@ static int hinic3_sw_init(struct net_device *netdev)
 
 	mutex_init(&nic_dev->port_state_mutex);
 	mutex_init(&nic_dev->change_res_mutex);
+	spin_lock_init(&nic_dev->coal_lock);
 
 	nic_dev->q_params.sq_depth = HINIC3_SQ_DEPTH;
 	nic_dev->q_params.rq_depth = HINIC3_RQ_DEPTH;
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h
index 005b2c01a988..b6e3b188fa78 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h
@@ -134,6 +134,8 @@ struct hinic3_nic_dev {
 	struct mutex                    port_state_mutex;
 	/* mutex to serialize channel/resource changes */
 	struct mutex                    change_res_mutex;
+	/* lock protecting the per-queue intr_coalesce settings */
+	spinlock_t                      coal_lock;
 
 	struct list_head                uc_filter_list;
 	struct list_head                mc_filter_list;
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v09 5/5] hinic3: Remove unneeded coalesce parameters
From: Fan Gong @ 2026-06-10  6:59 UTC (permalink / raw)
  To: Fan Gong, Wu Di, Teng Peisen, netdev, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Andrew Lunn, Ioana Ciornei, Mohsin Bashir
  Cc: linux-kernel, linux-doc, luosifu, Xin Guo, Zhou Shuai, Wu Like,
	Shi Jing, Zheng Jiezhen, Maxime Chevallier
In-Reply-To: <cover.1781062575.git.wudi234@huawei.com>

  Remove unneeded coalesce parameters in irq handling.

Co-developed-by: Wu Di <wudi234@huawei.com>
Signed-off-by: Wu Di <wudi234@huawei.com>
Co-developed-by: Teng Peisen <tengpeisen@huawei.com>
Signed-off-by: Teng Peisen <tengpeisen@huawei.com>
Signed-off-by: Fan Gong <gongfan1@huawei.com>
---
 drivers/net/ethernet/huawei/hinic3/hinic3_irq.c | 6 +-----
 drivers/net/ethernet/huawei/hinic3/hinic3_rx.h  | 3 ---
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c b/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
index b7cd5f2f53a7..6a8695e0a69a 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
@@ -149,13 +149,9 @@ static int hinic3_set_interrupt_moder(struct net_device *netdev, u16 q_id,
 		nic_dev->intr_coalesce[q_id].resend_timer_cfg;
 
 	err = hinic3_set_interrupt_cfg(nic_dev->hwdev, info);
-	if (err) {
+	if (err)
 		netdev_err(netdev,
 			   "Failed to modify moderation for Queue: %u\n", q_id);
-	} else {
-		nic_dev->rxqs[q_id].last_coalesc_timer_cfg = coalesc_timer_cfg;
-		nic_dev->rxqs[q_id].last_pending_limit = pending_limit;
-	}
 
 	mutex_unlock(&nic_dev->change_res_mutex);
 
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h
index c11d080408a7..2ab691ed11a9 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h
@@ -111,9 +111,6 @@ struct hinic3_rxq {
 	dma_addr_t             cqe_start_paddr;
 
 	struct dim             dim;
-
-	u8                     last_coalesc_timer_cfg;
-	u8                     last_pending_limit;
 } ____cacheline_aligned;
 
 struct hinic3_dyna_rxq_res {
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v09 4/5] hinic3: Add ethtool rss ops
From: Fan Gong @ 2026-06-10  6:59 UTC (permalink / raw)
  To: Fan Gong, Wu Di, Teng Peisen, netdev, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Andrew Lunn, Ioana Ciornei, Mohsin Bashir
  Cc: linux-kernel, linux-doc, luosifu, Xin Guo, Zhou Shuai, Wu Like,
	Shi Jing, Zheng Jiezhen, Maxime Chevallier
In-Reply-To: <cover.1781062575.git.wudi234@huawei.com>

  Implement the following ethtool callback functions:
.get_rxnfc
.set_rxnfc
.get_channels
.set_channels
.get_rxfh_indir_size
.get_rxfh_key_size
.get_rxfh
.set_rxfh

  These callbacks allow users to utilize ethtool for detailed
RSS parameter configuration and monitoring.

Co-developed-by: Wu Di <wudi234@huawei.com>
Signed-off-by: Wu Di <wudi234@huawei.com>
Co-developed-by: Teng Peisen <tengpeisen@huawei.com>
Signed-off-by: Teng Peisen <tengpeisen@huawei.com>
Signed-off-by: Fan Gong <gongfan1@huawei.com>
---
 .../ethernet/huawei/hinic3/hinic3_ethtool.c   |   9 +
 .../huawei/hinic3/hinic3_mgmt_interface.h     |   2 +
 .../net/ethernet/huawei/hinic3/hinic3_rss.c   | 539 +++++++++++++++++-
 .../net/ethernet/huawei/hinic3/hinic3_rss.h   |  19 +
 4 files changed, 567 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
index 11c8eb0f5d2a..78818de9a946 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
@@ -16,6 +16,7 @@
 #include "hinic3_hw_comm.h"
 #include "hinic3_nic_dev.h"
 #include "hinic3_nic_cfg.h"
+#include "hinic3_rss.h"
 
 #define HINIC3_MGMT_VERSION_MAX_LEN     32
 /* Coalesce time properties in microseconds */
@@ -1238,6 +1239,14 @@ static const struct ethtool_ops hinic3_ethtool_ops = {
 	.get_pause_stats                = hinic3_get_pause_stats,
 	.get_coalesce                   = hinic3_get_coalesce,
 	.set_coalesce                   = hinic3_set_coalesce,
+	.get_rxnfc                      = hinic3_get_rxnfc,
+	.set_rxnfc                      = hinic3_set_rxnfc,
+	.get_channels                   = hinic3_get_channels,
+	.set_channels                   = hinic3_set_channels,
+	.get_rxfh_indir_size            = hinic3_get_rxfh_indir_size,
+	.get_rxfh_key_size              = hinic3_get_rxfh_key_size,
+	.get_rxfh                       = hinic3_get_rxfh,
+	.set_rxfh                       = hinic3_set_rxfh,
 };
 
 void hinic3_set_ethtool_ops(struct net_device *netdev)
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_mgmt_interface.h b/drivers/net/ethernet/huawei/hinic3/hinic3_mgmt_interface.h
index 76c691f82703..3c1263ff99ff 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_mgmt_interface.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_mgmt_interface.h
@@ -282,6 +282,7 @@ enum l2nic_cmd {
 	L2NIC_CMD_SET_VLAN_FILTER_EN  = 26,
 	L2NIC_CMD_SET_RX_VLAN_OFFLOAD = 27,
 	L2NIC_CMD_CFG_RSS             = 60,
+	L2NIC_CMD_GET_RSS_CTX_TBL     = 62,
 	L2NIC_CMD_CFG_RSS_HASH_KEY    = 63,
 	L2NIC_CMD_CFG_RSS_HASH_ENGINE = 64,
 	L2NIC_CMD_SET_RSS_CTX_TBL     = 65,
@@ -301,6 +302,7 @@ enum l2nic_ucode_cmd {
 	L2NIC_UCODE_CMD_MODIFY_QUEUE_CTX  = 0,
 	L2NIC_UCODE_CMD_CLEAN_QUEUE_CTX   = 1,
 	L2NIC_UCODE_CMD_SET_RSS_INDIR_TBL = 4,
+	L2NIC_UCODE_CMD_GET_RSS_INDIR_TBL = 6,
 };
 
 /* hilink mac group command */
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_rss.c b/drivers/net/ethernet/huawei/hinic3/hinic3_rss.c
index 25db74d8c7dd..811a6b491e74 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_rss.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_rss.c
@@ -155,7 +155,7 @@ static int hinic3_set_rss_type(struct hinic3_hwdev *hwdev,
 				       L2NIC_CMD_SET_RSS_CTX_TBL, &msg_params);
 
 	if (ctx_tbl.msg_head.status == MGMT_STATUS_CMD_UNSUPPORTED) {
-		return MGMT_STATUS_CMD_UNSUPPORTED;
+		return -EOPNOTSUPP;
 	} else if (err || ctx_tbl.msg_head.status) {
 		dev_err(hwdev->dev, "mgmt Failed to set rss context offload, err: %d, status: 0x%x\n",
 			err, ctx_tbl.msg_head.status);
@@ -165,6 +165,41 @@ static int hinic3_set_rss_type(struct hinic3_hwdev *hwdev,
 	return 0;
 }
 
+static int hinic3_get_rss_type(struct hinic3_hwdev *hwdev,
+			       struct hinic3_rss_type *rss_type)
+{
+	struct l2nic_cmd_rss_ctx_tbl ctx_tbl = {};
+	struct mgmt_msg_params msg_params = {};
+	int err;
+
+	ctx_tbl.func_id = hinic3_global_func_id(hwdev);
+
+	mgmt_msg_params_init_default(&msg_params, &ctx_tbl, sizeof(ctx_tbl));
+
+	err = hinic3_send_mbox_to_mgmt(hwdev, MGMT_MOD_L2NIC,
+				       L2NIC_CMD_GET_RSS_CTX_TBL,
+				       &msg_params);
+	if (ctx_tbl.msg_head.status == MGMT_STATUS_CMD_UNSUPPORTED) {
+		return -EOPNOTSUPP;
+	} else if (err || ctx_tbl.msg_head.status) {
+		dev_err(hwdev->dev, "Failed to get hash type, err: %d, status: 0x%x\n",
+			err, ctx_tbl.msg_head.status);
+		return -EINVAL;
+	}
+
+	rss_type->ipv4         = L2NIC_RSS_TYPE_GET(ctx_tbl.context, IPV4);
+	rss_type->ipv6         = L2NIC_RSS_TYPE_GET(ctx_tbl.context, IPV6);
+	rss_type->ipv6_ext     = L2NIC_RSS_TYPE_GET(ctx_tbl.context, IPV6_EXT);
+	rss_type->tcp_ipv4     = L2NIC_RSS_TYPE_GET(ctx_tbl.context, TCP_IPV4);
+	rss_type->tcp_ipv6     = L2NIC_RSS_TYPE_GET(ctx_tbl.context, TCP_IPV6);
+	rss_type->tcp_ipv6_ext = L2NIC_RSS_TYPE_GET(ctx_tbl.context,
+						    TCP_IPV6_EXT);
+	rss_type->udp_ipv4     = L2NIC_RSS_TYPE_GET(ctx_tbl.context, UDP_IPV4);
+	rss_type->udp_ipv6     = L2NIC_RSS_TYPE_GET(ctx_tbl.context, UDP_IPV6);
+
+	return 0;
+}
+
 static int hinic3_rss_cfg_hash_type(struct hinic3_hwdev *hwdev, u8 opcode,
 				    enum hinic3_rss_hash_type *type)
 {
@@ -264,7 +299,8 @@ static int hinic3_set_hw_rss_parameters(struct net_device *netdev, u8 rss_en)
 	if (err)
 		return err;
 
-	hinic3_fillout_indir_tbl(netdev, nic_dev->rss_indir);
+	if (!netif_is_rxfh_configured(netdev))
+		hinic3_fillout_indir_tbl(netdev, nic_dev->rss_indir);
 
 	err = hinic3_config_rss_hw_resource(netdev, nic_dev->rss_indir);
 	if (err)
@@ -334,3 +370,502 @@ void hinic3_try_to_enable_rss(struct net_device *netdev)
 	clear_bit(HINIC3_RSS_ENABLE, &nic_dev->flags);
 	nic_dev->q_params.num_qps = nic_dev->max_qps;
 }
+
+static int hinic3_set_l4_rss_hash_ops(const struct ethtool_rxnfc *cmd,
+				      struct hinic3_rss_type *rss_type)
+{
+	u8 rss_l4_en;
+
+	switch (cmd->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)) {
+	case 0:
+		rss_l4_en = 0;
+		break;
+	case (RXH_L4_B_0_1 | RXH_L4_B_2_3):
+		rss_l4_en = 1;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	switch (cmd->flow_type) {
+	case TCP_V4_FLOW:
+		rss_type->tcp_ipv4 = rss_l4_en;
+		break;
+	case TCP_V6_FLOW:
+		rss_type->tcp_ipv6 = rss_l4_en;
+		break;
+	case UDP_V4_FLOW:
+		rss_type->udp_ipv4 = rss_l4_en;
+		break;
+	case UDP_V6_FLOW:
+		rss_type->udp_ipv6 = rss_l4_en;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int hinic3_update_rss_hash_opts(struct net_device *netdev,
+				       struct ethtool_rxnfc *cmd,
+				       struct hinic3_rss_type *rss_type)
+{
+	int err;
+
+	switch (cmd->flow_type) {
+	case TCP_V4_FLOW:
+	case TCP_V6_FLOW:
+	case UDP_V4_FLOW:
+	case UDP_V6_FLOW:
+		err = hinic3_set_l4_rss_hash_ops(cmd, rss_type);
+		if (err)
+			return err;
+
+		break;
+	case IPV4_FLOW:
+		rss_type->ipv4 = 1;
+		break;
+	case IPV6_FLOW:
+		rss_type->ipv6 = 1;
+		break;
+	default:
+		netdev_err(netdev, "Unsupported flow type\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int hinic3_set_rss_hash_opts(struct net_device *netdev,
+				    struct ethtool_rxnfc *cmd)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct hinic3_rss_type rss_type;
+	int err;
+
+	if (!test_bit(HINIC3_RSS_ENABLE, &nic_dev->flags)) {
+		cmd->data = 0;
+		netdev_err(netdev, "RSS is disable, not support to set flow-hash\n");
+		return -EOPNOTSUPP;
+	}
+
+	/* RSS only supports hashing of IP addresses and L4 ports */
+	if (cmd->data & ~(RXH_IP_SRC | RXH_IP_DST |
+			  RXH_L4_B_0_1 | RXH_L4_B_2_3))
+		return -EINVAL;
+
+	/* Both IP addresses must be part of the hash tuple */
+	if (!(cmd->data & RXH_IP_SRC) || !(cmd->data & RXH_IP_DST))
+		return -EINVAL;
+
+	/* L4 hash bits are not valid for pure L3 flow types */
+	if ((cmd->flow_type == IPV4_FLOW || cmd->flow_type == IPV6_FLOW) &&
+	    (cmd->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)))
+		return -EINVAL;
+
+	err = hinic3_get_rss_type(nic_dev->hwdev, &rss_type);
+	if (err) {
+		netdev_err(netdev, "Failed to get rss type\n");
+		return err;
+	}
+
+	err = hinic3_update_rss_hash_opts(netdev, cmd, &rss_type);
+	if (err)
+		return err;
+
+	err = hinic3_set_rss_type(nic_dev->hwdev, rss_type);
+	if (err) {
+		netdev_err(netdev, "Failed to set rss type\n");
+		return err;
+	}
+
+	nic_dev->rss_type = rss_type;
+
+	return 0;
+}
+
+static void convert_rss_l3_type(u8 rss_opt, struct ethtool_rxnfc *cmd)
+{
+	if (!rss_opt)
+		cmd->data &= ~(RXH_IP_SRC | RXH_IP_DST);
+}
+
+static void convert_rss_l4_type(u8 rss_opt, struct ethtool_rxnfc *cmd)
+{
+	if (rss_opt)
+		cmd->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+}
+
+static int hinic3_convert_rss_type(struct net_device *netdev,
+				   struct hinic3_rss_type *rss_type,
+				   struct ethtool_rxnfc *cmd)
+{
+	cmd->data = RXH_IP_SRC | RXH_IP_DST;
+	switch (cmd->flow_type) {
+	case TCP_V4_FLOW:
+		convert_rss_l4_type(rss_type->tcp_ipv4, cmd);
+		break;
+	case TCP_V6_FLOW:
+		convert_rss_l4_type(rss_type->tcp_ipv6, cmd);
+		break;
+	case UDP_V4_FLOW:
+		convert_rss_l4_type(rss_type->udp_ipv4, cmd);
+		break;
+	case UDP_V6_FLOW:
+		convert_rss_l4_type(rss_type->udp_ipv6, cmd);
+		break;
+	case IPV4_FLOW:
+		convert_rss_l3_type(rss_type->ipv4, cmd);
+		break;
+	case IPV6_FLOW:
+		convert_rss_l3_type(rss_type->ipv6, cmd);
+		break;
+	default:
+		netdev_err(netdev, "Unsupported flow type\n");
+		cmd->data = 0;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int hinic3_get_rss_hash_opts(struct net_device *netdev,
+				    struct ethtool_rxnfc *cmd)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct hinic3_rss_type rss_type;
+	int err;
+
+	cmd->data = 0;
+
+	if (!test_bit(HINIC3_RSS_ENABLE, &nic_dev->flags))
+		return 0;
+
+	err = hinic3_get_rss_type(nic_dev->hwdev, &rss_type);
+	if (err) {
+		netdev_err(netdev, "Failed to get rss type\n");
+		return err;
+	}
+
+	return hinic3_convert_rss_type(netdev, &rss_type, cmd);
+}
+
+int hinic3_get_rxnfc(struct net_device *netdev,
+		     struct ethtool_rxnfc *cmd, u32 *rule_locs)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	int err = 0;
+
+	switch (cmd->cmd) {
+	case ETHTOOL_GRXRINGS:
+		cmd->data = nic_dev->q_params.num_qps;
+		break;
+	case ETHTOOL_GRXFH:
+		err = hinic3_get_rss_hash_opts(netdev, cmd);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	return err;
+}
+
+int hinic3_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd)
+{
+	int err;
+
+	switch (cmd->cmd) {
+	case ETHTOOL_SRXFH:
+		err = hinic3_set_rss_hash_opts(netdev, cmd);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+		break;
+	}
+
+	return err;
+}
+
+static u16 hinic3_max_channels(struct net_device *netdev)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	u8 tcs = netdev_get_num_tc(netdev);
+
+	return tcs ? nic_dev->max_qps / tcs : nic_dev->max_qps;
+}
+
+static u16 hinic3_curr_channels(struct net_device *netdev)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+
+	if (netif_running(netdev))
+		return nic_dev->q_params.num_qps ?
+				nic_dev->q_params.num_qps : 1;
+	else
+		return min_t(u16, hinic3_max_channels(netdev),
+			     nic_dev->q_params.num_qps);
+}
+
+void hinic3_get_channels(struct net_device *netdev,
+			 struct ethtool_channels *channels)
+{
+	channels->max_rx = 0;
+	channels->max_tx = 0;
+	channels->max_other = 0;
+	/* only combined channels are reported: maximum supported count */
+	channels->max_combined = hinic3_max_channels(netdev);
+	channels->rx_count = 0;
+	channels->tx_count = 0;
+	channels->other_count = 0;
+	/* report the current combined channel count */
+	channels->combined_count = hinic3_curr_channels(netdev);
+}
+
+static int
+hinic3_validate_channel_parameter(struct net_device *netdev,
+				  const struct ethtool_channels *channels)
+{
+	u16 max_channel = hinic3_max_channels(netdev);
+	unsigned int count = channels->combined_count;
+
+	if (!count) {
+		netdev_err(netdev, "Unsupported combined_count=0\n");
+		return -EINVAL;
+	}
+
+	if (channels->tx_count || channels->rx_count || channels->other_count) {
+		netdev_err(netdev, "Setting rx/tx/other count not supported\n");
+		return -EINVAL;
+	}
+
+	if (count > max_channel) {
+		netdev_err(netdev, "Combined count %u exceed limit %u\n", count,
+			   max_channel);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int hinic3_rss_update_num_qps_and_reprogram(struct net_device *netdev)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	int err;
+
+	if (!netif_is_rxfh_configured(netdev))
+		hinic3_rss_set_indir_tbl(nic_dev->hwdev, nic_dev->rss_indir);
+
+	if (!netif_running(netdev))
+		return 0;
+
+	err = hinic3_set_hw_rss_parameters(netdev, 1);
+	if (err)
+		netdev_err(netdev,
+			   "Failed to update RSS parameters after changing channels\n");
+
+	return err;
+}
+
+/*
+ * Change the number of combined queue pairs from an ethtool_channels request.
+ *
+ * The request is validated first and then refused with -EOPNOTSUPP when
+ * HINIC3_RSS_ENABLE is not set. While the interface is running the new count
+ * goes through hinic3_change_channel_settings(); otherwise it is only stored
+ * in nic_dev->q_params. RSS is then refreshed through
+ * hinic3_rss_update_num_qps_and_reprogram().
+ *
+ * Return: 0 on success, otherwise the error code of the failing step.
+ */
+int hinic3_set_channels(struct net_device *netdev,
+			struct ethtool_channels *channels)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	unsigned int count = channels->combined_count;
+	struct hinic3_dyna_txrxq_params q_params;
+	int err;
+
+	err = hinic3_validate_channel_parameter(netdev, channels);
+	if (err)
+		return err;
+
+	if (!test_bit(HINIC3_RSS_ENABLE, &nic_dev->flags)) {
+		netdev_err(netdev, "This function doesn't support RSS, only support 1 queue pair\n");
+		return -EOPNOTSUPP;
+	}
+
+	netdev_dbg(netdev, "Set max combined queue number from %u to %u\n",
+		   nic_dev->q_params.num_qps, count);
+
+	if (netif_running(netdev)) {
+		/*
+		 * Start from a copy of the current parameters with the new
+		 * count and with the resource pointers cleared.
+		 */
+		q_params = nic_dev->q_params;
+		q_params.num_qps = (u16)count;
+		q_params.txqs_res = NULL;
+		q_params.rxqs_res = NULL;
+		q_params.irq_cfg = NULL;
+
+		err = hinic3_change_channel_settings(netdev, &q_params);
+		if (err) {
+			netdev_err(netdev, "Failed to change channel settings\n");
+			return err;
+		}
+	} else {
+		nic_dev->q_params.num_qps = (u16)count;
+	}
+
+	err = hinic3_rss_update_num_qps_and_reprogram(netdev);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+/* Return the RSS indirection table size, L2NIC_RSS_INDIR_SIZE. */
+u32 hinic3_get_rxfh_indir_size(struct net_device *netdev)
+{
+	return L2NIC_RSS_INDIR_SIZE;
+}
+
+/*
+ * Apply a new RSS indirection table and/or hash key.
+ *
+ * @indir: L2NIC_RSS_INDIR_SIZE entries, or NULL to leave the table unchanged.
+ * @key:   L2NIC_RSS_KEY_SIZE bytes, or NULL to leave the key unchanged.
+ *
+ * Each item is first passed to its hinic3_rss_set_*() helper and is cached
+ * in nic_dev only once that call has succeeded.
+ *
+ * Return: 0 on success, otherwise the error returned by the failing helper.
+ */
+static int hinic3_set_rss_rxfh(struct net_device *netdev,
+			       const u32 *indir, u8 *key)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	u16 temp_indir[L2NIC_RSS_INDIR_SIZE];
+	int err;
+	u32 i;
+
+	if (indir) {
+		/* narrow the u32 entries to the u16 form the helper takes */
+		for (i = 0; i < L2NIC_RSS_INDIR_SIZE; i++)
+			temp_indir[i] = (u16)indir[i];
+
+		err = hinic3_rss_set_indir_tbl(nic_dev->hwdev, temp_indir);
+		if (err) {
+			netdev_err(netdev, "Failed to set rss indir table\n");
+			return err;
+		}
+
+		/*
+		 * Entries are u16, so the copy length must be the table size
+		 * in bytes, not the number of entries.
+		 */
+		memcpy(nic_dev->rss_indir, temp_indir, sizeof(temp_indir));
+	}
+
+	if (key) {
+		err = hinic3_rss_set_hash_key(nic_dev->hwdev, key);
+		if (err) {
+			netdev_err(netdev, "Failed to set rss key\n");
+			return err;
+		}
+
+		memcpy(nic_dev->rss_hkey, key, L2NIC_RSS_KEY_SIZE);
+	}
+
+	return 0;
+}
+
+/* Return the RSS hash key size, L2NIC_RSS_KEY_SIZE. */
+u32 hinic3_get_rxfh_key_size(struct net_device *netdev)
+{
+	return L2NIC_RSS_KEY_SIZE;
+}
+
+/*
+ * Fetch the RSS indirection table with an
+ * L2NIC_UCODE_CMD_GET_RSS_INDIR_TBL command.
+ *
+ * @indir_table: output; receives L2NIC_RSS_INDIR_SIZE entries, each converted
+ *               from a little-endian 16-bit value in the response buffer.
+ *
+ * Return: 0 on success, otherwise the error from hinic3_cmd_buf_pair_init()
+ * or hinic3_cmdq_detail_resp().
+ */
+static int hinic3_rss_get_indir_tbl(struct hinic3_hwdev *hwdev,
+				    u32 *indir_table)
+{
+	struct hinic3_cmd_buf_pair pair;
+	__le16 *indir_tbl = NULL;
+	int err, i;
+
+	err = hinic3_cmd_buf_pair_init(hwdev, &pair);
+	if (err) {
+		dev_err(hwdev->dev, "Failed to allocate cmd_buf.\n");
+		return err;
+	}
+
+	/* the request buffer is sent zero-filled */
+	memset(pair.in->buf, 0, le16_to_cpu(pair.in->size));
+
+	err = hinic3_cmdq_detail_resp(hwdev, MGMT_MOD_L2NIC,
+				      L2NIC_UCODE_CMD_GET_RSS_INDIR_TBL,
+				      pair.in, pair.out, NULL);
+	if (err) {
+		dev_err(hwdev->dev, "Failed to get rss indir table\n");
+		goto err_get_indir_tbl;
+	}
+
+	indir_tbl = (__force __le16 *)pair.out->buf;
+	for (i = 0; i < L2NIC_RSS_INDIR_SIZE; i++)
+		indir_table[i] = le16_to_cpu(*(indir_tbl + i));
+
+	/* also reached on success: the buffer pair is always released */
+err_get_indir_tbl:
+	hinic3_cmd_buf_pair_uninit(hwdev, &pair);
+
+	return err;
+}
+
+/*
+ * Report the RSS hash function and, when requested, the indirection table
+ * and hash key.
+ *
+ * rxfh->indir is filled by hinic3_rss_get_indir_tbl(); rxfh->key is copied
+ * from the cached nic_dev->rss_hkey.
+ *
+ * Return: 0 on success, -EOPNOTSUPP when HINIC3_RSS_ENABLE is not set, or
+ * the error from hinic3_rss_get_indir_tbl().
+ */
+int hinic3_get_rxfh(struct net_device *netdev,
+		    struct ethtool_rxfh_param *rxfh)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	int err = 0;
+
+	if (!test_bit(HINIC3_RSS_ENABLE, &nic_dev->flags)) {
+		netdev_err(netdev, "Rss is disabled\n");
+		return -EOPNOTSUPP;
+	}
+
+	/* any cached type other than XOR is reported as ETH_RSS_HASH_TOP */
+	rxfh->hfunc =
+		nic_dev->rss_hash_type == HINIC3_RSS_HASH_ENGINE_TYPE_XOR ?
+		ETH_RSS_HASH_XOR : ETH_RSS_HASH_TOP;
+
+	if (rxfh->indir) {
+		err = hinic3_rss_get_indir_tbl(nic_dev->hwdev, rxfh->indir);
+		if (err)
+			return err;
+	}
+
+	if (rxfh->key)
+		memcpy(rxfh->key, nic_dev->rss_hkey, L2NIC_RSS_KEY_SIZE);
+
+	return err;
+}
+
+/*
+ * Switch the RSS hash engine to match an ethtool hash function.
+ *
+ * @hfunc: ETH_RSS_HASH_NO_CHANGE, ETH_RSS_HASH_XOR or ETH_RSS_HASH_TOP.
+ *
+ * Nothing is sent when @hfunc asks for no change or already matches the
+ * cached type. Otherwise the new type is passed to
+ * hinic3_rss_set_hash_type() and cached only once that call has succeeded.
+ *
+ * Return: 0 on success, -EOPNOTSUPP for any other @hfunc, or the error from
+ * hinic3_rss_set_hash_type().
+ */
+static int hinic3_update_hash_func_type(struct net_device *netdev, u8 hfunc)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	enum hinic3_rss_hash_type new_rss_hash_type;
+	int err;
+
+	switch (hfunc) {
+	case ETH_RSS_HASH_NO_CHANGE:
+		return 0;
+	case ETH_RSS_HASH_XOR:
+		new_rss_hash_type = HINIC3_RSS_HASH_ENGINE_TYPE_XOR;
+		break;
+	case ETH_RSS_HASH_TOP:
+		new_rss_hash_type = HINIC3_RSS_HASH_ENGINE_TYPE_TOEP;
+		break;
+	default:
+		netdev_err(netdev, "Unsupported hash func %u\n", hfunc);
+		return -EOPNOTSUPP;
+	}
+
+	if (new_rss_hash_type == nic_dev->rss_hash_type)
+		return 0;
+
+	/* send the newly requested type, not the one currently cached */
+	err = hinic3_rss_set_hash_type(nic_dev->hwdev, new_rss_hash_type);
+	if (err) {
+		netdev_err(netdev, "Failed to set RSS hash type to HW\n");
+		return err;
+	}
+
+	nic_dev->rss_hash_type = new_rss_hash_type;
+
+	return 0;
+}
+
+/*
+ * Update the RSS hash function, indirection table and hash key.
+ *
+ * The hash function is handled first; the table and key follow only if that
+ * step succeeds. @extack is not used.
+ *
+ * Return: 0 on success, -EOPNOTSUPP when HINIC3_RSS_ENABLE is not set, or
+ * the error from the failing step.
+ */
+int hinic3_set_rxfh(struct net_device *netdev,
+		    struct ethtool_rxfh_param *rxfh,
+		    struct netlink_ext_ack *extack)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	int err;
+
+	if (!test_bit(HINIC3_RSS_ENABLE, &nic_dev->flags)) {
+		netdev_err(netdev, "Not support to set rss parameters when rss is disable\n");
+		return -EOPNOTSUPP;
+	}
+
+	err = hinic3_update_hash_func_type(netdev, rxfh->hfunc);
+	if (err)
+		return err;
+
+	err = hinic3_set_rss_rxfh(netdev, rxfh->indir, rxfh->key);
+
+	return err;
+}
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_rss.h b/drivers/net/ethernet/huawei/hinic3/hinic3_rss.h
index 78d82c2aca06..9f1b77780cd4 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_rss.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_rss.h
@@ -5,10 +5,29 @@
 #define _HINIC3_RSS_H_
 
 #include <linux/netdevice.h>
+#include <linux/ethtool.h>
 
 int hinic3_rss_init(struct net_device *netdev);
 void hinic3_rss_uninit(struct net_device *netdev);
 void hinic3_try_to_enable_rss(struct net_device *netdev);
 void hinic3_clear_rss_config(struct net_device *netdev);
 
+int hinic3_get_rxnfc(struct net_device *netdev,
+		     struct ethtool_rxnfc *cmd, u32 *rule_locs);
+int hinic3_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd);
+
+void hinic3_get_channels(struct net_device *netdev,
+			 struct ethtool_channels *channels);
+int hinic3_set_channels(struct net_device *netdev,
+			struct ethtool_channels *channels);
+
+u32 hinic3_get_rxfh_indir_size(struct net_device *netdev);
+u32 hinic3_get_rxfh_key_size(struct net_device *netdev);
+
+int hinic3_get_rxfh(struct net_device *netdev,
+		    struct ethtool_rxfh_param *rxfh);
+int hinic3_set_rxfh(struct net_device *netdev,
+		    struct ethtool_rxfh_param *rxfh,
+		    struct netlink_ext_ack *extack);
+
 #endif
-- 
2.43.0


^ permalink raw reply related

* Re: [RFC PATCH v1 00/13] exec: add spawn templates for repeated executable startup
From: Christian Brauner @ 2026-06-10  7:28 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Li Chen, Kees Cook, Alexander Viro, linux-fsdevel, linux-api,
	linux-kernel, linux-mm, linux-arch, linux-doc, linux-kselftest,
	x86, Arnd Bergmann, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, H. Peter Anvin, Jan Kara, Jonathan Corbet,
	Shuah Khan
In-Reply-To: <CALCETrWJQpLR4n1cpichBk8=uExSKLWTMGU3BufGdk_WE_p5UA@mail.gmail.com>

On Mon, Jun 08, 2026 at 05:01:57PM -0700, Andy Lutomirski wrote:
> On Thu, May 28, 2026 at 4:05 AM Christian Brauner <brauner@kernel.org> wrote:
> >
> > On Thu, May 28, 2026 at 05:52:21PM +0800, Li Chen wrote:
> > > Hi,
> > >
> > > This is an early RFC for an idea that is probably still rough in both the
> > > UAPI and implementation details. Sorry for the rough edges; I am sending
> > > it now to check whether this direction is worth pursuing and to get
> > > feedback on the kernel/userspace boundary.
> >
> > The idea of having a builder api for exec isn't all that crazy. But it
> > should simply be built on top of pidfds and thus pidfs itself instead.
> > It has all the basic infrastructure in place already. Any implementation
> > should also allow userspace to implement posix_spawn() on top of it.
> >
> > fd = pidfd_open(0, PIDFD_EMPTY /* or better name */)
> >
> > pidfd_config(fd, ...) // modeled similar to fsconfig()
> >
> 
> After contemplating this for a bit... why pidfd?  Doesn't a pidfd
> refer to an actual process that is, or at least was, running?  This
> new thing is a process that we are contemplating spawning.  I can
> imagine that basically all pidfd APIs would be a bit confused by the
> nonexistence of the process in question.

I don't think that would be a problem because every api just needs to
handle ESRCH. Ignoring that for a second: the mount api has a builder fd
that is later transformed into a pidfd. Which is easily doable here as
well. My point is that all the infrastructure building blocks already
exist in pidfs.

^ permalink raw reply

* [PATCH v4 2/2] cpu/hotplug: Fix NULL kobject warning in cpuhp_smt_enable()
From: Jinjie Ruan @ 2026-06-10  7:52 UTC (permalink / raw)
  To: catalin.marinas, will, corbet, skhan, punit.agrawal, ruanjinjie,
	mrigendra.chaubey, suzuki.poulose, chenl311, fengchengwen, maz,
	timothy.hayes, lpieralisi, arnd, gshan, jic23, dietmar.eggemann,
	sudeep.holla, pierre.gondois, linux-arm-kernel, linux-doc,
	linux-kernel
In-Reply-To: <20260610075202.3597031-1-ruanjinjie@huawei.com>

On arm64, when booting with `maxcpus` greater than the number of present
CPUs (e.g., QEMU -smp cpus=4,maxcpus=8), some CPUs are marked as 'present'
but have not yet been registered via register_cpu(). Consequently,
the per-cpu device objects for these CPUs are not yet initialized.

In cpuhp_smt_enable(), the code iterates over all present CPUs. Calling
_cpu_up() for these unregistered CPUs eventually leads to
sysfs_create_group() being called with a NULL kobject (or a kobject
without a directory), triggering the following warning in
fs/sysfs/group.c:

	if (WARN_ON(!kobj || (!update && !kobj->sd)))
		return -EINVAL;

When booting with ACPI, arm64 smp_prepare_cpus() currently sets all
enumerated CPUs as "present" regardless of their status in the MADT. This
causes issues with SMT hotplug control. For instance, with QEMU's
"-smp 4,maxcpus=8" configuration, the MADT GICC entries are populated as
follows:

1. The first four CPUs: `Enabled` set but `Online Capable` not set.

2. The remaining four CPUs: `Online Capable` set but `Enabled` not set
   to support potential hot-plugging.

Fix this by:

1. When booting with ACPI, checking the ACPI_MADT_ENABLED flag in the GICC
   entry before calling set_cpu_present() during SMP initialization.

2. Properly managing the present mask in acpi_map_cpu() and
   acpi_unmap_cpu() to support actual CPU hotplug events. This aligns with
   other architectures like x86 and LoongArch.

3. Update the arm64 CPU hotplug documentation to no longer state that all
   online-capable vCPUs are marked as present by the kernel at boot time.

This ensures that only physically available or explicitly enabled CPUs
are in the present mask, keeping the SMT control logic consistent with
the actual hardware state.

How to reproduce:

	1. echo off > /sys/devices/system/cpu/smt/control
		psci: CPU1 killed (polled 0 ms)
		psci: CPU3 killed (polled 0 ms)

	2. echo 2 > /sys/devices/system/cpu/smt/control

	Detected PIPT I-cache on CPU1
	GICv3: CPU1: found redistributor 1 region 0:0x00000000080c0000
	CPU1: Booted secondary processor 0x0000000001 [0x410fd082]
	Detected PIPT I-cache on CPU3
	GICv3: CPU3: found redistributor 3 region 0:0x0000000008100000
	CPU3: Booted secondary processor 0x0000000003 [0x410fd082]
	------------[ cut here ]------------
	WARNING: fs/sysfs/group.c:137 at internal_create_group+0x41c/0x4bc, CPU#2: sh/181
	Modules linked in:
	CPU: 2 UID: 0 PID: 181 Comm: sh Not tainted 7.0.0-rc1-00010-g8d13386c7624 #142 PREEMPT
	Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
	pstate: 20000005 (nzCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
	pc : internal_create_group+0x41c/0x4bc
	lr : sysfs_create_group+0x18/0x24
	sp : ffff80008078ba40
	x29: ffff80008078ba40 x28: ffff296c980ad000 x27: ffff00007fb94128
	x26: 0000000000000054 x25: ffffd693e845f3f0 x24: 0000000000000001
	x23: 0000000000000001 x22: 0000000000000004 x21: 0000000000000000
	x20: ffffd693e845fc10 x19: 0000000000000004 x18: 00000000ffffffff
	x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
	x14: 0000000000000358 x13: 0000000000000007 x12: 0000000000000350
	x11: 0000000000000008 x10: 0000000000000407 x9 : 0000000000000400
	x8 : ffff00007fbf3b60 x7 : 0000000000000000 x6 : ffffd693e845f3f0
	x5 : ffff00007fb94128 x4 : 0000000000000000 x3 : ffff000000f4eac0
	x2 : ffffd693e7095a08 x1 : 0000000000000000 x0 : 0000000000000000
	Call trace:
	 internal_create_group+0x41c/0x4bc (P)
	 sysfs_create_group+0x18/0x24
	 topology_add_dev+0x1c/0x28
	 cpuhp_invoke_callback+0x104/0x20c
	 __cpuhp_invoke_callback_range+0x94/0x11c
	 _cpu_up+0x200/0x37c
	 cpuhp_smt_enable+0xbc/0x114
	 control_store+0xe8/0x1d4
	 dev_attr_store+0x18/0x2c
	 sysfs_kf_write+0x7c/0x94
	 kernfs_fop_write_iter+0x128/0x1b8
	 vfs_write+0x2b0/0x354
	 ksys_write+0x68/0xfc
	 __arm64_sys_write+0x1c/0x28
	 invoke_syscall+0x48/0x10c
	 el0_svc_common.constprop.0+0x40/0xe8
	 do_el0_svc+0x20/0x2c
	 el0_svc+0x34/0x124
	 el0t_64_sync_handler+0xa0/0xe4
	 el0t_64_sync+0x198/0x19c
	---[ end trace 0000000000000000 ]---

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Jonathan Cameron <jic23@kernel.org>
Cc: James Morse <james.morse@arm.com>
Cc: Yicong Yang <yangyicong@hisilicon.com>
Cc: stable@vger.kernel.org
Link: https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html#gic-cpu-interface-gicc-structure
Fixes: eed4583bcf9a6 ("arm64: Kconfig: Enable HOTPLUG_SMT")
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Suggested-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Jinjie Ruan <ruanjinjie@huawei.com>
---
v4:
- Update the arm64 cpu-hotplug documentation to make it clearer.
v3:
- Update the arm64 cpu-hotplug documentation as Catalin suggested.
- Update the commit message.
v2:
- Update the fix way.
---
 Documentation/arch/arm64/cpu-hotplug.rst | 28 ++++++++++++++----------
 arch/arm64/kernel/acpi.c                 |  2 ++
 arch/arm64/kernel/smp.c                  | 12 +++++++++-
 3 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/Documentation/arch/arm64/cpu-hotplug.rst b/Documentation/arch/arm64/cpu-hotplug.rst
index 8fb438bf7781..7c3379b704aa 100644
--- a/Documentation/arch/arm64/cpu-hotplug.rst
+++ b/Documentation/arch/arm64/cpu-hotplug.rst
@@ -47,11 +47,12 @@ ever have can be described at boot. There are no power-domain considerations
 as such devices are emulated.
 
 CPU Hotplug on virtual systems is supported. It is distinct from physical
-CPU Hotplug as all resources are described as ``present``, but CPUs may be
-marked as disabled by firmware. Only the CPU's online/offline behaviour is
-influenced by firmware. An example is where a virtual machine boots with a
-single CPU, and additional CPUs are added once a cloud orchestrator deploys
-the workload.
+CPU Hotplug as all vCPU resources are statically described in the firmware
+configuration tables (e.g. MADT), meaning their maximum possible count is
+known at boot. However, vCPUs that are not enabled at boot are not marked
+as ``present`` by the kernel until they are hotplugged. An example is where
+a virtual machine boots with a single CPU, and additional CPUs are added
+once a cloud orchestrator deploys the workload.
 
 For a virtual machine, the VMM (e.g. Qemu) plays the part of firmware.
 
@@ -60,16 +61,19 @@ brought online. Firmware can enforce its policy via PSCI's return codes. e.g.
 ``DENIED``.
 
 The ACPI tables must describe all the resources of the virtual machine. CPUs
-that firmware wishes to disable either from boot (or later) should not be
-``enabled`` in the MADT GICC structures, but should have the ``online capable``
-bit set, to indicate they can be enabled later. The boot CPU must be marked as
-``enabled``.  The 'always on' GICR structure must be used to describe the
-redistributors.
+that are hot-pluggable must have the ``online capable`` bit set and the
+``enabled`` bit cleared in the MADT GICC structures to indicate they can be
+enabled later. The boot CPU must be marked as ``enabled`` with its
+``online capable`` bit cleared. The 'always on' GICR structure must be used
+to describe the redistributors.
 
 CPUs described as ``online capable`` but not ``enabled`` can be set to enabled
 by the DSDT's Processor object's _STA method. On virtual systems the _STA method
-must always report the CPU as ``present``. Changes to the firmware policy can
-be notified to the OS via device-check or eject-request.
+must always set the ``ACPI_STA_DEVICE_PRESENT`` bit, while toggling the
+``ACPI_STA_DEVICE_ENABLED`` bit to reflect its plug status. The kernel will
+then dynamically mark the vCPU as ``present`` within the OS when the
+``ACPI_STA_DEVICE_ENABLED`` bit becomes set during hot-add. Changes to the
+firmware policy can be notified to the OS via device-check or eject-request.
 
 CPUs described as ``enabled`` in the static table, should not have their _STA
 modified dynamically by firmware. Soft-restart features such as kexec will
diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index 5891f92c2035..681aa2bbc399 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -448,12 +448,14 @@ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 apci_id,
 		return *pcpu;
 	}
 
+	set_cpu_present(*pcpu, true);
 	return 0;
 }
 EXPORT_SYMBOL(acpi_map_cpu);
 
 int acpi_unmap_cpu(int cpu)
 {
+	set_cpu_present(cpu, false);
 	return 0;
 }
 EXPORT_SYMBOL(acpi_unmap_cpu);
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 543d3459f6e3..24f8448e1fbb 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -556,6 +556,11 @@ struct acpi_madt_generic_interrupt *acpi_cpu_get_madt_gicc(int cpu)
 }
 EXPORT_SYMBOL_GPL(acpi_cpu_get_madt_gicc);
 
+static bool acpi_cpu_is_present(int cpu)
+{
+	return acpi_cpu_get_madt_gicc(cpu)->flags & ACPI_MADT_ENABLED;
+}
+
 /*
  * acpi_map_gic_cpu_interface - parse processor MADT entry
  *
@@ -660,6 +665,10 @@ static void __init acpi_parse_and_init_cpus(void)
 		early_map_cpu_to_node(i, acpi_numa_get_nid(i));
 }
 #else
+static bool acpi_cpu_is_present(int cpu)
+{
+	return false;
+}
 #define acpi_parse_and_init_cpus(...)	do { } while (0)
 #endif
 
@@ -798,7 +807,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 		if (err)
 			continue;
 
-		set_cpu_present(cpu, true);
+		if (acpi_disabled || acpi_cpu_is_present(cpu))
+			set_cpu_present(cpu, true);
 		numa_store_cpu_info(cpu);
 	}
 }
-- 
2.34.1


^ permalink raw reply related

* [PATCH v4 1/2] arm64: smp: Fix hot-unplug tearing by forcing unregistration
From: Jinjie Ruan @ 2026-06-10  7:52 UTC (permalink / raw)
  To: catalin.marinas, will, corbet, skhan, punit.agrawal, ruanjinjie,
	mrigendra.chaubey, suzuki.poulose, chenl311, fengchengwen, maz,
	timothy.hayes, lpieralisi, arnd, gshan, jic23, dietmar.eggemann,
	sudeep.holla, pierre.gondois, linux-arm-kernel, linux-doc,
	linux-kernel
In-Reply-To: <20260610075202.3597031-1-ruanjinjie@huawei.com>

Sashiko review pointed out the following issue[1].

Commit eba4675008a6 ("arm64: arch_register_cpu() variant to check if
an ACPI handle is now available.") introduced architectural safety
blocks inside arch_unregister_cpu(). If a hot-unplug operation is
determined to be a physical hardware removal (where _STA evaluates to
!ACPI_STA_DEVICE_PRESENT), or if firmware evaluation fails, it aborts
the unregistration transaction early to protect unreadied arm64
infrastructure.

However, returning early from arch_unregister_cpu() causes a catastrophic
state tearing because the generic ACPI layer (acpi_processor_post_eject())
unconditionally continues its cleanup flow. This leaves the stale sysfs
device leaked in the memory, deadlocking any subsequent hot-add attempts
on the same CPU.

Fix it by simplifying arch_unregister_cpu() to always proceed with
the unregistration, as a pr_err_once() warning is sufficient to make
it more visible for currently not supported physical CPU removal.
Also remove the redundant NULL check on acpi_handle as it cannot be
NULL when calling arch_unregister_cpu().

[1]: https://sashiko.dev/#/patchset/20260520022023.126670-1-ruanjinjie@huawei.com
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Jonathan Cameron <jic23@kernel.org>
Cc: James Morse <james.morse@arm.com>
Cc: stable@vger.kernel.org
Fixes: eba4675008a6e ("arm64: arch_register_cpu() variant to check if an ACPI handle is now available.")
Suggested-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Jinjie Ruan <ruanjinjie@huawei.com>
---
 arch/arm64/kernel/smp.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 1aa324104afb..543d3459f6e3 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -535,23 +535,13 @@ void arch_unregister_cpu(int cpu)
 {
 	acpi_handle acpi_handle = acpi_get_processor_handle(cpu);
 	struct cpu *c = &per_cpu(cpu_devices, cpu);
-	acpi_status status;
 	unsigned long long sta;
-
-	if (!acpi_handle) {
-		pr_err_once("Removing a CPU without associated ACPI handle\n");
-		return;
-	}
+	acpi_status status;
 
 	status = acpi_evaluate_integer(acpi_handle, "_STA", NULL, &sta);
-	if (ACPI_FAILURE(status))
-		return;
-
-	/* For now do not allow anything that looks like physical CPU HP */
-	if (cpu_present(cpu) && !(sta & ACPI_STA_DEVICE_PRESENT)) {
+	if (!ACPI_FAILURE(status) &&
+	    cpu_present(cpu) && !(sta & ACPI_STA_DEVICE_PRESENT))
 		pr_err_once("Changing CPU present bit is not supported\n");
-		return;
-	}
 
 	unregister_cpu(c);
 }
-- 
2.34.1


^ permalink raw reply related

* [PATCH v4 0/2] arm64: acpi: Fix NULL kobject warning in cpuhp_smt_enable()
From: Jinjie Ruan @ 2026-06-10  7:52 UTC (permalink / raw)
  To: catalin.marinas, will, corbet, skhan, punit.agrawal, ruanjinjie,
	mrigendra.chaubey, suzuki.poulose, chenl311, fengchengwen, maz,
	timothy.hayes, lpieralisi, arnd, gshan, jic23, dietmar.eggemann,
	sudeep.holla, pierre.gondois, linux-arm-kernel, linux-doc,
	linux-kernel

Fix NULL kobject warning in cpuhp_smt_enable().

Change in v4:
- Fix hot-unplug tearing as Sashiko AI code review pointed out
  by forcing unregistration.
- Update the arm64 hotplug doc.
- Update the commit message.

Jinjie Ruan (2):
  arm64: smp: Fix hot-unplug tearing by forcing unregistration
  cpu/hotplug: Fix NULL kobject warning in cpuhp_smt_enable()

 Documentation/arch/arm64/cpu-hotplug.rst | 28 ++++++++++++++----------
 arch/arm64/kernel/acpi.c                 |  2 ++
 arch/arm64/kernel/smp.c                  | 28 ++++++++++++------------
 3 files changed, 32 insertions(+), 26 deletions(-)

-- 
2.34.1


^ permalink raw reply

* Re: [PATCH] hwmon: (pmbus/max34440): add support adpm12250
From: Nuno Sá @ 2026-06-10  8:23 UTC (permalink / raw)
  To: Alexis Czezar Torreno, Guenter Roeck, Jonathan Corbet, Shuah Khan
  Cc: linux-hwmon, linux-doc, linux-kernel
In-Reply-To: <20260610-dev-adpm12250-v1-1-422760bb80da@analog.com>

On Wed, 2026-06-10 at 09:12 +0800, Alexis Czezar Torreno wrote:
> ADPM12250 is a quarter brick DC/DC Power Module. It is a high power
> non-isolated converter capable of delivering regulated 12V with
> continuous power level of 2500W. Uses PMBus.
> 
> Signed-off-by: Alexis Czezar Torreno <alexisczezar.torreno@analog.com>
> ---
> ADPM12250 is a quarter brick DC/DC Power Module. It is a high power
> non-isolated converter capable of delivering regulated 12V with continuous
> power level of 2500W. Uses PMBus.
> ---

Reviewed-by: Nuno Sá <nuno.sa@analog.com>

>  Documentation/hwmon/max34440.rst | 27 ++++++++++++++++--------
>  drivers/hwmon/pmbus/max34440.c   | 45 +++++++++++++++++++++++++++++++++++++---
>  2 files changed, 60 insertions(+), 12 deletions(-)
> 
> diff --git a/Documentation/hwmon/max34440.rst b/Documentation/hwmon/max34440.rst
> index
> d6d4fbc863d96c1008a1971d3e3245d9ce1ef688..e7421f4dbf38fc1436bbaeba71d4461a00f8cefb
> 100644
> --- a/Documentation/hwmon/max34440.rst
> +++ b/Documentation/hwmon/max34440.rst
> @@ -19,6 +19,14 @@ Supported chips:
>  
>      Datasheet: -
>  
> +  * ADI ADPM12250
> +
> +    Prefixes: 'adpm12250'
> +
> +    Addresses scanned: -
> +
> +    Datasheet: -
> +
>    * Maxim MAX34440
>  
>      Prefixes: 'max34440'
> @@ -87,11 +95,11 @@ This driver supports multiple devices: hardware monitoring for
> Maxim MAX34440
>  PMBus 6-Channel Power-Supply Manager, MAX34441 PMBus 5-Channel Power-Supply
>  Manager and Intelligent Fan Controller, and MAX34446 PMBus Power-Supply Data
>  Logger; PMBus Voltage Monitor and Sequencers for MAX34451, MAX34460, and
> -MAX34461; PMBus DC/DC Power Module ADPM12160, and ADPM12200. The MAX34451
> -supports monitoring voltage or current of 12 channels based on GIN pins. The
> -MAX34460 supports 12 voltage channels, and the MAX34461 supports 16 voltage
> -channels. The ADPM12160, and ADPM12200 also monitors both input and output
> -of voltage and current.
> +MAX34461; PMBus DC/DC Power Module ADPM12160, ADPM12200, and ADPM12250. The
> +MAX34451 supports monitoring voltage or current of 12 channels based on GIN
> +pins. The MAX34460 supports 12 voltage channels, and the MAX34461 supports 16
> +voltage channels. The ADPM12160, ADPM12200, and ADPM12250 also monitors both
> +input and output of voltage and current.
>  
>  The driver is a client driver to the core PMBus driver. Please see
>  Documentation/hwmon/pmbus.rst for details on PMBus client drivers.
> @@ -149,7 +157,7 @@ in[1-6]_reset_history	Write any value to reset history.
>  .. note::
>  
>      - MAX34446 only supports in[1-4].
> -    - ADPM12160, and ADPM12200 only supports in[1-2]. Label is "vin1"
> +    - ADPM12160, ADPM12200, and ADPM12250 only supports in[1-2]. Label is "vin1"
>        and "vout1" respectively.
>  
>  Curr
> @@ -172,8 +180,9 @@ curr[1-6]_reset_history	Write any value to reset history.
>  
>      - in6 and curr6 attributes only exist for MAX34440.
>      - MAX34446 only supports curr[1-4].
> -    - For ADPM12160, and ADPM12200, curr[1] is "iin1" and curr[2-6]
> -      are "iout[1-5]".
> +    - For ADPM12160, ADPM12200, and ADPM12250, curr[1] is "iin1"
> +    - For ADPM12160, and ADPM12200 curr[2-6] are "iout[1-5]".
> +    - For ADPM12250, curr[2-4] are "iout[1-3]".
>  
>  Power
>  ~~~~~
> @@ -209,7 +218,7 @@ temp[1-8]_reset_history	Write any value to reset history.
>  .. note::
>     - temp7 and temp8 attributes only exist for MAX34440.
>     - MAX34446 only supports temp[1-3].
> -   - ADPM12160, and ADPM12200 only supports temp[1].
> +   - ADPM12160, ADPM12200, and ADPM12250 only supports temp[1].
>  
>  
>  .. note::
> diff --git a/drivers/hwmon/pmbus/max34440.c b/drivers/hwmon/pmbus/max34440.c
> index
> 4525b9fc56267479534251a1444aa09181615ac6..74876d2207fbe4014b8b54a9fd9682370fc3bbed
> 100644
> --- a/drivers/hwmon/pmbus/max34440.c
> +++ b/drivers/hwmon/pmbus/max34440.c
> @@ -18,6 +18,7 @@
>  enum chips {
>  	adpm12160,
>  	adpm12200,
> +	adpm12250,
>  	max34440,
>  	max34441,
>  	max34446,
> @@ -97,7 +98,8 @@ static int max34440_read_word_data(struct i2c_client *client, int
> page,
>  		break;
>  	case PMBUS_VIRT_READ_IOUT_AVG:
>  		if (data->id != max34446 && data->id != max34451 &&
> -		    data->id != adpm12160 && data->id != adpm12200)
> +		    data->id != adpm12160 && data->id != adpm12200 &&
> +		    data->id != adpm12250)
>  			return -ENXIO;
>  		ret = pmbus_read_word_data(client, page, phase,
>  					   MAX34446_MFR_IOUT_AVG);
> @@ -182,7 +184,8 @@ static int max34440_write_word_data(struct i2c_client *client,
> int page,
>  		ret = pmbus_write_word_data(client, page,
>  					    MAX34440_MFR_IOUT_PEAK, 0);
>  		if (!ret && (data->id == max34446 || data->id == max34451 ||
> -			     data->id == adpm12160 || data->id == adpm12200))
> +			     data->id == adpm12160 || data->id == adpm12200 ||
> +			     data->id == adpm12250))
>  			ret = pmbus_write_word_data(client, page,
>  					MAX34446_MFR_IOUT_AVG, 0);
>  
> @@ -399,6 +402,40 @@ static struct pmbus_driver_info max34440_info[] = {
>  		.read_word_data = max34440_read_word_data,
>  		.write_word_data = max34440_write_word_data,
>  	},
> +	[adpm12250] = {
> +		.pages = 19,
> +		.format[PSC_VOLTAGE_IN] = direct,
> +		.format[PSC_VOLTAGE_OUT] = direct,
> +		.format[PSC_CURRENT_IN] = direct,
> +		.format[PSC_CURRENT_OUT] = direct,
> +		.format[PSC_TEMPERATURE] = direct,
> +		.m[PSC_VOLTAGE_IN] = 125,
> +		.b[PSC_VOLTAGE_IN] = 0,
> +		.R[PSC_VOLTAGE_IN] = 0,
> +		.m[PSC_VOLTAGE_OUT] = 125,
> +		.b[PSC_VOLTAGE_OUT] = 0,
> +		.R[PSC_VOLTAGE_OUT] = 0,
> +		.m[PSC_CURRENT_IN] = 250,
> +		.b[PSC_CURRENT_IN] = 0,
> +		.R[PSC_CURRENT_IN] = -1,
> +		.m[PSC_CURRENT_OUT] = 250,
> +		.b[PSC_CURRENT_OUT] = 0,
> +		.R[PSC_CURRENT_OUT] = -1,
> +		.m[PSC_TEMPERATURE] = 1,
> +		.b[PSC_TEMPERATURE] = 0,
> +		.R[PSC_TEMPERATURE] = 2,
> +		/* absent func below [18] are not for monitoring */
> +		.func[2] = PMBUS_HAVE_VOUT | PMBUS_HAVE_STATUS_VOUT,
> +		.func[4] = PMBUS_HAVE_STATUS_IOUT,
> +		.func[5] = PMBUS_HAVE_IOUT | PMBUS_HAVE_STATUS_IOUT,
> +		.func[6] = PMBUS_HAVE_IOUT | PMBUS_HAVE_STATUS_IOUT,
> +		.func[9] = PMBUS_HAVE_VIN | PMBUS_HAVE_STATUS_INPUT,
> +		.func[10] = PMBUS_HAVE_IIN | PMBUS_HAVE_STATUS_INPUT,
> +		.func[14] = PMBUS_HAVE_IOUT,
> +		.func[18] = PMBUS_HAVE_TEMP | PMBUS_HAVE_STATUS_TEMP,
> +		.read_word_data = max34440_read_word_data,
> +		.write_word_data = max34440_write_word_data,
> +	},
>  	[max34440] = {
>  		.pages = 14,
>  		.format[PSC_VOLTAGE_IN] = direct,
> @@ -635,7 +672,8 @@ static int max34440_probe(struct i2c_client *client)
>  		rv = max34451_set_supported_funcs(client, data);
>  		if (rv)
>  			return rv;
> -	} else if (data->id == adpm12160 || data->id == adpm12200) {
> +	} else if (data->id == adpm12160 || data->id == adpm12200 ||
> +		   data->id == adpm12250) {
>  		data->iout_oc_fault_limit = PMBUS_IOUT_OC_FAULT_LIMIT;
>  		data->iout_oc_warn_limit = PMBUS_IOUT_OC_WARN_LIMIT;
>  	}
> @@ -646,6 +684,7 @@ static int max34440_probe(struct i2c_client *client)
>  static const struct i2c_device_id max34440_id[] = {
>  	{ .name = "adpm12160", .driver_data = adpm12160 },
>  	{ .name = "adpm12200", .driver_data = adpm12200 },
> +	{ .name = "adpm12250", .driver_data = adpm12250 },
>  	{ .name = "max34440", .driver_data = max34440 },
>  	{ .name = "max34441", .driver_data = max34441 },
>  	{ .name = "max34446", .driver_data = max34446 },
> 
> ---
> base-commit: 1723bc01ecc7ca2f30272685121314379ba5eb18
> change-id: 20260610-dev-adpm12250-4ce6fc8c82ac
> 
> Best regards,

^ permalink raw reply

* Re: [PATCH v4 2/2] iio: dac: Add AD5529R DAC driver support
From: Nuno Sá @ 2026-06-10  8:47 UTC (permalink / raw)
  To: Janani Sunil, Lars-Peter Clausen, Michael Hennerich,
	Jonathan Cameron, David Lechner, Nuno Sá, Andy Shevchenko,
	Rob Herring, Krzysztof Kozlowski, Conor Dooley, Philipp Zabel,
	Jonathan Corbet, Shuah Khan
  Cc: linux-iio, devicetree, linux-kernel, linux-doc, Janani Sunil
In-Reply-To: <20260609-ad5529r-driver-v4-2-2e4c02234a1a@analog.com>

On Tue, 2026-06-09 at 17:00 +0200, Janani Sunil wrote:
> Add support for AD5529R 16-channel, 12/16 bit Digital to Analog Converter
> 
> Signed-off-by: Janani Sunil <janani.sunil@analog.com>
> ---

Nothing to add in addition to Andy's comment. With:

* Sashiko point fixed
* And the excess of parenthesis that Andy pointed out

Reviewed-by: Nuno Sá <nuno.sa@analog.com>

(The rest is up to you and Andy :))

>  MAINTAINERS               |   1 +
>  drivers/iio/dac/Kconfig   |  17 ++
>  drivers/iio/dac/Makefile  |   1 +
>  drivers/iio/dac/ad5529r.c | 517 ++++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 536 insertions(+)
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 320e84765ce6..143714e27d51 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -1513,6 +1513,7 @@ L:	linux-iio@vger.kernel.org
>  S:	Supported
>  W:	https://ez.analog.com/linux-software-drivers
>  F:	Documentation/devicetree/bindings/iio/dac/adi,ad5529r.yaml
> +F:	drivers/iio/dac/ad5529r.c
>  
>  ANALOG DEVICES INC AD5706R DRIVER
>  M:	Alexis Czezar Torreno <alexisczezar.torreno@analog.com>
> diff --git a/drivers/iio/dac/Kconfig b/drivers/iio/dac/Kconfig
> index 657c68e75542..bb1d59889a2a 100644
> --- a/drivers/iio/dac/Kconfig
> +++ b/drivers/iio/dac/Kconfig
> @@ -134,6 +134,23 @@ config AD5449
>  	  To compile this driver as a module, choose M here: the
>  	  module will be called ad5449.
>  
> +config AD5529R
> +	tristate "Analog Devices AD5529R High Voltage DAC driver"
> +	depends on SPI_MASTER
> +	select REGMAP_SPI
> +	help
> +	  Say yes here to build support for Analog Devices AD5529R
> +	  16-Channel, 12-Bit/16-Bit, 40V High Voltage Precision Digital to Analog
> +	  Converter.
> +
> +	  The device features multiple output voltage ranges from -20V to +20V,
> +	  built-in 4.096V voltage reference, and digital functions including
> +	  toggle, dither, and ramp modes. Supports both 12-bit and 16-bit
> +	  resolution variants.
> +
> +	  To compile this driver as a module, choose M here: the
> +	  module will be called ad5529r.
> +
>  config AD5592R_BASE
>  	tristate
>  
> diff --git a/drivers/iio/dac/Makefile b/drivers/iio/dac/Makefile
> index 003431798498..f35e060b3643 100644
> --- a/drivers/iio/dac/Makefile
> +++ b/drivers/iio/dac/Makefile
> @@ -18,6 +18,7 @@ obj-$(CONFIG_AD5446) += ad5446.o
>  obj-$(CONFIG_AD5446_SPI) += ad5446-spi.o
>  obj-$(CONFIG_AD5446_I2C) += ad5446-i2c.o
>  obj-$(CONFIG_AD5449) += ad5449.o
> +obj-$(CONFIG_AD5529R) += ad5529r.o
>  obj-$(CONFIG_AD5592R_BASE) += ad5592r-base.o
>  obj-$(CONFIG_AD5592R) += ad5592r.o
>  obj-$(CONFIG_AD5593R) += ad5593r.o
> diff --git a/drivers/iio/dac/ad5529r.c b/drivers/iio/dac/ad5529r.c
> new file mode 100644
> index 000000000000..d2d0287d0f95
> --- /dev/null
> +++ b/drivers/iio/dac/ad5529r.c
> @@ -0,0 +1,517 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * AD5529R Digital-to-Analog Converter Driver
> + * 16-Channel, 12/16-Bit, 40V High Voltage Precision DAC
> + *
> + * Copyright 2026 Analog Devices Inc.
> + * Author: Janani Sunil <janani.sunil@analog.com>
> + */
> +
> +#include <linux/array_size.h>
> +#include <linux/bits.h>
> +#include <linux/delay.h>
> +#include <linux/dev_printk.h>
> +#include <linux/err.h>
> +#include <linux/errno.h>
> +#include <linux/iio/iio.h>
> +#include <linux/mod_devicetable.h>
> +#include <linux/module.h>
> +#include <linux/property.h>
> +#include <linux/regmap.h>
> +#include <linux/regulator/consumer.h>
> +#include <linux/reset.h>
> +#include <linux/spi/spi.h>
> +
> +#define AD5529R_REG_INTERFACE_CONFIG_A		0x00
> +#define AD5529R_REG_DEVICE_CONFIG		0x02
> +#define AD5529R_REG_CHIP_GRADE			0x06
> +#define AD5529R_REG_SCRATCH_PAD			0x0A
> +#define AD5529R_REG_SPI_REVISION		0x0B
> +#define AD5529R_REG_VENDOR_H			0x0D
> +#define AD5529R_REG_STREAM_MODE			0x0E
> +#define AD5529R_REG_INTERFACE_STATUS_A		0x11
> +#define AD5529R_REG_MULTI_DAC_CH_SEL		0x14
> +#define AD5529R_REG_OUT_RANGE_BASE		0x3C
> +#define AD5529R_REG_OUT_RANGE(ch)		(AD5529R_REG_OUT_RANGE_BASE + (ch)
> * 2)
> +#define AD5529R_REG_DAC_INPUT_A_BASE		0x148
> +#define AD5529R_REG_DAC_INPUT_A(ch)		(AD5529R_REG_DAC_INPUT_A_BASE +
> (ch) * 2)
> +#define AD5529R_REG_DAC_DATA_READBACK_BASE	0x16A
> +#define AD5529R_REG_TSENS_ALERT_FLAG		0x18C
> +#define AD5529R_REG_TSENS_SHTD_FLAG		0x18E
> +#define AD5529R_REG_FUNC_BUSY			0x1A0
> +#define AD5529R_REG_REF_SEL			0x1A2
> +#define AD5529R_REG_INIT_CRC_ERR_STAT		0x1A4
> +#define AD5529R_REG_MULTI_DAC_HOTPATH_SW_LDAC	0x1A8
> +
> +#define   AD5529R_INTERFACE_CONFIG_A_SW_RESET	(BIT(7) | BIT(0))
> +#define   AD5529R_INTERFACE_CONFIG_A_ADDR_ASCENSION	BIT(5)
> +#define   AD5529R_INTERFACE_CONFIG_A_SDO_ENABLE	BIT(4)
> +#define   AD5529R_REF_SEL_INTERNAL_REF		BIT(0)
> +#define   AD5529R_MAX_REGISTER			0x232
> +#define   AD5529R_8BIT_REG_MAX			0x13
> +#define   AD5529R_SPI_READ_FLAG			0x80
> +
> +struct ad5529r_model_data {
> +	const char *model_name;
> +	unsigned int resolution;
> +	const struct iio_chan_spec *channels;
> +	unsigned int num_channels;
> +};
> +
> +#define AD5529R_DAC_CHANNEL(chan, bits) {			\
> +	.type = IIO_VOLTAGE,					\
> +	.indexed = 1,						\
> +	.output = 1,						\
> +	.channel = (chan),					\
> +	.info_mask_separate = BIT(IIO_CHAN_INFO_RAW) |		\
> +			      BIT(IIO_CHAN_INFO_SCALE) |	\
> +			      BIT(IIO_CHAN_INFO_OFFSET),	\
> +	.scan_type = {						\
> +		.format = 'u',					\
> +		.realbits = (bits),				\
> +		.storagebits = 16,				\
> +	},							\
> +}
> +
> +static const char * const ad5529r_supply_names[] = {
> +	"vdd",
> +	"avdd",
> +	"hvdd",
> +};
> +
> +static const struct iio_chan_spec ad5529r_channels_16bit[] = {
> +	AD5529R_DAC_CHANNEL(0, 16),
> +	AD5529R_DAC_CHANNEL(1, 16),
> +	AD5529R_DAC_CHANNEL(2, 16),
> +	AD5529R_DAC_CHANNEL(3, 16),
> +	AD5529R_DAC_CHANNEL(4, 16),
> +	AD5529R_DAC_CHANNEL(5, 16),
> +	AD5529R_DAC_CHANNEL(6, 16),
> +	AD5529R_DAC_CHANNEL(7, 16),
> +	AD5529R_DAC_CHANNEL(8, 16),
> +	AD5529R_DAC_CHANNEL(9, 16),
> +	AD5529R_DAC_CHANNEL(10, 16),
> +	AD5529R_DAC_CHANNEL(11, 16),
> +	AD5529R_DAC_CHANNEL(12, 16),
> +	AD5529R_DAC_CHANNEL(13, 16),
> +	AD5529R_DAC_CHANNEL(14, 16),
> +	AD5529R_DAC_CHANNEL(15, 16),
> +};
> +
> +static const struct iio_chan_spec ad5529r_channels_12bit[] = {
> +	AD5529R_DAC_CHANNEL(0, 12),
> +	AD5529R_DAC_CHANNEL(1, 12),
> +	AD5529R_DAC_CHANNEL(2, 12),
> +	AD5529R_DAC_CHANNEL(3, 12),
> +	AD5529R_DAC_CHANNEL(4, 12),
> +	AD5529R_DAC_CHANNEL(5, 12),
> +	AD5529R_DAC_CHANNEL(6, 12),
> +	AD5529R_DAC_CHANNEL(7, 12),
> +	AD5529R_DAC_CHANNEL(8, 12),
> +	AD5529R_DAC_CHANNEL(9, 12),
> +	AD5529R_DAC_CHANNEL(10, 12),
> +	AD5529R_DAC_CHANNEL(11, 12),
> +	AD5529R_DAC_CHANNEL(12, 12),
> +	AD5529R_DAC_CHANNEL(13, 12),
> +	AD5529R_DAC_CHANNEL(14, 12),
> +	AD5529R_DAC_CHANNEL(15, 12),
> +};
> +
> +static const struct ad5529r_model_data ad5529r_16bit_model_data = {
> +	.model_name = "ad5529r-16",
> +	.resolution = 16,
> +	.channels = ad5529r_channels_16bit,
> +	.num_channels = ARRAY_SIZE(ad5529r_channels_16bit),
> +};
> +
> +static const struct ad5529r_model_data ad5529r_12bit_model_data = {
> +	.model_name = "ad5529r-12",
> +	.resolution = 12,
> +	.channels = ad5529r_channels_12bit,
> +	.num_channels = ARRAY_SIZE(ad5529r_channels_12bit),
> +};
> +
> +enum ad5529r_output_range {
> +	AD5529R_RANGE_0V_5V,
> +	AD5529R_RANGE_0V_10V,
> +	AD5529R_RANGE_0V_20V,
> +	AD5529R_RANGE_0V_40V,
> +	AD5529R_RANGE_NEG5V_5V,
> +	AD5529R_RANGE_NEG10V_10V,
> +	AD5529R_RANGE_NEG15V_15V,
> +	AD5529R_RANGE_NEG20V_20V,
> +};
> +
> +static const s32 ad5529r_output_ranges_mv[8][2] = {
> +	[AD5529R_RANGE_0V_5V] = { 0, 5000 },
> +	[AD5529R_RANGE_0V_10V] = { 0, 10000 },
> +	[AD5529R_RANGE_0V_20V] = { 0, 20000 },
> +	[AD5529R_RANGE_0V_40V] = { 0, 40000 },
> +	[AD5529R_RANGE_NEG5V_5V] = { -5000, 5000 },
> +	[AD5529R_RANGE_NEG10V_10V] = { -10000, 10000 },
> +	[AD5529R_RANGE_NEG15V_15V] = { -15000, 15000 },
> +	[AD5529R_RANGE_NEG20V_20V] = { -20000, 20000 },
> +};
> +
> +struct ad5529r_state {
> +	struct spi_device *spi;
> +	const struct ad5529r_model_data *model_data;
> +	struct regmap *regmap_8bit;
> +	struct regmap *regmap_16bit;
> +	enum ad5529r_output_range output_range_idx[16];
> +};
> +
> +static const struct regmap_range ad5529r_8bit_readable_ranges[] = {
> +	regmap_reg_range(AD5529R_REG_INTERFACE_CONFIG_A, AD5529R_REG_CHIP_GRADE),
> +	regmap_reg_range(AD5529R_REG_SCRATCH_PAD, AD5529R_REG_VENDOR_H),
> +	regmap_reg_range(AD5529R_REG_STREAM_MODE, AD5529R_REG_INTERFACE_STATUS_A),
> +};
> +
> +static const struct regmap_range ad5529r_16bit_readable_ranges[] = {
> +	regmap_reg_range(AD5529R_REG_MULTI_DAC_CH_SEL,
> AD5529R_REG_INIT_CRC_ERR_STAT),
> +	regmap_reg_range(AD5529R_REG_MULTI_DAC_HOTPATH_SW_LDAC,
> AD5529R_MAX_REGISTER),
> +};
> +
> +static const struct regmap_access_table ad5529r_8bit_readable_table = {
> +	.yes_ranges = ad5529r_8bit_readable_ranges,
> +	.n_yes_ranges = ARRAY_SIZE(ad5529r_8bit_readable_ranges),
> +};
> +
> +static const struct regmap_access_table ad5529r_16bit_readable_table = {
> +	.yes_ranges = ad5529r_16bit_readable_ranges,
> +	.n_yes_ranges = ARRAY_SIZE(ad5529r_16bit_readable_ranges),
> +};
> +
> +static const struct regmap_range ad5529r_8bit_read_only_ranges[] = {
> +	regmap_reg_range(AD5529R_REG_DEVICE_CONFIG, AD5529R_REG_CHIP_GRADE),
> +	regmap_reg_range(AD5529R_REG_SPI_REVISION, AD5529R_REG_VENDOR_H),
> +};
> +
> +static const struct regmap_range ad5529r_16bit_read_only_ranges[] = {
> +	regmap_reg_range(AD5529R_REG_DAC_DATA_READBACK_BASE,
> +			 (AD5529R_REG_DAC_DATA_READBACK_BASE + 15 * 2)),
> +	regmap_reg_range(AD5529R_REG_TSENS_ALERT_FLAG,
> AD5529R_REG_TSENS_SHTD_FLAG),
> +	regmap_reg_range(AD5529R_REG_FUNC_BUSY, AD5529R_REG_FUNC_BUSY),
> +	regmap_reg_range(AD5529R_REG_INIT_CRC_ERR_STAT,
> AD5529R_REG_INIT_CRC_ERR_STAT),
> +};
> +
> +static const struct regmap_access_table ad5529r_8bit_writeable_table = {
> +	.no_ranges = ad5529r_8bit_read_only_ranges,
> +	.n_no_ranges = ARRAY_SIZE(ad5529r_8bit_read_only_ranges),
> +};
> +
> +static const struct regmap_access_table ad5529r_16bit_writeable_table = {
> +	.no_ranges = ad5529r_16bit_read_only_ranges,
> +	.n_no_ranges = ARRAY_SIZE(ad5529r_16bit_read_only_ranges),
> +};
> +
> +static const struct regmap_config ad5529r_regmap_8bit_config = {
> +	.name = "ad5529r-8bit",
> +	.reg_bits = 16,
> +	.val_bits = 8,
> +	.max_register = AD5529R_8BIT_REG_MAX,
> +	.read_flag_mask = AD5529R_SPI_READ_FLAG,
> +	.rd_table = &ad5529r_8bit_readable_table,
> +	.wr_table = &ad5529r_8bit_writeable_table,
> +};
> +
> +static const struct regmap_config ad5529r_regmap_16bit_config = {
> +	.name = "ad5529r-16bit",
> +	.reg_bits = 16,
> +	.val_bits = 16,
> +	.max_register = AD5529R_MAX_REGISTER,
> +	.read_flag_mask = AD5529R_SPI_READ_FLAG,
> +	.val_format_endian = REGMAP_ENDIAN_LITTLE,
> +	.rd_table = &ad5529r_16bit_readable_table,
> +	.wr_table = &ad5529r_16bit_writeable_table,
> +	.reg_stride = 2,
> +};
> +
> +static struct regmap *ad5529r_get_regmap(struct ad5529r_state *st,
> +					 unsigned int reg)
> +{
> +	if (reg <= AD5529R_8BIT_REG_MAX)
> +		return st->regmap_8bit;
> +
> +	return st->regmap_16bit;
> +}
> +
> +static int ad5529r_reset(struct ad5529r_state *st)
> +{
> +	struct reset_control *rst;
> +	int ret;
> +
> +	rst = devm_reset_control_get_optional_exclusive(&st->spi->dev, NULL);
> +	if (IS_ERR(rst))
> +		return PTR_ERR(rst);
> +
> +	if (rst) {
> +		ret = reset_control_deassert(rst);
> +		if (ret)
> +			return ret;
> +	} else {
> +		ret = regmap_write(st->regmap_8bit,
> AD5529R_REG_INTERFACE_CONFIG_A,
> +				   AD5529R_INTERFACE_CONFIG_A_SW_RESET);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	/*
> +	 * Wait 10 ms for digital initialization to complete.
> +	 * Per datasheet, Interface Status A register NOT_READY_ERR bit is
> +	 * set if SPI transactions are attempted before digital initialization
> +	 * completes.
> +	 */
> +	fsleep(10000);
> +
> +	return regmap_write(st->regmap_8bit, AD5529R_REG_INTERFACE_CONFIG_A,
> +			    AD5529R_INTERFACE_CONFIG_A_SDO_ENABLE |
> +			    AD5529R_INTERFACE_CONFIG_A_ADDR_ASCENSION);
> +}
> +
> +static int ad5529r_read_raw(struct iio_dev *indio_dev,
> +			    struct iio_chan_spec const *chan,
> +			    int *val, int *val2, long mask)
> +{
> +	struct ad5529r_state *st = iio_priv(indio_dev);
> +	unsigned int reg_addr, reg_val_h;
> +	int ret, range_idx, span_mv;
> +
> +	switch (mask) {
> +	case IIO_CHAN_INFO_RAW:
> +		/*
> +		 * Read from DAC_INPUT_A register rather than DAC_DATA_READBACK.
> +		 * The DAC operates in transparent mode and directly reflects
> +		 * whatever value is written to the INPUT_A register.
> +		 */
> +		reg_addr = AD5529R_REG_DAC_INPUT_A(chan->channel);
> +		ret = regmap_read(st->regmap_16bit, reg_addr, &reg_val_h);
> +		if (ret)
> +			return ret;
> +
> +		*val = reg_val_h;
> +
> +		return IIO_VAL_INT;
> +	case IIO_CHAN_INFO_SCALE:
> +		range_idx = st->output_range_idx[chan->channel];
> +
> +		span_mv = ad5529r_output_ranges_mv[range_idx][1] -
> +			  ad5529r_output_ranges_mv[range_idx][0];
> +		*val = span_mv;
> +		*val2 = st->model_data->resolution;
> +
> +		return IIO_VAL_FRACTIONAL_LOG2;
> +	case IIO_CHAN_INFO_OFFSET:
> +		range_idx = st->output_range_idx[chan->channel];
> +
> +		if (ad5529r_output_ranges_mv[range_idx][0] < 0)
> +			*val = -(1 << (st->model_data->resolution - 1));
> +		else
> +			*val = 0;
> +
> +		return IIO_VAL_INT;
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
> +static int ad5529r_write_raw(struct iio_dev *indio_dev,
> +			     struct iio_chan_spec const *chan,
> +			     int val, int val2, long mask)
> +{
> +	struct ad5529r_state *st = iio_priv(indio_dev);
> +	unsigned int reg_addr;
> +
> +	switch (mask) {
> +	case IIO_CHAN_INFO_RAW:
> +		if (val < 0 || val > GENMASK(st->model_data->resolution - 1, 0))
> +			return -EINVAL;
> +
> +		reg_addr = AD5529R_REG_DAC_INPUT_A(chan->channel);
> +
> +		return regmap_write(st->regmap_16bit, reg_addr, val);
> +	default:
> +		return -EINVAL;
> +	}
> +}
> +
> +static int ad5529r_find_output_range(const s32 *vals)
> +{
> +	for (unsigned int i = 0; i < ARRAY_SIZE(ad5529r_output_ranges_mv); i++) {
> +		if (vals[0] == ad5529r_output_ranges_mv[i][0] * 1000 &&
> +		    vals[1] == ad5529r_output_ranges_mv[i][1] * 1000)
> +			return i;
> +	}
> +
> +	return -EINVAL;
> +}
> +
> +static int ad5529r_parse_channel_ranges(struct device *dev,
> +					struct ad5529r_state *st)
> +{
> +	int ret, range_idx;
> +	u32 ch;
> +	s32 vals[2];
> +
> +	device_for_each_child_node_scoped(dev, child) {
> +		range_idx = AD5529R_RANGE_0V_5V;
> +
> +		ret = fwnode_property_read_u32(child, "reg", &ch);
> +		if (ret)
> +			return dev_err_probe(dev, ret,
> +					     "Missing reg property in channel
> node\n");
> +
> +		if (ch >= 16)
> +			return dev_err_probe(dev, -EINVAL,
> +					     "Invalid channel number: %u\n", ch);
> +
> +		/* Read u32 property into s32 to handle negative voltage ranges */
> +		if (!fwnode_property_read_u32_array(child,
> +						    "adi,output-range-microvolt",
> +						    (u32 *)vals,
> ARRAY_SIZE(vals))) {
> +			range_idx = ad5529r_find_output_range(vals);
> +			if (range_idx < 0)
> +				return dev_err_probe(dev, range_idx,
> +						     "Invalid range [%d %d] for ch
> %u\n",
> +						     vals[0], vals[1], ch);
> +		}
> +
> +		st->output_range_idx[ch] = range_idx;
> +		ret = regmap_write(st->regmap_16bit,
> +				   AD5529R_REG_OUT_RANGE(ch), range_idx);
> +		if (ret)
> +			return dev_err_probe(dev, ret,
> +					     "Failed to configure range for ch
> %u\n",
> +					     ch);
> +	}
> +
> +	return 0;
> +}
> +
> +static int ad5529r_reg_access(struct iio_dev *indio_dev,
> +			      unsigned int reg,
> +			      unsigned int writeval,
> +			      unsigned int *readval)
> +{
> +	struct ad5529r_state *st = iio_priv(indio_dev);
> +
> +	if (readval)
> +		return regmap_read(ad5529r_get_regmap(st, reg), reg, readval);
> +
> +	return regmap_write(ad5529r_get_regmap(st, reg), reg, writeval);
> +}
> +
> +static const struct iio_info ad5529r_info = {
> +	.read_raw = ad5529r_read_raw,
> +	.write_raw = ad5529r_write_raw,
> +	.debugfs_reg_access = ad5529r_reg_access,
> +};
> +
> +static int ad5529r_probe(struct spi_device *spi)
> +{
> +	struct device *dev = &spi->dev;
> +	struct iio_dev *indio_dev;
> +	struct ad5529r_state *st;
> +	bool external_vref;
> +	int ret;
> +
> +	indio_dev = devm_iio_device_alloc(dev, sizeof(*st));
> +	if (!indio_dev)
> +		return -ENOMEM;
> +
> +	st = iio_priv(indio_dev);
> +
> +	st->spi = spi;
> +
> +	st->model_data = spi_get_device_match_data(spi);
> +	if (!st->model_data)
> +		return dev_err_probe(dev, -EINVAL, "Failed to identify device
> variant\n");
> +
> +	ret = devm_regulator_bulk_get_enable(dev,
> ARRAY_SIZE(ad5529r_supply_names),
> +					     ad5529r_supply_names);
> +	if (ret)
> +		return dev_err_probe(dev, ret,
> +				     "Failed to get and enable regulators\n");
> +
> +	ret = devm_regulator_get_enable_optional(dev, "hvss");
> +	if (ret && ret != -ENODEV)
> +		return dev_err_probe(dev, ret,
> +				     "Failed to get and enable hvss regulator\n");
> +
> +	/*
> +	 * The datasheet mentions a 4.096V external reference for correct
> +	 * operation.
> +	 */
> +	ret = devm_regulator_get_enable_optional(dev, "vref");
> +	if (ret && ret != -ENODEV)
> +		return dev_err_probe(dev, ret,
> +				     "Failed to get and enable vref regulator\n");
> +
> +	external_vref = ret != -ENODEV;
> +
> +	st->regmap_8bit = devm_regmap_init_spi(spi, &ad5529r_regmap_8bit_config);
> +	if (IS_ERR(st->regmap_8bit))
> +		return dev_err_probe(dev, PTR_ERR(st->regmap_8bit),
> +				     "Failed to initialize 8-bit regmap\n");
> +
> +	st->regmap_16bit = devm_regmap_init_spi(spi,
> &ad5529r_regmap_16bit_config);
> +	if (IS_ERR(st->regmap_16bit))
> +		return dev_err_probe(dev, PTR_ERR(st->regmap_16bit),
> +				     "Failed to initialize 16-bit regmap\n");
> +
> +	ret = ad5529r_reset(st);
> +	if (ret)
> +		return dev_err_probe(dev, ret, "Failed to reset device\n");
> +
> +	ret = regmap_assign_bits(st->regmap_16bit, AD5529R_REG_REF_SEL,
> +				 AD5529R_REF_SEL_INTERNAL_REF,
> +				 external_vref ? 0 :
> AD5529R_REF_SEL_INTERNAL_REF);
> +	if (ret)
> +		return dev_err_probe(dev, ret, "Failed to configure reference\n");
> +
> +	ret = ad5529r_parse_channel_ranges(dev, st);
> +	if (ret)
> +		return ret;
> +
> +	indio_dev->name = st->model_data->model_name;
> +	indio_dev->info = &ad5529r_info;
> +	indio_dev->modes = INDIO_DIRECT_MODE;
> +	indio_dev->channels = st->model_data->channels;
> +	indio_dev->num_channels = st->model_data->num_channels;
> +
> +	return devm_iio_device_register(dev, indio_dev);
> +}
> +
> +static const struct of_device_id ad5529r_of_match[] = {
> +	{ .compatible = "adi,ad5529r-16", .data = &ad5529r_16bit_model_data },
> +	{ .compatible = "adi,ad5529r-12", .data = &ad5529r_12bit_model_data },
> +	{ }
> +};
> +MODULE_DEVICE_TABLE(of, ad5529r_of_match);
> +
> +static const struct spi_device_id ad5529r_id[] = {
> +	{
> +		.name = "ad5529r-16",
> +		.driver_data = (kernel_ulong_t)&ad5529r_16bit_model_data,
> +	},
> +	{
> +		.name = "ad5529r-12",
> +		.driver_data = (kernel_ulong_t)&ad5529r_12bit_model_data,
> +	},
> +	{ }
> +};
> +MODULE_DEVICE_TABLE(spi, ad5529r_id);
> +
> +static struct spi_driver ad5529r_driver = {
> +	.driver = {
> +		.name = "ad5529r",
> +		.of_match_table = ad5529r_of_match,
> +	},
> +	.probe = ad5529r_probe,
> +	.id_table = ad5529r_id,
> +};
> +module_spi_driver(ad5529r_driver);
> +
> +MODULE_AUTHOR("Janani Sunil <janani.sunil@analog.com>");
> +MODULE_DESCRIPTION("Analog Devices AD5529R 12/16-bit DAC driver");
> +MODULE_LICENSE("GPL");

^ permalink raw reply

* Re: [PATCH RFC v4 1/6] dt-bindings: iio: add Open Sensor Fusion device
From: Kim Jinseob @ 2026-06-10  9:33 UTC (permalink / raw)
  To: Conor Dooley
  Cc: Jonathan Cameron, linux-iio, David Lechner, Nuno Sá,
	Andy Shevchenko, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
	Jonathan Corbet, Shuah Khan, devicetree, linux-kernel, linux-doc
In-Reply-To: <20260609-glacial-colossal-38b4937ec620@spud>

> Do you think it makes sense to permit a regulator here, so that the
> "host" OS can power on/off the board running the osf stack?

From the OSF hardware side, yes, that makes sense.

The current prototype used for testing is powered independently, but an OSF
device may also be integrated as a host-powered UART peripheral. In that case
allowing the host to control the board supply through an optional regulator
would be useful.

Unless the IIO side prefers otherwise, I will add an optional supply property
to the binding and matching optional regulator handling in the driver in the
next revision.

Jinseob


2026년 6월 10일 (수) 오전 1:19, Conor Dooley <conor@kernel.org>님이 작성:
>
> Jonathan/IIO folks,
>
> On Mon, Jun 08, 2026 at 08:43:38AM +0900, Jinseob Kim wrote:
>
> > diff --git a/Documentation/devicetree/bindings/iio/opensensorfusion,osf.yaml b/Documentation/devicetree/bindings/iio/opensensorfusion,osf.yaml
> > new file mode 100644
> > index 000000000..a4049715a
> > --- /dev/null
> > +++ b/Documentation/devicetree/bindings/iio/opensensorfusion,osf.yaml
> > @@ -0,0 +1,43 @@
> > +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> > +%YAML 1.2
> > +---
> > +$id: http://devicetree.org/schemas/iio/opensensorfusion,osf.yaml#
> > +$schema: http://devicetree.org/meta-schemas/core.yaml#
> > +
> > +title: Open Sensor Fusion Sensor Aggregation Hub
> > +
> > +maintainers:
> > +  - Jinseob Kim <kimjinseob88@gmail.com>
> > +
> > +description: |
> > +  Open Sensor Fusion is a sensor aggregation hub. The hub exposes an OSF
> > +  protocol data stream over its host interface and may report capabilities and
> > +  samples for multiple sensor classes. The Linux driver discovers the actual
> > +  sensor channels from OSF capability reports instead of describing those
> > +  sensors in Device Tree.
> > +
> > +  Open Sensor Fusion is not a generic industry standard. Public project
> > +  documentation is available at:
> > +
> > +    https://github.com/opensensorfusion
> > +
> > +allOf:
> > +  - $ref: /schemas/serial/serial-peripheral-props.yaml#
> > +
> > +properties:
> > +  compatible:
> > +    const: opensensorfusion,osf
> > +
> > +required:
> > +  - compatible
>
> Do you think it makes sense to permit a regulator here, so that the
> "host" OS can power on/off the board running the osf stack?
>
> > +
> > +unevaluatedProperties: false
> > +
> > +examples:
> > +  - |
> > +    serial {
> > +        sensor {
> > +            compatible = "opensensorfusion,osf";
> > +        };
> > +    };
> > +...

^ permalink raw reply

* Re: [PATCH v4 6/6] kselftest: alloc_tag: extend the allocinfo ioctl kselftest
From: Hao Ge @ 2026-06-10  9:33 UTC (permalink / raw)
  To: Abhishek Bapat
  Cc: Shuah Khan, Jonathan Corbet, linux-doc, linux-kernel, linux-mm,
	Sourav Panda, Suren Baghdasaryan, Andrew Morton, Kent Overstreet
In-Reply-To: <d0a8308b4d0799876d24461a8ed9b5a71d3e1e89.1781042698.git.abhishekbapat@google.com>

Hi Abhishek


On 2026/6/10 08:12, Abhishek Bapat wrote:
> Add the following 2 scenarios to the allocinfo ioctl kselftest:
> 1. Validate size based filtering
> 2. Validate lineno based filtering
>
> The first test uses "do_init_module" as the candidate function for the
> test. This is because the associated site will only allocate memory when
> a kernel module is loaded. The return value of get_content_id() changes
> every time modules are loaded or unloaded. Hence, as long as
> get_content_id() values at the start and the end of the test are the
> same, the memory allocated by the do_init_module call site should also
> remain the same. Consequently, the test can assume consistency between
> the value returned by the ioctl and the procfs resulting in less
> flakiness.
>
> Signed-off-by: Abhishek Bapat <abhishekbapat@google.com>
> ---
>   .../alloc_tag/allocinfo_ioctl_test.c          | 204 +++++++++++++++++-
>   1 file changed, 203 insertions(+), 1 deletion(-)
>
> diff --git a/tools/testing/selftests/alloc_tag/allocinfo_ioctl_test.c b/tools/testing/selftests/alloc_tag/allocinfo_ioctl_test.c
> index cd9cf229ae1f..5d2f13900a47 100644
> --- a/tools/testing/selftests/alloc_tag/allocinfo_ioctl_test.c
> +++ b/tools/testing/selftests/alloc_tag/allocinfo_ioctl_test.c
> @@ -311,11 +311,201 @@ static int test_function_filter(void)
>   	return run_filter_test(&filter);
>   }
>   
> +static int test_size_filter(void)
> +{
> +	int fd;
> +	struct allocinfo_tag_data_vec *tags = malloc(sizeof(*tags));
> +	struct allocinfo_tag_data_vec *procfs_entries = malloc(sizeof(*procfs_entries));
> +	struct allocinfo_filter filter;
> +	int ret = KSFT_PASS;
> +	__u64 target_size, i, pos;
> +	bool found;
> +	const char *target_function = "do_init_module";
> +	struct allocinfo_content_id start_cont_id, end_cont_id;
> +	int retry = 0;
> +	const int max_retries = 10;
> +
> +	if (!tags || !procfs_entries) {
> +		ksft_print_msg("Memory allocation failed.\n");
> +		ret = KSFT_FAIL;
> +		goto freemem;
> +	}
> +
> +	fd = open(ALLOCINFO_PROC, O_RDONLY);
> +	if (fd < 0) {
> +		ksft_exit_skip("Failed to open " ALLOCINFO_PROC ": %s\n", strerror(errno));
> +		ret = KSFT_FAIL;
> +		goto freemem;
> +	}
> +
> +	do {
> +		found = false;
> +		pos = 0;
> +
> +		if (__allocinfo_get_content_id(fd, &start_cont_id)) {
> +			ksft_print_msg("allocinfo_get_content_id failed\n");
> +			ret = KSFT_FAIL;
> +			goto exit;
> +		}
> +
> +		memset(&filter, 0, sizeof(filter));
> +		filter.mask |= ALLOCINFO_FILTER_MASK_FUNCTION;
> +		strncpy(filter.fields.function, target_function, ALLOCINFO_STR_SIZE);
> +
> +		if (get_filtered_procfs_entries(procfs_entries, &filter, fd)) {
> +			ksft_print_msg("Error retrieving entries from " ALLOCINFO_PROC "\n");
> +			ret = KSFT_FAIL;
> +			goto exit;
> +		}
> +


As I mentioned for patch 5, the retry loop in test_size_filter() calls
get_filtered_procfs_entries(), which reads fd to EOF via fdopen/fgets.
If a module load triggers a retry, the second call to
get_filtered_procfs_entries() gets EOF immediately.

And Sashiko has also reported several minor issues.


Thanks

Best Regards

Hao

> +		if (procfs_entries->count == 0) {
> +			ksft_print_msg("Function %s not found in procfs\n", target_function);
> +			ret = KSFT_SKIP;
> +			goto exit;
> +		}
> +
> +		target_size = procfs_entries->tag[0].counter.bytes;
> +
> +		memset(&filter, 0, sizeof(filter));
> +		filter.mask |= ALLOCINFO_FILTER_MASK_MIN_SIZE | ALLOCINFO_FILTER_MASK_MAX_SIZE;
> +		filter.min_size = target_size;
> +		filter.max_size = target_size;
> +
> +		while (1) {
> +			struct allocinfo_get_at get_at_params;
> +
> +			memset(&get_at_params, 0, sizeof(get_at_params));
> +			memcpy(&get_at_params.filter, &filter, sizeof(filter));
> +			get_at_params.pos = pos;
> +
> +			if (__allocinfo_get_at(fd, &get_at_params))
> +				break;
> +
> +			tags->count = 0;
> +			memcpy(&tags->tag[tags->count++], &get_at_params.data,
> +			       sizeof(get_at_params.data));
> +
> +			while (tags->count < VEC_MAX_ENTRIES &&
> +			       __allocinfo_get_next(fd, &tags->tag[tags->count]) == 0)
> +				tags->count++;
> +
> +			for (i = 0; i < tags->count; i++) {
> +				if (strcmp(tags->tag[i].tag.function, target_function) == 0) {
> +					found = true;
> +					break;
> +				}
> +			}
> +
> +			if (found || tags->count < VEC_MAX_ENTRIES)
> +				break;
> +
> +			pos += tags->count;
> +		}
> +
> +		if (__allocinfo_get_content_id(fd, &end_cont_id)) {
> +			ksft_print_msg("allocinfo_get_content_id failed\n");
> +			ret = KSFT_FAIL;
> +			goto exit;
> +		}
> +
> +		if (start_cont_id.id == end_cont_id.id)
> +			break;
> +
> +		ksft_print_msg("Module load detected during size verification, retrying...\n");
> +	} while (retry++ < max_retries);
> +
> +	if (start_cont_id.id == end_cont_id.id && !found) {
> +		ksft_print_msg("Entry with function %s not found in IOCTL results\n",
> +			       target_function);
> +		ret = KSFT_FAIL;
> +	}
> +
> +exit:
> +	close(fd);
> +freemem:
> +	free(tags);
> +	free(procfs_entries);
> +	return ret;
> +}
> +
> +static int test_lineno_filter(void)
> +{
> +	int fd;
> +	struct allocinfo_tag_data_vec *tags = malloc(sizeof(*tags));
> +	struct allocinfo_tag_data_vec *procfs_entries = malloc(sizeof(*procfs_entries));
> +	struct allocinfo_filter filter;
> +	enum ioctl_ret ioctl_status;
> +	int ret = KSFT_PASS;
> +	__u64 target_lineno, i;
> +
> +	if (!tags || !procfs_entries) {
> +		ksft_print_msg("Memory allocation failed.\n");
> +		ret = KSFT_FAIL;
> +		goto freemem;
> +	}
> +
> +	fd = open(ALLOCINFO_PROC, O_RDONLY);
> +	if (fd < 0) {
> +		ksft_exit_skip("Failed to open " ALLOCINFO_PROC ": %s\n", strerror(errno));
> +		ret = KSFT_FAIL;
> +		goto freemem;
> +	}
> +
> +	memset(&filter, 0, sizeof(filter));
> +
> +	if (get_filtered_procfs_entries(procfs_entries, &filter, fd)) {
> +		ksft_print_msg("Error retrieving entries from " ALLOCINFO_PROC "\n");
> +		ret = KSFT_FAIL;
> +		goto exit;
> +	}
> +	if (procfs_entries->count == 0) {
> +		ksft_print_msg("Could not retrieve procfs entries\n");
> +		ret = KSFT_SKIP;
> +		goto exit;
> +	}
> +	/*
> +	 * We depend on the result of procfs entries to create the ioctl_filter. Hence we
> +	 * cannot recycle the run_filter_test function here.
> +	 */
> +	target_lineno = procfs_entries->tag[0].tag.lineno;
> +
> +	filter.mask |= ALLOCINFO_FILTER_MASK_LINENO;
> +	filter.fields.lineno = target_lineno;
> +
> +	ioctl_status = get_filtered_ioctl_entries(tags, &filter, fd, 0);
> +	if (ioctl_status == IOCTL_INVALID_DATA) {
> +		ksft_print_msg("Trouble retrieving valid IOCTL entries, skipping.\n");
> +		ret = KSFT_SKIP;
> +		goto exit;
> +	}
> +	if (ioctl_status == IOCTL_FAILURE) {
> +		ksft_print_msg("Error retrieving IOCTL entries.\n");
> +		ret = KSFT_FAIL;
> +		goto exit;
> +	}
> +
> +	for (i = 0; i < tags->count; i++) {
> +		if (tags->tag[i].tag.lineno != target_lineno) {
> +			ksft_print_msg("IOCTL entry %llu has incorrect lineno %llu.\n",
> +				       i, tags->tag[i].tag.lineno);
> +			ret = KSFT_FAIL;
> +			goto exit;
> +		}
> +	}
> +
> +exit:
> +	close(fd);
> +freemem:
> +	free(tags);
> +	free(procfs_entries);
> +	return ret;
> +}
> +
>   int main(int argc, char *argv[])
>   {
>   	int ret;
>   
> -	ksft_set_plan(2);
> +	ksft_set_plan(4);
>   
>   	ret = test_filename_filter();
>   	if (ret == KSFT_SKIP)
> @@ -329,5 +519,17 @@ int main(int argc, char *argv[])
>   	else
>   		ksft_test_result(ret == KSFT_PASS, "test_function_filter\n");
>   
> +	ret = test_size_filter();
> +	if (ret == KSFT_SKIP)
> +		ksft_test_result_skip("Skipping test_size_filter\n");
> +	else
> +		ksft_test_result(ret == KSFT_PASS, "test_size_filter\n");
> +
> +	ret = test_lineno_filter();
> +	if (ret == KSFT_SKIP)
> +		ksft_test_result_skip("Skipping test_lineno_filter\n");
> +	else
> +		ksft_test_result(ret == KSFT_PASS, "test_lineno_filter\n");
> +
>   	ksft_finished();
>   }

^ permalink raw reply

* Re: [PATCH 12/15] accel/qda: Add FastRPC invocation support
From: Ekansh Gupta @ 2026-06-10  9:38 UTC (permalink / raw)
  To: Dmitry Baryshkov
  Cc: Oded Gabbay, Jonathan Corbet, Shuah Khan, Joerg Roedel,
	Will Deacon, Robin Murphy, Maarten Lankhorst, Maxime Ripard,
	Thomas Zimmermann, David Airlie, Simona Vetter, Sumit Semwal,
	Christian König, Bharath Kumar, Chenna Kesava Raju, srini,
	andersson, konradybcio, robin.clark, linux-kernel, dri-devel,
	linux-doc, linux-arm-msm, iommu, linux-media, linaro-mm-sig
In-Reply-To: <q2q6tfnas3kikapwehbp6q7mykvk2wbuvn6ypbzc5ta7azh65w@wdwphde7gcxc>

On 08-06-2026 02:44, Dmitry Baryshkov wrote:
> On Thu, Jun 04, 2026 at 10:39:14AM +0530, Ekansh Gupta wrote:
>> On 20-05-2026 19:26, Dmitry Baryshkov wrote:
>>> On Tue, May 19, 2026 at 11:46:02AM +0530, Ekansh Gupta via B4 Relay wrote:
>>>> From: Ekansh Gupta <ekansh.gupta@oss.qualcomm.com>
>>>>
>>>> Implement the FastRPC remote procedure call path, allowing user-space
>>>> to invoke methods on the DSP via DRM_IOCTL_QDA_REMOTE_INVOKE.
>>>>
>>>> qda_fastrpc.c / qda_fastrpc.h
>>>>   Implements the FastRPC protocol layer: argument marshalling
>>>>   (qda_fastrpc_invoke_pack), response unmarshalling
>>>>   (qda_fastrpc_invoke_unpack), and invocation context lifecycle
>>>>   management. Each invocation allocates a fastrpc_invoke_context
>>>>   which tracks buffer descriptors, GEM objects, and the completion
>>>>   used to synchronise with the DSP response.
>>>>
>>>>   Buffer arguments are handled in three ways:
>>>>   - DMA-BUF fd: imported via PRIME, IOMMU-mapped dma_addr used
>>>>   - Direct (inline): copied into the GEM-backed message buffer
>>>>   - DMA handle: fd forwarded to DSP, physical page descriptor computed
>>>
>>> No. This needs to go away. The QDA should support only one way to pass
>>> data - via the GEM buffers. Everything else should be handled by the
>>> shim layer, etc.
>> each FD passed here is a GEM buffer. The reason to pass fd is that there
>> are some APIs on DSP side which takes fd as an argument and the user
>> might use the same on their skel implementation. So in this case the
>> remote call will take fd to DSP and the skel implementation will use the
>> FD.>
> 
> Then handle it all on the userspace side. In the end, bad library API is
> not a reason to complicate kernel API and kernel driver.
The problem is that the user passes the fd as an argument to the remote
call, which the fastrpc library cannot interpret. So basically the user
can allocate some fd buffer (say with FD1) and then call some remote
method, passing FD1 as an int argument in order to call HAP_mmap on it at
the DSP side; the fastrpc library cannot tell whether this int argument
is an FD or a non-FD argument.
> 
>>>> +#define FASTRPC_SCALARS(method, in, out) \
>>>> +		FASTRPC_BUILD_SCALARS(0, method, in, out, 0, 0)
>>>> +
>>>> +/**
>>>> + * struct fastrpc_buf_overlap - Buffer overlap tracking structure
>>>> + *
>>>> + * Tracks overlapping buffer regions to optimise memory mapping and avoid
>>>> + * redundant mappings of the same physical memory.
>>>
>>> What for? Even if this is a valid optimization, implement it as a
>>> subsequent patch. The first goal should be very simple - get GEM buffers
>>> from the app, pass them to the DSP, read the results.
>> yes, this implementation is mimicking the existing fastrpc design where
>> non-FD buffers are also supported. I am currently evaluating the
>> maintenance of such buffers from userspace side and trying to
>> understand the impacts of the same. I am planning to bring it as a
>> future enhancement if there is no regression.>
> 
> Other way around. Drop it for now and bring it back if it has any
> positive impact.
We did evaluation and don't see userspace side handling being feasible
for non-FD buffers, I'll try to summarize the current design and the
problem:

Currently a remote call can take up to 255 arguments and in many cases
the user passes the buffers as non-FD arguments which is then copied to
meta data and sent to DSP. Before copy there is an operation to identify
if the buffers are overlapped so that it can be maintained efficiently.

DSP understands this based on offset and maps it accordingly, so for
multiple small sized arguments, there is a possibility that a single
page is used. Now if we allocate GEM buffers for each of these small
arguments, it would lead to creation of multiple pages(can go up to 255)
and all these pages then are required to be mapped onto DSP which could
also lead to DSP address space exhaustion. So the limitation is too many
pages and that DSP cannot handle this as efficiently as overlapped
copy buffers.>
>>>> + */
>>>> +struct fastrpc_buf_overlap {
>>>
>>> Stop clashing the names with the existing fastrpc driver.
>> ack.>
>>>> +	/** @start: Start address of the buffer in user virtual address space */
>>>> +	u64 start;
>>>> +	/** @end: End address of the buffer in user virtual address space */
>>>> +	u64 end;
>>>> +	/** @raix: Remote argument index associated with this overlap */
>>>> +	int raix;
>>>> +	/** @mstart: Start address of the mapped region */
>>>> +	u64 mstart;
>>>> +	/** @mend: End address of the mapped region */
>>>> +	u64 mend;
>>>> +	/** @offset: Offset within the mapped region */
>>>> +	u64 offset;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct fastrpc_remote_dmahandle - Remote DMA handle descriptor
>>>> + */
>>>> +struct fastrpc_remote_dmahandle {
>>>> +	/** @fd: DMA-BUF file descriptor */
>>>> +	s32 fd;
>>>> +	/** @offset: Byte offset within the DMA-BUF */
>>>> +	u32 offset;
>>>> +	/** @len: Length of the region in bytes */
>>>> +	u32 len;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct fastrpc_remote_buf - Remote buffer descriptor
>>>> + */
>>>> +struct fastrpc_remote_buf {
>>>> +	/** @pv: Buffer pointer (user virtual address) */
>>>> +	u64 pv;
>>>> +	/** @len: Length of the buffer in bytes */
>>>> +	u64 len;
>>>> +};
>>>> +
>>>> +/**
>>>> + * union fastrpc_remote_arg - Remote argument (buffer or DMA handle)
>>>> + */
>>>> +union fastrpc_remote_arg {
>>>> +	/** @buf: Inline buffer descriptor */
>>>> +	struct fastrpc_remote_buf buf;
>>>> +	/** @dma: DMA-BUF handle descriptor */
>>>> +	struct fastrpc_remote_dmahandle dma;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct fastrpc_phy_page - Physical page descriptor
>>>> + */
>>>> +struct fastrpc_phy_page {
>>>> +	/** @addr: Physical (IOMMU) address of the page */
>>>> +	u64 addr;
>>>> +	/** @size: Size of the contiguous region in bytes */
>>>> +	u64 size;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct fastrpc_invoke_buf - Invoke buffer descriptor
>>>> + */
>>>> +struct fastrpc_invoke_buf {
>>>> +	/** @num: Number of contiguous physical regions */
>>>> +	u32 num;
>>>> +	/** @pgidx: Index into the physical page array */
>>>> +	u32 pgidx;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct fastrpc_msg - FastRPC wire message for remote invocations
>>>> + *
>>>> + * Sent to the remote processor via RPMsg. This is the exact layout
>>>> + * the DSP expects; do not reorder or add fields without DSP firmware
>>>> + * coordination.
>>>> + */
>>>> +struct fastrpc_msg {
>>>> +	/** @remote_session_id: Session identifier on the remote processor */
>>>> +	int remote_session_id;
>>>> +	/** @tid: Thread ID of the invoking thread */
>>>> +	int tid;
>>>> +	/** @ctx: Context identifier for matching request/response */
>>>> +	u64 ctx;
>>>> +	/** @handle: Handle of the remote method to invoke */
>>>> +	u32 handle;
>>>> +	/** @sc: Scalars value encoding in/out buffer counts */
>>>> +	u32 sc;
>>>> +	/** @addr: Physical address of the message payload buffer */
>>>> +	u64 addr;
>>>> +	/** @size: Size of the message payload in bytes */
>>>> +	u64 size;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct qda_msg - FastRPC message with kernel-internal bookkeeping
>>>> + *
>>>> + * The wire-format portion is kept in the embedded @fastrpc member (must
>>>> + * be first) so that &qda_msg->fastrpc can be passed directly to
>>>> + * rpmsg_send() without a copy.
>>>> + */
>>>> +struct qda_msg {
>>>> +	/**
>>>> +	 * @fastrpc: Wire-format message sent to the DSP via RPMsg.
>>>> +	 * Must be the first member.
>>>> +	 */
>>>> +	struct fastrpc_msg fastrpc;
>>>> +	/** @buf: Kernel virtual address of the payload buffer */
>>>> +	void *buf;
>>>> +	/** @phys: Physical/DMA address of the payload buffer */
>>>> +	u64 phys;
>>>> +	/** @ret: Return value from the remote processor */
>>>> +	int ret;
>>>> +	/** @fastrpc_ctx: Back-pointer to the owning invocation context */
>>>> +	struct fastrpc_invoke_context *fastrpc_ctx;
>>>> +	/** @file_priv: DRM file private data for GEM object lookup */
>>>> +	struct drm_file *file_priv;
>>>> +};
>>>> +
>>>> +/**
>>>> + * struct fastrpc_invoke_context - Remote procedure call invocation context
>>>> + *
>>>> + * Maintains all state for a single remote procedure call, including buffer
>>>> + * management, synchronisation, and result handling.
>>>> + */
>>>> +struct fastrpc_invoke_context {
>>>> +	/** @node: List node for linking contexts in a queue */
>>>> +	struct list_head node;
>>>> +	/** @ctxid: Unique context identifier (XArray key shifted left by 4) */
>>>> +	u64 ctxid;
>>>> +	/** @inbufs: Number of input buffers */
>>>> +	int inbufs;
>>>> +	/** @outbufs: Number of output buffers */
>>>> +	int outbufs;
>>>> +	/** @handles: Number of DMA-BUF handle arguments */
>>>> +	int handles;
>>>> +	/** @nscalars: Total number of scalar arguments */
>>>> +	int nscalars;
>>>> +	/** @nbufs: Total number of buffer arguments (inbufs + outbufs) */
>>>> +	int nbufs;
>>>
>>> If it is inbufs + outbufs, why do you need it here?
>>>
>>>> +	/** @pid: Process ID of the calling process */
>>>> +	int pid;
>>>> +	/** @retval: Return value from the remote invocation */
>>>> +	int retval;
>>>> +	/** @metalen: Length of the FastRPC metadata header in bytes */
>>>> +	int metalen;
>>>
>>> size_t, also why do you need it?
>>>
>>>> +	/** @remote_session_id: Session identifier on the remote processor */
>>>> +	int remote_session_id;
>>>> +	/** @pd: Protection domain identifier encoded into the context ID */
>>>> +	int pd;
>>>> +	/** @type: Invocation type (e.g. FASTRPC_RMID_INVOKE_DYNAMIC) */
>>>> +	int type;
>>>> +	/** @sc: Scalars value encoding in/out buffer counts */
>>>> +	u32 sc;
>>>
>>> How is this different from the counts above?
>> sc carries the method id and handle counts. The reason to maintain count
>> separately is to avoid calculating it again and again.>
> 
> Is it just a sum of several values or something more complicated?
just the sum, I'll drop it if it's not really useful.>
>>>> +	/** @handle: Handle of the remote method being invoked */
>>>> +	u32 handle;
>>>> +	/** @crc: Pointer to CRC values for data integrity checking */
>>>> +	u32 *crc;
>>>
>>> Add it later. It's unused. Drop all unused fields.
>> ack.>
>>>> +	/** @fdlist: Pointer to array of DMA-BUF file descriptors */
>>>> +	u64 *fdlist;
>>>
>>> Why do you need DMA-BUFs in the invocation context? They all should be
>>> GEM buffers.
>> the reason is that the users are dependent on FDs as they can import
>> buffers allocated from anywhere and there are DSP APIs which takes fd as
>> an argument, so they might end up using the same in their skel
>> implementation.>
> 
> No, DSP API can't take FD, they don't quite cross the OS and IOMMU
> boundary. It's the userspace library API. Which might be improved,
> rewritten, implemented underneath, etc. For the kernel side please,
> pass _only_ GEM handles + offsets.
Yes, but with the current DSP design, DSP APIs take FD just because of
client/user design. On fastrpc, users could bring FD from any source,
register it with fastrpc and pass it on to DSP. The major problem is
what I mentioned above, where the user application passes the FD as an
integer argument and the fastrpc library is not able to identify if that
int is an fd or some other data.>
>>>> +	/** @pkt_size: Total payload size in bytes */
>>>> +	u64 pkt_size;
>>>> +	/** @aligned_pkt_size: Page-aligned payload size for GEM allocation */
>>>> +	u64 aligned_pkt_size;
>>>> +	/** @list: Array of invoke buffer descriptors */
>>>> +	struct fastrpc_invoke_buf *list;
>>>> +	/** @pages: Array of physical page descriptors for all arguments */
>>>> +	struct fastrpc_phy_page *pages;
>>>> +	/** @input_pages: Array of physical page descriptors for input buffers */
>>>> +	struct fastrpc_phy_page *input_pages;
>>>
>>> I think you are trying to bring all the complexity from the old driver
>>> with no added benefit. Please don't. Use the existing memory manager.
>>> Let it handle all the gory details. If something is not there, we should
>>> consider extending GEM instead.
>> I'm not changing the metadata format as the DSP might not understand the
>> messages if we modify it.
> 
> Well, it's up to you to know if DSP will understand the message or not.
> The probability ("might not") is not suitable here. Anyway, let's get
> rid of the various data formats first, then maybe some of the items will
> go away on their own.
ack>
>> Also, the fd is still being used because of
>> the client dependency on it. I'll check if there is any other logic that
>> needs alteration here.>
> 
> If the client keeps on passing FD to the library calls, you can map
> FD to GEM handles in the library code.
I hope the int argument part mentioned above answers this.>
>>>> +
>>>> +static int fastrpc_context_get_id(struct fastrpc_invoke_context *ctx, struct qda_dev *qdev)
>>>> +{
>>>> +	int ret;
>>>> +	u32 id;
>>>> +
>>>> +	if (!qdev)
>>>> +		return -EINVAL;
>>>> +
>>>> +	ret = xa_alloc(&qdev->ctx_xa, &id, ctx, xa_limit_32b, GFP_KERNEL);
>>>> +	if (ret)
>>>> +		return ret;
>>>> +
>>>> +	ctx->ctxid = id << 4;
>>>
>>> Why is it being shifted?
>> this is to accommodate PD type>
> 
> Not really an answer.
Okay, let me bring the ctxid layout that DSP expects:

[11:4] = CCCCCCCC (context ID)
[3:0]  = PPPP (PD type)

Based on this PD type, DSP will decide where to queue the message.
> 
>>>> +	return 0;
>>>> +}
>>>> +
>>>
>>
> 


^ permalink raw reply

* Re: [PATCH v9 2/6] mm/memory-failure: surface unhandlable kernel pages as -ENOTRECOVERABLE
From: Breno Leitao @ 2026-06-10  9:53 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: Miaohe Lin, Andrew Morton, Lorenzo Stoakes, Vlastimil Babka,
	Mike Rapoport, Suren Baghdasaryan, Michal Hocko, Shuah Khan,
	Naoya Horiguchi, Jonathan Corbet, Shuah Khan, Liam R. Howlett,
	lance.yang, Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	linux-mm, linux-kernel, linux-doc, linux-kselftest,
	linux-trace-kernel, kernel-team
In-Reply-To: <cf2bb24e-9341-4ded-b238-064dca442a92@kernel.org>

On Tue, Jun 09, 2026 at 08:41:25PM +0200, David Hildenbrand (Arm) wrote:
> On 6/9/26 18:15, Breno Leitao wrote:
> > On Tue, Jun 09, 2026 at 04:41:01PM +0200, David Hildenbrand (Arm) wrote:

> >> a) HWPoisonKernelOwned: this is not the common style for us to name functions.
> >>
> >> is_kernel_owned_page() or sth like that would do.
> > 
> > Ack, I will rename it is_kernel_owned_page()
> > 
> > In my defence, most of the functions similar to HWPoisonKernelOwned()
> > have this name format, and I had this discussion earlier (with Lance?
> > I think). Here are the similar function names in that file:
> > 
> >  * HWPoisonHandlable
> >  * PageHWPoisonTakenOff()
> >  * SetPageHWPoisonTakenOff
> 
> Some of these probably date back to our old way of handling page flags and
> things, like PageLRU.
> 
> But we really should stop :)

Ack!

> > I will update in the new version.
> 
> Thanks! Probably best to wait a bit, the merge window is coming up either way,
> so this will have to wait a bit either way.

no hurry at all,

Thanks for the review,
--breno

^ permalink raw reply

* Re: [PATCH v1] arm64: errata: Mitigate TLBI errata on NVIDIA Olympus CPU
From: Mark Rutland @ 2026-06-10 10:00 UTC (permalink / raw)
  To: Shanker Donthineni
  Cc: Catalin Marinas, Will Deacon, linux-arm-kernel, linux-kernel,
	linux-doc, Vikram Sethi, Jason Sequeira, Alok Mooley, Rich Wiley
In-Reply-To: <20260609234044.3945938-1-sdonthineni@nvidia.com>

On Tue, Jun 09, 2026 at 06:40:44PM -0500, Shanker Donthineni wrote:
> NVIDIA Olympus cores are affected by the TLBI completion issue tracked as
> CVE-2025-10263. The existing ARM64_ERRATUM_4118414 handling already uses
> ARM64_WORKAROUND_REPEAT_TLBI to issue an additional broadcast TLBI;DSB
> sequence and ensure affected memory write effects are globally observed.
> 
> Add MIDR_NVIDIA_OLYMPUS to the repeat-TLBI match list so the same
> mitigation is enabled on affected Olympus systems. Also document the
> NVIDIA Olympus erratum in the arm64 silicon errata table and list it in
> the Kconfig help text.
> 
> Signed-off-by: Shanker Donthineni <sdonthineni@nvidia.com>
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Mark Rutland <mark.rutland@arm.com>
> ---
> Note: This patch depends on the following series as a prerequisite:
> https://lore.kernel.org/all/20260609101203.1512409-1-mark.rutland@arm.com/

FWIW:

Acked-by: Mark Rutland <mark.rutland@arm.com>

I'll keep note of this when backporting the other patches; as a
prerequisite we'll also need to pick commit

  e185c8a0d842 ("arm64: cputype: Add NVIDIA Olympus definitions")

I have one minor comment below, but that's more for Catalin/Will, and
doesn't require a respin.

>  Documentation/arch/arm64/silicon-errata.rst | 2 ++
>  arch/arm64/Kconfig                          | 3 ++-
>  arch/arm64/kernel/cpu_errata.c              | 1 +
>  3 files changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst
> index a01e916ede17..ad09bbb10da8 100644
> --- a/Documentation/arch/arm64/silicon-errata.rst
> +++ b/Documentation/arch/arm64/silicon-errata.rst
> @@ -298,6 +298,8 @@ stable kernels.
>  +----------------+-----------------+-----------------+-----------------------------+
>  | NVIDIA         | Carmel Core     | N/A             | NVIDIA_CARMEL_CNP_ERRATUM   |
>  +----------------+-----------------+-----------------+-----------------------------+
> +| NVIDIA         | Olympus core    | T410-OLY-1029   | ARM64_ERRATUM_4118414       |
> ++----------------+-----------------+-----------------+-----------------------------+
>  | NVIDIA         | T241 GICv3/4.x  | T241-FABRIC-4   | N/A                         |
>  +----------------+-----------------+-----------------+-----------------------------+
>  | NVIDIA         | T241 MPAM       | T241-MPAM-1     | N/A                         |
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 48233b54c482..c65cef81be86 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -1155,7 +1155,7 @@ config ARM64_ERRATUM_4193714
>  	  If unsure, say Y.
>  
>  config ARM64_ERRATUM_4118414
> -	bool "Cortex-*/Neoverse-*/C1-*: Completion of affected memory accesses might not be guaranteed by completion of a TLBI"
> +	bool "Cortex-*/Neoverse-*/C1-*/Olympus: Completion of affected memory accesses might not be guaranteed by completion of a TLBI"

As this is getting increasingly long, maybe it's worth reducing this to
"Various" in the title, i.e.

	bool "Various: Completion of affected memory accesses might not be guaranteed by completion of a TLBI"

... but as above, no need to respin for that.

Mark.

>  	default y
>  	select ARM64_WORKAROUND_REPEAT_TLBI
>  	help
> @@ -1182,6 +1182,7 @@ config ARM64_ERRATUM_4118414
>  	  * ARM Neoverse-V2 erratum 4193787
>  	  * ARM Neoverse-V3 erratum 4193784
>  	  * ARM Neoverse-V3AE erratum 4193784
> +	  * NVIDIA Olympus erratum T410-OLY-1029
>  
>  	  On affected cores, some memory accesses might not be completed by
>  	  broadcast TLB invalidation.
> diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
> index fe6fe5de495b..d597896b0f7f 100644
> --- a/arch/arm64/kernel/cpu_errata.c
> +++ b/arch/arm64/kernel/cpu_errata.c
> @@ -364,6 +364,7 @@ static const struct arm64_cpu_capabilities arm64_repeat_tlbi_list[] = {
>  			MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
>  			MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3),
>  			MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V3AE),
> +			MIDR_ALL_VERSIONS(MIDR_NVIDIA_OLYMPUS),
>  			{}
>  		})),
>  	},
> -- 
> 2.43.0
> 

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox