DPDK-dev Archive on lore.kernel.org

DPDK-dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v5 04/11] bpf: add cBPF origin to rte_bpf_load_ex
From: Marat Khalili @ 2026-06-12  8:42 UTC (permalink / raw)
  To: Konstantin Ananyev; +Cc: dev
In-Reply-To: <20260612084219.38399-1-marat.khalili@huawei.com>

Add cBPF origin to rte_bpf_load_ex to allow loading PCAP filters and
other cBPF code through the unified interface.

Note that for the no-libpcap stub of rte_bpf_convert, the behavior when
called with a NULL program has changed from setting rte_errno to EINVAL
to setting it to ENOTSUP. Since both cases return NULL, callers relying
on pointer checking are unaffected.

Signed-off-by: Marat Khalili <marat.khalili@huawei.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 lib/bpf/bpf_convert.c | 81 +++++++++++++++++++++++++++++++++++++++++--
 lib/bpf/bpf_impl.h    | 11 ++++++
 lib/bpf/bpf_load.c    | 12 ++++++-
 lib/bpf/bpf_stub.c    | 27 ---------------
 lib/bpf/meson.build   | 11 +++---
 lib/bpf/rte_bpf.h     |  8 ++++-
 6 files changed, 113 insertions(+), 37 deletions(-)
 delete mode 100644 lib/bpf/bpf_stub.c

diff --git a/lib/bpf/bpf_convert.c b/lib/bpf/bpf_convert.c
index 953ca80670..e8074b13d0 100644
--- a/lib/bpf/bpf_convert.c
+++ b/lib/bpf/bpf_convert.c
@@ -9,6 +9,12 @@
  * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
  */
 
+#include "bpf_impl.h"
+#include <eal_export.h>
+#include <rte_errno.h>
+
+#ifdef RTE_HAS_LIBPCAP
+
 #include <assert.h>
 #include <errno.h>
 #include <stdbool.h>
@@ -17,17 +23,14 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include <eal_export.h>
 #include <rte_common.h>
 #include <rte_bpf.h>
 #include <rte_log.h>
 #include <rte_malloc.h>
-#include <rte_errno.h>
 
 #include <pcap/pcap.h>
 #include <pcap/bpf.h>
 
-#include "bpf_impl.h"
 #include "bpf_def.h"
 
 #ifndef BPF_MAXINSNS
@@ -572,3 +575,75 @@ rte_bpf_convert(const struct bpf_program *prog)
 
 	return prm;
 }
+
+void
+__rte_bpf_convert_cleanup(struct __rte_bpf_load *load)
+{
+	free(load->ins);
+}
+
+int
+__rte_bpf_convert(struct __rte_bpf_load *load)
+{
+	struct rte_bpf_prm_ex *const prm = &load->prm;
+	uint32_t nb_ins = 0;
+	int ret;
+
+	RTE_ASSERT(prm->origin == RTE_BPF_ORIGIN_CBPF);
+
+	if (prm->cbpf.ins == NULL || prm->cbpf.nb_ins == 0)
+		return -EINVAL;
+
+	/* 1st pass: calculate the eBPF program length */
+	ret = bpf_convert_filter(prm->cbpf.ins, prm->cbpf.nb_ins, NULL, &nb_ins);
+	if (ret < 0) {
+		RTE_BPF_LOG_FUNC_LINE(ERR, "cannot get eBPF length");
+		return ret;
+	}
+
+	RTE_ASSERT(load->ins == NULL);
+	load->ins = malloc(nb_ins * sizeof(load->ins[0]));
+	if (load->ins == NULL)
+		return -ENOMEM;
+
+	/* 2nd pass: remap cBPF to eBPF instructions  */
+	ret = bpf_convert_filter(prm->cbpf.ins, prm->cbpf.nb_ins, load->ins, &nb_ins);
+	if (ret < 0) {
+		RTE_BPF_LOG_FUNC_LINE(ERR, "cannot convert cBPF to eBPF");
+		return ret;
+	}
+
+	prm->origin = RTE_BPF_ORIGIN_RAW;
+	prm->raw.ins = load->ins;
+	prm->raw.nb_ins = nb_ins;
+
+	return 0;
+}
+
+#else /* RTE_HAS_LIBPCAP */
+
+RTE_EXPORT_SYMBOL(rte_bpf_convert)
+struct rte_bpf_prm *
+rte_bpf_convert(const struct bpf_program *prog)
+{
+	RTE_SET_USED(prog);
+	RTE_BPF_LOG_FUNC_LINE(ERR, "not supported, rebuild with libpcap installed");
+	rte_errno = ENOTSUP;
+	return NULL;
+}
+
+void
+__rte_bpf_convert_cleanup(struct __rte_bpf_load *load)
+{
+	RTE_ASSERT(load->ins == NULL);
+}
+
+int
+__rte_bpf_convert(struct __rte_bpf_load *load)
+{
+	RTE_SET_USED(load);
+	RTE_BPF_LOG_FUNC_LINE(ERR, "not supported, rebuild with libpcap installed");
+	return -ENOTSUP;
+}
+
+#endif /* RTE_HAS_LIBPCAP */
diff --git a/lib/bpf/bpf_impl.h b/lib/bpf/bpf_impl.h
index 4a98b33730..92d03583d9 100644
--- a/lib/bpf/bpf_impl.h
+++ b/lib/bpf/bpf_impl.h
@@ -21,6 +21,9 @@ struct rte_bpf {
 struct __rte_bpf_load {
 	struct rte_bpf_prm_ex prm;
 
+	/* Conversion from cBPF. */
+	struct ebpf_insn *ins;
+
 	/* Loading ELF and applying relocations. */
 	int elf_fd;  /* ELF fd, must be negative (not zero) by default. */
 	void *elf;  /* Using void to avoid dependency on libelf. */
@@ -34,6 +37,14 @@ struct __rte_bpf_load {
  * to avoid potential name conflict with other libraries.
  */
 
+/* Free temporary resources created by converting from cBPF to eBPF. */
+void
+__rte_bpf_convert_cleanup(struct __rte_bpf_load *load);
+
+/* Convert program from cBPF to eBPF. */
+int
+__rte_bpf_convert(struct __rte_bpf_load *load);
+
 /* Free temporary resources created by opening ELF. */
 void
 __rte_bpf_load_elf_cleanup(struct __rte_bpf_load *load);
diff --git a/lib/bpf/bpf_load.c b/lib/bpf/bpf_load.c
index f63093b9bc..e3265e97ff 100644
--- a/lib/bpf/bpf_load.c
+++ b/lib/bpf/bpf_load.c
@@ -240,6 +240,9 @@ load_try(struct __rte_bpf_load *load, const struct rte_bpf_prm_ex *app_prm)
 	switch (load->prm.origin) {
 	case RTE_BPF_ORIGIN_RAW:
 		break;
+	case RTE_BPF_ORIGIN_CBPF:
+		rc = rc < 0 ? rc : __rte_bpf_convert(load);
+		break;
 	case RTE_BPF_ORIGIN_ELF_FILE:
 		rc = rc < 0 ? rc : __rte_bpf_load_elf_file(load);
 		rc = rc < 0 ? rc : __rte_bpf_load_elf_code(load);
@@ -254,6 +257,13 @@ load_try(struct __rte_bpf_load *load, const struct rte_bpf_prm_ex *app_prm)
 	return rc;
 }
 
+static void
+load_cleanup(struct __rte_bpf_load *load)
+{
+	__rte_bpf_convert_cleanup(load);
+	__rte_bpf_load_elf_cleanup(load);
+}
+
 RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_bpf_load_ex, 26.11)
 struct rte_bpf *
 rte_bpf_load_ex(const struct rte_bpf_prm_ex *prm)
@@ -262,7 +272,7 @@ rte_bpf_load_ex(const struct rte_bpf_prm_ex *prm)
 
 	const int rc = load_try(&load, prm);
 
-	__rte_bpf_load_elf_cleanup(&load);
+	load_cleanup(&load);
 
 	RTE_ASSERT((rc < 0) == (load.bpf == NULL));
 
diff --git a/lib/bpf/bpf_stub.c b/lib/bpf/bpf_stub.c
deleted file mode 100644
index 4c329832c2..0000000000
--- a/lib/bpf/bpf_stub.c
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2018-2021 Intel Corporation
- */
-
-#include "bpf_impl.h"
-#include <eal_export.h>
-#include <rte_errno.h>
-
-/**
- * Contains stubs for unimplemented public API functions
- */
-
-#ifndef RTE_HAS_LIBPCAP
-RTE_EXPORT_SYMBOL(rte_bpf_convert)
-struct rte_bpf_prm *
-rte_bpf_convert(const struct bpf_program *prog)
-{
-	if (prog == NULL) {
-		rte_errno = EINVAL;
-		return NULL;
-	}
-
-	RTE_BPF_LOG_FUNC_LINE(ERR, "not supported, rebuild with libpcap installed");
-	rte_errno = ENOTSUP;
-	return NULL;
-}
-#endif
diff --git a/lib/bpf/meson.build b/lib/bpf/meson.build
index 4901b6ee14..7e8a300e3f 100644
--- a/lib/bpf/meson.build
+++ b/lib/bpf/meson.build
@@ -15,14 +15,16 @@ if arch_subdir == 'x86' and dpdk_conf.get('RTE_ARCH_32')
     subdir_done()
 endif
 
-sources = files('bpf.c',
+sources = files(
+        'bpf.c',
+        'bpf_convert.c',
         'bpf_dump.c',
         'bpf_exec.c',
         'bpf_load.c',
         'bpf_load_elf.c',
         'bpf_pkt.c',
-        'bpf_stub.c',
-        'bpf_validate.c')
+        'bpf_validate.c',
+)
 
 if arch_subdir == 'x86' and dpdk_conf.get('RTE_ARCH_64')
     sources += files('bpf_jit_x86.c')
@@ -45,8 +47,7 @@ else
 endif
 
 if dpdk_conf.has('RTE_HAS_LIBPCAP')
-    sources += files('bpf_convert.c')
     ext_deps += pcap_dep
 else
-    warning('libpcap is missing, rte_bpf_convert API will be disabled')
+    warning('libpcap is missing, cBPF API will be disabled')
 endif
diff --git a/lib/bpf/rte_bpf.h b/lib/bpf/rte_bpf.h
index 0e7eaa3c18..da2bdea7e0 100644
--- a/lib/bpf/rte_bpf.h
+++ b/lib/bpf/rte_bpf.h
@@ -95,10 +95,12 @@ struct rte_bpf_xsym {
  */
 enum rte_bpf_origin {
 	RTE_BPF_ORIGIN_RAW,		/**< code loaded from raw array */
-	RTE_BPF_ORIGIN_RESERVED,	/**< reserved for cBPF */
+	RTE_BPF_ORIGIN_CBPF,		/**< code converted from cbpf */
 	RTE_BPF_ORIGIN_ELF_FILE,	/**< code loaded from elf_file */
 };
 
+struct bpf_insn;
+
 /**
  * Input parameters for loading eBPF code, extensible version.
  *
@@ -117,6 +119,10 @@ struct rte_bpf_prm_ex {
 			const struct ebpf_insn *ins;  /**< eBPF instructions */
 			uint32_t nb_ins;  /**< number of instructions in ins */
 		} raw;
+		struct {
+			const struct bpf_insn *ins;  /**< cBPF instructions */
+			uint32_t nb_ins;  /**< number of instructions in ins */
+		} cbpf;
 		struct {
 			const char *path;  /**< path to the ELF file */
 			const char *section;  /**< ELF section with the code */
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 03/11] bpf: support up to 5 arguments
From: Marat Khalili @ 2026-06-12  8:42 UTC (permalink / raw)
  To: Konstantin Ananyev, Wathsala Vithanage; +Cc: dev
In-Reply-To: <20260612084219.38399-1-marat.khalili@huawei.com>

When using rte_bpf_load_ex allow up to 5 arguments for a BPF program.
Particularly useful for call-backs and other internal functions.

Signed-off-by: Marat Khalili <marat.khalili@huawei.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 lib/bpf/bpf.c           |  32 ++++++++++-
 lib/bpf/bpf_exec.c      | 124 +++++++++++++++++++++++++++++++++++++++-
 lib/bpf/bpf_impl.h      |   2 +-
 lib/bpf/bpf_jit_arm64.c |   2 +-
 lib/bpf/bpf_jit_x86.c   |   2 +-
 lib/bpf/bpf_load.c      |   8 ++-
 lib/bpf/bpf_validate.c  |  45 +++++++++++----
 lib/bpf/rte_bpf.h       | 121 +++++++++++++++++++++++++++++++++++++--
 8 files changed, 311 insertions(+), 25 deletions(-)

diff --git a/lib/bpf/bpf.c b/lib/bpf/bpf.c
index 5239b3e11e..67dededd9a 100644
--- a/lib/bpf/bpf.c
+++ b/lib/bpf/bpf.c
@@ -16,8 +16,8 @@ void
 rte_bpf_destroy(struct rte_bpf *bpf)
 {
 	if (bpf != NULL) {
-		if (bpf->jit.func != NULL)
-			munmap(bpf->jit.func, bpf->jit.sz);
+		if (bpf->jit.raw != NULL)
+			munmap(bpf->jit.raw, bpf->jit.sz);
 		munmap(bpf, bpf->sz);
 	}
 }
@@ -29,7 +29,33 @@ rte_bpf_get_jit(const struct rte_bpf *bpf, struct rte_bpf_jit *jit)
 	if (bpf == NULL || jit == NULL)
 		return -EINVAL;
 
-	jit[0] = bpf->jit;
+	if (bpf->prm.nb_prog_arg != 1) {
+		RTE_BPF_LOG_LINE(ERR,
+			"this program takes %d arguments, use rte_bpf_get_jit_ex",
+			bpf->prm.nb_prog_arg);
+		return -EINVAL;
+	}
+
+	*jit = (struct rte_bpf_jit) {
+		.func = bpf->jit.raw,
+		.sz = bpf->jit.sz,
+	};
+	return 0;
+}
+
+RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_bpf_get_jit_ex, 26.11)
+int
+rte_bpf_get_jit_ex(const struct rte_bpf *bpf, struct rte_bpf_jit_ex *jit)
+{
+	if (bpf == NULL || jit == NULL)
+		return -EINVAL;
+
+	if (bpf->jit.raw == NULL) {
+		RTE_BPF_LOG_LINE(ERR, "no JIT-compiled version");
+		return -ENOENT;
+	}
+
+	*jit = bpf->jit;
 	return 0;
 }
 
diff --git a/lib/bpf/bpf_exec.c b/lib/bpf/bpf_exec.c
index e4668ba10b..20f8d0fa29 100644
--- a/lib/bpf/bpf_exec.c
+++ b/lib/bpf/bpf_exec.c
@@ -10,6 +10,7 @@
 #include <rte_log.h>
 #include <rte_debug.h>
 #include <rte_byteorder.h>
+#include <rte_errno.h>
 
 #include "bpf_impl.h"
 
@@ -502,6 +503,12 @@ rte_bpf_exec_burst(const struct rte_bpf *bpf, void *ctx[], uint64_t rc[],
 	uint64_t reg[EBPF_REG_NUM];
 	uint64_t stack[MAX_BPF_STACK_SIZE / sizeof(uint64_t)];
 
+	if (bpf->prm.nb_prog_arg != 1) {
+		/* Use rte_bpf_exec_burst_ex with this program. */
+		rte_errno = EINVAL;
+		return 0;
+	}
+
 	for (i = 0; i != num; i++) {
 
 		reg[EBPF_REG_1] = (uintptr_t)ctx[i];
@@ -513,12 +520,127 @@ rte_bpf_exec_burst(const struct rte_bpf *bpf, void *ctx[], uint64_t rc[],
 	return i;
 }
 
+static uint32_t
+exec_vm_burst_ex(const struct rte_bpf *bpf, const struct rte_bpf_prog_ctx *ctx,
+	uint64_t rc[], uint32_t num)
+{
+	uint32_t i;
+	uint64_t reg[EBPF_REG_NUM];
+	uint64_t stack[MAX_BPF_STACK_SIZE / sizeof(uint64_t)];
+
+	for (i = 0; i != num; i++) {
+
+		switch (bpf->prm.nb_prog_arg) {
+		case 5:
+			reg[EBPF_REG_5] = ctx[i].arg[4].u64;
+			/* FALLTHROUGH */
+		case 4:
+			reg[EBPF_REG_4] = ctx[i].arg[3].u64;
+			/* FALLTHROUGH */
+		case 3:
+			reg[EBPF_REG_3] = ctx[i].arg[2].u64;
+			/* FALLTHROUGH */
+		case 2:
+			reg[EBPF_REG_2] = ctx[i].arg[1].u64;
+			/* FALLTHROUGH */
+		case 1:
+			reg[EBPF_REG_1] = ctx[i].arg[0].u64;
+			/* FALLTHROUGH */
+		case 0:
+			break;
+		}
+
+		reg[EBPF_REG_10] = (uintptr_t)(stack + RTE_DIM(stack));
+
+		rc[i] = bpf_exec(bpf, reg);
+	}
+
+	return i;
+}
+
+static uint32_t
+exec_jit_burst_ex(const struct rte_bpf *bpf, const struct rte_bpf_prog_ctx *ctx,
+	uint64_t rc[], uint32_t num)
+{
+	uint32_t i = 0;
+	const struct rte_bpf_jit_ex jit = bpf->jit;
+
+	/*
+	 * Fast path: assumes application pre-validated RTE_BPF_EXEC_FLAG_JIT
+	 * and successful JIT generation. No explicit NULL checks here.
+	 */
+	switch (bpf->prm.nb_prog_arg) {
+	case 0:
+		for (i = 0; i != num; i++)
+			rc[i] = jit.func0();
+		break;
+	case 1:
+		for (i = 0; i != num; i++) {
+			const union rte_bpf_func_arg *const arg = ctx[i].arg;
+			rc[i] = jit.func1(arg[0]);
+		}
+		break;
+	case 2:
+		for (i = 0; i != num; i++) {
+			const union rte_bpf_func_arg *const arg = ctx[i].arg;
+			rc[i] = jit.func2(arg[0], arg[1]);
+		}
+		break;
+	case 3:
+		for (i = 0; i != num; i++) {
+			const union rte_bpf_func_arg *const arg = ctx[i].arg;
+			rc[i] = jit.func3(arg[0], arg[1], arg[2]);
+		}
+		break;
+	case 4:
+		for (i = 0; i != num; i++) {
+			const union rte_bpf_func_arg *const arg = ctx[i].arg;
+			rc[i] = jit.func4(arg[0], arg[1], arg[2], arg[3]);
+		}
+		break;
+	case 5:
+		for (i = 0; i != num; i++) {
+			const union rte_bpf_func_arg *const arg = ctx[i].arg;
+			rc[i] = jit.func5(arg[0], arg[1], arg[2], arg[3], arg[4]);
+		}
+		break;
+	}
+
+	return i;
+}
+
+RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_bpf_exec_burst_ex, 26.11)
+uint32_t
+rte_bpf_exec_burst_ex(const struct rte_bpf *bpf, const struct rte_bpf_prog_ctx *ctx,
+	uint64_t rc[], uint32_t num, uint64_t flags)
+{
+	if ((flags & ~RTE_BPF_EXEC_FLAG_MASK) != 0) {
+		rte_errno = EINVAL;
+		return 0;
+	}
+
+	return (flags & RTE_BPF_EXEC_FLAG_JIT) != 0 ?
+		exec_jit_burst_ex(bpf, ctx, rc, num) :
+		exec_vm_burst_ex(bpf, ctx, rc, num);
+}
+
 RTE_EXPORT_SYMBOL(rte_bpf_exec)
 uint64_t
 rte_bpf_exec(const struct rte_bpf *bpf, void *ctx)
 {
-	uint64_t rc;
+	uint64_t rc = UINT64_MAX;
 
 	rte_bpf_exec_burst(bpf, &ctx, &rc, 1);
 	return rc;
 }
+
+RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_bpf_exec_ex, 26.11)
+uint64_t
+rte_bpf_exec_ex(const struct rte_bpf *bpf, const struct rte_bpf_prog_ctx *ctx,
+		uint64_t flags)
+{
+	uint64_t rc = UINT64_MAX;
+
+	rte_bpf_exec_burst_ex(bpf, ctx, &rc, 1, flags);
+	return rc;
+}
diff --git a/lib/bpf/bpf_impl.h b/lib/bpf/bpf_impl.h
index 1cee109bc9..4a98b33730 100644
--- a/lib/bpf/bpf_impl.h
+++ b/lib/bpf/bpf_impl.h
@@ -12,7 +12,7 @@
 
 struct rte_bpf {
 	struct rte_bpf_prm_ex prm;
-	struct rte_bpf_jit jit;
+	struct rte_bpf_jit_ex jit;
 	size_t sz;
 	uint32_t stack_sz;
 };
diff --git a/lib/bpf/bpf_jit_arm64.c b/lib/bpf/bpf_jit_arm64.c
index 9e5e142c13..ba7ae4d680 100644
--- a/lib/bpf/bpf_jit_arm64.c
+++ b/lib/bpf/bpf_jit_arm64.c
@@ -1471,7 +1471,7 @@ __rte_bpf_jit_arm64(struct rte_bpf *bpf)
 	/* Flush the icache */
 	__builtin___clear_cache((char *)ctx.ins, (char *)(ctx.ins + ctx.idx));
 
-	bpf->jit.func = (void *)ctx.ins;
+	bpf->jit.raw = ctx.ins;
 	bpf->jit.sz = size;
 
 	goto finish;
diff --git a/lib/bpf/bpf_jit_x86.c b/lib/bpf/bpf_jit_x86.c
index 6f4235d434..54eb279643 100644
--- a/lib/bpf/bpf_jit_x86.c
+++ b/lib/bpf/bpf_jit_x86.c
@@ -1568,7 +1568,7 @@ __rte_bpf_jit_x86(struct rte_bpf *bpf)
 	if (rc != 0)
 		munmap(st.ins, st.sz);
 	else {
-		bpf->jit.func = (void *)st.ins;
+		bpf->jit.raw = st.ins;
 		bpf->jit.sz = st.sz;
 	}
 
diff --git a/lib/bpf/bpf_load.c b/lib/bpf/bpf_load.c
index a6793b2c94..f63093b9bc 100644
--- a/lib/bpf/bpf_load.c
+++ b/lib/bpf/bpf_load.c
@@ -149,7 +149,8 @@ rte_bpf_load(const struct rte_bpf_prm *prm)
 			.raw.nb_ins = prm->nb_ins,
 			.xsym = prm->xsym,
 			.nb_xsym = prm->nb_xsym,
-			.prog_arg = prm->prog_arg,
+			.prog_arg[0] = prm->prog_arg,
+			.nb_prog_arg = 1,
 		});
 }
 
@@ -170,7 +171,8 @@ rte_bpf_elf_load(const struct rte_bpf_prm *prm, const char *fname,
 			.elf_file.section = sname,
 			.xsym = prm->xsym,
 			.nb_xsym = prm->nb_xsym,
-			.prog_arg = prm->prog_arg,
+			.prog_arg[0] = prm->prog_arg,
+			.nb_prog_arg = 1,
 		});
 }
 
@@ -271,6 +273,6 @@ rte_bpf_load_ex(const struct rte_bpf_prm_ex *prm)
 	}
 
 	RTE_BPF_LOG_FUNC_LINE(INFO, "successfully creates %p(jit={.func=%p,.sz=%zu});",
-		load.bpf, load.bpf->jit.func, load.bpf->jit.sz);
+		load.bpf, load.bpf->jit.raw, load.bpf->jit.sz);
 	return load.bpf;
 }
diff --git a/lib/bpf/bpf_validate.c b/lib/bpf/bpf_validate.c
index 5bfc59296d..bf8a4abb5a 100644
--- a/lib/bpf/bpf_validate.c
+++ b/lib/bpf/bpf_validate.c
@@ -2425,10 +2425,14 @@ evaluate(struct bpf_verifier *bvf)
 		.s = {.min = MAX_BPF_STACK_SIZE, .max = MAX_BPF_STACK_SIZE},
 	};
 
-	bvf->evst->rv[EBPF_REG_1].v = bvf->prm->prog_arg;
-	bvf->evst->rv[EBPF_REG_1].mask = UINT64_MAX;
-	if (bvf->prm->prog_arg.type == RTE_BPF_ARG_RAW)
-		eval_max_bound(bvf->evst->rv + EBPF_REG_1, UINT64_MAX);
+	for (uint32_t pai = 0; pai != bvf->prm->nb_prog_arg; ++pai) {
+		struct bpf_reg_val *reg = &bvf->evst->rv[EBPF_REG_1 + pai];
+
+		reg->v = bvf->prm->prog_arg[pai];
+		reg->mask = UINT64_MAX;
+		if (reg->v.type == RTE_BPF_ARG_RAW)
+			eval_max_bound(reg, UINT64_MAX);
+	}
 
 	bvf->evst->rv[EBPF_REG_10] = rvfp;
 
@@ -2521,21 +2525,42 @@ evaluate(struct bpf_verifier *bvf)
 	return rc;
 }
 
+static bool
+prog_arg_is_valid(const struct rte_bpf_arg *prog_arg)
+{
+	/* check input argument type, don't allow mbuf ptr on 32-bit */
+	if (prog_arg->type != RTE_BPF_ARG_RAW &&
+			prog_arg->type != RTE_BPF_ARG_PTR &&
+			(sizeof(uint64_t) != sizeof(uintptr_t) ||
+			prog_arg->type != RTE_BPF_ARG_PTR_MBUF)) {
+		RTE_BPF_LOG_FUNC_LINE(ERR, "unsupported argument type");
+		return false;
+	}
+
+	return true;
+}
+
 int
 __rte_bpf_validate(const struct rte_bpf_prm_ex *prm, uint32_t *stack_sz)
 {
 	int32_t rc;
 	struct bpf_verifier bvf;
 
-	/* check input argument type, don't allow mbuf ptr on 32-bit */
-	if (prm->prog_arg.type != RTE_BPF_ARG_RAW &&
-			prm->prog_arg.type != RTE_BPF_ARG_PTR &&
-			(sizeof(uint64_t) != sizeof(uintptr_t) ||
-			prm->prog_arg.type != RTE_BPF_ARG_PTR_MBUF)) {
-		RTE_BPF_LOG_FUNC_LINE(ERR, "unsupported argument type");
+	if (prm->nb_prog_arg > EBPF_FUNC_MAX_ARGS) {
+		RTE_BPF_LOG_FUNC_LINE(ERR,
+			"support up to %u arguments, found %u",
+			EBPF_FUNC_MAX_ARGS, prm->nb_prog_arg);
 		return -ENOTSUP;
 	}
 
+	for (uint32_t pai = 0; pai != prm->nb_prog_arg; ++pai)
+		if (!prog_arg_is_valid(&prm->prog_arg[pai])) {
+			RTE_BPF_LOG_FUNC_LINE(ERR,
+				"unsupported argument %d (r%d) type",
+				pai, EBPF_REG_1 + pai);
+			return -ENOTSUP;
+		}
+
 	memset(&bvf, 0, sizeof(bvf));
 	bvf.prm = prm;
 	bvf.in = calloc(prm->raw.nb_ins, sizeof(bvf.in[0]));
diff --git a/lib/bpf/rte_bpf.h b/lib/bpf/rte_bpf.h
index bf58a41819..0e7eaa3c18 100644
--- a/lib/bpf/rte_bpf.h
+++ b/lib/bpf/rte_bpf.h
@@ -25,6 +25,11 @@
 extern "C" {
 #endif
 
+#define RTE_BPF_EXEC_FLAG_JIT	RTE_BIT64(0)	/**< use JIT-compiled version */
+
+/** Mask with all supported `RTE_BPF_EXEC_FLAG_*` flags set. */
+#define RTE_BPF_EXEC_FLAG_MASK  RTE_BPF_EXEC_FLAG_JIT
+
 /**
  * Possible types for function/BPF program arguments.
  */
@@ -122,7 +127,8 @@ struct rte_bpf_prm_ex {
 	/**< array of external symbols that eBPF code is allowed to reference */
 	uint32_t nb_xsym;  /**< number of elements in xsym */
 
-	struct rte_bpf_arg prog_arg;  /**< input arg description */
+	struct rte_bpf_arg prog_arg[EBPF_FUNC_MAX_ARGS];  /**< program arguments */
+	uint32_t nb_prog_arg;  /**< program argument count */
 };
 
 /**
@@ -138,13 +144,49 @@ struct rte_bpf_prm {
 };
 
 /**
- * Information about compiled into native ISA eBPF code.
+ * Information about compiled into native ISA eBPF code accepting 1 argument.
  */
 struct rte_bpf_jit {
 	uint64_t (*func)(void *); /**< JIT-ed native code */
 	size_t sz;                /**< size of JIT-ed code */
 };
 
+union rte_bpf_func_arg {
+	uint64_t u64;
+	void *ptr;
+};
+
+typedef uint64_t (*rte_bpf_jit_func0_t)(void);
+typedef uint64_t (*rte_bpf_jit_func1_t)(union rte_bpf_func_arg);
+typedef uint64_t (*rte_bpf_jit_func2_t)(union rte_bpf_func_arg, union rte_bpf_func_arg);
+typedef uint64_t (*rte_bpf_jit_func3_t)(union rte_bpf_func_arg, union rte_bpf_func_arg,
+	union rte_bpf_func_arg);
+typedef uint64_t (*rte_bpf_jit_func4_t)(union rte_bpf_func_arg, union rte_bpf_func_arg,
+	union rte_bpf_func_arg, union rte_bpf_func_arg);
+typedef uint64_t (*rte_bpf_jit_func5_t)(union rte_bpf_func_arg, union rte_bpf_func_arg,
+	union rte_bpf_func_arg, union rte_bpf_func_arg, union rte_bpf_func_arg);
+
+/**
+ * JIT-ed native code, member depends on number of program arguments.
+ */
+struct rte_bpf_jit_ex {
+	union {
+		void *raw;
+		rte_bpf_jit_func0_t func0;  /* nullary function */
+		rte_bpf_jit_func1_t func1;  /* unary function */
+		rte_bpf_jit_func2_t func2;  /* binary function */
+		rte_bpf_jit_func3_t func3;  /* ternary function */
+		rte_bpf_jit_func4_t func4;  /* quaternary function */
+		rte_bpf_jit_func5_t func5;  /* quinary function */
+	};
+	size_t sz;
+};
+
+/* Tuple of eBPF program arguments. */
+struct rte_bpf_prog_ctx {
+	union rte_bpf_func_arg arg[EBPF_FUNC_MAX_ARGS];
+};
+
 struct rte_bpf;
 
 /**
@@ -224,7 +266,7 @@ rte_bpf_elf_load(const struct rte_bpf_prm *prm, const char *fname,
 	__rte_malloc __rte_dealloc(rte_bpf_destroy, 1);
 
 /**
- * Execute given BPF bytecode.
+ * Execute given BPF bytecode accepting 1 argument.
  *
  * @param bpf
  *   handle for the BPF code to execute.
@@ -237,7 +279,29 @@ uint64_t
 rte_bpf_exec(const struct rte_bpf *bpf, void *ctx);
 
 /**
- * Execute given BPF bytecode over a set of input contexts.
+ * @warning
+ * @b EXPERIMENTAL: This API may change, or be removed, without prior notice.
+ *
+ * Execute given BPF bytecode accepting any number of arguments.
+ *
+ * @param bpf
+ *   handle for the BPF code to execute.
+ * @param ctx
+ *   program arguments tuple.
+ * @param flags
+ *   bitwise OR of `RTE_BPF_EXEC_FLAG_*` values controlling execution.
+ *   Flag RTE_BPF_EXEC_FLAG_JIT requires presence of JIT version (can be checked
+ *   with rte_bpf_get_jit_ex).
+ * @return
+ *   BPF execution return value.
+ */
+__rte_experimental
+uint64_t
+rte_bpf_exec_ex(const struct rte_bpf *bpf, const struct rte_bpf_prog_ctx *ctx,
+		uint64_t flags);
+
+/**
+ * Execute given BPF bytecode accepting 1 argument over a set of input contexts.
  *
  * @param bpf
  *   handle for the BPF code to execute.
@@ -255,7 +319,35 @@ rte_bpf_exec_burst(const struct rte_bpf *bpf, void *ctx[], uint64_t rc[],
 		uint32_t num);
 
 /**
- * Provide information about natively compiled code for given BPF handle.
+ * @warning
+ * @b EXPERIMENTAL: This API may change, or be removed, without prior notice.
+ *
+ * Execute given BPF program accepting any number of arguments over a set of
+ * input contexts.
+ *
+ * @param bpf
+ *   handle for the BPF code to execute.
+ * @param ctx
+ *   pointer to array of program argument tuples, can be NULL for nullary programs.
+ * @param rc
+ *   array of return values (one per input).
+ * @param num
+ *   number executions, number of elements in arrays ctx and rc[].
+ * @param flags
+ *   bitwise OR of `RTE_BPF_EXEC_FLAG_*` values controlling execution.
+ *   Flag RTE_BPF_EXEC_FLAG_JIT requires presence of JIT version (can be checked
+ *   with rte_bpf_get_jit_ex).
+ * @return
+ *   number of successfully processed inputs.
+ */
+__rte_experimental
+uint32_t
+rte_bpf_exec_burst_ex(const struct rte_bpf *bpf, const struct rte_bpf_prog_ctx *ctx,
+		uint64_t rc[], uint32_t num, uint64_t flags);
+
+/**
+ * Provide information about natively compiled code for given BPF program
+ * accepting 1 argument.
  *
  * @param bpf
  *   handle for the BPF code.
@@ -268,6 +360,25 @@ rte_bpf_exec_burst(const struct rte_bpf *bpf, void *ctx[], uint64_t rc[],
 int
 rte_bpf_get_jit(const struct rte_bpf *bpf, struct rte_bpf_jit *jit);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: This API may change, or be removed, without prior notice.
+ *
+ * Get function JIT-compiled from the BPF program.
+ *
+ * @param bpf
+ *   handle for the BPF code.
+ * @param jit
+ *   pointer to the struct rte_bpf_jit_ex.
+ * @return
+ *   - -EINVAL if the parameters are invalid.
+ *   - -ENOENT if there is no JIT-compiled version.
+ *   - Zero if operation completed successfully.
+ */
+__rte_experimental
+int
+rte_bpf_get_jit_ex(const struct rte_bpf *bpf, struct rte_bpf_jit_ex *jit);
+
 /**
  * Dump epf instructions to a file.
  *
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 01/11] bpf: make logging prefixes more consistent
From: Marat Khalili @ 2026-06-12  8:42 UTC (permalink / raw)
  To: Konstantin Ananyev, Wathsala Vithanage; +Cc: dev
In-Reply-To: <20260612084219.38399-1-marat.khalili@huawei.com>

Logging in lib/bpf is inconsistent: some places use `%s()`, other just
`%s` for `__func__`.

Introduce new macro for logging prefixed with function name and use it
everywhere function name without arguments is prefixed to the log line.

Signed-off-by: Marat Khalili <marat.khalili@huawei.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 lib/bpf/bpf_convert.c   | 18 +++++++++---------
 lib/bpf/bpf_impl.h      |  3 +++
 lib/bpf/bpf_jit_arm64.c |  4 ++--
 lib/bpf/bpf_load.c      |  2 +-
 lib/bpf/bpf_load_elf.c  |  2 +-
 lib/bpf/bpf_stub.c      |  6 ++----
 lib/bpf/bpf_validate.c  | 25 ++++++++++++-------------
 7 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/lib/bpf/bpf_convert.c b/lib/bpf/bpf_convert.c
index 86e703299d..953ca80670 100644
--- a/lib/bpf/bpf_convert.c
+++ b/lib/bpf/bpf_convert.c
@@ -247,8 +247,8 @@ static int bpf_convert_filter(const struct bpf_insn *prog, size_t len,
 	uint8_t bpf_src;
 
 	if (len > BPF_MAXINSNS) {
-		RTE_BPF_LOG_LINE(ERR, "%s: cBPF program too long (%zu insns)",
-			    __func__, len);
+		RTE_BPF_LOG_FUNC_LINE(ERR, "cBPF program too long (%zu insns)",
+			    len);
 		return -EINVAL;
 	}
 
@@ -483,8 +483,8 @@ static int bpf_convert_filter(const struct bpf_insn *prog, size_t len,
 
 			/* Unknown instruction. */
 		default:
-			RTE_BPF_LOG_LINE(ERR, "%s: Unknown instruction!: %#x",
-				    __func__, fp->code);
+			RTE_BPF_LOG_FUNC_LINE(ERR, "Unknown instruction!: %#x",
+				    fp->code);
 			goto err;
 		}
 
@@ -528,7 +528,7 @@ rte_bpf_convert(const struct bpf_program *prog)
 	int ret;
 
 	if (prog == NULL) {
-		RTE_BPF_LOG_LINE(ERR, "%s: NULL program", __func__);
+		RTE_BPF_LOG_FUNC_LINE(ERR, "NULL program");
 		rte_errno = EINVAL;
 		return NULL;
 	}
@@ -536,13 +536,13 @@ rte_bpf_convert(const struct bpf_program *prog)
 	/* 1st pass: calculate the eBPF program length */
 	ret = bpf_convert_filter(prog->bf_insns, prog->bf_len, NULL, &ebpf_len);
 	if (ret < 0) {
-		RTE_BPF_LOG_LINE(ERR, "%s: cannot get eBPF length", __func__);
+		RTE_BPF_LOG_FUNC_LINE(ERR, "cannot get eBPF length");
 		rte_errno = -ret;
 		return NULL;
 	}
 
-	RTE_BPF_LOG_LINE(DEBUG, "%s: prog len cBPF=%u -> eBPF=%u",
-		    __func__, prog->bf_len, ebpf_len);
+	RTE_BPF_LOG_FUNC_LINE(DEBUG, "prog len cBPF=%u -> eBPF=%u",
+		    prog->bf_len, ebpf_len);
 
 	prm = rte_zmalloc("bpf_filter",
 			  sizeof(*prm) + ebpf_len * sizeof(*ebpf), 0);
@@ -557,7 +557,7 @@ rte_bpf_convert(const struct bpf_program *prog)
 	/* 2nd pass: remap cBPF to eBPF instructions  */
 	ret = bpf_convert_filter(prog->bf_insns, prog->bf_len, ebpf, &ebpf_len);
 	if (ret < 0) {
-		RTE_BPF_LOG_LINE(ERR, "%s: cannot convert cBPF to eBPF", __func__);
+		RTE_BPF_LOG_FUNC_LINE(ERR, "cannot convert cBPF to eBPF");
 		rte_free(prm);
 		rte_errno = -ret;
 		return NULL;
diff --git a/lib/bpf/bpf_impl.h b/lib/bpf/bpf_impl.h
index f5fa220984..fb5ec3c4d6 100644
--- a/lib/bpf/bpf_impl.h
+++ b/lib/bpf/bpf_impl.h
@@ -32,6 +32,9 @@ extern int rte_bpf_logtype;
 #define RTE_BPF_LOG_LINE(lvl, ...) \
 	RTE_LOG_LINE(lvl, BPF, __VA_ARGS__)
 
+#define RTE_BPF_LOG_FUNC_LINE(lvl, fmt, ...) \
+	RTE_LOG_LINE(lvl, BPF, "%s(): " fmt, __func__, ##__VA_ARGS__)
+
 static inline size_t
 bpf_size(uint32_t bpf_op_sz)
 {
diff --git a/lib/bpf/bpf_jit_arm64.c b/lib/bpf/bpf_jit_arm64.c
index a04ef33a9c..4bbb97da1b 100644
--- a/lib/bpf/bpf_jit_arm64.c
+++ b/lib/bpf/bpf_jit_arm64.c
@@ -98,8 +98,8 @@ check_invalid_args(struct a64_jit_ctx *ctx, uint32_t limit)
 
 	for (idx = 0; idx < limit; idx++) {
 		if (rte_le_to_cpu_32(ctx->ins[idx]) == A64_INVALID_OP_CODE) {
-			RTE_BPF_LOG_LINE(ERR,
-				"%s: invalid opcode at %u;", __func__, idx);
+			RTE_BPF_LOG_FUNC_LINE(ERR,
+				"invalid opcode at %u;", idx);
 			return -EINVAL;
 		}
 	}
diff --git a/lib/bpf/bpf_load.c b/lib/bpf/bpf_load.c
index 6983c026af..b8a0426fe2 100644
--- a/lib/bpf/bpf_load.c
+++ b/lib/bpf/bpf_load.c
@@ -100,7 +100,7 @@ rte_bpf_load(const struct rte_bpf_prm *prm)
 
 	if (rc != 0) {
 		rte_errno = -rc;
-		RTE_BPF_LOG_LINE(ERR, "%s: %d-th xsym is invalid", __func__, i);
+		RTE_BPF_LOG_FUNC_LINE(ERR, "%d-th xsym is invalid", i);
 		return NULL;
 	}
 
diff --git a/lib/bpf/bpf_load_elf.c b/lib/bpf/bpf_load_elf.c
index 1d30ba17e2..2390823cbf 100644
--- a/lib/bpf/bpf_load_elf.c
+++ b/lib/bpf/bpf_load_elf.c
@@ -122,7 +122,7 @@ check_elf_header(const Elf64_Ehdr *eh)
 		err = "unexpected machine type";
 
 	if (err != NULL) {
-		RTE_BPF_LOG_LINE(ERR, "%s(): %s", __func__, err);
+		RTE_BPF_LOG_FUNC_LINE(ERR, "%s", err);
 		return -EINVAL;
 	}
 
diff --git a/lib/bpf/bpf_stub.c b/lib/bpf/bpf_stub.c
index dea0d703ca..e06e820d83 100644
--- a/lib/bpf/bpf_stub.c
+++ b/lib/bpf/bpf_stub.c
@@ -21,8 +21,7 @@ rte_bpf_elf_load(const struct rte_bpf_prm *prm, const char *fname,
 		return NULL;
 	}
 
-	RTE_BPF_LOG_LINE(ERR, "%s() is not supported, rebuild with libelf installed",
-		__func__);
+	RTE_BPF_LOG_FUNC_LINE(ERR, "not supported, rebuild with libelf installed");
 	rte_errno = ENOTSUP;
 	return NULL;
 }
@@ -38,8 +37,7 @@ rte_bpf_convert(const struct bpf_program *prog)
 		return NULL;
 	}
 
-	RTE_BPF_LOG_LINE(ERR, "%s() is not supported, rebuild with libpcap installed",
-		__func__);
+	RTE_BPF_LOG_FUNC_LINE(ERR, "not supported, rebuild with libpcap installed");
 	rte_errno = ENOTSUP;
 	return NULL;
 }
diff --git a/lib/bpf/bpf_validate.c b/lib/bpf/bpf_validate.c
index e8dbec2827..a7f4f576c9 100644
--- a/lib/bpf/bpf_validate.c
+++ b/lib/bpf/bpf_validate.c
@@ -1838,16 +1838,16 @@ add_edge(struct bpf_verifier *bvf, struct inst_node *node, uint32_t nidx)
 	uint32_t ne;
 
 	if (nidx >= bvf->prm->nb_ins) {
-		RTE_BPF_LOG_LINE(ERR,
-			"%s: program boundary violation at pc: %u, next pc: %u",
-			__func__, get_node_idx(bvf, node), nidx);
+		RTE_BPF_LOG_FUNC_LINE(ERR,
+			"program boundary violation at pc: %u, next pc: %u",
+			get_node_idx(bvf, node), nidx);
 		return -EINVAL;
 	}
 
 	ne = node->nb_edge;
 	if (ne >= RTE_DIM(node->edge_dest)) {
-		RTE_BPF_LOG_LINE(ERR, "%s: internal error at pc: %u",
-			__func__, get_node_idx(bvf, node));
+		RTE_BPF_LOG_FUNC_LINE(ERR, "internal error at pc: %u",
+			get_node_idx(bvf, node));
 		return -EINVAL;
 	}
 
@@ -2005,8 +2005,7 @@ validate(struct bpf_verifier *bvf)
 
 		err = check_syntax(ins);
 		if (err != 0) {
-			RTE_BPF_LOG_LINE(ERR, "%s: %s at pc: %u",
-				__func__, err, i);
+			RTE_BPF_LOG_FUNC_LINE(ERR, "%s at pc: %u", err, i);
 			rc |= -EINVAL;
 		}
 
@@ -2230,9 +2229,9 @@ save_cur_eval_state(struct bpf_verifier *bvf, struct inst_node *node)
 	/* get new eval_state for this node */
 	st = pull_eval_state(&bvf->evst_sr_pool);
 	if (st == NULL) {
-		RTE_BPF_LOG_LINE(ERR,
-			"%s: internal error (out of space) at pc: %u",
-			__func__, get_node_idx(bvf, node));
+		RTE_BPF_LOG_FUNC_LINE(ERR,
+			"internal error (out of space) at pc: %u",
+			get_node_idx(bvf, node));
 		return -ENOMEM;
 	}
 
@@ -2462,8 +2461,8 @@ evaluate(struct bpf_verifier *bvf)
 				err = ins_chk[op].eval(bvf, ins + idx);
 				stats.nb_eval++;
 				if (err != NULL) {
-					RTE_BPF_LOG_LINE(ERR, "%s: %s at pc: %u",
-						__func__, err, idx);
+					RTE_BPF_LOG_FUNC_LINE(ERR,
+						"%s at pc: %u", err, idx);
 					rc = -EINVAL;
 				}
 			}
@@ -2533,7 +2532,7 @@ __rte_bpf_validate(struct rte_bpf *bpf)
 			bpf->prm.prog_arg.type != RTE_BPF_ARG_PTR &&
 			(sizeof(uint64_t) != sizeof(uintptr_t) ||
 			bpf->prm.prog_arg.type != RTE_BPF_ARG_PTR_MBUF)) {
-		RTE_BPF_LOG_LINE(ERR, "%s: unsupported argument type", __func__);
+		RTE_BPF_LOG_FUNC_LINE(ERR, "unsupported argument type");
 		return -ENOTSUP;
 	}
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 00/11] bpf: introduce extensible load API
From: Marat Khalili @ 2026-06-12  8:42 UTC (permalink / raw)
  Cc: dev
In-Reply-To: <20260520124922.42445-1-marat.khalili@huawei.com>

This patchset introduces an extensible load API for the BPF library in
DPDK, addressing current limitations regarding ABI stability and feature
constraints.

Currently, `rte_bpf_load` relies on a fixed `struct rte_bpf_prm`, which
makes it difficult to add new loading options or parameters without
breaking the ABI.

To resolve these issues, this series introduces `rte_bpf_load_ex` taking
`struct rte_bpf_prm_ex`. The new parameter structure includes a `sz`
field for backward compatibility, allowing future extensions.

Taking advantage of the new extensible API, this patchset also adds
several new features:
* Support for loading and executing BPF programs with up to 5 arguments.
* Support for loading classic BPF (cBPF) directly.
* Support for loading ELF files directly from memory buffers.
* New API functions (`rte_bpf_eth_rx_install` and `rte_bpf_eth_tx_install`)
  to install an already loaded BPF program as a port callback, decoupling
  the loading phase from the installation phase.

v5:
* Fixed compilation between commits broken in v4 while addressing AI
  comments.
* Rebased on fresh main, addressing conflicts in release notes.

v4:
* Restored missing NULL checks in wrapper functions `rte_bpf_load` and
  `rte_bpf_elf_load`.
* Fixed the burst execution functions (`rte_bpf_exec_burst*`) to return
  `0` and set `rte_errno = EINVAL` on failure, preventing `-EINVAL`
  being reinterpreted as a large `uint32_t` value. Initialized `rc`
  properly in scalar execution wrappers for this case.
* Swapped the Doxygen comments in `rte_bpf_ethdev.h` for RX and TX functions.
* Added diagnostic dump on failure path in `test_bpf_filter`.
* Fixed memory leak of the BPF handle in `bpf_rx_test` upon install failure.
* Added tests for NULL parameter rejection, mismatched execution arguments,
  unsupported execution flags, and the libpcap-less `rte_bpf_convert` stub.

v3:
* Appended Acked-by tags to all individual commits to align with
  patchwork requirements.

v2:
* Fixed a potential segmentation fault in `exec_vm_burst_ex` by deferring
  the dereference of `ctx[i].arg` until it is confirmed that `nb_prog_arg > 0`.
* Clarified documentation and code comments for `RTE_BPF_EXEC_FLAG_JIT`
  requirements and fast-path expectations.

Marat Khalili (11):
  bpf: make logging prefixes more consistent
  bpf: introduce extensible load API
  bpf: support up to 5 arguments
  bpf: add cBPF origin to rte_bpf_load_ex
  bpf: support rte_bpf_prm_ex with port callbacks
  bpf: support loading ELF files from memory
  test/bpf: test loading cBPF directly
  test/bpf: test loading ELF file from memory
  doc: add release notes for new extensible BPF API
  doc: add load API to BPF programmer's guide
  test/bpf: add tests for error handling contracts

 app/test/test_bpf.c                    | 455 +++++++++++++++++--------
 doc/guides/prog_guide/bpf_lib.rst      |  75 +++-
 doc/guides/rel_notes/release_26_07.rst |  20 ++
 lib/bpf/bpf.c                          |  32 +-
 lib/bpf/bpf_convert.c                  |  99 +++++-
 lib/bpf/bpf_exec.c                     | 134 +++++++-
 lib/bpf/bpf_impl.h                     |  53 ++-
 lib/bpf/bpf_jit_arm64.c                |  18 +-
 lib/bpf/bpf_jit_x86.c                  |  10 +-
 lib/bpf/bpf_load.c                     | 213 ++++++++++--
 lib/bpf/bpf_load_elf.c                 | 189 ++++++----
 lib/bpf/bpf_pkt.c                      |  65 +++-
 lib/bpf/bpf_stub.c                     |  46 ---
 lib/bpf/bpf_validate.c                 |  94 +++--
 lib/bpf/meson.build                    |  15 +-
 lib/bpf/rte_bpf.h                      | 199 ++++++++++-
 lib/bpf/rte_bpf_ethdev.h               |  54 +++
 17 files changed, 1400 insertions(+), 371 deletions(-)
 delete mode 100644 lib/bpf/bpf_stub.c

-- 
2.43.0

^ permalink raw reply

* [PATCH v9 1/1] net/mana: add device reset support
From: Wei Hu @ 2026-06-12  8:17 UTC (permalink / raw)
  To: dev, stephen; +Cc: longli, weh
In-Reply-To: <20260612081723.27699-1-weh@linux.microsoft.com>

From: Wei Hu <weh@microsoft.com>

Add support for handling hardware reset events in the MANA driver.
When the MANA kernel driver receives a hardware service event, it
initiates a device reset and notifies userspace via
IBV_EVENT_DEVICE_FATAL. The DPDK driver handles this by performing
an automatic teardown and recovery sequence.

The interrupt handler sets the device state, blocks new data path
bursts, waits for in-flight bursts to drain using per-queue atomic
flags, and spawns a control thread. The control thread performs
teardown immediately (dev_stop, secondary IPC, dev_close, MR cache
free) before waiting for the hardware recovery timer to fire. This
avoids blocking the EAL interrupt thread on multi-second IPC
timeouts and ibverbs calls. After the recovery delay, the thread
unregisters the interrupt handler, re-probes the PCI device,
reinitializes MR caches, and restarts queues. Each function owns
its own lock scope with no lock hand-off between threads.

Each queue has an atomic burst_state variable where bit 0 is the
in-burst flag and bit 1 is a blocked flag. The data path uses a
single compare-and-swap (0 to 1) to enter a burst, which fails
immediately if the blocked bit is set. The reset path sets the
blocked bit via atomic fetch-or and polls bit 0 to wait for
in-flight bursts to drain. This single-variable design avoids the
need for sequential consistency ordering.

A per-device mutex serializes the reset path with ethdev
operations. The mutex uses PTHREAD_PROCESS_SHARED for multi-process
support and is held across blocking IB verbs calls. A trylock
helper encapsulates the lock acquisition and device state check
for all ethdev operation wrappers. Operations that cannot wait
(configure, queue setup) return -EBUSY during reset, while
dev_stop and dev_close join the reset thread before acquiring
the lock to ensure proper sequencing.

The reset thread keeps reset_thread_active true throughout its
lifetime. mana_join_reset_thread uses rte_thread_equal to detect
the self-join case (when a recovery callback calls dev_stop or
dev_close from the reset thread itself) and calls
rte_thread_detach instead of join, so thread resources are freed
on exit. External callers join normally.

The condvar wait in the reset thread uses a predicate loop that
checks dev_state under reset_cond_mutex, so a PCI remove signal
that arrives before the thread enters the wait is not lost. The
PCI remove callback sets dev_state to RESET_FAILED under the
same mutex before signaling. A lock/unlock barrier on
reset_ops_lock in the PCI remove path ensures teardown has
completed before emitting the INTR_RMV event.

Multi-process support is included: secondary processes unmap and
remap doorbell pages via IPC during the reset enter and exit
phases. Data path functions in both primary and secondary
processes check the device state atomically and return early when
the device is not active.

The driver emits RTE_ETH_EVENT_ERR_RECOVERING before entering the
reset path so that upper layers (e.g. netvsc) can switch their
data path before queues are stopped. The event is emitted outside
the reset lock to avoid deadlock if the callback calls dev_stop or
dev_close. On completion, the driver emits RECOVERY_SUCCESS or
RECOVERY_FAILED after releasing the lock. If a recovery callback
triggers dev_stop or dev_close, the self-join detection in
mana_join_reset_thread detaches the thread to avoid deadlock. If
the enter phase fails internally, RECOVERY_FAILED is sent
immediately so the application receives a terminal event. A PCI
device removal event callback distinguishes hot-remove from
service reset.

Documentation for the device reset feature is added in the MANA
NIC guide and the 26.07 release notes.

Signed-off-by: Wei Hu <weh@microsoft.com>
---
 doc/guides/nics/mana.rst               |   40 +
 doc/guides/rel_notes/release_26_07.rst |    8 +
 drivers/net/mana/mana.c                | 1088 ++++++++++++++++++++++--
 drivers/net/mana/mana.h                |   52 +-
 drivers/net/mana/mp.c                  |   89 +-
 drivers/net/mana/mr.c                  |    6 +-
 drivers/net/mana/rx.c                  |   23 +-
 drivers/net/mana/tx.c                  |   44 +-
 8 files changed, 1242 insertions(+), 108 deletions(-)

diff --git a/doc/guides/nics/mana.rst b/doc/guides/nics/mana.rst
index 0fcab6e2f6..08e345ea61 100644
--- a/doc/guides/nics/mana.rst
+++ b/doc/guides/nics/mana.rst
@@ -71,3 +71,43 @@ The user can specify below argument in devargs.
     The default value is not set,
     meaning all the NICs will be probed and loaded.
     User can specify multiple mac=xx:xx:xx:xx:xx:xx arguments for up to 8 NICs.
+
+Device Reset Support
+--------------------
+
+The MANA PMD supports automatic recovery from hardware service reset events.
+When the MANA kernel driver receives a hardware service event,
+it initiates a device reset and notifies userspace
+via ``IBV_EVENT_DEVICE_FATAL``.
+
+The driver handles this transparently through a two-phase reset flow:
+
+* **Enter phase**: The interrupt handler blocks new data path bursts
+  and waits for all in-flight burst calls to drain
+  using per-queue atomic flags,
+  then spawns a control thread for the remaining work.
+
+* **Teardown and exit phase**: The control thread tears down
+  IB resources and queues, unmaps secondary process doorbell pages,
+  and closes the device. After a delay for hardware recovery,
+  it re-probes the PCI device,
+  reinstalls the interrupt handler,
+  reinitializes resources, and restarts queues.
+
+The driver emits the following ethdev recovery events
+to notify upper layers (e.g. netvsc) of the reset lifecycle:
+
+``RTE_ETH_EVENT_ERR_RECOVERING``
+   Reset has started.
+
+``RTE_ETH_EVENT_RECOVERY_SUCCESS``
+   Device has recovered successfully.
+
+``RTE_ETH_EVENT_RECOVERY_FAILED``
+   Recovery failed.
+
+To distinguish a PCI hot-remove from a service reset,
+the driver registers for PCI device removal events.
+This requires the application to call ``rte_dev_event_monitor_start()``
+for removal events to be delivered
+(e.g. testpmd ``--hot-plug-handling`` option).
diff --git a/doc/guides/rel_notes/release_26_07.rst b/doc/guides/rel_notes/release_26_07.rst
index bd0cec2709..58e8c2422e 100644
--- a/doc/guides/rel_notes/release_26_07.rst
+++ b/doc/guides/rel_notes/release_26_07.rst
@@ -122,6 +122,14 @@ New Features
   Added AGENTS.md file for AI review
   and supporting scripts to review patches and documentation.
 
+* **Added device reset support to the MANA PMD.**
+
+  Added automatic recovery from hardware service reset events
+  in the MANA poll mode driver. The driver uses ethdev recovery events
+  (``RTE_ETH_EVENT_ERR_RECOVERING``, ``RTE_ETH_EVENT_RECOVERY_SUCCESS``,
+  ``RTE_ETH_EVENT_RECOVERY_FAILED``) to notify upper layers of the
+  reset lifecycle.
+
 
 Removed Items
 -------------
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 67396cda1f..0b72f711a1 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -103,6 +103,8 @@ mana_dev_configure(struct rte_eth_dev *dev)
 			      RTE_ETH_RX_OFFLOAD_VLAN_STRIP);
 
 	priv->num_queues = dev->data->nb_rx_queues;
+	DRV_LOG(DEBUG, "priv %p, port %u, dev port %u, num_queues: %u",
+		priv, priv->port_id, priv->dev_port, priv->num_queues);
 
 	manadv_set_context_attr(priv->ib_ctx, MANADV_CTX_ATTR_BUF_ALLOCATORS,
 				(void *)((uintptr_t)&(struct manadv_ctx_allocators){
@@ -214,8 +216,8 @@ mana_dev_start(struct rte_eth_dev *dev)
 
 	DRV_LOG(INFO, "TX/RX queues have started");
 
-	/* Enable datapath for secondary processes */
-	mana_mp_req_on_rxtx(dev, MANA_MP_REQ_START_RXTX);
+	/* Intentionally ignore errors — secondary may not be running */
+	(void)mana_mp_req_on_rxtx(dev, MANA_MP_REQ_START_RXTX);
 
 	ret = rxq_intr_enable(priv);
 	if (ret) {
@@ -242,26 +244,33 @@ mana_dev_stop(struct rte_eth_dev *dev)
 {
 	int ret;
 	struct mana_priv *priv = dev->data->dev_private;
-
-	rxq_intr_disable(priv);
+	enum mana_device_state state;
+
+	state = rte_atomic_load_explicit(&priv->dev_state,
+					 rte_memory_order_acquire);
+	if (state == MANA_DEV_ACTIVE ||
+	    state == MANA_DEV_RESET_FAILED) {
+		rxq_intr_disable(priv);
+		DRV_LOG(DEBUG, "rxq_intr_disable called");
+	}
 
 	dev->tx_pkt_burst = mana_tx_burst_removed;
 	dev->rx_pkt_burst = mana_rx_burst_removed;
 
-	/* Stop datapath on secondary processes */
-	mana_mp_req_on_rxtx(dev, MANA_MP_REQ_STOP_RXTX);
+	/* Intentionally ignore errors — secondary may not be running */
+	(void)mana_mp_req_on_rxtx(dev, MANA_MP_REQ_STOP_RXTX);
 
 	rte_wmb();
 
 	ret = mana_stop_tx_queues(dev);
 	if (ret) {
-		DRV_LOG(ERR, "failed to stop tx queues");
+		DRV_LOG(ERR, "failed to stop tx queues, ret %d", ret);
 		return ret;
 	}
 
 	ret = mana_stop_rx_queues(dev);
 	if (ret) {
-		DRV_LOG(ERR, "failed to stop tx queues");
+		DRV_LOG(ERR, "failed to stop rx queues, ret %d", ret);
 		return ret;
 	}
 
@@ -275,36 +284,66 @@ mana_dev_close(struct rte_eth_dev *dev)
 {
 	struct mana_priv *priv = dev->data->dev_private;
 	int ret;
+	enum mana_device_state state;
 
+	DRV_LOG(DEBUG, "Free MR for priv %p", priv);
 	mana_remove_all_mr(priv);
 
-	ret = mana_intr_uninstall(priv);
-	if (ret)
-		return ret;
+	state = rte_atomic_load_explicit(&priv->dev_state,
+					 rte_memory_order_acquire);
+	if (state == MANA_DEV_ACTIVE ||
+	    state == MANA_DEV_RESET_FAILED) {
+		ret = mana_intr_uninstall(priv);
+		if (ret)
+			return ret;
+	}
 
 	if (priv->ib_parent_pd) {
-		int err = ibv_dealloc_pd(priv->ib_parent_pd);
-		if (err)
-			DRV_LOG(ERR, "Failed to deallocate parent PD: %d", err);
+		ret = ibv_dealloc_pd(priv->ib_parent_pd);
+		if (ret)
+			DRV_LOG(ERR,
+				"Failed to deallocate parent PD: %d", ret);
 		priv->ib_parent_pd = NULL;
 	}
 
 	if (priv->ib_pd) {
-		int err = ibv_dealloc_pd(priv->ib_pd);
-		if (err)
-			DRV_LOG(ERR, "Failed to deallocate PD: %d", err);
+		ret = ibv_dealloc_pd(priv->ib_pd);
+		if (ret)
+			DRV_LOG(ERR, "Failed to deallocate PD: %d", ret);
 		priv->ib_pd = NULL;
 	}
 
-	ret = ibv_close_device(priv->ib_ctx);
-	if (ret) {
-		ret = errno;
-		return ret;
+	state = rte_atomic_load_explicit(&priv->dev_state,
+					 rte_memory_order_acquire);
+	if (state == MANA_DEV_ACTIVE ||
+	    state == MANA_DEV_RESET_FAILED) {
+		if (priv->ib_ctx) {
+			ret = ibv_close_device(priv->ib_ctx);
+			if (ret) {
+				ret = errno;
+				return ret;
+			}
+			priv->ib_ctx = NULL;
+		}
 	}
 
 	return 0;
 }
 
+/*
+ * Called from mana_pci_remove to free resources allocated
+ * during probe that are not freed by dev_close.
+ */
+static void
+mana_dev_free_resources(struct rte_eth_dev *dev)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+
+	pthread_mutex_destroy(&priv->reset_ops_lock);
+	pthread_mutex_destroy(&priv->reset_cond_mutex);
+	pthread_cond_destroy(&priv->reset_cond);
+}
+
 static int
 mana_dev_info_get(struct rte_eth_dev *dev,
 		  struct rte_eth_dev_info *dev_info)
@@ -391,6 +430,39 @@ mana_dev_info_get(struct rte_eth_dev *dev,
 	return 0;
 }
 
+/*
+ * Try to acquire the reset lock and verify the device is active.
+ * Returns 0 with lock held on success, or -EBUSY if the lock
+ * could not be acquired or the device is not in ACTIVE state.
+ */
+static int
+mana_reset_trylock(struct mana_priv *priv)
+{
+	if (pthread_mutex_trylock(&priv->reset_ops_lock))
+		return -EBUSY;
+
+	if (rte_atomic_load_explicit(&priv->dev_state,
+	    rte_memory_order_acquire) != MANA_DEV_ACTIVE) {
+		pthread_mutex_unlock(&priv->reset_ops_lock);
+		return -EBUSY;
+	}
+	return 0;
+}
+
+static int
+mana_dev_info_get_lock(struct rte_eth_dev *dev,
+		       struct rte_eth_dev_info *dev_info)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	if (mana_reset_trylock(priv))
+		return -EBUSY;
+	ret = mana_dev_info_get(dev, dev_info);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
 static void
 mana_dev_tx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
 		       struct rte_eth_txq_info *qinfo)
@@ -552,6 +624,22 @@ mana_dev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
 	return ret;
 }
 
+static int
+mana_dev_tx_queue_setup_lock(struct rte_eth_dev *dev, uint16_t queue_idx,
+			     uint16_t nb_desc, unsigned int socket_id,
+			     const struct rte_eth_txconf *tx_conf)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	if (mana_reset_trylock(priv))
+		return -EBUSY;
+	ret = mana_dev_tx_queue_setup(dev, queue_idx,
+				      nb_desc, socket_id, tx_conf);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
 static void
 mana_dev_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
 {
@@ -629,6 +717,23 @@ mana_dev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
 	return ret;
 }
 
+static int
+mana_dev_rx_queue_setup_lock(struct rte_eth_dev *dev, uint16_t queue_idx,
+			     uint16_t nb_desc, unsigned int socket_id,
+			     const struct rte_eth_rxconf *rx_conf __rte_unused,
+			     struct rte_mempool *mp)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	if (mana_reset_trylock(priv))
+		return -EBUSY;
+	ret = mana_dev_rx_queue_setup(dev, queue_idx, nb_desc,
+				      socket_id, rx_conf, mp);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
 static void
 mana_dev_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
 {
@@ -820,33 +925,267 @@ mana_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
 	return mana_ifreq(priv, SIOCSIFMTU, &request);
 }
 
+static int
+mana_dev_configure_lock(struct rte_eth_dev *dev)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	if (mana_reset_trylock(priv))
+		return -EBUSY;
+	ret = mana_dev_configure(dev);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
+static int
+mana_dev_start_lock(struct rte_eth_dev *dev)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	if (mana_reset_trylock(priv))
+		return -EBUSY;
+	ret = mana_dev_start(dev);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
+/*
+ * Join the reset thread if it is active. Uses CAS on
+ * reset_thread_active to ensure only one caller joins.
+ * If called from the reset thread itself (e.g. via a recovery
+ * event callback that calls dev_stop/dev_close), detach instead
+ * of joining to avoid deadlock and let the thread self-free.
+ */
+static void
+mana_join_reset_thread(struct mana_priv *priv)
+{
+	bool expected = true;
+
+	if (rte_atomic_compare_exchange_strong_explicit(
+			&priv->reset_thread_active, &expected, false,
+			rte_memory_order_acq_rel,
+			rte_memory_order_acquire)) {
+		if (rte_thread_equal(rte_thread_self(),
+				     priv->reset_thread)) {
+			/* Self case: detach so resources are freed on
+			 * thread exit. Don't modify dev_state — the
+			 * caller (dev_stop_lock/dev_close_lock) handles
+			 * state transitions.
+			 */
+			rte_thread_detach(priv->reset_thread);
+			return;
+		}
+
+		pthread_mutex_lock(&priv->reset_cond_mutex);
+		rte_atomic_store_explicit(&priv->dev_state,
+			MANA_DEV_ACTIVE, rte_memory_order_release);
+		pthread_cond_signal(&priv->reset_cond);
+		pthread_mutex_unlock(&priv->reset_cond_mutex);
+		rte_thread_join(priv->reset_thread, NULL);
+	}
+}
+
+/*
+ * Clear per-queue burst_state so the data path CAS can succeed again.
+ * Must be called under reset_ops_lock when transitioning back to ACTIVE
+ * after a failed or aborted reset.
+ */
+static void
+mana_clear_burst_state(struct rte_eth_dev *dev)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int i;
+
+	for (i = 0; i < priv->num_queues; i++) {
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+		struct mana_txq *txq = dev->data->tx_queues[i];
+
+		if (rxq)
+			rte_atomic_store_explicit(&rxq->burst_state, 0,
+						  rte_memory_order_release);
+		if (txq)
+			rte_atomic_store_explicit(&txq->burst_state, 0,
+						  rte_memory_order_release);
+	}
+}
+
+/*
+ * Custom lock wrappers for dev_stop and dev_close.
+ * These join any active reset thread and use a blocking lock (not
+ * trylock) so they wait for any in-progress reset processing to
+ * finish, rather than returning -EBUSY. When the device is not in
+ * MANA_DEV_ACTIVE state, they transition state to MANA_DEV_ACTIVE.
+ */
+static int
+mana_dev_stop_lock(struct rte_eth_dev *dev)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	mana_join_reset_thread(priv);
+
+	pthread_mutex_lock(&priv->reset_ops_lock);
+
+	if (rte_atomic_load_explicit(&priv->dev_state,
+	    rte_memory_order_acquire) != MANA_DEV_ACTIVE) {
+		mana_clear_burst_state(dev);
+		rte_atomic_store_explicit(&priv->dev_state,
+			MANA_DEV_ACTIVE, rte_memory_order_release);
+		pthread_mutex_unlock(&priv->reset_ops_lock);
+		return 0;
+	}
+
+	ret = mana_dev_stop(dev);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
+static int
+mana_dev_close_lock(struct rte_eth_dev *dev)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	mana_join_reset_thread(priv);
+
+	pthread_mutex_lock(&priv->reset_ops_lock);
+
+	if (rte_atomic_load_explicit(&priv->dev_state,
+	    rte_memory_order_acquire) != MANA_DEV_ACTIVE) {
+		mana_clear_burst_state(dev);
+		rte_atomic_store_explicit(&priv->dev_state,
+			MANA_DEV_ACTIVE, rte_memory_order_release);
+	}
+
+	ret = mana_dev_close(dev);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
+static int
+mana_rss_hash_update_lock(struct rte_eth_dev *dev,
+			  struct rte_eth_rss_conf *rss_conf)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	if (mana_reset_trylock(priv))
+		return -EBUSY;
+	ret = mana_rss_hash_update(dev, rss_conf);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
+static int
+mana_rss_hash_conf_get_lock(struct rte_eth_dev *dev,
+			    struct rte_eth_rss_conf *rss_conf)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	if (mana_reset_trylock(priv))
+		return -EBUSY;
+	ret = mana_rss_hash_conf_get(dev, rss_conf);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
+static void
+mana_dev_tx_queue_release_lock(struct rte_eth_dev *dev, uint16_t qid)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+
+	if (mana_reset_trylock(priv)) {
+		DRV_LOG(ERR, "Device reset in progress, "
+			"mana_dev_tx_queue_release not called");
+		return;
+	}
+	mana_dev_tx_queue_release(dev, qid);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+}
+
+static void
+mana_dev_rx_queue_release_lock(struct rte_eth_dev *dev, uint16_t qid)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+
+	if (mana_reset_trylock(priv)) {
+		DRV_LOG(ERR, "Device reset in progress, "
+			"mana_dev_rx_queue_release not called");
+		return;
+	}
+	mana_dev_rx_queue_release(dev, qid);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+}
+
+static int
+mana_rx_intr_enable_lock(struct rte_eth_dev *dev, uint16_t rx_queue_id)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	if (mana_reset_trylock(priv))
+		return -EBUSY;
+	ret = mana_rx_intr_enable(dev, rx_queue_id);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
+static int
+mana_rx_intr_disable_lock(struct rte_eth_dev *dev, uint16_t rx_queue_id)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	if (mana_reset_trylock(priv))
+		return -EBUSY;
+	ret = mana_rx_intr_disable(dev, rx_queue_id);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
+static int
+mana_mtu_set_lock(struct rte_eth_dev *dev, uint16_t mtu)
+{
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	if (mana_reset_trylock(priv))
+		return -EBUSY;
+	ret = mana_mtu_set(dev, mtu);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return ret;
+}
+
 static const struct eth_dev_ops mana_dev_ops = {
-	.dev_configure		= mana_dev_configure,
-	.dev_start		= mana_dev_start,
-	.dev_stop		= mana_dev_stop,
-	.dev_close		= mana_dev_close,
-	.dev_infos_get		= mana_dev_info_get,
+	.dev_configure		= mana_dev_configure_lock,
+	.dev_start		= mana_dev_start_lock,
+	.dev_stop		= mana_dev_stop_lock,
+	.dev_close		= mana_dev_close_lock,
+	.dev_infos_get		= mana_dev_info_get_lock,
 	.txq_info_get		= mana_dev_tx_queue_info,
 	.rxq_info_get		= mana_dev_rx_queue_info,
 	.dev_supported_ptypes_get = mana_supported_ptypes,
-	.rss_hash_update	= mana_rss_hash_update,
-	.rss_hash_conf_get	= mana_rss_hash_conf_get,
-	.tx_queue_setup		= mana_dev_tx_queue_setup,
-	.tx_queue_release	= mana_dev_tx_queue_release,
-	.rx_queue_setup		= mana_dev_rx_queue_setup,
-	.rx_queue_release	= mana_dev_rx_queue_release,
-	.rx_queue_intr_enable	= mana_rx_intr_enable,
-	.rx_queue_intr_disable	= mana_rx_intr_disable,
+	.rss_hash_update	= mana_rss_hash_update_lock,
+	.rss_hash_conf_get	= mana_rss_hash_conf_get_lock,
+	.tx_queue_setup		= mana_dev_tx_queue_setup_lock,
+	.tx_queue_release	= mana_dev_tx_queue_release_lock,
+	.rx_queue_setup		= mana_dev_rx_queue_setup_lock,
+	.rx_queue_release	= mana_dev_rx_queue_release_lock,
+	.rx_queue_intr_enable	= mana_rx_intr_enable_lock,
+	.rx_queue_intr_disable	= mana_rx_intr_disable_lock,
 	.link_update		= mana_dev_link_update,
 	.stats_get		= mana_dev_stats_get,
 	.stats_reset		= mana_dev_stats_reset,
-	.mtu_set		= mana_mtu_set,
+	.mtu_set		= mana_mtu_set_lock,
 };
 
 static const struct eth_dev_ops mana_dev_secondary_ops = {
 	.stats_get = mana_dev_stats_get,
 	.stats_reset = mana_dev_stats_reset,
-	.dev_infos_get = mana_dev_info_get,
+	.dev_infos_get = mana_dev_info_get_lock,
 };
 
 uint16_t
@@ -1031,28 +1370,516 @@ mana_ibv_device_to_pci_addr(const struct ibv_device *device,
 	return 0;
 }
 
+static int mana_pci_probe(struct rte_pci_driver *pci_drv,
+			  struct rte_pci_device *pci_dev);
+static void mana_intr_handler(void *arg);
+static void mana_reset_exit(struct mana_priv *priv);
+
+/* Delay before initiating reset exit after reset enter completes */
+#define MANA_RESET_TIMER_US (15 * 1000000ULL) /* 15 seconds */
+
 /*
- * Interrupt handler from IB layer to notify this device is being removed.
+ * Callback for PCI device removal events from EAL.
+ * If the device is in reset (RESET_EXIT state), this means the PCI
+ * device was hot-removed rather than a service reset. Wake the reset
+ * thread via condvar and notify netvsc via RTE_ETH_EVENT_INTR_RMV.
+ */
+static void
+mana_pci_remove_event_cb(const char *device_name,
+			 enum rte_dev_event_type event, void *cb_arg)
+{
+	struct mana_priv *priv = cb_arg;
+	struct rte_eth_dev *dev;
+
+	if (event != RTE_DEV_EVENT_REMOVE)
+		return;
+
+	DRV_LOG(INFO, "PCI device %s removed", device_name);
+
+	/* Wake the reset thread immediately */
+	pthread_mutex_lock(&priv->reset_cond_mutex);
+	rte_atomic_store_explicit(&priv->dev_state,
+		MANA_DEV_RESET_FAILED, rte_memory_order_release);
+	pthread_cond_signal(&priv->reset_cond);
+	pthread_mutex_unlock(&priv->reset_cond_mutex);
+
+	/* Wait for the reset thread to finish teardown and release
+	 * reset_ops_lock before emitting INTR_RMV to the application.
+	 */
+	pthread_mutex_lock(&priv->reset_ops_lock);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+
+	dev = &rte_eth_devices[priv->port_id];
+	DRV_LOG(INFO, "Sending RTE_ETH_EVENT_INTR_RMV for port %u",
+		priv->port_id);
+	rte_eth_dev_callback_process(dev,
+		RTE_ETH_EVENT_INTR_RMV, NULL);
+}
+
+/*
+ * Reset thread: performs teardown immediately, waits for the
+ * recovery timer, then re-probes and restarts the device.
+ * Runs on a control thread so it can call blocking IPC, ibv
+ * teardown, and rte_intr_callback_unregister (which all must
+ * not run on the EAL interrupt thread).
+ */
+static uint32_t
+mana_reset_thread(void *arg)
+{
+	struct mana_priv *priv = (struct mana_priv *)arg;
+	struct rte_eth_dev *dev = &rte_eth_devices[priv->port_id];
+	struct timespec ts;
+	int ret;
+	int i;
+
+	DRV_LOG(INFO, "Reset thread started");
+
+	pthread_mutex_lock(&priv->reset_ops_lock);
+
+	/* Teardown: stop data path, unmap secondary doorbells, close device,
+	 * free MR caches. Must happen immediately — hardware may be gone.
+	 */
+	ret = mana_dev_stop(dev);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to stop mana dev ret %d", ret);
+		rte_atomic_store_explicit(&priv->dev_state,
+			MANA_DEV_RESET_FAILED, rte_memory_order_release);
+		goto reset_failed;
+	}
+
+	ret = mana_mp_req_on_rxtx(dev, MANA_MP_REQ_RESET_ENTER);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to reset secondary processes ret = %d",
+			ret);
+		rte_atomic_store_explicit(&priv->dev_state,
+			MANA_DEV_RESET_FAILED, rte_memory_order_release);
+		goto reset_failed;
+	}
+
+	ret = mana_dev_close(dev);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to close mana dev ret %d", ret);
+		rte_atomic_store_explicit(&priv->dev_state,
+			MANA_DEV_RESET_FAILED, rte_memory_order_release);
+		goto reset_failed;
+	}
+
+	for (i = 0; i < priv->num_queues; i++) {
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+		struct mana_txq *txq = dev->data->tx_queues[i];
+
+		DRV_LOG(DEBUG, "Free MR for priv = %p, rxq %u, txq %u",
+			priv, rxq->rxq_idx, txq->txq_idx);
+		mana_mr_btree_free(&rxq->mr_btree);
+		mana_mr_btree_free(&txq->mr_btree);
+	}
+
+	DRV_LOG(DEBUG, "Teardown complete");
+
+	rte_atomic_store_explicit(&priv->dev_state, MANA_DEV_RESET_EXIT,
+				     rte_memory_order_release);
+
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+
+	/* Wait for the recovery timer before re-probing.
+	 * Check dev_state under reset_cond_mutex before waiting:
+	 * if mana_pci_remove_event_cb already set RESET_FAILED
+	 * (under the same mutex), we skip the wait entirely.
+	 * This avoids losing a condvar signal that arrived before
+	 * we entered the wait.
+	 */
+	DRV_LOG(INFO, "Waiting %us for hardware recovery",
+		(unsigned int)(MANA_RESET_TIMER_US / 1000000));
+
+	clock_gettime(CLOCK_REALTIME, &ts);
+	ts.tv_sec += MANA_RESET_TIMER_US / 1000000;
+
+	pthread_mutex_lock(&priv->reset_cond_mutex);
+	while (rte_atomic_load_explicit(&priv->dev_state,
+	       rte_memory_order_acquire) == MANA_DEV_RESET_EXIT) {
+		if (pthread_cond_timedwait(&priv->reset_cond,
+		    &priv->reset_cond_mutex, &ts))
+			break; /* timeout */
+	}
+	pthread_mutex_unlock(&priv->reset_cond_mutex);
+
+	pthread_mutex_lock(&priv->reset_ops_lock);
+
+	if (rte_atomic_load_explicit(&priv->dev_state,
+	    rte_memory_order_acquire) != MANA_DEV_RESET_EXIT) {
+		DRV_LOG(INFO, "Reset thread: dev_state=%d, skipping exit",
+			(int)rte_atomic_load_explicit(&priv->dev_state,
+			rte_memory_order_acquire));
+		pthread_mutex_unlock(&priv->reset_ops_lock);
+		return 0;
+	}
+
+	DRV_LOG(INFO, "Reset thread: initiating reset exit");
+	mana_reset_exit(priv);
+	/* Lock is released by mana_reset_exit_delay.
+	 * reset_thread_active remains true — the joiner
+	 * (mana_join_reset_thread) will either join or detach
+	 * (if called from this thread's own callback).
+	 */
+	return 0;
+
+reset_failed:
+	mana_clear_burst_state(dev);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+
+	DRV_LOG(INFO, "Sending RTE_ETH_EVENT_RECOVERY_FAILED for port %u",
+		priv->port_id);
+	rte_eth_dev_callback_process(dev,
+		RTE_ETH_EVENT_RECOVERY_FAILED, NULL);
+	return 0;
+}
+
+static void
+mana_reset_enter(struct mana_priv *priv)
+{
+	int ret;
+	int i;
+	struct rte_eth_dev *dev = &rte_eth_devices[priv->port_id];
+
+	/*
+	 * Lock ownership: mana_intr_handler acquires reset_ops_lock,
+	 * mana_reset_enter sets state/drains/spawns thread and releases it.
+	 * The reset thread independently acquires/releases the lock for
+	 * teardown and for the exit (re-probe) phase.
+	 */
+
+	rte_atomic_store_explicit(&priv->dev_state, MANA_DEV_RESET_ENTER,
+				     rte_memory_order_release);
+
+	DRV_LOG(DEBUG, "Entering into device reset state");
+	DRV_LOG(DEBUG, "Resetting dev = %p, priv = %p", dev, priv);
+
+	/* Set the blocked bit on each queue's burst_state so new bursts
+	 * are rejected, then wait for any in-flight burst (bit 0) to finish.
+	 */
+	for (i = 0; i < priv->num_queues; i++) {
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+		struct mana_txq *txq = dev->data->tx_queues[i];
+
+		if (rxq)
+			rte_atomic_fetch_or_explicit(&rxq->burst_state,
+				MANA_BURST_BLOCKED,
+				rte_memory_order_release);
+		if (txq)
+			rte_atomic_fetch_or_explicit(&txq->burst_state,
+				MANA_BURST_BLOCKED,
+				rte_memory_order_release);
+	}
+
+	/* Wait for all in-flight burst calls to finish (bit 0 to clear) */
+	for (i = 0; i < priv->num_queues; i++) {
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+		struct mana_txq *txq = dev->data->tx_queues[i];
+
+		if (rxq)
+			while (rte_atomic_load_explicit(&rxq->burst_state,
+				    rte_memory_order_acquire) & 1)
+				rte_pause();
+		if (txq)
+			while (rte_atomic_load_explicit(&txq->burst_state,
+				    rte_memory_order_acquire) & 1)
+				rte_pause();
+	}
+
+	DRV_LOG(DEBUG, "All data path threads drained");
+
+	/* Join previous reset thread if it completed but was not joined.
+	 * Use CAS to avoid double-join if another path joined first.
+	 * Don't use mana_join_reset_thread() here — we are already in
+	 * RESET_ENTER state and must not change dev_state to ACTIVE.
+	 */
+	{
+		bool expected = true;
+
+		if (rte_atomic_compare_exchange_strong_explicit(
+				&priv->reset_thread_active, &expected, false,
+				rte_memory_order_acq_rel,
+				rte_memory_order_acquire))
+			rte_thread_join(priv->reset_thread, NULL);
+	}
+
+	ret = rte_thread_create_internal_control(&priv->reset_thread,
+						 "mana-reset",
+						 mana_reset_thread, priv);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to create reset thread ret %d", ret);
+		rte_atomic_store_explicit(&priv->dev_state,
+					  MANA_DEV_RESET_FAILED,
+					  rte_memory_order_release);
+		goto reset_failed;
+	}
+	rte_atomic_store_explicit(&priv->reset_thread_active,
+		true, rte_memory_order_release);
+
+	DRV_LOG(DEBUG, "Reset thread started");
+
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+	return;
+
+reset_failed:
+	mana_clear_burst_state(dev);
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+}
+
+static int
+mana_reset_exit_delay(void *arg)
+{
+	struct mana_priv *priv = (struct mana_priv *)arg;
+	int ret = 0;
+	int i;
+	struct rte_eth_dev *dev;
+	struct rte_pci_device *pci_dev;
+
+	DRV_LOG(DEBUG, "Delayed mana device reset complete processing");
+
+	/* If the app called dev_stop/dev_close during the timer window,
+	 * state is no longer RESET_EXIT. Nothing to do.
+	 */
+	if (rte_atomic_load_explicit(&priv->dev_state,
+	    rte_memory_order_acquire) != MANA_DEV_RESET_EXIT) {
+		DRV_LOG(DEBUG, "State is not RESET_EXIT, skipping");
+		pthread_mutex_unlock(&priv->reset_ops_lock);
+		return ret;
+	}
+
+	dev = &rte_eth_devices[priv->port_id];
+	pci_dev = RTE_CLASS_TO_BUS_DEVICE(dev, *pci_dev);
+
+	DRV_LOG(DEBUG, "Resetting dev = %p, priv = %p", dev, priv);
+
+	ret = ibv_close_device(priv->ib_ctx);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to close ibv device %d", ret);
+		rte_atomic_store_explicit(&priv->dev_state, MANA_DEV_RESET_FAILED,
+				     rte_memory_order_release);
+		goto out;
+	}
+	priv->ib_ctx = NULL;
+
+	ret = mana_pci_probe(NULL, pci_dev);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to probe mana pci dev ret %d", ret);
+		rte_atomic_store_explicit(&priv->dev_state, MANA_DEV_RESET_FAILED,
+				     rte_memory_order_release);
+		goto out;
+	}
+
+	/*
+	 * Init the local MR caches.
+	 */
+	for (i = 0; i < priv->num_queues; i++) {
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+		struct mana_txq *txq = dev->data->tx_queues[i];
+
+		ret = mana_mr_btree_init(&rxq->mr_btree,
+					 MANA_MR_BTREE_PER_QUEUE_N,
+					 rxq->socket);
+		if (ret) {
+			DRV_LOG(ERR, "Failed to init RXQ %d MR btree "
+				"on socket %u, ret %d", i, rxq->socket, ret);
+			goto mr_init_failed_rxq;
+		}
+
+		ret = mana_mr_btree_init(&txq->mr_btree,
+					 MANA_MR_BTREE_PER_QUEUE_N,
+					 txq->socket);
+		if (ret) {
+			DRV_LOG(ERR, "Failed to init TXQ %d MR btree "
+				"on socket %u, ret %d", i, txq->socket, ret);
+			goto mr_init_failed_txq;
+		}
+	}
+	DRV_LOG(DEBUG, "priv %p, num_queues %u", priv, priv->num_queues);
+
+	/* Start secondaries */
+	ret = mana_mp_req_on_rxtx(dev, MANA_MP_REQ_RESET_EXIT);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to start secondary processes ret = %d",
+			ret);
+		goto mr_init_failed_all;
+	}
+
+	ret = mana_dev_start(dev);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to start mana dev ret %d", ret);
+		goto mr_init_failed_all;
+	}
+
+	/* Clear per-queue burst_state before marking device active so
+	 * data path CAS can succeed again.
+	 */
+	for (i = 0; i < priv->num_queues; i++) {
+		struct mana_rxq *rxq = dev->data->rx_queues[i];
+		struct mana_txq *txq = dev->data->tx_queues[i];
+
+		if (rxq)
+			rte_atomic_store_explicit(&rxq->burst_state, 0,
+						  rte_memory_order_release);
+		if (txq)
+			rte_atomic_store_explicit(&txq->burst_state, 0,
+						  rte_memory_order_release);
+	}
+
+	rte_atomic_store_explicit(&priv->dev_state, MANA_DEV_ACTIVE,
+				     rte_memory_order_release);
+
+	DRV_LOG(DEBUG, "Exiting the reset complete processing");
+	goto out;
+
+mr_init_failed_all:
+	i = priv->num_queues;
+	goto mr_init_failed_rxq;
+
+mr_init_failed_txq:
+	/* RXQ btree at index i was initialized, free it */
+	mana_mr_btree_free(&((struct mana_rxq *)
+			     dev->data->rx_queues[i])->mr_btree);
+
+mr_init_failed_rxq:
+	/* Free all fully initialized btrees for indices < i */
+	for (int j = 0; j < i; j++) {
+		struct mana_rxq *rxq = dev->data->rx_queues[j];
+		struct mana_txq *txq = dev->data->tx_queues[j];
+
+		mana_mr_btree_free(&rxq->mr_btree);
+		mana_mr_btree_free(&txq->mr_btree);
+	}
+	rte_atomic_store_explicit(&priv->dev_state, MANA_DEV_RESET_FAILED,
+				     rte_memory_order_release);
+
+out:
+	pthread_mutex_unlock(&priv->reset_ops_lock);
+
+	if (!ret) {
+		DRV_LOG(INFO, "Sending RTE_ETH_EVENT_RECOVERY_SUCCESS for port %u",
+			priv->port_id);
+		rte_eth_dev_callback_process(dev,
+			RTE_ETH_EVENT_RECOVERY_SUCCESS, NULL);
+	} else {
+		DRV_LOG(INFO, "Sending RTE_ETH_EVENT_RECOVERY_FAILED for port %u",
+			priv->port_id);
+		rte_eth_dev_callback_process(dev,
+			RTE_ETH_EVENT_RECOVERY_FAILED, NULL);
+	}
+	return ret;
+}
+
+static void
+mana_reset_exit(struct mana_priv *priv)
+{
+	int ret;
+
+	if (!priv) {
+		DRV_LOG(ERR, "Private structure invalid");
+		return;
+	}
+	DRV_LOG(DEBUG, "Entering into device reset complete processing");
+
+	rxq_intr_disable(priv);
+
+	/* Unregister the interrupt handler. Since mana_reset_exit is always
+	 * called from mana_reset_thread (a non-interrupt thread), the
+	 * interrupt source is inactive and rte_intr_callback_unregister
+	 * succeeds directly.
+	 */
+	if (priv->intr_handle) {
+		ret = rte_intr_callback_unregister(priv->intr_handle,
+						   mana_intr_handler, priv);
+		if (ret < 0)
+			DRV_LOG(ERR, "Failed to unregister intr callback ret %d",
+				ret);
+		else
+			DRV_LOG(DEBUG, "%d intr callback(s) removed", ret);
+
+		rte_intr_instance_free(priv->intr_handle);
+		priv->intr_handle = NULL;
+	}
+
+	/* Proceed directly to reset exit delay (re-probe and restart).
+	 * No need for a separate thread - we are already on
+	 * mana_reset_thread which is a non-interrupt control thread.
+	 */
+	mana_reset_exit_delay(priv);
+}
+
+/*
+ * Interrupt handler from IB layer to notify this device is
+ * being removed or reset.
  */
 static void
 mana_intr_handler(void *arg)
 {
 	struct mana_priv *priv = arg;
 	struct ibv_context *ctx = priv->ib_ctx;
-	struct ibv_async_event event;
+	struct ibv_async_event event = { 0 };
+	struct rte_eth_dev *dev;
 
 	/* Read and ack all messages from IB device */
 	while (true) {
 		if (ibv_get_async_event(ctx, &event))
 			break;
 
-		if (event.event_type == IBV_EVENT_DEVICE_FATAL) {
-			struct rte_eth_dev *dev;
-
-			dev = &rte_eth_devices[priv->port_id];
-			if (dev->data->dev_conf.intr_conf.rmv)
+		switch (event.event_type) {
+		case IBV_EVENT_DEVICE_FATAL:
+			DRV_LOG(INFO, "IBV_EVENT_DEVICE_FATAL received, dev_state=%d",
+				(int)rte_atomic_load_explicit(&priv->dev_state,
+				rte_memory_order_acquire));
+			if (rte_atomic_load_explicit(&priv->dev_state,
+			    rte_memory_order_acquire) == MANA_DEV_ACTIVE) {
+				/* Notify upper layers (e.g. netvsc) before
+				 * acquiring the lock so they can switch data
+				 * path before mana stops queues. Emitting
+				 * outside the lock avoids deadlock if the
+				 * callback calls dev_stop/dev_close.
+				 */
+				dev = &rte_eth_devices[priv->port_id];
+				DRV_LOG(INFO,
+					"Sending RTE_ETH_EVENT_ERR_RECOVERING for port %u",
+					priv->port_id);
 				rte_eth_dev_callback_process(dev,
-					RTE_ETH_EVENT_INTR_RMV, NULL);
+					RTE_ETH_EVENT_ERR_RECOVERING,
+					NULL);
+
+				pthread_mutex_lock(&priv->reset_ops_lock);
+
+				/* Re-check after lock to avoid racing with
+				 * mana_pci_remove_event_cb which may have
+				 * set RESET_FAILED while we waited.
+				 */
+				if (rte_atomic_load_explicit(&priv->dev_state,
+				    rte_memory_order_acquire) !=
+				    MANA_DEV_ACTIVE) {
+					pthread_mutex_unlock(
+						&priv->reset_ops_lock);
+					break;
+				}
+
+				mana_reset_enter(priv);
+
+				if (rte_atomic_load_explicit(&priv->dev_state,
+				    rte_memory_order_acquire) ==
+				    MANA_DEV_RESET_FAILED) {
+					DRV_LOG(INFO,
+						"Sending RTE_ETH_EVENT_RECOVERY_FAILED for port %u",
+						priv->port_id);
+					rte_eth_dev_callback_process(dev,
+						RTE_ETH_EVENT_RECOVERY_FAILED,
+						NULL);
+				}
+			} else {
+				DRV_LOG(ERR, "Already in reset handling, dev_state=%d",
+					(int)rte_atomic_load_explicit(&priv->dev_state,
+					rte_memory_order_acquire));
+			}
+			break;
+
+		default:
+			break;
 		}
 
 		ibv_ack_async_event(&event);
@@ -1063,6 +1890,23 @@ static int
 mana_intr_uninstall(struct mana_priv *priv)
 {
 	int ret;
+	struct rte_eth_dev *dev;
+
+	if (!priv->intr_handle)
+		return 0;
+
+	/* Unregister PCI device removal event callback.
+	 * Do not retry on -EAGAIN to avoid deadlock: the callback
+	 * may be blocked waiting for reset_ops_lock which we hold.
+	 */
+	dev = &rte_eth_devices[priv->port_id];
+	if (dev->device) {
+		ret = rte_dev_event_callback_unregister(dev->device->name,
+			mana_pci_remove_event_cb, priv);
+		if (ret < 0 && ret != -ENOENT)
+			DRV_LOG(WARNING, "Failed to unregister PCI remove cb ret %d",
+				ret);
+	}
 
 	ret = rte_intr_callback_unregister(priv->intr_handle,
 					   mana_intr_handler, priv);
@@ -1072,6 +1916,7 @@ mana_intr_uninstall(struct mana_priv *priv)
 	}
 
 	rte_intr_instance_free(priv->intr_handle);
+	priv->intr_handle = NULL;
 
 	return 0;
 }
@@ -1127,6 +1972,16 @@ mana_intr_install(struct rte_eth_dev *eth_dev, struct mana_priv *priv)
 		goto free_intr;
 	}
 
+	/* Register for PCI device removal events to distinguish
+	 * PCI hot-remove from service reset. This requires the
+	 * application to call rte_dev_event_monitor_start() for
+	 * events to be delivered (e.g. testpmd --hot-plug-handling).
+	 */
+	ret = rte_dev_event_callback_register(eth_dev->device->name,
+					      mana_pci_remove_event_cb, priv);
+	if (ret)
+		DRV_LOG(WARNING, "Failed to register PCI remove event callback");
+
 	eth_dev->intr_handle = priv->intr_handle;
 	return 0;
 
@@ -1156,7 +2011,7 @@ mana_proc_priv_init(struct rte_eth_dev *dev)
 /*
  * Map the doorbell page for the secondary process through IB device handle.
  */
-static int
+int
 mana_map_doorbell_secondary(struct rte_eth_dev *eth_dev, int fd)
 {
 	struct mana_process_priv *priv = eth_dev->process_private;
@@ -1294,17 +2149,29 @@ mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
 	char name[RTE_ETH_NAME_MAX_LEN];
 	int ret;
 	struct ibv_context *ctx = NULL;
+	bool is_reset = false;
+	pthread_mutexattr_t mattr;
+	pthread_condattr_t cattr;
 
 	rte_ether_format_addr(address, sizeof(address), addr);
-	DRV_LOG(INFO, "device located port %u address %s", port, address);
 
-	priv = rte_zmalloc_socket(NULL, sizeof(*priv), RTE_CACHE_LINE_SIZE,
-				  SOCKET_ID_ANY);
-	if (!priv)
-		return -ENOMEM;
+	DRV_LOG(DEBUG, "device located port %u address %s", port, address);
 
 	snprintf(name, sizeof(name), "%s_port%d", pci_dev->device.name, port);
 
+	eth_dev = rte_eth_dev_allocated(name);
+	if (eth_dev) {
+		is_reset = true;
+		priv = eth_dev->data->dev_private;
+		DRV_LOG(DEBUG, "Device reset for eth_dev %p priv %p",
+			eth_dev, priv);
+	} else {
+		priv = rte_zmalloc_socket(NULL, sizeof(*priv), RTE_CACHE_LINE_SIZE,
+					  SOCKET_ID_ANY);
+		if (!priv)
+			return -ENOMEM;
+	}
+
 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
 		int fd;
 
@@ -1317,6 +2184,7 @@ mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
 
 		eth_dev->device = &pci_dev->device;
 		eth_dev->dev_ops = &mana_dev_secondary_ops;
+
 		ret = mana_proc_priv_init(eth_dev);
 		if (ret)
 			goto failed;
@@ -1336,7 +2204,7 @@ mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
 			goto failed;
 		}
 
-		/* fd is no not used after mapping doorbell */
+		/* fd is not used after mapping doorbell */
 		close(fd);
 
 		eth_dev->tx_pkt_burst = mana_tx_burst;
@@ -1355,22 +2223,6 @@ mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
 		goto failed;
 	}
 
-	eth_dev = rte_eth_dev_allocate(name);
-	if (!eth_dev) {
-		ret = -ENOMEM;
-		goto failed;
-	}
-
-	eth_dev->data->mac_addrs =
-		rte_calloc("mana_mac", 1,
-			   sizeof(struct rte_ether_addr), 0);
-	if (!eth_dev->data->mac_addrs) {
-		ret = -ENOMEM;
-		goto failed;
-	}
-
-	rte_ether_addr_copy(addr, eth_dev->data->mac_addrs);
-
 	priv->ib_pd = ibv_alloc_pd(ctx);
 	if (!priv->ib_pd) {
 		DRV_LOG(ERR, "ibv_alloc_pd failed port %d", port);
@@ -1390,10 +2242,6 @@ mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
 	}
 
 	priv->ib_ctx = ctx;
-	priv->port_id = eth_dev->data->port_id;
-	priv->dev_port = port;
-	eth_dev->data->dev_private = priv;
-	priv->dev_data = eth_dev->data;
 
 	priv->max_rx_queues = dev_attr->orig_attr.max_qp;
 	priv->max_tx_queues = dev_attr->orig_attr.max_qp;
@@ -1415,23 +2263,72 @@ mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
 		name, priv->max_rx_queues, priv->max_rx_desc,
 		priv->max_send_sge, priv->max_mr_size);
 
-	rte_eth_copy_pci_info(eth_dev, pci_dev);
+	if (!is_reset) {
+		eth_dev = rte_eth_dev_allocate(name);
+		if (!eth_dev) {
+			ret = -ENOMEM;
+			goto failed;
+		}
 
-	/* Create async interrupt handler */
-	ret = mana_intr_install(eth_dev, priv);
-	if (ret) {
-		DRV_LOG(ERR, "Failed to install intr handler");
-		goto failed;
+		eth_dev->data->mac_addrs =
+			rte_calloc("mana_mac", 1,
+				   sizeof(struct rte_ether_addr), 0);
+		if (!eth_dev->data->mac_addrs) {
+			ret = -ENOMEM;
+			goto failed;
+		}
+
+		rte_ether_addr_copy(addr, eth_dev->data->mac_addrs);
+	} else {
+		/*
+		 * Reset path.
+		 */
+		rte_ether_format_addr(address, RTE_ETHER_ADDR_FMT_SIZE,
+				      eth_dev->data->mac_addrs);
+		DRV_LOG(DEBUG, "Found existing eth_dev %p with mac addr %s",
+			eth_dev, address);
+		DRV_LOG(DEBUG, "ib_ctx = %p", priv->ib_ctx);
+		goto out;
 	}
 
-	eth_dev->device = &pci_dev->device;
+	priv->port_id = eth_dev->data->port_id;
+	priv->dev_port = port;
+	eth_dev->data->dev_private = priv;
+	priv->dev_data = eth_dev->data;
+	rte_atomic_store_explicit(&priv->dev_state, MANA_DEV_ACTIVE,
+				     rte_memory_order_release);
+
+	rte_eth_copy_pci_info(eth_dev, pci_dev);
+
+	pthread_mutexattr_init(&mattr);
+	pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
+	pthread_mutex_init(&priv->reset_ops_lock, &mattr);
+	pthread_mutex_init(&priv->reset_cond_mutex, &mattr);
+	pthread_mutexattr_destroy(&mattr);
+
+	pthread_condattr_init(&cattr);
+	pthread_condattr_setpshared(&cattr, PTHREAD_PROCESS_SHARED);
+	pthread_cond_init(&priv->reset_cond, &cattr);
+	pthread_condattr_destroy(&cattr);
 
-	DRV_LOG(INFO, "device %s at port %u", name, eth_dev->data->port_id);
+	eth_dev->device = &pci_dev->device;
 
 	eth_dev->rx_pkt_burst = mana_rx_burst_removed;
 	eth_dev->tx_pkt_burst = mana_tx_burst_removed;
 	eth_dev->dev_ops = &mana_dev_ops;
 
+out:
+	/* Create async interrupt handler */
+	ret = mana_intr_install(eth_dev, priv);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to install intr handler, ret %d", ret);
+		goto failed;
+	}
+	DRV_LOG(INFO, "mana_intr_install succeeded");
+
+	DRV_LOG(INFO, "device %s priv %p dev port %d at port %u",
+		name, priv, priv->dev_port, eth_dev->data->port_id);
+
 	rte_eth_dev_probing_finish(eth_dev);
 
 	return 0;
@@ -1439,20 +2336,29 @@ mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
 failed:
 	/* Free the resource for the port failed */
 	if (priv) {
-		if (priv->ib_parent_pd)
+		if (priv->ib_parent_pd) {
 			ibv_dealloc_pd(priv->ib_parent_pd);
+			priv->ib_parent_pd = NULL;
+		}
 
-		if (priv->ib_pd)
+		if (priv->ib_pd) {
 			ibv_dealloc_pd(priv->ib_pd);
+			priv->ib_pd = NULL;
+		}
 	}
 
-	if (eth_dev)
-		rte_eth_dev_release_port(eth_dev);
+	if (!is_reset) {
+		if (eth_dev)
+			rte_eth_dev_release_port(eth_dev);
 
-	rte_free(priv);
+		rte_free(priv);
+	}
 
-	if (ctx)
+	if (ctx) {
 		ibv_close_device(ctx);
+		if (is_reset && priv)
+			priv->ib_ctx = NULL;
+	}
 
 	return ret;
 }
@@ -1617,7 +2523,17 @@ mana_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 static int
 mana_dev_uninit(struct rte_eth_dev *dev)
 {
-	return mana_dev_close(dev);
+	struct mana_priv *priv = dev->data->dev_private;
+	int ret;
+
+	/* Join reset thread before teardown to ensure it has exited
+	 * before we destroy the condvar/mutex in free_resources.
+	 */
+	mana_join_reset_thread(priv);
+
+	ret = mana_dev_close(dev);
+	mana_dev_free_resources(dev);
+	return ret;
 }
 
 /*
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index 79cc47b6ab..a7b301484a 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -5,6 +5,8 @@
 #ifndef __MANA_H__
 #define __MANA_H__
 
+#include <pthread.h>
+
 #define	PCI_VENDOR_ID_MICROSOFT		0x1414
 #define PCI_DEVICE_ID_MICROSOFT_MANA_PF	0x00b9
 #define PCI_DEVICE_ID_MICROSOFT_MANA	0x00ba
@@ -337,6 +339,26 @@ struct mana_process_priv {
 	void *db_page;
 };
 
+enum mana_device_state {
+	/* Normal running */
+	MANA_DEV_ACTIVE		= 0,
+	/* In reset enter processing */
+	MANA_DEV_RESET_ENTER	= 1,
+	/*
+	 * Reset enter processing completed.
+	 * Waiting for reset exit or in reset exit processing.
+	 */
+	MANA_DEV_RESET_EXIT	= 2,
+	/* Reset failed */
+	MANA_DEV_RESET_FAILED	= 3,
+};
+
+/* burst_state bit layout:
+ *   Bit 0: in-burst (set by data path CAS 0→1, cleared on exit).
+ *   Bit 1: blocked  (set by reset path to reject new bursts).
+ */
+#define MANA_BURST_BLOCKED	2
+
 struct mana_priv {
 	struct rte_eth_dev_data *dev_data;
 	struct mana_process_priv *process_priv;
@@ -368,6 +390,15 @@ struct mana_priv {
 	uint64_t max_mr_size;
 	struct mana_mr_btree mr_btree;
 	rte_spinlock_t	mr_btree_lock;
+	RTE_ATOMIC(enum mana_device_state) dev_state;
+	/* mutex for synchronizing mana reset and some mana_dev_ops callbacks */
+	pthread_mutex_t reset_ops_lock;
+	/* Reset thread ID, valid when reset_thread_active is true */
+	rte_thread_t reset_thread;
+	RTE_ATOMIC(bool) reset_thread_active;
+	/* Condvar to wake reset thread early on PCI remove */
+	pthread_mutex_t reset_cond_mutex;
+	pthread_cond_t reset_cond;
 };
 
 struct mana_txq_desc {
@@ -427,6 +458,14 @@ struct mana_txq {
 	struct mana_mr_btree mr_btree;
 	struct mana_stats stats;
 	unsigned int socket;
+	unsigned int txq_idx;
+
+	/*
+	 * Bit 0: in-burst flag (set by data path, cleared on exit).
+	 * Bit 1: blocked flag (set by reset path via fetch_or).
+	 * Data path CAS 0→1 to enter; fails if blocked bit is set.
+	 */
+	RTE_ATOMIC(uint32_t) burst_state;
 };
 
 struct mana_rxq {
@@ -462,6 +501,14 @@ struct mana_rxq {
 	struct mana_mr_btree mr_btree;
 
 	unsigned int socket;
+	unsigned int rxq_idx;
+
+	/*
+	 * Bit 0: in-burst flag (set by data path, cleared on exit).
+	 * Bit 1: blocked flag (set by reset path via fetch_or).
+	 * Data path CAS 0→1 to enter; fails if blocked bit is set.
+	 */
+	RTE_ATOMIC(uint32_t) burst_state;
 };
 
 extern int mana_logtype_driver;
@@ -543,6 +590,8 @@ enum mana_mp_req_type {
 	MANA_MP_REQ_CREATE_MR,
 	MANA_MP_REQ_START_RXTX,
 	MANA_MP_REQ_STOP_RXTX,
+	MANA_MP_REQ_RESET_ENTER,
+	MANA_MP_REQ_RESET_EXIT,
 };
 
 /* Pameters for IPC. */
@@ -563,8 +612,9 @@ void mana_mp_uninit_primary(void);
 void mana_mp_uninit_secondary(void);
 int mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev);
 int mana_mp_req_mr_create(struct mana_priv *priv, uintptr_t addr, uint32_t len);
+int mana_map_doorbell_secondary(struct rte_eth_dev *eth_dev, int fd);
 
-void mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type);
+int mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type);
 
 void *mana_alloc_verbs_buf(size_t size, void *data);
 void mana_free_verbs_buf(void *ptr, void *data __rte_unused);
diff --git a/drivers/net/mana/mp.c b/drivers/net/mana/mp.c
index 72417fc0c7..1161ebd71c 100644
--- a/drivers/net/mana/mp.c
+++ b/drivers/net/mana/mp.c
@@ -2,10 +2,13 @@
  * Copyright 2022 Microsoft Corporation
  */
 
+#include <sys/mman.h>
 #include <rte_malloc.h>
 #include <ethdev_driver.h>
 #include <rte_log.h>
+#include <rte_eal_paging.h>
 #include <stdlib.h>
+#include <unistd.h>
 
 #include <infiniband/verbs.h>
 
@@ -119,6 +122,23 @@ mana_mp_primary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
 	return ret;
 }
 
+static int
+mana_mp_reset_enter(struct rte_eth_dev *dev)
+{
+	struct mana_process_priv *proc_priv = dev->process_private;
+
+	void *addr = proc_priv->db_page;
+
+	/* Reset the db_page to NULL */
+	proc_priv->db_page = NULL;
+
+	if (addr)
+		(void)munmap(addr, rte_mem_page_size());
+
+	DRV_LOG(DEBUG, "Secondary doorbell pages unmapped");
+	return 0;
+}
+
 static int
 mana_mp_secondary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
 {
@@ -171,6 +191,49 @@ mana_mp_secondary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
 		ret = rte_mp_reply(&mp_res, peer);
 		break;
 
+	case MANA_MP_REQ_RESET_ENTER:
+		DRV_LOG(INFO, "Port %u reset enter", dev->data->port_id);
+		res->result = mana_mp_reset_enter(dev);
+
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+
+	case MANA_MP_REQ_RESET_EXIT:
+		DRV_LOG(INFO, "Port %u reset exit", dev->data->port_id);
+		{
+			struct mana_process_priv *proc_priv =
+				dev->process_private;
+
+			if (proc_priv->db_page != NULL) {
+				DRV_LOG(DEBUG,
+					"Secondary doorbell already "
+					"mapped to %p",
+					proc_priv->db_page);
+				res->result = 0;
+			} else if (mp_msg->num_fds < 1) {
+				DRV_LOG(ERR,
+					"No FD in RESET_EXIT message");
+				res->result = -EINVAL;
+			} else {
+				int fd = mp_msg->fds[0];
+
+				ret = mana_map_doorbell_secondary(dev,
+								  fd);
+				if (ret) {
+					DRV_LOG(ERR,
+						"Failed secondary "
+						"doorbell map %d",
+						fd);
+					res->result = -ENODEV;
+				} else {
+					res->result = 0;
+				}
+				close(fd);
+			}
+		}
+		ret = rte_mp_reply(&mp_res, peer);
+		break;
+
 	default:
 		DRV_LOG(ERR, "Port %u unknown secondary MP type %u",
 			param->port_id, param->type);
@@ -254,7 +317,7 @@ mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev)
 	}
 
 	ret = mp_res->fds[0];
-	DRV_LOG(ERR, "port %u command FD from primary is %d",
+	DRV_LOG(DEBUG, "port %u command FD from primary is %d",
 		dev->data->port_id, ret);
 exit:
 	free(mp_rep.msgs);
@@ -298,27 +361,36 @@ mana_mp_req_mr_create(struct mana_priv *priv, uintptr_t addr, uint32_t len)
 	return ret;
 }
 
-void
+int
 mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type)
 {
 	struct rte_mp_msg mp_req = { 0 };
 	struct rte_mp_msg *mp_res;
-	struct rte_mp_reply mp_rep;
+	struct rte_mp_reply mp_rep = { 0 };
 	struct mana_mp_param *res;
 	struct timespec ts = {.tv_sec = MANA_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0};
-	int i, ret;
+	int i, ret = 0;
 
-	if (type != MANA_MP_REQ_START_RXTX && type != MANA_MP_REQ_STOP_RXTX) {
+	if (type != MANA_MP_REQ_START_RXTX && type != MANA_MP_REQ_STOP_RXTX &&
+	    type != MANA_MP_REQ_RESET_ENTER && type != MANA_MP_REQ_RESET_EXIT) {
 		DRV_LOG(ERR, "port %u unknown request (req_type %d)",
 			dev->data->port_id, type);
-		return;
+		return -EINVAL;
 	}
 
 	if (rte_atomic_load_explicit(&mana_shared_data->secondary_cnt, rte_memory_order_relaxed) == 0)
-		return;
+		return 0;
 
 	mp_init_msg(&mp_req, type, dev->data->port_id);
 
+	/* Include IB cmd FD for secondary doorbell remap */
+	if (type == MANA_MP_REQ_RESET_EXIT) {
+		struct mana_priv *priv = dev->data->dev_private;
+
+		mp_req.num_fds = 1;
+		mp_req.fds[0] = priv->ib_ctx->cmd_fd;
+	}
+
 	ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
 	if (ret) {
 		if (rte_errno != ENOTSUP)
@@ -329,6 +401,7 @@ mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type)
 	if (mp_rep.nb_sent != mp_rep.nb_received) {
 		DRV_LOG(ERR, "port %u not all secondaries responded (%d)",
 			dev->data->port_id, type);
+		ret = -ETIMEDOUT;
 		goto exit;
 	}
 	for (i = 0; i < mp_rep.nb_received; i++) {
@@ -337,9 +410,11 @@ mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type)
 		if (res->result) {
 			DRV_LOG(ERR, "port %u request failed on secondary %d",
 				dev->data->port_id, i);
+			ret = res->result;
 			goto exit;
 		}
 	}
 exit:
 	free(mp_rep.msgs);
+	return ret;
 }
diff --git a/drivers/net/mana/mr.c b/drivers/net/mana/mr.c
index c4045141bc..8914f4cf04 100644
--- a/drivers/net/mana/mr.c
+++ b/drivers/net/mana/mr.c
@@ -314,8 +314,10 @@ mana_mr_btree_init(struct mana_mr_btree *bt, int n, int socket)
 void
 mana_mr_btree_free(struct mana_mr_btree *bt)
 {
-	rte_free(bt->table);
-	memset(bt, 0, sizeof(*bt));
+	if (bt && bt->table) {
+		rte_free(bt->table);
+		memset(bt, 0, sizeof(*bt));
+	}
 }
 
 int
diff --git a/drivers/net/mana/rx.c b/drivers/net/mana/rx.c
index 1b8ba1f3a9..aedb05d46f 100644
--- a/drivers/net/mana/rx.c
+++ b/drivers/net/mana/rx.c
@@ -36,6 +36,11 @@ mana_rq_ring_doorbell(struct mana_rxq *rxq)
 		db_page = process_priv->db_page;
 	}
 
+	if (!db_page) {
+		DP_LOG(ERR, "db_page is NULL, cannot ring RX doorbell");
+		return -EINVAL;
+	}
+
 	/* Hardware Spec specifies that software client should set 0 for
 	 * wqe_cnt for Receive Queues.
 	 */
@@ -172,7 +177,7 @@ mana_stop_rx_queues(struct rte_eth_dev *dev)
 
 	for (i = 0; i < priv->num_queues; i++)
 		if (dev->data->rx_queue_state[i] == RTE_ETH_QUEUE_STATE_STOPPED)
-			return -EINVAL;
+			return 0;
 
 	if (priv->rwq_qp) {
 		ret = ibv_destroy_qp(priv->rwq_qp);
@@ -256,6 +261,9 @@ mana_start_rx_queues(struct rte_eth_dev *dev)
 		struct mana_rxq *rxq = dev->data->rx_queues[i];
 		struct ibv_wq_init_attr wq_attr = {};
 
+		rxq->rxq_idx = i;
+		DRV_LOG(DEBUG, "assigning rxq_idx to %d", i);
+
 		manadv_set_context_attr(priv->ib_ctx,
 			MANADV_CTX_ATTR_BUF_ALLOCATORS,
 			(void *)((uintptr_t)&(struct manadv_ctx_allocators){
@@ -451,6 +459,16 @@ mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	uint32_t pkt_len;
 	uint32_t i;
 	int polled = 0;
+	uint32_t expected = 0;
+
+	/* Single atomic CAS: enter burst only if device is active (0→1).
+	 * Fails immediately if reset path has set the blocked bit.
+	 */
+	if (unlikely(!rte_atomic_compare_exchange_strong_explicit(
+			&rxq->burst_state, &expected, 1,
+			rte_memory_order_acquire,
+			rte_memory_order_relaxed)))
+		return 0;
 
 repoll:
 	/* Polling on new completions if we have no backlog */
@@ -592,6 +610,9 @@ mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				wqe_consumed, ret);
 	}
 
+	rte_atomic_fetch_and_explicit(&rxq->burst_state, ~(uint32_t)1,
+				     rte_memory_order_release);
+
 	return pkt_received;
 }
 
diff --git a/drivers/net/mana/tx.c b/drivers/net/mana/tx.c
index 57dbbc3651..10f2212b5d 100644
--- a/drivers/net/mana/tx.c
+++ b/drivers/net/mana/tx.c
@@ -17,7 +17,7 @@ mana_stop_tx_queues(struct rte_eth_dev *dev)
 
 	for (i = 0; i < priv->num_queues; i++)
 		if (dev->data->tx_queue_state[i] == RTE_ETH_QUEUE_STATE_STOPPED)
-			return -EINVAL;
+			return 0;
 
 	for (i = 0; i < priv->num_queues; i++) {
 		struct mana_txq *txq = dev->data->tx_queues[i];
@@ -83,6 +83,9 @@ mana_start_tx_queues(struct rte_eth_dev *dev)
 
 		txq = dev->data->tx_queues[i];
 
+		txq->txq_idx = i;
+		DRV_LOG(DEBUG, "assigning txq_idx to %d", txq->txq_idx);
+
 		manadv_set_context_attr(priv->ib_ctx,
 			MANADV_CTX_ATTR_BUF_ALLOCATORS,
 			(void *)((uintptr_t)&(struct manadv_ctx_allocators){
@@ -190,10 +193,34 @@ mana_tx_burst(void *dpdk_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 	void *db_page;
 	uint16_t pkt_sent = 0;
 	uint32_t num_comp, i;
+	uint32_t expected = 0;
 #ifdef RTE_ARCH_32
 	uint32_t wqe_count = 0;
 #endif
 
+	db_page = priv->db_page;
+	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+		struct rte_eth_dev *dev =
+			&rte_eth_devices[priv->dev_data->port_id];
+		struct mana_process_priv *process_priv = dev->process_private;
+
+		db_page = process_priv->db_page;
+	}
+
+	/* Single atomic CAS: enter burst only if device is active (0→1).
+	 * Fails immediately if reset path has set the blocked bit.
+	 */
+	if (unlikely(!rte_atomic_compare_exchange_strong_explicit(
+			&txq->burst_state, &expected, 1,
+			rte_memory_order_acquire,
+			rte_memory_order_relaxed) || !db_page)) {
+		if (!expected) /* CAS succeeded but db_page NULL — undo */
+			rte_atomic_fetch_and_explicit(&txq->burst_state,
+						      ~(uint32_t)1,
+						      rte_memory_order_release);
+		return 0;
+	}
+
 	/* Process send completions from GDMA */
 	num_comp = gdma_poll_completion_queue(&txq->gdma_cq,
 			txq->gdma_comp_buf, txq->num_desc);
@@ -216,7 +243,8 @@ mana_tx_burst(void *dpdk_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 		}
 
 		if (!desc->pkt) {
-			DP_LOG(ERR, "mana_txq_desc has a NULL pkt");
+			DP_LOG(ERR, "mana_txq_desc has a NULL pkt, priv %p, "
+			       "txq = %d", priv, txq->txq_idx);
 		} else {
 			txq->stats.bytes += desc->pkt->pkt_len;
 			rte_pktmbuf_free(desc->pkt);
@@ -474,15 +502,6 @@ mana_tx_burst(void *dpdk_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 	}
 
 	/* Ring hardware door bell */
-	db_page = priv->db_page;
-	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
-		struct rte_eth_dev *dev =
-			&rte_eth_devices[priv->dev_data->port_id];
-		struct mana_process_priv *process_priv = dev->process_private;
-
-		db_page = process_priv->db_page;
-	}
-
 	if (pkt_sent) {
 #ifdef RTE_ARCH_32
 		ret = mana_ring_short_doorbell(db_page, GDMA_QUEUE_SEND,
@@ -501,5 +520,8 @@ mana_tx_burst(void *dpdk_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 			DP_LOG(ERR, "mana_ring_doorbell failed ret %d", ret);
 	}
 
+	rte_atomic_fetch_and_explicit(&txq->burst_state, ~(uint32_t)1,
+				     rte_memory_order_release);
+
 	return pkt_sent;
 }
-- 
2.34.1


^ permalink raw reply related

* [PATCH v9 0/1] net/mana: add device reset support
From: Wei Hu @ 2026-06-12  8:17 UTC (permalink / raw)
  To: dev, stephen; +Cc: longli, weh

From: Wei Hu <weh@microsoft.com>

Add support for handling hardware service reset events in the
MANA driver. When the MANA kernel driver receives a hardware
service event, it initiates a device reset and notifies userspace
via IBV_EVENT_DEVICE_FATAL. The MANA PMD handles this by
performing an automatic teardown and recovery sequence.

The driver uses ethdev recovery events (ERR_RECOVERING,
RECOVERY_SUCCESS, RECOVERY_FAILED) to notify upper layers of
the reset lifecycle, and a PCI device removal event callback
to distinguish hot-remove from service reset.

Changes since v8:
- Fixed reset thread resource leak: previously reset_thread_active
  was cleared before emitting recovery callbacks, so no join site
  would reap the thread. Now the flag stays true throughout the
  thread lifetime. mana_join_reset_thread detects the self-join
  case (callback calling dev_stop/dev_close from the reset thread)
  using rte_thread_equal and calls rte_thread_detach instead of
  join, so thread resources are freed on exit. External callers
  continue to join normally.
- Fixed lost condvar signal: added a predicate loop around
  pthread_cond_timedwait that checks dev_state under
  reset_cond_mutex. If mana_pci_remove_event_cb signals before
  the reset thread enters the wait, the wakeup is no longer lost.
  The PCI remove callback sets dev_state to RESET_FAILED under
  the same mutex before signaling.
- Added a lock/unlock barrier on reset_ops_lock in
  mana_pci_remove_event_cb to ensure teardown has completed
  before emitting the INTR_RMV event.
- Fixed mana_reset_exit_delay return type from uint32_t to int
  to match the negative error codes it stores.
- Removed unnecessary else-after-goto in mana_probe_port.

Changes since v7:
- Moved heavy teardown (dev_stop, IPC to secondaries, dev_close,
  MR btree free) from mana_reset_enter (EAL interrupt thread)
  to mana_reset_thread (control thread). The interrupt handler
  now only sets state, drains in-flight bursts, and spawns the
  thread. Teardown runs immediately in the control thread before
  the recovery timer wait, avoiding blocking the interrupt thread
  on multi-second IPC timeouts and ibverbs calls. Each function
  now owns its own lock scope with no lock hand-off between
  threads.
- Simplified burst_state from encoding device state in bits 1+
  to a single blocked flag (bit 1). Only one value was ever
  stored, so the multi-state encoding was misleading. Added
  MANA_BURST_BLOCKED constant.
- Updated mana.rst to reflect that teardown runs on the control
  thread, not the interrupt handler.

Changes since v6:
- Rebased onto latest upstream for-main
- Replaced removed RTE_ETH_DEV_TO_PCI macro with
  RTE_CLASS_TO_BUS_DEVICE (upstream commit 4757b8df04
  removed the old bus-specific ethdev convenience macros)

Changes since v5:
- Replaced RCU QSBR with per-queue atomic burst_state using a
  single-variable CAS design: bit 0 is the in-burst flag, bit 1
  is the blocked flag. The data path uses CAS(0→1) to enter
  burst and fetch_and(~1) to exit. The reset path uses fetch_or
  to set the blocked bit and polls bit 0 to drain in-flight
  bursts. This eliminates the two-variable Dekker pattern and the
  need for sequential consistency (seq_cst) ordering.
- Removed librte_rcu dependency
- Removed __rte_no_thread_safety_analysis annotations (no longer
  needed after mutex conversion)
- Moved ERR_RECOVERING event emission before acquiring
  reset_ops_lock and before mana_reset_enter, so upper layers
  (e.g. netvsc) can switch data path before mana stops queues.
  Emitting outside the lock avoids deadlock if the callback
  calls dev_stop or dev_close.
- Replaced MANA_OPS_*_LOCK macros with mana_reset_trylock()
  helper function and explicit per-operation wrappers
- Removed unused rte_alarm.h and rte_lock_annotations.h includes
- Added RECOVERY_FAILED event when mana_reset_enter fails
  internally, so the application always receives a terminal event
- Added mana_clear_burst_state() helper to clear per-queue
  burst_state on failure paths (reset_failed, dev_stop_lock,
  dev_close_lock) preventing permanent silent packet drop after
  a failed reset

Changes since v4:
- Fixed stale rte_spinlock_unlock call in mana_intr_handler that
  was missed during the spinlock-to-mutex conversion, causing a
  -Wincompatible-pointer-types warning

Changes since v3:
- Converted reset_ops_lock from rte_spinlock_t to pthread_mutex_t
  with PTHREAD_PROCESS_SHARED, since the lock is held across
  blocking IB verbs calls and IPC with 5s timeout
- Removed rte_dev_event_callback_unregister retry loop to avoid
  deadlock when interrupt thread and reset thread contend

Changes since v2:
- Added per-queue burst_state atomic variable with Dekker-like
  synchronization to block data path during reset without RCU
- Replaced rte_alarm with condvar + control thread for reset exit
- Made reset_thread_active atomic with CAS — flag is set by
  creator and only cleared by the joiner, not the thread itself
- Fixed second reset crash: removed reset thread join logic from
  mana_dev_close (inner function) to avoid corrupting dev_state
  when called from mana_reset_enter
- Made reset_thread_active RTE_ATOMIC(bool) with explicit ordering
- Added retry loop for rte_dev_event_callback_unregister on -EAGAIN
- Initialized condvar/mutex with PTHREAD_PROCESS_SHARED since priv
  is in hugepage shared memory
- Added re-check of dev_state after lock acquisition in
  mana_intr_handler to prevent racing with pci_remove_event_cb
- Replaced (void *)0 with NULL in mp.c
- Added lock ownership comment block at mana_reset_enter
- Documented rte_dev_event_monitor_start() requirement
- Added mana.rst documentation and release note

Changes since v1:
- Removed net/netvsc patch from this series
- Simplified reset exit: mana_reset_exit calls
  mana_reset_exit_delay directly instead of spawning a thread
- Added __rte_no_thread_safety_analysis annotations for clang
- Switched to rte_thread_create_internal_control
- Fixed declaration-after-statement style issues
- Removed unnecessary blank lines and stale comments

Wei Hu (1):
  net/mana: add device reset support

 doc/guides/nics/mana.rst               |   40 +
 doc/guides/rel_notes/release_26_07.rst |    8 +
 drivers/net/mana/mana.c                | 1088 ++++++++++++++++++++++--
 drivers/net/mana/mana.h                |   52 +-
 drivers/net/mana/mp.c                  |   89 +-
 drivers/net/mana/mr.c                  |    6 +-
 drivers/net/mana/rx.c                  |   23 +-
 drivers/net/mana/tx.c                  |   44 +-
 8 files changed, 1242 insertions(+), 108 deletions(-)

-- 
2.34.1


^ permalink raw reply

* [PATCH] app/testpmd: add VLAN priority insert support
From: Xingui Yang @ 2026-06-12  8:14 UTC (permalink / raw)
  To: dev
  Cc: stephen, david.marchand, aman.deep.singh, fengchengwen,
	yangshuaisong, lihuisong, liuyonglong, kangfenglong

The tx_vlan set command currently only accepts a VLAN ID in range
[0, 4095].  This patch adds support for an extended format that includes
802.1p priority and CFI bits, allowing users to set the VLAN priority
tag when inserting VLAN headers in TX packets.

The extended format is:
  bit 0-11:  VLAN ID (0-4095)
  bit 12:    CFI (Canonical Format Indicator)
  bit 13-15: Priority (0-7, 802.1p CoS)

This is consistent with the VLAN tag structure used by
rte_eth_dev_set_vlan_pvid() where the PVID field encodes VLAN ID, CFI
and priority in the same format.

A new command line option --enable-vlan-priority is added to enable this
feature. By default, the feature is disabled to maintain backward
compatibility with existing users. When enabled, the
vlan_id_is_invalid() function allows any 16-bit value to pass, while the
full 16-bit value (including CFI and priority bits) is passed to the
driver for hardware VLAN insertion.

Signed-off-by: Xingui Yang <yangxingui@huawei.com>
---
 app/test-pmd/config.c     | 24 +++++++++++++++---------
 app/test-pmd/parameters.c |  6 ++++++
 app/test-pmd/testpmd.c    |  5 +++++
 app/test-pmd/testpmd.h    |  2 ++
 4 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 36b9b023e2..80cde109e6 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1241,12 +1241,18 @@ void print_valid_ports(void)
 }
 
 static int
-vlan_id_is_invalid(uint16_t vlan_id)
+vlan_id_is_invalid(uint16_t vlan_id, int vlan_priority_ena)
 {
-	if (vlan_id < 4096)
-		return 0;
-	fprintf(stderr, "Invalid vlan_id %d (must be < 4096)\n", vlan_id);
-	return 1;
+	if (!vlan_priority_ena && vlan_id >= 4096) {
+		fprintf(stderr, "Invalid vlan_id %d (must be < 4096)\n", vlan_id);
+		return 1;
+	}
+
+	/*
+	 * When vlan_priority_ena is enabled, allow any 16-bit value
+	 * to pass priority and CFI bits to the driver.
+	 */
+	return 0;
 }
 
 static uint32_t
@@ -6876,7 +6882,7 @@ rx_vft_set(portid_t port_id, uint16_t vlan_id, int on)
 
 	if (port_id_is_invalid(port_id, ENABLED_WARN))
 		return 1;
-	if (vlan_id_is_invalid(vlan_id))
+	if (vlan_id_is_invalid(vlan_id, vlan_priority_insert_ena))
 		return 1;
 	diag = rte_eth_dev_vlan_filter(port_id, vlan_id, on);
 	if (diag == 0)
@@ -6923,7 +6929,7 @@ tx_vlan_set(portid_t port_id, uint16_t vlan_id)
 	struct rte_eth_dev_info dev_info;
 	int ret;
 
-	if (vlan_id_is_invalid(vlan_id))
+	if (vlan_id_is_invalid(vlan_id, vlan_priority_insert_ena))
 		return;
 
 	if (ports[port_id].dev_conf.txmode.offloads &
@@ -6954,9 +6960,9 @@ tx_qinq_set(portid_t port_id, uint16_t vlan_id, uint16_t vlan_id_outer)
 	struct rte_eth_dev_info dev_info;
 	int ret;
 
-	if (vlan_id_is_invalid(vlan_id))
+	if (vlan_id_is_invalid(vlan_id, vlan_priority_insert_ena))
 		return;
-	if (vlan_id_is_invalid(vlan_id_outer))
+	if (vlan_id_is_invalid(vlan_id_outer, vlan_priority_insert_ena))
 		return;
 
 	ret = eth_dev_info_get_print_err(port_id, &dev_info);
diff --git a/app/test-pmd/parameters.c b/app/test-pmd/parameters.c
index 8c3b1244e7..3f37498d3b 100644
--- a/app/test-pmd/parameters.c
+++ b/app/test-pmd/parameters.c
@@ -117,6 +117,8 @@ enum {
 	TESTPMD_OPT_ENABLE_HW_VLAN_EXTEND_NUM,
 #define TESTPMD_OPT_ENABLE_HW_QINQ_STRIP "enable-hw-qinq-strip"
 	TESTPMD_OPT_ENABLE_HW_QINQ_STRIP_NUM,
+#define TESTPMD_OPT_ENABLE_VLAN_PRIORITY "enable-vlan-priority"
+	TESTPMD_OPT_ENABLE_VLAN_PRIORITY_NUM,
 #define TESTPMD_OPT_ENABLE_DROP_EN "enable-drop-en"
 	TESTPMD_OPT_ENABLE_DROP_EN_NUM,
 #define TESTPMD_OPT_DISABLE_RSS "disable-rss"
@@ -461,6 +463,7 @@ usage(char* progname)
 	printf("  --enable-hw-vlan-strip: enable hardware vlan strip.\n");
 	printf("  --enable-hw-vlan-extend: enable hardware vlan extend.\n");
 	printf("  --enable-hw-qinq-strip: enable hardware qinq strip.\n");
+	printf("  --enable-vlan-priority: enable vlan priority insert.\n");
 	printf("  --enable-drop-en: enable per queue packet drop.\n");
 	printf("  --disable-rss: disable rss.\n");
 	printf("  --enable-rss: Force rss even for single-queue operation.\n");
@@ -1259,6 +1262,9 @@ launch_args_parse(int argc, char** argv)
 		case TESTPMD_OPT_ENABLE_HW_QINQ_STRIP_NUM:
 			rx_offloads |= RTE_ETH_RX_OFFLOAD_QINQ_STRIP;
 			break;
+		case TESTPMD_OPT_ENABLE_VLAN_PRIORITY_NUM:
+			vlan_priority_insert_ena = 1;
+			break;
 		case TESTPMD_OPT_ENABLE_DROP_EN_NUM:
 			rx_drop_en = 1;
 			break;
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index 457bb6d3fe..0239ec59de 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -307,6 +307,11 @@ uint16_t mb_mempool_cache = DEF_MBUF_CACHE; /**< Size of mbuf mempool cache. */
 /* current configuration is in DCB or not,0 means it is not in DCB mode */
 uint8_t dcb_config = 0;
 
+/*
+ * Configurable value of vlan priority insert enable.
+ */
+uint8_t vlan_priority_insert_ena;
+
 /*
  * Configurable number of RX/TX queues.
  */
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index 04fdc2db42..104a6e73be 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -618,6 +618,8 @@ extern uint64_t noisy_lkup_num_reads_writes;
 
 extern uint8_t dcb_config;
 
+extern uint8_t vlan_priority_insert_ena;
+
 extern uint32_t mbuf_data_size_n;
 extern uint16_t mbuf_data_size[MAX_SEGS_BUFFER_SPLIT];
 /**< Mbuf data space size. */
-- 
2.43.0


^ permalink raw reply related

* [PATCH] app/testpmd: add padding mode to txonly engine
From: Xingui Yang @ 2026-06-12  7:37 UTC (permalink / raw)
  To: dev
  Cc: stephen, david.marchand, aman.deep.singh, fengchengwen,
	yangshuaisong, lihuisong, liuyonglong, kangfenglong

Add a new padding mode to the txonly forwarding engine, which allows
sending packets with configurable small sizes without standard L2/L3
headers. This is useful for testing NIC padding logic.

When padding mode is enabled via --tx-pkt-pad-mode flag:
- l2_len and l3_len are set to 0 instead of standard header lengths
- Packet data is filled with a static pattern instead of
  Ethernet/IP/UDP headers
- Minimum packet length validation is bypassed to allow small
  packet sizes (e.g., set txpkts 14)

Signed-off-by: Xingui Yang <yangxingui@huawei.com>
Signed-off-by: Huisong Li <lihuisong@huawei.com>
---
 app/test-pmd/config.c     |  2 +-
 app/test-pmd/parameters.c |  7 +++++++
 app/test-pmd/testpmd.c    |  3 +++
 app/test-pmd/testpmd.h    |  1 +
 app/test-pmd/txonly.c     | 18 ++++++++++++++++--
 5 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 9d457ca88e..36b9b023e2 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -6341,7 +6341,7 @@ set_tx_pkt_segments(unsigned int *seg_lengths, unsigned int nb_segs)
 		}
 		tx_pkt_len = (uint16_t)(tx_pkt_len + seg_lengths[i]);
 	}
-	if (tx_pkt_len < (sizeof(struct rte_ether_hdr) + 20 + 8)) {
+	if (tx_pkt_len < (sizeof(struct rte_ether_hdr) + 20 + 8) && !tx_pkt_pad_mode) {
 		fprintf(stderr, "total packet length=%u < %d - give up\n",
 				(unsigned) tx_pkt_len,
 				(int)(sizeof(struct rte_ether_hdr) + 20 + 8));
diff --git a/app/test-pmd/parameters.c b/app/test-pmd/parameters.c
index 337d8fc8ac..8c3b1244e7 100644
--- a/app/test-pmd/parameters.c
+++ b/app/test-pmd/parameters.c
@@ -195,6 +195,8 @@ enum {
 	TESTPMD_OPT_TXONLY_MULTI_FLOW_NUM,
 #define TESTPMD_OPT_TXONLY_FLOWS "txonly-flows"
 	TESTPMD_OPT_TXONLY_FLOWS_NUM,
+#define TESTPMD_OPT_TX_PKT_PAD_MODE "tx-pkt-pad-mode"
+	TESTPMD_OPT_TX_PKT_PAD_MODE_NUM,
 #define TESTPMD_OPT_RXQ_SHARE "rxq-share"
 	TESTPMD_OPT_RXQ_SHARE_NUM,
 #define TESTPMD_OPT_ETH_LINK_SPEED "eth-link-speed"
@@ -351,6 +353,7 @@ static const struct option long_options[] = {
 	NO_ARG(TESTPMD_OPT_MULTI_RX_MEMPOOL),
 	NO_ARG(TESTPMD_OPT_TXONLY_MULTI_FLOW),
 	REQUIRED_ARG(TESTPMD_OPT_TXONLY_FLOWS),
+	NO_ARG(TESTPMD_OPT_TX_PKT_PAD_MODE),
 	NO_ARG(TESTPMD_OPT_RXQ_SHARE),
 	REQUIRED_ARG(TESTPMD_OPT_ETH_LINK_SPEED),
 	NO_ARG(TESTPMD_OPT_DISABLE_LINK_CHECK),
@@ -504,6 +507,7 @@ usage(char* progname)
 	printf("  --txonly-multi-flow: generate multiple flows in txonly mode\n");
 	printf("  --txonly-nb-flows=N: number of flows per lcore in txonly"
 	       " multi-flow mode (1-64, default 64)\n");
+	printf("  --tx-pkt-pad-mode: enable padding mode in txonly mode\n");
 	printf("  --tx-ip=src,dst: IP addresses in Tx-only mode\n");
 	printf("  --tx-udp=src[,dst]: UDP ports in Tx-only mode\n");
 	printf("  --eth-link-speed: force link speed.\n");
@@ -1577,6 +1581,9 @@ launch_args_parse(int argc, char** argv)
 			else
 				rte_exit(EXIT_FAILURE, "txonly-flows must be >= 1 and <= 64\n");
 			break;
+		case TESTPMD_OPT_TX_PKT_PAD_MODE_NUM:
+			tx_pkt_pad_mode = 1;
+			break;
 		case TESTPMD_OPT_RXQ_SHARE_NUM:
 			rxq_share = 1;
 			break;
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index fcd8a90967..457bb6d3fe 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -296,6 +296,9 @@ uint32_t tx_pkt_times_inter;
 uint32_t tx_pkt_times_intra;
 /**< Timings for send scheduling in TXONLY mode, time between packets. */
 
+uint8_t tx_pkt_pad_mode;
+/**< Whether packet padding mode is enabled. */
+
 uint16_t nb_pkt_per_burst = DEF_PKT_BURST; /**< Number of packets per burst. */
 uint16_t nb_pkt_flowgen_clones; /**< Number of Tx packet clones to send in flowgen mode. */
 int nb_flows_flowgen = 1024; /**< Number of flows in flowgen mode. */
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index 3d4b36d668..04fdc2db42 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -663,6 +663,7 @@ extern uint16_t tx_pkt_seg_lengths[RTE_MAX_SEGS_PER_PKT]; /**< Seg. lengths */
 extern uint8_t  tx_pkt_nb_segs; /**< Number of segments in TX packets */
 extern uint32_t tx_pkt_times_intra;
 extern uint32_t tx_pkt_times_inter;
+extern uint8_t  tx_pkt_pad_mode;
 
 enum tx_pkt_split {
 	TX_PKT_SPLIT_OFF,
diff --git a/app/test-pmd/txonly.c b/app/test-pmd/txonly.c
index 64893fa205..d8dbc2d46e 100644
--- a/app/test-pmd/txonly.c
+++ b/app/test-pmd/txonly.c
@@ -192,8 +192,8 @@ pkt_burst_prepare(struct rte_mbuf *pkt, struct rte_mempool *mbp,
 	pkt->ol_flags |= ol_flags;
 	pkt->vlan_tci = vlan_tci;
 	pkt->vlan_tci_outer = vlan_tci_outer;
-	pkt->l2_len = sizeof(struct rte_ether_hdr);
-	pkt->l3_len = sizeof(struct rte_ipv4_hdr);
+	pkt->l2_len = tx_pkt_pad_mode ? 0 : sizeof(struct rte_ether_hdr);
+	pkt->l3_len = tx_pkt_pad_mode ? 0 : sizeof(struct rte_ipv4_hdr);
 
 	pkt_len = pkt->data_len;
 	pkt_seg = pkt;
@@ -204,6 +204,19 @@ pkt_burst_prepare(struct rte_mbuf *pkt, struct rte_mempool *mbp,
 		pkt_len += pkt_seg->data_len;
 	}
 	pkt_seg->next = NULL; /* Last segment of packet. */
+
+	if (tx_pkt_pad_mode) {
+		static const char pad_pattern[16] = "0123456789abcdef";
+		uint32_t j;
+		char *pad;
+
+		pad = rte_pktmbuf_mtod(pkt, char *);
+		for (j = 0; j < pkt->data_len; j++)
+			pad[j] = pad_pattern[j % 16];
+
+		goto out;
+	}
+
 	/*
 	 * Copy headers in first packet segment(s).
 	 */
@@ -295,6 +308,7 @@ pkt_burst_prepare(struct rte_mbuf *pkt, struct rte_mempool *mbp,
 			sizeof(struct rte_ipv4_hdr) +
 			sizeof(pkt_udp_hdr));
 	}
+out:
 	/*
 	 * Complete first mbuf of packet and append it to the
 	 * burst of packets to be transmitted.
-- 
2.43.0


^ permalink raw reply related

* 回复：[PATCH] gpu/metax: add new driver for Metax GPU
From: 许玲燕 @ 2026-06-12  7:19 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev, eagostini, 王冬冬
In-Reply-To: <CNjXKdTGQx6A6BPVH44M4w@monjalon.net>

[-- Attachment #1: Type: text/plain, Size: 2058 bytes --]

Hi,
Thank you for the follow-up. Please find the clarifications below regarding the kernel dependency and availability:
1. Kernel Dependency
While it is not upstreamed into the mainline Linux kernel, it is actively maintained by Metax to interface with our hardware.
2. Availability and Download Link
Yes, both the proprietary user-space libraries and the corresponding kernel modules are freely available for download. You can access them via the Metax Software Download Center:
Download Link: https://sw-download.metax-tech.com/ <https://sw-download.metax-tech.com/ >
How to obtain the files:

 * 
Register and log in to the portal.

 * 
Navigate to "SDK Development Kit".

 * 
Select your specific GPU Type.

 * 
Choose your target OS. We currently support Linux (aarch64 & x86_64) across mainstream distributions, including but not limited to:
 * 
Ubuntu (18.04 - 24.04)

 * 
RHEL / CentOS / RockyOS (8.x / 9.x)

 * 
Domestic distros: KylinV10/V11, OpenCloudOS, TencentOS, etc.
Best regards,
------------------------------------------------------------------
发件人：Thomas Monjalon <thomas@monjalon.net>
发送时间：2026年6月11日(周四) 17:17
收件人："许玲燕"<lingyan.xu@metax-tech.com>
抄　送：dev<dev@dpdk.org>; eagostini<eagostini@nvidia.com>; "王冬冬"<dongdong.wang@metax-tech.com>
主　题：Re: [PATCH] gpu/metax: add new driver for Metax GPU
11/06/2026 09:10:
> Both libmcruntime.so and the corresponding gdrapi libraries
> are proprietary user-space libraries provided by Metax.
> They are not upstreamed to the DPDK mainline repository.
> However, please rest assured that the current patch interacts
> with them via standard dlopen (dynamic loading) at runtime.
> We do not link directly against their source code
> or require them as hard build-time dependencies.
> Therefore, this approach will not introduce any additional
> compilation dependencies or licensing issues to the DPDK main tree.
What about the kernel dependency?
Are libraries and kernel module freely available for download?
Can you provide a link?

[-- Attachment #2: Type: text/html, Size: 71291 bytes --]

^ permalink raw reply

* [PATCH] common/mlx5: fix high SMMU TLB miss with mempool alignment
From: Xingui Yang @ 2026-06-12  7:14 UTC (permalink / raw)
  To: dev
  Cc: stephen, david.marchand, thomas, dsosnowski, viacheslavo, bingz,
	orika, suanmingm, matan, dmitry.kozliuk, fengchengwen,
	yangshuaisong, lihuisong, liuyonglong, kangfenglong

From: Shuaisong Yang <yangshuaisong@h-partners.com>

On Kunpeng SoC with mlx CX7, dpdk-l3fwd with intra-NUMA core pinning
under SMMU nonstrict/strict mode shows about 30% performance degradation
compared to cross-NUMA pinning. With SMMU disabled or passthrough mode,
intra-NUMA performs as expected (slightly better than cross-NUMA).

CX7 in NUMA1
NUMA node0 CPU(s):    0-39
NUMA node1 CPU(s):    40-79

intra-NUMA:
dpdk-l3fwd -l 40-55 -n 4 -a 0000:17:00.1,mprq_en=1 -- -p 0x1 -P \
  --config='(0,0,40),(0,1,41),(0,2,42),(0,3,43),(0,4,44),\
            (0,5,45),(0,6,46),(0,7,47),(0,8,48),(0,9,49),\
            (0,10,50),(0,11,51),(0,12,52),(0,13,53),\
            (0,14,54),(0,15,55)' \
  --rx-queue-size=4096 --tx-queue-size=4096 --rx-burst=64

cross-NUMA:
dpdk-l3fwd -l 11-26 -n 4 -a 0000:17:00.1,mprq_en=1 -- -p 0x1 -P \
  --config='(0,0,11),(0,1,12),(0,2,13),(0,3,14),(0,4,15),\
            (0,5,16),(0,6,17),(0,7,18),(0,8,19),(0,9,20),\
            (0,10,21),(0,11,22),(0,12,23),(0,13,24),\
            (0,14,25),(0,15,26)' \
  --rx-queue-size=4096 --tx-queue-size=4096 --rx-burst=64

The root cause is that under SMMU enabled mode, the mempool allocated
for intra-NUMA pinning is aligned to system page size instead of
hugepage size, while cross-NUMA pinning correctly uses hugepage size
alignment. This causes high TLB miss rates under SMMU.

Align all memory ranges to hugepage boundaries during mempool
registration to ensure hugepage_sz alignment, thereby reducing TLB
misses and fixing the intra-NUMA performance degradation.

Fixes: 690b2a88c2f7 ("common/mlx5: add mempool registration facilities")
Cc: stable@dpdk.org

Signed-off-by: Shuaisong Yang <yangshuaisong@h-partners.com>
Signed-off-by: Xingui Yang <yangxingui@huawei.com>
---
 .mailmap                             |  1 +
 drivers/common/mlx5/mlx5_common_mr.c | 53 +++++++++++++++++++---------
 2 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/.mailmap b/.mailmap
index 4001e5fb0e..e13e88db1b 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1979,3 +1979,4 @@ Zongyu Wu <wuzongyu1@huawei.com>
 Zorik Machulsky <zorik@amazon.com>
 Zyta Szpak <zyta@marvell.com> <zr@semihalf.com>
 Zyta Szpak <zyta@marvell.com> <zyta.szpak@semihalf.com>
+Shuaisong Yang <yangshuaisong@h-partners.com>
diff --git a/drivers/common/mlx5/mlx5_common_mr.c b/drivers/common/mlx5/mlx5_common_mr.c
index aa2d5e88a4..aee037abb4 100644
--- a/drivers/common/mlx5/mlx5_common_mr.c
+++ b/drivers/common/mlx5/mlx5_common_mr.c
@@ -1524,7 +1524,9 @@ mlx5_get_mempool_ranges(struct rte_mempool *mp, bool is_extmem,
  * @param[in] is_extmem
  *   Whether the pool is contains only external pinned buffers.
  * @param[out] out
- *   Receives memory ranges to register, aligned to the system page size.
+ *   Receives memory ranges to register. Aligned to the hugepage size
+ *   if all ranges reside on hugepages of the same size,
+ *   otherwise aligned to the system page size.
  *   The caller must release them with free().
  * @param[out] out_n
  *   Receives the number of @p out items.
@@ -1541,7 +1543,9 @@ mlx5_mempool_reg_analyze(struct rte_mempool *mp, bool is_extmem,
 {
 	struct mlx5_range *ranges = NULL;
 	unsigned int i, ranges_n = 0;
+	bool same_hugepage_sz = true;
 	struct rte_memseg_list *msl;
+	uint64_t hugepage_sz = 0;
 
 	if (mlx5_get_mempool_ranges(mp, is_extmem, &ranges, &ranges_n) < 0) {
 		DRV_LOG(ERR, "Cannot get address ranges for mempool %s",
@@ -1552,28 +1556,43 @@ mlx5_mempool_reg_analyze(struct rte_mempool *mp, bool is_extmem,
 	*share_hugepage = false;
 	msl = rte_mem_virt2memseg_list((void *)ranges[0].start);
 	if (msl != NULL) {
-		uint64_t hugepage_sz = 0;
+		hugepage_sz = msl->page_sz;
 
 		/* Check that all ranges are on pages of the same size. */
 		for (i = 0; i < ranges_n; i++) {
-			if (hugepage_sz != 0 && hugepage_sz != msl->page_sz)
+			struct rte_memseg_list *range_msl;
+			range_msl = rte_mem_virt2memseg_list(
+					(void *)ranges[i].start);
+			if (range_msl == NULL ||
+			    range_msl->page_sz != hugepage_sz) {
+				same_hugepage_sz = false;
 				break;
-			hugepage_sz = msl->page_sz;
+			}
 		}
-		if (i == ranges_n) {
-			/*
-			 * If the entire pool is within one hugepage,
-			 * combine all ranges into one of the hugepage size.
-			 */
-			uintptr_t reg_start = ranges[0].start;
-			uintptr_t reg_end = ranges[ranges_n - 1].end;
-			uintptr_t hugepage_start =
-				RTE_ALIGN_FLOOR(reg_start, hugepage_sz);
-			uintptr_t hugepage_end = hugepage_start + hugepage_sz;
-			if (reg_end < hugepage_end) {
-				ranges[0].start = hugepage_start;
+	}
+	if (same_hugepage_sz && hugepage_sz > 0) {
+		unsigned int orig_ranges_n = ranges_n;
+
+		for (i = 0; i < ranges_n; i++) {
+			ranges[i].start = RTE_ALIGN_FLOOR(ranges[i].start,
+							  hugepage_sz);
+			ranges[i].end = RTE_ALIGN_CEIL(ranges[i].end,
+							hugepage_sz);
+		}
+		ranges_n = 1;
+		for (i = 1; i < orig_ranges_n; i++) {
+			if (ranges[ranges_n - 1].end >= ranges[i].start)
+				ranges[ranges_n - 1].end =
+					RTE_MAX(ranges[ranges_n - 1].end,
+						ranges[i].end);
+			else
+				ranges[ranges_n++] = ranges[i];
+		}
+		if (ranges_n == 1) {
+			uintptr_t hugepage_end = ranges[0].start + hugepage_sz;
+
+			if (ranges[0].end <= hugepage_end) {
 				ranges[0].end = hugepage_end;
-				ranges_n = 1;
 				*share_hugepage = true;
 			}
 		}
-- 
2.43.0


^ permalink raw reply related

* [PATCH] crypto/cnxk: fix out of place AES GCM
From: Daphne Priscilla @ 2026-06-12  6:20 UTC (permalink / raw)
  To: dev; +Cc: stable, gakhil, ktejasree, anoobj, Daphne Priscilla

For AES-GCM out of place, when AAD is present in inbuf before the data,
it is treated as passthrough data. This results in AAD being
present in outbuf header, but test expects outbuf header to remain
zero. Passthrough data is now diverted to metabuf so outbuf
header remains zero.

Fixes: 7c19abdd0cf1 ("common/cnxk: support 103XX CPT")
Cc: stable@dpdk.org

Signed-off-by: Daphne Priscilla <df@marvell.com>
---
 .mailmap                                 |  1 +
 drivers/common/cnxk/roc_se.h             |  2 +-
 drivers/crypto/cnxk/cnxk_cryptodev_ops.c |  3 +
 drivers/crypto/cnxk/cnxk_se.h            | 96 ++++++++++++++++++++++--
 4 files changed, 93 insertions(+), 9 deletions(-)

diff --git a/.mailmap b/.mailmap
index 118dfa0ff9..1191afbf0b 100644
--- a/.mailmap
+++ b/.mailmap
@@ -334,6 +334,7 @@ Danny Patel <dannyp@marvell.com>
 Danny Zhou <danny.zhou@intel.com>
 Danylo Vodopianov <dvo-plv@napatech.com>
 Dapeng Yu <dapengx.yu@intel.com>
+Daphne Priscilla <df@marvell.com>
 Darek Stojaczyk <dariusz.stojaczyk@intel.com>
 Daria Kolistratova <daria.kolistratova@intel.com>
 Dariusz Chaberski <dariuszx.chaberski@intel.com>
diff --git a/drivers/common/cnxk/roc_se.h b/drivers/common/cnxk/roc_se.h
index 499e71ce85..d3ad61ca04 100644
--- a/drivers/common/cnxk/roc_se.h
+++ b/drivers/common/cnxk/roc_se.h
@@ -26,7 +26,7 @@
 #define ROC_SE_MISC_MINOR_OP_DUMMY	 0x04ULL
 #define ROC_SE_MISC_MINOR_OP_HW_SUPPORT	 0x08ULL
 
-#define ROC_SE_MAX_AAD_SIZE 64
+#define ROC_SE_MAX_AAD_SIZE 1024
 #define ROC_SE_MAX_MAC_LEN  64
 
 #define ROC_SE_OFF_CTRL_LEN 8
diff --git a/drivers/crypto/cnxk/cnxk_cryptodev_ops.c b/drivers/crypto/cnxk/cnxk_cryptodev_ops.c
index 2f9eb322dc..5e59f1d7bd 100644
--- a/drivers/crypto/cnxk/cnxk_cryptodev_ops.c
+++ b/drivers/crypto/cnxk/cnxk_cryptodev_ops.c
@@ -82,6 +82,9 @@ cnxk_cpt_get_mlen(void)
 			       (RTE_ALIGN_CEIL(ROC_MAX_SG_IN_OUT_CNT, 4) >> 2) * ROC_SG_ENTRY_SIZE),
 			      8);
 
+	/* Space for discarding AAD bytes from output stream in GCM OOP */
+	len += ROC_SE_MAX_AAD_SIZE;
+
 	return len;
 }
 
diff --git a/drivers/crypto/cnxk/cnxk_se.h b/drivers/crypto/cnxk/cnxk_se.h
index 8dbf3e73c7..09d9d1e0e3 100644
--- a/drivers/crypto/cnxk/cnxk_se.h
+++ b/drivers/crypto/cnxk/cnxk_se.h
@@ -407,8 +407,28 @@ sg_inst_prep(struct roc_se_fc_params *params, struct cpt_inst_s *inst, uint64_t
 			if (unlikely(req_flags & ROC_SE_SINGLE_BUF_INPLACE)) {
 				i = fill_sg_comp_from_buf_min(scatter_comp, i, params->bufs, &size);
 			} else {
-				i = fill_sg_comp_from_iov(scatter_comp, i, params->dst_iov, 0,
-							  &size, aad_buf, aad_offset);
+				uint32_t dst_offset = 0;
+
+				if (passthrough_len) {
+					if (unlikely(passthrough_len > ROC_SE_MAX_AAD_SIZE)) {
+						plt_dp_err(
+							"Passthrough length %u exceeds reserved space %u",
+							passthrough_len, ROC_SE_MAX_AAD_SIZE);
+						return -1;
+					}
+					uint64_t meta_passthrough =
+						(uint64_t)params->meta_buf.vaddr +
+						params->meta_buf.size - ROC_SE_MAX_AAD_SIZE;
+					i = fill_sg_comp(scatter_comp, i, meta_passthrough,
+							 passthrough_len);
+					size -= passthrough_len;
+					dst_offset = passthrough_len;
+					aad_offset = 0;
+				}
+				if (size)
+					i = fill_sg_comp_from_iov(scatter_comp, i, params->dst_iov,
+								  dst_offset, &size, aad_buf,
+								  aad_offset);
 			}
 			if (unlikely(size)) {
 				plt_dp_err("Insufficient buffer space,"
@@ -430,8 +450,28 @@ sg_inst_prep(struct roc_se_fc_params *params, struct cpt_inst_s *inst, uint64_t
 			if (unlikely(req_flags & ROC_SE_SINGLE_BUF_INPLACE)) {
 				i = fill_sg_comp_from_buf_min(scatter_comp, i, params->bufs, &size);
 			} else {
-				i = fill_sg_comp_from_iov(scatter_comp, i, params->dst_iov, 0,
-							  &size, aad_buf, aad_offset);
+				uint32_t dst_offset = 0;
+
+				if (passthrough_len) {
+					if (unlikely(passthrough_len > ROC_SE_MAX_AAD_SIZE)) {
+						plt_dp_err(
+							"Passthrough length %u exceeds reserved space %u",
+							passthrough_len, ROC_SE_MAX_AAD_SIZE);
+						return -1;
+					}
+					uint64_t meta_passthrough =
+						(uint64_t)params->meta_buf.vaddr +
+						params->meta_buf.size - ROC_SE_MAX_AAD_SIZE;
+					i = fill_sg_comp(scatter_comp, i, meta_passthrough,
+							 passthrough_len);
+					size -= passthrough_len;
+					dst_offset = passthrough_len;
+					aad_offset = 0;
+				}
+				if (size)
+					i = fill_sg_comp_from_iov(scatter_comp, i, params->dst_iov,
+								  dst_offset, &size, aad_buf,
+								  aad_offset);
 			}
 
 			if (unlikely(size)) {
@@ -606,8 +646,28 @@ sg2_inst_prep(struct roc_se_fc_params *params, struct cpt_inst_s *inst, uint64_t
 				i = fill_sg2_comp_from_buf_min(scatter_comp, i, params->bufs,
 							       &size);
 			} else {
-				i = fill_sg2_comp_from_iov(scatter_comp, i, params->dst_iov, 0,
-							   &size, aad_buf, aad_offset);
+				uint32_t dst_offset = 0;
+
+				if (passthrough_len) {
+					if (unlikely(passthrough_len > ROC_SE_MAX_AAD_SIZE)) {
+						plt_dp_err(
+							"Passthrough length %u exceeds reserved space %u",
+							passthrough_len, ROC_SE_MAX_AAD_SIZE);
+						return -1;
+					}
+					uint64_t meta_passthrough =
+						(uint64_t)params->meta_buf.vaddr +
+						params->meta_buf.size - ROC_SE_MAX_AAD_SIZE;
+					i = fill_sg2_comp(scatter_comp, i, meta_passthrough,
+							  passthrough_len);
+					size -= passthrough_len;
+					dst_offset = passthrough_len;
+					aad_offset = 0;
+				}
+				if (size)
+					i = fill_sg2_comp_from_iov(scatter_comp, i, params->dst_iov,
+								   dst_offset, &size, aad_buf,
+								   aad_offset);
 			}
 			if (unlikely(size)) {
 				plt_dp_err("Insufficient buffer space,"
@@ -632,8 +692,28 @@ sg2_inst_prep(struct roc_se_fc_params *params, struct cpt_inst_s *inst, uint64_t
 				i = fill_sg2_comp_from_buf_min(scatter_comp, i, params->bufs,
 							       &size);
 			} else {
-				i = fill_sg2_comp_from_iov(scatter_comp, i, params->dst_iov, 0,
-							   &size, aad_buf, aad_offset);
+				uint32_t dst_offset = 0;
+
+				if (passthrough_len) {
+					if (unlikely(passthrough_len > ROC_SE_MAX_AAD_SIZE)) {
+						plt_dp_err(
+							"Passthrough length %u exceeds reserved space %u",
+							passthrough_len, ROC_SE_MAX_AAD_SIZE);
+						return -1;
+					}
+					uint64_t meta_passthrough =
+						(uint64_t)params->meta_buf.vaddr +
+						params->meta_buf.size - ROC_SE_MAX_AAD_SIZE;
+					i = fill_sg2_comp(scatter_comp, i, meta_passthrough,
+							  passthrough_len);
+					size -= passthrough_len;
+					dst_offset = passthrough_len;
+					aad_offset = 0;
+				}
+				if (size)
+					i = fill_sg2_comp_from_iov(scatter_comp, i, params->dst_iov,
+								   dst_offset, &size, aad_buf,
+								   aad_offset);
 			}
 
 			if (unlikely(size)) {
-- 
2.43.0


^ permalink raw reply related

* RE: ARM v8 rte_power_pause
From: Hemant Agrawal @ 2026-06-12  6:11 UTC (permalink / raw)
  To: Morten Brørup, Wathsala Vithanage
  Cc: dev@dpdk.org, Maxime Leroy, Gagandeep Singh
In-Reply-To: <98CBD80474FA8B44BF855DF32C47DC35F658E5@smartserver.smartshare.dk>

Hi Morten,
On Cortex‑A72 (ARMv8), the only architectural primitives available are YIELD, WFE, and WFI:

	YIELD is the only deterministic, low-overhead option (pure CPU relax, no entry into low-power state)
	WFE can be used as a low-power idle hint, but it is event-driven and not time-based (it may return immediately)
	WFI depends on interrupt wakeup and is therefore not suitable for tight latency loops

For ~1 µs latency targets, the practical approach is a hybrid strategy:

Short waits → spin using YIELD
Slightly longer waits → opportunistically use WFE for power reduction

A simple implementation could look like (not tested):

static inline void rte_armv8_pause(unsigned int iters)
{
	if (iters < 64) {
		for (unsigned int i = 0; i < iters; i++)
			asm volatile("yield");
	} else {
		asm volatile("sevl");
		asm volatile("wfe");
	}
}

@Wathsala Vithanage — would appreciate your thoughts, especially if there are any micro-architectural nuances we should consider.

Regards,
Hemant

> -----Original Message-----
> From: Morten Brørup <mb@smartsharesystems.com>
> Sent: 03 June 2026 17:26
> To: Wathsala Vithanage <wathsala.vithanage@arm.com>; Hemant Agrawal
> <hemant.agrawal@nxp.com>; Sachin Saxena (OSS)
> <sachin.saxena@oss.nxp.com>
> Cc: dev@dpdk.org; Maxime Leroy <maxime@leroys.fr>
> Subject: ARM v8 rte_power_pause
> Importance: High
> 
> Hi Wathsala, Hemant and Sachin,
> 
> Over at the Grout project, we are discussing power management in the
> context of 100 Gbit/s latency deadlines [1].
> 
> rte_power_pause() is not implemented for ARM v8 / Cortex-A72.
> Syscalls such as nanosleep() have too much overhead, and cannot be used.
> 
> Any suggestions for a power-reducing method to make a CPU core "sleep" (i.e.
> do nothing) for durations in the order of 1 microsecond?
> 
> [1]:
> https://eur01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithu
> b.com%2FDPDK%2Fgrout%2Fpull%2F624%23issuecomment-
> 4602036364&data=05%7C02%7Chemant.agrawal%40nxp.com%7Cdbff5f2e
> 8db1406f0c4008dec1671791%7C686ea1d3bc2b4c6fa92cd99c5c301635%7
> C0%7C0%7C639160845728472826%7CUnknown%7CTWFpbGZsb3d8eyJFb
> XB0eU1hcGkiOnRydWUsIlYiOiIwLjAuMDAwMCIsIlAiOiJXaW4zMiIsIkFOIjoiTW
> FpbCIsIldUIjoyfQ%3D%3D%7C0%7C%7C%7C&sdata=DRpJWjm2yaF3Cnhk0b
> bFFhmGbKRweOOiWdsWco2NbX0%3D&reserved=0
> 
> -Morten


^ permalink raw reply

* Re: [PATCH 9/9] net/dpaa2: drop the fake software VLAN strip offload
From: Maxime Leroy @ 2026-06-12  5:42 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Hemant Agrawal, Sachin Saxena, dev
In-Reply-To: <20260611103058.160842f4@phoenix.local>

[-- Attachment #1: Type: text/plain, Size: 3898 bytes --]

Le jeu. 11 juin 2026, 19:31, Stephen Hemminger <stephen@networkplumber.org>
a écrit :

> On Thu, 11 Jun 2026 17:49:24 +0200
> Maxime Leroy <maxime@leroys.fr> wrote:
>
> > It saves a forwarding application nothing: the datapath reads the L2
> > header anyway to classify or strip. The offload does not remove that
> > read, it relocates it into the driver Rx burst, where it is far more
> > expensive.
> >
> > The cost is a matter of timing. rte_vlan_strip() reaches the L2 header
> > through rte_pktmbuf_mtod(), which dereferences mbuf->buf_addr. On a
> > freshly recycled buffer that mbuf cacheline is cold. eth_fd_to_mbuf()
> > has just written other fields of it (data_off, ol_flags), but buf_addr
> > is a persistent field it does not rewrite. A write does not stall: it
> > posts to the store buffer while the line fills in the background, and
> > the rewritten fields are forwarded straight from there. buf_addr has
> > nothing to forward, so it must be read from the line, whose fill is
> > still in flight, and the read stalls. The ethertype read that follows,
> > on the cold payload line, stalls again. Read later by the application,
> > when the fill has completed, the same read hits. The offload just
> > performs it at the worst possible moment.
> >
> > Measured on a single-core port-to-port forwarding test over two 10G
> > ports (one core at 2 GHz, 64-byte untagged frames):
> >
> >   - throughput 4.22 -> 5.00 Mpps (+18 percent)
> >   - IPC 0.93 -> 1.25: the cost was memory stall, not compute
> >   - L3/DRAM-bound L2 refills 319M -> 200M over 10s (-37 percent)
> >
> > perf confirms it: with the offload, the buf_addr load (the cold mbuf
> > field) and the payload load account for about 84 percent of the Rx
> > burst's L2 refills; removing it, those vanish and only the inherent DQRR
> > dequeue misses remain.
> >
> > Stop advertising VLAN_STRIP and remove the rte_vlan_strip() calls from
> > every Rx path. This is a behavioural change: the tag is left in the
> > frame, so an application must strip it itself, on the L2 header it
> > already reads.
> >
> > Signed-off-by: Maxime Leroy <maxime@leroys.fr>
> > ---
>
> In general I agree, but you overstate the impact. Any real application
> is going to look at the mbuf anyway. Relying on testpmd numbers is BS.
>
> The NBL driver does the same thing.
> So does PCAP but it has no choice, and is slow anyway.
> Virtio/vhost does as well.

This was not measured with testpmd, but with Grout in I/O forwarding mode.

The comparison is exactly between Grout's software fallback and the
advertised offload path. Without VLAN_STRIP, Grout's rx_process() reads the
Ethernet header and strips the VLAN tag itself if needed. With VLAN_STRIP
enabled, Grout uses rx_offload_process(), which only consumes
RTE_MBUF_F_RX_VLAN_STRIPPED/vlan_tci and does not inspect the Ethernet
header
for VLAN stripping.

For dpaa2, however, VLAN_STRIP is not done by the device. The PMD
implements the advertised offload by calling rte_vlan_strip() in the Rx
burst path. So enabling the "offload" just moves the same software work
from Grout into the driver.

The cost is timing. rte_vlan_strip() calls rte_pktmbuf_mtod(), which needs
mbuf->buf_addr. That value is persistent mbuf metadata; it is not produced
by the FD-to-mbuf conversion. eth_fd_to_mbuf() has just written other mbuf
fields such as data_off and ol_flags; those writes can be posted or
forwarded, but
they do not provide buf_addr. If the mbuf cacheline is cold, the buf_addr
load
has to wait for that line to be fetched before the driver can reach the
Ethernet header.

Grout does the same L2 read later in rx_process(), where it is already
processing L2. So the fake PMD offload performs the same software fallback,
but injects an extra mbuf-metadata dependency at a worse point in the Rx
burst path.

[-- Attachment #2: Type: text/html, Size: 4993 bytes --]

^ permalink raw reply

* Re: [PATCH] net/crc: add 4x folding loop for x86 SSE implementation
From: Shreesh Adiga @ 2026-06-12  3:02 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Jasvinder Singh, Bruce Richardson, Konstantin Ananyev, dev
In-Reply-To: <20260611100611.17880d3b@phoenix.local>

[-- Attachment #1: Type: text/plain, Size: 1955 bytes --]

On Thu, Jun 11, 2026 at 10:36 PM Stephen Hemminger <
stephen@networkplumber.org> wrote:

> On Tue,  9 Jun 2026 13:27:12 +0530
> Shreesh Adiga <16567adigashreesh@gmail.com> wrote:
>
> > Add a 64-byte loop that maintains 4 fold registers and processes
> > 64 bytes at a time. The 4x fold registers is then reduced to 16 byte
> > single fold, similar to AVX512 implementation. This technique is
> > described in the paper by Intel:
> > "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ
> Instruction"
> >
> > This results in roughly 50% performance improvement due to better ILP
> > for large input sizes like 1024.
> >
> > Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
> > ---
>
> Looks good applied to next-net.
>
> A couple of nits from more detailed AI review, that you still might want
> to look at:
>
> The current crc_autotest does not exercise the new 64-byte CRC16 path.
> Its CRC32 vectors are 1512 and 348 bytes, so the CRC32 4x loop is
> covered — but the largest CRC16 vector is 32 bytes, all three CRC16
> tests being ≤32. So the new CRC16 rk1_rk2 (64-byte fold) constants ship
> untested in CI. My exhaustive test confirms they're correct, but a
> future regression there wouldn't be caught. Suggest adding a CRC16
> vector ≥64 bytes, ideally a non-multiple of 64 (e.g. 80 or 100) so it
> hits the 4x loop, the single-fold tail, and the partial-bytes path
> together.
>
> In partial_bytes the comment /* k = rk1 & rk2 */ is now stale
>  — after the patch k holds rk3_rk4 on every path reaching it.
> Not introduced by this patch, but the patch is what made it wrong;
> worth fixing in passing.
>
> I've submitted couple of follow up patches that should address the above:
https://patches.dpdk.org/project/dpdk/patch/20260612023745.275608-1-16567adigashreesh@gmail.com/
https://patches.dpdk.org/project/dpdk/patch/20260612025135.298226-1-16567adigashreesh@gmail.com/

[-- Attachment #2: Type: text/html, Size: 2806 bytes --]

^ permalink raw reply

* [PATCH] net/crc: cleanup code in net_crc_sse.c implementation
From: Shreesh Adiga @ 2026-06-12  2:51 UTC (permalink / raw)
  To: Bruce Richardson, Konstantin Ananyev, Jasvinder Singh; +Cc: dev

Special handling for len between 16 and 31 is not required as the
implementation correctly handles them in the main path. Given that these
cases were annotated with unlikely branch hint, it should be simpler to
have these handled in the main path itself.

We can remove the partial_bytes label as there is no jump target to it,
and replace folding code in that block with already existing inline
function to simplify and have better code reuse.

Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
---
 lib/net/net_crc_sse.c | 53 ++++++++++++++-----------------------------
 1 file changed, 17 insertions(+), 36 deletions(-)

diff --git a/lib/net/net_crc_sse.c b/lib/net/net_crc_sse.c
index dfef8ecc59..e30f8544fc 100644
--- a/lib/net/net_crc_sse.c
+++ b/lib/net/net_crc_sse.c
@@ -182,39 +182,24 @@ crc32_eth_calc_pclmulqdq(
 		goto single_fold_loop;
 	}
 
-	if (unlikely(data_len < 32)) {
-		if (unlikely(data_len == 16)) {
-			/* 16 bytes */
-			fold = _mm_loadu_si128((const __m128i *)data);
-			fold = _mm_xor_si128(fold, temp);
-			goto reduction_128_64;
-		}
+	if (unlikely(data_len < 16)) {
+		/* 0 to 15 bytes */
+		alignas(16) uint8_t buffer[16];
 
-		if (unlikely(data_len < 16)) {
-			/* 0 to 15 bytes */
-			alignas(16) uint8_t buffer[16];
-
-			memset(buffer, 0, sizeof(buffer));
-			memcpy(buffer, data, data_len);
-
-			fold = _mm_load_si128((const __m128i *)buffer);
-			fold = _mm_xor_si128(fold, temp);
-			if (unlikely(data_len < 4)) {
-				fold = xmm_shift_left(fold, 8 - data_len);
-				goto barret_reduction;
-			}
-			fold = xmm_shift_left(fold, 16 - data_len);
-			goto reduction_128_64;
-		}
-		/* 17 to 31 bytes */
-		fold = _mm_loadu_si128((const __m128i *)data);
+		memset(buffer, 0, sizeof(buffer));
+		memcpy(buffer, data, data_len);
+
+		fold = _mm_load_si128((const __m128i *)buffer);
 		fold = _mm_xor_si128(fold, temp);
-		n = 16;
-		k = params->rk3_rk4;
-		goto partial_bytes;
+		if (unlikely(data_len < 4)) {
+			fold = xmm_shift_left(fold, 8 - data_len);
+			goto barret_reduction;
+		}
+		fold = xmm_shift_left(fold, 16 - data_len);
+		goto reduction_128_64;
 	}
 
-	/** At least 32 bytes in the buffer */
+	/** At least 16 bytes in the buffer */
 	/** Apply CRC initial value */
 	fold = _mm_loadu_si128((const __m128i *)data);
 	fold = _mm_xor_si128(fold, temp);
@@ -229,7 +214,7 @@ crc32_eth_calc_pclmulqdq(
 		fold = crcr32_folding_round(temp, k, fold);
 	}
 
-partial_bytes:
+	/** Partial bytes - process last <16 bytes */
 	if (likely(n < data_len)) {
 
 		__m128i last16, a, b;
@@ -244,12 +229,8 @@ crc32_eth_calc_pclmulqdq(
 		b = _mm_shuffle_epi8(fold, temp);
 		b = _mm_blendv_epi8(b, last16, temp);
 
-		/* k = rk1 & rk2 */
-		temp = _mm_clmulepi64_si128(a, k, 0x01);
-		fold = _mm_clmulepi64_si128(a, k, 0x10);
-
-		fold = _mm_xor_si128(fold, temp);
-		fold = _mm_xor_si128(fold, b);
+		/* k = rk3 & rk4 */
+		fold = crcr32_folding_round(b, k, a);
 	}
 
 	/** Reduction 128 -> 32 Assumes: fold holds 128bit folded data */
-- 
2.53.0


^ permalink raw reply related

* [PATCH] test: add larger input len test for CRC16-CCITT
From: Shreesh Adiga @ 2026-06-12  2:37 UTC (permalink / raw)
  To: Jasvinder Singh; +Cc: dev

CRC16-CCITT test only covered len 32, 12, and 2 which meant that
code paths like 4x SSE4.2 loop and AVX512 code paths which operated
on larger lens like >255 never got covered.

This patch adds a 348 len input test for CRC16-CCITT similar to
CRC32 test which covers the additional paths in SSE4.2 and AVX512
implementations, therefore improving the test coverage.

Signed-off-by: Shreesh Adiga <16567adigashreesh@gmail.com>
---
 app/test/test_crc.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/app/test/test_crc.c b/app/test/test_crc.c
index 4ff03e3f64..11645d323d 100644
--- a/app/test/test_crc.c
+++ b/app/test/test_crc.c
@@ -14,6 +14,7 @@
 #define CRC32_VEC_LEN2     348
 #define CRC16_VEC_LEN1     12
 #define CRC16_VEC_LEN2     2
+#define CRC16_VEC_LEN3     348
 
 /* CRC test vector */
 static const uint8_t crc_vec[CRC_VEC_LEN] = {
@@ -46,6 +47,7 @@ static const uint32_t crc32_vec2_res = 0xefaae02f;
 static const uint32_t crc16_vec_res = 0x6bec;
 static const uint32_t crc16_vec1_res = 0x8cdd;
 static const uint32_t crc16_vec2_res = 0xec5b;
+static const uint32_t crc16_vec3_res = 0x6271;
 
 static int
 crc_all_algs(const char *desc, enum rte_net_crc_type type,
@@ -137,6 +139,12 @@ crc_autotest(void)
 	ret |= crc_all_algs("16-bit CCITT CRC:  Test 6", RTE_NET_CRC16_CCITT, crc16_vec2,
 		CRC16_VEC_LEN2, crc16_vec2_res);
 
+	/* 16-bit CCITT CRC:  Test 7 */
+	memset(test_data, 0, CRC32_VEC_LEN1);
+	for (i = 0; i < CRC16_VEC_LEN3; i += 12)
+		rte_memcpy(&test_data[i], crc16_vec1, 12);
+	ret |= crc_all_algs("16-bit CCITT CRC: Test 7", RTE_NET_CRC16_CCITT, test_data,
+		CRC16_VEC_LEN3, crc16_vec3_res);
 	return ret;
 }
 
-- 
2.53.0


^ permalink raw reply related

* [PATCH 15/15] doc: correct grammar and punctuation consistency issues
From: Stephen Hemminger @ 2026-06-11 21:18 UTC (permalink / raw)
  To: dev
  Cc: Stephen Hemminger, Nicolas Chautru, Gowrishankar Muthukrishnan,
	Bruce Richardson, Radu Nicolau, Akhil Goyal, Anatoly Burakov,
	Jingjing Wu, Rajesh Kumar, Cristian Dumitrescu, John McNamara
In-Reply-To: <20260611212119.1026721-1-stephen@networkplumber.org>

Correct grammar and punctuation issues across sample application guides:
- Added missing comma after "To compile the sample application" in 11 files
- Added missing period in cmd_line.rst compilation instruction
- Fixed capitalization of "Linux" (was lowercase "linux") in 4 files
- Fixed capitalization of "Ethernet" (was lowercase "ethernet") in 2 files
- Fixed "then" to "than" in comparison contexts (2 instances)
- Fixed subject-verb agreement: "specify" to "specifies"

These changes ensure consistency with the style used in other documentation
files and improve overall readability.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 doc/guides/sample_app_ug/bbdev_app.rst        |  4 ++--
 doc/guides/sample_app_ug/cmd_line.rst         |  2 +-
 doc/guides/sample_app_ug/ethtool.rst          |  2 +-
 doc/guides/sample_app_ug/fips_validation.rst  |  2 +-
 doc/guides/sample_app_ug/hello_world.rst      |  2 +-
 doc/guides/sample_app_ug/ipsec_secgw.rst      | 20 +++++++++----------
 .../sample_app_ug/l2_forward_real_virtual.rst |  4 ++--
 doc/guides/sample_app_ug/link_status_intr.rst |  2 +-
 doc/guides/sample_app_ug/multi_process.rst    |  2 +-
 doc/guides/sample_app_ug/ntb.rst              |  2 +-
 doc/guides/sample_app_ug/ptp_tap_relay_sw.rst |  2 +-
 doc/guides/sample_app_ug/ptpclient.rst        |  2 +-
 doc/guides/sample_app_ug/qos_metering.rst     |  2 +-
 doc/guides/sample_app_ug/rxtx_callbacks.rst   |  2 +-
 14 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/doc/guides/sample_app_ug/bbdev_app.rst b/doc/guides/sample_app_ug/bbdev_app.rst
index 00bbd1aa27..58deb6d90f 100644
--- a/doc/guides/sample_app_ug/bbdev_app.rst
+++ b/doc/guides/sample_app_ug/bbdev_app.rst
@@ -25,7 +25,7 @@ The MAC header is preserved in the packet throughout the loop-back operation.
 Limitations
 -----------
 
-* Only one baseband device and one ethernet port can be used.
+* Only one baseband device and one Ethernet port can be used.
 
 Compiling the Application
 -------------------------
@@ -34,7 +34,7 @@ DPDK needs to be built with ``baseband_turbo_sw`` PMD enabled along
 with ``FLEXRAN SDK`` Libraries. Refer to *SW Turbo Poll Mode Driver*
 documentation for more details.
 
-To compile the sample application see :doc:`compiling`.
+To compile the sample application, see :doc:`compiling`.
 
 
 Running the Application
diff --git a/doc/guides/sample_app_ug/cmd_line.rst b/doc/guides/sample_app_ug/cmd_line.rst
index 5b192bc615..072e4850b4 100644
--- a/doc/guides/sample_app_ug/cmd_line.rst
+++ b/doc/guides/sample_app_ug/cmd_line.rst
@@ -47,7 +47,7 @@ There are three simple commands:
 Compiling the Application
 -------------------------
 
-To compile the sample application see :doc:`compiling`
+To compile the sample application, see :doc:`compiling`.
 
 The application is located in the ``cmd_line`` sub-directory.
 
diff --git a/doc/guides/sample_app_ug/ethtool.rst b/doc/guides/sample_app_ug/ethtool.rst
index 68bcf25b1f..4f740b09a7 100644
--- a/doc/guides/sample_app_ug/ethtool.rst
+++ b/doc/guides/sample_app_ug/ethtool.rst
@@ -18,7 +18,7 @@ is based upon a simple L2 frame reflector.
 Compiling the Application
 -------------------------
 
-To compile the sample application see :doc:`compiling`.
+To compile the sample application, see :doc:`compiling`.
 
 The application is located in the ``ethtool`` subdirectory.
 
diff --git a/doc/guides/sample_app_ug/fips_validation.rst b/doc/guides/sample_app_ug/fips_validation.rst
index 732f47212a..ad51beb4f4 100644
--- a/doc/guides/sample_app_ug/fips_validation.rst
+++ b/doc/guides/sample_app_ug/fips_validation.rst
@@ -96,7 +96,7 @@ Compiling the Application
 
 * Compile Application
 
-    To compile the sample application see :doc:`compiling`.
+    To compile the sample application, see :doc:`compiling`.
 
 *  Run ``dos2unix`` on the request files
 
diff --git a/doc/guides/sample_app_ug/hello_world.rst b/doc/guides/sample_app_ug/hello_world.rst
index 603a1d8767..feee6b52ae 100644
--- a/doc/guides/sample_app_ug/hello_world.rst
+++ b/doc/guides/sample_app_ug/hello_world.rst
@@ -14,7 +14,7 @@ The application simply prints a "helloworld" message on every enabled lcore.
 Compiling the Application
 -------------------------
 
-To compile the sample application see :doc:`compiling`.
+To compile the sample application, see :doc:`compiling`.
 
 The application is located in the ``helloworld`` sub-directory.
 
diff --git a/doc/guides/sample_app_ug/ipsec_secgw.rst b/doc/guides/sample_app_ug/ipsec_secgw.rst
index 8826ffb286..1f172fd4b6 100644
--- a/doc/guides/sample_app_ug/ipsec_secgw.rst
+++ b/doc/guides/sample_app_ug/ipsec_secgw.rst
@@ -89,7 +89,7 @@ The application supports two modes of operation: poll mode and event mode.
   **Currently the application provides non-burst, internal port worker threads.**
   It also provides infrastructure for non-internal ports but does not define any worker threads.
 
-  Event mode also supports event vectorization. The event devices, ethernet device
+  Event mode also supports event vectorization. The event devices, Ethernet device
   pairs which support the capability ``RTE_EVENT_ETH_RX_ADAPTER_CAP_EVENT_VECTOR`` can
   aggregate packets based on flow characteristics and generate a ``rte_event``
   containing ``rte_event_vector``.
@@ -245,9 +245,9 @@ Where:
     Zero value disables reassembly functionality.
     Default value: 0.
 
-*   ``--mtu MTU``: MTU value (in bytes) on all attached ethernet ports.
-    Outgoing packets with length bigger then MTU will be fragmented.
-    Incoming packets with length bigger then MTU will be discarded.
+*   ``--mtu MTU``: MTU value (in bytes) on all attached Ethernet ports.
+    Outgoing packets with length bigger than MTU will be fragmented.
+    Incoming packets with length bigger than MTU will be discarded.
     Default value: 1500.
 
 *   ``--frag-ttl FRAG_TTL_NS``: fragment lifetime (in nanoseconds).
@@ -694,10 +694,10 @@ where each options means:
 
 ``<type>``
 
- * Action type to specify the security action. This option specify
+ * Action type to specify the security action. This option specifies
    the SA to be performed with look aside protocol offload to HW
-   accelerator or protocol offload on ethernet device or inline
-   crypto processing on the ethernet device during transmission.
+   accelerator or protocol offload on Ethernet device or inline
+   crypto processing on the Ethernet device during transmission.
 
  * Optional: Yes, default type *no-offload*
 
@@ -1083,7 +1083,7 @@ The test directory contains scripts for testing the various encryption
 algorithms.
 
 The purpose of the scripts is to automate ipsec-secgw testing
-using another system running linux as a DUT.
+using another system running Linux as a DUT.
 
 The user must setup the following environment variables:
 
@@ -1127,11 +1127,11 @@ key to the DUT. It will ask for credentials so that it can upload the public key
 
 The SUT and DUT are connected through at least 2 NIC ports.
 
-One NIC port is expected to be managed by linux on both machines and will be
+One NIC port is expected to be managed by Linux on both machines and will be
 used as a control path.
 
 The second NIC port (test-port) should be bound to DPDK on the SUT, and should
-be managed by linux on the DUT.
+be managed by Linux on the DUT.
 
 The script starts ``ipsec-secgw`` with 2 NIC devices: ``test-port`` and
 ``tap vdev``.
diff --git a/doc/guides/sample_app_ug/l2_forward_real_virtual.rst b/doc/guides/sample_app_ug/l2_forward_real_virtual.rst
index dbe880e1b3..e31dd98ae5 100644
--- a/doc/guides/sample_app_ug/l2_forward_real_virtual.rst
+++ b/doc/guides/sample_app_ug/l2_forward_real_virtual.rst
@@ -111,14 +111,14 @@ where,
 
 *   --portmap="(port,port)[,(port,port)]": Determines forwarding ports mapping.
 
-To run the application in linux environment with 4 lcores, 16 ports and 8 RX queues per lcore and MAC address
+To run the application in Linux environment with 4 lcores, 16 ports and 8 RX queues per lcore and MAC address
 updating enabled, issue the command:
 
 .. code-block:: console
 
     $ ./<build_dir>/examples/dpdk-l2fwd -l 0-3 -- -q 8 -p ffff
 
-To run the application in linux environment with 4 lcores, 4 ports, 8 RX queues
+To run the application in Linux environment with 4 lcores, 4 ports, 8 RX queues
 per lcore, to forward RX traffic of ports 0 & 1 on ports 2 & 3 respectively and
 vice versa, issue the command:
 
diff --git a/doc/guides/sample_app_ug/link_status_intr.rst b/doc/guides/sample_app_ug/link_status_intr.rst
index fd4e32560c..d1021df5c8 100644
--- a/doc/guides/sample_app_ug/link_status_intr.rst
+++ b/doc/guides/sample_app_ug/link_status_intr.rst
@@ -50,7 +50,7 @@ where,
 
 *   -T PERIOD: statistics will be refreshed each PERIOD seconds (0 to disable, 10 default)
 
-To run the application in a linux environment with 4 lcores, 16 ports and 8 RX queues per lcore,
+To run the application in a Linux environment with 4 lcores, 16 ports and 8 RX queues per lcore,
 issue the command:
 
 .. code-block:: console
diff --git a/doc/guides/sample_app_ug/multi_process.rst b/doc/guides/sample_app_ug/multi_process.rst
index e9e5809a92..f1ec980e20 100644
--- a/doc/guides/sample_app_ug/multi_process.rst
+++ b/doc/guides/sample_app_ug/multi_process.rst
@@ -16,7 +16,7 @@ The multi-process example applications are built the same way as other sample ap
 and as documented in the *DPDK Getting Started Guide*.
 
 
-To compile the sample application see :doc:`compiling`.
+To compile the sample application, see :doc:`compiling`.
 
 The applications are located in the ``multi_process`` sub-directory.
 
diff --git a/doc/guides/sample_app_ug/ntb.rst b/doc/guides/sample_app_ug/ntb.rst
index ca0ff54757..8557b73042 100644
--- a/doc/guides/sample_app_ug/ntb.rst
+++ b/doc/guides/sample_app_ug/ntb.rst
@@ -20,7 +20,7 @@ This sample supports 4 types of packet forwarding mode.
 Compiling the Application
 -------------------------
 
-To compile the sample application see :doc:`compiling`.
+To compile the sample application, see :doc:`compiling`.
 
 The application is located in the ``ntb`` sub-directory.
 
diff --git a/doc/guides/sample_app_ug/ptp_tap_relay_sw.rst b/doc/guides/sample_app_ug/ptp_tap_relay_sw.rst
index 466879cd46..a880a1295a 100644
--- a/doc/guides/sample_app_ug/ptp_tap_relay_sw.rst
+++ b/doc/guides/sample_app_ug/ptp_tap_relay_sw.rst
@@ -75,7 +75,7 @@ This minimises the gap between the measured timestamp and the actual wire egress
 Compiling the Application
 -------------------------
 
-To compile the sample application see :doc:`compiling`.
+To compile the sample application, see :doc:`compiling`.
 
 The application is located in the ``ptp_tap_relay_sw`` sub-directory.
 
diff --git a/doc/guides/sample_app_ug/ptpclient.rst b/doc/guides/sample_app_ug/ptpclient.rst
index 28007604a0..b842f5e0b8 100644
--- a/doc/guides/sample_app_ug/ptpclient.rst
+++ b/doc/guides/sample_app_ug/ptpclient.rst
@@ -58,7 +58,7 @@ synchronizes the PTP PHC clock with the Linux kernel clock.
 Compiling the Application
 -------------------------
 
-To compile the sample application see :doc:`compiling`.
+To compile the sample application, see :doc:`compiling`.
 
 The application is located in the ``ptpclient`` sub-directory.
 
diff --git a/doc/guides/sample_app_ug/qos_metering.rst b/doc/guides/sample_app_ug/qos_metering.rst
index b317d4079d..e8f3c6af19 100644
--- a/doc/guides/sample_app_ug/qos_metering.rst
+++ b/doc/guides/sample_app_ug/qos_metering.rst
@@ -38,7 +38,7 @@ all the incoming packets are colored as green.
 Compiling the Application
 -------------------------
 
-To compile the sample application see :doc:`compiling`.
+To compile the sample application, see :doc:`compiling`.
 
 The application is located in the ``qos_meter`` sub-directory.
 
diff --git a/doc/guides/sample_app_ug/rxtx_callbacks.rst b/doc/guides/sample_app_ug/rxtx_callbacks.rst
index 7dfcab5d65..31e2e5c811 100644
--- a/doc/guides/sample_app_ug/rxtx_callbacks.rst
+++ b/doc/guides/sample_app_ug/rxtx_callbacks.rst
@@ -33,7 +33,7 @@ The application is located in the ``rxtx_callbacks`` sub-directory.
 Running the Application
 -----------------------
 
-To run the example in a ``linux`` environment:
+To run the example in a ``Linux`` environment:
 
 .. code-block:: console
 
-- 
2.53.0


^ permalink raw reply related

* [PATCH 14/15] doc: improve vhost, VM power, and VMDq sample guides
From: Stephen Hemminger @ 2026-06-11 21:18 UTC (permalink / raw)
  To: dev
  Cc: Stephen Hemminger, Maxime Coquelin, Chenbo Xia, Anatoly Burakov,
	Sivaprasad Tummala
In-Reply-To: <20260611212119.1026721-1-stephen@networkplumber.org>

Enhanced documentation for virtualization-related samples:

vhost_blk.rst:
- Improved vhost block device descriptions
- Enhanced SCSI target configuration explanations
- Fixed formatting consistency

vhost_crypto.rst:
- Clarified cryptographic backend setup
- Improved virtio-crypto device descriptions
- Enhanced command-line examples

vm_power_management.rst:
- Restructured power management policy sections
- Improved guest-host communication descriptions
- Enhanced frequency scaling explanations
- Clarified branch ratio monitoring

vmdq_dcb_forwarding.rst:
- Enhanced VMDq and DCB configuration descriptions
- Improved traffic class explanations
- Fixed formatting in configuration examples

vmdq_forwarding.rst:
- Simplified VMDq pool configuration descriptions
- Improved MAC address filtering explanations

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 doc/guides/sample_app_ug/vhost_blk.rst        |  66 +++--
 doc/guides/sample_app_ug/vhost_crypto.rst     |  64 ++---
 .../sample_app_ug/vm_power_management.rst     | 262 ++++++++----------
 .../sample_app_ug/vmdq_dcb_forwarding.rst     | 101 ++++---
 doc/guides/sample_app_ug/vmdq_forwarding.rst  |  38 ++-
 5 files changed, 251 insertions(+), 280 deletions(-)

diff --git a/doc/guides/sample_app_ug/vhost_blk.rst b/doc/guides/sample_app_ug/vhost_blk.rst
index 788eef0d5f..7a3421302d 100644
--- a/doc/guides/sample_app_ug/vhost_blk.rst
+++ b/doc/guides/sample_app_ug/vhost_blk.rst
@@ -4,34 +4,37 @@
 Vhost_blk Sample Application
 =============================
 
-The vhost_blk sample application implemented a simple block device,
-which used as the  backend of Qemu vhost-user-blk device. Users can extend
-the exist example to use other type of block device(e.g. AIO) besides
-memory based block device. Similar with vhost-user-net device, the sample
-application used domain socket to communicate with Qemu, and the virtio
-ring (split or packed format) was processed by vhost_blk sample application.
-
-The sample application reuse lots codes from SPDK(Storage Performance
-Development Kit, https://github.com/spdk/spdk) vhost-user-blk target,
-for DPDK vhost library used in storage area, user can take SPDK as
-reference as well.
-
-Testing steps
--------------
-
-This section shows the steps how to start a VM with the block device as
+Overview
+--------
+
+The vhost_blk sample application implements a simple block device,
+which serves as the backend for the QEMU vhost-user-blk device. Users can
+extend the existing example to use other types of block devices (e.g., AIO)
+besides memory-based block devices. Similar to the vhost-user-net device,
+the sample application uses a Unix domain socket to communicate with QEMU
+and processes the virtio ring (split or packed format).
+
+The sample application reuses code from SPDK (Storage Performance
+Development Kit, https://github.com/spdk/spdk) vhost-user-blk target.
+For DPDK vhost library usage in storage applications, users can refer
+to SPDK as well.
+
+This section shows how to start a VM with the block device as
 fast data path for critical application.
 
 Compiling the Application
 -------------------------
 
-To compile the sample application see :doc:`compiling`.
+To compile the sample application, see :doc:`compiling`.
 
-The application is located in the ``examples`` sub-directory.
+The application is located in the ``examples/vhost_blk`` directory.
 
-You will also need to build DPDK both on the host and inside the guest
+You will need to build DPDK both on the host and inside the guest.
 
-Start the vhost_blk example
+Running the Application
+-----------------------
+
+Start the vhost_blk application
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. code-block:: console
@@ -54,12 +57,17 @@ Start the VM
         -device vhost-user-blk-pci,packed=on,chardev=char0,num-queues=1 \
         ...
 
-.. note::
-    You must check whether your Qemu can support "vhost-user-blk" or not,
-    Qemu v4.0 or newer version is required.
-    reconnect=1 means live recovery support that qemu can reconnect vhost_blk
-    after we restart vhost_blk example.
-    packed=on means the device support packed ring but need the guest kernel
-    version >= 5.0.
-    Now Qemu commit 9bb73502321d46f4d320fa17aa38201445783fc4 both support the
-    vhost-blk reconnect and packed ring.
+QEMU Options
+^^^^^^^^^^^^
+
+QEMU v4.0 or newer is required for vhost-user-blk support.
+
+reconnect=1
+   Enables live recovery support, allowing QEMU to reconnect to the
+   vhost_blk application after it restarts.
+
+packed=on
+   Enables packed virtqueue support. Requires guest kernel version 5.0
+   or newer.
+
+QEMU commit 9bb73502321d supports both vhost-blk reconnect and packed ring.
diff --git a/doc/guides/sample_app_ug/vhost_crypto.rst b/doc/guides/sample_app_ug/vhost_crypto.rst
index 5c4475342c..9f8e71b1b7 100644
--- a/doc/guides/sample_app_ug/vhost_crypto.rst
+++ b/doc/guides/sample_app_ug/vhost_crypto.rst
@@ -4,25 +4,25 @@
 Vhost_Crypto Sample Application
 ===============================
 
-The vhost_crypto sample application implemented a simple Crypto device,
-which used as the  backend of Qemu vhost-user-crypto device. Similar with
-vhost-user-net and vhost-user-scsi device, the sample application used
-domain socket to communicate with Qemu, and the virtio ring was processed
-by vhost_crypto sample application.
+The vhost_crypto sample application implements a Crypto device,
+which serves as the backend for the QEMU vhost-user-crypto device.
+Similar to vhost-user-net and vhost-user-scsi devices, the application uses
+a domain socket to communicate with QEMU, and processes the virtio rings
+to provide cryptographic services to the guest.
 
-Testing steps
--------------
-
-This section shows the steps how to start a VM with the crypto device as
+This section shows the steps to start a VM with the crypto device as
 fast data path for critical application.
 
 Compiling the Application
 -------------------------
 
-To compile the sample application see :doc:`compiling`.
+To compile the sample application, see :doc:`compiling`.
 
 The application is located in the ``examples`` sub-directory.
 
+Running the Application
+-----------------------
+
 Start the vhost_crypto example
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -37,33 +37,35 @@ Start the vhost_crypto example
 
 where,
 
-* config (lcore,cdev-id,queue-id): build the lcore-cryptodev id-queue id
-  connection. Once specified, the specified lcore will only work with
-  specified cryptodev's queue.
+* config (lcore,cdev-id,queue-id): builds the lcore-cryptodev id-queue id
+  connection. When specified, the lcore only works with the
+  specified cryptodev queue.
 
-* socket-file lcore,PATH: the path of UNIX socket file to be created and
-  the lcore id that will deal with the all workloads of the socket. Multiple
-  instances of this config item is supported and one lcore supports processing
+* socket-file lcore,PATH: specifies the path of the UNIX socket file to be created and
+  the lcore id that handles all workloads of the socket. Multiple
+  instances of this config item are supported and one lcore can process
   multiple sockets.
 
-* zero-copy: the presence of this item means the ZERO-COPY feature will be
-  enabled. Otherwise it is disabled. PLEASE NOTE the ZERO-COPY feature is still
-  in experimental stage and may cause the problem like segmentation fault. If
-  the user wants to use LKCF in the guest, this feature shall be turned off.
+* zero-copy: when present, indicates the zero-copy feature will be
+  enabled. Otherwise it is disabled.
 
-* guest-polling: the presence of this item means the application assumes the
-  guest works in polling mode, thus will NOT notify the guest completion of
+* guest-polling: when present, assumes the guest works in polling
+  mode and does not notify the guest upon completion of
   processing.
 
-* asymmetric-crypto: the presence of this item means
-  the application can handle the asymmetric crypto requests.
-  When this option is used,
-  symmetric crypto requests can not be handled by the application.
+* asymmetric-crypto: when present, indicates the application handles
+  asymmetric crypto requests. When this option is used, the application
+  cannot handle symmetric crypto requests.
 
-The application requires that crypto devices capable of performing
-the specified crypto operation are available on application initialization.
-This means that HW crypto device/s must be bound to a DPDK driver or
-a SW crypto device/s (virtual crypto PMD) must be created (using --vdev).
+.. note::
+   The zero-copy feature is experimental and may cause segmentation faults.
+   If you want to use LKCF in the guest, disable this feature.
+
+.. note::
+   The application requires that crypto devices capable of performing
+   the specified crypto operation are available on application initialization.
+   HW crypto devices must be bound to a DPDK driver or an SW crypto device
+   (virtual crypto PMD) must be created using --vdev.
 
 .. _vhost_crypto_app_run_vm:
 
@@ -83,4 +85,4 @@ Start the VM
         ...
 
 .. note::
-    You must check whether your Qemu can support "vhost-user-crypto" or not.
+   You must verify that your QEMU supports vhost-user-crypto.
diff --git a/doc/guides/sample_app_ug/vm_power_management.rst b/doc/guides/sample_app_ug/vm_power_management.rst
index 1955140bb3..590078dbd7 100644
--- a/doc/guides/sample_app_ug/vm_power_management.rst
+++ b/doc/guides/sample_app_ug/vm_power_management.rst
@@ -4,38 +4,37 @@
 Virtual Machine Power Management Application
 ============================================
 
-Applications running in virtual environments have an abstract view of
-the underlying hardware on the host. Specifically, applications cannot
-see the binding of virtual components to physical hardware. When looking
-at CPU resourcing, the pinning of Virtual CPUs (vCPUs) to Physical CPUs
-(pCPUs) on the host is not apparent to an application and this pinning
-may change over time. In addition, operating systems on Virtual Machines
-(VMs) do not have the ability to govern their own power policy. The
-Machine Specific Registers (MSRs) for enabling P-state transitions are
-not exposed to the operating systems running on the VMs.
-
-The solution demonstrated in this sample application shows an example of
-how a DPDK application can indicate its processing requirements using
-VM-local only information (vCPU/lcore, and so on) to a host resident VM
-Power Manager. The VM Power Manager is responsible for:
-
-- **Accepting requests for frequency changes for a vCPU**
-- **Translating the vCPU to a pCPU using libvirt**
-- **Performing the change in frequency**
+Overview
+--------
+
+Applications in virtual environments have a limited view of the host hardware.
+They cannot see how virtual components map to physical hardware, including the
+pinning of virtual CPUs (vCPUs) to physical CPUs (pCPUs), which may change over time.
+Additionally, virtual machine operating systems cannot manage their own power policies,
+as the necessary Machine Specific Registers (MSRs) for controlling P-state transitions
+is not available to them.
+
+This sample application demonstrates how a DPDK application can communicate its
+processing needs using local VM information (like vCPU or lcore details) to a
+host-based VM Power Manager.
+
+The VM Power Manager is responsible for:
+
+- Accepting requests for frequency changes for a vCPU
+- Translating the vCPU to a pCPU using libvirt
+- Performing the change in frequency
 
 This application demonstrates the following features:
 
-- **The handling of VM application requests to change frequency.**
+- **Handling of VM application requests to change frequency.**
   VM applications can request frequency changes for a vCPU. The VM
-  Power Management Application uses libvirt to translate that
+  Power Management application uses libvirt to translate that
   virtual CPU (vCPU) request to a physical CPU (pCPU) request and
   performs the frequency change.
 
-- **The acceptance of power management policies from VM applications.**
+- **Acceptance of power management policies from VM applications.**
   A VM application can send a policy to the host application. The
-  policy contains rules that define the power management behaviour
-  of the VM. The host application then applies the rules of the
-  policy independent of the VM application. For example, the
+  policy contains rules that define the power management behavior of the VM, which the host application then applies independent of the VM. For example, the
   policy can contain time-of-day information for busy/quiet
   periods, and the host application can scale up/down the relevant
   cores when required. See :ref:`sending_policy` for information on
@@ -51,7 +50,7 @@ This application demonstrates the following features:
 In addition to the ``librte_power`` library used on the host, the
 application uses a special version of ``librte_power`` on each VM, which
 directs frequency changes and policies to the host monitor rather than
-the APCI ``cpufreq`` ``sysfs`` interface used on the host in non-virtualised
+the ACPI ``cpufreq`` ``sysfs`` interface used on the host in non-virtualised
 environments.
 
 .. _figure_vm_power_mgr_highlevel:
@@ -84,77 +83,64 @@ in the host.
   state, manually altering CPU frequency. Also allows for the changings
   of vCPU to pCPU pinning
 
-Sample Application Architecture Overview
-----------------------------------------
-
-The VM power management solution employs ``qemu-kvm`` to provide
-communications channels between the host and VMs in the form of a
-``virtio-serial`` connection that appears as a para-virtualised serial
-device on a VM and can be configured to use various backends on the
-host. For this example, the configuration of each ``virtio-serial`` endpoint
-on the host as an ``AF_UNIX`` file socket, supporting poll/select and
-``epoll`` for event notification. In this example, each channel endpoint on
-the host is monitored for ``EPOLLIN`` events using ``epoll``. Each channel
-is specified as ``qemu-kvm`` arguments or as ``libvirt`` XML for each VM,
-where each VM can have several channels up to a maximum of 64 per VM. In this
-example, each DPDK lcore on a VM has exclusive access to a channel.
-
-To enable frequency changes from within a VM, the VM forwards a
-``librte_power`` request over the ``virtio-serial`` channel to the host. Each
-request contains the vCPU and power command (scale up/down/min/max). The
-API for the host ``librte_power`` and guest ``librte_power`` is consistent
-across environments, with the selection of VM or host implementation
-determined automatically at runtime based on the environment. On
-receiving a request, the host translates the vCPU to a pCPU using the
-libvirt API before forwarding it to the host ``librte_power``.
+Sample Application Architecture
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
+The VM power management solution uses ``qemu-kvm`` to create communication
+channels between the host and VMs through a ``virtio-serial`` connection.
+This connection appears as a para-virtualized serial device on the VM
+and can use various backends on the host. In this example, each ``virtio-serial``
+endpoint is configured as an ``AF_UNIX`` file socket on the host, supporting
+event notifications via ``poll``, ``select``, or ``epoll``. The host monitors
+each channel for ``EPOLLIN`` events using ``epoll``, with up to 64 channels per VM.
+Each DPDK lcore on a VM has exclusive access to a channel.
+
+To enable frequency scaling from within a VM, the VM sends a ``librte_power``
+request over the ``virtio-serial`` channel to the host. The request specifies
+the vCPU and desired power action (e.g., scale up, scale down, set to min/max).
+The ``librte_power`` API is consistent across environments, automatically selecting
+the appropriate VM or host implementation at runtime. Upon receiving a request,
+the host maps the vCPU to a pCPU using the libvirt API and forwards the command
+to the host’s ``librte_power`` for execution.
 
 .. _figure_vm_power_mgr_vm_request_seq:
 
 .. figure:: img/vm_power_mgr_vm_request_seq.*
 
-In addition to the ability to send power management requests to the
-host, a VM can send a power management policy to the host. In some
-cases, using a power management policy is a preferred option because it
-can eliminate possible latency issues that can occur when sending power
-management requests. Once the VM sends the policy to the host, the VM no
-longer needs to worry about power management, because the host now
-manages the power for the VM based on the policy. The policy can specify
-power behavior that is based on incoming traffic rates or time-of-day
-power adjustment (busy/quiet hour power adjustment for example). See
-:ref:`sending_policy` for more information.
-
-One method of power management is to sense how busy a core is when
-processing packets and adjusting power accordingly. One technique for
-doing this is to monitor the ratio of the branch miss to branch hits
-counters and scale the core power accordingly. This technique is based
-on the premise that when a core is not processing packets, the ratio of
-branch misses to branch hits is very low, but when the core is
-processing packets, it is measurably higher. The implementation of this
-capability is as a policy of type ``BRANCH_RATIO``.
-See :ref:`sending_policy` for more information on using the
-BRANCH_RATIO policy option.
-
-A JSON interface enables the specification of power management requests
-and policies in JSON format. The JSON interfaces provide a more
-convenient and more easily interpreted interface for the specification
-of requests and policies. See :ref:`power_man_requests` for more information.
+In addition to sending power management requests to the
+host, a VM can send a power management policy to the host.
+Using a policy is often preferred as it avoids potential
+latency issues from frequent requests. Once the policy is
+sent, the host manages the VM's power based on the policy,
+freeing the VM from further involvement. Policies can include
+rules like adjusting power based on traffic rates or setting
+power levels for busy and quiet hours. See :ref:`sending_policy`
+for more information.
+
+One power management method monitors core activity by tracking
+the ratio of branch misses to branch hits. When a core is idle,
+this ratio is low; when it’s busy processing packets, the ratio increases.
+This technique, implemented as a ``BRANCH_RATIO`` policy, adjusts core power
+dynamically based on workload. See :ref:`sending_policy` for more information
+on using the BRANCH_RATIO policy option.
+
+Power management requests and policies can also be defined using a JSON interface,
+which provides a simpler and more readable way to specify configurations. See
+For more details, see :ref:`power_man_requests` for more information.
 
 Performance Considerations
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-While the Haswell microarchitecture allows for independent power control
-for each core, earlier microarchitectures do not offer such fine-grained
-control. When deploying on pre-Haswell platforms, greater care must be
-taken when selecting which cores are assigned to a VM, for example, a
-core does not scale down in frequency until all of its siblings are
-similarly scaled down.
+The Haswell microarchitecture enables independent power control for each core,
+but earlier microarchitectures lack this level of precision. On pre-Haswell platforms,
+careful consideration is needed when assigning cores to a VM. For instance, a core cannot
+scale down its frequency until all its sibling cores are also scaled down.
 
 Configuration
--------------
+~~~~~~~~~~~~~
 
 BIOS
-~~~~
+^^^^
 
 To use the power management features of the DPDK, you must enable
 Enhanced Intel SpeedStep® Technology in the platform BIOS. Otherwise,
@@ -163,7 +149,7 @@ exist, and you cannot use CPU frequency-based power management. Refer to the
 relevant BIOS documentation to determine how to access these settings.
 
 Host Operating System
-~~~~~~~~~~~~~~~~~~~~~
+^^^^^^^^^^^^^^^^^^^^^
 
 The DPDK Power Management library can use either the ``acpi_cpufreq`` or
 the ``intel_pstate`` kernel driver for the management of core frequencies. In
@@ -180,10 +166,12 @@ Linux command line:
 
 On reboot, load the ``acpi_cpufreq`` module:
 
+.. code-block:: console
+
    ``modprobe acpi_cpufreq``
 
 Hypervisor Channel Configuration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Configure ``virtio-serial`` channels using ``libvirt`` XML.
 The XML structure is as follows:
@@ -202,7 +190,7 @@ The XML structure is as follows:
 
 Where a single controller of type ``virtio-serial`` is created, up to 32
 channels can be associated with a single controller, and multiple
-controllers can be specified. The convention is to use the name of the
+controllers can be specified. The convention is to use the VM name in the
 VM in the host path ``{vm_name}`` and to increment ``{channel_num}`` for each
 channel. Likewise, the port value ``{N}`` must be incremented for each
 channel.
@@ -271,11 +259,11 @@ than the EAL options:
 
 .. code-block:: console
 
-   ./<build_dir>/examples/dpdk-vm_power_mgr [EAL options]
+   ./<build_dir>/examples/dpdk-vm_power_manager [EAL options]
 
 The application requires exactly two cores to run. One core for the CLI
 and the other for the channel endpoint monitor. For example, to run on
-cores 0 and 1 on a system, issue the command:
+cores 0 and 1, issue the command:
 
 .. code-block:: console
 
@@ -308,83 +296,50 @@ Manager using the command:
 
    rm_vm {vm_name}
 
-Add communication channels for the specified VM using the following
-command. The ``virtio`` channels must be enabled in the VM configuration
-(``qemu/libvirt``) and the associated VM must be active. ``{list}`` is a
-comma-separated list of channel numbers to add. Specifying the keyword
-``all`` attempts to add all channels for the VM:
-
-.. code-block:: console
-
-   set_pcpu {vm_name} {vcpu} {pcpu}
-
-  Enable query of physical core information from a VM:
-
-.. code-block:: console
-
-   set_query {vm_name} enable|disable
-
-Manual control and inspection can also be carried in relation CPU frequency scaling:
-
-  Get the current frequency for each core specified in the mask:
-
-.. code-block:: console
-
-   show_cpu_freq_mask {mask}
-
-  Set the current frequency for the cores specified in {core_mask} by scaling each up/down/min/max:
+Add communication channels for the specified VM using the following command.
+The ``virtio`` channels must be enabled in the VM configuration
+(``qemu/libvirt``) and the associated VM must be active.
+``{list}`` is a comma-separated list of channel numbers to add.
+Specifying the keyword ``all`` attempts to add all channels for the VM:
 
 .. code-block:: console
 
    add_channels {vm_name} {list}|all
 
 Enable or disable the communication channels in ``{list}`` (comma-separated)
-for the specified VM. Alternatively, replace ``list`` with the keyword
-``all``. Disabled channels receive packets on the host. However, the commands
-they specify are ignored. Set the status to enabled to begin processing
-requests again:
+for the specified VM. Alternatively, replace ``{list}`` with the keyword
+``all``. Disabled channels still receive packets on the host, however the
+commands they specify are ignored. Set the status to enabled to begin
+processing requests again:
 
 .. code-block:: console
 
    set_channel_status {vm_name} {list}|all enabled|disabled
 
-Print to the CLI information on the specified VM. The information lists
-the number of vCPUs, the pinning to pCPU(s) as a bit mask, along with
-any communication channels associated with each VM, and the status of
-each channel:
+Enable or disable a VM's ability to query physical core information from the
+host:
 
 .. code-block:: console
 
-   show_vm {vm_name}
+   set_query {vm_name} enable|disable
 
-Set the binding of a virtual CPU on a VM with name ``{vm_name}`` to the
-physical CPU mask:
+Print to the CLI the information on the specified VM. The information lists
+the number of vCPUs, the pinning to pCPU(s) as a bit mask, along with any
+communication channels associated with each VM, and the status of each
+channel:
 
 .. code-block:: console
 
-   set_pcpu_mask {vm_name} {vcpu} {pcpu}
-
-Set the binding of the virtual CPU on the VM to the physical CPU:
-
-  .. code-block:: console
-
-   set_pcpu {vm_name} {vcpu} {pcpu}
-
-It is also possible to perform manual control and inspection in relation
-to CPU frequency scaling.
+   show_vm {vm_name}
 
-Get the current frequency for each core specified in the mask:
+Set the binding of a virtual CPU on the VM with name ``{vm_name}`` to a
+physical CPU:
 
 .. code-block:: console
 
-   show_cpu_freq_mask {mask}
-
-Set the current frequency for the cores specified in ``{core_mask}`` by
-scaling each up/down/min/max:
-
-.. code-block:: console
+   set_pcpu {vm_name} {vcpu} {pcpu}
 
-   set_cpu_freq {core_mask} up|down|min|max
+Manual control and inspection of host CPU frequency scaling is also available.
 
 Get the current frequency for the specified core:
 
@@ -392,11 +347,12 @@ Get the current frequency for the specified core:
 
    show_cpu_freq {core_num}
 
-Set the current frequency for the specified core by scaling up/down/min/max:
+Set the current frequency for the specified core by scaling it up, down, to
+min or to max:
 
 .. code-block:: console
 
-   set_cpu_freq {core_num} up|down|min|max
+   set_cpu_freq {core_num} up|down|min|max|enable_turbo|disable_turbo
 
 .. _enabling_out_of_band:
 
@@ -404,7 +360,7 @@ Command Line Options for Enabling Out-of-band Branch Ratio Monitoring
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 There are a couple of command line parameters for enabling the out-of-band
-monitoring of branch ratios on cores doing busy polling using PMDs as
+monitoring of branch ratios on cores doing busy polling using PMDs,
 described below:
 
 ``--core-branch-ratio {list of cores}:{branch ratio for listed cores}``
@@ -415,15 +371,14 @@ described below:
    causing the branch ratio to increase. When the ratio goes above
    the ratio threshold, the core frequency scales up to the maximum
    allowed value. The specified branch-ratio is a floating point number
-   that identifies the threshold at which to scale up or down for the
-   elements of the core-list. If not included the default branch ratio of
-   0.01 but will need adjustment for different workloads
+   that identifies the threshold at which to scale up or down. If not
+   included, the default branch ratio of 0.01 is used, but this may need
+   adjustment for different workloads.
 
    This parameter can be used multiple times for different sets of cores.
    The branch ratio mechanism can also be useful for non-PMD cores and
    hyper-threaded environments where C-States are disabled.
 
-
 Compiling and Running the Guest Applications
 --------------------------------------------
 
@@ -479,7 +434,7 @@ correct directory using the following find command:
    /usr/lib/i386-linux-gnu/pkgconfig
    /usr/lib/x86_64-linux-gnu/pkgconfig
 
-Then use:
+Then, use:
 
 .. code-block:: console
 
@@ -683,7 +638,7 @@ The following is an example JSON string for a power management request.
 
 To query the available frequencies of an lcore, use the query_cpu_freq command.
 Where {core_num} is the lcore to query.
-Before using this command, please enable responses via the set_query command on the host.
+Before using this command, enable responses via the set_query command on the host:
 
 .. code-block:: console
 
@@ -851,7 +806,6 @@ policy_type
 
 Description
    The type of policy to apply.
-   See the ``--policy`` option description for more information.
 Type
    string
 Values
@@ -864,9 +818,9 @@ Values
    - WORKLOAD: Determine how heavily loaded the cores are
      and scale up and down accordingly.
    - BRANCH_RATIO: An out-of-band policy that looks at the ratio
-     between branch hits and misses on a core
-     and uses that information to determine how much packet processing
-     a core is doing.
+     between branch hits and misses on a core and uses that information
+     to determine how much packet processing a core is doing.
+
 
 Required
    For ``CREATE`` and ``DESTROY`` policy requests only.
diff --git a/doc/guides/sample_app_ug/vmdq_dcb_forwarding.rst b/doc/guides/sample_app_ug/vmdq_dcb_forwarding.rst
index efb133c11c..98a234fe15 100644
--- a/doc/guides/sample_app_ug/vmdq_dcb_forwarding.rst
+++ b/doc/guides/sample_app_ug/vmdq_dcb_forwarding.rst
@@ -4,37 +4,45 @@
 VMDQ and DCB Forwarding Sample Application
 ==========================================
 
-The VMDQ and DCB Forwarding sample application is a simple example of packet processing using the DPDK.
-The application performs L2 forwarding using VMDQ and DCB to divide the incoming traffic into queues.
-The traffic splitting is performed in hardware by the VMDQ and DCB features of the Intel® 82599 and X710/XL710 Ethernet Controllers.
+The VMDQ and DCB Forwarding sample application demonstrates L2 forwarding packet processing
+using VMDQ and DCB.
+It divides the incoming traffic into queues, which is performed in hardware
+by the VMDQ and DCB features of the Intel 82599 and X710/XL710 Ethernet Controllers.
 
 Overview
 --------
 
-This sample application can be used as a starting point for developing a new application that is based on the DPDK and
-uses VMDQ and DCB for traffic partitioning.
+This sample application can be used as a starting point for developing a new application
+that is based on the DPDK and uses VMDQ and DCB for traffic partitioning.
 
-The VMDQ and DCB filters work on MAC and VLAN traffic to divide the traffic into input queues on the basis of the Destination MAC
-address, VLAN ID and VLAN user priority fields.
-VMDQ filters split the traffic into 16 or 32 groups based on the Destination MAC and VLAN ID.
-Then, DCB places each packet into one of queues within that group, based upon the VLAN user priority field.
+The VMDQ and DCB filters work on MAC and VLAN traffic to divide the traffic into input queues
+on the basis of the Destination MAC address, VLAN ID and VLAN user priority fields.
 
-All traffic is read from a single incoming port (port 0) and output on port 1, without any processing being performed.
-With Intel® 82599 NIC, for example, the traffic is split into 128 queues on input, where each thread of the application reads from
-multiple queues. When run with 8 threads, that is, with the -c FF option, each thread receives and forwards packets from 16 queues.
+VMDQ filters
+   Split the traffic into 16 or 32 groups based on the Destination MAC and VLAN ID.
+
+DCB (Data Center Bridging)
+   Place each packet into one of the queues within that group, based upon the VLAN user priority field.
+
+All traffic is read from a single incoming port (port 0) and output on port 1.
+No packet processing is performed.
+
+Using Intel® 82599 NIC, the traffic is split into 128 queues on input, where each thread of the application reads from
+multiple queues. When run with 8 threads (with the -c FF option), each thread receives and forwards packets from 16 queues.
 
 As supplied, the sample application configures the VMDQ feature to have 32 pools with 4 queues each as indicated in :numref:`figure_vmdq_dcb_example`.
-The Intel® 82599 10 Gigabit Ethernet Controller NIC also supports the splitting of traffic into 16 pools of 8 queues. While the
-Intel® X710 or XL710 Ethernet Controller NICs support many configurations of VMDQ pools of 4 or 8 queues each. For simplicity, only 16
-or 32 pools is supported in this sample. And queues numbers for each VMDQ pool can be changed by setting RTE_LIBRTE_I40E_QUEUE_NUM_PER_VM
-in config/rte_config.h file.
-The nb-pools, nb-tcs and enable-rss parameters can be passed on the command line, after the EAL parameters:
+The Intel 82599 10 Gigabit Ethernet Controller NIC also supports the splitting of traffic into 16 pools of 8 queues.
+The Intel X710 or XL710 Ethernet Controller NICs support many configurations of VMDQ pools of 4 or 8 queues each.
+
+For simplicity, only 16 or 32 pools are supported in this sample. Queues numbers for each VMDQ pool
+can be changed by setting RTE_LIBRTE_I40E_QUEUE_NUM_PER_VM in config/rte_config.h file.
+The nb-pools, nb-tcs and enable-rss parameters can be passed on the command line after the EAL parameters:
 
 .. code-block:: console
 
     ./<build_dir>/examples/dpdk-vmdq_dcb [EAL options] -- -p PORTMASK --nb-pools NP --nb-tcs TC --enable-rss
 
-where, NP can be 16 or 32, TC can be 4 or 8, rss is disabled by default.
+where NP can be 16 or 32, TC can be 4 or 8, and RSS is disabled by default.
 
 .. _figure_vmdq_dcb_example:
 
@@ -43,10 +51,10 @@ where, NP can be 16 or 32, TC can be 4 or 8, rss is disabled by default.
    Packet Flow Through the VMDQ and DCB Sample Application
 
 
-In Linux* user space, the application can display statistics with the number of packets received on each queue.
-To have the application display the statistics, send a SIGHUP signal to the running application process.
+In Linux user space, the application can display statistics with the number of packets received on each queue.
+To have the application display statistics, send a SIGHUP signal to the running application process.
 
-The VMDQ and DCB Forwarding sample application is in many ways simpler than the L2 Forwarding application
+This sample application is in many ways simpler than the L2 Forwarding application
 (see :doc:`l2_forward_real_virtual`)
 as it performs unidirectional L2 forwarding of packets from one port to a second port.
 No command-line options are taken by this application apart from the standard EAL command-line options.
@@ -54,25 +62,23 @@ No command-line options are taken by this application apart from the standard EA
 .. note::
 
     Since VMD queues are being used for VMM, this application works correctly
-    when VTd is disabled in the BIOS or Linux* kernel (intel_iommu=off).
+    when VTd is disabled in the BIOS or Linux kernel (intel_iommu=off).
 
 Compiling the Application
 -------------------------
 
-
-
-To compile the sample application see :doc:`compiling`.
+To compile the sample application, see :doc:`compiling`.
 
 The application is located in the ``vmdq_dcb`` sub-directory.
 
 Running the Application
 -----------------------
 
-To run the example in a linux environment:
+To run the example in a Linux environment:
 
 .. code-block:: console
 
-    user@target:~$ ./<build_dir>/examples/dpdk-vmdq_dcb -l 0-3 -- -p 0x3 --nb-pools 32 --nb-tcs 4
+    user@host:~$ ./<build_dir>/examples/dpdk-vmdq_dcb -l 0-3 -- -p 0x3 --nb-pools 32 --nb-tcs 4
 
 Refer to the *DPDK Getting Started Guide* for general information on running applications and
 the Environment Abstraction Layer (EAL) options.
@@ -80,20 +86,20 @@ the Environment Abstraction Layer (EAL) options.
 Explanation
 -----------
 
-The following sections provide some explanation of the code.
+The following sections provide explanation of the code.
 
 Initialization
 ~~~~~~~~~~~~~~
 
 The EAL, driver and PCI configuration is performed largely as in the L2 Forwarding sample application,
 as is the creation of the mbuf pool.
-See :doc:`l2_forward_real_virtual`.
-Where this example application differs is in the configuration of the NIC port for RX.
+
+See :doc:`l2_forward_real_virtual`. This example application differs in the configuration of the NIC port for Rx.
 
 The VMDQ and DCB hardware feature is configured at port initialization time by setting the appropriate values in the
 rte_eth_conf structure passed to the rte_eth_dev_configure() API.
-Initially in the application,
-a default structure is provided for VMDQ and DCB configuration to be filled in later by the application.
+
+Initially in the application, a default structure is provided for VMDQ and DCB configuration to be filled in later by the application.
 
 .. literalinclude:: ../../../examples/vmdq_dcb/main.c
     :language: c
@@ -101,18 +107,21 @@ a default structure is provided for VMDQ and DCB configuration to be filled in l
     :end-before: >8 End of empty vmdq+dcb configuration structure.
 
 The get_eth_conf() function fills in an rte_eth_conf structure with the appropriate values,
-based on the global vlan_tags array,
-and dividing up the possible user priority values equally among the individual queues
-(also referred to as traffic classes) within each pool. With Intel® 82599 NIC,
-if the number of pools is 32, then the user priority fields are allocated 2 to a queue.
+based on the global vlan_tags array, and divides up the possible user priority values equally
+among the individual queues (also referred to as traffic classes) within each pool.
+
+With Intel 82599 NIC, if the number of pools is 32, then the user priority fields are allocated 2 to a queue.
 If 16 pools are used, then each of the 8 user priority fields is allocated to its own queue within the pool.
-With Intel® X710/XL710 NICs, if number of tcs is 4, and number of queues in pool is 8,
-then the user priority fields are allocated 2 to one tc, and a tc has 2 queues mapping to it, then
-RSS will determine the destination queue in 2.
-For the VLAN IDs, each one can be allocated to possibly multiple pools of queues,
-so the pools parameter in the rte_eth_vmdq_dcb_conf structure is specified as a bitmask value.
+
+With Intel X710/XL710 NICs, if the number of tcs is 4, and number of queues in pool is 8,
+then the user priority fields are allocated 2 to one tc.
+
+If the tc has 2 queues mapping to it, then RSS will determine the destination queue in 2.
+For the VLAN IDs, each one can be allocated to multiple pools of queues,
+so the ``pools`` parameter in the ``rte_eth_vmdq_dcb_conf`` structure is specified as a bitmask value.
+
 For destination MAC, each VMDQ pool will be assigned with a MAC address. In this sample, each VMDQ pool
-is assigned to the MAC like 52:54:00:12:<port_id>:<pool_id>, that is,
+is assigned to the MAC like 52:54:00:12:<port_id>:<pool_id>, and
 the MAC of VMDQ pool 2 on port 1 is 52:54:00:12:01:02.
 
 .. literalinclude:: ../../../examples/vmdq_dcb/main.c
@@ -134,8 +143,8 @@ See :doc:`l2_forward_real_virtual` for more information.
 Statistics Display
 ~~~~~~~~~~~~~~~~~~
 
-When run in a linux environment,
-the VMDQ and DCB Forwarding sample application can display statistics showing the number of packets read from each RX queue.
+When run in a Linux environment, the VMDQ and DCB Forwarding sample application can display
+statistics showing the number of packets read from each Rx queue.
 This is provided by way of a signal handler for the SIGHUP signal,
 which simply prints to standard output the packet counts in grid form.
 Each row of the output is a single pool with the columns being the queue number within that pool.
@@ -144,7 +153,7 @@ To generate the statistics output, use the following command:
 
 .. code-block:: console
 
-    user@host$ sudo killall -HUP vmdq_dcb_app
+    user@host$ sudo killall -HUP dpdk-vmdq_dcb
 
-Please note that the statistics output will appear on the terminal where the vmdq_dcb_app is running,
+Please note that the statistics output will appear on the terminal where the dpdk-vmdq_dcb is running,
 rather than the terminal from which the HUP signal was sent.
diff --git a/doc/guides/sample_app_ug/vmdq_forwarding.rst b/doc/guides/sample_app_ug/vmdq_forwarding.rst
index c998a5a223..3124483f79 100644
--- a/doc/guides/sample_app_ug/vmdq_forwarding.rst
+++ b/doc/guides/sample_app_ug/vmdq_forwarding.rst
@@ -2,9 +2,9 @@
     Copyright(c) 2020 Intel Corporation.
 
 VMDq Forwarding Sample Application
-==========================================
+==================================
 
-The VMDq Forwarding sample application is a simple example of packet processing using the DPDK.
+The VMDq Forwarding sample application is an example of packet processing using the DPDK.
 The application performs L2 forwarding using VMDq to divide the incoming traffic into queues.
 The traffic splitting is performed in hardware by the VMDq feature of the Intel® 82599 and X710/XL710 Ethernet Controllers.
 
@@ -14,28 +14,28 @@ Overview
 This sample application can be used as a starting point for developing a new application that is based on the DPDK and
 uses VMDq for traffic partitioning.
 
-VMDq filters split the incoming packets up into different "pools" - each with its own set of RX queues - based upon
+VMDq filtering splits incoming packets into different "pools" (each with its own set of Rx queues) based on
 the MAC address and VLAN ID within the VLAN tag of the packet.
 
 All traffic is read from a single incoming port and output on another port, without any processing being performed.
-With Intel® 82599 NIC, for example, the traffic is split into 128 queues on input, where each thread of the application reads from
-multiple queues. When run with 8 threads, that is, with the -c FF option, each thread receives and forwards packets from 16 queues.
+With the Intel® 82599 NIC, for example, traffic is split into 128 queues on input, where each thread of the application reads from
+multiple queues. When run with 8 threads with the -c FF option, each thread receives and forwards packets from 16 queues.
 
-As supplied, the sample application configures the VMDq feature to have 32 pools with 4 queues each.
+As supplied, the sample application configures the VMDq feature to have 32 pools, with 4 queues each.
 The Intel® 82599 10 Gigabit Ethernet Controller NIC also supports the splitting of traffic into 16 pools of 2 queues.
 While the Intel® X710 or XL710 Ethernet Controller NICs support many configurations of VMDq pools of 4 or 8 queues each.
 And queues numbers for each VMDq pool can be changed by setting RTE_LIBRTE_I40E_QUEUE_NUM_PER_VM
 in config/rte_config.h file.
-The nb-pools and enable-rss parameters can be passed on the command line, after the EAL parameters:
+The ``--nb-pools`` and ``--enable-rss`` parameters can be passed on the command line, after the EAL parameters:
 
 .. code-block:: console
 
     ./<build_dir>/examples/dpdk-vmdq [EAL options] -- -p PORTMASK --nb-pools NP --enable-rss
 
-where, NP can be 8, 16 or 32, rss is disabled by default.
+where NP can be 8, 16 or 32. RSS is disabled by default.
 
-In Linux* user space, the application can display statistics with the number of packets received on each queue.
-To have the application display the statistics, send a SIGHUP signal to the running application process.
+In a Linux user space, the application can display statistics on standard output showing the number of packets received on each queue.
+To have the application display statistics, send a SIGHUP signal to the running application process.
 
 The VMDq Forwarding sample application is in many ways simpler than the L2 Forwarding application
 (see :doc:`l2_forward_real_virtual`)
@@ -45,7 +45,7 @@ No command-line options are taken by this application apart from the standard EA
 Compiling the Application
 -------------------------
 
-To compile the sample application see :doc:`compiling`.
+To compile the sample application, see :doc:`compiling`.
 
 The application is located in the ``vmdq`` sub-directory.
 
@@ -64,20 +64,18 @@ the Environment Abstraction Layer (EAL) options.
 Explanation
 -----------
 
-The following sections provide some explanation of the code.
+The following sections provide explanation of the code.
 
 Initialization
 ~~~~~~~~~~~~~~
 
 The EAL, driver and PCI configuration is performed largely as in the L2 Forwarding sample application,
-as is the creation of the mbuf pool.
-See :doc:`l2_forward_real_virtual`.
-Where this example application differs is in the configuration of the NIC port for RX.
+as is the creation of the mbuf pool. See :doc:`l2_forward_real_virtual`.
+This example application differs in the configuration of the NIC port for Rx.
 
 The VMDq hardware feature is configured at port initialization time by setting the appropriate values in the
 rte_eth_conf structure passed to the rte_eth_dev_configure() API.
-Initially in the application,
-a default structure is provided for VMDq configuration to be filled in later by the application.
+Initially in the application, a default structure is provided for VMDq configuration to be filled in later by the application.
 
 .. literalinclude:: ../../../examples/vmdq/main.c
     :language: c
@@ -88,8 +86,8 @@ The get_eth_conf() function fills in an rte_eth_conf structure with the appropri
 based on the global vlan_tags array.
 For the VLAN IDs, each one can be allocated to possibly multiple pools of queues.
 For destination MAC, each VMDq pool will be assigned with a MAC address. In this sample, each VMDq pool
-is assigned to the MAC like 52:54:00:12:<port_id>:<pool_id>, that is,
-the MAC of VMDq pool 2 on port 1 is 52:54:00:12:01:02.
+is assigned to the MAC like 52:54:00:12:<port_id>:<pool_id>.
+The MAC of VMDq pool 2 on port 1 is 52:54:00:12:01:02.
 
 .. literalinclude:: ../../../examples/vmdq/main.c
     :language: c
@@ -126,5 +124,5 @@ To generate the statistics output, use the following command:
 
     user@host$ sudo killall -HUP vmdq_app
 
-Please note that the statistics output will appear on the terminal where the vmdq_app is running,
+Please note that the statistics output will appear on the terminal where the application is running,
 rather than the terminal from which the HUP signal was sent.
-- 
2.53.0


^ permalink raw reply related

* [PATCH 13/15] doc: enhance skeleton, pipeline, timer, and vhost guides
From: Stephen Hemminger @ 2026-06-11 21:18 UTC (permalink / raw)
  To: dev
  Cc: Stephen Hemminger, Bruce Richardson, John McNamara,
	Cristian Dumitrescu, Erik Gabriel Carrillo, Maxime Coquelin,
	Chenbo Xia
In-Reply-To: <20260611212119.1026721-1-stephen@networkplumber.org>

Improved documentation for core sample applications:

skeleton.rst:
- Simplified basic forwarding example descriptions
- Fixed formatting consistency

test_pipeline.rst:
- Enhanced pipeline testing methodology descriptions
- Improved command-line parameter documentation

timer.rst:
- Clarified timer management mechanisms
- Improved periodic timer explanations

vdpa.rst:
- Enhanced vDPA device configuration descriptions
- Improved live migration support explanations

vhost.rst:
- Restructured vhost-user implementation sections
- Improved virtio device emulation descriptions
- Enhanced performance tuning recommendations

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 doc/guides/sample_app_ug/skeleton.rst      |   8 +-
 doc/guides/sample_app_ug/test_pipeline.rst |  17 +-
 doc/guides/sample_app_ug/timer.rst         |  19 +--
 doc/guides/sample_app_ug/vdpa.rst          |  51 +++---
 doc/guides/sample_app_ug/vhost.rst         | 178 +++++++++++----------
 5 files changed, 143 insertions(+), 130 deletions(-)

diff --git a/doc/guides/sample_app_ug/skeleton.rst b/doc/guides/sample_app_ug/skeleton.rst
index 5f7f18dbc2..b1cd4e12f5 100644
--- a/doc/guides/sample_app_ug/skeleton.rst
+++ b/doc/guides/sample_app_ug/skeleton.rst
@@ -73,7 +73,7 @@ Mbufs are the packet buffer structure used by DPDK. They are explained in
 detail in the "Mbuf Library" section of the *DPDK Programmer's Guide*.
 
 The ``main()`` function also initializes all the ports using the user defined
-``port_init()`` function which is explained in the next section:
+``port_init()`` function, which is explained in the next section:
 
 .. literalinclude:: ../../../examples/skeleton/basicfwd.c
     :language: c
@@ -129,10 +129,10 @@ Finally, the Rx port is set in promiscuous mode:
         :dedent: 1
 
 
-The Lcores Main
+The Lcore Main
 ~~~~~~~~~~~~~~~
 
-As we saw above, the ``main()`` function calls an application function
+As shown above, the ``main()`` function calls an application function
 on the available lcores.
 For the basic forwarding application, the lcore function
 looks like the following:
@@ -152,7 +152,7 @@ The main work of the application is done within the loop:
 
 Packets are received in bursts on the RX ports and transmitted in bursts on
 the TX ports. The ports are grouped in pairs with a simple mapping scheme
-using the an XOR on the port number::
+using XOR on the port number::
 
     0 -> 1
     1 -> 0
diff --git a/doc/guides/sample_app_ug/test_pipeline.rst b/doc/guides/sample_app_ug/test_pipeline.rst
index 818be93cd6..c18326105d 100644
--- a/doc/guides/sample_app_ug/test_pipeline.rst
+++ b/doc/guides/sample_app_ug/test_pipeline.rst
@@ -30,7 +30,7 @@ The application uses three CPU cores:
 
 Compiling the Application
 -------------------------
-To compile the sample application see :doc:`compiling`
+To compile the sample application, see :doc:`compiling`
 
 The application is located in the ``dpdk/<build_dir>/app`` directory.
 
@@ -95,7 +95,7 @@ For hash tables, the following parameters can be selected:
    |       |                        |                                                          | [4-byte index, 4 bytes of 0]                          |
    |       |                        |                                                          |                                                       |
    |       |                        |                                                          | The action configured for all table entries is        |
-   |       |                        |                                                          | "Sendto output port", with the output port index      |
+   |       |                        |                                                          | "send to output port", with the output port index     |
    |       |                        |                                                          | uniformly distributed for the range of output ports.  |
    |       |                        |                                                          |                                                       |
    |       |                        |                                                          | The default table rule (used in the case of a lookup  |
@@ -118,7 +118,7 @@ For hash tables, the following parameters can be selected:
    |       |                        |                                                          | [4-byte index, 12 bytes of 0]                         |
    |       |                        |                                                          |                                                       |
    |       |                        |                                                          | The action configured for all table entries is        |
-   |       |                        |                                                          | "Send to output port", with the output port index     |
+   |       |                        |                                                          | "send to output port", with the output port index     |
    |       |                        |                                                          | uniformly distributed for the range of output ports.  |
    |       |                        |                                                          |                                                       |
    |       |                        |                                                          | The default table rule (used in the case of a lookup  |
@@ -141,7 +141,7 @@ For hash tables, the following parameters can be selected:
    |       |                        |                                                          | [4-byte index, 28 bytes of 0].                        |
    |       |                        |                                                          |                                                       |
    |       |                        |                                                          | The action configured for all table entries is        |
-   |       |                        |                                                          | "Send to output port", with the output port index     |
+   |       |                        |                                                          | "send to output port", with the output port index     |
    |       |                        |                                                          | uniformly distributed for the range of output ports.  |
    |       |                        |                                                          |                                                       |
    |       |                        |                                                          | The default table rule (used in the case of a lookup  |
@@ -149,7 +149,7 @@ For hash tables, the following parameters can be selected:
    |       |                        |                                                          |                                                       |
    |       |                        |                                                          | At run time, core A is creating the following lookup  |
    |       |                        |                                                          | key and storing it into the packet meta data for      |
-   |       |                        |                                                          | Lpmcore B to use for table lookup:                    |
+   |       |                        |                                                          | core B to use for table lookup:                       |
    |       |                        |                                                          |                                                       |
    |       |                        |                                                          | [destination IPv4 address, 28 bytes of 0]             |
    |       |                        |                                                          |                                                       |
@@ -177,8 +177,8 @@ For hash tables, the following parameters can be selected:
    |       |                        |                                                          | [0.192.0.0/10 => send to output port 3]               |
    |       |                        |                                                          |                                                       |
    |       |                        |                                                          | The default table rule (used in the case of a lookup  |
-   |       |                        |                                                          | miss) is to drop the packet.                          |
    |       |                        |                                                          |                                                       |
+   |       |                        |                                                          | miss) is to drop the packet.                          |
    |       |                        |                                                          | At run time, core A is storing the IPv4 destination   |
    |       |                        |                                                          | within the packet meta data to be later used by core  |
    |       |                        |                                                          | B as the lookup key.                                  |
@@ -201,11 +201,9 @@ For hash tables, the following parameters can be selected:
    |       |                        |                                                          |                                                       |
    |       |                        |                                                          | => send to output port 0]                             |
    |       |                        |                                                          |                                                       |
-   |       |                        |                                                          |                                                       |
    |       |                        |                                                          | [priority = 0 (highest),                              |
    |       |                        |                                                          |                                                       |
    |       |                        |                                                          | IPv4 source = ANY,                                    |
-   |       |                        |                                                          |                                                       |
    |       |                        |                                                          | IPv4 destination = 0.128.0.0/9,                       |
    |       |                        |                                                          |                                                       |
    |       |                        |                                                          | L4 protocol = ANY,                                    |
@@ -214,8 +212,7 @@ For hash tables, the following parameters can be selected:
    |       |                        |                                                          |                                                       |
    |       |                        |                                                          | TCP destination port = ANY                            |
    |       |                        |                                                          |                                                       |
-   |       |                        |                                                          | => send to output port 0].                            |
-   |       |                        |                                                          |                                                       |
+   |       |                        |                                                          | => send to output port 1]                             |
    |       |                        |                                                          |                                                       |
    |       |                        |                                                          | The default table rule (used in the case of a lookup  |
    |       |                        |                                                          | miss) is to drop the packet.                          |
diff --git a/doc/guides/sample_app_ug/timer.rst b/doc/guides/sample_app_ug/timer.rst
index 7af35d3d67..e6b821d7fd 100644
--- a/doc/guides/sample_app_ug/timer.rst
+++ b/doc/guides/sample_app_ug/timer.rst
@@ -4,20 +4,23 @@
 Timer Sample Application
 ========================
 
-The Timer sample application is a simple application that demonstrates the use of a timer in a DPDK application.
-This application prints some messages from different lcores regularly, demonstrating the use of timers.
+Overview
+--------
+
+The Timer sample application demonstrates the use of a timer in a DPDK application.
+This application prints messages from different lcores regularly using timers.
 
 Compiling the Application
 -------------------------
 
-To compile the sample application see :doc:`compiling`.
+To compile the sample application, see :doc:`compiling`.
 
 The application is located in the ``timer`` sub-directory.
 
 Running the Application
 -----------------------
 
-To run the example in linux environment:
+To run the example in a Linux environment:
 
 .. code-block:: console
 
@@ -29,8 +32,6 @@ the Environment Abstraction Layer (EAL) options.
 Explanation
 -----------
 
-The following sections provide some explanation of the code.
-
 Initialization and Main Loop
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -44,7 +45,7 @@ In addition to EAL initialization, the timer subsystem must be initialized, by c
 
 After timer creation (see the next paragraph), the main loop is
 executed on each worker lcore using the well-known
-rte_eal_remote_launch() and also on the main.
+rte_eal_remote_launch() and also on the main lcore.
 
 .. literalinclude:: ../../../examples/timer/main.c
     :language: c
@@ -76,13 +77,13 @@ This call to rte_timer_init() is necessary before doing any other operation on t
     :end-before: >8 End of init timer structures.
     :dedent: 1
 
-Then, the two timers are configured:
+Next, the two timers are configured:
 
 *   The first timer (timer0) is loaded on the main lcore and expires every second.
     Since the PERIODICAL flag is provided, the timer is reloaded automatically by the timer subsystem.
     The callback function is timer0_cb().
 
-*   The second timer (timer1) is loaded on the next available lcore every 333 ms.
+*   The second timer (timer1) is loaded on a next available lcore every 333 ms.
     The SINGLE flag means that the timer expires only once and must be reloaded manually if required.
     The callback function is timer1_cb().
 
diff --git a/doc/guides/sample_app_ug/vdpa.rst b/doc/guides/sample_app_ug/vdpa.rst
index 873efbf7c7..ed4c9676d6 100644
--- a/doc/guides/sample_app_ug/vdpa.rst
+++ b/doc/guides/sample_app_ug/vdpa.rst
@@ -4,27 +4,30 @@
 Vdpa Sample Application
 =======================
 
-The vdpa sample application creates vhost-user sockets by using the
-vDPA backend. vDPA stands for vhost Data Path Acceleration which utilizes
-virtio ring compatible devices to serve virtio driver directly to enable
-datapath acceleration. As vDPA driver can help to set up vhost datapath,
-this application doesn't need to launch dedicated worker threads for vhost
+Overview
+--------
+
+The vDPA sample application creates vhost-user sockets by using the
+vDPA backend. vDPA (vhost Data Path Acceleration) utilizes
+virtio ring-compatible devices to serve a virtio driver directly to enable
+datapath acceleration. A vDPA driver can help to set up the vhost datapath.
+This application does not need to launch dedicated worker threads for vhost
 enqueue/dequeue operations.
 
-Testing steps
--------------
-
-This section shows the steps of how to start VMs with vDPA vhost-user
+The following shows how to start VMs with vDPA vhost-user
 backend and verify network connection & live migration.
 
-Build
-~~~~~
+Compiling the Application
+-------------------------
 
-To compile the sample application see :doc:`compiling`.
+To compile the sample application, see :doc:`compiling`.
 
 The application is located in the ``vdpa`` sub-directory.
 
-Start the vdpa example
+Running the Application
+-----------------------
+
+Start the vDPA example
 ~~~~~~~~~~~~~~~~~~~~~~
 
 .. code-block:: console
@@ -40,7 +43,7 @@ where
   (n starts from 0).
 * --interactive means run the vDPA sample in interactive mode:
 
-  #. help: show help message
+  #. help: show the help message
 
   #. list: list all available vDPA devices
 
@@ -50,7 +53,7 @@ where
 
   #. quit: unregister vhost driver and exit the application
 
-Take IFCVF driver for example:
+Take IFCVF driver, for example:
 
 .. code-block:: console
 
@@ -62,10 +65,11 @@ Take IFCVF driver for example:
     Here 0000:06:00.3 and 0000:06:00.4 refer to virtio ring compatible devices,
     and we need to bind vfio-pci to them before running vdpa sample.
 
-    * modprobe vfio-pci
-    * ./usertools/dpdk-devbind.py -b vfio-pci 06:00.3 06:00.4
+    .. code-block:: console
 
-Then we can create 2 vdpa ports in interactive cmdline.
+        modprobe vfio-pci
+        ./usertools/dpdk-devbind.py -b vfio-pci 06:00.3 06:00.4
+Then, we can create 2 vdpa ports in interactive cmdline.
 
 .. code-block:: console
 
@@ -92,7 +96,7 @@ Start the VMs
        -netdev type=vhost-user,id=vdpa,chardev=char0 \
        -device virtio-net-pci,netdev=vdpa,mac=00:aa:bb:cc:dd:ee,page-per-vq=on \
 
-After the VMs launches, we can login the VMs and configure the ip, verify the
+After the VMs launch, we can log into the VMs and configure the IP, verify the
 network connection via ping or netperf.
 
 .. note::
@@ -100,11 +104,12 @@ network connection via ping or netperf.
 
 Live Migration
 ~~~~~~~~~~~~~~
-vDPA supports cross-backend live migration, user can migrate SW vhost backend
-VM to vDPA backend VM and vice versa. Here are the detailed steps. Assume A is
-the source host with SW vhost VM and B is the destination host with vDPA.
+vDPA supports cross-backend live migration. A user can migrate SW vhost backend
+VM to vDPA backend VM and vice versa. Here are the detailed steps.
+Assume A is the source host with SW vhost VM and B is the destination host with vDPA.
 
-#. Start vdpa sample and launch a VM with exact same parameters as the VM on A,
+#. On the destination host (B), start the vdpa sample application and launch a VM
+   with the exact same parameters as the VM on A,
    in migration-listen mode:
 
    .. code-block:: console
diff --git a/doc/guides/sample_app_ug/vhost.rst b/doc/guides/sample_app_ug/vhost.rst
index 4c944a844a..257985c430 100644
--- a/doc/guides/sample_app_ug/vhost.rst
+++ b/doc/guides/sample_app_ug/vhost.rst
@@ -4,36 +4,39 @@
 Vhost Sample Application
 ========================
 
-The vhost sample application demonstrates integration of the Data Plane
-Development Kit (DPDK) with the Linux* KVM hypervisor by implementing the
-vhost-net offload API. The sample application performs simple packet
-switching between virtual machines based on Media Access Control (MAC)
-address or Virtual Local Area Network (VLAN) tag. The splitting of Ethernet
-traffic from an external switch is performed in hardware by the Virtual
-Machine Device Queues (VMDQ) and Data Center Bridging (DCB) features of
-the Intel® 82599 10 Gigabit Ethernet Controller.
+Overview
+--------
+
+The vhost sample application demonstrates integration of DPDK with the
+Linux KVM hypervisor by implementing the vhost-user protocol. The
+application performs packet switching between virtual machines and
+physical network interfaces. Packets are switched based on Media Access
+Control (MAC) address or Virtual Local Area Network (VLAN) tag. When
+using supported NICs, the splitting of Ethernet traffic can be performed
+in hardware by Virtual Machine Device Queues (VMDQ) and Data Center
+Bridging (DCB) features.
 
 Testing steps
--------------
+~~~~~~~~~~~~~
 
-This section shows the steps how to test a typical PVP case with this
-dpdk-vhost sample, whereas packets are received from the physical NIC
+This section shows the steps to test a typical PVP (physical-virtual-physical)
+case with this dpdk-vhost sample. Packets are received from the physical NIC
 port first and enqueued to the VM's Rx queue. Through the guest testpmd's
 default forwarding mode (io forward), those packets will be put into
 the Tx queue. The dpdk-vhost example, in turn, gets the packets and
 puts back to the same physical NIC port.
 
-Build
-~~~~~
+Compiling the Application
+-------------------------
 
-To compile the sample application see :doc:`compiling`.
+To compile the sample application, see :doc:`compiling`.
 
 The application is located in the ``vhost`` sub-directory.
 
 .. note::
-   In this example, you need build DPDK both on the host and inside guest.
+   In this example, you need to build DPDK both on the host and inside the guest.
 
-. _vhost_app_run_vm:
+.. _vhost_app_run_vm:
 
 Start the VM
 ~~~~~~~~~~~~
@@ -51,8 +54,8 @@ Start the VM
 
 .. note::
     For basic vhost-user support, QEMU 2.2 (or above) is required. For
-    some specific features, a higher version might be need. Such as
-    QEMU 2.7 (or above) for the reconnect feature.
+    some specific features, a higher version might be needed. For example,
+    QEMU 2.7 or above is required for the reconnect feature.
 
 
 Start the vswitch example
@@ -64,132 +67,139 @@ Start the vswitch example
              -- --socket-file /tmp/sock0 --client \
              ...
 
-Check the `Parameters`_ section for the explanations on what do those
+Check the `Parameters`_ section for the explanations on what the
 parameters mean.
 
+Running the Application
+-----------------------
+
 .. _vhost_app_run_dpdk_inside_guest:
 
 Run testpmd inside guest
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
-Make sure you have DPDK built inside the guest. Also make sure the
-corresponding virtio-net PCI device is bond to a UIO driver, which
-could be done by:
+Ensure DPDK is built inside the guest and that the
+corresponding virtio-net PCI device is bound to a UIO driver, which
+can be done by:
 
 .. code-block:: console
 
    modprobe vfio-pci
    dpdk/usertools/dpdk-devbind.py -b vfio-pci 0000:00:04.0
 
-Then start testpmd for packet forwarding testing.
+Then, start testpmd for packet forwarding testing.
 
 .. code-block:: console
 
     ./<build_dir>/app/dpdk-testpmd -l 0-1 -- -i
     > start tx_first
 
-For more information about vIOMMU and NO-IOMMU and VFIO please refer to
-:doc:`/../linux_gsg/linux_drivers` section of the DPDK Getting started guide.
+For more information about vIOMMU, NO-IOMMU, and VFIO, refer to the
+:doc:`../linux_gsg/linux_drivers` section of the DPDK Getting started guide.
+
+Explanation
+-----------
 
 Inject packets
---------------
+~~~~~~~~~~~~~~
 
-While a virtio-net is connected to dpdk-vhost, a VLAN tag starts with
-1000 is assigned to it. So make sure configure your packet generator
-with the right MAC and VLAN tag, you should be able to see following
-log from the dpdk-vhost console. It means you get it work::
+When a virtio-net device is connected to dpdk-vhost, a VLAN tag starting
+from 1000 is assigned to it. Therefore, configure your packet generator
+with the correct MAC address and VLAN tag. You should see the following
+log from the dpdk-vhost console::
 
     VHOST_DATA: (0) mac 52:54:00:00:00:14 and vlan 1000 registered
 
 
+
 .. _vhost_app_parameters:
 
 Parameters
-----------
+~~~~~~~~~~
 
 **--socket-file path**
-Specifies the vhost-user socket file path.
+   Specifies the vhost-user socket file path.
 
 **--client**
-DPDK vhost-user will act as the client mode when such option is given.
-In the client mode, QEMU will create the socket file. Otherwise, DPDK
-will create it. Put simply, it's the server to create the socket file.
+   DPDK vhost-user will act in client mode when this option is given.
+   In client mode, QEMU creates the socket file. Otherwise, DPDK creates
+   it. In other words, the server creates the socket file.
 
 
 **--vm2vm mode**
-The vm2vm parameter sets the mode of packet switching between guests in
-the host.
+   Sets the mode of packet switching between guests on the host:
 
-- 0 disables vm2vm, implying that VM's packets will always go to the NIC port.
-- 1 means a normal mac lookup packet routing.
-- 2 means hardware mode packet forwarding between guests, it allows packets
-  go to the NIC port, hardware L2 switch will determine which guest the
-  packet should forward to or need send to external, which bases on the
-  packet destination MAC address and VLAN tag.
+   - 0 disables vm2vm, meaning VM packets always go to the NIC port.
+   - 1 enables normal MAC lookup packet routing.
+   - 2 enables hardware mode packet forwarding between guests. Packets can
+     go to the NIC port, and the hardware L2 switch determines which guest
+     receives the packet or whether to send it externally, based on the
+     destination MAC address and VLAN tag.
 
 **--mergeable 0|1**
-Set 0/1 to disable/enable the mergeable Rx feature. It's disabled by default.
+   Set to 0 or 1 to disable or enable the mergeable Rx feature. Disabled by default.
 
 **--stats interval**
-The stats parameter controls the printing of virtio-net device statistics.
-The parameter specifies an interval (in unit of seconds) to print statistics,
-with an interval of 0 seconds disabling statistics.
+   Controls printing of virtio-net device statistics. Specifies the interval
+   (in seconds) to print statistics. Setting the interval to 0 disables statistics.
 
 **--rx-retry 0|1**
-The rx-retry option enables/disables enqueue retries when the guests Rx queue
-is full. This feature resolves a packet loss that is observed at high data
-rates, by allowing it to delay and retry in the receive path. This option is
-enabled by default.
+   Enables or disables enqueue retries when the guest Rx queue is full. This
+   feature resolves packet loss observed at high data rates by allowing delays
+   and retries in the receive path. Enabled by default.
 
 **--rx-retry-num num**
-The rx-retry-num option specifies the number of retries on an Rx burst, it
-takes effect only when rx retry is enabled.  The default value is 4.
+   Specifies the number of retries on an Rx burst. Takes effect only when
+   rx-retry is enabled. Default value is 4.
 
 **--rx-retry-delay msec**
-The rx-retry-delay option specifies the timeout (in micro seconds) between
-retries on an RX burst, it takes effect only when rx retry is enabled. The
-default value is 15.
+   Specifies the timeout (in microseconds) between retries on an Rx burst.
+   Takes effect only when rx-retry is enabled. Default value is 15.
 
 **--builtin-net-driver**
-A very simple vhost-user net driver which demonstrates how to use the generic
-vhost APIs will be used when this option is given. It is disabled by default.
+   Uses a simple built-in vhost-user net driver that demonstrates how to use
+   the generic vhost APIs. Disabled by default.
 
 **--dmas**
-This parameter is used to specify the assigned DMA device of a vhost device.
-Async vhost-user net driver will be used if --dmas is set. For example
---dmas [txd0@00:04.0,txd1@00:04.1,rxd0@00:04.2,rxd1@00:04.3] means use
+   Specifies the assigned DMA device for a vhost device. The async vhost-user
+   net driver will be used if --dmas is set.
+
+   For example::
+
+      --dmas [txd0@00:04.0,txd1@00:04.1,rxd0@00:04.2,rxd1@00:04.3]
+
+   This means use
 DMA channel 00:04.0/00:04.2 for vhost device 0 enqueue/dequeue operation
 and use DMA channel 00:04.1/00:04.3 for vhost device 1 enqueue/dequeue
 operation. The index of the device corresponds to the socket file in order,
 that means vhost device 0 is created through the first socket file, vhost
 device 1 is created through the second socket file, and so on.
 
-**--total-num-mbufs 0-N**
-This parameter sets the number of mbufs to be allocated in mbuf pools,
-the default value is 147456. This is can be used if launch of a port fails
-due to shortage of mbufs.
+**--total-num-mbufs N**
+   Sets the number of mbufs to allocate in mbuf pools. Default value is 147456.
+   Use this option if port startup fails due to mbuf shortage.
 
 **--tso 0|1**
-Disables/enables TCP segment offload.
+   Disables or enables TCP segment offload.
 
 **--tx-csum 0|1**
-Disables/enables TX checksum offload.
+   Disables or enables TX checksum offload.
 
 **-p mask**
-Port mask which specifies the ports to be used
+   Port mask specifying which ports to use.
 
 Common Issues
--------------
+~~~~~~~~~~~~~
 
-* QEMU fails to allocate memory on hugetlbfs, with an error like the
+* QEMU fails to allocate memory on hugetlbfs and shows an error like the
   following::
 
       file_ram_alloc: can't mmap RAM pages: Cannot allocate memory
 
-  When running QEMU the above error indicates that it has failed to allocate
-  memory for the Virtual Machine on the hugetlbfs. This is typically due to
+  When running QEMU, the above error indicates that it has failed to allocate
+  memory for the virtual machine on hugetlbfs. This is typically due to
   insufficient hugepages being free to support the allocation request. The
-  number of free hugepages can be checked as follows:
+  number of free hugepages can be checked with:
 
   .. code-block:: console
 
@@ -200,23 +210,23 @@ Common Issues
 
 * Failed to build DPDK in VM
 
-  Make sure "-cpu host" QEMU option is given.
+  Make sure the "-cpu host" QEMU option is given.
 
-* Device start fails if NIC's max queues > the default number of 128
+* Device start fails if the NIC's max queues exceeds the default number of 128
 
-  mbuf pool size is dependent on the MAX_QUEUES configuration, if NIC's
-  max queue number is larger than 128, device start will fail due to
-  insufficient mbuf. This can be adjusted using ``--total-num-mbufs``
-  parameter.
+  The mbuf pool size is dependent on the MAX_QUEUES configuration.
+  If the NIC's maximum queue count is larger than 128,
+  then the device start will fail due to insufficient mbufs.
+  Adjust this using the ``--total-num-mbufs`` parameter.
 
 * Option "builtin-net-driver" is incompatible with QEMU
 
-  QEMU vhost net device start will fail if protocol feature is not negotiated.
-  DPDK virtio-user PMD can be the replacement of QEMU.
+  QEMU vhost-net device start will fail if the protocol feature is not negotiated.
+  The DPDK virtio-user PMD can replace QEMU.
 
 * Device start fails when enabling "builtin-net-driver" without memory
   pre-allocation
 
-  The builtin example doesn't support dynamic memory allocation. When vhost
-  backend enables "builtin-net-driver", "--numa-mem" option should be
-  added at virtio-user PMD side as a startup item.
+  The builtin example does not support dynamic memory allocation.
+  When vhost backend enables "builtin-net-driver", the "--numa-mem" option should
+  be added on the virtio-user PMD side as a startup parameter.
-- 
2.53.0


^ permalink raw reply related

* [PATCH 12/15] doc: improve QoS, callbacks, EFD, and service cores guides
From: Stephen Hemminger @ 2026-06-11 21:18 UTC (permalink / raw)
  To: dev
  Cc: Stephen Hemminger, Cristian Dumitrescu, Bruce Richardson,
	John McNamara, Yipeng Wang
In-Reply-To: <20260611212119.1026721-1-stephen@networkplumber.org>

Enhanced documentation for QoS and service applications:

qos_metering.rst:
- Improved traffic metering algorithm descriptions
- Fixed formatting consistency

qos_scheduler.rst:
- Enhanced hierarchical scheduler explanations
- Improved traffic class descriptions

rxtx_callbacks.rst:
- Clarified callback mechanism usage
- Fixed formatting in code examples

server_node_efd.rst:
- Minor formatting improvements

service_cores.rst:
- Restructured service core architecture descriptions
- Improved service mapping explanations
- Enhanced clarity of multi-service configurations

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 doc/guides/sample_app_ug/qos_metering.rst    |  9 +--
 doc/guides/sample_app_ug/qos_scheduler.rst   | 16 +++--
 doc/guides/sample_app_ug/rxtx_callbacks.rst  |  9 ++-
 doc/guides/sample_app_ug/server_node_efd.rst |  4 +-
 doc/guides/sample_app_ug/service_cores.rst   | 67 +++++++++++---------
 5 files changed, 56 insertions(+), 49 deletions(-)

diff --git a/doc/guides/sample_app_ug/qos_metering.rst b/doc/guides/sample_app_ug/qos_metering.rst
index e7101559aa..b317d4079d 100644
--- a/doc/guides/sample_app_ug/qos_metering.rst
+++ b/doc/guides/sample_app_ug/qos_metering.rst
@@ -4,7 +4,7 @@
 QoS Metering Sample Application
 ===============================
 
-The QoS meter sample application is an example that demonstrates the use of DPDK to provide QoS marking and metering,
+The QoS meter sample application demonstrates the use of DPDK to provide QoS marking and metering,
 as defined by RFC2697 for Single Rate Three Color Marker (srTCM) and RFC 2698 for Two Rate Three Color Marker (trTCM) algorithm.
 
 Overview
@@ -14,7 +14,8 @@ The application uses a single thread for reading the packets from the RX port,
 metering, marking them with the appropriate color (green, yellow or red) and writing them to the TX port.
 
 A policing scheme can be applied before writing the packets to the TX port by dropping or
-changing the color of the packet in a static manner depending on both the input and output colors of the packets that are processed by the meter.
+changing the color of the packet in a static manner. This would depend on both the input and output colors
+of the packets that are processed by the meter.
 
 The operation mode can be selected as compile time out of the following options:
 
@@ -126,9 +127,9 @@ There are four different actions:
 
 In this particular case:
 
-*   Every packet which input and output color are the same, keeps the same color.
+*   For every packet where the input and output color are the same, keep the same color.
 
-*   Every packet which color has improved is dropped (this particular case can't happen, so these values will not be used).
+*   For every packet where the color has improved is dropped (this particular case can't happen, so these values will not be used).
 
 *   For the rest of the cases, the color is changed to red.
 
diff --git a/doc/guides/sample_app_ug/qos_scheduler.rst b/doc/guides/sample_app_ug/qos_scheduler.rst
index cd33beecb0..645f134e7f 100644
--- a/doc/guides/sample_app_ug/qos_scheduler.rst
+++ b/doc/guides/sample_app_ug/qos_scheduler.rst
@@ -20,24 +20,26 @@ The architecture of the QoS scheduler application is shown in the following figu
 
 There are two flavors of the runtime execution for this application,
 with two or three threads per each packet flow configuration being used.
-The RX thread reads packets from the RX port,
+
+The RX thread reads packets from the RX port and
 classifies the packets based on the double VLAN (outer and inner) and
-the lower byte of the IP destination address and puts them into the ring queue.
+the lower byte of the IP destination address. It then puts them into the ring queue.
+
 The worker thread dequeues the packets from the ring and calls the QoS scheduler enqueue/dequeue functions.
 If a separate TX core is used, these are sent to the TX ring.
 Otherwise, they are sent directly to the TX port.
-The TX thread, if present, reads from the TX ring and write the packets to the TX port.
+The TX thread, if present, reads from the TX ring and writes the packets to the TX port.
 
 Compiling the Application
 -------------------------
 
-To compile the sample application see :doc:`compiling`.
+To compile the sample application, see :doc:`compiling`.
 
 The application is located in the ``qos_sched`` sub-directory.
 
     .. note::
 
-        This application is intended as a linux only.
+        This application is intended for Linux only.
 
 .. note::
 
@@ -63,7 +65,7 @@ Mandatory application parameters include:
 
 *   --pfc "RX PORT, TX PORT, RX LCORE, WT LCORE, TX CORE": Packet flow configuration.
     Multiple pfc entities can be configured in the command line,
-    having 4 or 5 items (if TX core defined or not).
+    with 4 or 5 items (depending on whether a TX core is defined).
 
 Optional application parameters include:
 
@@ -153,7 +155,7 @@ These are the commands that are currently working under the command line interfa
 
 All of these commands work the same way, averaging the number of packets throughout a specific subset of queues.
 
-Two parameters can be configured for this prior to calling any of these commands:
+Two parameters can be configured prior to calling any of these commands:
 
     *   qavg n X: n is the number of times that the calculation will take place.
         Bigger numbers provide higher accuracy. The default value is 10.
diff --git a/doc/guides/sample_app_ug/rxtx_callbacks.rst b/doc/guides/sample_app_ug/rxtx_callbacks.rst
index cd6512508b..7dfcab5d65 100644
--- a/doc/guides/sample_app_ug/rxtx_callbacks.rst
+++ b/doc/guides/sample_app_ug/rxtx_callbacks.rst
@@ -18,10 +18,9 @@ prior to transmission to calculate the elapsed time in CPU cycles.
 
 If hardware timestamping is supported by the NIC, the sample application will
 also display the average latency.
-The packet was timestamped in hardware
-on top of the latency since the packet was received and processed by the RX
-callback.
-
+This shows the additional hardware timestamping latency on top of the
+software latency measured since the packet was received and processed by
+the RX callback.
 
 Compiling the Application
 -------------------------
@@ -72,7 +71,7 @@ callbacks are added. This is explained in the next section:
 
 
 The Port Initialization Function
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 The main functional part of the port initialization is shown below with
 comments:
diff --git a/doc/guides/sample_app_ug/server_node_efd.rst b/doc/guides/sample_app_ug/server_node_efd.rst
index c90d74ae1c..d543148328 100644
--- a/doc/guides/sample_app_ug/server_node_efd.rst
+++ b/doc/guides/sample_app_ug/server_node_efd.rst
@@ -9,7 +9,7 @@ load balancer. For more information about the EFD Library, please refer to the
 DPDK programmer's guide.
 
 This sample application is a variant of the :doc:`multi_process`
-where a specific target node is specified for every and each flow
+where a specific target node is specified for each flow
 (not in a round-robin fashion as the original load balancing sample application).
 
 Overview
@@ -192,7 +192,7 @@ flow is not handled by the node.
     :start-after: Packets dequeued from the shared ring. 8<
     :end-before: >8 End of packets dequeuing.
 
-Finally, note that both processes updates statistics, such as transmitted, received
+Finally, note that both processes update statistics, such as transmitted, received
 and dropped packets, which are shown and refreshed by the server app.
 
 .. literalinclude:: ../../../examples/server_node_efd/efd_server/main.c
diff --git a/doc/guides/sample_app_ug/service_cores.rst b/doc/guides/sample_app_ug/service_cores.rst
index 307a6c5fbb..11b6ebde1e 100644
--- a/doc/guides/sample_app_ug/service_cores.rst
+++ b/doc/guides/sample_app_ug/service_cores.rst
@@ -4,31 +4,34 @@
 Service Cores Sample Application
 ================================
 
-The service cores sample application demonstrates the service cores capabilities
-of DPDK. The service cores infrastructure is part of the DPDK EAL, and allows
-any DPDK component to register a service. A service is a work item or task, that
+Overview
+--------
+
+This sample application demonstrates the service cores API of DPDK.
+The service cores infrastructure is part of the DPDK EAL and allows any
+DPDK component to register a service. A service is a work item or task that
 requires CPU time to perform its duty.
 
-This sample application registers 5 dummy services. These 5 services are used
-to show how the service_cores API can be used to orchestrate these services to
-run on different service lcores. This orchestration is done by calling the
-service cores APIs, however the sample application introduces a "profile"
-concept to contain the service mapping details. Note that the profile concept
-is application specific, and not a part of the service cores API.
+This sample application registers 5 dummy services to demonstrate how the
+service cores API can orchestrate these services to run on different service
+lcores. The orchestration is performed by calling the service cores APIs.
+The sample application introduces a "profile" concept to group service-to-core
+mapping configurations. Note that profiles are application-specific and not
+part of the service cores API itself.
 
 
 Compiling the Application
 -------------------------
 
-To compile the sample application see :doc:`compiling`.
+See :doc:`compiling`.
 
 The application is located in the ``service_cores`` sub-directory.
 
 Running the Application
 -----------------------
 
-To run the example, just execute the binary. Since the application dynamically
-adds service cores in the application code itself, there is no requirement to
+To run the example, execute the binary. Since the application dynamically
+adds service cores at runtime, there is no requirement to
 pass a service core-mask as an EAL argument at startup time.
 
 .. code-block:: console
@@ -39,19 +42,19 @@ pass a service core-mask as an EAL argument at startup time.
 Explanation
 -----------
 
-The following sections provide some explanation of code focusing on
-registering applications from an applications point of view, and modifying the
-service core counts and mappings at runtime.
+The following sections explain the application code, focusing on registering
+services from an application's perspective and modifying service core counts
+and mappings at runtime.
 
 
 Registering a Service
 ~~~~~~~~~~~~~~~~~~~~~
 
 The following code section shows how to register a service as an application.
-Note that the service component header must be included by the application in
-order to register services: ``rte_service_component.h``, in addition
-to the ordinary service cores header ``rte_service.h`` which provides
-the runtime functions to add, remove and remap service cores.
+Note: The service component header must be included by the application in
+order to register services: ``rte_service_component.h``. In addition, the
+service cores header ``rte_service.h`` provides the runtime functions to add,
+remove, and remap service cores.
 
 .. literalinclude:: ../../../examples/service_cores/main.c
     :language: c
@@ -63,9 +66,9 @@ the runtime functions to add, remove and remap service cores.
 Controlling A Service Core
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-This section demonstrates how to add a service core. The ``rte_service.h``
-header file provides the functions for dynamically adding and removing cores.
-The APIs to add and remove cores use lcore IDs similar to existing DPDK
+This section demonstrates how to add a service core and assign a service to it.
+The ``rte_service.h`` header file provides functions for dynamically adding
+and removing cores. These APIs use lcore IDs similar to existing DPDK
 functions.
 
 These are the functions to start a service core, and have it run a service:
@@ -79,18 +82,20 @@ These are the functions to start a service core, and have it run a service:
 Removing A Service Core
 ~~~~~~~~~~~~~~~~~~~~~~~
 
-To remove a service core, the steps are similar to adding but in reverse order.
-Note that it is not allowed to remove a service core if the service is running,
-and the service-core is the only core running that service (see documentation
+To remove a service core, perform the adding steps in reverse order.
+Note: Removing a service core is not allowed if a service is running and
+the service core is the only core running that service (see documentation
 for ``rte_service_lcore_stop`` function for details).
 
 
 Conclusion
 ~~~~~~~~~~
 
-The service cores infrastructure provides DPDK with two main features. The first
-is to abstract away hardware differences: the service core can CPU cycles to
-a software fallback implementation, allowing the application to be abstracted
-from the difference in HW / SW availability. The second feature is a flexible
-method of registering functions to be run, allowing the running of the
-functions to be scaled across multiple CPUs.
+The service cores infrastructure provides DPDK with two main features.
+
+First, it abstracts hardware differences: service cores can provide CPU cycles
+to software fallback implementations, allowing applications to be abstracted
+from hardware and software availability differences.
+
+Second, it provides a flexible method for registering functions to run,
+allowing function execution to scale across multiple CPUs.
-- 
2.53.0


^ permalink raw reply related

* [PATCH 11/15] doc: enhance multi-process, NTB, ordering, and PTP guides
From: Stephen Hemminger @ 2026-06-11 21:18 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger, Anatoly Burakov, Jingjing Wu, Volodymyr Fialko
In-Reply-To: <20260611212119.1026721-1-stephen@networkplumber.org>

Improved documentation for various sample applications:

multi_process.rst:
- Clarified primary/secondary process model
- Enhanced shared memory descriptions
- Improved command-line examples

ntb.rst:
- Minor formatting fix

packet_ordering.rst:
- Improved packet reordering algorithm descriptions
- Enhanced worker thread explanations
- Fixed formatting consistency

pipeline.rst:
- Clarified pipeline stage configurations
- Improved table action descriptions

ptpclient.rst:
- Enhanced PTP synchronization explanations
- Improved timestamp handling descriptions
- Fixed formatting in configuration examples

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 doc/guides/sample_app_ug/multi_process.rst   | 47 ++++++++++---------
 doc/guides/sample_app_ug/ntb.rst             |  2 +-
 doc/guides/sample_app_ug/packet_ordering.rst | 42 +++++++++--------
 doc/guides/sample_app_ug/pipeline.rst        | 26 ++++++-----
 doc/guides/sample_app_ug/ptpclient.rst       | 49 +++++++++++---------
 5 files changed, 89 insertions(+), 77 deletions(-)

diff --git a/doc/guides/sample_app_ug/multi_process.rst b/doc/guides/sample_app_ug/multi_process.rst
index 1bd858bfb5..e9e5809a92 100644
--- a/doc/guides/sample_app_ug/multi_process.rst
+++ b/doc/guides/sample_app_ug/multi_process.rst
@@ -42,9 +42,10 @@ passing at least two cores in the corelist:
 
     ./<build_dir>/examples/dpdk-simple_mp -l 0-1 --proc-type=primary
 
-For the first DPDK process run, the proc-type flag can be omitted or set to auto,
-since all DPDK processes will default to being a primary instance,
-meaning they have control over the hugepage shared memory regions.
+For the first DPDK process run, the proc-type flag can be omitted or set to auto
+since all DPDK processes will default to being a primary instance
+(meaning, they have control over the hugepage shared memory regions).
+
 The process should start successfully and display a command prompt as follows:
 
 .. code-block:: console
@@ -99,7 +100,7 @@ At any stage, either process can be terminated using the quit command.
     The secondary process can be stopped and restarted without affecting the primary process.
 
 How the Application Works
-^^^^^^^^^^^^^^^^^^^^^^^^^
+~~~~~~~~~~~~~~~~~~~~~~~~~
 
 This application uses two queues and a single memory pool created in the primary process.
 The secondary process then uses lookup functions to attach to these objects.
@@ -141,7 +142,7 @@ Each process writes outgoing packets to a different Tx queue on each port.
 Running the Application
 ^^^^^^^^^^^^^^^^^^^^^^^
 
-The first instance of the ``symmetric_mp`` process is the primary instance, with the EAL arguments:
+The first instance of the ``symmetric_mp`` process becomes the primary instance, with the following application arguments:
 
 ``-p <portmask>``
   The ``portmask`` is a hexadecimal bitmask of what ports on the system are to be used.
@@ -155,7 +156,7 @@ The first instance of the ``symmetric_mp`` process is the primary instance, with
 ``--proc-id <n>``
   Where ``n`` is a numeric value in the range ``0 <= n < N``
   (number of processes, specified above).
-  This identifies which ``symmetric_mp`` instance is being run,
+  This identifies which ``symmetric_mp`` instance is running,
   so that each process can read a unique receive queue on each network port.
 
 The secondary instance must be started with similar EAL parameters.
@@ -173,7 +174,7 @@ Example:
    In the above example, ``auto`` is used so the first instance becomes the primary process.
 
 How the Application Works
-^^^^^^^^^^^^^^^^^^^^^^^^^
+~~~~~~~~~~~~~~~~~~~~~~~~~
 
 The primary instance creates the memory pool and initializes the network ports.
 
@@ -183,8 +184,8 @@ The primary instance creates the memory pool and initializes the network ports.
         :end-before: >8 End of primary instance initialization.
         :dedent: 1
 
-The secondary instance gets the port information and exported by the primary process.
-The memory pool is accessed by doing a lookup for it by name:
+The secondary instance gets the port information exported by the primary process.
+The memory pool is accessed by looking it up by name:
 
 .. code-block:: c
 
@@ -198,7 +199,7 @@ Each process reads from each port using the queue corresponding to its proc-id p
 and writes to the corresponding transmit queue on the output port.
 
 Client-Server Multi-process Example
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-----------------------------------
 
 The example multi-process application demonstrates a client-server type multi-process design.
 A single server process receives a set of packets from the ports
@@ -216,22 +217,22 @@ The following diagram shows the data-flow through the application, using two cli
 
 
 Running the Application
-^^^^^^^^^^^^^^^^^^^^^^^
+~~~~~~~~~~~~~~~~~~~~~~~
 
-The server process must be run as the primary process to set up all memory structures.
+The server process must run as the primary process to set up all memory structures.
 In addition to the EAL parameters, the application-specific parameters are:
 
-*   -p <portmask >, where portmask is a hexadecimal bitmask of what ports on the system are to be used.
-    For example: -p 3 to use ports 0 and 1 only.
+*   ``-p <portmask>``, where portmask is a hexadecimal bitmask of what ports on the system are to be used.
+    For example: ``-p 3`` to use ports 0 and 1 only.
 
-*   -n <num-clients>, where the num-clients parameter is the number of client processes that will process the packets received
+*   ``-n <num-clients>``, where the num-clients parameter is the number of client processes that will process the packets received
     by the server application.
 
 .. note::
 
-   In the server process, has a single thread using the lowest numbered lcore
-   in the corelist, performs all packet I/O.
-   If corelist parameter specifies with more than a single lcore,
+   In the server process, a single thread using the lowest numbered lcore
+   in the corelist performs all packet I/O.
+   If the corelist parameter specifies more than a single lcore,
    an additional lcore will be used for a thread to print packet count periodically.
 
 The server application stores configuration data in shared memory,
@@ -251,14 +252,14 @@ the commands are:
 
     If the server application dies and needs to be restarted, all client applications also need to be restarted,
     as there is no support in the server application for it to run as a secondary process.
-    Any client processes that need restarting can be restarted without affecting the server process.
+    Client processes can be restarted without affecting the server process.
 
 How the Application Works
-^^^^^^^^^^^^^^^^^^^^^^^^^
+~~~~~~~~~~~~~~~~~~~~~~~~~
 
-The server (primary) process performs the initialization of network port and data structure
+The server (primary) process performs the initialization of network ports and data structures
 and stores its port configuration data in a memory zone in hugepage shared memory.
-The client process does not need the portmask parameter passed in via the command line.
+The client processes do not need the portmask parameter passed in via the command line.
 The server process is the primary process, and the client processes are secondary processes.
 
 The server operates by reading packets from each network port
@@ -266,4 +267,4 @@ and distributing those packets to the client queues.
 The client reads from the ring and routes the packet to a different network port.
 The routing used is very simple: all packets received on the first NIC port
 are transmitted back out on the second port and vice versa.
-Similarly, packets are routed between the 3rd and 4th network ports and so on.
+Similarly, packets are routed between the third and fourth network ports and so on.
diff --git a/doc/guides/sample_app_ug/ntb.rst b/doc/guides/sample_app_ug/ntb.rst
index f54de6cecd..ca0ff54757 100644
--- a/doc/guides/sample_app_ug/ntb.rst
+++ b/doc/guides/sample_app_ug/ntb.rst
@@ -12,7 +12,7 @@ This sample supports 4 types of packet forwarding mode.
 
 * ``file-trans``: transmit files between two systems. The sample will
   be polling to receive files from the peer and save the file as
-  ``ntb_recv_file[N]``, [N] represents the number of received file.
+  ``ntb_recv_file[N]``, where [N] represents the number of received files.
 * ``rxonly``: NTB receives packets but doesn't transmit them.
 * ``txonly``: NTB generates and transmits packets without receiving any.
 * ``iofwd``: iofwd between NTB device and ethdev.
diff --git a/doc/guides/sample_app_ug/packet_ordering.rst b/doc/guides/sample_app_ug/packet_ordering.rst
index f96c0ad697..2082f49ad1 100644
--- a/doc/guides/sample_app_ug/packet_ordering.rst
+++ b/doc/guides/sample_app_ug/packet_ordering.rst
@@ -4,29 +4,30 @@
 Packet Ordering Application
 ============================
 
-The Packet Ordering sample app simply shows the impact of reordering a stream.
-It's meant to stress the library with different configurations for performance.
+The Packet Ordering sample application demonstrates packet reordering
+functionality and its impact on stream processing.
+It stresses the library with different configurations for performance.
 
 Overview
 --------
 
 The application uses at least three CPU cores:
 
-* RX core (main core) receives traffic from the NIC ports and feeds Worker
+* The RX core (main core) receives traffic from the NIC ports and feeds worker
   cores with traffic through SW queues.
 
-* Worker (worker core) basically do some light work on the packet.
-  Currently it modifies the output port of the packet for configurations with
-  more than one port enabled.
+* Worker cores perform lightweight processing on each packet.
+  For configurations with more than one port enabled, it swaps the destination
+  port of the packet.
 
-* TX Core (worker core) receives traffic from Worker cores through software queues,
+* The TX core receives traffic from worker cores through software queues,
   inserts out-of-order packets into reorder buffer, extracts ordered packets
-  from the reorder buffer and sends them to the NIC ports for transmission.
+  from the reorder buffer, and sends them to the NIC ports for transmission.
 
 Compiling the Application
 -------------------------
 
-To compile the sample application see :doc:`compiling`.
+To compile the sample application, see :doc:`compiling`.
 
 The application is located in the ``packet_ordering`` sub-directory.
 
@@ -36,6 +37,9 @@ Running the Application
 Refer to *DPDK Getting Started Guide* for general information on running applications
 and the Environment Abstraction Layer (EAL) options.
 
+Explanation
+-----------
+
 Application Command Line
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -46,16 +50,16 @@ The application execution command line is:
     ./<build_dir>/examples/dpdk-packet_ordering [EAL options] -- -p PORTMASK /
     [--disable-reorder] [--insight-worker]
 
-The -l EAL corelist option has to contain at least 3 CPU cores.
-The first CPU core in the core mask is the main core and would be assigned to
-RX core, the last to TX core and the rest to Worker cores.
+The ``-l`` EAL corelist option must contain at least 3 CPU cores.
+The first CPU core in the corelist is assigned to the RX core (main core),
+the last to the TX core, and the remaining cores to worker cores.
 
-The PORTMASK parameter must contain either 1 or even enabled port numbers.
-When setting more than 1 port, traffic would be forwarded in pairs.
-For example, if we enable 4 ports, traffic from port 0 to 1 and from 1 to 0,
-then the other pair from 2 to 3 and from 3 to 2, having [0,1] and [2,3] pairs.
+The ``PORTMASK`` parameter must specify either 1 port or an even number of ports.
+When setting more than 1 port, traffic is forwarded in pairs.
+For example, if 4 ports are enabled, traffic flows between port 0 and port 1,
+and between port 2 and port 3 (forming port pairs [0,1] and [2,3]).
 
-The disable-reorder long option does, as its name implies, disable the reordering
-of traffic, which should help evaluate reordering performance impact.
+The ``--disable-reorder`` option disables packet reordering, which allows
+evaluation of the performance impact of reordering.
 
-The insight-worker long option enables output the packet statistics of each worker thread.
+The ``--insight-worker`` long option enables outputting packet statistics for each worker thread.
diff --git a/doc/guides/sample_app_ug/pipeline.rst b/doc/guides/sample_app_ug/pipeline.rst
index 2d7c977068..2e8f5b7f24 100644
--- a/doc/guides/sample_app_ug/pipeline.rst
+++ b/doc/guides/sample_app_ug/pipeline.rst
@@ -4,10 +4,10 @@
 Pipeline Application
 ====================
 
-Application overview
---------------------
+Overview
+--------
 
-This application showcases the features of the Software Switch (SWX) pipeline that is aligned with the P4 language.
+This application showcases the features of the Software Switch (SWX) pipeline that aligns with the P4 language.
 
 Each pipeline is created using a specification file that can either be manually developed or generated using a P4 compiler.
 
@@ -78,7 +78,7 @@ To run remote client (e.g. telnet) to communicate with the application:
 
     $ telnet 0.0.0.0 8086
 
-When running a telnet client as above, command prompt is displayed:
+When running a telnet client as above, the command prompt is displayed:
 
 .. code-block:: console
 
@@ -90,23 +90,25 @@ When running a telnet client as above, command prompt is displayed:
 
     pipeline>
 
-Once application and telnet client start running, messages can be sent from client to application.
+Once the application and telnet client start running, you can send messages from the client to the application.
 
 
-Application stages
-------------------
+Explanation
+-----------
+
+Here is a description of the various stages of the application.
 
 Initialization
 ~~~~~~~~~~~~~~
 
-During this stage, EAL layer is initialised and application specific arguments are parsed. Furthermore, the data structures
-for application objects are initialized. In case of any initialization error, an error message is displayed and the application
-is terminated.
+During this stage, EAL layer is initialized and application specific arguments are parsed.
+Furthermore, the data structures for application objects are initialized.
+In case of any initialization error, an error message is displayed and the application is terminated.
 
 Run-time
 ~~~~~~~~
 
-The main thread is creating and managing all the application objects based on CLI input.
+The main thread creates and manages all the application objects based on CLI input.
 
 Each data plane thread runs one or several pipelines previously assigned to it in round-robin order. Each data plane thread
 executes two tasks in time-sharing mode:
@@ -114,5 +116,5 @@ executes two tasks in time-sharing mode:
 #. *Packet processing task*: Process bursts of input packets read from the pipeline input ports.
 
 #. *Message handling task*: Periodically, the data plane thread pauses the packet processing task and polls for request
-   messages send by the main thread. Examples: add/remove pipeline to/from current data plane thread, add/delete rules
+   messages sent by the main thread. Examples: add/remove pipeline to/from current data plane thread, add/delete rules
    to/from given table of a specific pipeline owned by the current data plane thread, read statistics, etc.
diff --git a/doc/guides/sample_app_ug/ptpclient.rst b/doc/guides/sample_app_ug/ptpclient.rst
index 0df465bcb4..28007604a0 100644
--- a/doc/guides/sample_app_ug/ptpclient.rst
+++ b/doc/guides/sample_app_ug/ptpclient.rst
@@ -5,14 +5,19 @@ PTP Client Sample Application
 =============================
 
 The PTP (Precision Time Protocol) client sample application is a simple
-example of using the DPDK IEEE1588 API to communicate with a PTP time transmitter
+example of using the DPDK IEEE1588 API to communicate with a PTP timeTransmitter
 to synchronize the time on the NIC and, optionally, on the Linux system.
 
-Note, PTP is a time syncing protocol and cannot be used within DPDK as a
+Note, PTP is a time synchronization protocol and cannot be used within DPDK as a
 time-stamping mechanism. See the following for an explanation of the protocol:
 `Precision Time Protocol
 <https://en.wikipedia.org/wiki/Precision_Time_Protocol>`_.
 
+.. note::
+
+   This documentation uses PTPv3 (IEEE 1588-2019) terminology where timeTransmitter/timeReceiver
+   replace the older master/slave terms from PTPv2.
+
 
 Limitations
 -----------
@@ -21,10 +26,10 @@ The PTP sample application is intended as a simple reference implementation of
 a PTP client using the DPDK IEEE1588 API.
 In order to keep the application simple the following assumptions are made:
 
-* The first discovered time transmitter is the main for the session.
+* The first discovered timeTransmitter is used for the session.
 * Only L2 PTP packets are supported.
 * Only the PTP v2 protocol is supported.
-* Only the time receiver clock is implemented.
+* Only the timeReceiver clock is implemented.
 
 
 How the Application Works
@@ -36,18 +41,18 @@ How the Application Works
 
    PTP Synchronization Protocol
 
-The PTP synchronization in the sample application works as follows:
+The PTP synchronization in the sample application (timeReceiver mode) works as follows:
 
-* Time transmitter sends *Sync* message - the time receiver saves it as T2.
-* Time transmitter sends *Follow Up* message and sends time of T1.
-* Time receiver sends *Delay Request* frame to PTP time transmitter and stores T3.
-* Time transmitter sends *Delay Response* T4 time which is time of received T3.
+* TimeTransmitter sends *Sync* message - the timeReceiver saves the receive time as T2.
+* TimeTransmitter sends *Follow Up* message containing the transmit time T1.
+* TimeReceiver sends *Delay Request* frame to the PTP timeTransmitter and stores the transmit time as T3.
+* TimeTransmitter sends *Delay Response* containing T4, the time it received the Delay Request.
 
-The adjustment for time receiver can be represented as:
+The clock adjustment for the timeReceiver can be calculated as:
 
    adj = -[(T2-T1)-(T4 - T3)]/2
 
-If the command line parameter ``-T 1`` is used the application also
+If the command line parameter ``-T 1`` is used, the application also
 synchronizes the PTP PHC clock with the Linux kernel clock.
 
 Compiling the Application
@@ -61,7 +66,7 @@ The application is located in the ``ptpclient`` sub-directory.
 Running the Application
 -----------------------
 
-To run the example in a ``linux`` environment:
+To run the example in a Linux environment:
 
 .. code-block:: console
 
@@ -71,8 +76,8 @@ Refer to *DPDK Getting Started Guide* for general information on running
 applications and the Environment Abstraction Layer (EAL) options.
 
 * ``-p portmask``: Hexadecimal portmask.
-* ``-T 0``: Update only the PTP time receiver clock.
-* ``-T 1``: Update the PTP time receiver clock and synchronize the Linux Kernel to the PTP clock.
+* ``-T 0``: Update only the PTP timeReceiver clock.
+* ``-T 1``: Update the PTP timeReceiver clock and synchronize the Linux kernel clock to the PTP clock.
 
 
 Code Explanation
@@ -101,7 +106,7 @@ function. The value returned is the number of parsed arguments:
     :end-before: >8 End of initialization of EAL.
     :dedent: 1
 
-And than we parse application specific arguments
+Then we parse application-specific arguments:
 
 .. literalinclude:: ../../../examples/ptpclient/ptpclient.c
     :language: c
@@ -178,8 +183,8 @@ The forwarding loop can be interrupted and the application closed using
 PTP parsing
 ~~~~~~~~~~~
 
-The ``parse_ptp_frames()`` function processes PTP packets, implementing time receiver
-PTP IEEE1588 L2 functionality.
+The ``parse_ptp_frames()`` function processes PTP packets, implementing
+PTP IEEE1588 L2 timeReceiver functionality.
 
 .. literalinclude:: ../../../examples/ptpclient/ptpclient.c
     :language: c
@@ -187,12 +192,12 @@ PTP IEEE1588 L2 functionality.
     :end-before:  >8 End of function processes PTP packets.
 
 There are 3 types of packets on the RX path which we must parse to create a minimal
-implementation of the PTP time receiver client:
+implementation of the PTP timeReceiver:
 
 * SYNC packet.
-* FOLLOW UP packet
+* FOLLOW UP packet.
 * DELAY RESPONSE packet.
 
-When we parse the *FOLLOW UP* packet we also create and send a *DELAY_REQUEST* packet.
-Also when we parse the *DELAY RESPONSE* packet, and all conditions are met
-we adjust the PTP time receiver clock.
+When we parse the *FOLLOW UP* packet, we also create and send a *DELAY REQUEST* packet.
+When we parse the *DELAY RESPONSE* packet, and all conditions are met,
+we adjust the PTP timeReceiver clock.
-- 
2.53.0


^ permalink raw reply related

* [PATCH 10/15] doc: enhance L2 forwarding sample application guides
From: Stephen Hemminger @ 2026-06-11 21:18 UTC (permalink / raw)
  To: dev
  Cc: Stephen Hemminger, Tomasz Kantecki, Sunil Kumar Kori,
	Pavan Nikhilesh, Akhil Goyal
In-Reply-To: <20260611212119.1026721-1-stephen@networkplumber.org>

Improved documentation for L2 forwarding variants:

l2_forward_cat.rst:
- Enhanced Cache Allocation Technology descriptions
- Fixed command-line parameter formatting
- Improved clarity of CAT configuration steps

l2_forward_crypto.rst:
- Restructured cryptographic operation descriptions
- Improved cipher and authentication algorithm lists
- Enhanced command-line option explanations

l2_forward_event.rst:
- Clarified event-driven processing model
- Fixed formatting and terminology consistency

l2_forward_job_stats.rst:
- Improved job statistics collection descriptions
- Enhanced clarity of monitoring mechanisms

l2_forward_macsec.rst:
- Simplified MACsec configuration explanations
- Fixed formatting in security association setup

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 doc/guides/sample_app_ug/l2_forward_cat.rst   | 34 ++++-----
 .../sample_app_ug/l2_forward_crypto.rst       | 70 +++++++++----------
 doc/guides/sample_app_ug/l2_forward_event.rst | 20 +++---
 .../sample_app_ug/l2_forward_job_stats.rst    | 46 ++++++------
 .../sample_app_ug/l2_forward_macsec.rst       | 38 +++++-----
 5 files changed, 100 insertions(+), 108 deletions(-)

diff --git a/doc/guides/sample_app_ug/l2_forward_cat.rst b/doc/guides/sample_app_ug/l2_forward_cat.rst
index f96f789f75..9901a796e0 100644
--- a/doc/guides/sample_app_ug/l2_forward_cat.rst
+++ b/doc/guides/sample_app_ug/l2_forward_cat.rst
@@ -14,22 +14,22 @@ Overview
 
 This app is intended as a demonstration of the basic components
 of a DPDK forwarding application
-and use of the libpqos library to the program CAT.
+and the use of the libpqos library to program CAT.
 For more detailed implementations, see the L2 and L3 forwarding sample applications.
 
-CAT and Code Data Prioritization (CDP) features allow management of the CPU's
-last level cache. CAT introduces classes of service (COS) that are essentially
+The CAT and Code Data Prioritization (CDP) features allow management of the CPU's
+last level cache. CAT introduces Classes of Service (COS) that are essentially
 bitmasks. In current CAT implementations, a bit in a COS bitmask corresponds to
 one cache way in last level cache.
 
 A CPU core is always assigned to one of the CAT classes.
 By programming CPU core assignment and COS bitmasks, applications can be given
 exclusive, shared, or mixed access to the CPU's last level cache.
-CDP extends CAT so that there are two bitmasks per COS,
+The CDP feature extends CAT so that there are two bitmasks per COS,
 one for data and one for code.
 The number of classes and number of valid bits in a COS bitmask is CPU model
 specific and COS bitmasks need to be contiguous. Sample code calls this bitmask
-``cbm`` or capacity bitmask.
+a ``cbm`` or capacity bitmask.
 By default, after reset, all CPU cores are assigned to COS 0 and all classes
 are programmed to allow fill into all cache ways.
 CDP is off by default.
@@ -47,7 +47,7 @@ Compiling the Application
 
 .. note::
 
-    Requires ``libpqos`` from Intel's
+    Requires the ``libpqos`` library from Intel's
     `intel-cmt-cat software package <https://github.com/01org/intel-cmt-cat>`_
     hosted on GitHub repository. For installation notes, please see ``README`` file.
 
@@ -70,7 +70,7 @@ The application is located in the ``l2fwd-cat`` sub-directory.
 Running the Application
 -----------------------
 
-To run the example in a ``linux`` environment and enable CAT on cpus 0-2:
+To run the example in a Linux environment and enable CAT on CPUs 0-2:
 
 .. code-block:: console
 
@@ -87,7 +87,7 @@ If CDP is not supported, it will fail with following error message:
 .. code-block:: console
 
     PQOS: CDP requested but not supported.
-    PQOS: Requested CAT configuration is not valid!
+    PQOS: Requested CAT configuration is not valid.
     PQOS: Shutting down PQoS library...
     EAL: Error - exiting with code: 1
       Cause: PQOS: L3CA init failed!
@@ -99,7 +99,7 @@ The option to enable CAT is:
   where ``cbm`` stands for capacity bitmask and must be expressed in
   hexadecimal form.
 
-  ``common_cbm`` is a single mask, for a CDP enabled system, a group of two
+  ``common_cbm`` is a single mask; for a CDP-enabled system, a group of two
   masks (``code_cbm`` and ``data_cbm``) is used.
 
   ``(`` and ``)`` are necessary if it's a group.
@@ -125,7 +125,7 @@ The option to enable CAT is:
     data ways are not overlapping.
 
 
-Refer to *DPDK Getting Started Guide* for general information on running
+Refer to the *DPDK Getting Started Guide* for general information on running
 applications and the Environment Abstraction Layer (EAL) options.
 
 
@@ -133,7 +133,7 @@ To reset or list CAT configuration and control CDP please use ``pqos`` tool
 from Intel's
 `intel-cmt-cat software package <https://github.com/01org/intel-cmt-cat>`_.
 
-To enabled or disable CDP:
+To enable or disable CDP:
 
 .. code-block:: console
 
@@ -141,7 +141,7 @@ To enabled or disable CDP:
 
     sudo ./pqos -S cdp-off
 
-to reset CAT configuration:
+To reset CAT configuration:
 
 .. code-block:: console
 
@@ -193,11 +193,11 @@ function. The value returned is the number of parsed arguments:
     :end-before: >8 End of initialization of PQoS.
     :dedent: 1
 
-``cat_init()`` is a wrapper function which parses the command, validates
+The ``cat_init()`` function is a wrapper function which parses the command, validates
 the requested parameters and configures CAT accordingly.
 
-The parsing of command line arguments is done in ``parse_args(...)``.
-Libpqos is then initialized with the ``pqos_init(...)`` call.
+The parsing of command line arguments is done in the ``parse_args(...)`` function.
+The libpqos library is then initialized with the ``pqos_init(...)`` call.
 Next, libpqos is
 queried for system CPU information and L3CA capabilities via
 ``pqos_cap_get(...)`` and ``pqos_cap_get_type(..., PQOS_CAP_TYPE_L3CA, ...)``
@@ -207,6 +207,6 @@ for a sufficient number of un-associated COS. COS are selected and
 configured via the ``pqos_l3ca_set(...)`` call. Finally, COS are associated to
 relevant CPUs via ``pqos_l3ca_assoc_set(...)`` calls.
 
-``atexit(...)`` is used to register ``cat_exit(...)`` to be called on
-a clean exit. ``cat_exit(...)`` performs a simple CAT clean-up, by associating
+The ``atexit(...)`` function is used to register ``cat_exit(...)`` to be called on
+a clean exit. The ``cat_exit(...)`` function performs a simple CAT clean-up, by associating
 COS 0 to all involved CPUs via ``pqos_l3ca_assoc_set(...)`` calls.
diff --git a/doc/guides/sample_app_ug/l2_forward_crypto.rst b/doc/guides/sample_app_ug/l2_forward_crypto.rst
index ba38d9f22e..add535cd39 100644
--- a/doc/guides/sample_app_ug/l2_forward_crypto.rst
+++ b/doc/guides/sample_app_ug/l2_forward_crypto.rst
@@ -1,18 +1,18 @@
 ..  SPDX-License-Identifier: BSD-3-Clause
     Copyright(c) 2016-2017 Intel Corporation.
 
-L2 Forwarding with Crypto Sample Application
+L2 forwarding with crypto sample application
 ============================================
 
 The L2 Forwarding with Crypto (l2fwd-crypto) sample application
 is a simple example of packet processing
-using the Data Plane Development Kit (DPDK)
-in conjunction with the cryptodev library.
+using the Data Plane Development Kit (DPDK),
+in conjunction with the Cryptodev library.
 
 Overview
 --------
 
-The L2 Forwarding with Crypto sample application performs a crypto operation (cipher/hash)
+The L2 Forwarding with Crypto sample application performs crypto operations (cipher/hash)
 specified by the user from the command line (or using the default values),
 with a crypto device capable of doing that operation,
 for each packet that is received on an RX_PORT and performs L2 forwarding.
@@ -20,10 +20,9 @@ for each packet that is received on an RX_PORT and performs L2 forwarding.
 The destination port is the adjacent port from the enabled portmask.
 If the first four ports are enabled (portmask 0xf),
 ports 0 and 1 forward into each other, and ports 2 and 3 forward into each other.
-If the MAC addresses updating is enabled, the MAC addresses are affected as follows:
+If MAC address updating is enabled, the MAC addresses are affected as follows:
 
 *   The source MAC address is replaced by the TX_PORT MAC address
-
 *   The destination MAC address is replaced by  02:00:00:00:00:TX_PORT_ID
 
 Compiling the Application
@@ -40,17 +39,17 @@ The application requires a number of command line options:
 
 .. code-block:: console
 
-    ./<build_dir>/examples/dpdk-l2fwd-crypto [EAL options] -- [-p PORTMASK] [-q NQ] [-s] [-T PERIOD] /
-    [--cdev_type HW/SW/ANY] [--chain HASH_CIPHER/CIPHER_HASH/CIPHER_ONLY/HASH_ONLY/AEAD] /
-    [--cipher_algo ALGO] [--cipher_op ENCRYPT/DECRYPT] [--cipher_dataunit_len SIZE] /
-    [--cipher_key KEY] [--cipher_key_random_size SIZE] [--cipher_iv IV] /
-    [--cipher_iv_random_size SIZE] /
-    [--auth_algo ALGO] [--auth_op GENERATE/VERIFY] [--auth_key KEY] /
-    [--auth_key_random_size SIZE] [--auth_iv IV] [--auth_iv_random_size SIZE] /
-    [--aead_algo ALGO] [--aead_op ENCRYPT/DECRYPT] [--aead_key KEY] /
-    [--aead_key_random_size SIZE] [--aead_iv] [--aead_iv_random_size SIZE] /
-    [--aad AAD] [--aad_random_size SIZE] /
-    [--digest size SIZE] [--sessionless] [--cryptodev_mask MASK] /
+    ./<build_dir>/examples/dpdk-l2fwd-crypto [EAL options] -- [-p PORTMASK] [-q NQ] [-s] [-T PERIOD] \
+    [--cdev_type HW/SW/ANY] [--chain HASH_CIPHER/CIPHER_HASH/CIPHER_ONLY/HASH_ONLY/AEAD] \
+    [--cipher_algo ALGO] [--cipher_op ENCRYPT/DECRYPT] [--cipher_dataunit_len SIZE] \
+    [--cipher_key KEY] [--cipher_key_random_size SIZE] [--cipher_iv IV] \
+    [--cipher_iv_random_size SIZE] \
+    [--auth_algo ALGO] [--auth_op GENERATE/VERIFY] [--auth_key KEY] \
+    [--auth_key_random_size SIZE] [--auth_iv IV] [--auth_iv_random_size SIZE] \
+    [--aead_algo ALGO] [--aead_op ENCRYPT/DECRYPT] [--aead_key KEY] \
+    [--aead_key_random_size SIZE] [--aead_iv] [--aead_iv_random_size SIZE] \
+    [--aad AAD] [--aad_random_size SIZE] \
+    [--digest size SIZE] [--sessionless] [--cryptodev_mask MASK] \
     [--mac-updating] [--no-mac-updating]
 
 where,
@@ -143,7 +142,7 @@ where,
 
 *   aad_random_size: set the size of the AAD, which will be generated randomly.
 
-    Note that if --aad is used, this will be ignored.
+    Note that if ``--aad`` is used, this will be ignored.
 
 *   digest_size: set the size of the digest to be generated/verified.
 
@@ -159,10 +158,10 @@ where,
 
 The application requires that crypto devices capable of performing
 the specified crypto operation are available on application initialization.
-This means that HW crypto device/s must be bound to a DPDK driver or
+This means that HW crypto devices must be bound to a DPDK driver or
 a SW crypto device/s (virtual crypto PMD) must be created (using --vdev).
 
-To run the application in Linux environment with 2 lcores, 2 ports and 2 crypto devices, issue the command:
+To run the application in the Linux environment with 2 lcores, 2 ports and 2 crypto devices, issue the command:
 
 .. code-block:: console
 
@@ -178,7 +177,7 @@ and the Environment Abstraction Layer (EAL) options.
 
 .. Note::
 
-    * The ``l2fwd-crypto`` sample application requires IPv4 packets for crypto operation.
+    * The ``l2fwd-crypto`` sample application requires IPv4 packets for crypto operations.
 
     * If multiple Ethernet ports are passed, then equal number of crypto devices are to be passed.
 
@@ -200,17 +199,15 @@ from reception until transmission.
    Encryption flow through the L2 Forwarding with Crypto Application
 
 
-The following sections provide explanation of the application.
+The following sections provide an explanation of the application.
 
 Crypto operation specification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 All the packets received in all the ports get transformed by the crypto devices
 (ciphering and/or authentication).
-The crypto operation to be performed on the packet is parsed from the command line.
-(Go to "Running the Application" section for all the options.)
-
-If no parameter is passed, the default crypto operation is:
+The crypto operation to be performed on the packet is parsed from the command line (see "Running the Application" for all options).
+If no parameters are passed, the default crypto operation is:
 
 * Encryption with AES-CBC with 128 bit key.
 
@@ -218,7 +215,7 @@ If no parameter is passed, the default crypto operation is:
 
 * Keys, IV and AAD are generated randomly.
 
-There are two methods to pass keys, IV and ADD from the command line:
+There are two methods to pass keys, IV and AAD from the command line:
 
 * Passing the full key, separated bytes by ":"::
 
@@ -236,7 +233,7 @@ The size of these keys are checked (regardless of the method) before starting th
 to make sure that it is supported by the crypto devices.
 
 Crypto device initialization
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Once the encryption operation is defined, the crypto devices are initialized.
 The crypto devices must be either bound to a DPDK driver (if they are physical devices)
@@ -245,9 +242,9 @@ when running the application.
 
 The initialize_cryptodevs() function performs the device initialization.
 It iterates through the list of the available crypto devices and
-checks which ones are capable of performing the operation.
-Each device has a set of capabilities associated with it,
-which are stored in the device info structure, so the function checks if the operation
+checks which are capable of performing the operation.
+Each device has a set of capabilities associated with it
+that are stored in the device info structure, so the function checks if the operation
 is within the structure of each device.
 
 The following code checks if the device supports the specified cipher algorithm
@@ -273,14 +270,15 @@ crypto device list.
 
 **Note**:
    The number of crypto devices that supports the specified crypto operation
-   must be at least the number of ports to be used.
+   must be at least equal to the number of Ethernet ports in use. If using
+   multiple Ethernet ports, an equal number of crypto devices must be provided.
 
 Session creation
 ~~~~~~~~~~~~~~~~
 
 The crypto operation has a crypto session associated to it, which contains
-information such as the transform chain to perform (e.g. ciphering then hashing)
-pointers to the keys, lengths... etc.
+information such as the transform chain to perform (e.g. ciphering then hashing),
+pointers to the keys, lengths, etc.
 
 This session is created and is later attached to the crypto operation:
 
@@ -306,7 +304,7 @@ the mbuf which will be transformed is attached to it::
 
    op->sym->m_src = m;
 
-Since no destination mbuf is set, the source mbuf will be overwritten
+Since no destination mbuf is set, the source mbuf is overwritten
 after the operation is done (in-place).
 
 Crypto operation enqueuing/dequeuing
@@ -315,7 +313,7 @@ Crypto operation enqueuing/dequeuing
 Once the operation has been created, it has to be enqueued in one of the crypto devices.
 Before doing so, for performance reasons, the operation stays in a buffer.
 When the buffer has enough operations (MAX_PKT_BURST), they are enqueued in the device,
-which will perform the operation at that moment:
+which performs the operation at that moment:
 
 .. literalinclude:: ../../../examples/l2fwd-crypto/main.c
     :language: c
diff --git a/doc/guides/sample_app_ug/l2_forward_event.rst b/doc/guides/sample_app_ug/l2_forward_event.rst
index 1d52211b9a..c4329d0036 100644
--- a/doc/guides/sample_app_ug/l2_forward_event.rst
+++ b/doc/guides/sample_app_ug/l2_forward_event.rst
@@ -17,7 +17,7 @@ The destination port is the adjacent port from the enabled portmask.
 If the first four ports are enabled (portmask=0x0f),
 ports 1 and 2 forward into each other,
 and ports 3 and 4 forward into each other.
-Also, if MAC address updating is enabled,
+Also, if MAC addresses updating is enabled,
 the MAC addresses are affected as follows:
 
 *   The source MAC address is replaced by the Tx port MAC address
@@ -26,7 +26,7 @@ the MAC addresses are affected as follows:
 Application receives packets from Rx port using these methods:
 
 *   Poll mode
-*   Eventdev mode (default)
+*   Eventdev mode (by default)
 
 This application can be used to benchmark performance using a traffic-generator,
 as shown in the :numref:`figure_l2fwd_event_benchmark_setup`.
@@ -64,7 +64,7 @@ where,
 
 *   q NQ: Maximum number of queues per lcore (default is 1)
 
-*   --[no-]mac-updating: Enable or disable MAC addresses updating (enabled by default).
+*   --[no-]mac-updating: Enable or disable MAC addresses updating (enabled by default)
 
 *   --mode=MODE: Packet transfer mode for I/O, poll or eventdev. Eventdev by default.
 
@@ -374,9 +374,9 @@ is assigned that is either the next or previous enabled port from the portmask.
 If the number of ports are odd in portmask, then the packet from the last port will be
 forwarded to first port i.e. if portmask=0x07, then forwarding will take place
 like p0--->p1, p1--->p2, p2--->p0.
+If MAC address updating is enabled, the source and destination MAC addresses are updated.
 
-Also, to optimize enqueue operation, ``l2fwd_simple_forward()`` stores incoming mbufs
-up to ``MAX_PKT_BURST``.
+To optimize enqueue operation, ``l2fwd_simple_forward()`` stores incoming mbufs up to ``MAX_PKT_BURST``.
 Once it reaches the limit, all packets are transmitted to destination ports.
 
 .. literalinclude:: ../../../examples/l2fwd/main.c
@@ -420,15 +420,15 @@ to ensure the correct allowed deq length by the eventdev.
 The ``rte_event_dequeue_burst()`` function writes the mbuf pointers in a local table
 and returns the number of available mbufs in the table.
 
-Then, each mbuf in the table is processed by the ``l2fwd_eventdev_forward()``
-function. The processing is very simple: process the TX port from the RX port,
-then replace the source and destination MAC addresses if MAC address updating
-is enabled.
+Then, each mbuf in the table is processed by the ``l2fwd_eventdev_forward()`` function.
+The processing is very simple: determine the TX port from the RX port,
+then replace the source and destination MAC addresses if MAC addresses updating is enabled.
 
 During the initialization process, a static array of destination ports
 (``l2fwd_dst_ports[]``) is filled so that for each source port, a destination port
 is assigned that is either the next or previous enabled port from the portmask.
-If number of ports are odd in portmask then packet from last port will be
+If the number of ports are odd in portmask,
+then the packet from the last port will be
 forwarded to first port i.e. if portmask=0x07, then forwarding will take place
 like p0--->p1, p1--->p2, p2--->p0.
 
diff --git a/doc/guides/sample_app_ug/l2_forward_job_stats.rst b/doc/guides/sample_app_ug/l2_forward_job_stats.rst
index 23448f23a4..6b8f8dda3e 100644
--- a/doc/guides/sample_app_ug/l2_forward_job_stats.rst
+++ b/doc/guides/sample_app_ug/l2_forward_job_stats.rst
@@ -10,8 +10,8 @@ also takes advantage of Single Root I/O Virtualization (SR-IOV) features in a vi
 
 .. note::
 
-    This application is a variation of L2 Forwarding sample application. It demonstrate possible
-    scheme of job stats library usage therefore some parts of this document is identical with original
+    This application is a variation of the L2 Forwarding sample application. It demonstrates a possible
+    scheme of job stats library usage therefore some parts of this document are identical with the original
     L2 forwarding application.
 
 Overview
@@ -92,7 +92,7 @@ where,
 
 *   q NQ: Maximum number of queues per lcore (default is 1)
 
-*   l: Use locale thousands separator when formatting big numbers.
+*   l: Use a locale thousands separator when formatting big numbers.
 
 To run the application in a Linux environment with 4 lcores, 16 ports, 8 RX queues per lcore
 and thousands separator printing, issue the command:
@@ -157,14 +157,14 @@ but it is possible to extend this code to allocate one mbuf pool per socket.
 The ``rte_pktmbuf_pool_create()`` function uses the default mbuf pool and mbuf
 initializers, respectively ``rte_pktmbuf_pool_init()`` and ``rte_pktmbuf_init()``.
 An advanced application may want to use the mempool API to create the
-mbuf pool with more control.
+mbuf pool with greater control.
 
 Driver Initialization
 ~~~~~~~~~~~~~~~~~~~~~
 
 The main part of the code in the ``main()`` function relates to the initialization of the driver.
 To fully understand this code, it is recommended to study the chapters that related to the Poll Mode Driver
-in the *DPDK Programmer's Guide* and the *DPDK API Reference*.
+To fully understand this code, it is recommended to study the chapters related to the Poll Mode Driver in the *DPDK Programmer's Guide* and the *DPDK API Reference*.
 
 .. literalinclude:: ../../../examples/l2fwd-jobstats/main.c
     :language: c
@@ -211,7 +211,7 @@ Values of struct lcore_queue_conf:
 *   n_rx_port and rx_port_list[] are used in the main packet processing loop
     (see Section `Receive, Process and Transmit Packets`_ later in this chapter).
 
-*   rx_timers and flush_timer are used to ensure forced TX on low packet rate.
+*   rx_timers and flush_timer are used to force TX on low packet rate.
 
 *   flush_job, idle_job and jobs_context are librte_jobstats objects used for managing l2fwd jobs.
 
@@ -228,9 +228,9 @@ Each lcore should be able to transmit on any port. For every port, a single TX q
     :end-before: >8 End of init one TX queue on each port.
     :dedent: 2
 
-Jobs statistics initialization
+Job Statistics Initialization
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-There are several statistics objects available:
+Several statistics objects are available:
 
 *   Flush job statistics
 
@@ -260,7 +260,7 @@ Main loop
 ~~~~~~~~~
 
 The forwarding path is reworked comparing to original L2 Forwarding application.
-In the ``l2fwd_main_loop()`` function, three loops are placed.
+In the ``l2fwd_main_loop()`` function, three loop iterations are used.
 
 .. literalinclude:: ../../../examples/l2fwd-jobstats/main.c
     :language: c
@@ -268,13 +268,13 @@ In the ``l2fwd_main_loop()`` function, three loops are placed.
     :end-before: >8 End of minimize impact of stats reading.
     :dedent: 1
 
-The first infinite for loop is to minimize impact of stats reading.
+The first infinite loop minimizes the impact of statistics reading.
 Lock is only locked/unlocked when asked.
 
-Second inner while loop do the whole jobs management.
-When any job is ready, the use ``rte_timer_manage()`` is used to call the job handler.
+The second inner while loop performs the whole jobs management.
+When any job is ready, ``rte_timer_manage()`` is used to call the job handler.
 
-In this place, functions ``l2fwd_fwd_job()`` and ``l2fwd_flush_job()`` are called when needed.
+At this point, functions ``l2fwd_fwd_job()`` and ``l2fwd_flush_job()`` are called when needed.
 Then, ``rte_jobstats_context_finish()`` is called to mark loop end -
 no other jobs are ready to execute.
 By this time, stats are ready to be read
@@ -283,11 +283,11 @@ and if stats_read_pending is set, loop breaks allowing stats to be read.
 Third do-while loop is the idle job (idle stats counter).
 Its only purpose is monitoring if any job is ready
 or stats job read is pending for this lcore.
-Statistics from this part of the code is considered as
+Statistics from this part of the code are considered as
 the headroom available for additional processing.
 
-Receive, Process and Transmit Packets
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Receive, Process, and Transmit Packets
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 The main task of ``l2fwd_fwd_job()`` function is to read ingress packets
 from the Rx queue of particular port and forward it.
@@ -354,18 +354,12 @@ However, in real-life applications (such as, L3 routing),
 packet N is not necessarily forwarded on the same port as packet N-1.
 The application is implemented to illustrate that, so the same approach can be reused in a more complex application.
 
-The ``l2fwd_send_packet()`` function stores the packet in a per-lcore and per-txport table.
-If the table is full, the whole packets table is transmitted
-using the ``l2fwd_send_burst()`` function:
+The ``l2fwd_simple_forward()`` function uses ``rte_eth_tx_buffer()`` to buffer packets
+for transmission. When the buffer is full, packets are automatically transmitted.
 
-.. literalinclude:: ../../../examples/l2fwd-crypto/main.c
-    :language: c
-    :start-after: Enqueue packets for TX and prepare them to be sent. 8<
-    :end-before: >8 End of Enqueuing packets for TX.
-
-To ensure that no packets remain in the tables, the flush job exists.
+To ensure that no packets remain in the buffers, the flush job exists.
 The ``l2fwd_flush_job()``
-is called periodically to for each lcore draining TX queue of each port.
+is called periodically for each lcore to drain the TX queue of each port.
 This technique introduces some latency when there are not many packets to send,
 however it improves performance:
 
diff --git a/doc/guides/sample_app_ug/l2_forward_macsec.rst b/doc/guides/sample_app_ug/l2_forward_macsec.rst
index 7e6c971465..c0452992e8 100644
--- a/doc/guides/sample_app_ug/l2_forward_macsec.rst
+++ b/doc/guides/sample_app_ug/l2_forward_macsec.rst
@@ -15,7 +15,7 @@ The L2 forwarding MACsec application performs L2 forwarding for each packet
 that is received on an Rx port after encrypting/decrypting the packets
 based on rte_security sessions using inline protocol mode.
 
-The destination port is the adjacent port from the enabled portmask
+The destination port is the adjacent port from the enabled portmask,
 if the first four ports are enabled (portmask ``0xf``),
 ports 1 and 2 forward into each other, and ports 3 and 4 forward into each other.
 
@@ -43,44 +43,44 @@ The application requires a number of command line options:
        [--portmap="(port, port)[,(port, port)]"]
        [-T STAT_INTERVAL]
 
-where,
+where:
 
-``p PORTMASK``
-  Hexadecimal bitmask of the ports to configure.
+``-p PORTMASK``
+   Hexadecimal bitmask of the ports to configure.
 
-``q NQ``
-  Number of queues (=ports) per lcore (default is 1).
+``-q NQ``
+   Number of queues (=ports) per lcore (default is 1).
 
-``T STAT_INTERVAL``
-  Time interval in seconds for refreshing the stats (default is 1 sec).
-  Value 0 disables stats display.
+``-T STAT_INTERVAL``
+   Time interval in seconds for refreshing the stats (default is 1).
+   Value 0 disables stats display.
 
 ``--mcs-tx-portmask OUTBOUND_PORTMASK``
-  Hexadecimal bitmask of the ports to configure encryption flows.
+   Hexadecimal bitmask of the ports to configure encryption flows.
 
 ``--mcs-rx-portmask INBOUND_PORTMASK``
-  Hexadecimal bitmask of the ports to configure decryption flows.
+   Hexadecimal bitmask of the ports to configure decryption flows.
 
-``--mcs-port-config '(port,src_mac,dst_mac)[,(port,src_mac,dst_mac)]'``
-  Source and destination MAC addresses of incoming packets
-  on a port for which MACsec processing is to be done.
+``--mcs-port-config "(port,src_mac,dst_mac)[,(port,src_mac,dst_mac)]"``
+   Source and destination MAC addresses of incoming packets
+   on a port for which MACsec processing is to be done.
 
 ``--portmap="(port,port)[,(port,port)]"``
   Forwarding ports mapping.
 
 To run the application in Linux environment with 4 lcores,
-4 ports with 2 ports for outbound and 2 ports for outbound,
+4 ports with 2 ports for outbound and 2 ports for inbound,
 issue the command:
 
 .. code-block:: console
 
    $ ./<build_dir>/examples/dpdk-l2fwd-macsec -a 0002:04:00.0 -a 0002:05:00.0 \
        -a 0002:06:00.0 -a 0002:07:00.0 -l 1-4 -- -p 0xf                      \
-       --mcs-tx-portmask 0x5 --mcs-rx-portmask 0xA                            \
-       --mcs-port-config '(0,02:03:04:05:06:07,01:02:03:04:05:06),            \
+       --mcs-tx-portmask 0x5 --mcs-rx-portmask 0xa                            \
+       --mcs-port-config="(0,02:03:04:05:06:07,01:02:03:04:05:06),            \
        (1,02:03:04:05:06:17,01:02:03:04:05:16),                               \
        (2,02:03:04:05:06:27,01:02:03:04:05:26),                               \
-       (3,02:03:04:05:06:37,01:02:03:04:05:36)' -T 10
+       (3,02:03:04:05:06:37,01:02:03:04:05:36)" -T 10
 
 To run the application in Linux environment with 4 lcores, 4 ports,
 to forward Rx traffic of ports 0 & 1 on ports 2 & 3 respectively and vice versa,
@@ -90,7 +90,7 @@ issue the command:
 
    $ ./<build_dir>/examples/dpdk-l2fwd-macsec -a 0002:04:00.0 -a 0002:05:00.0 \
        -a 0002:06:00.0 -a 0002:07:00.0 -l 1-4 -- -p 0xf                      \
-       --mcs-tx-portmask 0x5 --mcs-rx-portmask 0xA                            \
+       --mcs-tx-portmask 0x5 --mcs-rx-portmask 0xa                            \
        --mcs-port-config="(0,02:03:04:05:06:07,01:02:03:04:05:06),            \
        (1,02:03:04:05:06:17,01:02:03:04:05:16),                               \
        (2,02:03:04:05:06:27,01:02:03:04:05:26),                               \
-- 
2.53.0


^ permalink raw reply related

* [PATCH 09/15] doc: improve IP reassembly, IPsec, multicast, and keep-alive
From: Stephen Hemminger @ 2026-06-11 21:18 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger, Konstantin Ananyev, Radu Nicolau, Akhil Goyal
In-Reply-To: <20260611212119.1026721-1-stephen@networkplumber.org>

Updated multiple networking sample application guides:

ip_reassembly.rst:
- Enhanced fragment reassembly process descriptions
- Improved command-line parameter documentation
- Fixed formatting and terminology consistency

ipsec_secgw.rst:
- Restructured configuration file format sections
- Improved security association descriptions
- Enhanced clarity of IPsec gateway operations

ipv4_multicast.rst:
- Simplified multicast forwarding explanations
- Fixed formatting in code examples
- Improved readability of routing descriptions

keep_alive.rst:
- Enhanced keep-alive mechanism descriptions
- Fixed minor formatting issues

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 doc/guides/sample_app_ug/ip_reassembly.rst  |  57 ++++++-----
 doc/guides/sample_app_ug/ipsec_secgw.rst    | 108 ++++++++++----------
 doc/guides/sample_app_ug/ipv4_multicast.rst |  26 ++---
 doc/guides/sample_app_ug/keep_alive.rst     |  12 +--
 4 files changed, 105 insertions(+), 98 deletions(-)

diff --git a/doc/guides/sample_app_ug/ip_reassembly.rst b/doc/guides/sample_app_ug/ip_reassembly.rst
index 04b581a489..02ed9f0472 100644
--- a/doc/guides/sample_app_ug/ip_reassembly.rst
+++ b/doc/guides/sample_app_ug/ip_reassembly.rst
@@ -4,15 +4,15 @@
 IP Reassembly Sample Application
 ================================
 
-The L3 Forwarding application is a simple example of packet processing using the DPDK.
-The application performs L3 forwarding with reassembly for fragmented IPv4 and IPv6 packets.
+The IP Reassembly application is a simple example of packet processing using the DPDK.
+The application performs L3 forwarding with reassembly of fragmented IPv4 and IPv6 packets.
 
 Overview
 --------
 
 The application demonstrates the use of the DPDK libraries to implement packet forwarding
 with reassembly for IPv4 and IPv6 fragmented packets.
-The initialization and run- time paths are very similar to those of the :doc:`l2_forward_real_virtual`.
+The initialization and run-time paths are very similar to those of the :doc:`l2_forward_real_virtual`.
 The main difference from the L2 Forwarding sample application is that
 it reassembles fragmented IPv4 and IPv6 packets before forwarding.
 The maximum allowed size of reassembled packet is 9.5 KB.
@@ -21,11 +21,11 @@ There are two key differences from the L2 Forwarding sample application:
 
 *   The first difference is that the forwarding decision is taken based on information read from the input packet's IP header.
 
-*   The second difference is that the application differentiates between IP and non-IP traffic by means of offload flags.
+*   The second difference is that the applications differentiate between IP and non-IP traffic by means of offload flags.
 
 The Longest Prefix Match (LPM for IPv4, LPM6 for IPv6) table is used to store/lookup an outgoing port number,
-associated with that IPv4 address. Any unmatched packets are forwarded to the originating port.
-
+associated with that IPv4 address.
+Any unmatched packets are forwarded to the originating port.
 
 Compiling the Application
 -------------------------
@@ -46,15 +46,20 @@ The application has a number of command line options:
 
 where:
 
-*   -p PORTMASK: Hexadecimal bitmask of ports to configure
+``-p PORTMASK``
+    Hexadecimal bitmask of ports to configure
 
-*   -q NQ: Number of RX queues per lcore
+``-q NQ``
+    Number of RX queues per lcore
 
-*   --maxflows=FLOWS: determines maximum number of active fragmented flows (1-65535). Default value: 4096.
+``--maxflows=FLOWS``
+    Determines the maximum number of active fragmented flows (1-65535).
+    Default value: 4096.
 
-*   --flowttl=TTL[(s|ms)]: determines maximum Time To Live for fragmented packet.
-    If all fragments of the packet wouldn't appear within given time-out,
-    then they are considered as invalid and will be dropped.
+``--flowttl=TTL[(s|ms)]``
+    Determines the maximum Time To Live for fragmented packets.
+    If all fragments of the packet do not appear within the given timeout,
+    then they are considered invalid and will be dropped.
     Valid range is 1ms - 3600s. Default value: 1s.
 
 To run the example in a Linux environment with 2 lcores (2,4) over 2 ports(0,2)
@@ -69,7 +74,7 @@ with 1 Rx queue per lcore:
     Skipping disabled port 1
     Initializing port 2 on lcore 4... Address:00:1B:21:5C:FF:54, rxq=0 txq=2,0 txq=4,1
     done: Link Up - speed 10000 Mbps - full-duplex
-    Skipping disabled port 3IP_FRAG: Socket 0: adding route 100.10.0.0/16 (port 0)
+    Skipping disabled port 3 IP_FRAG: Socket 0: adding route 100.10.0.0/16 (port 0)
     IP_RSMBL: Socket 0: adding route 100.20.0.0/16 (port 1)
     ...
     IP_RSMBL: Socket 0: adding route 0101:0101:0101:0101:0101:0101:0101:0101/48 (port 0)
@@ -110,7 +115,7 @@ The default l3fwd_ipv6_route_array table is:
     :end-before: >8 End of default l3fwd_ipv6_route_array table.
 
 For example, for the fragmented input IPv4 packet with destination address: 100.10.1.1,
-a reassembled IPv4 packet be sent out from port #0 to the destination address 100.10.1.1
+a reassembled IPv4 packet will be sent out from port #0 to the destination address 100.10.1.1
 once all the fragments are collected.
 
 Explanation
@@ -119,12 +124,12 @@ Explanation
 The following sections provide in-depth explanation of the sample application code.
 As mentioned in the overview section, the initialization and run-time paths
 are very similar to those of the :doc:`l2_forward_real_virtual`.
-The following sections describe aspects that are specific to the IP reassemble sample application.
+The following sections describe aspects that are specific to the IP reassembly sample application.
 
 IPv4 Fragment Table Initialization
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-This application uses the :doc:`../prog_guide/ip_fragment_reassembly_lib` library.
+This application uses the IP Fragment and Reassembly Library (:doc:`../prog_guide/ip_fragment_reassembly_lib`).
 The fragment table maintains information about already received fragments of the packet.
 Each IP packet is uniquely identified by triple <Source IP address>, <Destination IP address>, <ID>.
 To avoid lock contention, each Rx queue has its own fragment table.
@@ -144,7 +149,7 @@ Mempools Initialization
 
 The reassembly application demands a lot of mbuf's to be allocated.
 At any given time, up to (2 \* max_flow_num \* RTE_LIBRTE_IP_FRAG_MAX_FRAG \* <maximum number of mbufs per packet>)
-can be stored inside the fragment table waiting for remaining fragments.
+can be stored inside the fragment table, waiting for remaining fragments.
 To keep mempool size under reasonable limits
 and to avoid a situation when one Rx queue can starve other queues,
 each Rx queue uses its own mempool.
@@ -161,8 +166,8 @@ Packet Reassembly and Forwarding
 For each input packet, the packet forwarding operation is done by the l3fwd_simple_forward() function.
 If the packet is an IPv4 or IPv6 fragment, then it calls ``rte_ipv4_reassemble_packet()`` for IPv4 packets,
 or ``rte_ipv6_reassemble_packet()`` for IPv6 packets.
-These functions either return a pointer to a valid mbuf that contains a reassembled packet,
-or NULL (if the packet can't be reassembled for some reason).
+These functions return either a pointer to a valid mbuf that contains a reassembled packet,
+or NULL (if the packet cannot be reassembled for some reason).
 Then, ``l3fwd_simple_forward()`` continues with the code for the packet forwarding decision
 (that is, the identification of the output interface for the packet) and
 actual transmit of the packet.
@@ -171,25 +176,25 @@ The ``rte_ipv4_reassemble_packet()`` or ``rte_ipv6_reassemble_packet()`` are res
 
 #.  Searching the fragment table for entry with packet's <IP Source Address, IP Destination Address, Packet ID>
 
-#.  If the entry is found, then check if that entry already timed-out.
+#.  If the entry is found, then check whether that entry has already timed out.
     If yes, then free all previously received fragments,
     and remove information about them from the entry.
 
 #.  If no entry with such key is found, then try to create a new one by one of two ways:
 
-    #.  Use as empty entry
+    #.  Use an empty entry
 
-    #.  Delete a timed-out entry, free mbufs associated with it mbufs and store a new entry with specified key in it.
+    #.  Delete a timed-out entry, free mbufs associated with it, and store a new entry with the specified key in it.
 
-#.  Update the entry with new fragment information and check
-    if a packet can be reassembled (the packet's entry contains all fragments).
+#.  Update the entry with new fragment information and check whether
+    the packet can be reassembled (the packet's entry contains all fragments).
 
     #.  If yes, then, reassemble the packet, mark table's entry as empty and return the reassembled mbuf to the caller.
 
     #.  If no, then just return a NULL to the caller.
 
-If at any stage of packet processing a reassembly function encounters an error
-(can't insert new entry into the Fragment table, or invalid/timed-out fragment),
+If at any stage of packet processing, a reassembly function encounters an error
+(cannot insert new entry into the Fragment table, or invalid/timed-out fragment),
 then it will free all associated with the packet fragments,
 mark the table entry as invalid and return NULL to the caller.
 
diff --git a/doc/guides/sample_app_ug/ipsec_secgw.rst b/doc/guides/sample_app_ug/ipsec_secgw.rst
index 7c31c96b7c..8826ffb286 100644
--- a/doc/guides/sample_app_ug/ipsec_secgw.rst
+++ b/doc/guides/sample_app_ug/ipsec_secgw.rst
@@ -11,40 +11,40 @@ application using DPDK cryptodev framework.
 Overview
 --------
 
-The application demonstrates the implementation of a Security Gateway
-(not IPsec compliant, see the Constraints section below) using DPDK based on RFC4301,
-RFC4303, RFC3602 and RFC2404.
+This application demonstrates the implementation of a Security Gateway
+(not fully IPsec-compliant; see the Constraints section) using DPDK, based
+on RFC4301, RFC4303, RFC3602, and RFC2404.
 
-Internet Key Exchange (IKE) is not implemented, so only manual setting of
-Security Policies and Security Associations is supported.
+Internet Key Exchange (IKE) is not implemented in this example; only manual
+setting of Security Policies and Security Associations is supported.
 
 The Security Policies (SP) are implemented as ACL rules, the Security
-Associations (SA) are stored in a table and the routing is implemented
+Associations (SA) are stored in a table, and the routing is implemented
 using LPM.
 
-The application classifies the ports as *Protected* and *Unprotected*.
-Thus, traffic received on an Unprotected or Protected port is consider
-Inbound or Outbound respectively.
+The application classifies ports as *Protected* or *Unprotected*, with traffic
+received on Unprotected ports considered Inbound and traffic on Protected ports
+considered Outbound.
 
 The application also supports complete IPsec protocol offload to hardware
-(Look aside crypto accelerator or using ethernet device). It also support
-inline ipsec processing by the supported ethernet device during transmission.
-These modes can be selected during the SA creation configuration.
+using crypto acceleration hardware or NIC with crypto acceleration. It also
+supports inline IPsec processing by supported Ethernet devices during
+transmission. These modes can be selected during SA creation.
 
-In case of complete protocol offload, the processing of headers(ESP and outer
-IP header) is done by the hardware and the application does not need to
-add/remove them during outbound/inbound processing.
+In case of complete protocol offload, the processing of headers (ESP and
+outer IP header) is done by the hardware and the application does not need
+to add/remove them during Outbound/Inbound processing.
 
-For inline offloaded outbound traffic, the application will not do the LPM
-lookup for routing, as the port on which the packet has to be forwarded will be
-part of the SA. Security parameters will be configured on that port only, and
+For inline offloaded Outbound traffic, the application does not perform the
+LPM lookup for routing, as the port on which the packet is to be forwarded
+is part of the SA. Security parameters are configured on that port only, and
 sending the packet on other ports could result in unencrypted packets being
 sent out.
 
 The Path for IPsec Inbound traffic is:
 
 *  Read packets from the port.
-*  Classify packets between IPv4 and ESP.
+*  Classify packets as IPv4 or ESP.
 *  Perform Inbound SA lookup for ESP packets based on their SPI.
 *  Perform Verification/Decryption (Not needed in case of inline ipsec).
 *  Remove ESP and outer IP header (Not needed in case of protocol offload).
@@ -64,28 +64,30 @@ The Path for the IPsec Outbound traffic is:
 
 The application supports two modes of operation: poll mode and event mode.
 
-* In the poll mode a core receives packets from statically configured list
+* In the poll mode, a core receives packets from statically configured list
   of eth ports and eth ports' queues.
 
-* In the event mode a core receives packets as events. After packet processing
-  is done core submits them back as events to an event device. This enables
-  multicore scaling and HW assisted scheduling by making use of the event device
-  capabilities. The event mode configuration is predefined. All packets reaching
-  given eth port will arrive at the same event queue. All event queues are mapped
-  to all event ports. This allows all cores to receive traffic from all ports.
-  Since the underlying event device might have varying capabilities, the worker
-  threads can be drafted differently to maximize performance. For example, if an
-  event device - eth device pair has Tx internal port, then application can call
-  rte_event_eth_tx_adapter_enqueue() instead of regular rte_event_enqueue_burst().
-  So a thread which assumes that the device pair has internal port will not be the
-  right solution for another pair. The infrastructure added for the event mode aims
-  to help application to have multiple worker threads by maximizing performance from
-  every type of event device without affecting existing paths/use cases. The worker
-  to be used will be determined by the operating conditions and the underlying device
+* In event mode, a core receives packets as events. After processing,
+  the core submits them back as events to an event device. This enables
+  multicore scaling and hardware-assisted scheduling by making use of
+  the event device capabilities. The event mode configuration is
+  predefined. All packets reaching a given Ethernet port arrive at the
+  same event queue. All event queues are mapped to all event ports,
+  allowing all cores to receive traffic from all ports.
+  Since the underlying event device might have varying capabilities,
+  worker threads can be designed differently to maximize performance.
+  For example, if an event device-Ethernet device pair has a Tx internal
+  port, the application can call ``rte_event_eth_tx_adapter_enqueue()``
+  instead of ``rte_event_enqueue_burst()``. A thread that assumes the
+  device pair has an internal port may not be suitable for another pair.
+  The event mode infrastructure supports multiple worker threads,
+  maximizing performance from every type of event device without
+  affecting existing paths or use cases. The worker to be used is
+  determined by the operating conditions and underlying device
   capabilities.
+
   **Currently the application provides non-burst, internal port worker threads.**
-  It also provides infrastructure for non-internal port
-  however does not define any worker threads.
+  It also provides infrastructure for non-internal ports but does not define any worker threads.
 
   Event mode also supports event vectorization. The event devices, ethernet device
   pairs which support the capability ``RTE_EVENT_ETH_RX_ADAPTER_CAP_EVENT_VECTOR`` can
@@ -99,7 +101,7 @@ The application supports two modes of operation: poll mode and event mode.
   ``RTE_EVENT_CRYPTO_ADAPTER_CAP_EVENT_VECTOR`` vector aggregation
   could also be enable using event-vector option.
 
-Additionally the event mode introduces two submodes of processing packets:
+Additionally, the event mode introduces two submodes of processing packets:
 
 * Driver submode: This submode has bare minimum changes in the application to support
   IPsec. There are no lookups, no routing done in the application. And for inline
@@ -115,7 +117,7 @@ Additionally the event mode introduces two submodes of processing packets:
   benchmark numbers.
 
 Constraints
------------
+~~~~~~~~~~~
 
 *  No IPv6 options headers.
 *  No AH mode.
@@ -127,7 +129,7 @@ Constraints
 Compiling the Application
 -------------------------
 
-To compile the sample application see :doc:`compiling`.
+To compile the sample application, see :doc:`compiling`.
 
 The application is located in the ``ipsec-secgw`` sub-directory.
 
@@ -354,7 +356,7 @@ where each option means:
 Refer to the *DPDK Getting Started Guide* for general information on running
 applications and the Environment Abstraction Layer (EAL) options.
 
-The application would do a best effort to "map" crypto devices to cores, with
+The application makes a best effort to "map" crypto devices to cores, with
 hardware devices having priority. Basically, hardware devices if present would
 be assigned to a core before software ones.
 This means that if the application is using a single core and both hardware
@@ -377,7 +379,7 @@ For example, something like the following command line:
 
 
 Configurations
---------------
+~~~~~~~~~~~~~~
 
 The following sections provide the syntax of configurations to initialize
 your SP, SA, Routing, Flow and Neighbour tables.
@@ -390,17 +392,17 @@ accordingly.
 Configuration File Syntax
 ~~~~~~~~~~~~~~~~~~~~~~~~~
 
-As mention in the overview, the Security Policies are ACL rules.
-The application parsers the rules specified in the configuration file and
-passes them to the ACL table, and replicates them per socket in use.
+As mentioned in the overview, the Security Policies are ACL rules.
+The application parses the rules specified in the configuration file and
+passes them to ACL table, and replicates them per socket in use.
 
-Following are the configuration file syntax.
+The following sections contains the configuration file syntax.
 
 General rule syntax
 ^^^^^^^^^^^^^^^^^^^
 
 The parse treats one line in the configuration file as one configuration
-item (unless the line concatenation symbol exists). Every configuration
+item (unless line concatenation is used). Every configuration
 item shall follow the syntax of either SP, SA, Routing, Flow or Neighbour
 rules specified below.
 
@@ -711,7 +713,7 @@ where each options means:
  * Port/device ID of the ethernet/crypto accelerator for which the SA is
    configured. For *inline-crypto-offload* and *inline-protocol-offload*, this
    port will be used for routing. The routing table will not be referred in
-   this case.
+   that case.
 
  * Optional: No, if *type* is not *no-offload*
 
@@ -766,7 +768,7 @@ where each options means:
 
  ``<mss>``
 
- * Maximum segment size for TSO offload, available for egress SAs only.
+ * Maximum segment size for TSO offload; available for egress SAs only.
    Currently only supports TCP/IP.
 
  * Optional: Yes, TSO offload not set by default
@@ -1113,7 +1115,7 @@ available.
 Server configuration
 ~~~~~~~~~~~~~~~~~~~~
 
-Two servers are required for the tests, SUT and DUT.
+Two servers are required for the tests: SUT and DUT.
 
 Make sure the user from the SUT can ssh to the DUT without entering the password.
 To enable this feature keys must be setup on the DUT.
@@ -1152,7 +1154,7 @@ It then tries to perform some data transfer using the scheme described above.
 Usage
 ~~~~~
 
-In the ipsec-secgw/test directory run
+In the ipsec-secgw/test directory run:
 
 /bin/bash run_test.sh <options> <ipsec_mode>
 
@@ -1165,7 +1167,7 @@ Available options:
     selected.
 
 *   ``-m`` Add IPSec tunnel mixed IP version tests - outer IP version different
-    than inner. Inner IP version will match selected option [-46].
+    from inner. Inner IP version will match selected option [-46].
 
 *   ``-i`` Run tests in inline mode. Regular tests will not be invoked.
 
@@ -1185,4 +1187,4 @@ Available options:
 *   ``-h`` Show usage.
 
 If <ipsec_mode> is specified, only tests for that mode will be invoked. For the
-list of available modes please refer to run_test.sh.
+list of available modes, please refer to run_test.sh.
diff --git a/doc/guides/sample_app_ug/ipv4_multicast.rst b/doc/guides/sample_app_ug/ipv4_multicast.rst
index 3eb8b95f29..4304be665d 100644
--- a/doc/guides/sample_app_ug/ipv4_multicast.rst
+++ b/doc/guides/sample_app_ug/ipv4_multicast.rst
@@ -63,7 +63,7 @@ where,
 
 *   -p PORTMASK: Hexadecimal bitmask of ports to configure
 
-*   -q NQ: determines the number of queues per lcore
+*   -q NQ: number of queues per lcore
 
 .. note::
 
@@ -125,7 +125,7 @@ Forwarding
 ~~~~~~~~~~
 
 All forwarding is done inside the ``mcast_forward()`` function.
-Firstly, the Ethernet* header is removed from the packet and the IPv4 address is extracted from the IPv4 header:
+Firstly, the Ethernet header is removed from the packet and the IPv4 address is extracted from the IPv4 header:
 
 .. literalinclude:: ../../../examples/ipv4_multicast/main.c
     :language: c
@@ -170,7 +170,7 @@ with the Ethernet address 01:00:5e:00:00:00, as per RFC 1112:
     :start-after: Construct Ethernet multicast address from IPv4 multicast Address. 8<
     :end-before: >8 End of Construction of multicast address from IPv4 multicast address.
 
-Packets are then dispatched to the destination ports according to the portmask associated with a multicast group:
+Packets are then dispatched to the destination ports according to the port mask associated with the multicast group:
 
 .. literalinclude:: ../../../examples/ipv4_multicast/main.c
     :language: c
@@ -190,12 +190,11 @@ Buffer Cloning
 
 This is the most important part of the application
 since it demonstrates the use of zero-copy buffer cloning.
-There are two approaches for creating the outgoing packet.
-Although both are based on the data zero-copy idea,
-there are some differences in the details.
+There are two approaches for creating outgoing packets.
+Both are based on the zero-copy idea, but they differ in implementation details.
 
-The first approach creates a clone of the input packet. For example,
-walk though all segments of the input packet and for each of segment,
+The first approach creates a clone of the input packet:
+walk through all segments of the input packet and for each segment,
 create a new buffer and attach that new buffer to the segment
 (refer to ``rte_pktmbuf_clone()`` in the mbuf library for more details).
 A new buffer is then allocated for the packet header and is prepended to the cloned buffer.
@@ -205,18 +204,19 @@ It simply increments the reference counter for all input packet segments,
 allocates a new buffer for the packet header and prepends it to the input packet.
 
 Basically, the first approach reuses only the input packet's data, but creates its own copy of packet's metadata.
-The second approach reuses both input packet's data and metadata.
+The second approach reuses both the input packet's data and metadata.
 
-The advantage of the first approach is that each outgoing packet has its own copy of the metadata,
+The advantage of the first approach is that each outgoing packet has its own copy of metadata,
 so we can safely modify the data pointer of the input packet.
-That allows us to skip creation if the output packet is for the last destination port
+That allows us to skip packet creation if the output packet is for the last destination port
 and, instead, modify the input packet's header in place.
 For example, for N destination ports, we need to invoke ``mcast_out_pkt()`` (N-1) times.
 
 The advantage of the second approach is that there is less work to be done for each outgoing packet.
 The "clone" operation is skipped completely.
-However, there is a price to pay.
-The input packet's metadata must remain intact. For N destination ports,
+However, there is a price to pay:
+the input packet's metadata must remain intact.
+For N destination ports,
 we need to invoke ``mcast_out_pkt()`` (N) times.
 
 Therefore, for a small number of outgoing ports (and segments in the input packet),
diff --git a/doc/guides/sample_app_ug/keep_alive.rst b/doc/guides/sample_app_ug/keep_alive.rst
index 8ae9d7c689..9353b65e91 100644
--- a/doc/guides/sample_app_ug/keep_alive.rst
+++ b/doc/guides/sample_app_ug/keep_alive.rst
@@ -5,7 +5,7 @@ Keep Alive Sample Application
 =============================
 
 The Keep Alive application is a simple example of a
-heartbeat/watchdog for packet processing cores. It demonstrates how
+heartbeat and watchdog for packet processing cores. It demonstrates how
 to detect 'failed' DPDK cores and notify a fault management entity
 of this failure. Its purpose is to ensure the failure of the core
 does not result in a fault that is not detectable by a management
@@ -19,7 +19,7 @@ The application demonstrates how to protect against 'silent outages'
 on packet processing cores. A Keep Alive Monitor Agent Core (main)
 monitors the state of packet processing cores (worker cores) by
 dispatching pings at a regular time interval (default is 5ms) and
-monitoring the state of the cores. Cores states are: Alive, MIA, Dead
+monitoring the state of the cores. Core states are: Alive, MIA, Dead
 or Buried. MIA indicates a missed ping, and Dead indicates two missed
 pings within the specified time interval. When a core is Dead, a
 callback function is invoked to restart the packet processing core;
@@ -80,16 +80,16 @@ Explanation
 
 The following sections provide explanation of the
 Keep-Alive/'Liveliness' conceptual scheme. As mentioned in the
-overview section, the initialization and run-time paths are very
+Overview section, the initialization and run-time paths are very
 similar to those of the :doc:`l2_forward_real_virtual`.
 
 The Keep-Alive/'Liveliness' conceptual scheme:
 
-* A Keep- Alive Agent Runs every N Milliseconds.
+* A keep-alive agent runs every N milliseconds.
 
-* DPDK Cores respond to the keep-alive agent.
+* DPDK cores respond to the keep-alive agent.
 
-* If a keep-alive agent detects time-outs, it notifies the
+* If a keep-alive agent detects timeouts, it notifies the
   fault management entity through a callback function.
 
 The following sections provide explanation of the code aspects
-- 
2.53.0


^ permalink raw reply related

* [PATCH 08/15] doc: enhance hello_world, intro, IP frag and pipeline
From: Stephen Hemminger @ 2026-06-11 21:18 UTC (permalink / raw)
  To: dev
  Cc: Stephen Hemminger, Bruce Richardson, Konstantin Ananyev,
	Cristian Dumitrescu
In-Reply-To: <20260611212119.1026721-1-stephen@networkplumber.org>

Improved documentation across multiple sample applications:

hello_world.rst:
- Minor formatting and clarity improvements

intro.rst:
- Updated sample application descriptions for accuracy
- Fixed formatting and improved readability
- Clarified application purposes and use cases

ip_frag.rst:
- Enhanced command-line option descriptions
- Improved flow explanations
- Fixed terminology and formatting consistency

ip_pipeline.rst:
- Restructured pipeline configuration sections
- Improved CLI command descriptions
- Enhanced clarity of pipeline concepts

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 doc/guides/sample_app_ug/hello_world.rst |   6 +-
 doc/guides/sample_app_ug/intro.rst       |  34 +++----
 doc/guides/sample_app_ug/ip_frag.rst     |  46 +++++-----
 doc/guides/sample_app_ug/ip_pipeline.rst | 109 ++++++++++++-----------
 4 files changed, 103 insertions(+), 92 deletions(-)

diff --git a/doc/guides/sample_app_ug/hello_world.rst b/doc/guides/sample_app_ug/hello_world.rst
index b7167aa345..603a1d8767 100644
--- a/doc/guides/sample_app_ug/hello_world.rst
+++ b/doc/guides/sample_app_ug/hello_world.rst
@@ -9,7 +9,7 @@ Overview
 --------
 
 The Hello World sample application is an example of the simplest DPDK application that can be written.
-The application simply prints an "helloworld" message on every enabled lcore.
+The application simply prints a "helloworld" message on every enabled lcore.
 
 Compiling the Application
 -------------------------
@@ -21,7 +21,7 @@ The application is located in the ``helloworld`` sub-directory.
 Running the Application
 -----------------------
 
-To run the example in a linux environment:
+To run the example in a Linux environment:
 
 .. code-block:: console
 
@@ -50,7 +50,7 @@ This call finishes the initialization process that was started before main() is
 The argc and argv arguments are provided to the rte_eal_init() function.
 The value returned is the number of parsed arguments.
 
-Starting Application Unit Lcores
+Starting Application on Lcores
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Once the EAL is initialized, the application is ready to launch a function on an lcore.
diff --git a/doc/guides/sample_app_ug/intro.rst b/doc/guides/sample_app_ug/intro.rst
index a19c0b8c13..dfd0254b0c 100644
--- a/doc/guides/sample_app_ug/intro.rst
+++ b/doc/guides/sample_app_ug/intro.rst
@@ -4,9 +4,9 @@
 Introduction to the DPDK Sample Applications
 ============================================
 
-The DPDK Sample Applications are small standalone applications that
-demonstrate various features of DPDK. They can be considered as a cookbook of
-DPDK features. Users interested in getting started with DPDK can take the
+The DPDK sample applications are small standalone applications that
+demonstrate various features of DPDK. They serve as practical examples of
+DPDK functionality. Users interested in getting started with DPDK can take the
 applications, try out the features, and then extend them to fit their needs.
 
 
@@ -30,7 +30,7 @@ examples are highlighted below.
 
 
 * :doc:`Hello World<hello_world>`: As with most introductions to a
-  programming framework, a good place to start is with the Hello World
+  programming framework, a good place to start is the Hello World
   application. The Hello World example sets up the DPDK Environment Abstraction
   Layer (EAL), and prints a simple "Hello World" message to each of the DPDK
   enabled cores. This application doesn't do any packet forwarding, but it is a
@@ -38,32 +38,32 @@ examples are highlighted below.
 
 * :doc:`Basic Forwarding/Skeleton Application<skeleton>`: The Basic
   Forwarding/Skeleton contains the minimum amount of code required to enable
-  basic packet forwarding with DPDK. This allows you to test if your network
+  basic packet forwarding with DPDK. This allows you to test whether your network
   interfaces are working with DPDK.
 
 * :doc:`Network Layer 2 forwarding<l2_forward_real_virtual>`: The Network Layer 2
   forwarding, or ``l2fwd`` application does forwarding based on Ethernet MAC
   addresses like a simple switch.
 
-* :doc:`Network Layer 2 forwarding<l2_forward_event>`: The Network Layer 2
-  forwarding, or ``l2fwd-event`` application does forwarding based on Ethernet MAC
-  addresses like a simple switch. It demonstrates usage of poll and event mode
-  IO mechanism under a single application.
+* :doc:`Network Layer 2 forwarding with event mode<l2_forward_event>`: The
+  ``l2fwd-event`` application does forwarding based on Ethernet MAC addresses
+  like a simple switch. It demonstrates usage of poll and event mode IO
+  mechanisms under a single application.
 
-* :doc:`Network Layer 3 forwarding<l3_forward>`: The Network Layer3
+* :doc:`Network Layer 3 forwarding<l3_forward>`: The Network Layer 3
   forwarding, or ``l3fwd`` application does forwarding based on Internet
   Protocol, IPv4 or IPv6 like a simple router.
 
-* :doc:`Network Layer 3 forwarding Graph<l3_forward_graph>`: The Network Layer3
+* :doc:`Network Layer 3 forwarding Graph<l3_forward_graph>`: The Network Layer 3
   forwarding Graph, or ``l3fwd_graph`` application does forwarding based on IPv4
-  like a simple router with DPDK Graph framework.
+  like a simple router with the DPDK Graph framework.
 
 * :doc:`Hardware packet copying<dma>`: The Hardware packet copying,
-  or ``dmafwd`` application demonstrates how to use DMAdev library for
+  or ``dmafwd`` application demonstrates how to use the DMAdev library for
   copying packets between two threads.
 
 * :doc:`Packet Distributor<dist_app>`: The Packet Distributor
-  demonstrates how to distribute packets arriving on an Rx port to different
+  demonstrates how to distribute packets arriving on a receive port to different
   cores for processing and transmission.
 
 * :doc:`Multi-Process Application<multi_process>`: The
@@ -78,9 +78,9 @@ examples are highlighted below.
   and TX packet processing functions.
 
 * :doc:`IPsec Security Gateway<ipsec_secgw>`: The IPsec Security
-  Gateway application is a minimal example of something closer to a real world
-  example. This is also a good example of an application using the DPDK
-  Cryptodev framework.
+  Gateway application demonstrates a minimal implementation that is closer to
+  a real-world use case. This is also a good example of an application using
+  the DPDK Cryptodev framework.
 
 * :doc:`Precision Time Protocol (PTP) client<ptpclient>`: The PTP
   client is another minimal implementation of a real world application.
diff --git a/doc/guides/sample_app_ug/ip_frag.rst b/doc/guides/sample_app_ug/ip_frag.rst
index d2c66683e3..0d0a01ac0a 100644
--- a/doc/guides/sample_app_ug/ip_frag.rst
+++ b/doc/guides/sample_app_ug/ip_frag.rst
@@ -15,23 +15,28 @@ The application demonstrates the use of zero-copy buffers for packet fragmentati
 The initialization and run-time paths are very similar to those of the :doc:`l2_forward_real_virtual`.
 This guide highlights the differences between the two applications.
 
-There are three key differences from the L2 Forwarding sample application:
+Key differences from the L2 Forwarding sample application:
 
-*   The first difference is that the IP Fragmentation sample application makes use of indirect buffers.
+indirect buffers
+    The IP Fragmentation application uses indirect buffers for zero-copy packet fragmentation.
 
-*   The second difference is that the forwarding decision is taken
-    based on information read from the input packet's IP header.
+IP-based forwarding
+    Forwarding decisions are based on the destination IP address in the packet header,
+    using Longest Prefix Match (LPM) lookup.
 
-*   The third difference is that the application differentiates between
-    IP and non-IP traffic by means of offload flags.
+traffic classification
+    The application distinguishes IP traffic from non-IP traffic using packet offload flags.
 
-The Longest Prefix Match (LPM for IPv4, LPM6 for IPv6) table
+The application supports both IPv4 and IPv6 packet fragmentation.
+
+The Longest Prefix Match table (LPM for IPv4, LPM6 for IPv6)
 is used to store/lookup an outgoing port number associated with that IP address.
 Any unmatched packets are forwarded to the originating port.
 
 By default, input frame sizes up to 9.5 KB are supported.
 Before forwarding, the input IP packet is fragmented
-to fit into the "standard" Ethernet* v2 MTU (1500 bytes).
+to fit the standard Ethernet v2 MTU of 1500 bytes (the L3 payload size,
+excluding the Ethernet frame overhead).
 
 Compiling the Application
 -------------------------
@@ -43,10 +48,10 @@ The application is located in the ``ip_fragmentation`` sub-directory.
 Running the Application
 -----------------------
 
-The LPM object is created and loaded with the pre-configured entries read from
-global l3fwd_ipv4_route_array and l3fwd_ipv6_route_array tables.
-For each input packet, the packet forwarding decision
-(that is, the identification of the output interface for the packet) is taken as a result of LPM lookup.
+The application creates an LPM object and populates it with pre-configured routing entries
+from the global ``l3fwd_ipv4_route_array`` and ``l3fwd_ipv6_route_array`` tables.
+For each input packet, the forwarding decision (output interface selection)
+is determined by an LPM lookup on the destination IP address.
 If the IP packet size is greater than the default output MTU,
 then the input packet is fragmented and several fragments are sent via the output interface.
 
@@ -56,13 +61,13 @@ Application usage:
 
     ./<build_dir>/examples/dpdk-ip_fragmentation [EAL options] -- -p PORTMASK [-q NQ]
 
-where:
+where,
 
-*   -p PORTMASK is a hexadecimal bitmask of ports to configure
+*   ``-p PORTMASK``: hexadecimal bitmask of ports to configure
 
 *   -q NQ: Maximum number of queues per lcore (default is 1)
 
-To run the example in linux environment with 2 lcores (2,4) over 2 ports(0,2) with 1 RX queue per lcore:
+To run the example in a Linux environment with 2 lcores (2,4) over 2 ports (0,2) with 1 RX queue per lcore:
 
 .. code-block:: console
 
@@ -73,12 +78,12 @@ To run the example in linux environment with 2 lcores (2,4) over 2 ports(0,2) wi
     Skipping disabled port 1
     Initializing port 2 on lcore 4... Address:00:1B:21:5C:FF:54, rxq=0 txq=2,0 txq=4,1
     done: Link Up - speed 10000 Mbps - full-duplex
-    Skipping disabled port 3IP_FRAG: Socket 0: adding route 100.10.0.0/16 (port 0)
+    Skipping disabled port 3
+    IP_FRAG: Socket 0: adding route 100.10.0.0/16 (port 0)
     IP_FRAG: Socket 0: adding route 100.20.0.0/16 (port 1)
     ...
     IP_FRAG: Socket 0: adding route 0101:0101:0101:0101:0101:0101:0101:0101/48 (port 0)
     IP_FRAG: Socket 0: adding route 0201:0101:0101:0101:0101:0101:0101:0101/48 (port 1)
-    ...
     IP_FRAG: entering main loop on lcore 4
     IP_FRAG: -- lcoreid=4 portid=2
     IP_FRAG: entering main loop on lcore 2
@@ -108,10 +113,11 @@ The default l3fwd_ipv6_route_array table is:
     :end-before: >8 End of default l3fwd_ipv6_route_array table.
 
 For example, for the input IPv4 packet with destination address: 100.10.1.1 and packet length 9198 bytes,
-seven IPv4 packets will be sent out from port #0 to the destination address 100.10.1.1:
-six of those packets will have length 1500 bytes and one packet will have length 318 bytes.
+the application will fragment it into seven packets sent out from port 0:
+six fragments of 1500 bytes each (the MTU limit for L3 payload) and one final fragment of 318 bytes.
+
 IP Fragmentation sample application provides basic NUMA support
-in that all the memory structures are allocated on all sockets that have active lcores on them.
+in that all memory structures are allocated on all sockets that have active lcores on them.
 
 
 Refer to the *DPDK Getting Started Guide* for general information on running applications
diff --git a/doc/guides/sample_app_ug/ip_pipeline.rst b/doc/guides/sample_app_ug/ip_pipeline.rst
index f9e8caa0a8..1c8ad05061 100644
--- a/doc/guides/sample_app_ug/ip_pipeline.rst
+++ b/doc/guides/sample_app_ug/ip_pipeline.rst
@@ -4,15 +4,15 @@
 Internet Protocol (IP) Pipeline Application
 ===========================================
 
-Application overview
---------------------
+Overview
+--------
 
-The *Internet Protocol (IP) Pipeline* application is intended to be a vehicle for rapid development of packet processing
+The *Internet Protocol (IP) Pipeline* application is a vehicle for rapid development of packet processing
 applications on multi-core CPUs.
 
-Following OpenFlow and P4 design principles, the application can be used to create functional blocks called pipelines out
-of input/output ports, tables and actions in a modular way. Multiple pipelines can be inter-connected through packet queues
-to create complete applications (super-pipelines).
+Following OpenFlow and P4 design principles, the application can be used to create functional blocks called pipelines
+from input/output ports, tables and actions in a modular way. Multiple pipelines can be inter-connected through packet
+queues to create complete applications (super-pipelines).
 
 The pipelines are mapped to application threads, with each pipeline executed by a single thread and each thread able to run
 one or several pipelines. The possibilities of creating pipelines out of ports, tables and actions, connecting multiple
@@ -21,13 +21,14 @@ a true application generator.
 
 Pipelines are created and managed through Command Line Interface (CLI):
 
- * Any standard TCP client (e.g. telnet, netcat, custom script, etc) is typically able to connect to the application, send
+ * Any standard TCP client (e.g. telnet, netcat, custom script, etc.) is typically able to connect to the application, send
    commands through the network and wait for the response before pushing the next command.
 
  * All the application objects are created and managed through CLI commands:
-    * 'Primitive' objects used to create pipeline ports: memory pools, links (i.e. network interfaces), SW queues, traffic managers, etc.
-    * Action profiles: used to define the actions to be executed by pipeline input/output ports and tables.
-    * Pipeline components: input/output ports, tables, pipelines, mapping of pipelines to execution threads.
+
+   * 'Primitive' objects used to create pipeline ports: memory pools, links (i.e. network interfaces), SW queues, traffic managers, etc.
+   * Action profiles: used to define the actions to be executed by pipeline input/output ports and tables.
+   * Pipeline components: input/output ports, tables, pipelines, mapping of pipelines to execution threads.
 
 Running the application
 -----------------------
@@ -85,7 +86,7 @@ The application should start successfully and display as follows:
     EAL:   probe driver: 8086:10fb net_ixgbe
     ...
 
-To run remote client (e.g. telnet) to communicate with the ip pipeline application:
+To run a remote client (for example, telnet) to communicate with the IP pipeline application:
 
 .. code-block:: console
 
@@ -103,19 +104,21 @@ When running a telnet client as above, command prompt is displayed:
 
     pipeline>
 
-Once application and telnet client start running, messages can be sent from client to application.
-At any stage, telnet client can be terminated using the quit command.
+Once the application and telnet client start running, messages can be sent from the client to the application.
+At any stage, the telnet client can be terminated using the ``quit`` command.
 
 
-Application stages
-------------------
+Explanation
+-----------
+
+The following explains the stages of the application.
 
 Initialization
 ~~~~~~~~~~~~~~
 
-During this stage, EAL layer is initialised and application specific arguments are parsed. Furthermore, the data structures
-(i.e. linked lists) for application objects are initialized. In case of any initialization error, an error message
-is displayed and the application is terminated.
+During this stage, the EAL layer is initialized and application-specific arguments are parsed.
+Furthermore, the data structures (linked lists) for application objects are initialized.
+In case of any initialization error, an error message is displayed and the application is terminated.
 
 .. _ip_pipeline_runtime:
 
@@ -124,17 +127,18 @@ Run-time
 
 The main thread is creating and managing all the application objects based on CLI input.
 
-Each data plane thread runs one or several pipelines previously assigned to it in round-robin order. Each data plane thread
-executes two tasks in time-sharing mode:
+Each data plane thread runs one or more pipelines previously assigned to it in round-robin order.
+Each data plane thread executes two tasks in time-sharing mode:
 
 #. *Packet processing task*: Process bursts of input packets read from the pipeline input ports.
 
-#. *Message handling task*: Periodically, the data plane thread pauses the packet processing task and polls for request
-   messages send by the main thread. Examples: add/remove pipeline to/from current data plane thread, add/delete rules
-   to/from given table of a specific pipeline owned by the current data plane thread, read statistics, etc.
+#. *Message handling task*: Periodically, the data plane thread pauses the packet processing task and polls for
+   request messages sent by the main thread. Examples include adding or removing pipelines from the current
+   data plane thread, adding or deleting rules in a table of a specific pipeline owned by the current data
+   plane thread, reading statistics, and similar operations.
 
 Examples
---------
+~~~~~~~~
 
 .. _table_examples:
 
@@ -207,7 +211,7 @@ Link
 
  Link configuration ::
 
-   link <link_name>
+  link <link_name>
     dev <device_name>|port <port_id>
     rxq <n_queues> <queue_size> <mempool_name>
     txq <n_queues> <queue_size> promiscuous on | off
@@ -236,7 +240,7 @@ Software queue
 Traffic manager
 ~~~~~~~~~~~~~~~
 
- Add traffic manager subport profile ::
+Add traffic manager subport profile ::
 
   tmgr subport profile
    <tb_rate> <tb_size>
@@ -245,7 +249,7 @@ Traffic manager
    <tc9_rate> <tc10_rate> <tc11_rate> <tc12_rate>
    <tc_period>
 
- Add traffic manager pipe profile ::
+Add traffic manager pipe profile ::
 
   tmgr pipe profile
    <tb_rate> <tb_size>
@@ -256,7 +260,7 @@ Traffic manager
    <tc_ov_weight>
    <wrr_weight0..3>
 
- Create traffic manager port ::
+Create traffic manager port ::
 
   tmgr <tmgr_name>
    rate <rate>
@@ -266,16 +270,16 @@ Traffic manager
    mtu <mtu>
    cpu <cpu_id>
 
- Configure traffic manager subport ::
+Configure traffic manager subport ::
 
   tmgr <tmgr_name>
    subport <subport_id>
    profile <subport_profile_id>
 
- Configure traffic manager pipe ::
+Configure traffic manager pipe ::
 
   tmgr <tmgr_name>
-   subport <subport_id>
+  subport <subport_id>
    pipe from <pipe_id_first> to <pipe_id_last>
    profile <pipe_profile_id>
 
@@ -291,7 +295,7 @@ Tap
 Cryptodev
 ~~~~~~~~~
 
-  Create cryptodev port ::
+Create cryptodev port ::
 
    cryptodev <cryptodev_name>
     dev <DPDK Cryptodev PMD name>
@@ -300,13 +304,13 @@ Cryptodev
 Action profile
 ~~~~~~~~~~~~~~
 
- Create action profile for pipeline input port ::
+Create action profile for pipeline input port ::
 
   port in action profile <profile_name>
    [filter match | mismatch offset <key_offset> mask <key_mask> key <key_value> port <port_id>]
    [balance offset <key_offset> mask <key_mask> port <port_id0> ... <port_id15>]
 
- Create action profile for the pipeline table ::
+Create action profile for the pipeline table ::
 
   table action profile <profile_name>
    ipv4 | ipv6
@@ -389,18 +393,18 @@ Connect pipeline input port to table ::
 
   pipeline <pipeline_name> port in <port_id> table <table_id>
 
-Display statistics for specific pipeline input port, output port
+Display statistics for specific pipeline input port, output port,
 or table ::
 
   pipeline <pipeline_name> port in <port_id> stats read [clear]
   pipeline <pipeline_name> port out <port_id> stats read [clear]
   pipeline <pipeline_name> table <table_id> stats read [clear]
 
-Enable given input port for specific pipeline instance ::
+Enable given output port for specific pipeline instance ::
 
-  pipeline <pipeline_name> port out <port_id> disable
+  pipeline <pipeline_name> port out <port_id> enable
 
-Disable given input port for specific pipeline instance ::
+Disable given output port for specific pipeline instance ::
 
   pipeline <pipeline_name> port out <port_id> disable
 
@@ -408,9 +412,9 @@ Add default rule to table for specific pipeline instance ::
 
   pipeline <pipeline_name> table <table_id> rule add
      match
-        default
+       default
      action
-        fwd
+       fwd
            drop
            | port <port_id>
            | meta
@@ -484,9 +488,10 @@ Add bulk rules to table for specific pipeline instance ::
 
   pipeline <pipeline_name> table <table_id> rule add bulk <file_name> <n_rules>
 
-  Where:
-  - file_name = path to file
-  - File line format = match <match> action <action>
+Where:
+
+- ``file_name`` = path to file
+- File line format = ``match <match> action <action>``
 
 Delete table rule for specific pipeline instance ::
 
@@ -497,9 +502,9 @@ Delete default table rule for specific pipeline instance ::
 
   pipeline <pipeline_name> table <table_id> rule delete
      match
-        default
+       default
 
-Add meter profile to the table for specific pipeline instance ::
+Add meter profile to table for specific pipeline instance ::
 
   pipeline <pipeline_name> table <table_id> meter profile <meter_profile_id>
    add srtcm cir <cir> cbs <cbs> ebs <ebs>
@@ -512,24 +517,24 @@ Delete meter profile from the table for specific pipeline instance ::
 
 
 Update the dscp table for meter or traffic manager action for specific
-pipeline instance ::
+pipeline instance::
 
    pipeline <pipeline_name> table <table_id> dscp <file_name>
 
-   Where:
-      - file_name = path to file
-      - exactly 64 lines
-      - File line format = <tc_id> <tc_queue_id> <color>, with <color> as: g | y | r
+Where:
 
+- ``file_name`` = path to file
+- exactly 64 lines
+- File line format = ``<tc_id> <tc_queue_id> <color>``, with ``<color>`` as: g | y | r
 
 Pipeline enable/disable
 ~~~~~~~~~~~~~~~~~~~~~~~
 
-   Enable given pipeline instance for specific data plane thread ::
+Enable given pipeline instance for specific data plane thread::
 
     thread <thread_id> pipeline <pipeline_name> enable
 
 
-   Disable given pipeline instance for specific data plane thread ::
+Disable given pipeline instance for specific data plane thread::
 
     thread <thread_id> pipeline <pipeline_name> disable
-- 
2.53.0


^ permalink raw reply related

* [PATCH 07/15] doc: improve clarity in eventdev, FIPS, and flow filtering
From: Stephen Hemminger @ 2026-06-11 21:18 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger, Gowrishankar Muthukrishnan, Ori Kam
In-Reply-To: <20260611212119.1026721-1-stephen@networkplumber.org>

Enhanced multiple sample application guides:

eventdev_pipeline.rst:
- Improved command-line option formatting and descriptions
- Standardized terminology and fixed grammatical issues
- Clarified pipeline stage descriptions

fips_validation.rst:
- Restructured algorithm support sections for clarity
- Fixed formatting inconsistencies
- Improved readability of validation process descriptions

flow_filtering.rst:
- Enhanced code explanations and flow descriptions
- Fixed formatting and indentation issues
- Clarified API usage examples

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 .../sample_app_ug/eventdev_pipeline.rst       | 51 ++++++++-------
 doc/guides/sample_app_ug/fips_validation.rst  | 63 +++++++++----------
 doc/guides/sample_app_ug/flow_filtering.rst   | 51 +++++++--------
 3 files changed, 82 insertions(+), 83 deletions(-)

diff --git a/doc/guides/sample_app_ug/eventdev_pipeline.rst b/doc/guides/sample_app_ug/eventdev_pipeline.rst
index 19ff53803e..343d3f46ec 100644
--- a/doc/guides/sample_app_ug/eventdev_pipeline.rst
+++ b/doc/guides/sample_app_ug/eventdev_pipeline.rst
@@ -4,41 +4,40 @@
 Eventdev Pipeline Sample Application
 ====================================
 
-The eventdev pipeline sample application is a sample app that demonstrates
+The eventdev pipeline sample application is an application that demonstrates
 the usage of the eventdev API using the software PMD. It shows how an
 application can configure a pipeline and assign a set of worker cores to
 perform the processing required.
 
-The application has a range of command line arguments allowing it to be
-configured for various numbers worker cores, stages,queue depths and cycles per
-stage of work. This is useful for performance testing as well as quickly testing
+The application has a range of command line arguments that allow it to be
+configured for various numbers of worker cores, stages, queue depths, and cycles
+per stage of work. This is useful for performance testing as well as quickly testing
 a particular pipeline configuration.
 
 
 Compiling the Application
 -------------------------
 
-To compile the sample application see :doc:`compiling`.
-
-The application is located in the ``examples`` sub-directory.
+To compile the sample application, see :doc:`compiling`.
 
+The application is located in the ``examples`` directory.
 
 
 Running the Application
 -----------------------
 
-The application has a lot of command line options. This allows specification of
-the eventdev PMD to use, and a number of attributes of the processing pipeline
+The application has a lot of command line options. This allows the specification of
+the eventdev PMD to use and for a number of attributes of the processing pipeline
 options.
 
 An example eventdev pipeline running with the software eventdev PMD using
 these settings is shown below:
 
- * ``-l 0,2,8-15``: lcore to use
+ * ``-l 0,2,8-15``: lcores to use
  * ``-r1``: core mask 0x1 for RX
  * ``-t1``: core mask 0x1 for TX
  * ``-e4``: core mask 0x4 for the software scheduler
- * ``-w FF00``: core mask for worker cores, 8 cores from 8th to 16th
+ * ``-w FF00``: core mask for worker cores, 8 cores from 8th to 15th
  * ``-s4``: 4 atomic stages
  * ``-n0``: process infinite packets (run forever)
  * ``-c32``: worker dequeue depth of 32
@@ -50,8 +49,8 @@ these settings is shown below:
     ./<build_dir>/examples/dpdk-eventdev_pipeline -l 0,2,8-15 --vdev event_sw0 \
     -- -r1 -t1 -e4 -w FF00 -s4 -n0 -c32 -W1000 -D
 
-The application has some sanity checking built-in, so if there is a function
-(e.g.; the RX core) which doesn't have a cpu core mask assigned, the application
+The application has sanity checking built-in, so if there is a function
+(e.g., the RX core) which does not have a CPU core mask assigned, the application
 will print an error message:
 
 .. code-block:: console
@@ -61,26 +60,26 @@ will print an error message:
           rx: 0
           tx: 1
 
-Configuration of the eventdev is covered in detail in the programmers guide,
-see the Event Device Library section.
+Configuration of the eventdev is covered in detail in the programmer's guide.
+See the Event Device Library section.
 
 
 Observing the Application
--------------------------
+~~~~~~~~~~~~~~~~~~~~~~~~~
 
-At runtime the eventdev pipeline application prints out a summary of the
-configuration, and some runtime statistics like packets per second. On exit the
-worker statistics are printed, along with a full dump of the PMD statistics if
+At runtime, the eventdev pipeline application prints out a summary of the
+configuration, and some runtime statistics like packets per second. On exit, the
+worker core statistics are printed, along with a full dump of the PMD statistics if
 required. The following sections show sample output for each of the output
 types.
 
 Configuration
-~~~~~~~~~~~~~
+^^^^^^^^^^^^^
 
-This provides an overview of the pipeline,
-scheduling type at each stage, and parameters to options such as how many
-flows to use and what eventdev PMD is in use. See the following sample output
-for details:
+The configuration output provides an overview of the pipeline, the scheduling
+type at each stage,
+and parameters such as the number of flows and the eventdev PMD in use.
+See the following sample output for details:
 
 .. code-block:: console
 
@@ -101,7 +100,7 @@ for details:
         Stage 3, Type Atomic    Priority = 128
 
 Runtime
-~~~~~~~
+^^^^^^^
 
 At runtime, the statistics of the consumer are printed, stating the number of
 packets received, runtime in milliseconds, average mpps, and current mpps.
@@ -111,7 +110,7 @@ packets received, runtime in milliseconds, average mpps, and current mpps.
   # consumer RX= xxxxxxx, time yyyy ms, avg z.zzz mpps [current w.www mpps]
 
 Shutdown
-~~~~~~~~
+^^^^^^^^
 
 At shutdown, the application prints the number of packets received and
 transmitted, and an overview of the distribution of work across worker cores.
diff --git a/doc/guides/sample_app_ug/fips_validation.rst b/doc/guides/sample_app_ug/fips_validation.rst
index 613c5afd19..732f47212a 100644
--- a/doc/guides/sample_app_ug/fips_validation.rst
+++ b/doc/guides/sample_app_ug/fips_validation.rst
@@ -7,13 +7,13 @@ Federal Information Processing Standards (FIPS) CryptoDev Validation
 Overview
 --------
 
+This application parses and performs symmetric cryptography computations
+using test vectors from the NIST Cryptographic Algorithm Validation Program
+(CAVP) and Automated Crypto Validation Protocol (ACVP).
+
 Federal Information Processing Standards (FIPS) are publicly announced standards
 developed by the United States federal government for use in computer systems by
-non-military government agencies and government contractors.
-
-This application is used to parse and perform symmetric cryptography
-computation to the NIST Cryptographic Algorithm Validation Program (CAVP) and
-Automated Crypto Validation Protocol (ACVP) test vectors.
+non-military agencies and government contractors.
 
 For an algorithm implementation to be listed on a cryptographic module
 validation certificate as an Approved security function, the algorithm
@@ -21,6 +21,7 @@ implementation must meet all the requirements of FIPS 140-2 (in case of CAVP)
 and FIPS 140-3 (in case of ACVP) and must successfully complete the
 cryptographic algorithm validation process.
 
+
 Limitations
 -----------
 
@@ -28,17 +29,17 @@ CAVP
 ----
 
 * The version of request file supported is ``CAVS 21.0``.
-* If the header comment in a ``.req`` file does not contain a Algo tag
-  i.e ``AES,TDES,GCM`` you need to manually add it into the header comment for
-  example::
+* If the header comment in a ``.req`` file does not contain an algorithm tag
+  (i.e., ``AES``, ``TDES``, ``GCM``), you must manually add it to the header
+  comment, for example::
 
       # VARIABLE KEY - KAT for CBC / # TDES VARIABLE KEY - KAT for CBC
 
-* The application does not supply the test vectors. The user is expected to
-  obtain the test vector files from `CAVP
+* The application does not supply the test vectors. Users must obtain the
+  test vector files from the `CAVP
   <https://csrc.nist.gov/projects/cryptographic-algorithm-validation-
-  program/block-ciphers>`_ website. To obtain the ``.req`` files you need to
-  email a person from the NIST website and pay for the ``.req`` files.
+  program/block-ciphers>`_ website. To obtain the ``.req`` files, you need to
+  contact a representative from the NIST website and pay for the ``.req`` files.
   The ``.rsp`` files from the site can be used to validate and compare with
   the ``.rsp`` files created by the FIPS application.
 
@@ -54,7 +55,7 @@ CAVP
 ACVP
 ----
 
-* The application does not supply the test vectors. The user is expected to
+* The application does not supply the test vectors. Users must
   obtain the test vector files from `ACVP  <https://pages.nist.gov/ACVP>`_
   website.
 * Supported test vectors
@@ -78,19 +79,17 @@ ACVP
 Application Information
 -----------------------
 
-If a ``.req`` is used as the input file after the application is finished
-running it will generate a response file or ``.rsp``. Differences between the
-two files are, the ``.req`` file has missing information for instance if doing
-encryption you will not have the cipher text and that will be generated in the
-response file. Also if doing decryption it will not have the plain text until it
-finished the work and in the response file it will be added onto the end of each
-operation.
-
-The application can be run with a ``.rsp`` file and what the outcome of that
-will be is it will add a extra line in the generated ``.rsp`` which should be
-the same as the ``.rsp`` used to run the application, this is useful for
-validating if the application has done the operation correctly.
+If a ``.req`` file is used as input, the application generates a response
+file (``.rsp``) after completion. The ``.req`` file has missing fields that
+the application fills in. For example, when
+performing encryption the cipher text is absent; when performing decryption
+the plain text is absent. These are computed and added to the ``.rsp`` file
+at the end of each operation.
 
+The application can also run with a ``.rsp`` file as input. In this case,
+it generates a new ``.rsp`` with an additional verification line. The output
+should match the input ``.rsp``, which is useful for validating that the
+application performed the operations correctly.
 
 Compiling the Application
 -------------------------
@@ -125,23 +124,23 @@ The application requires a number of command line options:
          --mbuf-dataroom DATAROOM_SIZE
 
 where,
-  * req-file: The path of the request file or folder, separated by
+  * req-file: The path of the request file or folder, indicated by
     ``path-is-folder`` option.
 
-  * rsp-file: The path that the response file or folder is stored. separated by
+  * rsp-file: The path where the response file or folder is stored, indicated by
     ``path-is-folder`` option.
 
   * cryptodev: The name of the target DPDK Crypto device to be validated.
 
   * cryptodev-id: The id of the target DPDK Crypto device to be validated.
 
-  * path-is-folder: If presented the application expects req-file and rsp-file
-    are folder paths.
+  * path-is-folder: If present, the application treats req-file and rsp-file
+    as folder paths.
 
   * mbuf-dataroom: By default the application creates mbuf pool with maximum
-    possible data room (65535 bytes). If the user wants to test scatter-gather
-    list feature of the PMD he or she may set this value to reduce the dataroom
-    size so that the input data may be divided into multiple chained mbufs.
+    possible data room (65535 bytes). To test the scatter-gather
+    list feature of a PMD, this value may be set to reduce the dataroom
+    size so that the input data is divided into multiple chained mbufs.
 
 
 To run the application in linux environment to test one AES FIPS test data
diff --git a/doc/guides/sample_app_ug/flow_filtering.rst b/doc/guides/sample_app_ug/flow_filtering.rst
index 179e978942..db5947d9e4 100644
--- a/doc/guides/sample_app_ug/flow_filtering.rst
+++ b/doc/guides/sample_app_ug/flow_filtering.rst
@@ -7,11 +7,11 @@ Flow Filtering Sample Application
 Overview
 --------
 
-The flow filtering sample application provides a simple example of creating flow rules.
+The flow filtering sample application is a simple example of creating flow rules.
 
 It serves as a demonstration of the fundamental components of flow rules.
 
-It demonstrates how to create rules and configure them, using both template and non template API.
+It demonstrates how to create and configure rules using both template and non-template APIs.
 
 
 Compiling the Application
@@ -25,7 +25,7 @@ The application is located in the ``flow_filtering`` sub-directory.
 Running the Application
 -----------------------
 
-To run the example in a ``linux`` environment:
+To run the example in a Linux environment:
 
 .. code-block:: console
 
@@ -34,7 +34,7 @@ To run the example in a ``linux`` environment:
 where,
 
 ``--[non-]template``
-  Specify whether to use the template API (default is template API).
+  Specifies whether to use the template API (default is template API).
 
 For more details on template API please refer to :ref:`flow_template_api`.
 
@@ -50,7 +50,7 @@ The example is built from 2 main files:
 - ``main.c``: Contains the application logic, including initializations and the main loop.
 - ``flow_skeleton.c``: Implements the creation of flow rules.
 
-Additionally, the ``snippets`` directory includes code snippets showcasing various features
+Additionally, the ``snippets`` directory contains code snippets showcasing various features
 that can override the basic ``flow_skeleton.c`` implementation.
 
 
@@ -87,7 +87,7 @@ those configuration are defined in the snippet file.
    :end-before: >8 End of snippet-specific configuration.
    :dedent: 1
 
-Initialize the ports using the user-defined ``init_port()`` function,
+Initialize the ports using the ``init_port()`` function,
 configuring Ethernet ports with default settings, including both Rx and Tx queues for a single port:
 
 .. literalinclude:: ../../../examples/flow_filtering/main.c
@@ -96,7 +96,7 @@ configuring Ethernet ports with default settings, including both Rx and Tx queue
    :end-before: >8 End of Initializing the ports using user defined init_port().
    :dedent: 1
 
-For template API, the flow API requires preallocating resources.
+For the template API, the flow API requires preallocating resources.
 The function ``rte_flow_configure()`` should be called after configuring the Ethernet device
 and before creating any flow rules to set up flow queues for asynchronous operations.
 
@@ -109,14 +109,14 @@ and before creating any flow rules to set up flow queues for asynchronous operat
 Creating the Flow Rule
 ~~~~~~~~~~~~~~~~~~~~~~
 
-This section is the core of the flow filtering functionality involves creating flow rules.
-The flow rules are created using two primary approaches: template API and non-template API.
-Both template and non-template API configure flow rules using attributes (like ingress or egress),
-pattern items (for matching packet data), and actions (for operations on matched packets).
-However, template API extend this by introducing pattern templates and actions templates,
+This section covers the core of the flow filtering functionality: creating flow rules.
+Flow rules are created using two primary approaches: template API and non-template API.
+Both APIs configure flow rules using the same components: attributes (such as ingress or egress),
+pattern items (for matching packet data), and actions (to perform operations on matched packets).
+However, the template API extends this by introducing pattern templates and action templates,
 which define reusable matching criteria and action lists, respectively.
-These templates are then combined in a template table to optimize resource allocation and management.
-In contrast, non-template API handle each rule individually without such shared templates.
+The pattern and action templates are combined in a template table to optimize resource allocation.
+In contrast, the non-template API handles each rule individually without such shared templates.
 
 This is handled by the ``generate_flow_skeleton()`` function in ``flow_skeleton.c``.
 
@@ -127,8 +127,8 @@ This is handled by the ``generate_flow_skeleton()`` function in ``flow_skeleton.
    :dedent: 1
 
 This part of the code defines necessary data structures,
-as well as configures action and pattern structures for the rule.
-Common for both template and non-template API.
+and configures action and pattern structures for the rule.
+This is common to both template and non-template APIs.
 
 .. literalinclude:: ../../../examples/flow_filtering/flow_skeleton.c
    :language: c
@@ -136,7 +136,7 @@ Common for both template and non-template API.
    :end-before: >8 End of setting the common action and pattern structures.
    :dedent: 1
 
-For template API, this part of the code creates the necessary template tables and finally create the rule.
+For the template API, the code creates pattern and action templates, combines them in a template table, and creates the rule.
 
 .. literalinclude:: ../../../examples/flow_filtering/flow_skeleton.c
    :language: c
@@ -144,7 +144,7 @@ For template API, this part of the code creates the necessary template tables an
    :end-before: >8 End of creating a flow rule using template API.
    :dedent: 1
 
-For non-template API, validate the rule and create it.
+For the non-template API, the code validates and creates the rule directly.
 
 .. literalinclude:: ../../../examples/flow_filtering/flow_skeleton.c
    :language: c
@@ -156,7 +156,7 @@ Main Loop Execution
 ~~~~~~~~~~~~~~~~~~~
 
 Launch the ``main_loop()`` function from ``main.c``,
-which reading the packets from all queues and printing for each packet the destination queue:
+which reads packets from all queues and prints the destination queue for each packet:
 
 .. literalinclude:: ../../../examples/flow_filtering/main.c
    :language: c
@@ -186,18 +186,19 @@ Using Snippets
 
 Developers can customize flow rules by modifying ``flow_skeleton.c``
 and utilizing functions from ``snippets`` directory.
-For example, within ``snippet_match_ipv4_flow.c``, developers can find the functions:
+For example, ``snippet_match_ipv4_flow.c`` provides:
 
 - ``snippet_ipv4_flow_create_actions()`` for defining actions,
 - ``snippet_ipv4_flow_create_patterns()`` for setting packet matching patterns,
 - ``snippet_ipv4_flow_create_table()`` for creating the patterns and actions template table.
 
-To use a different snippet, simply update the include statement in ``flow_skeleton.c``
-to point to the desired snippet file, this will change the default created flow.
+To use a different snippet, update the include statement in ``flow_skeleton.c``
+to point to the desired snippet file. This will change the default flow rule created.
 
-Some snippets may require different configuration,
-those configuration are defined in the snippet file:
+Some snippets require additional port or flow configuration.
+These are defined in the snippet header file, for example:
 
 - ``snippet_init_ipv4`` for configuration of the port and flow attributes.
 
-In order to use them the developer should include the snippet header file in main.c
+To apply these configurations, include the snippet header file in ``main.c``
+so that the snippet-specific initialization is called during port setup.
-- 
2.53.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox