RE: [PATCH] acl: classify with RISC-V vector extension

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Konstantin Ananyev <konstantin.ananyev@huawei.com>
To: P1erreCashon <2022302111412@whu.edu.cn>,
	"Stanisław Kardach" <stanislaw.kardach@gmail.com>,
	"Sun Yuechi" <sunyuechi@iscas.ac.cn>
Cc: "dev@dpdk.org" <dev@dpdk.org>, gong-flying <gongxiaofei24@iscas.ac.cn>
Subject: RE: [PATCH] acl: classify with RISC-V vector extension
Date: Thu, 12 Mar 2026 08:50:45 +0000	[thread overview]
Message-ID: <ff8d34eecbea40208537696b4ece33ed@huawei.com> (raw)
In-Reply-To: <20260309122818.661741-1-2022302111412@whu.edu.cn>



> From: Tang ShiHao <2022302111412@whu.edu.cn>
> 
> Implement the rte_acl classify function for the RISC-V architecture
> using the RISC-V Vector Extension (RVV) instruction set.
> Verified with the testacl and acl_autotest applications on RISC-V.
> 
> Performance improvements measured with dpdk-test-acl:
> 
> - 100 rules / 100 traces: 57.3 → 48.6 cycles/pkt (~1.18x)
> - 1k rules / 1k traces: 13.6 → 8.3 cycles/pkt (~1.64x)
> - 10k rules / 1M traces: 36.6 → 27.7 cycles/pkt (~1.32x)
> 
> Throughput improvement up to ~64%.
> 
> This patch is co-developed with Gong Xiaofei.
> 
> Signed-off-by: gong-flying <gongxiaofei24@iscas.ac.cn>
> Signed-off-by: Tang ShiHao <2022302111412@whu.edu.cn>

There is already a patch, acked by me:
https://patchwork.dpdk.org/project/dpdk/patch/20260201160957.1898027-1-sunyuechi@iscas.ac.cn/
that provides identical functionality.
Could you please explain why a new one is required?
BTW, if you spotted any issues with the patch above, please
provide your comments.
Thanks
Konstantin

> ---
>  app/test-acl/main.c   |   4 +
>  lib/acl/acl.h         |   4 +
>  lib/acl/acl_run_rvv.c |  19 ++++
>  lib/acl/acl_run_rvv.h | 210 ++++++++++++++++++++++++++++++++++++++++++
>  lib/acl/meson.build   |   4 +-
>  lib/acl/rte_acl.c     |  43 +++++++++
>  lib/acl/rte_acl.h     |   1 +
>  7 files changed, 284 insertions(+), 1 deletion(-)
>  create mode 100644 lib/acl/acl_run_rvv.c
>  create mode 100644 lib/acl/acl_run_rvv.h
> 
> diff --git a/app/test-acl/main.c b/app/test-acl/main.c
> index 3a791b3ccf..ad0bc89644 100644
> --- a/app/test-acl/main.c
> +++ b/app/test-acl/main.c
> @@ -97,6 +97,10 @@ static const struct acl_alg acl_alg[] = {
>  		.name = "avx512x32",
>  		.alg = RTE_ACL_CLASSIFY_AVX512X32,
>  	},
> +	{
> +		.name = "rvv",
> +		.alg = RTE_ACL_CLASSIFY_RVV,
> +	},
>  };
> 
>  static struct {
> diff --git a/lib/acl/acl.h b/lib/acl/acl.h
> index 9c85a3d58a..af202a84ed 100644
> --- a/lib/acl/acl.h
> +++ b/lib/acl/acl.h
> @@ -226,6 +226,10 @@ int
>  rte_acl_classify_altivec(const struct rte_acl_ctx *ctx, const uint8_t **data,
>  	uint32_t *results, uint32_t num, uint32_t categories);
> 
> +int
> +rte_acl_classify_rvv(const struct rte_acl_ctx *ctx, const uint8_t **data,
> +	uint32_t *results, uint32_t num, uint32_t categories);
> +
>  #ifdef __cplusplus
>  }
>  #endif /* __cplusplus */
> diff --git a/lib/acl/acl_run_rvv.c b/lib/acl/acl_run_rvv.c
> new file mode 100644
> index 0000000000..2b53e28213
> --- /dev/null
> +++ b/lib/acl/acl_run_rvv.c
> @@ -0,0 +1,19 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2026 Institute of Software Chinese Academy of Sciences (ISCAS)
> + */
> +
> +#if defined(RTE_ARCH_RISCV) && defined(RTE_RISCV_FEATURE_V)
> +
> +#include "acl_run_rvv.h"
> +
> +int
> +rte_acl_classify_rvv(const struct rte_acl_ctx *ctx, const uint8_t **data,
> +		      uint32_t *results, uint32_t num, uint32_t categories)
> +{
> +	if (num >= 4)
> +		return search_rvv_4(ctx, data, results, num, categories);
> +	else
> +		return rte_acl_classify_scalar(ctx, data, results, num, categories);
> +}
> +
> +#endif
> diff --git a/lib/acl/acl_run_rvv.h b/lib/acl/acl_run_rvv.h
> new file mode 100644
> index 0000000000..ed21ce2ba6
> --- /dev/null
> +++ b/lib/acl/acl_run_rvv.h
> @@ -0,0 +1,210 @@
> +#include <stdalign.h>
> +
> +#include "acl_run.h"
> +
> +#include <riscv_vector.h>
> +
> +
> +static const uint8_t idx_const[16] = {
> +	0, 0, 0, 0, 4, 4, 4, 4,
> +	8, 8, 8, 8, 12, 12, 12, 12
> +};
> +
> +/*
> + * Resolve priority for multiple results (RVV version).
> + * This consists of comparing the priority of the current traversal with the
> + * running set of results for the packet.
> + * For each result, keep a running array of the result (rule number) and
> + * its priority for each category.
> + */
> +static inline void
> +resolve_priority_rvv(uint64_t transition, int n,
> +						const struct rte_acl_ctx *ctx,
> +						struct parms *parms,
> +						const struct
> rte_acl_match_results *p,
> +						uint32_t categories)
> +{
> +	uint32_t x;
> +
> +	for (x = 0; x < categories; x += RTE_ACL_RESULTS_MULTIPLIER) {
> +
> +		int32_t *saved_results  = (int32_t *)&parms[n].cmplt->results[x];
> +		int32_t *saved_priority = (int32_t *)&parms[n].cmplt->priority[x];
> +
> +		const int32_t *cur_results  = (const int32_t
> *)&p[transition].results[x];
> +		const int32_t *cur_priority = (const int32_t
> *)&p[transition].priority[x];
> +
> +		size_t vl = __riscv_vsetvl_e32m1(RTE_ACL_RESULTS_MULTIPLIER);
> +
> +		/* load current trie results / priority */
> +		vint32m1_t v_results  = __riscv_vle32_v_i32m1(cur_results, vl);
> +		vint32m1_t v_priority = __riscv_vle32_v_i32m1(cur_priority, vl);
> +
> +		if (parms[n].cmplt->count != ctx->num_tries) {
> +
> +			/* load running best */
> +			vint32m1_t v_results1  =
> __riscv_vle32_v_i32m1(saved_results, vl);
> +			vint32m1_t v_priority1 =
> __riscv_vle32_v_i32m1(saved_priority, vl);
> +
> +			/* selector = priority1 > priority */
> +			vbool32_t mask =
> __riscv_vmsgt_vv_i32m1_b32(v_priority1, v_priority, vl);
> +
> +			/* results = mask ? results1 : results */
> +			v_results  = __riscv_vmerge_vvm_i32m1(v_results,
> v_results1, mask, vl);
> +			v_priority = __riscv_vmerge_vvm_i32m1(v_priority,
> v_priority1, mask, vl);
> +		}
> +
> +		/* store back running best */
> +		__riscv_vse32_v_i32m1(saved_results,  v_results,  vl);
> +		__riscv_vse32_v_i32m1(saved_priority, v_priority, vl);
> +	}
> +}
> +
> +vuint32m1_t
> +transition4_rvv(vuint32m1_t next_input,
> +				const uint64_t *trans,
> +				uint64_t transitions[4])
> +{
> +	size_t vl = 4;
> +
> +	vuint64m2_t vtr = __riscv_vle64_v_u64m2(transitions, vl);
> +
> +	vuint32m1_t lo = __riscv_vnsrl_wx_u32m1(vtr, 0, vl);
> +	vuint32m1_t hi = __riscv_vnsrl_wx_u32m1(vtr, 32, vl);
> +
> +	vuint32m1_t addr =
> +		__riscv_vxor_vv_u32m1(lo, __riscv_vand_vx_u32m1(lo,
> ~RTE_ACL_NODE_INDEX, vl), vl);
> +
> +	vuint32m1_t node_type =
> +		__riscv_vand_vx_u32m1(lo, ~RTE_ACL_NODE_INDEX, vl);
> +
> +	vbool32_t m_dfa =
> +		__riscv_vmseq_vx_u32m1_b32(node_type, 0, vl);
> +
> +	vuint32m1_t input =
> +		__riscv_vand_vx_u32m1(next_input, 0xff, vl);
> +
> +	/* ---------------- DFA ---------------- */
> +
> +	vuint32m1_t grp =
> +		__riscv_vsrl_vx_u32m1(input, 6, vl);
> +
> +	vuint32m1_t shift =
> +		__riscv_vmul_vx_u32m1(grp, RTE_ACL_DFA_GR64_BIT, vl);
> +
> +	vuint32m1_t dfa_base =
> +		__riscv_vsrl_vv_u32m1(hi, shift, vl);
> +
> +	vuint32m1_t dfa_x =
> +		__riscv_vsub_vv_u32m1(input,
> +			__riscv_vand_vx_u32m1(dfa_base, UINT8_MAX, vl),
> +			vl);
> +
> +	/* ---------------- QRANGE ---------------- */
> +	vuint8m1_t mask = __riscv_vle8_v_u8m1(idx_const, 16);
> +
> +	vuint8m1_t in =
> +		__riscv_vrgather_vv_u8m1(
> +			__riscv_vreinterpret_v_u32m1_u8m1(next_input),
> +			mask,
> +			16);
> +
> +	vint8m1_t in_s8 =
> +		__riscv_vreinterpret_v_u8m1_i8m1(in);
> +
> +	vuint8m1_t ranges_u8 =
> +		__riscv_vreinterpret_v_u32m1_u8m1(hi);
> +
> +	vint8m1_t ranges_s8 =
> +		__riscv_vreinterpret_v_u8m1_i8m1(ranges_u8);
> +
> +	vbool8_t cmp =
> +		__riscv_vmsgt_vv_i8m1_b8(in_s8, ranges_s8, 16);
> +	int32_t q_1 = __riscv_vcpop_m_b8(cmp, 4);
> +	int32_t q_2 = __riscv_vcpop_m_b8(cmp, 8);
> +	int32_t q_3 = __riscv_vcpop_m_b8(cmp, 12);
> +	int32_t q_4 = __riscv_vcpop_m_b8(cmp, 16);
> +	uint32_t q_scalar[4] = {q_1, q_2 - q_1, q_3 - q_2, q_4 - q_3};
> +	vuint32m1_t q_x = __riscv_vle32_v_u32m1(q_scalar, 4);
> +
> +
> +	vuint32m1_t x =
> +		__riscv_vmerge_vvm_u32m1(q_x, dfa_x, m_dfa, vl);
> +
> +	addr = __riscv_vadd_vv_u32m1(addr, x, vl);
> +
> +	vuint64m2_t addr64 =
> +		__riscv_vwmulu_vx_u64m2(addr, sizeof(uint64_t), vl);
> +	vuint64m2_t next =
> +		__riscv_vloxei64_v_u64m2(trans, addr64, vl);
> +
> +	__riscv_vse64_v_u64m2(transitions, next, vl);
> +
> +	return __riscv_vsrl_vx_u32m1(next_input, 8, vl);
> +}
> +
> +/*
> + * Check for any match in 4 transitions
> + */
> +static __rte_always_inline uint32_t
> +check_any_match_x4(uint64_t val[])
> +{
> +	return (val[0] | val[1] | val[2] | val[3]) & RTE_ACL_NODE_MATCH;
> +}
> +
> +static __rte_always_inline void
> +acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,
> +			struct acl_flow_data *flows, uint64_t transitions[])
> +{
> +	while (check_any_match_x4(transitions)) {
> +		transitions[0] = acl_match_check(transitions[0], slot, ctx,
> +			parms, flows, resolve_priority_rvv);
> +		transitions[1] = acl_match_check(transitions[1], slot + 1, ctx,
> +			parms, flows, resolve_priority_rvv);
> +		transitions[2] = acl_match_check(transitions[2], slot + 2, ctx,
> +			parms, flows, resolve_priority_rvv);
> +		transitions[3] = acl_match_check(transitions[3], slot + 3, ctx,
> +			parms, flows, resolve_priority_rvv);
> +	}
> +}
> +
> +static inline int
> +search_rvv_4(const struct rte_acl_ctx *ctx,
> +			const uint8_t **data,
> +			uint32_t *results,
> +			int total_packets,
> +			uint32_t categories)
> +{
> +	struct acl_flow_data flows;
> +	uint64_t index_array[4];
> +	struct completion cmplt[4];
> +	struct parms parms[4];
> +	vuint32m1_t input;
> +
> +	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data,
> +				results, total_packets,
> +				categories, ctx->trans_table);
> +
> +	for (int i = 0; i < 4; i++)
> +		index_array[i] =
> +			acl_start_next_trie(&flows, parms, i, ctx);
> +
> +	acl_match_check_x4(0, ctx, parms, &flows, index_array);
> +
> +	while (flows.started > 0) {
> +		input = __riscv_vmv_v_x_u32m1(GET_NEXT_4BYTES(parms, 0),
> 4);
> +		input = __riscv_vslide1down_vx_u32m1(
> +				input, GET_NEXT_4BYTES(parms, 1), 4);
> +		input = __riscv_vslide1down_vx_u32m1(
> +				input, GET_NEXT_4BYTES(parms, 2), 4);
> +		input = __riscv_vslide1down_vx_u32m1(
> +				input, GET_NEXT_4BYTES(parms, 3), 4);
> +
> +		input = transition4_rvv(input, flows.trans, index_array);
> +		input = transition4_rvv(input, flows.trans, index_array);
> +		input = transition4_rvv(input, flows.trans, index_array);
> +		input = transition4_rvv(input, flows.trans, index_array);
> +		acl_match_check_x4(0, ctx, parms, &flows, index_array);
> +	}
> +	return 0;
> +}
> diff --git a/lib/acl/meson.build b/lib/acl/meson.build
> index 87e9f25f8e..2d2b8d46c6 100644
> --- a/lib/acl/meson.build
> +++ b/lib/acl/meson.build
> @@ -25,4 +25,6 @@ elif dpdk_conf.has('RTE_ARCH_ARM')
>      sources += files('acl_run_neon.c')
>  elif dpdk_conf.has('RTE_ARCH_PPC_64')
>      sources += files('acl_run_altivec.c')
> -endif
> +elif dpdk_conf.has('RTE_ARCH_RISCV')
> +    sources += files('acl_run_rvv.c')
> +endif
> \ No newline at end of file
> diff --git a/lib/acl/rte_acl.c b/lib/acl/rte_acl.c
> index 3f2b194206..8fc54e8037 100644
> --- a/lib/acl/rte_acl.c
> +++ b/lib/acl/rte_acl.c
> @@ -8,6 +8,10 @@
>  #include <rte_acl.h>
>  #include <rte_tailq.h>
> 
> +#if defined(RTE_ARCH_RISCV) && defined(RTE_RISCV_FEATURE_V)
> +#include <riscv_vector.h>
> +#endif
> +
>  #include "acl.h"
>  #include "acl_log.h"
> 
> @@ -94,6 +98,18 @@ rte_acl_classify_altivec(__rte_unused const struct
> rte_acl_ctx *ctx,
>  }
>  #endif
> 
> +#ifndef RTE_ARCH_RISCV
> +int
> +rte_acl_classify_rvv(__rte_unused const struct rte_acl_ctx *ctx,
> +	__rte_unused const uint8_t **data,
> +	__rte_unused uint32_t *results,
> +	__rte_unused uint32_t num,
> +	__rte_unused uint32_t categories)
> +{
> +	return -ENOTSUP;
> +}
> +#endif
> +
>  static const rte_acl_classify_t classify_fns[] = {
>  	[RTE_ACL_CLASSIFY_DEFAULT] = rte_acl_classify_scalar,
>  	[RTE_ACL_CLASSIFY_SCALAR] = rte_acl_classify_scalar,
> @@ -103,6 +119,7 @@ static const rte_acl_classify_t classify_fns[] = {
>  	[RTE_ACL_CLASSIFY_ALTIVEC] = rte_acl_classify_altivec,
>  	[RTE_ACL_CLASSIFY_AVX512X16] = rte_acl_classify_avx512x16,
>  	[RTE_ACL_CLASSIFY_AVX512X32] = rte_acl_classify_avx512x32,
> +	[RTE_ACL_CLASSIFY_RVV] = rte_acl_classify_rvv,
>  };
> 
>  /*
> @@ -202,6 +219,28 @@ acl_check_alg_x86(enum rte_acl_classify_alg alg)
>  	return -EINVAL;
>  }
> 
> +
> +/*
> + * Helper function for acl_check_alg.
> + * Check support for RISCV specific classify methods.
> + */
> +static int
> +acl_check_alg_rvv(enum rte_acl_classify_alg alg)
> +{
> +	if (alg == RTE_ACL_CLASSIFY_RVV) {
> +#if defined(RTE_RISCV_FEATURE_V)
> +		if (__riscv_vsetvl_e32m1(RTE_ACL_RESULTS_MULTIPLIER) >=
> +				RTE_ACL_RESULTS_MULTIPLIER &&
> +				__riscv_vsetvl_e32m1(4) >= 4)
> +			return 0;
> +#endif
> +		return -ENOTSUP;
> +	}
> +
> +	return -EINVAL;
> +}
> +
> +
>  /*
>   * Check if input alg is supported by given platform/binary.
>   * Note that both conditions should be met:
> @@ -221,6 +260,8 @@ acl_check_alg(enum rte_acl_classify_alg alg)
>  	case RTE_ACL_CLASSIFY_AVX2:
>  	case RTE_ACL_CLASSIFY_SSE:
>  		return acl_check_alg_x86(alg);
> +	case RTE_ACL_CLASSIFY_RVV:
> +		return acl_check_alg_rvv(alg);
>  	/* scalar method is supported on all platforms */
>  	case RTE_ACL_CLASSIFY_SCALAR:
>  		return 0;
> @@ -249,6 +290,8 @@ acl_get_best_alg(void)
>  		RTE_ACL_CLASSIFY_AVX512X16,
>  		RTE_ACL_CLASSIFY_AVX2,
>  		RTE_ACL_CLASSIFY_SSE,
> +#elif defined(RTE_ARCH_RISCV) && defined(RTE_RISCV_FEATURE_V)
> +		RTE_ACL_CLASSIFY_RVV,
>  #endif
>  		RTE_ACL_CLASSIFY_SCALAR,
>  	};
> diff --git a/lib/acl/rte_acl.h b/lib/acl/rte_acl.h
> index 0db4600cbe..0e9d09511d 100644
> --- a/lib/acl/rte_acl.h
> +++ b/lib/acl/rte_acl.h
> @@ -303,6 +303,7 @@ enum rte_acl_classify_alg {
>  	RTE_ACL_CLASSIFY_ALTIVEC = 5,    /**< requires ALTIVEC support. */
>  	RTE_ACL_CLASSIFY_AVX512X16 = 6,  /**< requires AVX512 support. */
>  	RTE_ACL_CLASSIFY_AVX512X32 = 7,  /**< requires AVX512 support. */
> +	RTE_ACL_CLASSIFY_RVV = 8,  /**< requires RVV support. */
>  };
> 
>  /**
> --
> 2.43.0
>

next prev parent reply	other threads:[~2026-03-12  8:50 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-09 12:28 [PATCH] acl: classify with RISC-V vector extension P1erreCashon
2026-03-12  8:50 ` Konstantin Ananyev [this message]
2026-03-28 12:04   ` sunyuechi

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ff8d34eecbea40208537696b4ece33ed@huawei.com \
    --to=konstantin.ananyev@huawei.com \
    --cc=2022302111412@whu.edu.cn \
    --cc=dev@dpdk.org \
    --cc=gongxiaofei24@iscas.ac.cn \
    --cc=stanislaw.kardach@gmail.com \
    --cc=sunyuechi@iscas.ac.cn \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.