* [PATCH] test/flow: add support for async API
@ 2026-02-24 10:56 Maxime Peim
2026-02-25 22:23 ` Stephen Hemminger
0 siblings, 1 reply; 8+ messages in thread
From: Maxime Peim @ 2026-02-24 10:56 UTC (permalink / raw)
To: Wisam Jaddo; +Cc: Maxime Peim, dev
Add async flow API mode to test-flow-perf application for improved
flow rule insertion performance. The async API allows batching flow
rule creation operations and processing completions in bulk, reducing
per-rule overhead.
New command line options:
--async: enable async flow API mode
--async-queue-size=N: size of async queues (default: 1024)
--async-push-batch=N: flows to batch before push (default: 256)
Signed-off-by: Maxime Peim <maxime.peim@gmail.com>
---
app/test-flow-perf/actions_gen.c | 172 +++++++++++++
app/test-flow-perf/actions_gen.h | 4 +
app/test-flow-perf/async_flow.c | 239 ++++++++++++++++++
app/test-flow-perf/async_flow.h | 41 ++++
app/test-flow-perf/items_gen.c | 13 +
app/test-flow-perf/items_gen.h | 4 +
app/test-flow-perf/main.c | 410 ++++++++++++++++++++++++-------
app/test-flow-perf/meson.build | 1 +
8 files changed, 798 insertions(+), 86 deletions(-)
create mode 100644 app/test-flow-perf/async_flow.c
create mode 100644 app/test-flow-perf/async_flow.h
diff --git a/app/test-flow-perf/actions_gen.c b/app/test-flow-perf/actions_gen.c
index 9d102e3af4..af5ed2b30a 100644
--- a/app/test-flow-perf/actions_gen.c
+++ b/app/test-flow-perf/actions_gen.c
@@ -1165,3 +1165,175 @@ fill_actions(struct rte_flow_action *actions, uint64_t *flow_actions,
free(queues);
free(hairpin_queues);
}
+
+void
+fill_actions_template(struct rte_flow_action *actions, struct rte_flow_action *masks,
+ uint64_t *flow_actions, bool *need_wire_orig_table)
+{
+ uint8_t actions_counter = 0;
+ uint8_t i, j;
+
+ *need_wire_orig_table = false;
+
+ /* Static configurations for actions that need them in templates */
+ static struct rte_flow_action_mark mark_conf = {
+ .id = 1,
+ };
+ static struct rte_flow_action_queue queue_conf = {
+ .index = 0,
+ };
+ static struct rte_flow_action_port_id port_id_conf = {
+ .id = 0,
+ };
+ static struct rte_flow_action_jump jump_conf = {
+ .group = 1,
+ };
+ static struct rte_flow_action_modify_field set_meta_conf = {
+ .operation = RTE_FLOW_MODIFY_SET,
+ .dst =
+ {
+ .field = RTE_FLOW_FIELD_META,
+ },
+ .src =
+ {
+ .field = RTE_FLOW_FIELD_VALUE,
+ .value = {0, 0, 0, META_DATA},
+ },
+ .width = 32,
+ };
+
+ /* Static mask configurations for each action type */
+ static struct rte_flow_action_mark mark_mask = {
+ .id = UINT32_MAX,
+ };
+ static struct rte_flow_action_queue queue_mask = {
+ .index = UINT16_MAX,
+ };
+ static struct rte_flow_action_jump jump_mask = {
+ .group = UINT32_MAX,
+ };
+ static struct rte_flow_action_rss rss_mask = {
+ .level = UINT32_MAX,
+ .types = UINT64_MAX,
+ };
+ static struct rte_flow_action_set_meta set_meta_mask = {
+ .data = UINT32_MAX,
+ .mask = UINT32_MAX,
+ };
+ static struct rte_flow_action_set_tag set_tag_mask = {
+ .data = UINT32_MAX,
+ .mask = UINT32_MAX,
+ .index = UINT8_MAX,
+ };
+ static struct rte_flow_action_port_id port_id_mask = {
+ .id = UINT32_MAX,
+ };
+ static struct rte_flow_action_count count_mask;
+ static struct rte_flow_action_set_mac set_mac_mask = {
+ .mac_addr = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ };
+ static struct rte_flow_action_set_ipv4 set_ipv4_mask = {
+ .ipv4_addr = UINT32_MAX,
+ };
+ static struct rte_flow_action_set_ipv6 set_ipv6_mask;
+ static struct rte_flow_action_set_tp set_tp_mask = {
+ .port = UINT16_MAX,
+ };
+ static rte_be32_t tcp_seq_ack_mask = UINT32_MAX;
+ static struct rte_flow_action_set_ttl set_ttl_mask = {
+ .ttl_value = UINT8_MAX,
+ };
+ static struct rte_flow_action_set_dscp set_dscp_mask = {
+ .dscp = UINT8_MAX,
+ };
+ static struct rte_flow_action_meter meter_mask = {
+ .mtr_id = UINT32_MAX,
+ };
+
+ /* Initialize ipv6 mask */
+ memset(set_ipv6_mask.ipv6_addr.a, 0xff, 16);
+
+ static const struct {
+ uint64_t flow_mask;
+ enum rte_flow_action_type type;
+ const void *action_conf;
+ const void *action_mask;
+ const bool need_wire_orig_table;
+ } template_actions[] = {
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_MARK), RTE_FLOW_ACTION_TYPE_MARK, &mark_conf,
+ &mark_mask, true},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_COUNT), RTE_FLOW_ACTION_TYPE_COUNT, NULL,
+ &count_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_MODIFY_FIELD),
+ RTE_FLOW_ACTION_TYPE_MODIFY_FIELD, &set_meta_conf, &set_meta_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TAG), RTE_FLOW_ACTION_TYPE_SET_TAG, NULL,
+ &set_tag_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_FLAG), RTE_FLOW_ACTION_TYPE_FLAG, NULL, NULL,
+ false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_MAC_SRC),
+ RTE_FLOW_ACTION_TYPE_SET_MAC_SRC, NULL, &set_mac_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_MAC_DST),
+ RTE_FLOW_ACTION_TYPE_SET_MAC_DST, NULL, &set_mac_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC),
+ RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC, NULL, &set_ipv4_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV4_DST),
+ RTE_FLOW_ACTION_TYPE_SET_IPV4_DST, NULL, &set_ipv4_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC),
+ RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC, NULL, &set_ipv6_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV6_DST),
+ RTE_FLOW_ACTION_TYPE_SET_IPV6_DST, NULL, &set_ipv6_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TP_SRC), RTE_FLOW_ACTION_TYPE_SET_TP_SRC,
+ NULL, &set_tp_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TP_DST), RTE_FLOW_ACTION_TYPE_SET_TP_DST,
+ NULL, &set_tp_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_INC_TCP_ACK),
+ RTE_FLOW_ACTION_TYPE_INC_TCP_ACK, NULL, &tcp_seq_ack_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK),
+ RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK, NULL, &tcp_seq_ack_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ),
+ RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ, NULL, &tcp_seq_ack_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ),
+ RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ, NULL, &tcp_seq_ack_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TTL), RTE_FLOW_ACTION_TYPE_SET_TTL, NULL,
+ &set_ttl_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DEC_TTL), RTE_FLOW_ACTION_TYPE_DEC_TTL, NULL,
+ NULL, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP),
+ RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP, NULL, &set_dscp_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP),
+ RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP, NULL, &set_dscp_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_QUEUE), RTE_FLOW_ACTION_TYPE_QUEUE,
+ &queue_conf, &queue_mask, true},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_RSS), RTE_FLOW_ACTION_TYPE_RSS, NULL,
+ &rss_mask, true},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_JUMP), RTE_FLOW_ACTION_TYPE_JUMP, &jump_conf,
+ &jump_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_PORT_ID), RTE_FLOW_ACTION_TYPE_PORT_ID,
+ &port_id_conf, &port_id_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DROP), RTE_FLOW_ACTION_TYPE_DROP, NULL, NULL,
+ false},
+ {HAIRPIN_QUEUE_ACTION, RTE_FLOW_ACTION_TYPE_QUEUE, &queue_conf, &queue_mask, false},
+ {HAIRPIN_RSS_ACTION, RTE_FLOW_ACTION_TYPE_RSS, NULL, &rss_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_METER), RTE_FLOW_ACTION_TYPE_METER, NULL,
+ &meter_mask, false},
+ };
+
+ for (j = 0; j < MAX_ACTIONS_NUM; j++) {
+ if (flow_actions[j] == 0)
+ break;
+ for (i = 0; i < RTE_DIM(template_actions); i++) {
+ if ((flow_actions[j] & template_actions[i].flow_mask) == 0)
+ continue;
+ actions[actions_counter].type = template_actions[i].type;
+ actions[actions_counter].conf = template_actions[i].action_conf;
+ masks[actions_counter].type = template_actions[i].type;
+ masks[actions_counter].conf = template_actions[i].action_mask;
+ *need_wire_orig_table |= template_actions[i].need_wire_orig_table;
+ actions_counter++;
+ break;
+ }
+ }
+
+ actions[actions_counter].type = RTE_FLOW_ACTION_TYPE_END;
+ masks[actions_counter].type = RTE_FLOW_ACTION_TYPE_END;
+}
diff --git a/app/test-flow-perf/actions_gen.h b/app/test-flow-perf/actions_gen.h
index 9e13b164f9..7450d45ef7 100644
--- a/app/test-flow-perf/actions_gen.h
+++ b/app/test-flow-perf/actions_gen.h
@@ -22,4 +22,8 @@ void fill_actions(struct rte_flow_action *actions, uint64_t *flow_actions,
uint64_t encap_data, uint64_t decap_data, uint8_t core_idx,
bool unique_data, uint8_t rx_queues_count, uint16_t dst_port);
+/* Fill actions template for async flow API (types only, no values) */
+void fill_actions_template(struct rte_flow_action *actions, struct rte_flow_action *masks,
+ uint64_t *flow_actions, bool *need_wire_orig_table);
+
#endif /* FLOW_PERF_ACTION_GEN */
diff --git a/app/test-flow-perf/async_flow.c b/app/test-flow-perf/async_flow.c
new file mode 100644
index 0000000000..ba12012c85
--- /dev/null
+++ b/app/test-flow-perf/async_flow.c
@@ -0,0 +1,239 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2026 Mellanox Technologies, Ltd
+ *
+ * This file contains the async flow API implementation
+ * for the flow-perf application.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include <rte_ethdev.h>
+#include <rte_flow.h>
+#include <rte_malloc.h>
+
+#include "actions_gen.h"
+#include "async_flow.h"
+#include "flow_gen.h"
+#include "items_gen.h"
+
+/* Per-port async flow resources */
+static struct async_flow_resources port_resources[MAX_PORTS];
+
+int
+async_flow_init_port(uint16_t port_id, uint32_t nb_queues, uint32_t queue_size,
+		     uint64_t *flow_items, uint64_t *flow_actions, uint64_t *flow_attrs,
+		     uint8_t flow_group, uint32_t rules_count)
+{
+	struct rte_flow_port_info port_info = {0};
+	struct rte_flow_queue_info queue_info = {0};
+	struct rte_flow_error error = {0};
+	struct rte_flow_port_attr port_attr = {0};
+	struct rte_flow_queue_attr *queue_attr = alloca(sizeof(struct rte_flow_queue_attr));
+	const struct rte_flow_queue_attr **queue_attr_list =
+		alloca(sizeof(*queue_attr_list) * nb_queues);
+	struct rte_flow_pattern_template_attr pt_attr = {0};
+	struct rte_flow_actions_template_attr at_attr = {0};
+	struct rte_flow_template_table_attr table_attr = {0};
+	struct rte_flow_item pattern[MAX_ITEMS_NUM];
+	struct rte_flow_action actions[MAX_ACTIONS_NUM];
+	struct rte_flow_action action_masks[MAX_ACTIONS_NUM];
+	struct async_flow_resources *res;
+	bool need_wire_orig_table = false;
+	uint32_t i;
+	int ret;
+
+	if (port_id >= MAX_PORTS)
+		return -1;
+
+	res = &port_resources[port_id];
+	memset(res, 0, sizeof(*res));
+
+	/* Query port flow info */
+	ret = rte_flow_info_get(port_id, &port_info, &queue_info, &error);
+	if (ret != 0) {
+		fprintf(stderr, "Port %u: rte_flow_info_get failed: %s\n", port_id,
+			error.message ? error.message : "(no message)");
+		return ret;
+	}
+
+	/* Limit to device capabilities if reported */
+	if (port_info.max_nb_queues != 0 && port_info.max_nb_queues != UINT32_MAX &&
+	    nb_queues > port_info.max_nb_queues)
+		nb_queues = port_info.max_nb_queues;
+	if (queue_info.max_size != 0 && queue_info.max_size != UINT32_MAX &&
+	    queue_size > queue_info.max_size)
+		queue_size = queue_info.max_size;
+
+	*queue_attr = (struct rte_flow_queue_attr){ .size = queue_size };
+	for (i = 0; i < nb_queues; i++)
+		queue_attr_list[i] = queue_attr;
+
+	ret = rte_flow_configure(port_id, &port_attr, nb_queues, queue_attr_list, &error);
+	if (ret != 0) {
+		fprintf(stderr, "Port %u: rte_flow_configure failed (ret=%d, type=%d): %s\n",
+			port_id, ret, error.type, error.message ? error.message : "(no message)");
+		return ret;
+	}
+
+	/* Create pattern template */
+	for (i = 0; i < MAX_ATTRS_NUM; i++) {
+		if (flow_attrs[i] == 0)
+			break;
+		if (flow_attrs[i] & INGRESS)
+			pt_attr.ingress = 1;
+		else if (flow_attrs[i] & EGRESS)
+			pt_attr.egress = 1;
+		else if (flow_attrs[i] & TRANSFER)
+			pt_attr.transfer = 1;
+	}
+	/* Enable relaxed matching for better performance */
+	pt_attr.relaxed_matching = 1;
+
+	memset(pattern, 0, sizeof(pattern));
+	memset(actions, 0, sizeof(actions));
+	memset(action_masks, 0, sizeof(action_masks));
+
+	fill_items_template(pattern, flow_items, 0, 0);
+
+	res->pattern_template =
+		rte_flow_pattern_template_create(port_id, &pt_attr, pattern, &error);
+	if (res->pattern_template == NULL) {
+		fprintf(stderr, "Port %u: pattern template create failed: %s\n", port_id,
+			error.message ? error.message : "(no message)");
+		return -1;
+	}
+
+	/* Create actions template */
+	at_attr.ingress = pt_attr.ingress;
+	at_attr.egress = pt_attr.egress;
+	at_attr.transfer = pt_attr.transfer;
+
+	fill_actions_template(actions, action_masks, flow_actions, &need_wire_orig_table);
+
+	res->actions_template =
+		rte_flow_actions_template_create(port_id, &at_attr, actions, action_masks, &error);
+	if (res->actions_template == NULL) {
+		fprintf(stderr, "Port %u: actions template create failed: %s\n", port_id,
+			error.message ? error.message : "(no message)");
+		rte_flow_pattern_template_destroy(port_id, res->pattern_template, &error);
+		res->pattern_template = NULL;
+		return -1;
+	}
+
+	/* Create template table */
+	table_attr.flow_attr.group = flow_group;
+	table_attr.flow_attr.priority = 0;
+	table_attr.flow_attr.ingress = pt_attr.ingress;
+	table_attr.flow_attr.egress = pt_attr.egress;
+	table_attr.flow_attr.transfer = pt_attr.transfer;
+	table_attr.nb_flows = rules_count;
+
+	if (pt_attr.transfer && need_wire_orig_table)
+		table_attr.specialize = RTE_FLOW_TABLE_SPECIALIZE_TRANSFER_WIRE_ORIG;
+
+	res->table = rte_flow_template_table_create(port_id, &table_attr, &res->pattern_template, 1,
+						    &res->actions_template, 1, &error);
+	if (res->table == NULL) {
+		fprintf(stderr, "Port %u: template table create failed: %s\n", port_id,
+			error.message ? error.message : "(no message)");
+		rte_flow_actions_template_destroy(port_id, res->actions_template, &error);
+		rte_flow_pattern_template_destroy(port_id, res->pattern_template, &error);
+		res->pattern_template = NULL;
+		res->actions_template = NULL;
+		return -1;
+	}
+
+	res->table_capacity = rules_count;
+	res->initialized = true;
+
+	printf("Port %u: Async flow engine initialized (queues=%u, queue_size=%u)\n", port_id,
+	       nb_queues, queue_size);
+
+	return 0;
+}
+
+struct rte_flow *
+async_generate_flow(uint16_t port_id, uint32_t queue_id, uint64_t *flow_items,
+ uint64_t *flow_actions, uint32_t counter, uint16_t hairpinq,
+ uint64_t encap_data, uint64_t decap_data, uint16_t dst_port, uint8_t core_idx,
+ uint8_t rx_queues_count, bool unique_data, bool postpone,
+ struct rte_flow_error *error)
+{
+ struct async_flow_resources *res;
+ struct rte_flow_item items[MAX_ITEMS_NUM];
+ struct rte_flow_action actions[MAX_ACTIONS_NUM];
+ struct rte_flow_op_attr op_attr = {
+ .postpone = postpone,
+ };
+ struct rte_flow *flow;
+
+ if (port_id >= MAX_PORTS) {
+ rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Invalid port ID");
+ return NULL;
+ }
+
+ res = &port_resources[port_id];
+ if (!res->initialized) {
+ rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Async flow resources not initialized");
+ return NULL;
+ }
+
+ /* Fill pattern items with actual values */
+ memset(items, 0, sizeof(items));
+ fill_items(items, flow_items, counter, core_idx);
+
+ /* Fill actions with actual values */
+ memset(actions, 0, sizeof(actions));
+ fill_actions(actions, flow_actions, counter, JUMP_ACTION_TABLE, hairpinq, encap_data,
+ decap_data, core_idx, unique_data, rx_queues_count, dst_port);
+
+ /* Create flow asynchronously */
+ flow = rte_flow_async_create(port_id, queue_id, &op_attr, res->table, items, 0, actions, 0,
+ NULL, error);
+
+ return flow;
+}
+
+void
+async_flow_cleanup_port(uint16_t port_id)
+{
+ struct async_flow_resources *res;
+ struct rte_flow_error error;
+ struct rte_flow_op_result results[64];
+ int ret, i;
+
+ if (port_id >= MAX_PORTS)
+ return;
+
+ res = &port_resources[port_id];
+ if (!res->initialized)
+ return;
+
+ /* Drain any pending async completions from flow flush */
+ for (i = 0; i < 100; i++) { /* Max iterations to avoid infinite loop */
+ rte_flow_push(port_id, 0, &error);
+ ret = rte_flow_pull(port_id, 0, results, 64, &error);
+ if (ret <= 0)
+ break;
+ }
+
+ if (res->table != NULL) {
+ rte_flow_template_table_destroy(port_id, res->table, &error);
+ res->table = NULL;
+ }
+
+ if (res->actions_template != NULL) {
+ rte_flow_actions_template_destroy(port_id, res->actions_template, &error);
+ res->actions_template = NULL;
+ }
+
+ if (res->pattern_template != NULL) {
+ rte_flow_pattern_template_destroy(port_id, res->pattern_template, &error);
+ res->pattern_template = NULL;
+ }
+
+ res->initialized = false;
+}
diff --git a/app/test-flow-perf/async_flow.h b/app/test-flow-perf/async_flow.h
new file mode 100644
index 0000000000..2684fc4156
--- /dev/null
+++ b/app/test-flow-perf/async_flow.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2026 Mellanox Technologies, Ltd
+ *
+ * This file contains the async flow API related definitions
+ * and function declarations.
+ */
+
+#ifndef FLOW_PERF_ASYNC_FLOW
+#define FLOW_PERF_ASYNC_FLOW
+
+#include <rte_flow.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "config.h"
+
+/* Per-port async flow resources */
+struct async_flow_resources {
+ struct rte_flow_pattern_template *pattern_template;
+ struct rte_flow_actions_template *actions_template;
+ struct rte_flow_template_table *table;
+ uint32_t table_capacity;
+ bool initialized;
+};
+
+/* Initialize async flow engine for a port */
+int async_flow_init_port(uint16_t port_id, uint32_t nb_queues, uint32_t queue_size,
+ uint64_t *flow_items, uint64_t *flow_actions, uint64_t *flow_attrs,
+ uint8_t flow_group, uint32_t rules_count);
+
+/* Create a flow rule asynchronously */
+struct rte_flow *async_generate_flow(uint16_t port_id, uint32_t queue_id, uint64_t *flow_items,
+ uint64_t *flow_actions, uint32_t counter, uint16_t hairpinq,
+ uint64_t encap_data, uint64_t decap_data, uint16_t dst_port,
+ uint8_t core_idx, uint8_t rx_queues_count, bool unique_data,
+ bool postpone, struct rte_flow_error *error);
+
+/* Cleanup async flow resources for a port */
+void async_flow_cleanup_port(uint16_t port_id);
+
+#endif /* FLOW_PERF_ASYNC_FLOW */
diff --git a/app/test-flow-perf/items_gen.c b/app/test-flow-perf/items_gen.c
index c740e1838f..4f20175f01 100644
--- a/app/test-flow-perf/items_gen.c
+++ b/app/test-flow-perf/items_gen.c
@@ -389,3 +389,16 @@ fill_items(struct rte_flow_item *items,
items[items_counter].type = RTE_FLOW_ITEM_TYPE_END;
}
+
+void
+fill_items_template(struct rte_flow_item *items, uint64_t *flow_items, uint32_t outer_ip_src,
+ uint8_t core_idx)
+{
+ uint8_t i;
+
+ fill_items(items, flow_items, outer_ip_src, core_idx);
+
+ /* For templates, set spec to NULL - only mask matters for template matching */
+ for (i = 0; items[i].type != RTE_FLOW_ITEM_TYPE_END; i++)
+ items[i].spec = NULL;
+}
diff --git a/app/test-flow-perf/items_gen.h b/app/test-flow-perf/items_gen.h
index f4b0e9a981..50bb4d9fd0 100644
--- a/app/test-flow-perf/items_gen.h
+++ b/app/test-flow-perf/items_gen.h
@@ -15,4 +15,8 @@
void fill_items(struct rte_flow_item *items, uint64_t *flow_items,
uint32_t outer_ip_src, uint8_t core_idx);
+/* Fill items template for async flow API (masks only, no spec values) */
+void fill_items_template(struct rte_flow_item *items, uint64_t *flow_items, uint32_t outer_ip_src,
+ uint8_t core_idx);
+
#endif /* FLOW_PERF_ITEMS_GEN */
diff --git a/app/test-flow-perf/main.c b/app/test-flow-perf/main.c
index 6636d1517f..32f2260ba0 100644
--- a/app/test-flow-perf/main.c
+++ b/app/test-flow-perf/main.c
@@ -37,9 +37,11 @@
#include <rte_mtr.h>
#include <rte_os_shim.h>
-#include "config.h"
#include "actions_gen.h"
+#include "async_flow.h"
+#include "config.h"
#include "flow_gen.h"
+#include "rte_common.h"
#define MAX_BATCHES_COUNT 100
#define DEFAULT_RULES_COUNT 4000000
@@ -81,6 +83,9 @@ static bool enable_fwd;
static bool unique_data;
static bool policy_mtr;
static bool packet_mode;
+static bool async_mode;
+static uint32_t async_queue_size = 1024;
+static uint32_t async_push_batch = 256;
static uint8_t rx_queues_count;
static uint8_t tx_queues_count;
@@ -598,6 +603,13 @@ usage(char *progname)
"Encapped data is fixed with pattern: ether,ipv4,udp,vxlan\n"
"With fixed values\n");
printf(" --vxlan-decap: add vxlan_decap action to flow actions\n");
+
+ printf("\nAsync flow API options:\n");
+ printf(" --async: enable async flow API mode\n");
+ printf(" --async-queue-size=N: size of each async queue,"
+ " default is 1024\n");
+ printf(" --async-push-batch=N: flows to batch before push,"
+ " default is 256\n");
}
static void
@@ -655,86 +667,90 @@ args_parse(int argc, char **argv)
static const struct option lgopts[] = {
/* Control */
- { "help", 0, 0, 0 },
- { "rules-count", 1, 0, 0 },
- { "rules-batch", 1, 0, 0 },
- { "dump-iterations", 0, 0, 0 },
- { "deletion-rate", 0, 0, 0 },
- { "query-rate", 0, 0, 0 },
- { "dump-socket-mem", 0, 0, 0 },
- { "enable-fwd", 0, 0, 0 },
- { "unique-data", 0, 0, 0 },
- { "portmask", 1, 0, 0 },
- { "hairpin-conf", 1, 0, 0 },
- { "cores", 1, 0, 0 },
- { "random-priority", 1, 0, 0 },
- { "meter-profile-alg", 1, 0, 0 },
- { "rxq", 1, 0, 0 },
- { "txq", 1, 0, 0 },
- { "rxd", 1, 0, 0 },
- { "txd", 1, 0, 0 },
- { "mbuf-size", 1, 0, 0 },
- { "mbuf-cache-size", 1, 0, 0 },
- { "total-mbuf-count", 1, 0, 0 },
+ {"help", 0, 0, 0},
+ {"rules-count", 1, 0, 0},
+ {"rules-batch", 1, 0, 0},
+ {"dump-iterations", 0, 0, 0},
+ {"deletion-rate", 0, 0, 0},
+ {"query-rate", 0, 0, 0},
+ {"dump-socket-mem", 0, 0, 0},
+ {"enable-fwd", 0, 0, 0},
+ {"unique-data", 0, 0, 0},
+ {"portmask", 1, 0, 0},
+ {"hairpin-conf", 1, 0, 0},
+ {"cores", 1, 0, 0},
+ {"random-priority", 1, 0, 0},
+ {"meter-profile-alg", 1, 0, 0},
+ {"rxq", 1, 0, 0},
+ {"txq", 1, 0, 0},
+ {"rxd", 1, 0, 0},
+ {"txd", 1, 0, 0},
+ {"mbuf-size", 1, 0, 0},
+ {"mbuf-cache-size", 1, 0, 0},
+ {"total-mbuf-count", 1, 0, 0},
/* Attributes */
- { "ingress", 0, 0, 0 },
- { "egress", 0, 0, 0 },
- { "transfer", 0, 0, 0 },
- { "group", 1, 0, 0 },
+ {"ingress", 0, 0, 0},
+ {"egress", 0, 0, 0},
+ {"transfer", 0, 0, 0},
+ {"group", 1, 0, 0},
/* Items */
- { "ether", 0, 0, 0 },
- { "vlan", 0, 0, 0 },
- { "ipv4", 0, 0, 0 },
- { "ipv6", 0, 0, 0 },
- { "tcp", 0, 0, 0 },
- { "udp", 0, 0, 0 },
- { "vxlan", 0, 0, 0 },
- { "vxlan-gpe", 0, 0, 0 },
- { "gre", 0, 0, 0 },
- { "geneve", 0, 0, 0 },
- { "gtp", 0, 0, 0 },
- { "meta", 0, 0, 0 },
- { "tag", 0, 0, 0 },
- { "icmpv4", 0, 0, 0 },
- { "icmpv6", 0, 0, 0 },
+ {"ether", 0, 0, 0},
+ {"vlan", 0, 0, 0},
+ {"ipv4", 0, 0, 0},
+ {"ipv6", 0, 0, 0},
+ {"tcp", 0, 0, 0},
+ {"udp", 0, 0, 0},
+ {"vxlan", 0, 0, 0},
+ {"vxlan-gpe", 0, 0, 0},
+ {"gre", 0, 0, 0},
+ {"geneve", 0, 0, 0},
+ {"gtp", 0, 0, 0},
+ {"meta", 0, 0, 0},
+ {"tag", 0, 0, 0},
+ {"icmpv4", 0, 0, 0},
+ {"icmpv6", 0, 0, 0},
/* Actions */
- { "port-id", 2, 0, 0 },
- { "rss", 0, 0, 0 },
- { "queue", 0, 0, 0 },
- { "jump", 0, 0, 0 },
- { "mark", 0, 0, 0 },
- { "count", 0, 0, 0 },
- { "set-meta", 0, 0, 0 },
- { "set-tag", 0, 0, 0 },
- { "drop", 0, 0, 0 },
- { "hairpin-queue", 1, 0, 0 },
- { "hairpin-rss", 1, 0, 0 },
- { "set-src-mac", 0, 0, 0 },
- { "set-dst-mac", 0, 0, 0 },
- { "set-src-ipv4", 0, 0, 0 },
- { "set-dst-ipv4", 0, 0, 0 },
- { "set-src-ipv6", 0, 0, 0 },
- { "set-dst-ipv6", 0, 0, 0 },
- { "set-src-tp", 0, 0, 0 },
- { "set-dst-tp", 0, 0, 0 },
- { "inc-tcp-ack", 0, 0, 0 },
- { "dec-tcp-ack", 0, 0, 0 },
- { "inc-tcp-seq", 0, 0, 0 },
- { "dec-tcp-seq", 0, 0, 0 },
- { "set-ttl", 0, 0, 0 },
- { "dec-ttl", 0, 0, 0 },
- { "set-ipv4-dscp", 0, 0, 0 },
- { "set-ipv6-dscp", 0, 0, 0 },
- { "flag", 0, 0, 0 },
- { "meter", 0, 0, 0 },
- { "raw-encap", 1, 0, 0 },
- { "raw-decap", 1, 0, 0 },
- { "vxlan-encap", 0, 0, 0 },
- { "vxlan-decap", 0, 0, 0 },
- { "policy-mtr", 1, 0, 0 },
- { "meter-profile", 1, 0, 0 },
- { "packet-mode", 0, 0, 0 },
- { 0, 0, 0, 0 },
+ {"port-id", 2, 0, 0},
+ {"rss", 0, 0, 0},
+ {"queue", 0, 0, 0},
+ {"jump", 0, 0, 0},
+ {"mark", 0, 0, 0},
+ {"count", 0, 0, 0},
+ {"set-meta", 0, 0, 0},
+ {"set-tag", 0, 0, 0},
+ {"drop", 0, 0, 0},
+ {"hairpin-queue", 1, 0, 0},
+ {"hairpin-rss", 1, 0, 0},
+ {"set-src-mac", 0, 0, 0},
+ {"set-dst-mac", 0, 0, 0},
+ {"set-src-ipv4", 0, 0, 0},
+ {"set-dst-ipv4", 0, 0, 0},
+ {"set-src-ipv6", 0, 0, 0},
+ {"set-dst-ipv6", 0, 0, 0},
+ {"set-src-tp", 0, 0, 0},
+ {"set-dst-tp", 0, 0, 0},
+ {"inc-tcp-ack", 0, 0, 0},
+ {"dec-tcp-ack", 0, 0, 0},
+ {"inc-tcp-seq", 0, 0, 0},
+ {"dec-tcp-seq", 0, 0, 0},
+ {"set-ttl", 0, 0, 0},
+ {"dec-ttl", 0, 0, 0},
+ {"set-ipv4-dscp", 0, 0, 0},
+ {"set-ipv6-dscp", 0, 0, 0},
+ {"flag", 0, 0, 0},
+ {"meter", 0, 0, 0},
+ {"raw-encap", 1, 0, 0},
+ {"raw-decap", 1, 0, 0},
+ {"vxlan-encap", 0, 0, 0},
+ {"vxlan-decap", 0, 0, 0},
+ {"policy-mtr", 1, 0, 0},
+ {"meter-profile", 1, 0, 0},
+ {"packet-mode", 0, 0, 0},
+ /* Async flow API options */
+ {"async", 0, 0, 0},
+ {"async-queue-size", 1, 0, 0},
+ {"async-push-batch", 1, 0, 0},
+ {0, 0, 0, 0},
};
RTE_ETH_FOREACH_DEV(i)
@@ -913,14 +929,15 @@ args_parse(int argc, char **argv)
rte_exit(EXIT_FAILURE, "Invalid hairpin config mask\n");
hairpin_conf_mask = hp_conf;
}
- if (strcmp(lgopts[opt_idx].name,
- "port-id") == 0) {
+ if (strcmp(lgopts[opt_idx].name, "port-id") == 0) {
uint16_t port_idx = 0;
- token = strtok(optarg, ",");
- while (token != NULL) {
- dst_ports[port_idx++] = atoi(token);
- token = strtok(NULL, ",");
+ if (optarg != NULL) {
+ token = strtok(optarg, ",");
+ while (token != NULL) {
+ dst_ports[port_idx++] = atoi(token);
+ token = strtok(NULL, ",");
+ }
}
}
if (strcmp(lgopts[opt_idx].name, "rxq") == 0) {
@@ -981,6 +998,22 @@ args_parse(int argc, char **argv)
}
if (strcmp(lgopts[opt_idx].name, "packet-mode") == 0)
packet_mode = true;
+ if (strcmp(lgopts[opt_idx].name, "async") == 0)
+ async_mode = true;
+ if (strcmp(lgopts[opt_idx].name, "async-queue-size") == 0) {
+ n = atoi(optarg);
+ if (n > 0)
+ async_queue_size = n;
+ else
+ rte_exit(EXIT_FAILURE, "async-queue-size should be > 0\n");
+ }
+ if (strcmp(lgopts[opt_idx].name, "async-push-batch") == 0) {
+ n = atoi(optarg);
+ if (n > 0)
+ async_push_batch = n;
+ else
+ rte_exit(EXIT_FAILURE, "async-push-batch should be > 0\n");
+ }
break;
default:
usage(argv[0]);
@@ -1578,6 +1611,197 @@ insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id)
return flows_list;
}
+static inline int
+push_pull_flows_async(int port_id, int queue_id, uint64_t enqueued, uint64_t *in_flight,
+		      bool force_push, bool force_pull, bool check_op_status,
+		      struct rte_flow_op_result *results, struct rte_flow_error *error)
+{
+	/* Keep queue at most 75% full to avoid overflow */
+	uint32_t max_in_flight = (async_queue_size * 3) / 4;
+	int pulled, i;
+	int ret = 0;
+	bool do_pull = force_pull || *in_flight >= max_in_flight;
+	/* If we need to pull, we want all the in-flight work to have been pushed */
+	bool do_push = do_pull || force_push || (enqueued % async_push_batch) == 0;
+
+	/* Push periodically to give HW work to do */
+	if (do_push) {
+		ret = rte_flow_push(port_id, queue_id, error);
+		if (ret)
+			return ret;
+	}
+
+	/* Check if queue is getting full, if so push and drain completions */
+	if (do_pull) {
+		do {
+			pulled = rte_flow_pull(port_id, queue_id, results, async_push_batch, error);
+			if (pulled < 0) {
+				return -1;
+			} else if (pulled == 0) {
+				rte_pause();
+				continue;
+			}
+
+			*in_flight -= pulled;
+			if (!check_op_status)
+				continue;
+
+			for (i = 0; i < pulled; i++) {
+				if (results[i].status != RTE_FLOW_OP_SUCCESS) {
+					rte_flow_error_set(error, EINVAL,
+							   RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+							   "Some flow rule insertion failed");
+					return -1;
+				}
+			}
+		} while (*in_flight >= max_in_flight || (force_pull && *in_flight > 0));
+	}
+
+	return 0;
+}
+
+static struct rte_flow **
+insert_flows_async(int port_id, uint8_t core_id, uint16_t dst_port_id)
+{
+	struct rte_flow *flow, **flows_list;
+	struct rte_flow_error error;
+	struct rte_flow_op_result *results;
+	uint64_t start_batch, end_batch;
+	double first_flow_latency;
+	double cpu_time_used;
+	double insertion_rate;
+	double cpu_time_per_batch[MAX_BATCHES_COUNT] = {0};
+	double delta;
+	uint32_t flow_index;
+	uint32_t counter, start_counter = 0, end_counter;
+	int rules_batch_idx;
+	int rules_count_per_core;
+	uint64_t total_enqueued = 0;
+	uint64_t in_flight = 0;
+	uint32_t queue_id = core_id;
+
+	rules_count_per_core = rules_count / mc_pool.cores_count;
+
+	/* Set boundaries of rules for each core. */
+	if (core_id)
+		start_counter = core_id * rules_count_per_core;
+	end_counter = (core_id + 1) * rules_count_per_core;
+
+	flows_list = rte_zmalloc("flows_list",
+				 (sizeof(struct rte_flow *) * (rules_count_per_core + 1)), 0);
+	if (flows_list == NULL)
+		rte_exit(EXIT_FAILURE, "No Memory available!\n");
+
+	results = rte_zmalloc("results", sizeof(struct rte_flow_op_result) * async_push_batch, 0);
+	if (results == NULL) {
+		rte_free(flows_list);
+		rte_exit(EXIT_FAILURE, "No Memory available!\n");
+	}
+
+	cpu_time_used = 0;
+	flow_index = 0;
+	if (flow_group > 0 && core_id == 0) {
+		/*
+		 * Create global rule to jump into flow_group,
+		 * this way the app will avoid the default rules.
+		 *
+		 * This rule will be created only once.
+		 *
+		 * Global rule:
+		 * group 0 eth / end actions jump group <flow_group>
+		 */
+
+		uint64_t global_items[MAX_ITEMS_NUM] = {0};
+		uint64_t global_actions[MAX_ACTIONS_NUM] = {0};
+		global_items[0] = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH);
+		global_actions[0] = FLOW_ITEM_MASK(RTE_FLOW_ACTION_TYPE_JUMP);
+		flow = generate_flow(port_id, 0, flow_attrs, global_items, global_actions,
+				     flow_group, 0, 0, 0, 0, dst_port_id, core_id, rx_queues_count,
+				     unique_data, max_priority, &error);
+
+		if (flow == NULL) {
+			print_flow_error(error);
+			rte_exit(EXIT_FAILURE, "Error in creating flow\n");
+		}
+		flows_list[flow_index++] = flow;
+	}
+
+	start_batch = rte_get_timer_cycles();
+	for (counter = start_counter; counter < end_counter; counter++) {
+		if (push_pull_flows_async(port_id, queue_id, total_enqueued, &in_flight, false,
+					  false, false, results, &error)) {
+			print_flow_error(error);
+			rte_exit(EXIT_FAILURE, "Error push/pull async operations\n");
+		}
+
+		/* Create flow with postpone=true to batch operations */
+		flow = async_generate_flow(port_id, queue_id, flow_items, flow_actions, counter,
+					   hairpin_queues_num, encap_data, decap_data, dst_port_id,
+					   core_id, rx_queues_count, unique_data, true, &error);
+
+		if (counter == start_counter) {
+			first_flow_latency = (double)(rte_get_timer_cycles() - start_batch);
+			first_flow_latency /= rte_get_timer_hz();
+			/* In millisecond */
+			first_flow_latency *= 1000;
+			printf(":: First Flow Latency (Async) :: Port %d :: First flow "
+			       "installed in %f milliseconds\n",
+			       port_id, first_flow_latency);
+		}
+
+		if (force_quit)
+			break;
+
+		if (!flow) {
+			print_flow_error(error);
+			rte_exit(EXIT_FAILURE, "Error in creating async flow\n");
+		}
+
+		flows_list[flow_index++] = flow;
+		total_enqueued++;
+		in_flight++;
+
+		/*
+		 * Save the insertion rate for rules batch.
+		 * Check if the insertion reached the rules
+		 * patch counter, then save the insertion rate
+		 * for this batch.
+		 */
+		if (!((counter + 1) % rules_batch)) {
+			end_batch = rte_get_timer_cycles();
+			delta = (double)(end_batch - start_batch);
+			rules_batch_idx = ((counter + 1) / rules_batch) - 1;
+			cpu_time_per_batch[rules_batch_idx] = delta / rte_get_timer_hz();
+			cpu_time_used += cpu_time_per_batch[rules_batch_idx];
+			start_batch = rte_get_timer_cycles();
+		}
+	}
+
+	if (push_pull_flows_async(port_id, queue_id, total_enqueued, &in_flight, true, true, true,
+				  results, &error)) {
+		print_flow_error(error);
+		rte_exit(EXIT_FAILURE, "Error final push/pull async operations\n");
+	}
+
+	/* Print insertion rates for all batches */
+	if (dump_iterations)
+		print_rules_batches(cpu_time_per_batch);
+
+	printf(":: Port %d :: Core %d boundaries (Async) :: start @[%d] - end @[%d]\n", port_id,
+	       core_id, start_counter, end_counter - 1);
+
+	/* Insertion rate for all rules in one core */
+	insertion_rate = ((double)(rules_count_per_core / cpu_time_used) / 1000);
+	printf(":: Port %d :: Core %d :: Async rules insertion rate -> %f K Rule/Sec\n", port_id,
+	       core_id, insertion_rate);
+	printf(":: Port %d :: Core %d :: The time for creating %d async rules is %f seconds\n",
+	       port_id, core_id, rules_count_per_core, cpu_time_used);
+
+	rte_free(results);
+	mc_pool.flows_record.insertion[port_id][core_id] = cpu_time_used;
+	return flows_list;
+}
+
static void
flows_handler(uint8_t core_id)
{
@@ -1602,8 +1826,10 @@ flows_handler(uint8_t core_id)
mc_pool.last_alloc[core_id] = (int64_t)dump_socket_mem(stdout);
if (has_meter())
meters_handler(port_id, core_id, METER_CREATE);
- flows_list = insert_flows(port_id, core_id,
- dst_ports[port_idx++]);
+ if (async_mode)
+ flows_list = insert_flows_async(port_id, core_id, dst_ports[port_idx++]);
+ else
+ flows_list = insert_flows(port_id, core_id, dst_ports[port_idx++]);
if (flows_list == NULL)
rte_exit(EXIT_FAILURE, "Error: Insertion Failed!\n");
mc_pool.current_alloc[core_id] = (int64_t)dump_socket_mem(stdout);
@@ -2212,6 +2438,16 @@ init_port(void)
}
}
+ /* Configure async flow engine before device start */
+ if (async_mode) {
+ ret = async_flow_init_port(port_id, mc_pool.cores_count, async_queue_size,
+ flow_items, flow_actions, flow_attrs, flow_group,
+ rules_count);
+ if (ret != 0)
+ rte_exit(EXIT_FAILURE, "Failed to init async flow on port %d\n",
+ port_id);
+ }
+
ret = rte_eth_dev_start(port_id);
if (ret < 0)
rte_exit(EXIT_FAILURE,
@@ -2291,6 +2527,8 @@ main(int argc, char **argv)
RTE_ETH_FOREACH_DEV(port) {
rte_flow_flush(port, &error);
+ if (async_mode)
+ async_flow_cleanup_port(port);
if (rte_eth_dev_stop(port) != 0)
printf("Failed to stop device on port %u\n", port);
rte_eth_dev_close(port);
diff --git a/app/test-flow-perf/meson.build b/app/test-flow-perf/meson.build
index e101449e32..2f820a7597 100644
--- a/app/test-flow-perf/meson.build
+++ b/app/test-flow-perf/meson.build
@@ -3,6 +3,7 @@
sources = files(
'actions_gen.c',
+ 'async_flow.c',
'flow_gen.c',
'items_gen.c',
'main.c',
--
2.43.0
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH] test/flow: add support for async API
2026-02-24 10:56 [PATCH] test/flow: add support for async API Maxime Peim
@ 2026-02-25 22:23 ` Stephen Hemminger
2026-03-01 23:29 ` [PATCH v2] " Maxime Peim
` (3 more replies)
0 siblings, 4 replies; 8+ messages in thread
From: Stephen Hemminger @ 2026-02-25 22:23 UTC (permalink / raw)
To: Maxime Peim; +Cc: Wisam Jaddo, dev
On Tue, 24 Feb 2026 11:56:47 +0100
Maxime Peim <maxime.peim@gmail.com> wrote:
> Add async flow API mode to test-flow-perf application for improved
> flow rule insertion performance. The async API allows batching flow
> rule creation operations and processing completions in bulk, reducing
> per-rule overhead.
>
> New command line options:
> --async: enable async flow API mode
> --async-queue-size=N: size of async queues (default: 1024)
> --async-push-batch=N: flows to batch before push (default: 256)
>
> Signed-off-by: Maxime Peim <maxime.peim@gmail.com>
> ---
> app/test-flow-perf/actions_gen.c | 172 +++++++++++++
> app/test-flow-perf/actions_gen.h | 4 +
> app/test-flow-perf/async_flow.c | 239 ++++++++++++++++++
> app/test-flow-perf/async_flow.h | 41 ++++
> app/test-flow-perf/items_gen.c | 13 +
> app/test-flow-perf/items_gen.h | 4 +
> app/test-flow-perf/main.c | 410 ++++++++++++++++++++++++-------
> app/test-flow-perf/meson.build | 1 +
> 8 files changed, 798 insertions(+), 86 deletions(-)
> create mode 100644 app/test-flow-perf/async_flow.c
> create mode 100644 app/test-flow-perf/async_flow.h
Looked good to me, but a more detailed AI-assisted review found some things:
The one thing it found was:
Race condition on `flow` variable — insert_flows_async() is called
concurrently from multiple cores but uses a file-scope static `flow`
pointer. One core can overwrite it between async_generate_flow() and
the flows_list assignment. Needs a local `struct rte_flow *flow;`
declaration.
But lots more stuff that you should address:
# Review: [PATCH] test/flow: add support for async API
**Patch**: test/flow: add support for async API
**Author**: Maxime Peim <maxime.peim@gmail.com>
**Files changed**: 8 (798 insertions, 86 deletions)
---
## Errors
### 1. `alloca()` for `queue_attr_list` allocates wrong size (buffer overflow)
In `async_flow.c`, `async_flow_init_port()`:
```c
const struct rte_flow_queue_attr **queue_attr_list =
alloca(sizeof(struct rte_flow_queue_attr) * nb_queues);
```
This allocates `nb_queues * sizeof(struct rte_flow_queue_attr)` bytes, but `queue_attr_list` is an array of **pointers** (`const struct rte_flow_queue_attr **`). It should allocate `nb_queues * sizeof(struct rte_flow_queue_attr *)`. If `sizeof(struct rte_flow_queue_attr) < sizeof(pointer)`, this is a buffer overflow when iterating `nb_queues` entries. If `sizeof(struct rte_flow_queue_attr) > sizeof(pointer)`, it wastes stack space but is not dangerous.
**Fix**:
```c
const struct rte_flow_queue_attr **queue_attr_list =
alloca(sizeof(struct rte_flow_queue_attr *) * nb_queues);
```
Or more idiomatically: `alloca(sizeof(*queue_attr_list) * nb_queues)`.
### 2. `alloca()` with unbounded `nb_queues` — potential stack overflow
`nb_queues` comes from `mc_pool.cores_count` which is user-controlled via `--cores`. While it's capped by `port_info.max_nb_queues` if the device reports it, if `max_nb_queues` is 0 or `UINT32_MAX` (which the code explicitly skips), there's no upper bound enforced before `alloca()`. A large value could blow the stack.
**Fix**: Add a reasonable upper bound check before the `alloca()` calls, or use `rte_malloc`/`malloc` instead.
### 3. `flow` variable used in `insert_flows_async()` without declaration
The function `insert_flows_async()` in `main.c` references `flow` (e.g., line 994: `flow = generate_flow(...)`, line 1014: `flow = async_generate_flow(...)`) but `flow` is never declared as a local variable within this function. It appears to rely on a file-scope `static struct rte_flow *flow` variable. This creates a hidden data race: if multiple cores call `insert_flows_async()` concurrently (which they do — `flows_handler` is called per-core), they share the same `flow` pointer without synchronization. One core could overwrite `flow` between the `async_generate_flow()` call and the `flows_list[flow_index++] = flow` assignment.
**Fix**: Declare `struct rte_flow *flow;` as a local variable inside `insert_flows_async()`.
### 4. Missing NULL check on `flow` before measuring first-flow latency
In `insert_flows_async()`:
```c
flow = async_generate_flow(...);
if (counter == start_counter) {
first_flow_latency = ...;
printf(":: First Flow Latency ...\n");
}
if (force_quit)
break;
if (!flow) {
print_flow_error(error);
rte_exit(EXIT_FAILURE, ...);
}
```
The `flow == NULL` check happens **after** the first-flow latency measurement and the `force_quit` check. If the first flow creation fails (`flow == NULL`) and `force_quit` is set, the code breaks out of the loop and proceeds to use `flows_list` which has a NULL entry at index 0, potentially causing issues during cleanup. More critically, if it's the first flow and it fails, the code prints a misleading "First Flow installed" latency message before discovering the failure.
**Fix**: Move the `if (!flow)` check immediately after `async_generate_flow()`, before the latency print.
### 5. `port_attr.nb_counters` not set — may cause configure failure
In `async_flow_init_port()`, `port_attr` is zero-initialized but `nb_counters` and `nb_aging_objects` are not set even though the flow actions may include `COUNT` or metering. Some PMDs require these to be nonzero when counters are used with async flows. This could cause silent failures or `rte_flow_configure` to reject the configuration.
This is a moderate-confidence concern (~60%) — it depends on the specific PMD, but the code doesn't attempt to set these fields at all.
### 6. `#include "rte_common.h"` — incorrect include style
In `main.c`:
```c
#include "rte_common.h"
```
DPDK public headers should be included with angle brackets: `#include <rte_common.h>`. Using quotes searches the local directory first, which is incorrect for a system/library header. This could also fail to compile if the local directory doesn't have a copy.
---
## Warnings
### 7. `memset` on `set_ipv6_mask` inside function body with `static` variable
In `fill_actions_template()`:
```c
static struct rte_flow_action_set_ipv6 set_ipv6_mask;
/* ... */
memset(set_ipv6_mask.ipv6_addr.a, 0xff, 16);
```
The `memset` is called **every time** `fill_actions_template()` is invoked, but the variable is `static`. This is redundant after the first call and suggests the initialization should either be done once (with a static flag or at file scope) or the variable shouldn't be `static`. It's not a bug, but it's wasteful and indicates a design inconsistency — all other static mask variables are initialized at declaration.
### 8. `actions_counter` is `uint8_t` — can overflow with many actions
In `fill_actions_template()`, `actions_counter` is declared as `uint8_t`, which limits it to 255 entries. While `MAX_ACTIONS_NUM` is likely smaller, there's no bounds check against the `actions[]` and `masks[]` array sizes. If the template_actions table plus the END sentinel exceeds the caller's array, this silently writes out of bounds.
### 9. Unnecessary `rte_zmalloc` for `results` array in `insert_flows_async()`
```c
results = rte_zmalloc("results", sizeof(struct rte_flow_op_result) * async_push_batch, 0);
```
Per AGENTS.md guidelines, `rte_malloc` is for DMA-accessible or shared memory. The `results` array is only used locally by the CPU for pulling flow operation results. Standard `malloc` (or even a stack allocation if `async_push_batch` is bounded) would be more appropriate and wouldn't consume limited hugepage memory.
### 10. Gratuitous reformatting of the `lgopts[]` table
The patch reformats the entire existing `lgopts[]` table from `{ "name", ...}` with alignment spaces to `{"name", ...}` without alignment. This changes 60+ lines that have nothing to do with the async feature, making the diff much harder to review and potentially causing merge conflicts with other in-flight patches.
This should be a separate cleanup patch, or the new entries should match the existing style.
### 11. `rte_zmalloc` for `flows_list` — unnecessary hugepage usage
```c
flows_list = rte_zmalloc("flows_list",
(sizeof(struct rte_flow *) * (rules_count_per_core + 1)), 0);
```
This is a pointer array used only for bookkeeping. Standard `calloc` is appropriate here. (Note: the existing `insert_flows()` has the same pattern, so this may be intentional consistency, but it's still worth noting.)
### 12. Division by zero if `cpu_time_used` is zero
In `insert_flows_async()`:
```c
insertion_rate = ((double)(rules_count_per_core / cpu_time_used) / 1000);
```
If the loop completes very quickly or `rules_count_per_core` is 0, `cpu_time_used` could be 0, causing a floating-point division by zero (infinity/NaN). Additionally, the cast is misplaced: `(double)(rules_count_per_core / cpu_time_used)` casts the already-computed quotient rather than an operand. If `cpu_time_used` is a floating-point type (as the `delta / rte_get_timer_hz()` accumulation suggests) the cast is merely redundant, but if it is integral the quotient truncates before the cast. Writing `(double)rules_count_per_core / cpu_time_used` is correct in either case.
### 13. Commit message body line length
The line `"Encapped data is fixed with pattern: ether,ipv4,udp,vxlan"` appears in the usage() help-text additions — i.e. in the code, not the commit message — and is fine there. The actual commit message body should still be checked separately; the `--async-queue-size=N:` option lines appear tight but within the line-length limit.
### 14. Copyright uses "Mellanox Technologies" — potentially outdated
The new file `async_flow.c` and `async_flow.h` use `Copyright 2026 Mellanox Technologies, Ltd`. The author's email is `@gmail.com` (individual), and Mellanox was acquired by NVIDIA in 2020. This is noted only as an observation per AGENTS.md — the copyright holder's choice is not subject to review.
---
## Summary
The most critical finding is **#3** — the `flow` variable race condition in the multi-core `insert_flows_async()` path. If multiple cores run this function concurrently (which is the intended use), they would share and clobber a file-scope `flow` pointer. This is a real correctness bug that would cause intermittent failures.
The **#1** `alloca()` size mismatch is also a clear bug that could cause memory corruption depending on the struct size vs pointer size relationship on the target architecture.
The remaining issues range from stack safety (#2), error handling order (#4), to style/efficiency concerns (#7–#12).
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH v2] test/flow: add support for async API
2026-02-25 22:23 ` Stephen Hemminger
@ 2026-03-01 23:29 ` Maxime Peim
2026-03-01 23:36 ` Maxime Peim
2026-03-02 0:52 ` Stephen Hemminger
2026-03-02 10:57 ` [PATCH v3] " Maxime Peim
` (2 subsequent siblings)
3 siblings, 2 replies; 8+ messages in thread
From: Maxime Peim @ 2026-03-01 23:29 UTC (permalink / raw)
To: dev; +Cc: stephen
Add async flow API mode to test-flow-perf application for improved
flow rule insertion performance. The async API allows batching flow
rule creation operations and processing completions in bulk, reducing
per-rule overhead.
New command line options:
--async: enable async flow API mode
--async-queue-size=N: size of async queues (default: 1024)
--async-push-batch=N: flows to batch before push (default: 256)
Signed-off-by: Maxime Peim <maxime.peim@gmail.com>
---
v2:
- Replace per-flow stack allocation with pre-allocated slot pool;
flat buffers are initialized once at init time and the hot path
only patches per-flow item/action values into a pre-set slot
- Fix alloca misuse: use heap allocation for queue_attr_list, round
queue_size to power of 2 for bitmask wrapping, add bounds checks
- Fix race on file-scope flow variable, premature latency
measurement, and integer division in rate calculation
- Drop unrelated lgopts reformatting
- Use malloc instead of rte_zmalloc for non-dataplane allocations
- Various robustness and style fixes
app/test-flow-perf/actions_gen.c | 281 +++++++++++-
app/test-flow-perf/actions_gen.h | 31 ++
app/test-flow-perf/async_flow.c | 761 +++++++++++++++++++++++++++++++
app/test-flow-perf/async_flow.h | 54 +++
app/test-flow-perf/items_gen.c | 58 +++
app/test-flow-perf/items_gen.h | 6 +
app/test-flow-perf/main.c | 302 +++++++++++-
app/test-flow-perf/meson.build | 1 +
8 files changed, 1454 insertions(+), 40 deletions(-)
create mode 100644 app/test-flow-perf/async_flow.c
create mode 100644 app/test-flow-perf/async_flow.h
diff --git a/app/test-flow-perf/actions_gen.c b/app/test-flow-perf/actions_gen.c
index 9d102e3af4..2b8edd50c8 100644
--- a/app/test-flow-perf/actions_gen.c
+++ b/app/test-flow-perf/actions_gen.c
@@ -36,27 +36,7 @@ struct additional_para {
bool unique_data;
};
-/* Storage for struct rte_flow_action_raw_encap including external data. */
-struct action_raw_encap_data {
- struct rte_flow_action_raw_encap conf;
- uint8_t data[128];
- uint8_t preserve[128];
- uint16_t idx;
-};
-
-/* Storage for struct rte_flow_action_raw_decap including external data. */
-struct action_raw_decap_data {
- struct rte_flow_action_raw_decap conf;
- uint8_t data[128];
- uint16_t idx;
-};
-
-/* Storage for struct rte_flow_action_rss including external data. */
-struct action_rss_data {
- struct rte_flow_action_rss conf;
- uint8_t key[40];
- uint16_t queue[128];
-};
+/* Compound action data structs defined in actions_gen.h */
static void
add_mark(struct rte_flow_action *actions,
@@ -1165,3 +1145,262 @@ fill_actions(struct rte_flow_action *actions, uint64_t *flow_actions,
free(queues);
free(hairpin_queues);
}
+
+static size_t
+action_conf_size(enum rte_flow_action_type type)
+{
+ switch (type) {
+ case RTE_FLOW_ACTION_TYPE_MARK:
+ return sizeof(struct rte_flow_action_mark);
+ case RTE_FLOW_ACTION_TYPE_QUEUE:
+ return sizeof(struct rte_flow_action_queue);
+ case RTE_FLOW_ACTION_TYPE_JUMP:
+ return sizeof(struct rte_flow_action_jump);
+ case RTE_FLOW_ACTION_TYPE_RSS:
+ return sizeof(struct action_rss_data);
+ case RTE_FLOW_ACTION_TYPE_SET_META:
+ return sizeof(struct rte_flow_action_set_meta);
+ case RTE_FLOW_ACTION_TYPE_SET_TAG:
+ return sizeof(struct rte_flow_action_set_tag);
+ case RTE_FLOW_ACTION_TYPE_PORT_ID:
+ return sizeof(struct rte_flow_action_port_id);
+ case RTE_FLOW_ACTION_TYPE_COUNT:
+ return sizeof(struct rte_flow_action_count);
+ case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
+ return sizeof(struct rte_flow_action_set_mac);
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
+ return sizeof(struct rte_flow_action_set_ipv4);
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
+ return sizeof(struct rte_flow_action_set_ipv6);
+ case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
+ return sizeof(struct rte_flow_action_set_tp);
+ case RTE_FLOW_ACTION_TYPE_INC_TCP_ACK:
+ case RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK:
+ case RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ:
+ case RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ:
+ return sizeof(rte_be32_t);
+ case RTE_FLOW_ACTION_TYPE_SET_TTL:
+ return sizeof(struct rte_flow_action_set_ttl);
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP:
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP:
+ return sizeof(struct rte_flow_action_set_dscp);
+ case RTE_FLOW_ACTION_TYPE_METER:
+ return sizeof(struct rte_flow_action_meter);
+ case RTE_FLOW_ACTION_TYPE_RAW_ENCAP:
+ return sizeof(struct action_raw_encap_data);
+ case RTE_FLOW_ACTION_TYPE_RAW_DECAP:
+ return sizeof(struct action_raw_decap_data);
+ case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
+ return sizeof(struct rte_flow_action_vxlan_encap) +
+ 5 * sizeof(struct rte_flow_item) + sizeof(struct rte_flow_item_eth) +
+ sizeof(struct rte_flow_item_ipv4) + sizeof(struct rte_flow_item_udp) +
+ sizeof(struct rte_flow_item_vxlan);
+ case RTE_FLOW_ACTION_TYPE_MODIFY_FIELD:
+ return sizeof(struct rte_flow_action_modify_field);
+ /* Zero-conf types */
+ case RTE_FLOW_ACTION_TYPE_DROP:
+ case RTE_FLOW_ACTION_TYPE_FLAG:
+ case RTE_FLOW_ACTION_TYPE_DEC_TTL:
+ case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+void
+fill_actions_template(struct rte_flow_action *actions, struct rte_flow_action *masks,
+ uint64_t *flow_actions, struct rte_flow_port_attr *port_attr,
+ bool *need_wire_orig_table, size_t *conf_sizes, uint32_t *n_actions_out)
+{
+ uint8_t actions_counter = 0;
+ uint8_t i, j;
+
+ *need_wire_orig_table = false;
+ memset(port_attr, 0, sizeof(*port_attr));
+
+ /* Static configurations for actions that need them in templates */
+ static struct rte_flow_action_mark mark_conf = {
+ .id = 1,
+ };
+ static struct rte_flow_action_queue queue_conf = {
+ .index = 0,
+ };
+ static struct rte_flow_action_port_id port_id_conf = {
+ .id = 0,
+ };
+ static struct rte_flow_action_jump jump_conf = {
+ .group = 1,
+ };
+ static struct rte_flow_action_modify_field set_meta_conf = {
+ .operation = RTE_FLOW_MODIFY_SET,
+ .dst = {.field = RTE_FLOW_FIELD_META},
+ .src =
+ {
+ .field = RTE_FLOW_FIELD_VALUE,
+ .value = {0, 0, 0, META_DATA},
+ },
+ .width = 32,
+ };
+
+ /* Static mask configurations for each action type */
+ static struct rte_flow_action_mark mark_mask = {
+ .id = UINT32_MAX,
+ };
+ static struct rte_flow_action_queue queue_mask = {
+ .index = UINT16_MAX,
+ };
+ static struct rte_flow_action_jump jump_mask = {
+ .group = UINT32_MAX,
+ };
+ static struct rte_flow_action_rss rss_mask = {
+ .level = UINT32_MAX,
+ .types = UINT64_MAX,
+ };
+ static struct rte_flow_action_set_meta set_meta_mask = {
+ .data = UINT32_MAX,
+ .mask = UINT32_MAX,
+ };
+ static struct rte_flow_action_set_tag set_tag_mask = {
+ .data = UINT32_MAX,
+ .mask = UINT32_MAX,
+ .index = UINT8_MAX,
+ };
+ static struct rte_flow_action_port_id port_id_mask = {
+ .id = UINT32_MAX,
+ };
+ static struct rte_flow_action_count count_mask;
+ static struct rte_flow_action_set_mac set_mac_mask = {
+ .mac_addr = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ };
+ static struct rte_flow_action_set_ipv4 set_ipv4_mask = {
+ .ipv4_addr = UINT32_MAX,
+ };
+ static struct rte_flow_action_set_ipv6 set_ipv6_mask = {
+ .ipv6_addr.a = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff}};
+ static struct rte_flow_action_set_tp set_tp_mask = {
+ .port = UINT16_MAX,
+ };
+ static rte_be32_t tcp_seq_ack_mask = UINT32_MAX;
+ static struct rte_flow_action_set_ttl set_ttl_mask = {
+ .ttl_value = UINT8_MAX,
+ };
+ static struct rte_flow_action_set_dscp set_dscp_mask = {
+ .dscp = UINT8_MAX,
+ };
+ static struct rte_flow_action_meter meter_mask = {
+ .mtr_id = UINT32_MAX,
+ };
+
+ static const struct {
+ uint64_t flow_mask;
+ enum rte_flow_action_type type;
+ const void *action_conf;
+ const void *action_mask;
+ const bool need_wire_orig_table;
+ } template_actions[] = {
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_MARK), RTE_FLOW_ACTION_TYPE_MARK, &mark_conf,
+ &mark_mask, true},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_COUNT), RTE_FLOW_ACTION_TYPE_COUNT, NULL,
+ &count_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_MODIFY_FIELD),
+ RTE_FLOW_ACTION_TYPE_MODIFY_FIELD, &set_meta_conf, &set_meta_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TAG), RTE_FLOW_ACTION_TYPE_SET_TAG, NULL,
+ &set_tag_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_FLAG), RTE_FLOW_ACTION_TYPE_FLAG, NULL, NULL,
+ false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_MAC_SRC),
+ RTE_FLOW_ACTION_TYPE_SET_MAC_SRC, NULL, &set_mac_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_MAC_DST),
+ RTE_FLOW_ACTION_TYPE_SET_MAC_DST, NULL, &set_mac_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC),
+ RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC, NULL, &set_ipv4_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV4_DST),
+ RTE_FLOW_ACTION_TYPE_SET_IPV4_DST, NULL, &set_ipv4_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC),
+ RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC, NULL, &set_ipv6_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV6_DST),
+ RTE_FLOW_ACTION_TYPE_SET_IPV6_DST, NULL, &set_ipv6_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TP_SRC), RTE_FLOW_ACTION_TYPE_SET_TP_SRC,
+ NULL, &set_tp_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TP_DST), RTE_FLOW_ACTION_TYPE_SET_TP_DST,
+ NULL, &set_tp_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_INC_TCP_ACK),
+ RTE_FLOW_ACTION_TYPE_INC_TCP_ACK, NULL, &tcp_seq_ack_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK),
+ RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK, NULL, &tcp_seq_ack_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ),
+ RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ, NULL, &tcp_seq_ack_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ),
+ RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ, NULL, &tcp_seq_ack_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TTL), RTE_FLOW_ACTION_TYPE_SET_TTL, NULL,
+ &set_ttl_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DEC_TTL), RTE_FLOW_ACTION_TYPE_DEC_TTL, NULL,
+ NULL, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP),
+ RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP, NULL, &set_dscp_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP),
+ RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP, NULL, &set_dscp_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_QUEUE), RTE_FLOW_ACTION_TYPE_QUEUE,
+ &queue_conf, &queue_mask, true},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_RSS), RTE_FLOW_ACTION_TYPE_RSS, NULL,
+ &rss_mask, true},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_JUMP), RTE_FLOW_ACTION_TYPE_JUMP, &jump_conf,
+ &jump_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_PORT_ID), RTE_FLOW_ACTION_TYPE_PORT_ID,
+ &port_id_conf, &port_id_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DROP), RTE_FLOW_ACTION_TYPE_DROP, NULL, NULL,
+ false},
+ {HAIRPIN_QUEUE_ACTION, RTE_FLOW_ACTION_TYPE_QUEUE, &queue_conf, &queue_mask, false},
+ {HAIRPIN_RSS_ACTION, RTE_FLOW_ACTION_TYPE_RSS, NULL, &rss_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_METER), RTE_FLOW_ACTION_TYPE_METER, NULL,
+ &meter_mask, false},
+ };
+
+ for (j = 0; j < MAX_ACTIONS_NUM; j++) {
+ if (flow_actions[j] == 0)
+ break;
+ for (i = 0; i < RTE_DIM(template_actions); i++) {
+ if ((flow_actions[j] & template_actions[i].flow_mask) == 0)
+ continue;
+
+ switch (template_actions[i].type) {
+ case RTE_FLOW_ACTION_TYPE_COUNT:
+ port_attr->nb_counters++;
+ break;
+ case RTE_FLOW_ACTION_TYPE_AGE:
+ port_attr->nb_aging_objects++;
+ break;
+ case RTE_FLOW_ACTION_TYPE_METER:
+ port_attr->nb_meters++;
+ break;
+ case RTE_FLOW_ACTION_TYPE_CONNTRACK:
+ port_attr->nb_conn_tracks++;
+ break;
+ case RTE_FLOW_ACTION_TYPE_QUOTA:
+ port_attr->nb_quotas++;
+ default:;
+ }
+
+ actions[actions_counter].type = template_actions[i].type;
+ actions[actions_counter].conf = template_actions[i].action_conf;
+ masks[actions_counter].type = template_actions[i].type;
+ masks[actions_counter].conf = template_actions[i].action_mask;
+ conf_sizes[actions_counter] = action_conf_size(template_actions[i].type);
+ *need_wire_orig_table |= template_actions[i].need_wire_orig_table;
+ actions_counter++;
+ break;
+ }
+ }
+
+ actions[actions_counter].type = RTE_FLOW_ACTION_TYPE_END;
+ masks[actions_counter].type = RTE_FLOW_ACTION_TYPE_END;
+
+ /* take END into account */
+ *n_actions_out = actions_counter + 1;
+}
diff --git a/app/test-flow-perf/actions_gen.h b/app/test-flow-perf/actions_gen.h
index 9e13b164f9..3ac0ffed59 100644
--- a/app/test-flow-perf/actions_gen.h
+++ b/app/test-flow-perf/actions_gen.h
@@ -17,9 +17,40 @@
#define RTE_VXLAN_GPE_UDP_PORT 250
#define RTE_GENEVE_UDP_PORT 6081
+/* Compound action data structures (needed by async_flow.c for slot init) */
+
+/* Storage for struct rte_flow_action_raw_encap including external data. */
+struct action_raw_encap_data {
+ struct rte_flow_action_raw_encap conf;
+ uint8_t data[128];
+ uint8_t preserve[128];
+ uint16_t idx;
+};
+
+/* Storage for struct rte_flow_action_raw_decap including external data. */
+struct action_raw_decap_data {
+ struct rte_flow_action_raw_decap conf;
+ uint8_t data[128];
+ uint16_t idx;
+};
+
+/* Storage for struct rte_flow_action_rss including external data. */
+struct action_rss_data {
+ struct rte_flow_action_rss conf;
+ uint8_t key[40];
+ uint16_t queue[128];
+};
+
void fill_actions(struct rte_flow_action *actions, uint64_t *flow_actions,
uint32_t counter, uint16_t next_table, uint16_t hairpinq,
uint64_t encap_data, uint64_t decap_data, uint8_t core_idx,
bool unique_data, uint8_t rx_queues_count, uint16_t dst_port);
+/* Fill actions template for async flow API (types only, no values).
+ * If conf_sizes is non-NULL, populates per-action conf sizes and n_actions_out.
+ */
+void fill_actions_template(struct rte_flow_action *actions, struct rte_flow_action *masks,
+ uint64_t *flow_actions, struct rte_flow_port_attr *port_attr,
+ bool *need_wire_orig_table, size_t *conf_sizes, uint32_t *n_actions_out);
+
#endif /* FLOW_PERF_ACTION_GEN */
diff --git a/app/test-flow-perf/async_flow.c b/app/test-flow-perf/async_flow.c
new file mode 100644
index 0000000000..ae5a922856
--- /dev/null
+++ b/app/test-flow-perf/async_flow.c
@@ -0,0 +1,761 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2026 Maxime Peim <maxime.peim@gmail.com>
+ *
+ * This file contains the async flow API implementation
+ * for the flow-perf application.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <rte_bitops.h>
+#include <rte_common.h>
+#include <rte_ethdev.h>
+#include <rte_flow.h>
+#include <rte_vxlan.h>
+
+#include "actions_gen.h"
+#include "async_flow.h"
+#include "flow_gen.h"
+#include "items_gen.h"
+
+/* Max iterations when draining pending async completions during cleanup */
+#define DRAIN_MAX_ITERATIONS 100
+
+/* Per-port async flow resources */
+static struct async_flow_resources port_resources[MAX_PORTS];
+
+/*
+ * One-time per-slot setup of compound actions: point each action's conf
+ * at its embedded storage (RSS key/queues, raw encap/decap buffers,
+ * VXLAN encap item chain) so the hot path only patches per-flow values.
+ */
+static void
+init_slot_compound_actions(struct rte_flow_action *actions, uint32_t n_actions,
+	const size_t *action_conf_sizes)
+{
+	uint32_t i;
+
+	for (i = 0; i < n_actions; i++) {
+		if (action_conf_sizes[i] == 0)
+			continue; /* this action carries no conf storage in the slot */
+
+		switch (actions[i].type) {
+		case RTE_FLOW_ACTION_TYPE_RSS: {
+			struct action_rss_data *rss =
+				(struct action_rss_data *)(uintptr_t)actions[i].conf;
+			rss->conf.func = RTE_ETH_HASH_FUNCTION_DEFAULT;
+			rss->conf.level = 0;
+			rss->conf.types = GET_RSS_HF();
+			rss->conf.key_len = sizeof(rss->key);
+			rss->conf.key = rss->key;
+			rss->conf.queue = rss->queue;
+			rss->key[0] = 1; /* arbitrary non-zero key byte */
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: {
+			struct action_raw_encap_data *encap =
+				(struct action_raw_encap_data *)(uintptr_t)actions[i].conf;
+			encap->conf.data = encap->data;
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_RAW_DECAP: {
+			struct action_raw_decap_data *decap =
+				(struct action_raw_decap_data *)(uintptr_t)actions[i].conf;
+			decap->conf.data = decap->data;
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: {
+			/*
+			 * Layout within the conf area:
+			 * struct rte_flow_action_vxlan_encap
+			 * struct rte_flow_item[5]
+			 * struct rte_flow_item_eth
+			 * struct rte_flow_item_ipv4
+			 * struct rte_flow_item_udp
+			 * struct rte_flow_item_vxlan
+			 */
+			uint8_t *base = (uint8_t *)(uintptr_t)actions[i].conf;
+			struct rte_flow_action_vxlan_encap *ve =
+				(struct rte_flow_action_vxlan_encap *)base;
+			struct rte_flow_item *items =
+				(struct rte_flow_item
+					 *)(base + sizeof(struct rte_flow_action_vxlan_encap));
+			uint8_t *data = (uint8_t *)(items + 5);
+
+			struct rte_flow_item_eth *item_eth = (struct rte_flow_item_eth *)data;
+			data += sizeof(struct rte_flow_item_eth);
+			struct rte_flow_item_ipv4 *item_ipv4 = (struct rte_flow_item_ipv4 *)data;
+			data += sizeof(struct rte_flow_item_ipv4);
+			struct rte_flow_item_udp *item_udp = (struct rte_flow_item_udp *)data;
+			data += sizeof(struct rte_flow_item_udp);
+			struct rte_flow_item_vxlan *item_vxlan = (struct rte_flow_item_vxlan *)data;
+
+			memset(item_eth, 0, sizeof(*item_eth));
+			memset(item_ipv4, 0, sizeof(*item_ipv4));
+			memset(item_udp, 0, sizeof(*item_udp));
+			memset(item_vxlan, 0, sizeof(*item_vxlan));
+
+			item_ipv4->hdr.src_addr = RTE_IPV4(127, 0, 0, 1);
+			item_ipv4->hdr.version_ihl = RTE_IPV4_VHL_DEF;
+			item_udp->hdr.dst_port = RTE_BE16(RTE_VXLAN_DEFAULT_PORT);
+			item_vxlan->hdr.vni[2] = 1; /* VNI = 1 (low byte) */
+
+			items[0].type = RTE_FLOW_ITEM_TYPE_ETH;
+			items[0].spec = item_eth;
+			items[0].mask = item_eth;
+			items[1].type = RTE_FLOW_ITEM_TYPE_IPV4;
+			items[1].spec = item_ipv4;
+			items[1].mask = item_ipv4;
+			items[2].type = RTE_FLOW_ITEM_TYPE_UDP;
+			items[2].spec = item_udp;
+			items[2].mask = item_udp;
+			items[3].type = RTE_FLOW_ITEM_TYPE_VXLAN;
+			items[3].spec = item_vxlan;
+			items[3].mask = item_vxlan;
+			items[4].type = RTE_FLOW_ITEM_TYPE_END;
+
+			ve->definition = items;
+			break;
+		}
+		default:
+			break;
+		}
+	}
+}
+
+/*
+ * Allocate and pre-initialize all per-slot flat buffers. n_items/n_actions
+ * include the trailing END entry (no spec/conf data). Returns 0 or -ENOMEM.
+ */
+static int
+init_slot_pool(struct async_flow_resources *res, uint32_t nb_queues, uint32_t queue_size,
+	const struct rte_flow_item *pattern, uint32_t n_items, const size_t *item_spec_sizes,
+	const struct rte_flow_action *template_actions, uint32_t n_actions,
+	const size_t *action_conf_sizes)
+{
+	uint32_t items_array_bytes, actions_array_bytes;
+	uint32_t spec_data_bytes, conf_data_bytes, mask_data_bytes;
+	uint32_t slot_size, num_slots;
+	uint32_t s, i;
+	uint8_t *mptr;
+
+	/* Compute shared mask size; skip the END entry, it has no spec/mask */
+	mask_data_bytes = 0;
+	for (i = 0; i + 1 < n_items; i++)
+		mask_data_bytes += RTE_ALIGN_CEIL(item_spec_sizes[i], 8);
+
+	/* specs and masks have the same size */
+	spec_data_bytes = mask_data_bytes;
+
+	conf_data_bytes = 0;
+	for (i = 0; i + 1 < n_actions; i++)
+		conf_data_bytes += RTE_ALIGN_CEIL(action_conf_sizes[i], 8);
+
+	/* Per-slot array sizes; n_items/n_actions already count the END entry */
+	items_array_bytes = n_items * sizeof(struct rte_flow_item);
+	actions_array_bytes = n_actions * sizeof(struct rte_flow_action);
+
+	slot_size = RTE_ALIGN_CEIL(items_array_bytes + actions_array_bytes + spec_data_bytes +
+					   conf_data_bytes,
+				   RTE_CACHE_LINE_SIZE);
+
+	num_slots = queue_size * nb_queues;
+
+	/* Store layout info */
+	res->slot_size = slot_size;
+	res->slots_per_queue = queue_size;
+	res->nb_queues = nb_queues;
+	res->n_items = n_items;
+	res->n_actions = n_actions;
+
+	/* Allocate shared masks */
+	if (mask_data_bytes > 0) {
+		res->shared_masks = aligned_alloc(
+			RTE_CACHE_LINE_SIZE, RTE_ALIGN_CEIL(mask_data_bytes, RTE_CACHE_LINE_SIZE));
+		if (res->shared_masks == NULL) {
+			fprintf(stderr, "Failed to allocate shared masks (%u bytes)\n",
+				mask_data_bytes);
+			return -ENOMEM;
+		}
+		memset(res->shared_masks, 0, mask_data_bytes);
+
+		/* Copy mask data from template pattern (END has none) */
+		mptr = res->shared_masks;
+		for (i = 0; i + 1 < n_items; i++) {
+			if (item_spec_sizes[i] > 0 && pattern[i].mask != NULL)
+				memcpy(mptr, pattern[i].mask, item_spec_sizes[i]);
+			mptr += RTE_ALIGN_CEIL(item_spec_sizes[i], 8);
+		}
+	}
+
+	/* Allocate per-slot pool */
+	/* slot_size is already cache-line aligned, so total is a multiple */
+	res->slot_pool = aligned_alloc(RTE_CACHE_LINE_SIZE, (size_t)num_slots * slot_size);
+	if (res->slot_pool == NULL) {
+		fprintf(stderr, "Failed to allocate slot pool (%u slots * %u bytes)\n", num_slots,
+			slot_size);
+		free(res->shared_masks);
+		res->shared_masks = NULL;
+		return -ENOMEM;
+	}
+	memset(res->slot_pool, 0, (size_t)num_slots * slot_size);
+
+	/* Pre-initialize every slot */
+	for (s = 0; s < num_slots; s++) {
+		uint8_t *slot = res->slot_pool + (size_t)s * slot_size;
+		struct rte_flow_item *items = (struct rte_flow_item *)slot;
+		struct rte_flow_action *actions =
+			(struct rte_flow_action *)(slot + items_array_bytes);
+		uint8_t *data = slot + items_array_bytes + actions_array_bytes;
+
+		/* Pre-set items: spec → per-slot data, mask → shared masks */
+		mptr = res->shared_masks;
+		for (i = 0; i + 1 < n_items; i++) {
+			items[i].type = pattern[i].type;
+			if (item_spec_sizes[i] > 0) {
+				items[i].spec = data;
+				items[i].mask = mptr;
+				data += RTE_ALIGN_CEIL(item_spec_sizes[i], 8);
+				mptr += RTE_ALIGN_CEIL(item_spec_sizes[i], 8);
+			}
+		}
+		items[n_items - 1].type = RTE_FLOW_ITEM_TYPE_END; /* END is the last array entry */
+
+		/* Pre-set actions: conf → per-slot data */
+		for (i = 0; i + 1 < n_actions; i++) {
+			actions[i].type = template_actions[i].type;
+			if (action_conf_sizes[i] > 0) {
+				actions[i].conf = data;
+				data += RTE_ALIGN_CEIL(action_conf_sizes[i], 8);
+			}
+		}
+		actions[n_actions - 1].type = RTE_FLOW_ACTION_TYPE_END;
+
+		/* Initialize compound action types (RSS, RAW_ENCAP, etc.); END excluded */
+		init_slot_compound_actions(actions, n_actions - 1, action_conf_sizes);
+	}
+
+	/* Allocate and initialize per-queue slot tracking */
+	res->queues = aligned_alloc(
+		RTE_CACHE_LINE_SIZE,
+		RTE_ALIGN_CEIL(nb_queues * sizeof(struct async_flow_queue), RTE_CACHE_LINE_SIZE));
+	if (res->queues == NULL) {
+		fprintf(stderr, "Failed to allocate queue structs (%u queues)\n", nb_queues);
+		free(res->slot_pool);
+		res->slot_pool = NULL;
+		free(res->shared_masks);
+		res->shared_masks = NULL;
+		return -ENOMEM;
+	}
+	memset(res->queues, 0, nb_queues * sizeof(struct async_flow_queue));
+	for (s = 0; s < nb_queues; s++) {
+		res->queues[s].slots = res->slot_pool + (size_t)s * queue_size * slot_size;
+		res->queues[s].head = 0;
+	}
+
+	printf(":: Slot pool: %u slots * %u bytes = %u KB (shared masks: %u bytes)\n", num_slots,
+		slot_size, (num_slots * slot_size) / 1024, mask_data_bytes);
+
+	return 0;
+}
+
+/*
+ * Hot-path: update per-flow item values through pre-set pointers.
+ * Only IPV4/IPV6 src_addr varies per flow (based on counter).
+ */
+static void
+update_item_values(struct rte_flow_item *items, uint32_t counter)
+{
+	uint8_t i;
+
+	for (i = 0; items[i].type != RTE_FLOW_ITEM_TYPE_END; i++) {
+		switch (items[i].type) {
+		case RTE_FLOW_ITEM_TYPE_IPV4:
+			((struct rte_flow_item_ipv4 *)(uintptr_t)items[i].spec)->hdr.src_addr =
+				RTE_BE32(counter);
+			break;
+		case RTE_FLOW_ITEM_TYPE_IPV6: {
+			struct rte_flow_item_ipv6 *spec =
+				(struct rte_flow_item_ipv6 *)(uintptr_t)items[i].spec;
+			uint8_t j;
+			for (j = 0; j < 4; j++)
+				spec->hdr.src_addr.a[15 - j] = counter >> (j * 8); /* counter into the 4 lowest (big-endian) address bytes */
+			break;
+		}
+		default: /* other items keep their zeroed, slot-initialized spec */
+			break;
+		}
+	}
+}
+
+/*
+ * Hot-path: update per-flow action values through pre-set pointers.
+ */
+static void
+update_action_values(struct rte_flow_action *actions, uint32_t counter, uint16_t hairpinq,
+	uint64_t encap_data, uint64_t decap_data, __rte_unused uint8_t core_idx,
+	bool unique_data, uint8_t rx_queues_count, uint16_t dst_port)
+{
+	uint8_t i;
+
+	for (i = 0; actions[i].type != RTE_FLOW_ACTION_TYPE_END; i++) {
+		switch (actions[i].type) {
+		case RTE_FLOW_ACTION_TYPE_MARK:
+			((struct rte_flow_action_mark *)(uintptr_t)actions[i].conf)->id =
+				(counter % 255) + 1; /* keep mark id non-zero */
+			break;
+		case RTE_FLOW_ACTION_TYPE_QUEUE:
+			((struct rte_flow_action_queue *)(uintptr_t)actions[i].conf)->index =
+				hairpinq ? (counter % hairpinq) + rx_queues_count : /* hairpin queues follow the Rx queues */
+				counter % rx_queues_count;
+			break;
+		case RTE_FLOW_ACTION_TYPE_METER:
+			((struct rte_flow_action_meter *)(uintptr_t)actions[i].conf)->mtr_id =
+				counter;
+			break;
+		case RTE_FLOW_ACTION_TYPE_RSS: {
+			struct action_rss_data *rss =
+				(struct action_rss_data *)(uintptr_t)actions[i].conf;
+			uint16_t q;
+			if (hairpinq) {
+				rss->conf.queue_num = hairpinq;
+				for (q = 0; q < hairpinq; q++)
+					rss->queue[q] = q + rx_queues_count;
+			} else {
+				rss->conf.queue_num = rx_queues_count;
+				for (q = 0; q < rx_queues_count; q++)
+					rss->queue[q] = q;
+			}
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
+		case RTE_FLOW_ACTION_TYPE_SET_MAC_DST: {
+			struct rte_flow_action_set_mac *mac =
+				(struct rte_flow_action_set_mac *)(uintptr_t)actions[i].conf;
+			uint32_t val = unique_data ? counter : 1;
+			uint8_t j;
+			for (j = 0; j < RTE_ETHER_ADDR_LEN; j++) { /* little-endian spread of val over the MAC bytes */
+				mac->mac_addr[j] = val & 0xff;
+				val >>= 8;
+			}
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
+		case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST: {
+			uint32_t ip = unique_data ? counter : 1;
+			((struct rte_flow_action_set_ipv4 *)(uintptr_t)actions[i].conf)->ipv4_addr =
+				RTE_BE32(ip + 1);
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
+		case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST: {
+			struct rte_flow_action_set_ipv6 *v6 =
+				(struct rte_flow_action_set_ipv6 *)(uintptr_t)actions[i].conf;
+			uint32_t val = unique_data ? counter : 1;
+			uint8_t j;
+			for (j = 0; j < 16; j++) {
+				v6->ipv6_addr.a[j] = val & 0xff;
+				val >>= 8;
+			}
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_SET_TP_SRC: {
+			uint32_t tp = unique_data ? counter : 100;
+			tp = tp % 0xffff; /* NOTE(review): SRC uses modulo but DST uses shift — confirm the asymmetry is intended */
+			((struct rte_flow_action_set_tp *)(uintptr_t)actions[i].conf)->port =
+				RTE_BE16(tp & 0xffff);
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_SET_TP_DST: {
+			uint32_t tp = unique_data ? counter : 100;
+			if (tp > 0xffff)
+				tp >>= 16;
+			((struct rte_flow_action_set_tp *)(uintptr_t)actions[i].conf)->port =
+				RTE_BE16(tp & 0xffff);
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_INC_TCP_ACK:
+		case RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK:
+		case RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ:
+		case RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ: {
+			uint32_t val = unique_data ? counter : 1;
+			*(rte_be32_t *)(uintptr_t)actions[i].conf = RTE_BE32(val); /* conf is a bare rte_be32_t for these actions */
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_SET_TTL: {
+			uint32_t val = unique_data ? counter : 1;
+			((struct rte_flow_action_set_ttl *)(uintptr_t)actions[i].conf)->ttl_value =
+				val % 0xff;
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP:
+		case RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP: {
+			uint32_t val = unique_data ? counter : 1;
+			((struct rte_flow_action_set_dscp *)(uintptr_t)actions[i].conf)->dscp =
+				val % 0xff;
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_PORT_ID:
+			((struct rte_flow_action_port_id *)(uintptr_t)actions[i].conf)->id =
+				dst_port;
+			break;
+		case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: {
+			struct action_raw_encap_data *encap =
+				(struct action_raw_encap_data *)(uintptr_t)actions[i].conf;
+			uint8_t *header = encap->data; /* headers appended here, selected by the encap_data bitmask */
+			struct rte_ether_hdr eth_hdr;
+			struct rte_ipv4_hdr ipv4_hdr;
+			struct rte_udp_hdr udp_hdr;
+
+			memset(&eth_hdr, 0, sizeof(eth_hdr));
+			if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH)) {
+				if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_VLAN))
+					eth_hdr.ether_type = RTE_BE16(RTE_ETHER_TYPE_VLAN);
+				else if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV4))
+					eth_hdr.ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV4);
+				else if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV6))
+					eth_hdr.ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV6);
+				memcpy(header, &eth_hdr, sizeof(eth_hdr));
+				header += sizeof(eth_hdr);
+			}
+			if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV4)) {
+				uint32_t ip_dst = unique_data ? counter : 1;
+				memset(&ipv4_hdr, 0, sizeof(ipv4_hdr));
+				ipv4_hdr.src_addr = RTE_IPV4(127, 0, 0, 1);
+				ipv4_hdr.dst_addr = RTE_BE32(ip_dst);
+				ipv4_hdr.version_ihl = RTE_IPV4_VHL_DEF;
+				if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_UDP))
+					ipv4_hdr.next_proto_id = 17; /* UDP */
+				if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_GRE))
+					ipv4_hdr.next_proto_id = 47; /* GRE */
+				memcpy(header, &ipv4_hdr, sizeof(ipv4_hdr));
+				header += sizeof(ipv4_hdr);
+			}
+			if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_UDP)) {
+				memset(&udp_hdr, 0, sizeof(udp_hdr));
+				if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_VXLAN))
+					udp_hdr.dst_port = RTE_BE16(RTE_VXLAN_DEFAULT_PORT);
+				memcpy(header, &udp_hdr, sizeof(udp_hdr));
+				header += sizeof(udp_hdr);
+			}
+			encap->conf.size = header - encap->data; /* total header bytes written */
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_RAW_DECAP: {
+			struct action_raw_decap_data *decap_d =
+				(struct action_raw_decap_data *)(uintptr_t)actions[i].conf;
+			uint8_t *header = decap_d->data;
+			struct rte_ether_hdr eth_hdr;
+
+			memset(&eth_hdr, 0, sizeof(eth_hdr));
+			if (decap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH)) {
+				if (decap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV4))
+					eth_hdr.ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV4);
+				else if (decap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV6))
+					eth_hdr.ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV6);
+				memcpy(header, &eth_hdr, sizeof(eth_hdr));
+				header += sizeof(eth_hdr);
+			}
+			decap_d->conf.size = header - decap_d->data;
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: {
+			uint8_t *base = (uint8_t *)(uintptr_t)actions[i].conf;
+			struct rte_flow_item *vitems =
+				(struct rte_flow_item
+					 *)(base + sizeof(struct rte_flow_action_vxlan_encap));
+			uint32_t ip_dst = unique_data ? counter : 1;
+			/* vitems[1] is IPV4 (layout set by init_slot_compound_actions) */
+			((struct rte_flow_item_ipv4 *)(uintptr_t)vitems[1].spec)->hdr.dst_addr =
+				RTE_BE32(ip_dst);
+			break;
+		}
+		default:
+			break;
+		}
+	}
+}
+
+int
+async_flow_init_port(uint16_t port_id, uint32_t nb_queues, uint32_t queue_size,
+	uint64_t *flow_items, uint64_t *flow_actions, uint64_t *flow_attrs,
+	uint8_t flow_group, uint32_t rules_count) /* returns 0 on success, negative on failure */
+{
+	struct rte_flow_port_info port_info = {0};
+	struct rte_flow_queue_info queue_info = {0};
+	struct rte_flow_error error = {0};
+	struct rte_flow_port_attr port_attr = {0};
+	struct rte_flow_queue_attr queue_attr;
+	const struct rte_flow_queue_attr **queue_attr_list;
+	struct rte_flow_pattern_template_attr pt_attr = {0};
+	struct rte_flow_actions_template_attr at_attr = {0};
+	struct rte_flow_template_table_attr table_attr = {0};
+	struct rte_flow_item pattern[MAX_ITEMS_NUM];
+	struct rte_flow_action actions[MAX_ACTIONS_NUM];
+	struct rte_flow_action action_masks[MAX_ACTIONS_NUM];
+	size_t item_spec_sizes[MAX_ITEMS_NUM];
+	size_t action_conf_sizes[MAX_ACTIONS_NUM];
+	uint32_t n_items, n_actions;
+	struct async_flow_resources *res;
+	bool need_wire_orig_table = false;
+	uint32_t i;
+	int ret;
+
+	if (port_id >= MAX_PORTS)
+		return -1;
+
+	res = &port_resources[port_id];
+	memset(res, 0, sizeof(*res));
+
+	/* Query port flow info */
+	ret = rte_flow_info_get(port_id, &port_info, &queue_info, &error);
+	if (ret != 0) {
+		fprintf(stderr, "Port %u: rte_flow_info_get failed: %s\n", port_id,
+			error.message ? error.message : "(no message)");
+		return ret;
+	}
+
+	if (port_info.max_nb_queues == 0 || queue_info.max_size == 0) {
+		fprintf(stderr, "Port %u: rte_flow_info_get reports that no queues are supported\n",
+			port_id);
+		return -1;
+	}
+
+	/* Limit to device capabilities if reported; UINT32_MAX presumably means unreported */
+	if (port_info.max_nb_queues != 0 && port_info.max_nb_queues != UINT32_MAX &&
+		nb_queues > port_info.max_nb_queues)
+		nb_queues = port_info.max_nb_queues;
+	if (queue_info.max_size != 0 && queue_info.max_size != UINT32_MAX &&
+		queue_size > queue_info.max_size)
+		queue_size = queue_info.max_size;
+
+	/* Slot ring uses bitmask wrapping, so queue_size must be power of 2 */
+	queue_size = rte_align32prevpow2(queue_size);
+	if (queue_size == 0) {
+		fprintf(stderr, "Port %u: queue_size is 0 after rounding\n", port_id);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < MAX_ATTRS_NUM; i++) {
+		if (flow_attrs[i] == 0)
+			break; /* attrs list is zero-terminated */
+		if (flow_attrs[i] & INGRESS)
+			pt_attr.ingress = 1;
+		else if (flow_attrs[i] & EGRESS)
+			pt_attr.egress = 1;
+		else if (flow_attrs[i] & TRANSFER)
+			pt_attr.transfer = 1;
+	}
+	/* Enable relaxed matching for better performance */
+	pt_attr.relaxed_matching = 1;
+
+	memset(pattern, 0, sizeof(pattern));
+	memset(actions, 0, sizeof(actions));
+	memset(action_masks, 0, sizeof(action_masks));
+
+	/* Fill templates and gather per-item/action sizes */
+	fill_items_template(pattern, flow_items, 0, 0, item_spec_sizes, &n_items);
+
+	at_attr.ingress = pt_attr.ingress;
+	at_attr.egress = pt_attr.egress;
+	at_attr.transfer = pt_attr.transfer;
+
+	fill_actions_template(actions, action_masks, flow_actions, &port_attr,
+		&need_wire_orig_table, action_conf_sizes, &n_actions);
+
+	/* fill_actions_template counts how many actions need each kind of object,
+	 * so multiply by the rule count to size the per-port object pools */
+	port_attr.nb_counters *= rules_count;
+	port_attr.nb_aging_objects *= rules_count;
+	port_attr.nb_meters *= rules_count;
+	port_attr.nb_conn_tracks *= rules_count;
+	port_attr.nb_quotas *= rules_count;
+
+	table_attr.flow_attr.group = flow_group;
+	table_attr.flow_attr.priority = 0;
+	table_attr.flow_attr.ingress = pt_attr.ingress;
+	table_attr.flow_attr.egress = pt_attr.egress;
+	table_attr.flow_attr.transfer = pt_attr.transfer;
+	table_attr.nb_flows = rules_count;
+
+	if (pt_attr.transfer && need_wire_orig_table)
+		table_attr.specialize = RTE_FLOW_TABLE_SPECIALIZE_TRANSFER_WIRE_ORIG;
+
+	queue_attr_list = malloc(sizeof(*queue_attr_list) * nb_queues);
+	if (queue_attr_list == NULL) {
+		fprintf(stderr, "Port %u: failed to allocate queue_attr_list\n", port_id);
+		return -ENOMEM;
+	}
+
+	queue_attr.size = queue_size; /* one shared attr for all queues */
+	for (i = 0; i < nb_queues; i++)
+		queue_attr_list[i] = &queue_attr;
+
+	ret = rte_flow_configure(port_id, &port_attr, nb_queues, queue_attr_list, &error);
+
+	free(queue_attr_list);
+
+	if (ret != 0) {
+		fprintf(stderr, "Port %u: rte_flow_configure failed (ret=%d, type=%d): %s\n",
+			port_id, ret, error.type, error.message ? error.message : "(no message)");
+		return ret;
+	}
+
+	/* Create pattern template */
+	res->pattern_template =
+		rte_flow_pattern_template_create(port_id, &pt_attr, pattern, &error);
+	if (res->pattern_template == NULL) {
+		fprintf(stderr, "Port %u: pattern template create failed: %s\n", port_id,
+			error.message ? error.message : "(no message)");
+		return -1;
+	}
+
+	/* Create actions template */
+	res->actions_template =
+		rte_flow_actions_template_create(port_id, &at_attr, actions, action_masks, &error);
+	if (res->actions_template == NULL) {
+		fprintf(stderr, "Port %u: actions template create failed: %s\n", port_id,
+			error.message ? error.message : "(no message)");
+		rte_flow_pattern_template_destroy(port_id, res->pattern_template, &error);
+		res->pattern_template = NULL;
+		return -1;
+	}
+
+	/* Create template table */
+	res->table = rte_flow_template_table_create(port_id, &table_attr, &res->pattern_template, 1,
+		&res->actions_template, 1, &error);
+	if (res->table == NULL) {
+		fprintf(stderr, "Port %u: template table create failed: %s\n", port_id,
+			error.message ? error.message : "(no message)");
+		rte_flow_actions_template_destroy(port_id, res->actions_template, &error);
+		rte_flow_pattern_template_destroy(port_id, res->pattern_template, &error);
+		res->pattern_template = NULL;
+		res->actions_template = NULL;
+		return -1;
+	}
+
+	/* Allocate and pre-initialize per-slot flat buffers */
+	ret = init_slot_pool(res, nb_queues, queue_size, pattern, n_items, item_spec_sizes, actions,
+		n_actions, action_conf_sizes);
+	if (ret != 0) {
+		fprintf(stderr, "Port %u: slot pool init failed\n", port_id);
+		rte_flow_template_table_destroy(port_id, res->table, &error);
+		rte_flow_actions_template_destroy(port_id, res->actions_template, &error);
+		rte_flow_pattern_template_destroy(port_id, res->pattern_template, &error);
+		res->table = NULL;
+		res->actions_template = NULL;
+		res->pattern_template = NULL;
+		return ret;
+	}
+
+	res->table_capacity = rules_count;
+	res->initialized = true;
+
+	printf(":: Port %u: Async flow engine initialized (queues=%u, queue_size=%u)\n", port_id,
+		nb_queues, queue_size);
+
+	return 0;
+}
+
+struct rte_flow *
+async_generate_flow(uint16_t port_id, uint32_t queue_id, uint32_t counter, uint16_t hairpinq,
+	uint64_t encap_data, uint64_t decap_data, uint16_t dst_port, uint8_t core_idx,
+	uint8_t rx_queues_count, bool unique_data, bool postpone,
+	struct rte_flow_error *error)
+{
+	struct async_flow_resources *res;
+	struct async_flow_queue *q;
+	uint8_t *slot;
+	uint32_t idx, items_array_bytes;
+	struct rte_flow_item *items;
+	struct rte_flow_action *actions;
+	struct rte_flow_op_attr op_attr = {
+		.postpone = postpone,
+	};
+
+	if (port_id >= MAX_PORTS) {
+		rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+			"Invalid port ID");
+		return NULL;
+	}
+
+	res = &port_resources[port_id];
+	if (!res->initialized) {
+		rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+			"Async flow resources not initialized");
+		return NULL;
+	}
+
+	if (queue_id >= res->nb_queues) {
+		rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+			"Invalid queue ID");
+		return NULL;
+	}
+
+	/* Pick the next slot from this queue's ring. NOTE(review): a slot is reused
+	 * after slots_per_queue enqueues — assumes its op completed by then; confirm. */
+	q = &res->queues[queue_id];
+	idx = q->head;
+	q->head = (idx + 1) & (res->slots_per_queue - 1); /* slots_per_queue is a power of 2 */
+	slot = q->slots + (size_t)idx * res->slot_size;
+	items_array_bytes = res->n_items * sizeof(struct rte_flow_item);
+	items = (struct rte_flow_item *)slot;
+	actions = (struct rte_flow_action *)(slot + items_array_bytes);
+
+	/* Update only per-flow varying values */
+	update_item_values(items, counter);
+	update_action_values(actions, counter, hairpinq, encap_data, decap_data, core_idx,
+		unique_data, rx_queues_count, dst_port);
+
+	return rte_flow_async_create(port_id, queue_id, &op_attr, res->table, items, 0, actions, 0,
+		NULL, error); /* template indexes 0/0: one pattern/actions template per table */
+}
+
+void
+async_flow_cleanup_port(uint16_t port_id)
+{
+	struct async_flow_resources *res;
+	struct rte_flow_error error;
+	struct rte_flow_op_result results[64];
+	uint32_t q;
+	int ret, i;
+
+	if (port_id >= MAX_PORTS)
+		return;
+	res = &port_resources[port_id];
+	if (!res->initialized)
+		return;
+
+	/* Drain pending async completions on every configured queue, not only
+	 * queue 0, so no operation is left in flight at teardown. */
+	for (q = 0; q < res->nb_queues; q++) {
+		for (i = 0; i < DRAIN_MAX_ITERATIONS; i++) {
+			rte_flow_push(port_id, q, &error);
+			ret = rte_flow_pull(port_id, q, results, 64, &error);
+			if (ret <= 0)
+				break;
+		}
+	}
+
+	if (res->table != NULL) {
+		rte_flow_template_table_destroy(port_id, res->table, &error);
+		res->table = NULL;
+	}
+	if (res->actions_template != NULL) {
+		rte_flow_actions_template_destroy(port_id, res->actions_template, &error);
+		res->actions_template = NULL;
+	}
+	if (res->pattern_template != NULL) {
+		rte_flow_pattern_template_destroy(port_id, res->pattern_template, &error);
+		res->pattern_template = NULL;
+	}
+
+	free(res->queues);
+	res->queues = NULL;
+	free(res->slot_pool);
+	res->slot_pool = NULL;
+	free(res->shared_masks);
+	res->shared_masks = NULL;
+	res->initialized = false;
+}
diff --git a/app/test-flow-perf/async_flow.h b/app/test-flow-perf/async_flow.h
new file mode 100644
index 0000000000..8c12924bc6
--- /dev/null
+++ b/app/test-flow-perf/async_flow.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2026 Maxime Peim <maxime.peim@gmail.com>
+ *
+ * This file contains the async flow API related definitions
+ * and function declarations.
+ */
+
+#ifndef FLOW_PERF_ASYNC_FLOW
+#define FLOW_PERF_ASYNC_FLOW
+
+#include <rte_flow.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "config.h"
+
+/* Per-queue slot ring — tracks which slot to use next */
+struct async_flow_queue {
+	uint8_t *slots; /* pointer to this queue's region within slot_pool */
+	uint32_t head; /* next slot index; wraps via mask, so slots_per_queue must be a power of 2 */
+};
+
+/* Per-port async flow resources */
+struct async_flow_resources {
+	struct rte_flow_pattern_template *pattern_template;
+	struct rte_flow_actions_template *actions_template;
+	struct rte_flow_template_table *table;
+	uint8_t *slot_pool; /* flat buffer pool for all slots */
+	uint8_t *shared_masks; /* shared item mask data (one copy for all slots) */
+	struct async_flow_queue *queues;
+	uint32_t slot_size; /* bytes per slot (cache-line aligned) */
+	uint32_t slots_per_queue; /* = queue_size */
+	uint32_t nb_queues;
+	uint32_t n_items; /* item count including the trailing END entry */
+	uint32_t n_actions; /* action count including the trailing END entry */
+	uint32_t table_capacity; /* rules_count used at init (the table's nb_flows) */
+	bool initialized;
+};
+
+/* Initialize async flow engine for a port */
+int async_flow_init_port(uint16_t port_id, uint32_t nb_queues, uint32_t queue_size,
+ uint64_t *flow_items, uint64_t *flow_actions, uint64_t *flow_attrs,
+ uint8_t flow_group, uint32_t rules_count);
+
+/* Create a flow rule asynchronously using pre-allocated slot */
+struct rte_flow *async_generate_flow(uint16_t port_id, uint32_t queue_id, uint32_t counter,
+ uint16_t hairpinq, uint64_t encap_data, uint64_t decap_data,
+ uint16_t dst_port, uint8_t core_idx, uint8_t rx_queues_count,
+ bool unique_data, bool postpone, struct rte_flow_error *error);
+
+/* Cleanup async flow resources for a port */
+void async_flow_cleanup_port(uint16_t port_id);
+
+#endif /* FLOW_PERF_ASYNC_FLOW */
diff --git a/app/test-flow-perf/items_gen.c b/app/test-flow-perf/items_gen.c
index c740e1838f..58f1c16cf8 100644
--- a/app/test-flow-perf/items_gen.c
+++ b/app/test-flow-perf/items_gen.c
@@ -389,3 +389,61 @@ fill_items(struct rte_flow_item *items,
items[items_counter].type = RTE_FLOW_ITEM_TYPE_END;
}
+
+static size_t
+item_spec_size(enum rte_flow_item_type type) /* spec/mask byte size for TYPE; 0 if not handled here */
+{
+	switch (type) {
+	case RTE_FLOW_ITEM_TYPE_ETH:
+		return sizeof(struct rte_flow_item_eth);
+	case RTE_FLOW_ITEM_TYPE_VLAN:
+		return sizeof(struct rte_flow_item_vlan);
+	case RTE_FLOW_ITEM_TYPE_IPV4:
+		return sizeof(struct rte_flow_item_ipv4);
+	case RTE_FLOW_ITEM_TYPE_IPV6:
+		return sizeof(struct rte_flow_item_ipv6);
+	case RTE_FLOW_ITEM_TYPE_TCP:
+		return sizeof(struct rte_flow_item_tcp);
+	case RTE_FLOW_ITEM_TYPE_UDP:
+		return sizeof(struct rte_flow_item_udp);
+	case RTE_FLOW_ITEM_TYPE_VXLAN:
+		return sizeof(struct rte_flow_item_vxlan);
+	case RTE_FLOW_ITEM_TYPE_VXLAN_GPE:
+		return sizeof(struct rte_flow_item_vxlan_gpe);
+	case RTE_FLOW_ITEM_TYPE_GRE:
+		return sizeof(struct rte_flow_item_gre);
+	case RTE_FLOW_ITEM_TYPE_GENEVE:
+		return sizeof(struct rte_flow_item_geneve);
+	case RTE_FLOW_ITEM_TYPE_GTP:
+		return sizeof(struct rte_flow_item_gtp);
+	case RTE_FLOW_ITEM_TYPE_META:
+		return sizeof(struct rte_flow_item_meta);
+	case RTE_FLOW_ITEM_TYPE_TAG:
+		return sizeof(struct rte_flow_item_tag);
+	case RTE_FLOW_ITEM_TYPE_ICMP:
+		return sizeof(struct rte_flow_item_icmp);
+	case RTE_FLOW_ITEM_TYPE_ICMP6:
+		return sizeof(struct rte_flow_item_icmp6);
+	default:
+		return 0; /* no per-item spec data tracked for other types */
+	}
+}
+
+void
+fill_items_template(struct rte_flow_item *items, uint64_t *flow_items, uint32_t outer_ip_src,
+	uint8_t core_idx, size_t *spec_sizes, uint32_t *n_items_out)
+{
+	uint32_t count;
+
+	fill_items(items, flow_items, outer_ip_src, core_idx);
+	/* Count items before END */
+	for (count = 0; items[count].type != RTE_FLOW_ITEM_TYPE_END; count++) {
+		spec_sizes[count] = item_spec_size(items[count].type);
+		/* For templates, set spec to NULL - only mask matters for template matching */
+		items[count].spec = NULL;
+	}
+	/* END has no spec: zero its size so callers can read every returned entry */
+	spec_sizes[count] = 0;
+	/* take END into account */
+	*n_items_out = count + 1;
+}
diff --git a/app/test-flow-perf/items_gen.h b/app/test-flow-perf/items_gen.h
index f4b0e9a981..0987f7be3c 100644
--- a/app/test-flow-perf/items_gen.h
+++ b/app/test-flow-perf/items_gen.h
@@ -15,4 +15,10 @@
void fill_items(struct rte_flow_item *items, uint64_t *flow_items,
uint32_t outer_ip_src, uint8_t core_idx);
+/* Fill items template for async flow API (masks only, no spec values).
+ * Populates spec_sizes with per-item spec sizes and n_items_out with the
+ * item count including the trailing END entry. */
+void fill_items_template(struct rte_flow_item *items, uint64_t *flow_items, uint32_t outer_ip_src,
+ uint8_t core_idx, size_t *spec_sizes, uint32_t *n_items_out);
+
#endif /* FLOW_PERF_ITEMS_GEN */
diff --git a/app/test-flow-perf/main.c b/app/test-flow-perf/main.c
index 6636d1517f..2c6def95c2 100644
--- a/app/test-flow-perf/main.c
+++ b/app/test-flow-perf/main.c
@@ -37,11 +37,15 @@
#include <rte_mtr.h>
#include <rte_os_shim.h>
-#include "config.h"
#include "actions_gen.h"
+#include "async_flow.h"
+#include "config.h"
#include "flow_gen.h"
+#include "rte_build_config.h"
#define MAX_BATCHES_COUNT 100
+#define MAX_ASYNC_QUEUE_SIZE (1 << 14)
+#define MAX_PULL_RETRIES (1 << 20)
#define DEFAULT_RULES_COUNT 4000000
#define DEFAULT_RULES_BATCH 100000
#define DEFAULT_GROUP 0
@@ -55,7 +59,6 @@
#define HAIRPIN_TX_CONF_LOCKED_MEMORY (0x0100)
#define HAIRPIN_TX_CONF_RTE_MEMORY (0x0200)
-struct rte_flow *flow;
static uint8_t flow_group;
static uint64_t encap_data;
@@ -81,6 +84,9 @@ static bool enable_fwd;
static bool unique_data;
static bool policy_mtr;
static bool packet_mode;
+static bool async_mode;
+static uint32_t async_queue_size = 1024;
+static uint32_t async_push_batch = 256;
static uint8_t rx_queues_count;
static uint8_t tx_queues_count;
@@ -598,6 +604,29 @@ usage(char *progname)
"Encapped data is fixed with pattern: ether,ipv4,udp,vxlan\n"
"With fixed values\n");
printf(" --vxlan-decap: add vxlan_decap action to flow actions\n");
+
+ printf("\nAsync flow API options:\n");
+ printf(" --async: enable async flow API mode\n");
+ printf(" --async-queue-size=N: size of each async queue,"
+ " default is 1024\n");
+ printf(" --async-push-batch=N: flows to batch before push,"
+ " default is 256\n");
+}
+
+static inline uint32_t
+prev_power_of_two(uint32_t x) /* largest power of two <= x (0 yields 0); cf. rte_align32prevpow2() used in async_flow.c */
+{
+	uint32_t saved = x; /* keep the input to detect an exact power of two */
+	x--;
+	x |= x >> 1; /* bit smear: propagate the MSB into all lower bits */
+	x |= x >> 2;
+	x |= x >> 4;
+	x |= x >> 8;
+	x |= x >> 16;
+	x++; /* x is now the next power of two >= saved */
+	if (x == saved)
+		return x; /* input was already a power of two */
+	return x >> 1; /* otherwise step down to the previous power */
}
static void
@@ -734,6 +763,9 @@ args_parse(int argc, char **argv)
{ "policy-mtr", 1, 0, 0 },
{ "meter-profile", 1, 0, 0 },
{ "packet-mode", 0, 0, 0 },
+ { "async", 0, 0, 0 },
+ { "async-queue-size", 1, 0, 0 },
+ { "async-push-batch", 1, 0, 0 },
{ 0, 0, 0, 0 },
};
@@ -913,8 +945,7 @@ args_parse(int argc, char **argv)
rte_exit(EXIT_FAILURE, "Invalid hairpin config mask\n");
hairpin_conf_mask = hp_conf;
}
- if (strcmp(lgopts[opt_idx].name,
- "port-id") == 0) {
+ if (strcmp(lgopts[opt_idx].name, "port-id") == 0) {
uint16_t port_idx = 0;
token = strtok(optarg, ",");
@@ -981,6 +1012,26 @@ args_parse(int argc, char **argv)
}
if (strcmp(lgopts[opt_idx].name, "packet-mode") == 0)
packet_mode = true;
+ if (strcmp(lgopts[opt_idx].name, "async") == 0)
+ async_mode = true;
+ if (strcmp(lgopts[opt_idx].name, "async-queue-size") == 0) {
+ n = atoi(optarg);
+ if (n >= MAX_ASYNC_QUEUE_SIZE)
+ async_queue_size = MAX_ASYNC_QUEUE_SIZE;
+ else if (n > 0)
+ async_queue_size = prev_power_of_two(n);
+ else
+ rte_exit(EXIT_FAILURE, "async-queue-size should be > 0\n");
+ }
+ if (strcmp(lgopts[opt_idx].name, "async-push-batch") == 0) {
+ n = atoi(optarg);
+ if (n >= MAX_ASYNC_QUEUE_SIZE >> 1)
+ async_push_batch = MAX_ASYNC_QUEUE_SIZE >> 1;
+ else if (n > 0)
+ async_push_batch = prev_power_of_two(n);
+ else
+ rte_exit(EXIT_FAILURE, "async-push-batch should be > 0\n");
+ }
break;
default:
usage(argv[0]);
@@ -1457,10 +1508,10 @@ query_flows(int port_id, uint8_t core_id, struct rte_flow **flows_list)
mc_pool.flows_record.query[port_id][core_id] = cpu_time_used;
}
-static struct rte_flow **
-insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id)
+static void
+insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id, struct rte_flow **flows_list)
{
- struct rte_flow **flows_list;
+ struct rte_flow *flow;
struct rte_flow_error error;
clock_t start_batch, end_batch;
double first_flow_latency;
@@ -1485,8 +1536,7 @@ insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id)
global_items[0] = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH);
global_actions[0] = FLOW_ITEM_MASK(RTE_FLOW_ACTION_TYPE_JUMP);
- flows_list = rte_zmalloc("flows_list",
- (sizeof(struct rte_flow *) * (rules_count_per_core + 1)), 0);
+ flows_list = malloc(sizeof(struct rte_flow *) * (rules_count_per_core + 1));
if (flows_list == NULL)
rte_exit(EXIT_FAILURE, "No Memory available!\n");
@@ -1524,6 +1574,11 @@ insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id)
core_id, rx_queues_count,
unique_data, max_priority, &error);
+ if (!flow) {
+ print_flow_error(error);
+ rte_exit(EXIT_FAILURE, "Error in creating flow\n");
+ }
+
if (!counter) {
first_flow_latency = (double) (rte_get_timer_cycles() - start_batch);
first_flow_latency /= rte_get_timer_hz();
@@ -1537,11 +1592,6 @@ insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id)
if (force_quit)
counter = end_counter;
- if (!flow) {
- print_flow_error(error);
- rte_exit(EXIT_FAILURE, "Error in creating flow\n");
- }
-
flows_list[flow_index++] = flow;
/*
@@ -1575,7 +1625,203 @@ insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id)
port_id, core_id, rules_count_per_core, cpu_time_used);
mc_pool.flows_record.insertion[port_id][core_id] = cpu_time_used;
- return flows_list;
+}
+
+static uint32_t push_counter[RTE_MAX_LCORE];
+
+static inline int
+push_pull_flows_async(int port_id, int queue_id, int core_id, uint32_t enqueued, bool empty,
+ bool check_op_status, struct rte_flow_error *error)
+{
+ static struct rte_flow_op_result results[RTE_MAX_LCORE][MAX_ASYNC_QUEUE_SIZE];
+ uint32_t to_pull = (empty || async_push_batch > enqueued) ? enqueued : async_push_batch;
+ uint32_t pulled_complete = 0;
+ uint32_t retries = 0;
+ int pulled, i;
+ int ret = 0;
+
+ /* Push periodically to give HW work to do */
+ ret = rte_flow_push(port_id, queue_id, error);
+ if (ret)
+ return ret;
+ push_counter[core_id]++;
+
+ /* Check if queue is getting full, if so push and drain completions */
+ if (!empty && push_counter[core_id] == 1)
+ return 0;
+
+ while (to_pull > 0) {
+ pulled = rte_flow_pull(port_id, queue_id, results[core_id], to_pull, error);
+ if (pulled < 0) {
+ return -1;
+ } else if (pulled == 0) {
+ if (++retries > MAX_PULL_RETRIES) {
+ rte_flow_error_set(error, ETIMEDOUT,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Timeout waiting for async completions");
+ return -1;
+ }
+ rte_pause();
+ continue;
+ }
+ retries = 0;
+
+ to_pull -= pulled;
+ pulled_complete += pulled;
+ if (!check_op_status)
+ continue;
+
+ for (i = 0; i < pulled; i++) {
+ if (results[core_id][i].status != RTE_FLOW_OP_SUCCESS) {
+ rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL, "Some flow rule insertion failed");
+ return -1;
+ }
+ }
+ }
+
+ return pulled_complete;
+}
+
+static void
+insert_flows_async(int port_id, uint8_t core_id, uint16_t dst_port_id, struct rte_flow **flows_list)
+{
+ struct rte_flow *flow;
+ struct rte_flow_error error;
+ clock_t start_batch, end_batch;
+ double first_flow_latency;
+ double cpu_time_used;
+ double insertion_rate;
+ double cpu_time_per_batch[MAX_BATCHES_COUNT] = {0};
+ double delta;
+ uint32_t flow_index;
+ uint32_t counter, batch_counter, start_counter = 0, end_counter;
+ int rules_batch_idx;
+ int rules_count_per_core;
+ uint32_t enqueued = 0;
+ uint32_t queue_id = core_id;
+ bool first_batch = true;
+ int pulled;
+
+ rules_count_per_core = rules_count / mc_pool.cores_count;
+
+ if (async_push_batch > async_queue_size >> 1)
+ async_push_batch = async_queue_size >> 1;
+
+ /* Set boundaries of rules for each core. */
+ if (core_id)
+ start_counter = core_id * rules_count_per_core;
+ end_counter = (core_id + 1) * rules_count_per_core;
+
+ cpu_time_used = 0;
+ flow_index = 0;
+ push_counter[core_id] = 0;
+
+ if (flow_group > 0 && core_id == 0) {
+ /*
+ * Create global rule to jump into flow_group,
+ * this way the app will avoid the default rules.
+ *
+ * This rule will be created only once.
+ *
+ * Global rule:
+ * group 0 eth / end actions jump group <flow_group>
+ */
+
+ uint64_t global_items[MAX_ITEMS_NUM] = {0};
+ uint64_t global_actions[MAX_ACTIONS_NUM] = {0};
+ global_items[0] = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH);
+ global_actions[0] = FLOW_ITEM_MASK(RTE_FLOW_ACTION_TYPE_JUMP);
+ flow = generate_flow(port_id, 0, flow_attrs, global_items, global_actions,
+ flow_group, 0, 0, 0, 0, dst_port_id, core_id, rx_queues_count,
+ unique_data, max_priority, &error);
+
+ if (flow == NULL) {
+ print_flow_error(error);
+ rte_exit(EXIT_FAILURE, "Error in creating flow\n");
+ }
+ flows_list[flow_index++] = flow;
+ }
+
+ start_batch = rte_get_timer_cycles();
+ for (counter = start_counter; counter < end_counter;) {
+ /* batch adding flow rules, this avoids unnecessary checks for push/pull */
+ for (batch_counter = 0; batch_counter < async_push_batch && counter < end_counter;
+ batch_counter++, counter++) {
+ /* Create flow with postpone=true to batch operations */
+ flow = async_generate_flow(port_id, queue_id, counter, hairpin_queues_num,
+ encap_data, decap_data, dst_port_id, core_id,
+ rx_queues_count, unique_data, true, &error);
+
+ if (!flow) {
+ print_flow_error(error);
+ rte_exit(EXIT_FAILURE, "Error in creating async flow\n");
+ }
+
+ if (force_quit)
+ break;
+
+ flows_list[flow_index++] = flow;
+ enqueued++;
+
+ /*
+ * Save the insertion rate for rules batch.
+ * Check if the insertion reached the rules
+ * patch counter, then save the insertion rate
+ * for this batch.
+ */
+ if (!((counter + 1) % rules_batch)) {
+ end_batch = rte_get_timer_cycles();
+ delta = (double)(end_batch - start_batch);
+ rules_batch_idx = ((counter + 1) / rules_batch) - 1;
+ cpu_time_per_batch[rules_batch_idx] = delta / rte_get_timer_hz();
+ cpu_time_used += cpu_time_per_batch[rules_batch_idx];
+ start_batch = rte_get_timer_cycles();
+ }
+ }
+
+ if ((pulled = push_pull_flows_async(port_id, queue_id, core_id, enqueued, false,
+ true, &error)) < 0) {
+ print_flow_error(error);
+ rte_exit(EXIT_FAILURE, "Error push/pull async operations\n");
+ }
+
+ enqueued -= pulled;
+
+ if (first_batch) {
+ first_flow_latency = (double)(rte_get_timer_cycles() - start_batch);
+ first_flow_latency /= rte_get_timer_hz();
+ /* In millisecond */
+ first_flow_latency *= 1000;
+ printf(":: First Flow Batch Latency (Async) :: Port %d :: First batch (%u) "
+ "installed in %f milliseconds\n",
+ port_id, async_push_batch, first_flow_latency);
+ first_batch = false;
+ }
+ }
+
+ if (push_pull_flows_async(port_id, queue_id, core_id, enqueued, true, true, &error) < 0) {
+ print_flow_error(error);
+ rte_exit(EXIT_FAILURE, "Error final push/pull async operations\n");
+ }
+
+ /* Print insertion rates for all batches */
+ if (dump_iterations)
+ print_rules_batches(cpu_time_per_batch);
+
+ printf(":: Port %d :: Core %d boundaries (Async) :: start @[%d] - end @[%d]\n", port_id,
+ core_id, start_counter, end_counter - 1);
+
+ /* Insertion rate for all rules in one core */
+ if (cpu_time_used > 0) {
+ insertion_rate = ((double)rules_count_per_core / cpu_time_used) / 1000;
+ printf(":: Port %d :: Core %d :: Async rules insertion rate -> %f K Rule/Sec\n",
+ port_id, core_id, insertion_rate);
+ }
+ printf(":: Port %d :: Core %d :: The time for creating %d async rules is %f seconds\n",
+ port_id, core_id, rules_count_per_core, cpu_time_used);
+
+ mc_pool.flows_record.insertion[port_id][core_id] = cpu_time_used;
}
static void
@@ -1585,12 +1831,18 @@ flows_handler(uint8_t core_id)
uint16_t port_idx = 0;
uint16_t nr_ports;
int port_id;
+ int rules_count_per_core;
nr_ports = rte_eth_dev_count_avail();
if (rules_batch > rules_count)
rules_batch = rules_count;
+ rules_count_per_core = rules_count / mc_pool.cores_count;
+ flows_list = malloc(sizeof(struct rte_flow *) * (rules_count_per_core + 1));
+ if (flows_list == NULL)
+ rte_exit(EXIT_FAILURE, "No Memory available!\n");
+
printf(":: Rules Count per port: %d\n\n", rules_count);
for (port_id = 0; port_id < nr_ports; port_id++) {
@@ -1602,10 +1854,10 @@ flows_handler(uint8_t core_id)
mc_pool.last_alloc[core_id] = (int64_t)dump_socket_mem(stdout);
if (has_meter())
meters_handler(port_id, core_id, METER_CREATE);
- flows_list = insert_flows(port_id, core_id,
- dst_ports[port_idx++]);
- if (flows_list == NULL)
- rte_exit(EXIT_FAILURE, "Error: Insertion Failed!\n");
+ if (async_mode)
+ insert_flows_async(port_id, core_id, dst_ports[port_idx++], flows_list);
+ else
+ insert_flows(port_id, core_id, dst_ports[port_idx++], flows_list);
mc_pool.current_alloc[core_id] = (int64_t)dump_socket_mem(stdout);
if (query_flag)
@@ -2212,6 +2464,16 @@ init_port(void)
}
}
+ /* Configure async flow engine before device start */
+ if (async_mode) {
+ ret = async_flow_init_port(port_id, mc_pool.cores_count, async_queue_size,
+ flow_items, flow_actions, flow_attrs, flow_group,
+ rules_count);
+ if (ret != 0)
+ rte_exit(EXIT_FAILURE, "Failed to init async flow on port %d\n",
+ port_id);
+ }
+
ret = rte_eth_dev_start(port_id);
if (ret < 0)
rte_exit(EXIT_FAILURE,
@@ -2291,6 +2553,8 @@ main(int argc, char **argv)
RTE_ETH_FOREACH_DEV(port) {
rte_flow_flush(port, &error);
+ if (async_mode)
+ async_flow_cleanup_port(port);
if (rte_eth_dev_stop(port) != 0)
printf("Failed to stop device on port %u\n", port);
rte_eth_dev_close(port);
diff --git a/app/test-flow-perf/meson.build b/app/test-flow-perf/meson.build
index e101449e32..2f820a7597 100644
--- a/app/test-flow-perf/meson.build
+++ b/app/test-flow-perf/meson.build
@@ -3,6 +3,7 @@
sources = files(
'actions_gen.c',
+ 'async_flow.c',
'flow_gen.c',
'items_gen.c',
'main.c',
--
2.43.0
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH v2] test/flow: add support for async API
2026-03-01 23:29 ` [PATCH v2] " Maxime Peim
@ 2026-03-01 23:36 ` Maxime Peim
2026-03-02 0:52 ` Stephen Hemminger
1 sibling, 0 replies; 8+ messages in thread
From: Maxime Peim @ 2026-03-01 23:36 UTC (permalink / raw)
To: dev; +Cc: stephen
[-- Attachment #1: Type: text/plain, Size: 83333 bytes --]
Hi Stephen,
Thanks for the review!
Some of the AI-generated comments did point to real issues, but those issues
existed before my changes (the global flow variable, for example).
I have nevertheless updated my changes to fix them as well.
Let me know if any other changes are needed.
Also, I have made an improvement by pre-allocating a memory pool for async
flows:
- it avoids having every in-flight flow in the queue point to the same items /
actions, as happened when using static variables
- it allows better locality and cache usage
With this change I was able to double the number of flows I can insert on a
BF3.
On Mon, Mar 2, 2026 at 12:29 AM Maxime Peim <maxime.peim@gmail.com> wrote:
> Add async flow API mode to test-flow-perf application for improved
> flow rule insertion performance. The async API allows batching flow
> rule creation operations and processing completions in bulk, reducing
> per-rule overhead.
>
> New command line options:
> --async: enable async flow API mode
> --async-queue-size=N: size of async queues (default: 1024)
> --async-push-batch=N: flows to batch before push (default: 256)
>
> Signed-off-by: Maxime Peim <maxime.peim@gmail.com>
> ---
> v2:
> - Replace per-flow stack allocation with pre-allocated slot pool;
> flat buffers are initialized once at init time and the hot path
> only patches per-flow item/action values into a pre-set slot
> - Fix alloca misuse: use heap allocation for queue_attr_list, round
> queue_size to power of 2 for bitmask wrapping, add bounds checks
> - Fix race on file-scope flow variable, premature latency
> measurement, and integer division in rate calculation
> - Drop unrelated lgopts reformatting
> - Use malloc instead of rte_zmalloc for non-dataplane allocations
> - Various robustness and style fixes
>
> app/test-flow-perf/actions_gen.c | 281 +++++++++++-
> app/test-flow-perf/actions_gen.h | 31 ++
> app/test-flow-perf/async_flow.c | 761 +++++++++++++++++++++++++++++++
> app/test-flow-perf/async_flow.h | 54 +++
> app/test-flow-perf/items_gen.c | 58 +++
> app/test-flow-perf/items_gen.h | 6 +
> app/test-flow-perf/main.c | 302 +++++++++++-
> app/test-flow-perf/meson.build | 1 +
> 8 files changed, 1454 insertions(+), 40 deletions(-)
> create mode 100644 app/test-flow-perf/async_flow.c
> create mode 100644 app/test-flow-perf/async_flow.h
>
> diff --git a/app/test-flow-perf/actions_gen.c
> b/app/test-flow-perf/actions_gen.c
> index 9d102e3af4..2b8edd50c8 100644
> --- a/app/test-flow-perf/actions_gen.c
> +++ b/app/test-flow-perf/actions_gen.c
> @@ -36,27 +36,7 @@ struct additional_para {
> bool unique_data;
> };
>
> -/* Storage for struct rte_flow_action_raw_encap including external data.
> */
> -struct action_raw_encap_data {
> - struct rte_flow_action_raw_encap conf;
> - uint8_t data[128];
> - uint8_t preserve[128];
> - uint16_t idx;
> -};
> -
> -/* Storage for struct rte_flow_action_raw_decap including external data.
> */
> -struct action_raw_decap_data {
> - struct rte_flow_action_raw_decap conf;
> - uint8_t data[128];
> - uint16_t idx;
> -};
> -
> -/* Storage for struct rte_flow_action_rss including external data. */
> -struct action_rss_data {
> - struct rte_flow_action_rss conf;
> - uint8_t key[40];
> - uint16_t queue[128];
> -};
> +/* Compound action data structs defined in actions_gen.h */
>
> static void
> add_mark(struct rte_flow_action *actions,
> @@ -1165,3 +1145,262 @@ fill_actions(struct rte_flow_action *actions,
> uint64_t *flow_actions,
> free(queues);
> free(hairpin_queues);
> }
> +
> +static size_t
> +action_conf_size(enum rte_flow_action_type type)
> +{
> + switch (type) {
> + case RTE_FLOW_ACTION_TYPE_MARK:
> + return sizeof(struct rte_flow_action_mark);
> + case RTE_FLOW_ACTION_TYPE_QUEUE:
> + return sizeof(struct rte_flow_action_queue);
> + case RTE_FLOW_ACTION_TYPE_JUMP:
> + return sizeof(struct rte_flow_action_jump);
> + case RTE_FLOW_ACTION_TYPE_RSS:
> + return sizeof(struct action_rss_data);
> + case RTE_FLOW_ACTION_TYPE_SET_META:
> + return sizeof(struct rte_flow_action_set_meta);
> + case RTE_FLOW_ACTION_TYPE_SET_TAG:
> + return sizeof(struct rte_flow_action_set_tag);
> + case RTE_FLOW_ACTION_TYPE_PORT_ID:
> + return sizeof(struct rte_flow_action_port_id);
> + case RTE_FLOW_ACTION_TYPE_COUNT:
> + return sizeof(struct rte_flow_action_count);
> + case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
> + case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
> + return sizeof(struct rte_flow_action_set_mac);
> + case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
> + case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
> + return sizeof(struct rte_flow_action_set_ipv4);
> + case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
> + case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
> + return sizeof(struct rte_flow_action_set_ipv6);
> + case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
> + case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
> + return sizeof(struct rte_flow_action_set_tp);
> + case RTE_FLOW_ACTION_TYPE_INC_TCP_ACK:
> + case RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK:
> + case RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ:
> + case RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ:
> + return sizeof(rte_be32_t);
> + case RTE_FLOW_ACTION_TYPE_SET_TTL:
> + return sizeof(struct rte_flow_action_set_ttl);
> + case RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP:
> + case RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP:
> + return sizeof(struct rte_flow_action_set_dscp);
> + case RTE_FLOW_ACTION_TYPE_METER:
> + return sizeof(struct rte_flow_action_meter);
> + case RTE_FLOW_ACTION_TYPE_RAW_ENCAP:
> + return sizeof(struct action_raw_encap_data);
> + case RTE_FLOW_ACTION_TYPE_RAW_DECAP:
> + return sizeof(struct action_raw_decap_data);
> + case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
> + return sizeof(struct rte_flow_action_vxlan_encap) +
> + 5 * sizeof(struct rte_flow_item) + sizeof(struct
> rte_flow_item_eth) +
> + sizeof(struct rte_flow_item_ipv4) + sizeof(struct
> rte_flow_item_udp) +
> + sizeof(struct rte_flow_item_vxlan);
> + case RTE_FLOW_ACTION_TYPE_MODIFY_FIELD:
> + return sizeof(struct rte_flow_action_modify_field);
> + /* Zero-conf types */
> + case RTE_FLOW_ACTION_TYPE_DROP:
> + case RTE_FLOW_ACTION_TYPE_FLAG:
> + case RTE_FLOW_ACTION_TYPE_DEC_TTL:
> + case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
> + return 0;
> + default:
> + return 0;
> + }
> +}
> +
> +void
> +fill_actions_template(struct rte_flow_action *actions, struct
> rte_flow_action *masks,
> + uint64_t *flow_actions, struct rte_flow_port_attr
> *port_attr,
> + bool *need_wire_orig_table, size_t *conf_sizes,
> uint32_t *n_actions_out)
> +{
> + uint8_t actions_counter = 0;
> + uint8_t i, j;
> +
> + *need_wire_orig_table = false;
> + memset(port_attr, 0, sizeof(*port_attr));
> +
> + /* Static configurations for actions that need them in templates */
> + static struct rte_flow_action_mark mark_conf = {
> + .id = 1,
> + };
> + static struct rte_flow_action_queue queue_conf = {
> + .index = 0,
> + };
> + static struct rte_flow_action_port_id port_id_conf = {
> + .id = 0,
> + };
> + static struct rte_flow_action_jump jump_conf = {
> + .group = 1,
> + };
> + static struct rte_flow_action_modify_field set_meta_conf = {
> + .operation = RTE_FLOW_MODIFY_SET,
> + .dst = {.field = RTE_FLOW_FIELD_META},
> + .src =
> + {
> + .field = RTE_FLOW_FIELD_VALUE,
> + .value = {0, 0, 0, META_DATA},
> + },
> + .width = 32,
> + };
> +
> + /* Static mask configurations for each action type */
> + static struct rte_flow_action_mark mark_mask = {
> + .id = UINT32_MAX,
> + };
> + static struct rte_flow_action_queue queue_mask = {
> + .index = UINT16_MAX,
> + };
> + static struct rte_flow_action_jump jump_mask = {
> + .group = UINT32_MAX,
> + };
> + static struct rte_flow_action_rss rss_mask = {
> + .level = UINT32_MAX,
> + .types = UINT64_MAX,
> + };
> + static struct rte_flow_action_set_meta set_meta_mask = {
> + .data = UINT32_MAX,
> + .mask = UINT32_MAX,
> + };
> + static struct rte_flow_action_set_tag set_tag_mask = {
> + .data = UINT32_MAX,
> + .mask = UINT32_MAX,
> + .index = UINT8_MAX,
> + };
> + static struct rte_flow_action_port_id port_id_mask = {
> + .id = UINT32_MAX,
> + };
> + static struct rte_flow_action_count count_mask;
> + static struct rte_flow_action_set_mac set_mac_mask = {
> + .mac_addr = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
> + };
> + static struct rte_flow_action_set_ipv4 set_ipv4_mask = {
> + .ipv4_addr = UINT32_MAX,
> + };
> + static struct rte_flow_action_set_ipv6 set_ipv6_mask = {
> + .ipv6_addr.a = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> 0xff, 0xff, 0xff, 0xff,
> + 0xff, 0xff, 0xff, 0xff, 0xff}};
> + static struct rte_flow_action_set_tp set_tp_mask = {
> + .port = UINT16_MAX,
> + };
> + static rte_be32_t tcp_seq_ack_mask = UINT32_MAX;
> + static struct rte_flow_action_set_ttl set_ttl_mask = {
> + .ttl_value = UINT8_MAX,
> + };
> + static struct rte_flow_action_set_dscp set_dscp_mask = {
> + .dscp = UINT8_MAX,
> + };
> + static struct rte_flow_action_meter meter_mask = {
> + .mtr_id = UINT32_MAX,
> + };
> +
> + static const struct {
> + uint64_t flow_mask;
> + enum rte_flow_action_type type;
> + const void *action_conf;
> + const void *action_mask;
> + const bool need_wire_orig_table;
> + } template_actions[] = {
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_MARK),
> RTE_FLOW_ACTION_TYPE_MARK, &mark_conf,
> + &mark_mask, true},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_COUNT),
> RTE_FLOW_ACTION_TYPE_COUNT, NULL,
> + &count_mask, false},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_MODIFY_FIELD),
> + RTE_FLOW_ACTION_TYPE_MODIFY_FIELD, &set_meta_conf,
> &set_meta_mask, false},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TAG),
> RTE_FLOW_ACTION_TYPE_SET_TAG, NULL,
> + &set_tag_mask, false},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_FLAG),
> RTE_FLOW_ACTION_TYPE_FLAG, NULL, NULL,
> + false},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_MAC_SRC),
> + RTE_FLOW_ACTION_TYPE_SET_MAC_SRC, NULL, &set_mac_mask,
> false},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_MAC_DST),
> + RTE_FLOW_ACTION_TYPE_SET_MAC_DST, NULL, &set_mac_mask,
> false},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC),
> + RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC, NULL, &set_ipv4_mask,
> false},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV4_DST),
> + RTE_FLOW_ACTION_TYPE_SET_IPV4_DST, NULL, &set_ipv4_mask,
> false},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC),
> + RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC, NULL, &set_ipv6_mask,
> false},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV6_DST),
> + RTE_FLOW_ACTION_TYPE_SET_IPV6_DST, NULL, &set_ipv6_mask,
> false},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TP_SRC),
> RTE_FLOW_ACTION_TYPE_SET_TP_SRC,
> + NULL, &set_tp_mask, false},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TP_DST),
> RTE_FLOW_ACTION_TYPE_SET_TP_DST,
> + NULL, &set_tp_mask, false},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_INC_TCP_ACK),
> + RTE_FLOW_ACTION_TYPE_INC_TCP_ACK, NULL,
> &tcp_seq_ack_mask, false},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK),
> + RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK, NULL,
> &tcp_seq_ack_mask, false},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ),
> + RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ, NULL,
> &tcp_seq_ack_mask, false},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ),
> + RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ, NULL,
> &tcp_seq_ack_mask, false},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TTL),
> RTE_FLOW_ACTION_TYPE_SET_TTL, NULL,
> + &set_ttl_mask, false},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DEC_TTL),
> RTE_FLOW_ACTION_TYPE_DEC_TTL, NULL,
> + NULL, false},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP),
> + RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP, NULL, &set_dscp_mask,
> false},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP),
> + RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP, NULL, &set_dscp_mask,
> false},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_QUEUE),
> RTE_FLOW_ACTION_TYPE_QUEUE,
> + &queue_conf, &queue_mask, true},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_RSS),
> RTE_FLOW_ACTION_TYPE_RSS, NULL,
> + &rss_mask, true},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_JUMP),
> RTE_FLOW_ACTION_TYPE_JUMP, &jump_conf,
> + &jump_mask, false},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_PORT_ID),
> RTE_FLOW_ACTION_TYPE_PORT_ID,
> + &port_id_conf, &port_id_mask, false},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DROP),
> RTE_FLOW_ACTION_TYPE_DROP, NULL, NULL,
> + false},
> + {HAIRPIN_QUEUE_ACTION, RTE_FLOW_ACTION_TYPE_QUEUE,
> &queue_conf, &queue_mask, false},
> + {HAIRPIN_RSS_ACTION, RTE_FLOW_ACTION_TYPE_RSS, NULL,
> &rss_mask, false},
> + {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_METER),
> RTE_FLOW_ACTION_TYPE_METER, NULL,
> + &meter_mask, false},
> + };
> +
> + for (j = 0; j < MAX_ACTIONS_NUM; j++) {
> + if (flow_actions[j] == 0)
> + break;
> + for (i = 0; i < RTE_DIM(template_actions); i++) {
> + if ((flow_actions[j] &
> template_actions[i].flow_mask) == 0)
> + continue;
> +
> + switch (template_actions[i].type) {
> + case RTE_FLOW_ACTION_TYPE_COUNT:
> + port_attr->nb_counters++;
> + break;
> + case RTE_FLOW_ACTION_TYPE_AGE:
> + port_attr->nb_aging_objects++;
> + break;
> + case RTE_FLOW_ACTION_TYPE_METER:
> + port_attr->nb_meters++;
> + break;
> + case RTE_FLOW_ACTION_TYPE_CONNTRACK:
> + port_attr->nb_conn_tracks++;
> + break;
> + case RTE_FLOW_ACTION_TYPE_QUOTA:
> + port_attr->nb_quotas++;
> + default:;
> + }
> +
> + actions[actions_counter].type =
> template_actions[i].type;
> + actions[actions_counter].conf =
> template_actions[i].action_conf;
> + masks[actions_counter].type =
> template_actions[i].type;
> + masks[actions_counter].conf =
> template_actions[i].action_mask;
> + conf_sizes[actions_counter] =
> action_conf_size(template_actions[i].type);
> + *need_wire_orig_table |=
> template_actions[i].need_wire_orig_table;
> + actions_counter++;
> + break;
> + }
> + }
> +
> + actions[actions_counter].type = RTE_FLOW_ACTION_TYPE_END;
> + masks[actions_counter].type = RTE_FLOW_ACTION_TYPE_END;
> +
> + /* take END into account */
> + *n_actions_out = actions_counter + 1;
> +}
> diff --git a/app/test-flow-perf/actions_gen.h
> b/app/test-flow-perf/actions_gen.h
> index 9e13b164f9..3ac0ffed59 100644
> --- a/app/test-flow-perf/actions_gen.h
> +++ b/app/test-flow-perf/actions_gen.h
> @@ -17,9 +17,40 @@
> #define RTE_VXLAN_GPE_UDP_PORT 250
> #define RTE_GENEVE_UDP_PORT 6081
>
> +/* Compound action data structures (needed by async_flow.c for slot init)
> */
> +
> +/* Storage for struct rte_flow_action_raw_encap including external data.
> */
> +struct action_raw_encap_data {
> + struct rte_flow_action_raw_encap conf;
> + uint8_t data[128];
> + uint8_t preserve[128];
> + uint16_t idx;
> +};
> +
> +/* Storage for struct rte_flow_action_raw_decap including external data.
> */
> +struct action_raw_decap_data {
> + struct rte_flow_action_raw_decap conf;
> + uint8_t data[128];
> + uint16_t idx;
> +};
> +
> +/* Storage for struct rte_flow_action_rss including external data. */
> +struct action_rss_data {
> + struct rte_flow_action_rss conf;
> + uint8_t key[40];
> + uint16_t queue[128];
> +};
> +
> void fill_actions(struct rte_flow_action *actions, uint64_t *flow_actions,
> uint32_t counter, uint16_t next_table, uint16_t hairpinq,
> uint64_t encap_data, uint64_t decap_data, uint8_t core_idx,
> bool unique_data, uint8_t rx_queues_count, uint16_t dst_port);
>
> +/* Fill actions template for async flow API (types only, no values).
> + * If conf_sizes is non-NULL, populates per-action conf sizes and
> n_actions_out.
> + */
> +void fill_actions_template(struct rte_flow_action *actions, struct
> rte_flow_action *masks,
> + uint64_t *flow_actions, struct
> rte_flow_port_attr *port_attr,
> + bool *need_wire_orig_table, size_t *conf_sizes,
> uint32_t *n_actions_out);
> +
> #endif /* FLOW_PERF_ACTION_GEN */
> diff --git a/app/test-flow-perf/async_flow.c
> b/app/test-flow-perf/async_flow.c
> new file mode 100644
> index 0000000000..ae5a922856
> --- /dev/null
> +++ b/app/test-flow-perf/async_flow.c
> @@ -0,0 +1,761 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2026 Maxime Peim <maxime.peim@gmail.com>
> + *
> + * This file contains the async flow API implementation
> + * for the flow-perf application.
> + */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <string.h>
> +
> +#include <rte_bitops.h>
> +#include <rte_common.h>
> +#include <rte_ethdev.h>
> +#include <rte_flow.h>
> +#include <rte_vxlan.h>
> +
> +#include "actions_gen.h"
> +#include "async_flow.h"
> +#include "flow_gen.h"
> +#include "items_gen.h"
> +
> +/* Max iterations when draining pending async completions during cleanup
> */
> +#define DRAIN_MAX_ITERATIONS 100
> +
> +/* Per-port async flow resources */
> +static struct async_flow_resources port_resources[MAX_PORTS];
> +
> +/*
> + * Initialize compound action types within a pre-allocated slot.
> + * Called once per slot during pool init to set up internal pointers
> + * for RSS, RAW_ENCAP, RAW_DECAP and VXLAN_ENCAP actions.
> + */
> +static void
> +init_slot_compound_actions(struct rte_flow_action *actions, uint32_t
> n_actions,
> + const size_t *action_conf_sizes)
> +{
> + uint32_t i;
> +
> + for (i = 0; i < n_actions; i++) {
> + if (action_conf_sizes[i] == 0)
> + continue;
> +
> + switch (actions[i].type) {
> + case RTE_FLOW_ACTION_TYPE_RSS: {
> + struct action_rss_data *rss =
> + (struct action_rss_data
> *)(uintptr_t)actions[i].conf;
> + rss->conf.func = RTE_ETH_HASH_FUNCTION_DEFAULT;
> + rss->conf.level = 0;
> + rss->conf.types = GET_RSS_HF();
> + rss->conf.key_len = sizeof(rss->key);
> + rss->conf.key = rss->key;
> + rss->conf.queue = rss->queue;
> + rss->key[0] = 1;
> + break;
> + }
> + case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: {
> + struct action_raw_encap_data *encap =
> + (struct action_raw_encap_data
> *)(uintptr_t)actions[i].conf;
> + encap->conf.data = encap->data;
> + break;
> + }
> + case RTE_FLOW_ACTION_TYPE_RAW_DECAP: {
> + struct action_raw_decap_data *decap =
> + (struct action_raw_decap_data
> *)(uintptr_t)actions[i].conf;
> + decap->conf.data = decap->data;
> + break;
> + }
> + case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: {
> + /*
> + * Layout within the conf area:
> + * struct rte_flow_action_vxlan_encap
> + * struct rte_flow_item[5]
> + * struct rte_flow_item_eth
> + * struct rte_flow_item_ipv4
> + * struct rte_flow_item_udp
> + * struct rte_flow_item_vxlan
> + */
> + uint8_t *base = (uint8_t
> *)(uintptr_t)actions[i].conf;
> + struct rte_flow_action_vxlan_encap *ve =
> + (struct rte_flow_action_vxlan_encap *)base;
> + struct rte_flow_item *items =
> + (struct rte_flow_item
> + *)(base + sizeof(struct
> rte_flow_action_vxlan_encap));
> + uint8_t *data = (uint8_t *)(items + 5);
> +
> + struct rte_flow_item_eth *item_eth = (struct
> rte_flow_item_eth *)data;
> + data += sizeof(struct rte_flow_item_eth);
> + struct rte_flow_item_ipv4 *item_ipv4 = (struct
> rte_flow_item_ipv4 *)data;
> + data += sizeof(struct rte_flow_item_ipv4);
> + struct rte_flow_item_udp *item_udp = (struct
> rte_flow_item_udp *)data;
> + data += sizeof(struct rte_flow_item_udp);
> + struct rte_flow_item_vxlan *item_vxlan = (struct
> rte_flow_item_vxlan *)data;
> +
> + memset(item_eth, 0, sizeof(*item_eth));
> + memset(item_ipv4, 0, sizeof(*item_ipv4));
> + memset(item_udp, 0, sizeof(*item_udp));
> + memset(item_vxlan, 0, sizeof(*item_vxlan));
> +
> + item_ipv4->hdr.src_addr = RTE_IPV4(127, 0, 0, 1);
> + item_ipv4->hdr.version_ihl = RTE_IPV4_VHL_DEF;
> + item_udp->hdr.dst_port =
> RTE_BE16(RTE_VXLAN_DEFAULT_PORT);
> + item_vxlan->hdr.vni[2] = 1;
> +
> + items[0].type = RTE_FLOW_ITEM_TYPE_ETH;
> + items[0].spec = item_eth;
> + items[0].mask = item_eth;
> + items[1].type = RTE_FLOW_ITEM_TYPE_IPV4;
> + items[1].spec = item_ipv4;
> + items[1].mask = item_ipv4;
> + items[2].type = RTE_FLOW_ITEM_TYPE_UDP;
> + items[2].spec = item_udp;
> + items[2].mask = item_udp;
> + items[3].type = RTE_FLOW_ITEM_TYPE_VXLAN;
> + items[3].spec = item_vxlan;
> + items[3].mask = item_vxlan;
> + items[4].type = RTE_FLOW_ITEM_TYPE_END;
> +
> + ve->definition = items;
> + break;
> + }
> + default:
> + break;
> + }
> + }
> +}
> +
> +/*
> + * Allocate and pre-initialize all per-slot flat buffers.
> + * Returns 0 on success.
> + */
> +static int
> +init_slot_pool(struct async_flow_resources *res, uint32_t nb_queues,
> uint32_t queue_size,
> + const struct rte_flow_item *pattern, uint32_t n_items,
> const size_t *item_spec_sizes,
> + const struct rte_flow_action *template_actions, uint32_t
> n_actions,
> + const size_t *action_conf_sizes)
> +{
> + uint32_t items_array_bytes, actions_array_bytes;
> + uint32_t spec_data_bytes, conf_data_bytes, mask_data_bytes;
> + uint32_t slot_size, num_slots;
> + uint32_t s, i;
> + uint8_t *mptr;
> +
> + /* Compute shared mask size */
> + mask_data_bytes = 0;
> + for (i = 0; i < n_items; i++)
> + mask_data_bytes += RTE_ALIGN_CEIL(item_spec_sizes[i], 8);
> +
> + /* specs and masks have the same size */
> + spec_data_bytes = mask_data_bytes;
> +
> + conf_data_bytes = 0;
> + for (i = 0; i < n_actions; i++)
> + conf_data_bytes += RTE_ALIGN_CEIL(action_conf_sizes[i], 8);
> +
> + /* Compute per-slot layout sizes (+ 1 for END sentinel) */
> + items_array_bytes = n_items * sizeof(struct rte_flow_item);
> + actions_array_bytes = n_actions * sizeof(struct rte_flow_action);
> +
> + slot_size = RTE_ALIGN_CEIL(items_array_bytes + actions_array_bytes
> + spec_data_bytes +
> + conf_data_bytes,
> + RTE_CACHE_LINE_SIZE);
> +
> + num_slots = queue_size * nb_queues;
> +
> + /* Store layout info */
> + res->slot_size = slot_size;
> + res->slots_per_queue = queue_size;
> + res->nb_queues = nb_queues;
> + res->n_items = n_items;
> + res->n_actions = n_actions;
> +
> + /* Allocate shared masks */
> + if (mask_data_bytes > 0) {
> + res->shared_masks = aligned_alloc(
> + RTE_CACHE_LINE_SIZE,
> RTE_ALIGN_CEIL(mask_data_bytes, RTE_CACHE_LINE_SIZE));
> + if (res->shared_masks == NULL) {
> + fprintf(stderr, "Failed to allocate shared masks
> (%u bytes)\n",
> + mask_data_bytes);
> + return -ENOMEM;
> + }
> + memset(res->shared_masks, 0, mask_data_bytes);
> +
> + /* Copy mask data from template pattern */
> + mptr = res->shared_masks;
> + for (i = 0; i < n_items; i++) {
> + if (item_spec_sizes[i] > 0 && pattern[i].mask !=
> NULL)
> + memcpy(mptr, pattern[i].mask,
> item_spec_sizes[i]);
> + mptr += RTE_ALIGN_CEIL(item_spec_sizes[i], 8);
> + }
> + }
> +
> + /* Allocate per-slot pool */
> +	/* slot_size is already cache-line aligned, so the total allocation
> size is a multiple of the cache line size */
> + res->slot_pool = aligned_alloc(RTE_CACHE_LINE_SIZE,
> (size_t)num_slots * slot_size);
> + if (res->slot_pool == NULL) {
> + fprintf(stderr, "Failed to allocate slot pool (%u slots *
> %u bytes)\n", num_slots,
> + slot_size);
> + free(res->shared_masks);
> + res->shared_masks = NULL;
> + return -ENOMEM;
> + }
> + memset(res->slot_pool, 0, (size_t)num_slots * slot_size);
> +
> + /* Pre-initialize every slot */
> + for (s = 0; s < num_slots; s++) {
> + uint8_t *slot = res->slot_pool + (size_t)s * slot_size;
> + struct rte_flow_item *items = (struct rte_flow_item *)slot;
> + struct rte_flow_action *actions =
> + (struct rte_flow_action *)(slot +
> items_array_bytes);
> + uint8_t *data = slot + items_array_bytes +
> actions_array_bytes;
> +
> + /* Pre-set items: spec → per-slot data, mask → shared
> masks */
> + mptr = res->shared_masks;
> + for (i = 0; i < n_items; i++) {
> + items[i].type = pattern[i].type;
> + if (item_spec_sizes[i] > 0) {
> + items[i].spec = data;
> + items[i].mask = mptr;
> + data += RTE_ALIGN_CEIL(item_spec_sizes[i],
> 8);
> + mptr += RTE_ALIGN_CEIL(item_spec_sizes[i],
> 8);
> + }
> + }
> + items[n_items].type = RTE_FLOW_ITEM_TYPE_END;
> +
> + /* Pre-set actions: conf → per-slot data */
> + for (i = 0; i < n_actions; i++) {
> + actions[i].type = template_actions[i].type;
> + if (action_conf_sizes[i] > 0) {
> + actions[i].conf = data;
> + data +=
> RTE_ALIGN_CEIL(action_conf_sizes[i], 8);
> + }
> + }
> + actions[n_actions].type = RTE_FLOW_ACTION_TYPE_END;
> +
> + /* Initialize compound action types (RSS, RAW_ENCAP, etc.)
> */
> + init_slot_compound_actions(actions, n_actions,
> action_conf_sizes);
> + }
> +
> + /* Allocate and initialize per-queue slot tracking */
> + res->queues = aligned_alloc(
> + RTE_CACHE_LINE_SIZE,
> + RTE_ALIGN_CEIL(nb_queues * sizeof(struct
> async_flow_queue), RTE_CACHE_LINE_SIZE));
> + if (res->queues == NULL) {
> + fprintf(stderr, "Failed to allocate queue structs (%u
> queues)\n", nb_queues);
> + free(res->slot_pool);
> + res->slot_pool = NULL;
> + free(res->shared_masks);
> + res->shared_masks = NULL;
> + return -ENOMEM;
> + }
> + memset(res->queues, 0, nb_queues * sizeof(struct
> async_flow_queue));
> + for (s = 0; s < nb_queues; s++) {
> + res->queues[s].slots = res->slot_pool + (size_t)s *
> queue_size * slot_size;
> + res->queues[s].head = 0;
> + }
> +
> + printf(":: Slot pool: %u slots * %u bytes = %u KB (shared masks:
> %u bytes)\n", num_slots,
> + slot_size, (num_slots * slot_size) / 1024, mask_data_bytes);
> +
> + return 0;
> +}
> +
> +/*
> + * Hot-path: update per-flow item values through pre-set pointers.
> + * Only IPV4/IPV6 src_addr varies per flow (based on counter).
> + */
> +static void
> +update_item_values(struct rte_flow_item *items, uint32_t counter)
> +{
> + uint8_t i;
> +
> + for (i = 0; items[i].type != RTE_FLOW_ITEM_TYPE_END; i++) {
> + switch (items[i].type) {
> + case RTE_FLOW_ITEM_TYPE_IPV4:
> + ((struct rte_flow_item_ipv4
> *)(uintptr_t)items[i].spec)->hdr.src_addr =
> + RTE_BE32(counter);
> + break;
> + case RTE_FLOW_ITEM_TYPE_IPV6: {
> + struct rte_flow_item_ipv6 *spec =
> + (struct rte_flow_item_ipv6
> *)(uintptr_t)items[i].spec;
> + uint8_t j;
> + for (j = 0; j < 4; j++)
> + spec->hdr.src_addr.a[15 - j] = counter >>
> (j * 8);
> + break;
> + }
> + default:
> + break;
> + }
> + }
> +}
> +
> +/*
> + * Hot-path: update per-flow action values through pre-set pointers.
> + */
> +static void
> +update_action_values(struct rte_flow_action *actions, uint32_t counter,
> uint16_t hairpinq,
> + uint64_t encap_data, uint64_t decap_data,
> __rte_unused uint8_t core_idx,
> + bool unique_data, uint8_t rx_queues_count, uint16_t
> dst_port)
> +{
> + uint8_t i;
> +
> + for (i = 0; actions[i].type != RTE_FLOW_ACTION_TYPE_END; i++) {
> + switch (actions[i].type) {
> + case RTE_FLOW_ACTION_TYPE_MARK:
> + ((struct rte_flow_action_mark
> *)(uintptr_t)actions[i].conf)->id =
> + (counter % 255) + 1;
> + break;
> + case RTE_FLOW_ACTION_TYPE_QUEUE:
> + ((struct rte_flow_action_queue
> *)(uintptr_t)actions[i].conf)->index =
> + hairpinq ? (counter % hairpinq) +
> rx_queues_count :
> + counter % rx_queues_count;
> + break;
> + case RTE_FLOW_ACTION_TYPE_METER:
> + ((struct rte_flow_action_meter
> *)(uintptr_t)actions[i].conf)->mtr_id =
> + counter;
> + break;
> + case RTE_FLOW_ACTION_TYPE_RSS: {
> + struct action_rss_data *rss =
> + (struct action_rss_data
> *)(uintptr_t)actions[i].conf;
> + uint16_t q;
> + if (hairpinq) {
> + rss->conf.queue_num = hairpinq;
> + for (q = 0; q < hairpinq; q++)
> + rss->queue[q] = q +
> rx_queues_count;
> + } else {
> + rss->conf.queue_num = rx_queues_count;
> + for (q = 0; q < rx_queues_count; q++)
> + rss->queue[q] = q;
> + }
> + break;
> + }
> + case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
> + case RTE_FLOW_ACTION_TYPE_SET_MAC_DST: {
> + struct rte_flow_action_set_mac *mac =
> + (struct rte_flow_action_set_mac
> *)(uintptr_t)actions[i].conf;
> + uint32_t val = unique_data ? counter : 1;
> + uint8_t j;
> + for (j = 0; j < RTE_ETHER_ADDR_LEN; j++) {
> + mac->mac_addr[j] = val & 0xff;
> + val >>= 8;
> + }
> + break;
> + }
> + case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
> + case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST: {
> + uint32_t ip = unique_data ? counter : 1;
> + ((struct rte_flow_action_set_ipv4
> *)(uintptr_t)actions[i].conf)->ipv4_addr =
> + RTE_BE32(ip + 1);
> + break;
> + }
> + case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
> + case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST: {
> + struct rte_flow_action_set_ipv6 *v6 =
> + (struct rte_flow_action_set_ipv6
> *)(uintptr_t)actions[i].conf;
> + uint32_t val = unique_data ? counter : 1;
> + uint8_t j;
> + for (j = 0; j < 16; j++) {
> + v6->ipv6_addr.a[j] = val & 0xff;
> + val >>= 8;
> + }
> + break;
> + }
> + case RTE_FLOW_ACTION_TYPE_SET_TP_SRC: {
> + uint32_t tp = unique_data ? counter : 100;
> + tp = tp % 0xffff;
> + ((struct rte_flow_action_set_tp
> *)(uintptr_t)actions[i].conf)->port =
> + RTE_BE16(tp & 0xffff);
> + break;
> + }
> + case RTE_FLOW_ACTION_TYPE_SET_TP_DST: {
> + uint32_t tp = unique_data ? counter : 100;
> + if (tp > 0xffff)
> + tp >>= 16;
> + ((struct rte_flow_action_set_tp
> *)(uintptr_t)actions[i].conf)->port =
> + RTE_BE16(tp & 0xffff);
> + break;
> + }
> + case RTE_FLOW_ACTION_TYPE_INC_TCP_ACK:
> + case RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK:
> + case RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ:
> + case RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ: {
> + uint32_t val = unique_data ? counter : 1;
> + *(rte_be32_t *)(uintptr_t)actions[i].conf =
> RTE_BE32(val);
> + break;
> + }
> + case RTE_FLOW_ACTION_TYPE_SET_TTL: {
> + uint32_t val = unique_data ? counter : 1;
> + ((struct rte_flow_action_set_ttl
> *)(uintptr_t)actions[i].conf)->ttl_value =
> + val % 0xff;
> + break;
> + }
> + case RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP:
> + case RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP: {
> + uint32_t val = unique_data ? counter : 1;
> + ((struct rte_flow_action_set_dscp
> *)(uintptr_t)actions[i].conf)->dscp =
> + val % 0xff;
> + break;
> + }
> + case RTE_FLOW_ACTION_TYPE_PORT_ID:
> + ((struct rte_flow_action_port_id
> *)(uintptr_t)actions[i].conf)->id =
> + dst_port;
> + break;
> + case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: {
> + struct action_raw_encap_data *encap =
> + (struct action_raw_encap_data
> *)(uintptr_t)actions[i].conf;
> + uint8_t *header = encap->data;
> + struct rte_ether_hdr eth_hdr;
> + struct rte_ipv4_hdr ipv4_hdr;
> + struct rte_udp_hdr udp_hdr;
> +
> + memset(ð_hdr, 0, sizeof(eth_hdr));
> + if (encap_data &
> FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH)) {
> + if (encap_data &
> FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_VLAN))
> + eth_hdr.ether_type =
> RTE_BE16(RTE_ETHER_TYPE_VLAN);
> + else if (encap_data &
> FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV4))
> + eth_hdr.ether_type =
> RTE_BE16(RTE_ETHER_TYPE_IPV4);
> + else if (encap_data &
> FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV6))
> + eth_hdr.ether_type =
> RTE_BE16(RTE_ETHER_TYPE_IPV6);
> + memcpy(header, ð_hdr, sizeof(eth_hdr));
> + header += sizeof(eth_hdr);
> + }
> + if (encap_data &
> FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV4)) {
> + uint32_t ip_dst = unique_data ? counter :
> 1;
> + memset(&ipv4_hdr, 0, sizeof(ipv4_hdr));
> + ipv4_hdr.src_addr = RTE_IPV4(127, 0, 0, 1);
> + ipv4_hdr.dst_addr = RTE_BE32(ip_dst);
> + ipv4_hdr.version_ihl = RTE_IPV4_VHL_DEF;
> + if (encap_data &
> FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_UDP))
> + ipv4_hdr.next_proto_id = 17; /*
> UDP */
> + if (encap_data &
> FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_GRE))
> + ipv4_hdr.next_proto_id = 47; /*
> GRE */
> + memcpy(header, &ipv4_hdr,
> sizeof(ipv4_hdr));
> + header += sizeof(ipv4_hdr);
> + }
> + if (encap_data &
> FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_UDP)) {
> + memset(&udp_hdr, 0, sizeof(udp_hdr));
> + if (encap_data &
> FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_VXLAN))
> + udp_hdr.dst_port =
> RTE_BE16(RTE_VXLAN_DEFAULT_PORT);
> + memcpy(header, &udp_hdr, sizeof(udp_hdr));
> + header += sizeof(udp_hdr);
> + }
> + encap->conf.size = header - encap->data;
> + break;
> + }
> + case RTE_FLOW_ACTION_TYPE_RAW_DECAP: {
> + struct action_raw_decap_data *decap_d =
> + (struct action_raw_decap_data
> *)(uintptr_t)actions[i].conf;
> + uint8_t *header = decap_d->data;
> + struct rte_ether_hdr eth_hdr;
> +
> + memset(ð_hdr, 0, sizeof(eth_hdr));
> + if (decap_data &
> FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH)) {
> + if (decap_data &
> FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV4))
> + eth_hdr.ether_type =
> RTE_BE16(RTE_ETHER_TYPE_IPV4);
> + else if (decap_data &
> FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV6))
> + eth_hdr.ether_type =
> RTE_BE16(RTE_ETHER_TYPE_IPV6);
> + memcpy(header, ð_hdr, sizeof(eth_hdr));
> + header += sizeof(eth_hdr);
> + }
> + decap_d->conf.size = header - decap_d->data;
> + break;
> + }
> + case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: {
> + uint8_t *base = (uint8_t
> *)(uintptr_t)actions[i].conf;
> + struct rte_flow_item *vitems =
> + (struct rte_flow_item
> + *)(base + sizeof(struct
> rte_flow_action_vxlan_encap));
> + uint32_t ip_dst = unique_data ? counter : 1;
> + /* vitems[1] is IPV4 */
> + ((struct rte_flow_item_ipv4
> *)(uintptr_t)vitems[1].spec)->hdr.dst_addr =
> + RTE_BE32(ip_dst);
> + break;
> + }
> + default:
> + break;
> + }
> + }
> +}
> +
> +int
> +async_flow_init_port(uint16_t port_id, uint32_t nb_queues, uint32_t
> queue_size,
> + uint64_t *flow_items, uint64_t *flow_actions,
> uint64_t *flow_attrs,
> + uint8_t flow_group, uint32_t rules_count)
> +{
> + struct rte_flow_port_info port_info = {0};
> + struct rte_flow_queue_info queue_info = {0};
> + struct rte_flow_error error = {0};
> + struct rte_flow_port_attr port_attr = {0};
> + struct rte_flow_queue_attr queue_attr;
> + const struct rte_flow_queue_attr **queue_attr_list;
> + struct rte_flow_pattern_template_attr pt_attr = {0};
> + struct rte_flow_actions_template_attr at_attr = {0};
> + struct rte_flow_template_table_attr table_attr = {0};
> + struct rte_flow_item pattern[MAX_ITEMS_NUM];
> + struct rte_flow_action actions[MAX_ACTIONS_NUM];
> + struct rte_flow_action action_masks[MAX_ACTIONS_NUM];
> + size_t item_spec_sizes[MAX_ITEMS_NUM];
> + size_t action_conf_sizes[MAX_ACTIONS_NUM];
> + uint32_t n_items, n_actions;
> + struct async_flow_resources *res;
> + bool need_wire_orig_table = false;
> + uint32_t i;
> + int ret;
> +
> + if (port_id >= MAX_PORTS)
> + return -1;
> +
> + res = &port_resources[port_id];
> + memset(res, 0, sizeof(*res));
> +
> + /* Query port flow info */
> + ret = rte_flow_info_get(port_id, &port_info, &queue_info, &error);
> + if (ret != 0) {
> + fprintf(stderr, "Port %u: rte_flow_info_get failed: %s\n",
> port_id,
> + error.message ? error.message : "(no message)");
> + return ret;
> + }
> +
> + if (port_info.max_nb_queues == 0 || queue_info.max_size == 0) {
> + fprintf(stderr, "Port %u: rte_flow_info_get reports that
> no queues are supported\n",
> + port_id);
> + return -1;
> + }
> +
> + /* Limit to device capabilities if reported */
> + if (port_info.max_nb_queues != 0 && port_info.max_nb_queues !=
> UINT32_MAX &&
> + nb_queues > port_info.max_nb_queues)
> + nb_queues = port_info.max_nb_queues;
> + if (queue_info.max_size != 0 && queue_info.max_size != UINT32_MAX
> &&
> + queue_size > queue_info.max_size)
> + queue_size = queue_info.max_size;
> +
> + /* Slot ring uses bitmask wrapping, so queue_size must be power of
> 2 */
> + queue_size = rte_align32prevpow2(queue_size);
> + if (queue_size == 0) {
> + fprintf(stderr, "Port %u: queue_size is 0 after
> rounding\n", port_id);
> + return -EINVAL;
> + }
> +
> + for (i = 0; i < MAX_ATTRS_NUM; i++) {
> + if (flow_attrs[i] == 0)
> + break;
> + if (flow_attrs[i] & INGRESS)
> + pt_attr.ingress = 1;
> + else if (flow_attrs[i] & EGRESS)
> + pt_attr.egress = 1;
> + else if (flow_attrs[i] & TRANSFER)
> + pt_attr.transfer = 1;
> + }
> + /* Enable relaxed matching for better performance */
> + pt_attr.relaxed_matching = 1;
> +
> + memset(pattern, 0, sizeof(pattern));
> + memset(actions, 0, sizeof(actions));
> + memset(action_masks, 0, sizeof(action_masks));
> +
> + /* Fill templates and gather per-item/action sizes */
> + fill_items_template(pattern, flow_items, 0, 0, item_spec_sizes,
> &n_items);
> +
> + at_attr.ingress = pt_attr.ingress;
> + at_attr.egress = pt_attr.egress;
> + at_attr.transfer = pt_attr.transfer;
> +
> + fill_actions_template(actions, action_masks, flow_actions,
> &port_attr,
> + &need_wire_orig_table, action_conf_sizes,
> &n_actions);
> +
> +	/* fill_actions_template counts the number of actions that require
> each kind of object,
> +	 * so we multiply by the number of rules to get the correct totals */
> + port_attr.nb_counters *= rules_count;
> + port_attr.nb_aging_objects *= rules_count;
> + port_attr.nb_meters *= rules_count;
> + port_attr.nb_conn_tracks *= rules_count;
> + port_attr.nb_quotas *= rules_count;
> +
> + table_attr.flow_attr.group = flow_group;
> + table_attr.flow_attr.priority = 0;
> + table_attr.flow_attr.ingress = pt_attr.ingress;
> + table_attr.flow_attr.egress = pt_attr.egress;
> + table_attr.flow_attr.transfer = pt_attr.transfer;
> + table_attr.nb_flows = rules_count;
> +
> + if (pt_attr.transfer && need_wire_orig_table)
> + table_attr.specialize =
> RTE_FLOW_TABLE_SPECIALIZE_TRANSFER_WIRE_ORIG;
> +
> + queue_attr_list = malloc(sizeof(*queue_attr_list) * nb_queues);
> + if (queue_attr_list == NULL) {
> + fprintf(stderr, "Port %u: failed to allocate
> queue_attr_list\n", port_id);
> + return -ENOMEM;
> + }
> +
> + queue_attr.size = queue_size;
> + for (i = 0; i < nb_queues; i++)
> + queue_attr_list[i] = &queue_attr;
> +
> + ret = rte_flow_configure(port_id, &port_attr, nb_queues,
> queue_attr_list, &error);
> +
> + free(queue_attr_list);
> +
> + if (ret != 0) {
> + fprintf(stderr, "Port %u: rte_flow_configure failed
> (ret=%d, type=%d): %s\n",
> + port_id, ret, error.type, error.message ?
> error.message : "(no message)");
> + return ret;
> + }
> +
> + /* Create pattern template */
> + res->pattern_template =
> + rte_flow_pattern_template_create(port_id, &pt_attr,
> pattern, &error);
> + if (res->pattern_template == NULL) {
> + fprintf(stderr, "Port %u: pattern template create failed:
> %s\n", port_id,
> + error.message ? error.message : "(no message)");
> + return -1;
> + }
> +
> + /* Create actions template */
> + res->actions_template =
> + rte_flow_actions_template_create(port_id, &at_attr,
> actions, action_masks, &error);
> + if (res->actions_template == NULL) {
> + fprintf(stderr, "Port %u: actions template create failed:
> %s\n", port_id,
> + error.message ? error.message : "(no message)");
> + rte_flow_pattern_template_destroy(port_id,
> res->pattern_template, &error);
> + res->pattern_template = NULL;
> + return -1;
> + }
> +
> + /* Create template table */
> + res->table = rte_flow_template_table_create(port_id, &table_attr,
> &res->pattern_template, 1,
> +
> &res->actions_template, 1, &error);
> + if (res->table == NULL) {
> + fprintf(stderr, "Port %u: template table create failed:
> %s\n", port_id,
> + error.message ? error.message : "(no message)");
> + rte_flow_actions_template_destroy(port_id,
> res->actions_template, &error);
> + rte_flow_pattern_template_destroy(port_id,
> res->pattern_template, &error);
> + res->pattern_template = NULL;
> + res->actions_template = NULL;
> + return -1;
> + }
> +
> + /* Allocate and pre-initialize per-slot flat buffers */
> + ret = init_slot_pool(res, nb_queues, queue_size, pattern, n_items,
> item_spec_sizes, actions,
> + n_actions, action_conf_sizes);
> + if (ret != 0) {
> + fprintf(stderr, "Port %u: slot pool init failed\n",
> port_id);
> + rte_flow_template_table_destroy(port_id, res->table,
> &error);
> + rte_flow_actions_template_destroy(port_id,
> res->actions_template, &error);
> + rte_flow_pattern_template_destroy(port_id,
> res->pattern_template, &error);
> + res->table = NULL;
> + res->actions_template = NULL;
> + res->pattern_template = NULL;
> + return ret;
> + }
> +
> + res->table_capacity = rules_count;
> + res->initialized = true;
> +
> + printf(":: Port %u: Async flow engine initialized (queues=%u,
> queue_size=%u)\n", port_id,
> + nb_queues, queue_size);
> +
> + return 0;
> +}
> +
> +struct rte_flow *
> +async_generate_flow(uint16_t port_id, uint32_t queue_id, uint32_t
> counter, uint16_t hairpinq,
> + uint64_t encap_data, uint64_t decap_data, uint16_t
> dst_port, uint8_t core_idx,
> + uint8_t rx_queues_count, bool unique_data, bool
> postpone,
> + struct rte_flow_error *error)
> +{
> + struct async_flow_resources *res;
> + struct async_flow_queue *q;
> + uint8_t *slot;
> + uint32_t idx, items_array_bytes;
> + struct rte_flow_item *items;
> + struct rte_flow_action *actions;
> + struct rte_flow_op_attr op_attr = {
> + .postpone = postpone,
> + };
> +
> + if (port_id >= MAX_PORTS) {
> + rte_flow_error_set(error, EINVAL,
> RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
> + "Invalid port ID");
> + return NULL;
> + }
> +
> + res = &port_resources[port_id];
> + if (!res->initialized) {
> + rte_flow_error_set(error, EINVAL,
> RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
> + "Async flow resources not initialized");
> + return NULL;
> + }
> +
> + if (queue_id >= res->nb_queues) {
> + rte_flow_error_set(error, EINVAL,
> RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
> + "Invalid queue ID");
> + return NULL;
> + }
> +
> + /* Pick the next slot from this queue's ring */
> + q = &res->queues[queue_id];
> + idx = q->head;
> + q->head = (idx + 1) & (res->slots_per_queue - 1);
> + slot = q->slots + (size_t)idx * res->slot_size;
> + items_array_bytes = res->n_items * sizeof(struct rte_flow_item);
> + items = (struct rte_flow_item *)slot;
> + actions = (struct rte_flow_action *)(slot + items_array_bytes);
> +
> + /* Update only per-flow varying values */
> + update_item_values(items, counter);
> + update_action_values(actions, counter, hairpinq, encap_data,
> decap_data, core_idx,
> + unique_data, rx_queues_count, dst_port);
> +
> + return rte_flow_async_create(port_id, queue_id, &op_attr,
> res->table, items, 0, actions, 0,
> + NULL, error);
> +}
> +
> +void
> +async_flow_cleanup_port(uint16_t port_id)
> +{
> + struct async_flow_resources *res;
> + struct rte_flow_error error;
> + struct rte_flow_op_result results[64];
> + int ret, i;
> +
> + if (port_id >= MAX_PORTS)
> + return;
> +
> + res = &port_resources[port_id];
> + if (!res->initialized)
> + return;
> +
> + /* Drain any pending async completions from flow flush */
> + for (i = 0; i < DRAIN_MAX_ITERATIONS; i++) {
> + rte_flow_push(port_id, 0, &error);
> + ret = rte_flow_pull(port_id, 0, results, 64, &error);
> + if (ret <= 0)
> + break;
> + }
> +
> + if (res->table != NULL) {
> + rte_flow_template_table_destroy(port_id, res->table,
> &error);
> + res->table = NULL;
> + }
> +
> + if (res->actions_template != NULL) {
> + rte_flow_actions_template_destroy(port_id,
> res->actions_template, &error);
> + res->actions_template = NULL;
> + }
> +
> + if (res->pattern_template != NULL) {
> + rte_flow_pattern_template_destroy(port_id,
> res->pattern_template, &error);
> + res->pattern_template = NULL;
> + }
> +
> + free(res->queues);
> + res->queues = NULL;
> + free(res->slot_pool);
> + res->slot_pool = NULL;
> + free(res->shared_masks);
> + res->shared_masks = NULL;
> +
> + res->initialized = false;
> +}
> diff --git a/app/test-flow-perf/async_flow.h
> b/app/test-flow-perf/async_flow.h
> new file mode 100644
> index 0000000000..8c12924bc6
> --- /dev/null
> +++ b/app/test-flow-perf/async_flow.h
> @@ -0,0 +1,54 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2026 Maxime Peim <maxime.peim@gmail.com>
> + *
> + * This file contains the async flow API related definitions
> + * and function declarations.
> + */
> +
> +#ifndef FLOW_PERF_ASYNC_FLOW
> +#define FLOW_PERF_ASYNC_FLOW
> +
> +#include <rte_flow.h>
> +#include <stdbool.h>
> +#include <stdint.h>
> +
> +#include "config.h"
> +
> +/* Per-queue slot ring — tracks which slot to use next */
> +struct async_flow_queue {
> + uint8_t *slots; /* pointer to this queue's region within slot_pool
> */
> + uint32_t head; /* next slot index (wraps mod slots_per_queue) */
> +};
> +
> +/* Per-port async flow resources */
> +struct async_flow_resources {
> + struct rte_flow_pattern_template *pattern_template;
> + struct rte_flow_actions_template *actions_template;
> + struct rte_flow_template_table *table;
> + uint8_t *slot_pool; /* flat buffer pool for all slots */
> + uint8_t *shared_masks; /* shared item mask data (one copy for all
> slots) */
> + struct async_flow_queue *queues;
> + uint32_t slot_size; /* bytes per slot (cache-line aligned) */
> + uint32_t slots_per_queue; /* = queue_size */
> + uint32_t nb_queues;
> + uint32_t n_items; /* item count (excl. END) */
> + uint32_t n_actions; /* action count (excl. END) */
> + uint32_t table_capacity;
> + bool initialized;
> +};
> +
> +/* Initialize async flow engine for a port */
> +int async_flow_init_port(uint16_t port_id, uint32_t nb_queues, uint32_t
> queue_size,
> + uint64_t *flow_items, uint64_t *flow_actions,
> uint64_t *flow_attrs,
> + uint8_t flow_group, uint32_t rules_count);
> +
> +/* Create a flow rule asynchronously using pre-allocated slot */
> +struct rte_flow *async_generate_flow(uint16_t port_id, uint32_t queue_id,
> uint32_t counter,
> + uint16_t hairpinq, uint64_t
> encap_data, uint64_t decap_data,
> + uint16_t dst_port, uint8_t core_idx,
> uint8_t rx_queues_count,
> + bool unique_data, bool postpone,
> struct rte_flow_error *error);
> +
> +/* Cleanup async flow resources for a port */
> +void async_flow_cleanup_port(uint16_t port_id);
> +
> +#endif /* FLOW_PERF_ASYNC_FLOW */
> diff --git a/app/test-flow-perf/items_gen.c
> b/app/test-flow-perf/items_gen.c
> index c740e1838f..58f1c16cf8 100644
> --- a/app/test-flow-perf/items_gen.c
> +++ b/app/test-flow-perf/items_gen.c
> @@ -389,3 +389,61 @@ fill_items(struct rte_flow_item *items,
>
> items[items_counter].type = RTE_FLOW_ITEM_TYPE_END;
> }
> +
> +static size_t
> +item_spec_size(enum rte_flow_item_type type)
> +{
> + switch (type) {
> + case RTE_FLOW_ITEM_TYPE_ETH:
> + return sizeof(struct rte_flow_item_eth);
> + case RTE_FLOW_ITEM_TYPE_VLAN:
> + return sizeof(struct rte_flow_item_vlan);
> + case RTE_FLOW_ITEM_TYPE_IPV4:
> + return sizeof(struct rte_flow_item_ipv4);
> + case RTE_FLOW_ITEM_TYPE_IPV6:
> + return sizeof(struct rte_flow_item_ipv6);
> + case RTE_FLOW_ITEM_TYPE_TCP:
> + return sizeof(struct rte_flow_item_tcp);
> + case RTE_FLOW_ITEM_TYPE_UDP:
> + return sizeof(struct rte_flow_item_udp);
> + case RTE_FLOW_ITEM_TYPE_VXLAN:
> + return sizeof(struct rte_flow_item_vxlan);
> + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE:
> + return sizeof(struct rte_flow_item_vxlan_gpe);
> + case RTE_FLOW_ITEM_TYPE_GRE:
> + return sizeof(struct rte_flow_item_gre);
> + case RTE_FLOW_ITEM_TYPE_GENEVE:
> + return sizeof(struct rte_flow_item_geneve);
> + case RTE_FLOW_ITEM_TYPE_GTP:
> + return sizeof(struct rte_flow_item_gtp);
> + case RTE_FLOW_ITEM_TYPE_META:
> + return sizeof(struct rte_flow_item_meta);
> + case RTE_FLOW_ITEM_TYPE_TAG:
> + return sizeof(struct rte_flow_item_tag);
> + case RTE_FLOW_ITEM_TYPE_ICMP:
> + return sizeof(struct rte_flow_item_icmp);
> + case RTE_FLOW_ITEM_TYPE_ICMP6:
> + return sizeof(struct rte_flow_item_icmp6);
> + default:
> + return 0;
> + }
> +}
> +
> +void
> +fill_items_template(struct rte_flow_item *items, uint64_t *flow_items,
> uint32_t outer_ip_src,
> + uint8_t core_idx, size_t *spec_sizes, uint32_t
> *n_items_out)
> +{
> + uint32_t count;
> +
> + fill_items(items, flow_items, outer_ip_src, core_idx);
> +
> + /* Count items before END */
> + for (count = 0; items[count].type != RTE_FLOW_ITEM_TYPE_END;
> count++) {
> + spec_sizes[count] = item_spec_size(items[count].type);
> + /* For templates, set spec to NULL - only mask matters for
> template matching */
> + items[count].spec = NULL;
> + }
> +
> + /* take END into account */
> + *n_items_out = count + 1;
> +}
> diff --git a/app/test-flow-perf/items_gen.h
> b/app/test-flow-perf/items_gen.h
> index f4b0e9a981..0987f7be3c 100644
> --- a/app/test-flow-perf/items_gen.h
> +++ b/app/test-flow-perf/items_gen.h
> @@ -15,4 +15,10 @@
> void fill_items(struct rte_flow_item *items, uint64_t *flow_items,
> uint32_t outer_ip_src, uint8_t core_idx);
>
> +/* Fill items template for async flow API (masks only, no spec values).
> + * Populates per-item spec sizes in spec_sizes and the item count
> (including the END item) in n_items_out.
> + */
> +void fill_items_template(struct rte_flow_item *items, uint64_t
> *flow_items, uint32_t outer_ip_src,
> + uint8_t core_idx, size_t *spec_sizes, uint32_t
> *n_items_out);
> +
> #endif /* FLOW_PERF_ITEMS_GEN */
> diff --git a/app/test-flow-perf/main.c b/app/test-flow-perf/main.c
> index 6636d1517f..2c6def95c2 100644
> --- a/app/test-flow-perf/main.c
> +++ b/app/test-flow-perf/main.c
> @@ -37,11 +37,15 @@
> #include <rte_mtr.h>
> #include <rte_os_shim.h>
>
> -#include "config.h"
> #include "actions_gen.h"
> +#include "async_flow.h"
> +#include "config.h"
> #include "flow_gen.h"
> +#include "rte_build_config.h"
>
> #define MAX_BATCHES_COUNT 100
> +#define MAX_ASYNC_QUEUE_SIZE (1 << 14)
> +#define MAX_PULL_RETRIES (1 << 20)
> #define DEFAULT_RULES_COUNT 4000000
> #define DEFAULT_RULES_BATCH 100000
> #define DEFAULT_GROUP 0
> @@ -55,7 +59,6 @@
> #define HAIRPIN_TX_CONF_LOCKED_MEMORY (0x0100)
> #define HAIRPIN_TX_CONF_RTE_MEMORY (0x0200)
>
> -struct rte_flow *flow;
> static uint8_t flow_group;
>
> static uint64_t encap_data;
> @@ -81,6 +84,9 @@ static bool enable_fwd;
> static bool unique_data;
> static bool policy_mtr;
> static bool packet_mode;
> +static bool async_mode;
> +static uint32_t async_queue_size = 1024;
> +static uint32_t async_push_batch = 256;
>
> static uint8_t rx_queues_count;
> static uint8_t tx_queues_count;
> @@ -598,6 +604,29 @@ usage(char *progname)
> "Encapped data is fixed with pattern:
> ether,ipv4,udp,vxlan\n"
> "With fixed values\n");
> printf(" --vxlan-decap: add vxlan_decap action to flow
> actions\n");
> +
> + printf("\nAsync flow API options:\n");
> + printf(" --async: enable async flow API mode\n");
> + printf(" --async-queue-size=N: size of each async queue,"
> + " default is 1024\n");
> + printf(" --async-push-batch=N: flows to batch before push,"
> + " default is 256\n");
> +}
> +
> +static inline uint32_t
> +prev_power_of_two(uint32_t x)
> +{
> + uint32_t saved = x;
> + x--;
> + x |= x >> 1;
> + x |= x >> 2;
> + x |= x >> 4;
> + x |= x >> 8;
> + x |= x >> 16;
> + x++;
> + if (x == saved)
> + return x;
> + return x >> 1;
> }
>
> static void
> @@ -734,6 +763,9 @@ args_parse(int argc, char **argv)
> { "policy-mtr", 1, 0, 0 },
> { "meter-profile", 1, 0, 0 },
> { "packet-mode", 0, 0, 0 },
> + { "async", 0, 0, 0 },
> + { "async-queue-size", 1, 0, 0 },
> + { "async-push-batch", 1, 0, 0 },
> { 0, 0, 0, 0 },
> };
>
> @@ -913,8 +945,7 @@ args_parse(int argc, char **argv)
> rte_exit(EXIT_FAILURE, "Invalid
> hairpin config mask\n");
> hairpin_conf_mask = hp_conf;
> }
> - if (strcmp(lgopts[opt_idx].name,
> - "port-id") == 0) {
> + if (strcmp(lgopts[opt_idx].name, "port-id") == 0) {
> uint16_t port_idx = 0;
>
> token = strtok(optarg, ",");
> @@ -981,6 +1012,26 @@ args_parse(int argc, char **argv)
> }
> if (strcmp(lgopts[opt_idx].name, "packet-mode") ==
> 0)
> packet_mode = true;
> + if (strcmp(lgopts[opt_idx].name, "async") == 0)
> + async_mode = true;
> + if (strcmp(lgopts[opt_idx].name,
> "async-queue-size") == 0) {
> + n = atoi(optarg);
> + if (n >= MAX_ASYNC_QUEUE_SIZE)
> + async_queue_size =
> MAX_ASYNC_QUEUE_SIZE;
> + else if (n > 0)
> + async_queue_size =
> prev_power_of_two(n);
> + else
> + rte_exit(EXIT_FAILURE,
> "async-queue-size should be > 0\n");
> + }
> + if (strcmp(lgopts[opt_idx].name,
> "async-push-batch") == 0) {
> + n = atoi(optarg);
> + if (n >= MAX_ASYNC_QUEUE_SIZE >> 1)
> + async_push_batch =
> MAX_ASYNC_QUEUE_SIZE >> 1;
> + else if (n > 0)
> + async_push_batch =
> prev_power_of_two(n);
> + else
> + rte_exit(EXIT_FAILURE,
> "async-push-batch should be > 0\n");
> + }
> break;
> default:
> usage(argv[0]);
> @@ -1457,10 +1508,10 @@ query_flows(int port_id, uint8_t core_id, struct
> rte_flow **flows_list)
> mc_pool.flows_record.query[port_id][core_id] = cpu_time_used;
> }
>
> -static struct rte_flow **
> -insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id)
> +static void
> +insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id, struct
> rte_flow **flows_list)
> {
> - struct rte_flow **flows_list;
> + struct rte_flow *flow;
> struct rte_flow_error error;
> clock_t start_batch, end_batch;
> double first_flow_latency;
> @@ -1485,8 +1536,7 @@ insert_flows(int port_id, uint8_t core_id, uint16_t
> dst_port_id)
> global_items[0] = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH);
> global_actions[0] = FLOW_ITEM_MASK(RTE_FLOW_ACTION_TYPE_JUMP);
>
> - flows_list = rte_zmalloc("flows_list",
> - (sizeof(struct rte_flow *) * (rules_count_per_core + 1)),
> 0);
> + flows_list = malloc(sizeof(struct rte_flow *) *
> (rules_count_per_core + 1));
> if (flows_list == NULL)
> rte_exit(EXIT_FAILURE, "No Memory available!\n");
>
> @@ -1524,6 +1574,11 @@ insert_flows(int port_id, uint8_t core_id, uint16_t
> dst_port_id)
> core_id, rx_queues_count,
> unique_data, max_priority, &error);
>
> + if (!flow) {
> + print_flow_error(error);
> + rte_exit(EXIT_FAILURE, "Error in creating flow\n");
> + }
> +
> if (!counter) {
> first_flow_latency = (double)
> (rte_get_timer_cycles() - start_batch);
> first_flow_latency /= rte_get_timer_hz();
> @@ -1537,11 +1592,6 @@ insert_flows(int port_id, uint8_t core_id, uint16_t
> dst_port_id)
> if (force_quit)
> counter = end_counter;
>
> - if (!flow) {
> - print_flow_error(error);
> - rte_exit(EXIT_FAILURE, "Error in creating flow\n");
> - }
> -
> flows_list[flow_index++] = flow;
>
> /*
> @@ -1575,7 +1625,203 @@ insert_flows(int port_id, uint8_t core_id,
> uint16_t dst_port_id)
> port_id, core_id, rules_count_per_core, cpu_time_used);
>
> mc_pool.flows_record.insertion[port_id][core_id] = cpu_time_used;
> - return flows_list;
> +}
> +
> +static uint32_t push_counter[RTE_MAX_LCORE];
> +
> +static inline int
> +push_pull_flows_async(int port_id, int queue_id, int core_id, uint32_t
> enqueued, bool empty,
> + bool check_op_status, struct rte_flow_error *error)
> +{
> + static struct rte_flow_op_result
> results[RTE_MAX_LCORE][MAX_ASYNC_QUEUE_SIZE];
> + uint32_t to_pull = (empty || async_push_batch > enqueued) ?
> enqueued : async_push_batch;
> + uint32_t pulled_complete = 0;
> + uint32_t retries = 0;
> + int pulled, i;
> + int ret = 0;
> +
> + /* Push periodically to give HW work to do */
> + ret = rte_flow_push(port_id, queue_id, error);
> + if (ret)
> + return ret;
> + push_counter[core_id]++;
> +
> + /* Check if queue is getting full, if so push and drain
> completions */
> + if (!empty && push_counter[core_id] == 1)
> + return 0;
> +
> + while (to_pull > 0) {
> + pulled = rte_flow_pull(port_id, queue_id,
> results[core_id], to_pull, error);
> + if (pulled < 0) {
> + return -1;
> + } else if (pulled == 0) {
> + if (++retries > MAX_PULL_RETRIES) {
> + rte_flow_error_set(error, ETIMEDOUT,
> +
> RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
> + "Timeout waiting for
> async completions");
> + return -1;
> + }
> + rte_pause();
> + continue;
> + }
> + retries = 0;
> +
> + to_pull -= pulled;
> + pulled_complete += pulled;
> + if (!check_op_status)
> + continue;
> +
> + for (i = 0; i < pulled; i++) {
> + if (results[core_id][i].status !=
> RTE_FLOW_OP_SUCCESS) {
> + rte_flow_error_set(error, EINVAL,
> RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> + NULL, "Some flow rule
> insertion failed");
> + return -1;
> + }
> + }
> + }
> +
> + return pulled_complete;
> +}
> +
> +static void
> +insert_flows_async(int port_id, uint8_t core_id, uint16_t dst_port_id,
> struct rte_flow **flows_list)
> +{
> + struct rte_flow *flow;
> + struct rte_flow_error error;
> + clock_t start_batch, end_batch;
> + double first_flow_latency;
> + double cpu_time_used;
> + double insertion_rate;
> + double cpu_time_per_batch[MAX_BATCHES_COUNT] = {0};
> + double delta;
> + uint32_t flow_index;
> + uint32_t counter, batch_counter, start_counter = 0, end_counter;
> + int rules_batch_idx;
> + int rules_count_per_core;
> + uint32_t enqueued = 0;
> + uint32_t queue_id = core_id;
> + bool first_batch = true;
> + int pulled;
> +
> + rules_count_per_core = rules_count / mc_pool.cores_count;
> +
> + if (async_push_batch > async_queue_size >> 1)
> + async_push_batch = async_queue_size >> 1;
> +
> + /* Set boundaries of rules for each core. */
> + if (core_id)
> + start_counter = core_id * rules_count_per_core;
> + end_counter = (core_id + 1) * rules_count_per_core;
> +
> + cpu_time_used = 0;
> + flow_index = 0;
> + push_counter[core_id] = 0;
> +
> + if (flow_group > 0 && core_id == 0) {
> + /*
> + * Create global rule to jump into flow_group,
> + * this way the app will avoid the default rules.
> + *
> + * This rule will be created only once.
> + *
> + * Global rule:
> + * group 0 eth / end actions jump group <flow_group>
> + */
> +
> + uint64_t global_items[MAX_ITEMS_NUM] = {0};
> + uint64_t global_actions[MAX_ACTIONS_NUM] = {0};
> + global_items[0] = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH);
> + global_actions[0] =
> FLOW_ITEM_MASK(RTE_FLOW_ACTION_TYPE_JUMP);
> + flow = generate_flow(port_id, 0, flow_attrs, global_items,
> global_actions,
> + flow_group, 0, 0, 0, 0, dst_port_id,
> core_id, rx_queues_count,
> + unique_data, max_priority, &error);
> +
> + if (flow == NULL) {
> + print_flow_error(error);
> + rte_exit(EXIT_FAILURE, "Error in creating flow\n");
> + }
> + flows_list[flow_index++] = flow;
> + }
> +
> + start_batch = rte_get_timer_cycles();
> + for (counter = start_counter; counter < end_counter;) {
> + /* batch adding flow rules, this avoids unnecessary checks
> for push/pull */
> + for (batch_counter = 0; batch_counter < async_push_batch
> && counter < end_counter;
> + batch_counter++, counter++) {
> + /* Create flow with postpone=true to batch
> operations */
> + flow = async_generate_flow(port_id, queue_id,
> counter, hairpin_queues_num,
> + encap_data, decap_data,
> dst_port_id, core_id,
> + rx_queues_count,
> unique_data, true, &error);
> +
> + if (!flow) {
> + print_flow_error(error);
> + rte_exit(EXIT_FAILURE, "Error in creating
> async flow\n");
> + }
> +
> + if (force_quit)
> + break;
> +
> + flows_list[flow_index++] = flow;
> + enqueued++;
> +
> + /*
> + * Save the insertion rate for rules batch.
> + * Check if the insertion reached the rules
> + * patch counter, then save the insertion rate
> + * for this batch.
> + */
> + if (!((counter + 1) % rules_batch)) {
> + end_batch = rte_get_timer_cycles();
> + delta = (double)(end_batch - start_batch);
> + rules_batch_idx = ((counter + 1) /
> rules_batch) - 1;
> + cpu_time_per_batch[rules_batch_idx] =
> delta / rte_get_timer_hz();
> + cpu_time_used +=
> cpu_time_per_batch[rules_batch_idx];
> + start_batch = rte_get_timer_cycles();
> + }
> + }
> +
> + if ((pulled = push_pull_flows_async(port_id, queue_id,
> core_id, enqueued, false,
> + true, &error)) < 0) {
> + print_flow_error(error);
> + rte_exit(EXIT_FAILURE, "Error push/pull async
> operations\n");
> + }
> +
> + enqueued -= pulled;
> +
> + if (first_batch) {
> + first_flow_latency =
> (double)(rte_get_timer_cycles() - start_batch);
> + first_flow_latency /= rte_get_timer_hz();
> + /* In millisecond */
> + first_flow_latency *= 1000;
> + printf(":: First Flow Batch Latency (Async) ::
> Port %d :: First batch (%u) "
> + "installed in %f milliseconds\n",
> + port_id, async_push_batch,
> first_flow_latency);
> + first_batch = false;
> + }
> + }
> +
> + if (push_pull_flows_async(port_id, queue_id, core_id, enqueued,
> true, true, &error) < 0) {
> + print_flow_error(error);
> + rte_exit(EXIT_FAILURE, "Error final push/pull async
> operations\n");
> + }
> +
> + /* Print insertion rates for all batches */
> + if (dump_iterations)
> + print_rules_batches(cpu_time_per_batch);
> +
> + printf(":: Port %d :: Core %d boundaries (Async) :: start @[%d] -
> end @[%d]\n", port_id,
> + core_id, start_counter, end_counter - 1);
> +
> + /* Insertion rate for all rules in one core */
> + if (cpu_time_used > 0) {
> + insertion_rate = ((double)rules_count_per_core /
> cpu_time_used) / 1000;
> + printf(":: Port %d :: Core %d :: Async rules insertion
> rate -> %f K Rule/Sec\n",
> + port_id, core_id, insertion_rate);
> + }
> + printf(":: Port %d :: Core %d :: The time for creating %d async
> rules is %f seconds\n",
> + port_id, core_id, rules_count_per_core, cpu_time_used);
> +
> + mc_pool.flows_record.insertion[port_id][core_id] = cpu_time_used;
> }
>
> static void
> @@ -1585,12 +1831,18 @@ flows_handler(uint8_t core_id)
> uint16_t port_idx = 0;
> uint16_t nr_ports;
> int port_id;
> + int rules_count_per_core;
>
> nr_ports = rte_eth_dev_count_avail();
>
> if (rules_batch > rules_count)
> rules_batch = rules_count;
>
> + rules_count_per_core = rules_count / mc_pool.cores_count;
> + flows_list = malloc(sizeof(struct rte_flow *) *
> (rules_count_per_core + 1));
> + if (flows_list == NULL)
> + rte_exit(EXIT_FAILURE, "No Memory available!\n");
> +
> printf(":: Rules Count per port: %d\n\n", rules_count);
>
> for (port_id = 0; port_id < nr_ports; port_id++) {
> @@ -1602,10 +1854,10 @@ flows_handler(uint8_t core_id)
> mc_pool.last_alloc[core_id] =
> (int64_t)dump_socket_mem(stdout);
> if (has_meter())
> meters_handler(port_id, core_id, METER_CREATE);
> - flows_list = insert_flows(port_id, core_id,
> - dst_ports[port_idx++]);
> - if (flows_list == NULL)
> - rte_exit(EXIT_FAILURE, "Error: Insertion
> Failed!\n");
> + if (async_mode)
> + insert_flows_async(port_id, core_id,
> dst_ports[port_idx++], flows_list);
> + else
> + insert_flows(port_id, core_id,
> dst_ports[port_idx++], flows_list);
> mc_pool.current_alloc[core_id] =
> (int64_t)dump_socket_mem(stdout);
>
> if (query_flag)
> @@ -2212,6 +2464,16 @@ init_port(void)
> }
> }
>
> + /* Configure async flow engine before device start */
> + if (async_mode) {
> + ret = async_flow_init_port(port_id,
> mc_pool.cores_count, async_queue_size,
> + flow_items,
> flow_actions, flow_attrs, flow_group,
> + rules_count);
> + if (ret != 0)
> + rte_exit(EXIT_FAILURE, "Failed to init
> async flow on port %d\n",
> + port_id);
> + }
> +
> ret = rte_eth_dev_start(port_id);
> if (ret < 0)
> rte_exit(EXIT_FAILURE,
> @@ -2291,6 +2553,8 @@ main(int argc, char **argv)
>
> RTE_ETH_FOREACH_DEV(port) {
> rte_flow_flush(port, &error);
> + if (async_mode)
> + async_flow_cleanup_port(port);
> if (rte_eth_dev_stop(port) != 0)
> printf("Failed to stop device on port %u\n", port);
> rte_eth_dev_close(port);
> diff --git a/app/test-flow-perf/meson.build
> b/app/test-flow-perf/meson.build
> index e101449e32..2f820a7597 100644
> --- a/app/test-flow-perf/meson.build
> +++ b/app/test-flow-perf/meson.build
> @@ -3,6 +3,7 @@
>
> sources = files(
> 'actions_gen.c',
> + 'async_flow.c',
> 'flow_gen.c',
> 'items_gen.c',
> 'main.c',
> --
> 2.43.0
>
>
[-- Attachment #2: Type: text/html, Size: 99590 bytes --]
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH v2] test/flow: add support for async API
2026-03-01 23:29 ` [PATCH v2] " Maxime Peim
2026-03-01 23:36 ` Maxime Peim
@ 2026-03-02 0:52 ` Stephen Hemminger
1 sibling, 0 replies; 8+ messages in thread
From: Stephen Hemminger @ 2026-03-02 0:52 UTC (permalink / raw)
To: Maxime Peim; +Cc: dev
On Mon, 2 Mar 2026 00:29:30 +0100
Maxime Peim <maxime.peim@gmail.com> wrote:
> + switch (items[i].type) {
> + case RTE_FLOW_ITEM_TYPE_IPV4:
> + ((struct rte_flow_item_ipv4 *)(uintptr_t)items[i].spec)->hdr.src_addr =
> + RTE_BE32(counter);
Since spec is a void * you don't have to make it so opaque via uintptr_t.
> + break;
> + case RTE_FLOW_ITEM_TYPE_IPV6: {
> + struct rte_flow_item_ipv6 *spec =
> + (struct rte_flow_item_ipv6 *)(uintptr_t)items[i].spec;
ditto
> + uint8_t j;
> + for (j = 0; j < 4; j++)
> + spec->hdr.src_addr.a[15 - j] = counter >> (j * 8);
> + break;
> + }
> + default:
> + break;
> + }
Every time I see a cast, my "spidey sense" tingles and I look for code issues.
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH v3] test/flow: add support for async API
2026-02-25 22:23 ` Stephen Hemminger
2026-03-01 23:29 ` [PATCH v2] " Maxime Peim
@ 2026-03-02 10:57 ` Maxime Peim
2026-03-02 14:35 ` [PATCH v4] " Maxime Peim
2026-03-09 12:52 ` [PATCH v5] " Maxime Peim
3 siblings, 0 replies; 8+ messages in thread
From: Maxime Peim @ 2026-03-02 10:57 UTC (permalink / raw)
To: stephen; +Cc: dev
Add async flow API mode to test-flow-perf application for improved
flow rule insertion performance. The async API allows batching flow
rule creation operations and processing completions in bulk, reducing
per-rule overhead.
New command line options:
--async: enable async flow API mode
--async-queue-size=N: size of async queues (default: 1024)
--async-push-batch=N: flows to batch before push (default: 256)
Signed-off-by: Maxime Peim <maxime.peim@gmail.com>
---
v2:
- Replace per-flow stack allocation with pre-allocated slot pool;
flat buffers are initialized once at init time and the hot path
only patches per-flow item/action values into a pre-set slot
- Fix alloca misuse: use heap allocation for queue_attr_list, round
queue_size to power of 2 for bitmask wrapping, add bounds checks
- Fix race on file-scope flow variable, premature latency
measurement, and integer division in rate calculation
- Drop unrelated lgopts reformatting
- Use malloc instead of rte_zmalloc for non-dataplane allocations
- Various robustness and style fixes
v3:
- Update meson.build to exclude Windows build for flow perf test
- Fix checkstyle
- Remove cast from void* to uintptr_t
- Add name to mailmap and maintainers
.mailmap | 1 +
MAINTAINERS | 1 +
app/test-flow-perf/actions_gen.c | 281 +++++++++++-
app/test-flow-perf/actions_gen.h | 31 ++
app/test-flow-perf/async_flow.c | 758 +++++++++++++++++++++++++++++++
app/test-flow-perf/async_flow.h | 54 +++
app/test-flow-perf/items_gen.c | 58 +++
app/test-flow-perf/items_gen.h | 6 +
app/test-flow-perf/main.c | 303 +++++++++++-
app/test-flow-perf/meson.build | 7 +
10 files changed, 1460 insertions(+), 40 deletions(-)
create mode 100644 app/test-flow-perf/async_flow.c
create mode 100644 app/test-flow-perf/async_flow.h
diff --git a/.mailmap b/.mailmap
index 6c4c977dde..a0141402c3 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1044,6 +1044,7 @@ Mauro Annarumma <mauroannarumma@hotmail.it>
Maxime Coquelin <maxime.coquelin@redhat.com>
Maxime Gouin <maxime.gouin@6wind.com>
Maxime Leroy <maxime@leroys.fr> <maxime.leroy@6wind.com>
+Maxime Peim <maxime.peim@gmail.com>
Md Fahad Iqbal Polash <md.fahad.iqbal.polash@intel.com>
Megha Ajmera <megha.ajmera@intel.com>
Meijuan Zhao <meijuanx.zhao@intel.com>
diff --git a/MAINTAINERS b/MAINTAINERS
index 1b2f1ed2ba..d4c01037c8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1964,6 +1964,7 @@ F: doc/guides/tools/dmaperf.rst
Flow performance tool
M: Wisam Jaddo <wisamm@nvidia.com>
+M: Maxime Peim <maxime.peim@gmail.com>
F: app/test-flow-perf/
F: doc/guides/tools/flow-perf.rst
diff --git a/app/test-flow-perf/actions_gen.c b/app/test-flow-perf/actions_gen.c
index 9d102e3af4..4a5b6fb2ff 100644
--- a/app/test-flow-perf/actions_gen.c
+++ b/app/test-flow-perf/actions_gen.c
@@ -36,27 +36,7 @@ struct additional_para {
bool unique_data;
};
-/* Storage for struct rte_flow_action_raw_encap including external data. */
-struct action_raw_encap_data {
- struct rte_flow_action_raw_encap conf;
- uint8_t data[128];
- uint8_t preserve[128];
- uint16_t idx;
-};
-
-/* Storage for struct rte_flow_action_raw_decap including external data. */
-struct action_raw_decap_data {
- struct rte_flow_action_raw_decap conf;
- uint8_t data[128];
- uint16_t idx;
-};
-
-/* Storage for struct rte_flow_action_rss including external data. */
-struct action_rss_data {
- struct rte_flow_action_rss conf;
- uint8_t key[40];
- uint16_t queue[128];
-};
+/* Compound action data structs defined in actions_gen.h */
static void
add_mark(struct rte_flow_action *actions,
@@ -1165,3 +1145,262 @@ fill_actions(struct rte_flow_action *actions, uint64_t *flow_actions,
free(queues);
free(hairpin_queues);
}
+
+static size_t
+action_conf_size(enum rte_flow_action_type type)
+{
+ switch (type) {
+ case RTE_FLOW_ACTION_TYPE_MARK:
+ return sizeof(struct rte_flow_action_mark);
+ case RTE_FLOW_ACTION_TYPE_QUEUE:
+ return sizeof(struct rte_flow_action_queue);
+ case RTE_FLOW_ACTION_TYPE_JUMP:
+ return sizeof(struct rte_flow_action_jump);
+ case RTE_FLOW_ACTION_TYPE_RSS:
+ return sizeof(struct action_rss_data);
+ case RTE_FLOW_ACTION_TYPE_SET_META:
+ return sizeof(struct rte_flow_action_set_meta);
+ case RTE_FLOW_ACTION_TYPE_SET_TAG:
+ return sizeof(struct rte_flow_action_set_tag);
+ case RTE_FLOW_ACTION_TYPE_PORT_ID:
+ return sizeof(struct rte_flow_action_port_id);
+ case RTE_FLOW_ACTION_TYPE_COUNT:
+ return sizeof(struct rte_flow_action_count);
+ case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
+ return sizeof(struct rte_flow_action_set_mac);
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
+ return sizeof(struct rte_flow_action_set_ipv4);
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
+ return sizeof(struct rte_flow_action_set_ipv6);
+ case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
+ return sizeof(struct rte_flow_action_set_tp);
+ case RTE_FLOW_ACTION_TYPE_INC_TCP_ACK:
+ case RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK:
+ case RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ:
+ case RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ:
+ return sizeof(rte_be32_t);
+ case RTE_FLOW_ACTION_TYPE_SET_TTL:
+ return sizeof(struct rte_flow_action_set_ttl);
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP:
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP:
+ return sizeof(struct rte_flow_action_set_dscp);
+ case RTE_FLOW_ACTION_TYPE_METER:
+ return sizeof(struct rte_flow_action_meter);
+ case RTE_FLOW_ACTION_TYPE_RAW_ENCAP:
+ return sizeof(struct action_raw_encap_data);
+ case RTE_FLOW_ACTION_TYPE_RAW_DECAP:
+ return sizeof(struct action_raw_decap_data);
+ case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
+ return sizeof(struct rte_flow_action_vxlan_encap) +
+ 5 * sizeof(struct rte_flow_item) + sizeof(struct rte_flow_item_eth) +
+ sizeof(struct rte_flow_item_ipv4) + sizeof(struct rte_flow_item_udp) +
+ sizeof(struct rte_flow_item_vxlan);
+ case RTE_FLOW_ACTION_TYPE_MODIFY_FIELD:
+ return sizeof(struct rte_flow_action_modify_field);
+ /* Zero-conf types */
+ case RTE_FLOW_ACTION_TYPE_DROP:
+ case RTE_FLOW_ACTION_TYPE_FLAG:
+ case RTE_FLOW_ACTION_TYPE_DEC_TTL:
+ case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+void
+fill_actions_template(struct rte_flow_action *actions, struct rte_flow_action *masks,
+ uint64_t *flow_actions, struct rte_flow_port_attr *port_attr,
+ bool *need_wire_orig_table, size_t *conf_sizes, uint32_t *n_actions_out)
+{
+ uint8_t actions_counter = 0;
+ uint8_t i, j;
+
+ *need_wire_orig_table = false;
+ memset(port_attr, 0, sizeof(*port_attr));
+
+ /* Static configurations for actions that need them in templates */
+ static struct rte_flow_action_mark mark_conf = {
+ .id = 1,
+ };
+ static struct rte_flow_action_queue queue_conf = {
+ .index = 0,
+ };
+ static struct rte_flow_action_port_id port_id_conf = {
+ .id = 0,
+ };
+ static struct rte_flow_action_jump jump_conf = {
+ .group = 1,
+ };
+ static struct rte_flow_action_modify_field set_meta_conf = {
+ .operation = RTE_FLOW_MODIFY_SET,
+ .dst = {.field = RTE_FLOW_FIELD_META},
+ .src = {
+ .field = RTE_FLOW_FIELD_VALUE,
+ .value = {0, 0, 0, META_DATA},
+ },
+ .width = 32,
+ };
+
+ /* Static mask configurations for each action type */
+ static struct rte_flow_action_mark mark_mask = {
+ .id = UINT32_MAX,
+ };
+ static struct rte_flow_action_queue queue_mask = {
+ .index = UINT16_MAX,
+ };
+ static struct rte_flow_action_jump jump_mask = {
+ .group = UINT32_MAX,
+ };
+ static struct rte_flow_action_rss rss_mask = {
+ .level = UINT32_MAX,
+ .types = UINT64_MAX,
+ };
+ static struct rte_flow_action_set_meta set_meta_mask = {
+ .data = UINT32_MAX,
+ .mask = UINT32_MAX,
+ };
+ static struct rte_flow_action_set_tag set_tag_mask = {
+ .data = UINT32_MAX,
+ .mask = UINT32_MAX,
+ .index = UINT8_MAX,
+ };
+ static struct rte_flow_action_port_id port_id_mask = {
+ .id = UINT32_MAX,
+ };
+ static struct rte_flow_action_count count_mask;
+ static struct rte_flow_action_set_mac set_mac_mask = {
+ .mac_addr = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ };
+ static struct rte_flow_action_set_ipv4 set_ipv4_mask = {
+ .ipv4_addr = UINT32_MAX,
+ };
+ static struct rte_flow_action_set_ipv6 set_ipv6_mask = {
+ .ipv6_addr.a = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff}};
+ static struct rte_flow_action_set_tp set_tp_mask = {
+ .port = UINT16_MAX,
+ };
+ static rte_be32_t tcp_seq_ack_mask = UINT32_MAX;
+ static struct rte_flow_action_set_ttl set_ttl_mask = {
+ .ttl_value = UINT8_MAX,
+ };
+ static struct rte_flow_action_set_dscp set_dscp_mask = {
+ .dscp = UINT8_MAX,
+ };
+ static struct rte_flow_action_meter meter_mask = {
+ .mtr_id = UINT32_MAX,
+ };
+
+ static const struct {
+ uint64_t flow_mask;
+ enum rte_flow_action_type type;
+ const void *action_conf;
+ const void *action_mask;
+ const bool need_wire_orig_table;
+ } template_actions[] = {
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_MARK), RTE_FLOW_ACTION_TYPE_MARK, &mark_conf,
+ &mark_mask, true},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_COUNT), RTE_FLOW_ACTION_TYPE_COUNT, NULL,
+ &count_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_MODIFY_FIELD),
+ RTE_FLOW_ACTION_TYPE_MODIFY_FIELD, &set_meta_conf, &set_meta_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TAG), RTE_FLOW_ACTION_TYPE_SET_TAG, NULL,
+ &set_tag_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_FLAG), RTE_FLOW_ACTION_TYPE_FLAG, NULL, NULL,
+ false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_MAC_SRC),
+ RTE_FLOW_ACTION_TYPE_SET_MAC_SRC, NULL, &set_mac_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_MAC_DST),
+ RTE_FLOW_ACTION_TYPE_SET_MAC_DST, NULL, &set_mac_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC),
+ RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC, NULL, &set_ipv4_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV4_DST),
+ RTE_FLOW_ACTION_TYPE_SET_IPV4_DST, NULL, &set_ipv4_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC),
+ RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC, NULL, &set_ipv6_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV6_DST),
+ RTE_FLOW_ACTION_TYPE_SET_IPV6_DST, NULL, &set_ipv6_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TP_SRC), RTE_FLOW_ACTION_TYPE_SET_TP_SRC,
+ NULL, &set_tp_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TP_DST), RTE_FLOW_ACTION_TYPE_SET_TP_DST,
+ NULL, &set_tp_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_INC_TCP_ACK),
+ RTE_FLOW_ACTION_TYPE_INC_TCP_ACK, NULL, &tcp_seq_ack_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK),
+ RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK, NULL, &tcp_seq_ack_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ),
+ RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ, NULL, &tcp_seq_ack_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ),
+ RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ, NULL, &tcp_seq_ack_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TTL), RTE_FLOW_ACTION_TYPE_SET_TTL, NULL,
+ &set_ttl_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DEC_TTL), RTE_FLOW_ACTION_TYPE_DEC_TTL, NULL,
+ NULL, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP),
+ RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP, NULL, &set_dscp_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP),
+ RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP, NULL, &set_dscp_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_QUEUE), RTE_FLOW_ACTION_TYPE_QUEUE,
+ &queue_conf, &queue_mask, true},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_RSS), RTE_FLOW_ACTION_TYPE_RSS, NULL,
+ &rss_mask, true},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_JUMP), RTE_FLOW_ACTION_TYPE_JUMP, &jump_conf,
+ &jump_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_PORT_ID), RTE_FLOW_ACTION_TYPE_PORT_ID,
+ &port_id_conf, &port_id_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DROP), RTE_FLOW_ACTION_TYPE_DROP, NULL, NULL,
+ false},
+ {HAIRPIN_QUEUE_ACTION, RTE_FLOW_ACTION_TYPE_QUEUE, &queue_conf, &queue_mask, false},
+ {HAIRPIN_RSS_ACTION, RTE_FLOW_ACTION_TYPE_RSS, NULL, &rss_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_METER), RTE_FLOW_ACTION_TYPE_METER, NULL,
+ &meter_mask, false},
+ };
+
+ for (j = 0; j < MAX_ACTIONS_NUM; j++) {
+ if (flow_actions[j] == 0)
+ break;
+ for (i = 0; i < RTE_DIM(template_actions); i++) {
+ if ((flow_actions[j] & template_actions[i].flow_mask) == 0)
+ continue;
+
+ switch (template_actions[i].type) {
+ case RTE_FLOW_ACTION_TYPE_COUNT:
+ port_attr->nb_counters++;
+ break;
+ case RTE_FLOW_ACTION_TYPE_AGE:
+ port_attr->nb_aging_objects++;
+ break;
+ case RTE_FLOW_ACTION_TYPE_METER:
+ port_attr->nb_meters++;
+ break;
+ case RTE_FLOW_ACTION_TYPE_CONNTRACK:
+ port_attr->nb_conn_tracks++;
+ break;
+ case RTE_FLOW_ACTION_TYPE_QUOTA:
+ port_attr->nb_quotas++;
+ default:
+ break;
+ }
+
+ actions[actions_counter].type = template_actions[i].type;
+ actions[actions_counter].conf = template_actions[i].action_conf;
+ masks[actions_counter].type = template_actions[i].type;
+ masks[actions_counter].conf = template_actions[i].action_mask;
+ conf_sizes[actions_counter] = action_conf_size(template_actions[i].type);
+ *need_wire_orig_table |= template_actions[i].need_wire_orig_table;
+ actions_counter++;
+ break;
+ }
+ }
+
+ actions[actions_counter].type = RTE_FLOW_ACTION_TYPE_END;
+ masks[actions_counter].type = RTE_FLOW_ACTION_TYPE_END;
+
+ /* take END into account */
+ *n_actions_out = actions_counter + 1;
+}
diff --git a/app/test-flow-perf/actions_gen.h b/app/test-flow-perf/actions_gen.h
index 9e13b164f9..3ac0ffed59 100644
--- a/app/test-flow-perf/actions_gen.h
+++ b/app/test-flow-perf/actions_gen.h
@@ -17,9 +17,40 @@
#define RTE_VXLAN_GPE_UDP_PORT 250
#define RTE_GENEVE_UDP_PORT 6081
+/* Compound action data structures (needed by async_flow.c for slot init) */
+
+/* Storage for struct rte_flow_action_raw_encap including external data. */
+struct action_raw_encap_data {
+ struct rte_flow_action_raw_encap conf;
+ uint8_t data[128];
+ uint8_t preserve[128];
+ uint16_t idx;
+};
+
+/* Storage for struct rte_flow_action_raw_decap including external data. */
+struct action_raw_decap_data {
+ struct rte_flow_action_raw_decap conf;
+ uint8_t data[128];
+ uint16_t idx;
+};
+
+/* Storage for struct rte_flow_action_rss including external data. */
+struct action_rss_data {
+ struct rte_flow_action_rss conf;
+ uint8_t key[40];
+ uint16_t queue[128];
+};
+
void fill_actions(struct rte_flow_action *actions, uint64_t *flow_actions,
uint32_t counter, uint16_t next_table, uint16_t hairpinq,
uint64_t encap_data, uint64_t decap_data, uint8_t core_idx,
bool unique_data, uint8_t rx_queues_count, uint16_t dst_port);
+/* Fill actions template for async flow API (types only, no values).
+ * If conf_sizes is non-NULL, populates per-action conf sizes and n_actions_out.
+ */
+void fill_actions_template(struct rte_flow_action *actions, struct rte_flow_action *masks,
+ uint64_t *flow_actions, struct rte_flow_port_attr *port_attr,
+ bool *need_wire_orig_table, size_t *conf_sizes, uint32_t *n_actions_out);
+
#endif /* FLOW_PERF_ACTION_GEN */
diff --git a/app/test-flow-perf/async_flow.c b/app/test-flow-perf/async_flow.c
new file mode 100644
index 0000000000..71728845cc
--- /dev/null
+++ b/app/test-flow-perf/async_flow.c
@@ -0,0 +1,758 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2026 Maxime Peim <maxime.peim@gmail.com>
+ *
+ * This file contains the async flow API implementation
+ * for the flow-perf application.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <rte_bitops.h>
+#include <rte_common.h>
+#include <rte_ethdev.h>
+#include <rte_flow.h>
+#include <rte_vxlan.h>
+
+#include "actions_gen.h"
+#include "async_flow.h"
+#include "flow_gen.h"
+#include "items_gen.h"
+
+/* Max iterations when draining pending async completions during cleanup */
+#define DRAIN_MAX_ITERATIONS 100
+
+/* Per-port async flow resources */
+static struct async_flow_resources port_resources[MAX_PORTS];
+
+/*
+ * Initialize compound action types within a pre-allocated slot.
+ * Called once per slot during pool init to set up internal pointers
+ * for RSS, RAW_ENCAP, RAW_DECAP and VXLAN_ENCAP actions.
+ *
+ * rte_flow_action.conf is declared "const void *", but here it always
+ * points into this application's writable per-slot storage, so casting
+ * the const qualifier away is well defined.
+ */
+static void
+init_slot_compound_actions(struct rte_flow_action *actions, uint32_t n_actions,
+			   const size_t *action_conf_sizes)
+{
+	uint32_t i;
+
+	for (i = 0; i < n_actions; i++) {
+		if (action_conf_sizes[i] == 0)
+			continue;
+
+		switch (actions[i].type) {
+		case RTE_FLOW_ACTION_TYPE_RSS: {
+			struct action_rss_data *rss = (void *)(uintptr_t)actions[i].conf;
+
+			rss->conf.func = RTE_ETH_HASH_FUNCTION_DEFAULT;
+			rss->conf.level = 0;
+			rss->conf.types = GET_RSS_HF();
+			rss->conf.key_len = sizeof(rss->key);
+			rss->conf.key = rss->key;
+			rss->conf.queue = rss->queue;
+			rss->key[0] = 1;
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: {
+			struct action_raw_encap_data *encap = (void *)(uintptr_t)actions[i].conf;
+
+			encap->conf.data = encap->data;
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_RAW_DECAP: {
+			struct action_raw_decap_data *decap = (void *)(uintptr_t)actions[i].conf;
+
+			decap->conf.data = decap->data;
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: {
+			/*
+			 * Layout within the conf area:
+			 *   struct rte_flow_action_vxlan_encap
+			 *   struct rte_flow_item[5]
+			 *   struct rte_flow_item_eth
+			 *   struct rte_flow_item_ipv4
+			 *   struct rte_flow_item_udp
+			 *   struct rte_flow_item_vxlan
+			 */
+			uint8_t *base = (void *)(uintptr_t)actions[i].conf;
+			struct rte_flow_action_vxlan_encap *ve =
+				(struct rte_flow_action_vxlan_encap *)base;
+			struct rte_flow_item *items = (struct rte_flow_item *)
+				(base + sizeof(struct rte_flow_action_vxlan_encap));
+			uint8_t *data = (uint8_t *)(items + 5);
+
+			struct rte_flow_item_eth *item_eth = (struct rte_flow_item_eth *)data;
+			data += sizeof(struct rte_flow_item_eth);
+			struct rte_flow_item_ipv4 *item_ipv4 = (struct rte_flow_item_ipv4 *)data;
+			data += sizeof(struct rte_flow_item_ipv4);
+			struct rte_flow_item_udp *item_udp = (struct rte_flow_item_udp *)data;
+			data += sizeof(struct rte_flow_item_udp);
+			struct rte_flow_item_vxlan *item_vxlan = (struct rte_flow_item_vxlan *)data;
+
+			memset(item_eth, 0, sizeof(*item_eth));
+			memset(item_ipv4, 0, sizeof(*item_ipv4));
+			memset(item_udp, 0, sizeof(*item_udp));
+			memset(item_vxlan, 0, sizeof(*item_vxlan));
+
+			item_ipv4->hdr.src_addr = RTE_IPV4(127, 0, 0, 1);
+			item_ipv4->hdr.version_ihl = RTE_IPV4_VHL_DEF;
+			item_udp->hdr.dst_port = RTE_BE16(RTE_VXLAN_DEFAULT_PORT);
+			item_vxlan->hdr.vni[2] = 1;
+
+			items[0].type = RTE_FLOW_ITEM_TYPE_ETH;
+			items[0].spec = item_eth;
+			items[0].mask = item_eth;
+			items[1].type = RTE_FLOW_ITEM_TYPE_IPV4;
+			items[1].spec = item_ipv4;
+			items[1].mask = item_ipv4;
+			items[2].type = RTE_FLOW_ITEM_TYPE_UDP;
+			items[2].spec = item_udp;
+			items[2].mask = item_udp;
+			items[3].type = RTE_FLOW_ITEM_TYPE_VXLAN;
+			items[3].spec = item_vxlan;
+			items[3].mask = item_vxlan;
+			items[4].type = RTE_FLOW_ITEM_TYPE_END;
+
+			ve->definition = items;
+			break;
+		}
+		default:
+			break;
+		}
+	}
+}
+
+/*
+ * Allocate and pre-initialize all per-slot flat buffers.
+ *
+ * NOTE: n_items/n_actions come from fill_items_template() and
+ * fill_actions_template(), which both return the count INCLUDING the
+ * END sentinel. The size arrays are only populated for the real
+ * entries, so the sentinel index must never be read from them.
+ *
+ * Spec, mask and conf sub-areas all use an 8-byte stride per entry so
+ * every structure is naturally aligned and the copy loop and the
+ * per-slot pointer walk stay in lock-step.
+ *
+ * Returns 0 on success, a negative errno value on failure.
+ */
+static int
+init_slot_pool(struct async_flow_resources *res, uint32_t nb_queues, uint32_t queue_size,
+	       const struct rte_flow_item *pattern, uint32_t n_items,
+	       const size_t *item_spec_sizes,
+	       const struct rte_flow_action *template_actions, uint32_t n_actions,
+	       const size_t *action_conf_sizes)
+{
+	uint32_t items_array_bytes, actions_array_bytes;
+	uint32_t spec_data_bytes, conf_data_bytes, mask_data_bytes;
+	uint32_t slot_size, num_slots;
+	uint32_t s, i;
+	uint8_t *mptr;
+
+	/* Counts must at least cover the END sentinels */
+	if (n_items == 0 || n_actions == 0)
+		return -EINVAL;
+
+	/* Compute shared mask size; skip the END entry (its size slot is
+	 * never written by the fill helpers).
+	 */
+	mask_data_bytes = 0;
+	for (i = 0; i + 1 < n_items; i++)
+		mask_data_bytes += RTE_ALIGN_CEIL(item_spec_sizes[i], 8);
+
+	/* specs and masks have the same size and stride */
+	spec_data_bytes = mask_data_bytes;
+
+	conf_data_bytes = 0;
+	for (i = 0; i + 1 < n_actions; i++)
+		conf_data_bytes += RTE_ALIGN_CEIL(action_conf_sizes[i], 8);
+
+	/* Array sizes already include one entry for the END sentinel */
+	items_array_bytes = n_items * sizeof(struct rte_flow_item);
+	actions_array_bytes = n_actions * sizeof(struct rte_flow_action);
+
+	slot_size = RTE_ALIGN_CEIL(items_array_bytes + actions_array_bytes + spec_data_bytes +
+				   conf_data_bytes,
+				   RTE_CACHE_LINE_SIZE);
+
+	num_slots = queue_size * nb_queues;
+
+	/* Store layout info (counts include the END sentinel, matching
+	 * the offset computation in async_generate_flow()).
+	 */
+	res->slot_size = slot_size;
+	res->slots_per_queue = queue_size;
+	res->nb_queues = nb_queues;
+	res->n_items = n_items;
+	res->n_actions = n_actions;
+
+	/* Allocate shared masks (one copy referenced by every slot) */
+	if (mask_data_bytes > 0) {
+		res->shared_masks = aligned_alloc(
+			RTE_CACHE_LINE_SIZE, RTE_ALIGN_CEIL(mask_data_bytes, RTE_CACHE_LINE_SIZE));
+		if (res->shared_masks == NULL) {
+			fprintf(stderr, "Failed to allocate shared masks (%u bytes)\n",
+				mask_data_bytes);
+			return -ENOMEM;
+		}
+		memset(res->shared_masks, 0, mask_data_bytes);
+
+		/* Copy mask data from template pattern */
+		mptr = res->shared_masks;
+		for (i = 0; i + 1 < n_items; i++) {
+			if (item_spec_sizes[i] > 0 && pattern[i].mask != NULL)
+				memcpy(mptr, pattern[i].mask, item_spec_sizes[i]);
+			mptr += RTE_ALIGN_CEIL(item_spec_sizes[i], 8);
+		}
+	}
+
+	/* Allocate per-slot pool */
+	/* slot_size is already cache-line aligned, so total is a multiple */
+	res->slot_pool = aligned_alloc(RTE_CACHE_LINE_SIZE, (size_t)num_slots * slot_size);
+	if (res->slot_pool == NULL) {
+		fprintf(stderr, "Failed to allocate slot pool (%u slots * %u bytes)\n", num_slots,
+			slot_size);
+		free(res->shared_masks);
+		res->shared_masks = NULL;
+		return -ENOMEM;
+	}
+	memset(res->slot_pool, 0, (size_t)num_slots * slot_size);
+
+	/* Pre-initialize every slot */
+	for (s = 0; s < num_slots; s++) {
+		uint8_t *slot = res->slot_pool + (size_t)s * slot_size;
+		struct rte_flow_item *items = (struct rte_flow_item *)slot;
+		struct rte_flow_action *actions =
+			(struct rte_flow_action *)(slot + items_array_bytes);
+		uint8_t *data = slot + items_array_bytes + actions_array_bytes;
+
+		/* Pre-set items: spec -> per-slot data, mask -> shared masks */
+		mptr = res->shared_masks;
+		for (i = 0; i + 1 < n_items; i++) {
+			items[i].type = pattern[i].type;
+			if (item_spec_sizes[i] > 0) {
+				items[i].spec = data;
+				items[i].mask = mptr;
+			}
+			/* Keep the same stride as the shared-mask copy above */
+			data += RTE_ALIGN_CEIL(item_spec_sizes[i], 8);
+			mptr += RTE_ALIGN_CEIL(item_spec_sizes[i], 8);
+		}
+		/* END sentinel goes in the last allocated entry */
+		items[n_items - 1].type = RTE_FLOW_ITEM_TYPE_END;
+
+		/* Pre-set actions: conf -> per-slot data */
+		for (i = 0; i + 1 < n_actions; i++) {
+			actions[i].type = template_actions[i].type;
+			if (action_conf_sizes[i] > 0) {
+				actions[i].conf = data;
+				data += RTE_ALIGN_CEIL(action_conf_sizes[i], 8);
+			}
+		}
+		actions[n_actions - 1].type = RTE_FLOW_ACTION_TYPE_END;
+
+		/* Initialize compound action types (RSS, RAW_ENCAP, etc.);
+		 * pass the real-action count so the END size slot is not read.
+		 */
+		init_slot_compound_actions(actions, n_actions - 1, action_conf_sizes);
+	}
+
+	/* Allocate and initialize per-queue slot tracking.
+	 * aligned_alloc() (C11) requires size to be a multiple of alignment.
+	 */
+	res->queues = aligned_alloc(
+		RTE_CACHE_LINE_SIZE,
+		RTE_ALIGN_CEIL(nb_queues * sizeof(struct async_flow_queue), RTE_CACHE_LINE_SIZE));
+	if (res->queues == NULL) {
+		fprintf(stderr, "Failed to allocate queue structs (%u queues)\n", nb_queues);
+		free(res->slot_pool);
+		res->slot_pool = NULL;
+		free(res->shared_masks);
+		res->shared_masks = NULL;
+		return -ENOMEM;
+	}
+	memset(res->queues, 0, nb_queues * sizeof(struct async_flow_queue));
+	for (s = 0; s < nb_queues; s++) {
+		res->queues[s].slots = res->slot_pool + (size_t)s * queue_size * slot_size;
+		res->queues[s].head = 0;
+	}
+
+	printf(":: Slot pool: %u slots * %u bytes = %u KB (shared masks: %u bytes)\n", num_slots,
+	       slot_size, (num_slots * slot_size) / 1024, mask_data_bytes);
+
+	return 0;
+}
+
+/*
+ * Hot-path: update per-flow item values through pre-set pointers.
+ * Only IPv4/IPv6 src_addr varies per flow (based on counter).
+ *
+ * items[i].spec is "const void *" but always points into this
+ * application's writable per-slot storage, hence the const casts.
+ */
+static void
+update_item_values(struct rte_flow_item *items, uint32_t counter)
+{
+	uint8_t i;
+
+	for (i = 0; items[i].type != RTE_FLOW_ITEM_TYPE_END; i++) {
+		switch (items[i].type) {
+		case RTE_FLOW_ITEM_TYPE_IPV4: {
+			struct rte_flow_item_ipv4 *spec = (void *)(uintptr_t)items[i].spec;
+
+			spec->hdr.src_addr = RTE_BE32(counter);
+			break;
+		}
+		case RTE_FLOW_ITEM_TYPE_IPV6: {
+			struct rte_flow_item_ipv6 *spec = (void *)(uintptr_t)items[i].spec;
+			uint8_t j;
+
+			/* Low 32 bits of the counter, big-endian, in the
+			 * last 4 address bytes.
+			 */
+			for (j = 0; j < 4; j++)
+				spec->hdr.src_addr.a[15 - j] = counter >> (j * 8);
+			break;
+		}
+		default:
+			break;
+		}
+	}
+}
+
+/*
+ * Hot-path: update per-flow action values through pre-set pointers.
+ *
+ * actions[i].conf is "const void *" but always points into this
+ * application's writable per-slot storage, hence the const casts.
+ */
+static void
+update_action_values(struct rte_flow_action *actions, uint32_t counter, uint16_t hairpinq,
+		     uint64_t encap_data, uint64_t decap_data, __rte_unused uint8_t core_idx,
+		     bool unique_data, uint8_t rx_queues_count, uint16_t dst_port)
+{
+	uint8_t i;
+
+	for (i = 0; actions[i].type != RTE_FLOW_ACTION_TYPE_END; i++) {
+		switch (actions[i].type) {
+		case RTE_FLOW_ACTION_TYPE_MARK: {
+			struct rte_flow_action_mark *conf = (void *)(uintptr_t)actions[i].conf;
+
+			conf->id = (counter % 255) + 1;
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_QUEUE: {
+			struct rte_flow_action_queue *conf = (void *)(uintptr_t)actions[i].conf;
+
+			/* Hairpin queues live right after the Rx queues */
+			conf->index = hairpinq ? (counter % hairpinq) + rx_queues_count :
+						 counter % rx_queues_count;
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_METER: {
+			struct rte_flow_action_meter *conf = (void *)(uintptr_t)actions[i].conf;
+
+			conf->mtr_id = counter;
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_RSS: {
+			struct action_rss_data *conf = (void *)(uintptr_t)actions[i].conf;
+			uint16_t q;
+
+			if (hairpinq) {
+				conf->conf.queue_num = hairpinq;
+				for (q = 0; q < hairpinq; q++)
+					conf->queue[q] = q + rx_queues_count;
+			} else {
+				conf->conf.queue_num = rx_queues_count;
+				for (q = 0; q < rx_queues_count; q++)
+					conf->queue[q] = q;
+			}
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
+		case RTE_FLOW_ACTION_TYPE_SET_MAC_DST: {
+			struct rte_flow_action_set_mac *conf = (void *)(uintptr_t)actions[i].conf;
+			uint32_t val = unique_data ? counter : 1;
+			uint8_t j;
+
+			for (j = 0; j < RTE_ETHER_ADDR_LEN; j++) {
+				conf->mac_addr[j] = val & 0xff;
+				val >>= 8;
+			}
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
+		case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST: {
+			struct rte_flow_action_set_ipv4 *conf = (void *)(uintptr_t)actions[i].conf;
+			uint32_t ip = unique_data ? counter : 1;
+
+			conf->ipv4_addr = RTE_BE32(ip + 1);
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
+		case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST: {
+			struct rte_flow_action_set_ipv6 *conf = (void *)(uintptr_t)actions[i].conf;
+			uint32_t val = unique_data ? counter : 1;
+			uint8_t j;
+
+			for (j = 0; j < 16; j++) {
+				conf->ipv6_addr.a[j] = val & 0xff;
+				val >>= 8;
+			}
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_SET_TP_SRC: {
+			struct rte_flow_action_set_tp *conf = (void *)(uintptr_t)actions[i].conf;
+			uint32_t tp = unique_data ? counter : 100;
+
+			tp = tp % 0xffff;
+			conf->port = RTE_BE16(tp & 0xffff);
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_SET_TP_DST: {
+			struct rte_flow_action_set_tp *conf = (void *)(uintptr_t)actions[i].conf;
+			uint32_t tp = unique_data ? counter : 100;
+
+			if (tp > 0xffff)
+				tp >>= 16;
+			conf->port = RTE_BE16(tp & 0xffff);
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_INC_TCP_ACK:
+		case RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK:
+		case RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ:
+		case RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ: {
+			rte_be32_t *conf = (void *)(uintptr_t)actions[i].conf;
+			uint32_t val = unique_data ? counter : 1;
+
+			*conf = RTE_BE32(val);
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_SET_TTL: {
+			struct rte_flow_action_set_ttl *conf = (void *)(uintptr_t)actions[i].conf;
+			uint32_t val = unique_data ? counter : 1;
+
+			conf->ttl_value = val % 0xff;
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP:
+		case RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP: {
+			struct rte_flow_action_set_dscp *conf = (void *)(uintptr_t)actions[i].conf;
+			uint32_t val = unique_data ? counter : 1;
+
+			conf->dscp = val % 0xff;
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_PORT_ID: {
+			struct rte_flow_action_port_id *conf = (void *)(uintptr_t)actions[i].conf;
+
+			conf->id = dst_port;
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: {
+			struct action_raw_encap_data *encap = (void *)(uintptr_t)actions[i].conf;
+			uint8_t *header = encap->data;
+			struct rte_ether_hdr eth_hdr;
+			struct rte_ipv4_hdr ipv4_hdr;
+			struct rte_udp_hdr udp_hdr;
+
+			/* Build the encap header from the requested layers */
+			memset(&eth_hdr, 0, sizeof(eth_hdr));
+			if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH)) {
+				if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_VLAN))
+					eth_hdr.ether_type = RTE_BE16(RTE_ETHER_TYPE_VLAN);
+				else if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV4))
+					eth_hdr.ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV4);
+				else if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV6))
+					eth_hdr.ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV6);
+				memcpy(header, &eth_hdr, sizeof(eth_hdr));
+				header += sizeof(eth_hdr);
+			}
+			if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV4)) {
+				uint32_t ip_dst = unique_data ? counter : 1;
+
+				memset(&ipv4_hdr, 0, sizeof(ipv4_hdr));
+				ipv4_hdr.src_addr = RTE_IPV4(127, 0, 0, 1);
+				ipv4_hdr.dst_addr = RTE_BE32(ip_dst);
+				ipv4_hdr.version_ihl = RTE_IPV4_VHL_DEF;
+				if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_UDP))
+					ipv4_hdr.next_proto_id = 17; /* UDP */
+				if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_GRE))
+					ipv4_hdr.next_proto_id = 47; /* GRE */
+				memcpy(header, &ipv4_hdr, sizeof(ipv4_hdr));
+				header += sizeof(ipv4_hdr);
+			}
+			if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_UDP)) {
+				memset(&udp_hdr, 0, sizeof(udp_hdr));
+				if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_VXLAN))
+					udp_hdr.dst_port = RTE_BE16(RTE_VXLAN_DEFAULT_PORT);
+				memcpy(header, &udp_hdr, sizeof(udp_hdr));
+				header += sizeof(udp_hdr);
+			}
+			encap->conf.size = header - encap->data;
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_RAW_DECAP: {
+			struct action_raw_decap_data *decap_d = (void *)(uintptr_t)actions[i].conf;
+			uint8_t *header = decap_d->data;
+			struct rte_ether_hdr eth_hdr;
+
+			memset(&eth_hdr, 0, sizeof(eth_hdr));
+			if (decap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH)) {
+				if (decap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV4))
+					eth_hdr.ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV4);
+				else if (decap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV6))
+					eth_hdr.ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV6);
+				memcpy(header, &eth_hdr, sizeof(eth_hdr));
+				header += sizeof(eth_hdr);
+			}
+			decap_d->conf.size = header - decap_d->data;
+			break;
+		}
+		case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: {
+			uint8_t *base = (void *)(uintptr_t)actions[i].conf;
+			struct rte_flow_item *vitems = (struct rte_flow_item *)
+				(base + sizeof(struct rte_flow_action_vxlan_encap));
+			/* vitems[1] is IPV4; its spec points into this slot */
+			struct rte_flow_item_ipv4 *spec = (void *)(uintptr_t)vitems[1].spec;
+			uint32_t ip_dst = unique_data ? counter : 1;
+
+			spec->hdr.dst_addr = RTE_BE32(ip_dst);
+			break;
+		}
+		default:
+			break;
+		}
+	}
+}
+
+/*
+ * Configure a port for the async flow API and build the pattern/actions
+ * templates, the template table and the per-slot buffer pool.
+ *
+ * Returns 0 on success, a negative value on failure; on failure all
+ * partially created flow objects are destroyed again (goto cleanup).
+ */
+int
+async_flow_init_port(uint16_t port_id, uint32_t nb_queues, uint32_t queue_size,
+		     uint64_t *flow_items, uint64_t *flow_actions, uint64_t *flow_attrs,
+		     uint8_t flow_group, uint32_t rules_count)
+{
+	struct rte_flow_port_info port_info = {0};
+	struct rte_flow_queue_info queue_info = {0};
+	struct rte_flow_error error = {0};
+	struct rte_flow_port_attr port_attr = {0};
+	struct rte_flow_queue_attr queue_attr = {0};
+	const struct rte_flow_queue_attr **queue_attr_list;
+	struct rte_flow_pattern_template_attr pt_attr = {0};
+	struct rte_flow_actions_template_attr at_attr = {0};
+	struct rte_flow_template_table_attr table_attr = {0};
+	struct rte_flow_item pattern[MAX_ITEMS_NUM];
+	struct rte_flow_action actions[MAX_ACTIONS_NUM];
+	struct rte_flow_action action_masks[MAX_ACTIONS_NUM];
+	size_t item_spec_sizes[MAX_ITEMS_NUM];
+	size_t action_conf_sizes[MAX_ACTIONS_NUM];
+	uint32_t n_items, n_actions;
+	struct async_flow_resources *res;
+	bool need_wire_orig_table = false;
+	uint32_t i;
+	int ret;
+
+	if (port_id >= MAX_PORTS)
+		return -1;
+
+	res = &port_resources[port_id];
+	memset(res, 0, sizeof(*res));
+
+	/* Query port flow info */
+	ret = rte_flow_info_get(port_id, &port_info, &queue_info, &error);
+	if (ret != 0) {
+		fprintf(stderr, "Port %u: rte_flow_info_get failed: %s\n", port_id,
+			error.message ? error.message : "(no message)");
+		return ret;
+	}
+
+	if (port_info.max_nb_queues == 0 || queue_info.max_size == 0) {
+		fprintf(stderr, "Port %u: rte_flow_info_get reports that no queues are supported\n",
+			port_id);
+		return -1;
+	}
+
+	/* Limit to device capabilities if reported; both values are known
+	 * non-zero here, UINT32_MAX is treated as "unlimited".
+	 */
+	if (port_info.max_nb_queues != UINT32_MAX && nb_queues > port_info.max_nb_queues)
+		nb_queues = port_info.max_nb_queues;
+	if (queue_info.max_size != UINT32_MAX && queue_size > queue_info.max_size)
+		queue_size = queue_info.max_size;
+
+	/* Slot ring uses bitmask wrapping, so queue_size must be power of 2 */
+	queue_size = rte_align32prevpow2(queue_size);
+	if (queue_size == 0) {
+		fprintf(stderr, "Port %u: queue_size is 0 after rounding\n", port_id);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < MAX_ATTRS_NUM; i++) {
+		if (flow_attrs[i] == 0)
+			break;
+		if (flow_attrs[i] & INGRESS)
+			pt_attr.ingress = 1;
+		else if (flow_attrs[i] & EGRESS)
+			pt_attr.egress = 1;
+		else if (flow_attrs[i] & TRANSFER)
+			pt_attr.transfer = 1;
+	}
+	/* Enable relaxed matching for better performance */
+	pt_attr.relaxed_matching = 1;
+
+	memset(pattern, 0, sizeof(pattern));
+	memset(actions, 0, sizeof(actions));
+	memset(action_masks, 0, sizeof(action_masks));
+
+	/* Fill templates and gather per-item/action sizes */
+	fill_items_template(pattern, flow_items, 0, 0, item_spec_sizes, &n_items);
+
+	at_attr.ingress = pt_attr.ingress;
+	at_attr.egress = pt_attr.egress;
+	at_attr.transfer = pt_attr.transfer;
+
+	fill_actions_template(actions, action_masks, flow_actions, &port_attr,
+			      &need_wire_orig_table, action_conf_sizes, &n_actions);
+
+	/*
+	 * fill_actions_template() counts the number of actions that require
+	 * each kind of object, so we multiply by the number of rules to get
+	 * the correct totals.
+	 */
+	port_attr.nb_counters *= rules_count;
+	port_attr.nb_aging_objects *= rules_count;
+	port_attr.nb_meters *= rules_count;
+	port_attr.nb_conn_tracks *= rules_count;
+	port_attr.nb_quotas *= rules_count;
+
+	table_attr.flow_attr.group = flow_group;
+	table_attr.flow_attr.priority = 0;
+	table_attr.flow_attr.ingress = pt_attr.ingress;
+	table_attr.flow_attr.egress = pt_attr.egress;
+	table_attr.flow_attr.transfer = pt_attr.transfer;
+	table_attr.nb_flows = rules_count;
+
+	if (pt_attr.transfer && need_wire_orig_table)
+		table_attr.specialize = RTE_FLOW_TABLE_SPECIALIZE_TRANSFER_WIRE_ORIG;
+
+	queue_attr_list = malloc(sizeof(*queue_attr_list) * nb_queues);
+	if (queue_attr_list == NULL) {
+		fprintf(stderr, "Port %u: failed to allocate queue_attr_list\n", port_id);
+		return -ENOMEM;
+	}
+
+	/* All queues share the same attributes */
+	queue_attr.size = queue_size;
+	for (i = 0; i < nb_queues; i++)
+		queue_attr_list[i] = &queue_attr;
+
+	ret = rte_flow_configure(port_id, &port_attr, nb_queues, queue_attr_list, &error);
+
+	free(queue_attr_list);
+
+	if (ret != 0) {
+		fprintf(stderr, "Port %u: rte_flow_configure failed (ret=%d, type=%d): %s\n",
+			port_id, ret, error.type, error.message ? error.message : "(no message)");
+		return ret;
+	}
+
+	/* Create pattern template */
+	res->pattern_template =
+		rte_flow_pattern_template_create(port_id, &pt_attr, pattern, &error);
+	if (res->pattern_template == NULL) {
+		fprintf(stderr, "Port %u: pattern template create failed: %s\n", port_id,
+			error.message ? error.message : "(no message)");
+		return -1;
+	}
+
+	/* Create actions template */
+	res->actions_template =
+		rte_flow_actions_template_create(port_id, &at_attr, actions, action_masks, &error);
+	if (res->actions_template == NULL) {
+		fprintf(stderr, "Port %u: actions template create failed: %s\n", port_id,
+			error.message ? error.message : "(no message)");
+		ret = -1;
+		goto err_pattern;
+	}
+
+	/* Create template table */
+	res->table = rte_flow_template_table_create(port_id, &table_attr, &res->pattern_template, 1,
+						    &res->actions_template, 1, &error);
+	if (res->table == NULL) {
+		fprintf(stderr, "Port %u: template table create failed: %s\n", port_id,
+			error.message ? error.message : "(no message)");
+		ret = -1;
+		goto err_actions;
+	}
+
+	/* Allocate and pre-initialize per-slot flat buffers */
+	ret = init_slot_pool(res, nb_queues, queue_size, pattern, n_items, item_spec_sizes, actions,
+			     n_actions, action_conf_sizes);
+	if (ret != 0) {
+		fprintf(stderr, "Port %u: slot pool init failed\n", port_id);
+		goto err_table;
+	}
+
+	res->table_capacity = rules_count;
+	res->initialized = true;
+
+	printf(":: Port %u: Async flow engine initialized (queues=%u, queue_size=%u)\n", port_id,
+	       nb_queues, queue_size);
+
+	return 0;
+
+err_table:
+	rte_flow_template_table_destroy(port_id, res->table, &error);
+	res->table = NULL;
+err_actions:
+	rte_flow_actions_template_destroy(port_id, res->actions_template, &error);
+	res->actions_template = NULL;
+err_pattern:
+	rte_flow_pattern_template_destroy(port_id, res->pattern_template, &error);
+	res->pattern_template = NULL;
+	return ret;
+}
+
+/*
+ * Enqueue one flow-create operation on the given async queue.
+ *
+ * Grabs the next pre-initialized slot from the queue's ring, patches
+ * only the per-flow varying spec/conf values, then calls
+ * rte_flow_async_create() on template/pattern index 0.
+ */
+struct rte_flow *
+async_generate_flow(uint16_t port_id, uint32_t queue_id, uint32_t counter, uint16_t hairpinq,
+		    uint64_t encap_data, uint64_t decap_data, uint16_t dst_port, uint8_t core_idx,
+		    uint8_t rx_queues_count, bool unique_data, bool postpone,
+		    struct rte_flow_error *error)
+{
+	const struct rte_flow_op_attr op_attr = {
+		.postpone = postpone,
+	};
+	struct async_flow_resources *pool;
+	struct async_flow_queue *ring;
+	struct rte_flow_action *slot_actions;
+	struct rte_flow_item *slot_items;
+	uint8_t *slot_base;
+	uint32_t slot_idx;
+
+	if (port_id >= MAX_PORTS) {
+		rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+				   "Invalid port ID");
+		return NULL;
+	}
+
+	pool = &port_resources[port_id];
+	if (!pool->initialized) {
+		rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+				   "Async flow resources not initialized");
+		return NULL;
+	}
+
+	if (queue_id >= pool->nb_queues) {
+		rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+				   "Invalid queue ID");
+		return NULL;
+	}
+
+	/* Pick the next slot from this queue's ring (size is a power of 2) */
+	ring = &pool->queues[queue_id];
+	slot_idx = ring->head;
+	ring->head = (slot_idx + 1) & (pool->slots_per_queue - 1);
+
+	slot_base = ring->slots + (size_t)slot_idx * pool->slot_size;
+	slot_items = (struct rte_flow_item *)slot_base;
+	slot_actions = (struct rte_flow_action *)
+		(slot_base + pool->n_items * sizeof(struct rte_flow_item));
+
+	/* Update only per-flow varying values */
+	update_item_values(slot_items, counter);
+	update_action_values(slot_actions, counter, hairpinq, encap_data, decap_data, core_idx,
+			     unique_data, rx_queues_count, dst_port);
+
+	return rte_flow_async_create(port_id, queue_id, &op_attr, pool->table, slot_items, 0,
+				     slot_actions, 0, NULL, error);
+}
+
+/*
+ * Tear down async flow resources for a port.
+ *
+ * Drains pending async completions on EVERY configured queue (the
+ * previous code only serviced queue 0, leaving completions stuck on
+ * the other queues), then destroys the table/templates and frees the
+ * slot pool.
+ */
+void
+async_flow_cleanup_port(uint16_t port_id)
+{
+	struct async_flow_resources *res;
+	struct rte_flow_error error;
+	struct rte_flow_op_result results[64];
+	uint32_t q;
+	int ret, i;
+
+	if (port_id >= MAX_PORTS)
+		return;
+
+	res = &port_resources[port_id];
+	if (!res->initialized)
+		return;
+
+	/* Drain any pending async completions from flow flush, per queue */
+	for (q = 0; q < res->nb_queues; q++) {
+		for (i = 0; i < DRAIN_MAX_ITERATIONS; i++) {
+			rte_flow_push(port_id, q, &error);
+			ret = rte_flow_pull(port_id, q, results, RTE_DIM(results), &error);
+			if (ret <= 0)
+				break;
+		}
+	}
+
+	if (res->table != NULL) {
+		rte_flow_template_table_destroy(port_id, res->table, &error);
+		res->table = NULL;
+	}
+
+	if (res->actions_template != NULL) {
+		rte_flow_actions_template_destroy(port_id, res->actions_template, &error);
+		res->actions_template = NULL;
+	}
+
+	if (res->pattern_template != NULL) {
+		rte_flow_pattern_template_destroy(port_id, res->pattern_template, &error);
+		res->pattern_template = NULL;
+	}
+
+	free(res->queues);
+	res->queues = NULL;
+	free(res->slot_pool);
+	res->slot_pool = NULL;
+	free(res->shared_masks);
+	res->shared_masks = NULL;
+
+	res->initialized = false;
+}
diff --git a/app/test-flow-perf/async_flow.h b/app/test-flow-perf/async_flow.h
new file mode 100644
index 0000000000..8c12924bc6
--- /dev/null
+++ b/app/test-flow-perf/async_flow.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2026 Maxime Peim <maxime.peim@gmail.com>
+ *
+ * This file contains the async flow API related definitions
+ * and function declarations.
+ */
+
+#ifndef FLOW_PERF_ASYNC_FLOW
+#define FLOW_PERF_ASYNC_FLOW
+
+#include <rte_flow.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "config.h"
+
+/* Per-queue slot ring — tracks which slot to use next */
+struct async_flow_queue {
+	uint8_t *slots; /* pointer to this queue's region within slot_pool */
+	uint32_t head; /* next slot index (wraps mod slots_per_queue, a power of 2) */
+};
+
+/* Per-port async flow resources */
+struct async_flow_resources {
+	struct rte_flow_pattern_template *pattern_template;
+	struct rte_flow_actions_template *actions_template;
+	struct rte_flow_template_table *table;
+	uint8_t *slot_pool; /* flat buffer pool for all slots */
+	uint8_t *shared_masks; /* shared item mask data (one copy for all slots) */
+	struct async_flow_queue *queues; /* one entry per configured async queue */
+	uint32_t slot_size; /* bytes per slot (cache-line aligned) */
+	uint32_t slots_per_queue; /* = queue_size */
+	uint32_t nb_queues; /* number of configured async queues */
+	uint32_t n_items; /* item count INCL. the END sentinel (as returned by fill_items_template()) */
+	uint32_t n_actions; /* action count INCL. the END sentinel (as returned by fill_actions_template()) */
+	uint32_t table_capacity; /* nb_flows the template table was created with */
+	bool initialized; /* true once async_flow_init_port() succeeded */
+};
+
+/* Initialize async flow engine for a port: configures the port for the
+ * async flow API, creates the templates/table and the slot pool.
+ * Returns 0 on success, negative on failure.
+ */
+int async_flow_init_port(uint16_t port_id, uint32_t nb_queues, uint32_t queue_size,
+			 uint64_t *flow_items, uint64_t *flow_actions, uint64_t *flow_attrs,
+			 uint8_t flow_group, uint32_t rules_count);
+
+/* Create a flow rule asynchronously using a pre-allocated slot.
+ * Returns the flow handle, or NULL with *error filled on failure.
+ */
+struct rte_flow *async_generate_flow(uint16_t port_id, uint32_t queue_id, uint32_t counter,
+				     uint16_t hairpinq, uint64_t encap_data, uint64_t decap_data,
+				     uint16_t dst_port, uint8_t core_idx, uint8_t rx_queues_count,
+				     bool unique_data, bool postpone, struct rte_flow_error *error);
+
+/* Cleanup async flow resources for a port (drains queues, destroys
+ * table/templates, frees the slot pool). Safe on uninitialized ports.
+ */
+void async_flow_cleanup_port(uint16_t port_id);
+
+#endif /* FLOW_PERF_ASYNC_FLOW */
diff --git a/app/test-flow-perf/items_gen.c b/app/test-flow-perf/items_gen.c
index c740e1838f..58f1c16cf8 100644
--- a/app/test-flow-perf/items_gen.c
+++ b/app/test-flow-perf/items_gen.c
@@ -389,3 +389,61 @@ fill_items(struct rte_flow_item *items,
items[items_counter].type = RTE_FLOW_ITEM_TYPE_END;
}
+
+/* Return the spec/mask byte size for a flow item type, 0 if unknown. */
+static size_t
+item_spec_size(enum rte_flow_item_type type)
+{
+	size_t size = 0;
+
+	switch (type) {
+	/* L2 */
+	case RTE_FLOW_ITEM_TYPE_ETH:
+		size = sizeof(struct rte_flow_item_eth);
+		break;
+	case RTE_FLOW_ITEM_TYPE_VLAN:
+		size = sizeof(struct rte_flow_item_vlan);
+		break;
+	/* L3 */
+	case RTE_FLOW_ITEM_TYPE_IPV4:
+		size = sizeof(struct rte_flow_item_ipv4);
+		break;
+	case RTE_FLOW_ITEM_TYPE_IPV6:
+		size = sizeof(struct rte_flow_item_ipv6);
+		break;
+	case RTE_FLOW_ITEM_TYPE_ICMP:
+		size = sizeof(struct rte_flow_item_icmp);
+		break;
+	case RTE_FLOW_ITEM_TYPE_ICMP6:
+		size = sizeof(struct rte_flow_item_icmp6);
+		break;
+	/* L4 */
+	case RTE_FLOW_ITEM_TYPE_TCP:
+		size = sizeof(struct rte_flow_item_tcp);
+		break;
+	case RTE_FLOW_ITEM_TYPE_UDP:
+		size = sizeof(struct rte_flow_item_udp);
+		break;
+	/* Tunnels */
+	case RTE_FLOW_ITEM_TYPE_VXLAN:
+		size = sizeof(struct rte_flow_item_vxlan);
+		break;
+	case RTE_FLOW_ITEM_TYPE_VXLAN_GPE:
+		size = sizeof(struct rte_flow_item_vxlan_gpe);
+		break;
+	case RTE_FLOW_ITEM_TYPE_GRE:
+		size = sizeof(struct rte_flow_item_gre);
+		break;
+	case RTE_FLOW_ITEM_TYPE_GENEVE:
+		size = sizeof(struct rte_flow_item_geneve);
+		break;
+	case RTE_FLOW_ITEM_TYPE_GTP:
+		size = sizeof(struct rte_flow_item_gtp);
+		break;
+	/* Metadata */
+	case RTE_FLOW_ITEM_TYPE_META:
+		size = sizeof(struct rte_flow_item_meta);
+		break;
+	case RTE_FLOW_ITEM_TYPE_TAG:
+		size = sizeof(struct rte_flow_item_tag);
+		break;
+	default:
+		break;
+	}
+	return size;
+}
+
+/*
+ * Fill a pattern template for the async flow API: build the items via
+ * fill_items(), strip every spec (templates match on masks only) and
+ * report per-item spec sizes plus the item count.
+ *
+ * Both spec_sizes and n_items_out may be NULL, per the header contract.
+ * The END sentinel's size entry is explicitly zeroed so callers that
+ * iterate the full (END-inclusive) count never read an uninitialized
+ * value.
+ */
+void
+fill_items_template(struct rte_flow_item *items, uint64_t *flow_items, uint32_t outer_ip_src,
+		    uint8_t core_idx, size_t *spec_sizes, uint32_t *n_items_out)
+{
+	uint32_t count;
+
+	fill_items(items, flow_items, outer_ip_src, core_idx);
+
+	/* Count items before END */
+	for (count = 0; items[count].type != RTE_FLOW_ITEM_TYPE_END; count++) {
+		if (spec_sizes != NULL)
+			spec_sizes[count] = item_spec_size(items[count].type);
+		/* For templates, set spec to NULL - only mask matters for template matching */
+		items[count].spec = NULL;
+	}
+	/* The END sentinel carries no spec data */
+	if (spec_sizes != NULL)
+		spec_sizes[count] = 0;
+
+	/* take END into account */
+	if (n_items_out != NULL)
+		*n_items_out = count + 1;
+}
diff --git a/app/test-flow-perf/items_gen.h b/app/test-flow-perf/items_gen.h
index f4b0e9a981..0987f7be3c 100644
--- a/app/test-flow-perf/items_gen.h
+++ b/app/test-flow-perf/items_gen.h
@@ -15,4 +15,10 @@
void fill_items(struct rte_flow_item *items, uint64_t *flow_items,
uint32_t outer_ip_src, uint8_t core_idx);
+/* Fill items template for async flow API (masks only, no spec values).
+ * If spec_sizes is non-NULL, populates per-item spec sizes and n_items_out.
+ */
+void fill_items_template(struct rte_flow_item *items, uint64_t *flow_items, uint32_t outer_ip_src,
+ uint8_t core_idx, size_t *spec_sizes, uint32_t *n_items_out);
+
#endif /* FLOW_PERF_ITEMS_GEN */
diff --git a/app/test-flow-perf/main.c b/app/test-flow-perf/main.c
index 6636d1517f..bf420b397b 100644
--- a/app/test-flow-perf/main.c
+++ b/app/test-flow-perf/main.c
@@ -37,11 +37,15 @@
#include <rte_mtr.h>
#include <rte_os_shim.h>
-#include "config.h"
#include "actions_gen.h"
+#include "async_flow.h"
+#include "config.h"
#include "flow_gen.h"
+#include "rte_build_config.h"
#define MAX_BATCHES_COUNT 100
+#define MAX_ASYNC_QUEUE_SIZE (1 << 14)
+#define MAX_PULL_RETRIES (1 << 20)
#define DEFAULT_RULES_COUNT 4000000
#define DEFAULT_RULES_BATCH 100000
#define DEFAULT_GROUP 0
@@ -55,7 +59,6 @@
#define HAIRPIN_TX_CONF_LOCKED_MEMORY (0x0100)
#define HAIRPIN_TX_CONF_RTE_MEMORY (0x0200)
-struct rte_flow *flow;
static uint8_t flow_group;
static uint64_t encap_data;
@@ -81,6 +84,9 @@ static bool enable_fwd;
static bool unique_data;
static bool policy_mtr;
static bool packet_mode;
+static bool async_mode;
+static uint32_t async_queue_size = 1024;
+static uint32_t async_push_batch = 256;
static uint8_t rx_queues_count;
static uint8_t tx_queues_count;
@@ -598,6 +604,29 @@ usage(char *progname)
"Encapped data is fixed with pattern: ether,ipv4,udp,vxlan\n"
"With fixed values\n");
printf(" --vxlan-decap: add vxlan_decap action to flow actions\n");
+
+ printf("\nAsync flow API options:\n");
+ printf(" --async: enable async flow API mode\n");
+ printf(" --async-queue-size=N: size of each async queue,"
+ " default is 1024\n");
+ printf(" --async-push-batch=N: flows to batch before push,"
+ " default is 256\n");
+}
+
+/* Round x down to the nearest power of two (0 for x == 0).
+ *
+ * The previous bit-smearing + increment variant overflowed for
+ * x > 2^31: the final x++ wrapped to 0 and the function returned 0
+ * instead of 0x80000000. Smearing then subtracting the lower half
+ * (same algorithm as rte_align32prevpow2()) is overflow-free.
+ */
+static inline uint32_t
+prev_power_of_two(uint32_t x)
+{
+	x |= x >> 1;
+	x |= x >> 2;
+	x |= x >> 4;
+	x |= x >> 8;
+	x |= x >> 16;
+	/* x is now all-ones from the MSB down; keep only the MSB */
+	return x - (x >> 1);
+}
static void
@@ -734,6 +763,9 @@ args_parse(int argc, char **argv)
{ "policy-mtr", 1, 0, 0 },
{ "meter-profile", 1, 0, 0 },
{ "packet-mode", 0, 0, 0 },
+ { "async", 0, 0, 0 },
+ { "async-queue-size", 1, 0, 0 },
+ { "async-push-batch", 1, 0, 0 },
{ 0, 0, 0, 0 },
};
@@ -913,8 +945,7 @@ args_parse(int argc, char **argv)
rte_exit(EXIT_FAILURE, "Invalid hairpin config mask\n");
hairpin_conf_mask = hp_conf;
}
- if (strcmp(lgopts[opt_idx].name,
- "port-id") == 0) {
+ if (strcmp(lgopts[opt_idx].name, "port-id") == 0) {
uint16_t port_idx = 0;
token = strtok(optarg, ",");
@@ -981,6 +1012,26 @@ args_parse(int argc, char **argv)
}
if (strcmp(lgopts[opt_idx].name, "packet-mode") == 0)
packet_mode = true;
+ if (strcmp(lgopts[opt_idx].name, "async") == 0)
+ async_mode = true;
+ if (strcmp(lgopts[opt_idx].name, "async-queue-size") == 0) {
+ n = atoi(optarg);
+ if (n >= MAX_ASYNC_QUEUE_SIZE)
+ async_queue_size = MAX_ASYNC_QUEUE_SIZE;
+ else if (n > 0)
+ async_queue_size = rte_align32prevpow2(n);
+ else
+ rte_exit(EXIT_FAILURE, "async-queue-size should be > 0\n");
+ }
+ if (strcmp(lgopts[opt_idx].name, "async-push-batch") == 0) {
+ n = atoi(optarg);
+ if (n >= MAX_ASYNC_QUEUE_SIZE >> 1)
+ async_push_batch = MAX_ASYNC_QUEUE_SIZE >> 1;
+ else if (n > 0)
+ async_push_batch = rte_align32prevpow2(n);
+ else
+ rte_exit(EXIT_FAILURE, "async-push-batch should be > 0\n");
+ }
break;
default:
usage(argv[0]);
@@ -1457,10 +1508,10 @@ query_flows(int port_id, uint8_t core_id, struct rte_flow **flows_list)
mc_pool.flows_record.query[port_id][core_id] = cpu_time_used;
}
-static struct rte_flow **
-insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id)
+static void
+insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id, struct rte_flow **flows_list)
{
- struct rte_flow **flows_list;
+ struct rte_flow *flow;
struct rte_flow_error error;
clock_t start_batch, end_batch;
double first_flow_latency;
@@ -1485,8 +1536,7 @@ insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id)
global_items[0] = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH);
global_actions[0] = FLOW_ITEM_MASK(RTE_FLOW_ACTION_TYPE_JUMP);
- flows_list = rte_zmalloc("flows_list",
- (sizeof(struct rte_flow *) * (rules_count_per_core + 1)), 0);
+ flows_list = malloc(sizeof(struct rte_flow *) * (rules_count_per_core + 1));
if (flows_list == NULL)
rte_exit(EXIT_FAILURE, "No Memory available!\n");
@@ -1524,6 +1574,11 @@ insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id)
core_id, rx_queues_count,
unique_data, max_priority, &error);
+ if (!flow) {
+ print_flow_error(error);
+ rte_exit(EXIT_FAILURE, "Error in creating flow\n");
+ }
+
if (!counter) {
first_flow_latency = (double) (rte_get_timer_cycles() - start_batch);
first_flow_latency /= rte_get_timer_hz();
@@ -1537,11 +1592,6 @@ insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id)
if (force_quit)
counter = end_counter;
- if (!flow) {
- print_flow_error(error);
- rte_exit(EXIT_FAILURE, "Error in creating flow\n");
- }
-
flows_list[flow_index++] = flow;
/*
@@ -1575,7 +1625,204 @@ insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id)
port_id, core_id, rules_count_per_core, cpu_time_used);
mc_pool.flows_record.insertion[port_id][core_id] = cpu_time_used;
- return flows_list;
+}
+
+static uint32_t push_counter[RTE_MAX_LCORE];
+
+/*
+ * Push enqueued async flow operations to the HW and pull completions.
+ *
+ * @enqueued: number of operations currently outstanding on this queue.
+ * @empty: when true, drain ALL outstanding operations (final flush);
+ *         when false, pull at most async_push_batch completions.
+ * @check_op_status: verify each pulled result is RTE_FLOW_OP_SUCCESS.
+ *
+ * Returns the number of completions pulled, or -1 with @error set on
+ * push/pull failure, per-operation failure, or pull timeout.
+ */
+static inline int
+push_pull_flows_async(int port_id, int queue_id, int core_id, uint32_t enqueued, bool empty,
+ bool check_op_status, struct rte_flow_error *error)
+{
+ /* Per-lcore completion buffers, sized for a full queue drain.
+  * NOTE(review): RTE_MAX_LCORE * MAX_ASYNC_QUEUE_SIZE entries of static
+  * storage — confirm this footprint is acceptable. */
+ static struct rte_flow_op_result results[RTE_MAX_LCORE][MAX_ASYNC_QUEUE_SIZE];
+ uint32_t to_pull = (empty || async_push_batch > enqueued) ? enqueued : async_push_batch;
+ uint32_t pulled_complete = 0;
+ uint32_t retries = 0;
+ int pulled, i;
+ int ret = 0;
+
+ /* Push periodically to give HW work to do */
+ ret = rte_flow_push(port_id, queue_id, error);
+ if (ret)
+ return ret;
+ push_counter[core_id]++;
+
+ /* On this core's first push, skip pulling so completions can accumulate
+  * and be drained in batches on subsequent calls (skipped on the final
+  * drain when empty is set). */
+ if (!empty && push_counter[core_id] == 1)
+ return 0;
+
+ /* Busy-poll until the requested number of completions is pulled. */
+ while (to_pull > 0) {
+ pulled = rte_flow_pull(port_id, queue_id, results[core_id], to_pull, error);
+ if (pulled < 0) {
+ return -1;
+ } else if (pulled == 0) {
+ /* Nothing ready yet: bound the spin with a retry budget. */
+ if (++retries > MAX_PULL_RETRIES) {
+ rte_flow_error_set(error, ETIMEDOUT,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Timeout waiting for async completions");
+ return -1;
+ }
+ rte_pause();
+ continue;
+ }
+ retries = 0;
+
+ to_pull -= pulled;
+ pulled_complete += pulled;
+ if (!check_op_status)
+ continue;
+
+ /* Fail fast on the first unsuccessful operation. */
+ for (i = 0; i < pulled; i++) {
+ if (results[core_id][i].status != RTE_FLOW_OP_SUCCESS) {
+ rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL, "Some flow rule insertion failed");
+ return -1;
+ }
+ }
+ }
+
+ return pulled_complete;
+}
+
+/*
+ * Insert this core's share of flow rules using the async (template) flow
+ * API: enqueue rules in batches of async_push_batch with postpone=true,
+ * then push/pull completions per batch. Records per-batch insertion
+ * times and the overall insertion rate into mc_pool.flows_record.
+ *
+ * @flows_list: caller-allocated array (rules_count_per_core + 1 slots)
+ *              filled with the created flow handles.
+ */
+static void
+insert_flows_async(int port_id, uint8_t core_id, uint16_t dst_port_id, struct rte_flow **flows_list)
+{
+ struct rte_flow *flow;
+ struct rte_flow_error error;
+ /* NOTE(review): clock_t holds rte_get_timer_cycles() (uint64_t) values
+  * below — potential narrowing; consider uint64_t for start/end_batch. */
+ clock_t start_batch, end_batch;
+ double first_flow_latency;
+ double cpu_time_used;
+ double insertion_rate;
+ double cpu_time_per_batch[MAX_BATCHES_COUNT] = {0};
+ double delta;
+ uint32_t flow_index;
+ uint32_t counter, batch_counter, start_counter = 0, end_counter;
+ int rules_batch_idx;
+ int rules_count_per_core;
+ uint32_t enqueued = 0;
+ uint32_t queue_id = core_id;
+ bool first_batch = true;
+ int pulled;
+
+ rules_count_per_core = rules_count / mc_pool.cores_count;
+
+ /* Cap the push batch to half the queue size so the queue never fills. */
+ if (async_push_batch > async_queue_size >> 1)
+ async_push_batch = async_queue_size >> 1;
+
+ /* Set boundaries of rules for each core. */
+ if (core_id)
+ start_counter = core_id * rules_count_per_core;
+ end_counter = (core_id + 1) * rules_count_per_core;
+
+ cpu_time_used = 0;
+ flow_index = 0;
+ push_counter[core_id] = 0;
+
+ if (flow_group > 0 && core_id == 0) {
+ /*
+ * Create global rule to jump into flow_group,
+ * this way the app will avoid the default rules.
+ *
+ * This rule will be created only once.
+ *
+ * Global rule:
+ * group 0 eth / end actions jump group <flow_group>
+ */
+
+ uint64_t global_items[MAX_ITEMS_NUM] = {0};
+ uint64_t global_actions[MAX_ACTIONS_NUM] = {0};
+ global_items[0] = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH);
+ global_actions[0] = FLOW_ITEM_MASK(RTE_FLOW_ACTION_TYPE_JUMP);
+ flow = generate_flow(port_id, 0, flow_attrs, global_items, global_actions,
+ flow_group, 0, 0, 0, 0, dst_port_id, core_id, rx_queues_count,
+ unique_data, max_priority, &error);
+
+ if (flow == NULL) {
+ print_flow_error(error);
+ rte_exit(EXIT_FAILURE, "Error in creating flow\n");
+ }
+ flows_list[flow_index++] = flow;
+ }
+
+ start_batch = rte_get_timer_cycles();
+ for (counter = start_counter; counter < end_counter;) {
+ /* batch adding flow rules, this avoids unnecessary checks for push/pull */
+ for (batch_counter = 0; batch_counter < async_push_batch && counter < end_counter;
+ batch_counter++, counter++) {
+ /* Create flow with postpone=true to batch operations */
+ flow = async_generate_flow(port_id, queue_id, counter, hairpin_queues_num,
+ encap_data, decap_data, dst_port_id, core_id,
+ rx_queues_count, unique_data, true, &error);
+
+ if (!flow) {
+ print_flow_error(error);
+ rte_exit(EXIT_FAILURE, "Error in creating async flow\n");
+ }
+
+ /* NOTE(review): breaking here drops the flow just created above
+  * from flows_list (it is never recorded) — confirm intended. */
+ if (force_quit)
+ break;
+
+ flows_list[flow_index++] = flow;
+ enqueued++;
+
+ /*
+ * Save the insertion rate for rules batch.
+ * Check if the insertion reached the rules
+ * batch counter, then save the insertion rate
+ * for this batch.
+ */
+ if (!((counter + 1) % rules_batch)) {
+ end_batch = rte_get_timer_cycles();
+ delta = (double)(end_batch - start_batch);
+ rules_batch_idx = ((counter + 1) / rules_batch) - 1;
+ cpu_time_per_batch[rules_batch_idx] = delta / rte_get_timer_hz();
+ cpu_time_used += cpu_time_per_batch[rules_batch_idx];
+ start_batch = rte_get_timer_cycles();
+ }
+ }
+
+ /* Push the batch to HW and drain already-available completions. */
+ pulled = push_pull_flows_async(port_id, queue_id, core_id, enqueued, false, true,
+ &error);
+ if (pulled < 0) {
+ print_flow_error(error);
+ rte_exit(EXIT_FAILURE, "Error push/pull async operations\n");
+ }
+
+ enqueued -= pulled;
+
+ if (first_batch) {
+ first_flow_latency = (double)(rte_get_timer_cycles() - start_batch);
+ first_flow_latency /= rte_get_timer_hz();
+ /* In millisecond */
+ first_flow_latency *= 1000;
+ printf(":: First Flow Batch Latency (Async) :: Port %d :: First batch (%u) "
+ "installed in %f milliseconds\n",
+ port_id, async_push_batch, first_flow_latency);
+ first_batch = false;
+ }
+ }
+
+ /* Final flush: drain every remaining outstanding operation. */
+ if (push_pull_flows_async(port_id, queue_id, core_id, enqueued, true, true, &error) < 0) {
+ print_flow_error(error);
+ rte_exit(EXIT_FAILURE, "Error final push/pull async operations\n");
+ }
+
+ /* Print insertion rates for all batches */
+ if (dump_iterations)
+ print_rules_batches(cpu_time_per_batch);
+
+ printf(":: Port %d :: Core %d boundaries (Async) :: start @[%d] - end @[%d]\n", port_id,
+ core_id, start_counter, end_counter - 1);
+
+ /* Insertion rate for all rules in one core */
+ if (cpu_time_used > 0) {
+ insertion_rate = ((double)rules_count_per_core / cpu_time_used) / 1000;
+ printf(":: Port %d :: Core %d :: Async rules insertion rate -> %f K Rule/Sec\n",
+ port_id, core_id, insertion_rate);
+ }
+ printf(":: Port %d :: Core %d :: The time for creating %d async rules is %f seconds\n",
+ port_id, core_id, rules_count_per_core, cpu_time_used);
+
+ mc_pool.flows_record.insertion[port_id][core_id] = cpu_time_used;
+}
static void
@@ -1585,12 +1832,18 @@ flows_handler(uint8_t core_id)
uint16_t port_idx = 0;
uint16_t nr_ports;
int port_id;
+ int rules_count_per_core;
nr_ports = rte_eth_dev_count_avail();
if (rules_batch > rules_count)
rules_batch = rules_count;
+ rules_count_per_core = rules_count / mc_pool.cores_count;
+ flows_list = malloc(sizeof(struct rte_flow *) * (rules_count_per_core + 1));
+ if (flows_list == NULL)
+ rte_exit(EXIT_FAILURE, "No Memory available!\n");
+
printf(":: Rules Count per port: %d\n\n", rules_count);
for (port_id = 0; port_id < nr_ports; port_id++) {
@@ -1602,10 +1855,10 @@ flows_handler(uint8_t core_id)
mc_pool.last_alloc[core_id] = (int64_t)dump_socket_mem(stdout);
if (has_meter())
meters_handler(port_id, core_id, METER_CREATE);
- flows_list = insert_flows(port_id, core_id,
- dst_ports[port_idx++]);
- if (flows_list == NULL)
- rte_exit(EXIT_FAILURE, "Error: Insertion Failed!\n");
+ if (async_mode)
+ insert_flows_async(port_id, core_id, dst_ports[port_idx++], flows_list);
+ else
+ insert_flows(port_id, core_id, dst_ports[port_idx++], flows_list);
mc_pool.current_alloc[core_id] = (int64_t)dump_socket_mem(stdout);
if (query_flag)
@@ -2212,6 +2465,16 @@ init_port(void)
}
}
+ /* Configure async flow engine before device start */
+ if (async_mode) {
+ ret = async_flow_init_port(port_id, mc_pool.cores_count, async_queue_size,
+ flow_items, flow_actions, flow_attrs, flow_group,
+ rules_count);
+ if (ret != 0)
+ rte_exit(EXIT_FAILURE, "Failed to init async flow on port %d\n",
+ port_id);
+ }
+
ret = rte_eth_dev_start(port_id);
if (ret < 0)
rte_exit(EXIT_FAILURE,
@@ -2291,6 +2554,8 @@ main(int argc, char **argv)
RTE_ETH_FOREACH_DEV(port) {
rte_flow_flush(port, &error);
+ if (async_mode)
+ async_flow_cleanup_port(port);
if (rte_eth_dev_stop(port) != 0)
printf("Failed to stop device on port %u\n", port);
rte_eth_dev_close(port);
diff --git a/app/test-flow-perf/meson.build b/app/test-flow-perf/meson.build
index e101449e32..70d8671a54 100644
--- a/app/test-flow-perf/meson.build
+++ b/app/test-flow-perf/meson.build
@@ -1,8 +1,15 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright(c) 2020 Mellanox Technologies, Ltd
+if is_windows
+ build = false
+ reason = 'not supported on Windows'
+ subdir_done()
+endif
+
sources = files(
'actions_gen.c',
+ 'async_flow.c',
'flow_gen.c',
'items_gen.c',
'main.c',
--
2.43.0
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH v4] test/flow: add support for async API
2026-02-25 22:23 ` Stephen Hemminger
2026-03-01 23:29 ` [PATCH v2] " Maxime Peim
2026-03-02 10:57 ` [PATCH v3] " Maxime Peim
@ 2026-03-02 14:35 ` Maxime Peim
2026-03-09 12:52 ` [PATCH v5] " Maxime Peim
3 siblings, 0 replies; 8+ messages in thread
From: Maxime Peim @ 2026-03-02 14:35 UTC (permalink / raw)
To: stephen; +Cc: dev
Add async flow API mode to test-flow-perf application for improved
flow rule insertion performance. The async API allows batching flow
rule creation operations and processing completions in bulk, reducing
per-rule overhead.
New command line options:
--async: enable async flow API mode
--async-queue-size=N: size of async queues (default: 1024)
--async-push-batch=N: flows to batch before push (default: 256)
Signed-off-by: Maxime Peim <maxime.peim@gmail.com>
---
v2:
- Replace per-flow stack allocation with pre-allocated slot pool;
flat buffers are initialized once at init time and the hot path
only patches per-flow item/action values into a pre-set slot
- Fix alloca misuse: use heap allocation for queue_attr_list, round
queue_size to power of 2 for bitmask wrapping, add bounds checks
- Fix race on file-scope flow variable, premature latency
measurement, and integer division in rate calculation
- Drop unrelated lgopts reformatting
- Use malloc instead of rte_zmalloc for non-dataplane allocations
- Various robustness and style fixes
v3:
- Update meson.build to exclude Windows build for flow perf test
- Fix checkstyle
- Remove cast from void* to uintptr_t
- Add name to mailmap and maintainers
v4:
- Use RTE_CAST_PTR for void pointer casts (const-correctness)
- Simplify init_slot_pool, fill_items_template, and
fill_actions_template signatures; export item_spec_size and
action_conf_size helpers
- Use goto-based centralized error cleanup
- Remove redundant queue_id variable, use core_id directly
- Make push_counter a function-local static
- Narrow n_items/n_actions from uint32_t to uint8_t
- Consistent conf variable naming in action slot-fill handlers
.mailmap | 1 +
MAINTAINERS | 1 +
app/test-flow-perf/actions_gen.c | 276 ++++++++++-
app/test-flow-perf/actions_gen.h | 31 ++
app/test-flow-perf/async_flow.c | 783 +++++++++++++++++++++++++++++++
app/test-flow-perf/async_flow.h | 54 +++
app/test-flow-perf/items_gen.c | 56 +++
app/test-flow-perf/items_gen.h | 6 +
app/test-flow-perf/main.c | 283 ++++++++++-
app/test-flow-perf/meson.build | 7 +
10 files changed, 1458 insertions(+), 40 deletions(-)
create mode 100644 app/test-flow-perf/async_flow.c
create mode 100644 app/test-flow-perf/async_flow.h
diff --git a/.mailmap b/.mailmap
index 6c4c977dde..a0141402c3 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1044,6 +1044,7 @@ Mauro Annarumma <mauroannarumma@hotmail.it>
Maxime Coquelin <maxime.coquelin@redhat.com>
Maxime Gouin <maxime.gouin@6wind.com>
Maxime Leroy <maxime@leroys.fr> <maxime.leroy@6wind.com>
+Maxime Peim <maxime.peim@gmail.com>
Md Fahad Iqbal Polash <md.fahad.iqbal.polash@intel.com>
Megha Ajmera <megha.ajmera@intel.com>
Meijuan Zhao <meijuanx.zhao@intel.com>
diff --git a/MAINTAINERS b/MAINTAINERS
index 1b2f1ed2ba..d4c01037c8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1964,6 +1964,7 @@ F: doc/guides/tools/dmaperf.rst
Flow performance tool
M: Wisam Jaddo <wisamm@nvidia.com>
+M: Maxime Peim <maxime.peim@gmail.com>
F: app/test-flow-perf/
F: doc/guides/tools/flow-perf.rst
diff --git a/app/test-flow-perf/actions_gen.c b/app/test-flow-perf/actions_gen.c
index 9d102e3af4..624ecfdf43 100644
--- a/app/test-flow-perf/actions_gen.c
+++ b/app/test-flow-perf/actions_gen.c
@@ -36,27 +36,7 @@ struct additional_para {
bool unique_data;
};
-/* Storage for struct rte_flow_action_raw_encap including external data. */
-struct action_raw_encap_data {
- struct rte_flow_action_raw_encap conf;
- uint8_t data[128];
- uint8_t preserve[128];
- uint16_t idx;
-};
-
-/* Storage for struct rte_flow_action_raw_decap including external data. */
-struct action_raw_decap_data {
- struct rte_flow_action_raw_decap conf;
- uint8_t data[128];
- uint16_t idx;
-};
-
-/* Storage for struct rte_flow_action_rss including external data. */
-struct action_rss_data {
- struct rte_flow_action_rss conf;
- uint8_t key[40];
- uint16_t queue[128];
-};
+/* Compound action data structs defined in actions_gen.h */
static void
add_mark(struct rte_flow_action *actions,
@@ -1165,3 +1145,257 @@ fill_actions(struct rte_flow_action *actions, uint64_t *flow_actions,
free(queues);
free(hairpin_queues);
}
+
+/*
+ * Return the size in bytes of the configuration object for the given
+ * action type, used to carve per-flow conf storage out of the slot pool.
+ * Compound actions (RSS, RAW_ENCAP/DECAP, VXLAN_ENCAP) report the size
+ * of their wrapper struct including trailing external data.
+ * Conf-less actions — and any type not listed — report 0.
+ */
+size_t
+action_conf_size(enum rte_flow_action_type type)
+{
+ switch (type) {
+ case RTE_FLOW_ACTION_TYPE_MARK:
+ return sizeof(struct rte_flow_action_mark);
+ case RTE_FLOW_ACTION_TYPE_QUEUE:
+ return sizeof(struct rte_flow_action_queue);
+ case RTE_FLOW_ACTION_TYPE_JUMP:
+ return sizeof(struct rte_flow_action_jump);
+ case RTE_FLOW_ACTION_TYPE_RSS:
+ return sizeof(struct action_rss_data);
+ case RTE_FLOW_ACTION_TYPE_SET_META:
+ return sizeof(struct rte_flow_action_set_meta);
+ case RTE_FLOW_ACTION_TYPE_SET_TAG:
+ return sizeof(struct rte_flow_action_set_tag);
+ case RTE_FLOW_ACTION_TYPE_PORT_ID:
+ return sizeof(struct rte_flow_action_port_id);
+ case RTE_FLOW_ACTION_TYPE_COUNT:
+ return sizeof(struct rte_flow_action_count);
+ case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
+ return sizeof(struct rte_flow_action_set_mac);
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
+ return sizeof(struct rte_flow_action_set_ipv4);
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
+ return sizeof(struct rte_flow_action_set_ipv6);
+ case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
+ return sizeof(struct rte_flow_action_set_tp);
+ case RTE_FLOW_ACTION_TYPE_INC_TCP_ACK:
+ case RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK:
+ case RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ:
+ case RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ:
+ /* TCP seq/ack adjustments take a bare big-endian 32-bit value. */
+ return sizeof(rte_be32_t);
+ case RTE_FLOW_ACTION_TYPE_SET_TTL:
+ return sizeof(struct rte_flow_action_set_ttl);
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP:
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP:
+ return sizeof(struct rte_flow_action_set_dscp);
+ case RTE_FLOW_ACTION_TYPE_METER:
+ return sizeof(struct rte_flow_action_meter);
+ case RTE_FLOW_ACTION_TYPE_RAW_ENCAP:
+ return sizeof(struct action_raw_encap_data);
+ case RTE_FLOW_ACTION_TYPE_RAW_DECAP:
+ return sizeof(struct action_raw_decap_data);
+ case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
+ /* Wrapper + 5 pattern items (eth/ipv4/udp/vxlan/end) + their specs. */
+ return sizeof(struct rte_flow_action_vxlan_encap) +
+ 5 * sizeof(struct rte_flow_item) + sizeof(struct rte_flow_item_eth) +
+ sizeof(struct rte_flow_item_ipv4) + sizeof(struct rte_flow_item_udp) +
+ sizeof(struct rte_flow_item_vxlan);
+ case RTE_FLOW_ACTION_TYPE_MODIFY_FIELD:
+ return sizeof(struct rte_flow_action_modify_field);
+ /* Zero-conf types */
+ case RTE_FLOW_ACTION_TYPE_DROP:
+ case RTE_FLOW_ACTION_TYPE_FLAG:
+ case RTE_FLOW_ACTION_TYPE_DEC_TTL:
+ case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
+ return 0;
+ default:
+ /* Unknown types are treated as conf-less. */
+ return 0;
+ }
+}
+
+/*
+ * Build the async flow API actions template: for each action selected in
+ * @flow_actions, emit the action type with a static conf (where needed)
+ * into @actions and the matching all-ones mask into @masks. Also tallies
+ * per-port resource needs (counters, meters, ...) into @port_attr and
+ * reports via @need_wire_orig_table whether a wire-origin table is
+ * required (mark/queue/rss actions).
+ */
+void
+fill_actions_template(struct rte_flow_action *actions, struct rte_flow_action *masks,
+ uint64_t *flow_actions, struct rte_flow_port_attr *port_attr,
+ bool *need_wire_orig_table)
+{
+ uint8_t actions_counter = 0;
+ uint8_t i, j;
+
+ *need_wire_orig_table = false;
+ memset(port_attr, 0, sizeof(*port_attr));
+
+ /* Static configurations for actions that need them in templates */
+ static struct rte_flow_action_mark mark_conf = {
+ .id = 1,
+ };
+ static struct rte_flow_action_queue queue_conf = {
+ .index = 0,
+ };
+ static struct rte_flow_action_port_id port_id_conf = {
+ .id = 0,
+ };
+ static struct rte_flow_action_jump jump_conf = {
+ .group = 1,
+ };
+ static struct rte_flow_action_modify_field set_meta_conf = {
+ .operation = RTE_FLOW_MODIFY_SET,
+ .dst = {.field = RTE_FLOW_FIELD_META},
+ .src = {
+ .field = RTE_FLOW_FIELD_VALUE,
+ .value = {0, 0, 0, META_DATA},
+ },
+ .width = 32,
+ };
+
+ /* Static mask configurations for each action type */
+ static struct rte_flow_action_mark mark_mask = {
+ .id = UINT32_MAX,
+ };
+ static struct rte_flow_action_queue queue_mask = {
+ .index = UINT16_MAX,
+ };
+ static struct rte_flow_action_jump jump_mask = {
+ .group = UINT32_MAX,
+ };
+ static struct rte_flow_action_rss rss_mask = {
+ .level = UINT32_MAX,
+ .types = UINT64_MAX,
+ };
+ static struct rte_flow_action_set_meta set_meta_mask = {
+ .data = UINT32_MAX,
+ .mask = UINT32_MAX,
+ };
+ static struct rte_flow_action_set_tag set_tag_mask = {
+ .data = UINT32_MAX,
+ .mask = UINT32_MAX,
+ .index = UINT8_MAX,
+ };
+ static struct rte_flow_action_port_id port_id_mask = {
+ .id = UINT32_MAX,
+ };
+ static struct rte_flow_action_count count_mask;
+ static struct rte_flow_action_set_mac set_mac_mask = {
+ .mac_addr = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ };
+ static struct rte_flow_action_set_ipv6 set_ipv6_mask = {
+ .ipv6_addr.a = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff}};
+ static struct rte_flow_action_set_ipv4 set_ipv4_mask = {
+ .ipv4_addr = UINT32_MAX,
+ };
+ static struct rte_flow_action_set_tp set_tp_mask = {
+ .port = UINT16_MAX,
+ };
+ static rte_be32_t tcp_seq_ack_mask = UINT32_MAX;
+ static struct rte_flow_action_set_ttl set_ttl_mask = {
+ .ttl_value = UINT8_MAX,
+ };
+ static struct rte_flow_action_set_dscp set_dscp_mask = {
+ .dscp = UINT8_MAX,
+ };
+ static struct rte_flow_action_meter meter_mask = {
+ .mtr_id = UINT32_MAX,
+ };
+
+ /* Mapping from flow-perf action selection bits to template entries. */
+ static const struct {
+ uint64_t flow_mask;
+ enum rte_flow_action_type type;
+ const void *action_conf;
+ const void *action_mask;
+ const bool need_wire_orig_table;
+ } template_actions[] = {
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_MARK), RTE_FLOW_ACTION_TYPE_MARK, &mark_conf,
+ &mark_mask, true},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_COUNT), RTE_FLOW_ACTION_TYPE_COUNT, NULL,
+ &count_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_MODIFY_FIELD),
+ RTE_FLOW_ACTION_TYPE_MODIFY_FIELD, &set_meta_conf, &set_meta_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TAG), RTE_FLOW_ACTION_TYPE_SET_TAG, NULL,
+ &set_tag_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_FLAG), RTE_FLOW_ACTION_TYPE_FLAG, NULL, NULL,
+ false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_MAC_SRC),
+ RTE_FLOW_ACTION_TYPE_SET_MAC_SRC, NULL, &set_mac_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_MAC_DST),
+ RTE_FLOW_ACTION_TYPE_SET_MAC_DST, NULL, &set_mac_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC),
+ RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC, NULL, &set_ipv4_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV4_DST),
+ RTE_FLOW_ACTION_TYPE_SET_IPV4_DST, NULL, &set_ipv4_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC),
+ RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC, NULL, &set_ipv6_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV6_DST),
+ RTE_FLOW_ACTION_TYPE_SET_IPV6_DST, NULL, &set_ipv6_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TP_SRC), RTE_FLOW_ACTION_TYPE_SET_TP_SRC,
+ NULL, &set_tp_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TP_DST), RTE_FLOW_ACTION_TYPE_SET_TP_DST,
+ NULL, &set_tp_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_INC_TCP_ACK),
+ RTE_FLOW_ACTION_TYPE_INC_TCP_ACK, NULL, &tcp_seq_ack_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK),
+ RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK, NULL, &tcp_seq_ack_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ),
+ RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ, NULL, &tcp_seq_ack_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ),
+ RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ, NULL, &tcp_seq_ack_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TTL), RTE_FLOW_ACTION_TYPE_SET_TTL, NULL,
+ &set_ttl_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DEC_TTL), RTE_FLOW_ACTION_TYPE_DEC_TTL, NULL,
+ NULL, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP),
+ RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP, NULL, &set_dscp_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP),
+ RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP, NULL, &set_dscp_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_QUEUE), RTE_FLOW_ACTION_TYPE_QUEUE,
+ &queue_conf, &queue_mask, true},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_RSS), RTE_FLOW_ACTION_TYPE_RSS, NULL,
+ &rss_mask, true},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_JUMP), RTE_FLOW_ACTION_TYPE_JUMP, &jump_conf,
+ &jump_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_PORT_ID), RTE_FLOW_ACTION_TYPE_PORT_ID,
+ &port_id_conf, &port_id_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DROP), RTE_FLOW_ACTION_TYPE_DROP, NULL, NULL,
+ false},
+ {HAIRPIN_QUEUE_ACTION, RTE_FLOW_ACTION_TYPE_QUEUE, &queue_conf, &queue_mask, false},
+ {HAIRPIN_RSS_ACTION, RTE_FLOW_ACTION_TYPE_RSS, NULL, &rss_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_METER), RTE_FLOW_ACTION_TYPE_METER, NULL,
+ &meter_mask, false},
+ };
+
+ for (j = 0; j < MAX_ACTIONS_NUM; j++) {
+ if (flow_actions[j] == 0)
+ break;
+ for (i = 0; i < RTE_DIM(template_actions); i++) {
+ if ((flow_actions[j] & template_actions[i].flow_mask) == 0)
+ continue;
+
+ /* Account per-port resource requirements for this action.
+  * NOTE(review): AGE, CONNTRACK and QUOTA never occur in
+  * template_actions[], so those cases are currently dead. */
+ switch (template_actions[i].type) {
+ case RTE_FLOW_ACTION_TYPE_COUNT:
+ port_attr->nb_counters++;
+ break;
+ case RTE_FLOW_ACTION_TYPE_AGE:
+ port_attr->nb_aging_objects++;
+ break;
+ case RTE_FLOW_ACTION_TYPE_METER:
+ port_attr->nb_meters++;
+ break;
+ case RTE_FLOW_ACTION_TYPE_CONNTRACK:
+ port_attr->nb_conn_tracks++;
+ break;
+ case RTE_FLOW_ACTION_TYPE_QUOTA:
+ port_attr->nb_quotas++;
+ /* fallthrough — NOTE(review): missing break? harmless
+  * since default is empty, but likely unintended. */
+ default:
+ break;
+ }
+
+ actions[actions_counter].type = template_actions[i].type;
+ actions[actions_counter].conf = template_actions[i].action_conf;
+ masks[actions_counter].type = template_actions[i].type;
+ masks[actions_counter].conf = template_actions[i].action_mask;
+ *need_wire_orig_table |= template_actions[i].need_wire_orig_table;
+ break;
+ }
+ }
+
+ actions[actions_counter].type = RTE_FLOW_ACTION_TYPE_END;
+ masks[actions_counter].type = RTE_FLOW_ACTION_TYPE_END;
+}
diff --git a/app/test-flow-perf/actions_gen.h b/app/test-flow-perf/actions_gen.h
index 9e13b164f9..cd4d1753bc 100644
--- a/app/test-flow-perf/actions_gen.h
+++ b/app/test-flow-perf/actions_gen.h
@@ -17,9 +17,40 @@
#define RTE_VXLAN_GPE_UDP_PORT 250
#define RTE_GENEVE_UDP_PORT 6081
+/* Compound action data structures (needed by async_flow.c for slot init) */
+
+/* Storage for struct rte_flow_action_raw_encap including external data. */
+struct action_raw_encap_data {
+ struct rte_flow_action_raw_encap conf;
+ uint8_t data[128];
+ uint8_t preserve[128];
+ uint16_t idx;
+};
+
+/* Storage for struct rte_flow_action_raw_decap including external data. */
+struct action_raw_decap_data {
+ struct rte_flow_action_raw_decap conf;
+ uint8_t data[128];
+ uint16_t idx;
+};
+
+/* Storage for struct rte_flow_action_rss including external data. */
+struct action_rss_data {
+ struct rte_flow_action_rss conf;
+ uint8_t key[40];
+ uint16_t queue[128];
+};
+
void fill_actions(struct rte_flow_action *actions, uint64_t *flow_actions,
uint32_t counter, uint16_t next_table, uint16_t hairpinq,
uint64_t encap_data, uint64_t decap_data, uint8_t core_idx,
bool unique_data, uint8_t rx_queues_count, uint16_t dst_port);
+/* Fill actions template for async flow API (types only, no values) */
+void fill_actions_template(struct rte_flow_action *actions, struct rte_flow_action *masks,
+ uint64_t *flow_actions, struct rte_flow_port_attr *port_attr,
+ bool *need_wire_orig_table);
+
+size_t action_conf_size(enum rte_flow_action_type type);
+
#endif /* FLOW_PERF_ACTION_GEN */
diff --git a/app/test-flow-perf/async_flow.c b/app/test-flow-perf/async_flow.c
new file mode 100644
index 0000000000..3ead41393c
--- /dev/null
+++ b/app/test-flow-perf/async_flow.c
@@ -0,0 +1,783 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2026 Maxime Peim <maxime.peim@gmail.com>
+ *
+ * This file contains the async flow API implementation
+ * for the flow-perf application.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <rte_bitops.h>
+#include <rte_common.h>
+#include <rte_ethdev.h>
+#include <rte_flow.h>
+#include <rte_vxlan.h>
+
+#include "actions_gen.h"
+#include "async_flow.h"
+#include "flow_gen.h"
+#include "items_gen.h"
+
+/* Max iterations when draining pending async completions during cleanup */
+#define DRAIN_MAX_ITERATIONS 100
+
+/* Per-port async flow resources */
+static struct async_flow_resources port_resources[MAX_PORTS];
+
+/*
+ * Initialize compound action types within a pre-allocated slot.
+ * Called once per slot during pool init to set up internal pointers
+ * for RSS, RAW_ENCAP, RAW_DECAP and VXLAN_ENCAP actions.
+ */
+static void
+init_slot_compound_actions(struct rte_flow_action *actions, uint32_t n_actions,
+ const size_t *action_conf_sizes)
+{
+ uint32_t i;
+
+ for (i = 0; i < n_actions; i++) {
+ if (action_conf_sizes[i] == 0)
+ continue;
+
+ switch (actions[i].type) {
+ case RTE_FLOW_ACTION_TYPE_RSS: {
+ struct action_rss_data *rss = RTE_CAST_PTR(typeof(rss), actions[i].conf);
+ rss->conf.func = RTE_ETH_HASH_FUNCTION_DEFAULT;
+ rss->conf.level = 0;
+ rss->conf.types = GET_RSS_HF();
+ rss->conf.key_len = sizeof(rss->key);
+ rss->conf.key = rss->key;
+ rss->conf.queue = rss->queue;
+ rss->key[0] = 1;
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: {
+ struct action_raw_encap_data *encap =
+ RTE_CAST_PTR(typeof(encap), actions[i].conf);
+ encap->conf.data = encap->data;
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_RAW_DECAP: {
+ struct action_raw_decap_data *decap =
+ RTE_CAST_PTR(typeof(decap), actions[i].conf);
+ decap->conf.data = decap->data;
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: {
+ /*
+ * Layout within the conf area:
+ * struct rte_flow_action_vxlan_encap
+ * struct rte_flow_item[5]
+ * struct rte_flow_item_eth
+ * struct rte_flow_item_ipv4
+ * struct rte_flow_item_udp
+ * struct rte_flow_item_vxlan
+ */
+ uint8_t *base = RTE_CAST_PTR(typeof(base), actions[i].conf);
+ struct rte_flow_action_vxlan_encap *ve =
+ (struct rte_flow_action_vxlan_encap *)base;
+ struct rte_flow_item *items =
+ (struct rte_flow_item
+ *)(base + sizeof(struct rte_flow_action_vxlan_encap));
+ uint8_t *data = (uint8_t *)(items + 5);
+
+ struct rte_flow_item_eth *item_eth = (struct rte_flow_item_eth *)data;
+ data += sizeof(struct rte_flow_item_eth);
+ struct rte_flow_item_ipv4 *item_ipv4 = (struct rte_flow_item_ipv4 *)data;
+ data += sizeof(struct rte_flow_item_ipv4);
+ struct rte_flow_item_udp *item_udp = (struct rte_flow_item_udp *)data;
+ data += sizeof(struct rte_flow_item_udp);
+ struct rte_flow_item_vxlan *item_vxlan = (struct rte_flow_item_vxlan *)data;
+
+ memset(item_eth, 0, sizeof(*item_eth));
+ memset(item_ipv4, 0, sizeof(*item_ipv4));
+ memset(item_udp, 0, sizeof(*item_udp));
+ memset(item_vxlan, 0, sizeof(*item_vxlan));
+
+ item_ipv4->hdr.src_addr = RTE_IPV4(127, 0, 0, 1);
+ item_ipv4->hdr.version_ihl = RTE_IPV4_VHL_DEF;
+ item_udp->hdr.dst_port = RTE_BE16(RTE_VXLAN_DEFAULT_PORT);
+ item_vxlan->hdr.vni[2] = 1;
+
+ items[0].type = RTE_FLOW_ITEM_TYPE_ETH;
+ items[0].spec = item_eth;
+ items[0].mask = item_eth;
+ items[1].type = RTE_FLOW_ITEM_TYPE_IPV4;
+ items[1].spec = item_ipv4;
+ items[1].mask = item_ipv4;
+ items[2].type = RTE_FLOW_ITEM_TYPE_UDP;
+ items[2].spec = item_udp;
+ items[2].mask = item_udp;
+ items[3].type = RTE_FLOW_ITEM_TYPE_VXLAN;
+ items[3].spec = item_vxlan;
+ items[3].mask = item_vxlan;
+ items[4].type = RTE_FLOW_ITEM_TYPE_END;
+
+ ve->definition = items;
+ break;
+ }
+ default:
+ break;
+ }
+ }
+}
+
+/*
+ * Allocate and pre-initialize all per-slot flat buffers.
+ * Returns 0 on success.
+ */
+static int
+init_slot_pool(struct async_flow_resources *res, uint32_t nb_queues, uint32_t queue_size,
+ const struct rte_flow_item *pattern, const struct rte_flow_action *template_actions)
+{
+ size_t item_spec_sizes[MAX_ITEMS_NUM], action_conf_sizes[MAX_ACTIONS_NUM];
+ uint32_t items_array_bytes, actions_array_bytes;
+ uint32_t spec_data_bytes, conf_data_bytes;
+ uint32_t slot_size, num_slots;
+ uint32_t s, i;
+ uint8_t n_items, n_actions;
+ uint8_t *mptr;
+ int ret = 0;
+
+	/* Record per-item spec sizes and compute the total shared mask size */
+ spec_data_bytes = 0;
+ for (n_items = 0; pattern[n_items].type != RTE_FLOW_ITEM_TYPE_END; n_items++) {
+ item_spec_sizes[n_items] = item_spec_size(pattern[n_items].type);
+ spec_data_bytes += item_spec_sizes[n_items];
+ }
+
+ /* END */
+ item_spec_sizes[n_items++] = 0;
+
+ conf_data_bytes = 0;
+ for (n_actions = 0; template_actions[n_actions].type != RTE_FLOW_ACTION_TYPE_END;
+ n_actions++) {
+ action_conf_sizes[n_actions] = action_conf_size(template_actions[n_actions].type);
+ conf_data_bytes += action_conf_sizes[n_actions];
+ }
+
+ /* END */
+ action_conf_sizes[n_actions++] = 0;
+
+ /* Compute per-slot layout sizes */
+ items_array_bytes = n_items * sizeof(struct rte_flow_item);
+ actions_array_bytes = n_actions * sizeof(struct rte_flow_action);
+
+ slot_size = RTE_ALIGN_CEIL(items_array_bytes + actions_array_bytes + spec_data_bytes +
+ conf_data_bytes,
+ RTE_CACHE_LINE_SIZE);
+
+ num_slots = queue_size * nb_queues;
+
+ /* Store layout info */
+ res->slot_size = slot_size;
+ res->slots_per_queue = queue_size;
+ res->nb_queues = nb_queues;
+ res->n_items = n_items;
+ res->n_actions = n_actions;
+
+ /* Allocate shared masks */
+ if (spec_data_bytes > 0) {
+ res->shared_masks = aligned_alloc(RTE_CACHE_LINE_SIZE, spec_data_bytes);
+ if (res->shared_masks == NULL) {
+ fprintf(stderr, "Failed to allocate shared masks (%u bytes)\n",
+ spec_data_bytes);
+ return -ENOMEM;
+ }
+ memset(res->shared_masks, 0, spec_data_bytes);
+
+ /* Copy mask data from template pattern */
+ mptr = res->shared_masks;
+ for (i = 0; i < n_items; i++) {
+ if (item_spec_sizes[i] > 0 && pattern[i].mask != NULL)
+ memcpy(mptr, pattern[i].mask, item_spec_sizes[i]);
+ mptr += item_spec_sizes[i];
+ }
+ }
+
+ /* Allocate per-slot pool */
+ /* slot_size is already cache-line aligned, so total is a multiple */
+ res->slot_pool = aligned_alloc(RTE_CACHE_LINE_SIZE, (size_t)num_slots * slot_size);
+ if (res->slot_pool == NULL) {
+ fprintf(stderr, "Failed to allocate slot pool (%u slots * %u bytes)\n", num_slots,
+ slot_size);
+ ret = -ENOMEM;
+ goto free_shared_masks;
+ }
+ memset(res->slot_pool, 0, (size_t)num_slots * slot_size);
+
+ /* Pre-initialize every slot */
+ for (s = 0; s < num_slots; s++) {
+ uint8_t *slot = res->slot_pool + (size_t)s * slot_size;
+ struct rte_flow_item *items = (struct rte_flow_item *)slot;
+ struct rte_flow_action *actions =
+ (struct rte_flow_action *)(slot + items_array_bytes);
+ uint8_t *data = slot + items_array_bytes + actions_array_bytes;
+
+ /* Pre-set items: spec → per-slot data, mask → shared masks */
+ mptr = res->shared_masks;
+ for (i = 0; i < n_items; i++) {
+ items[i].type = pattern[i].type;
+ if (item_spec_sizes[i] > 0) {
+ items[i].spec = data;
+ items[i].mask = mptr;
+ data += item_spec_sizes[i];
+ mptr += item_spec_sizes[i];
+ }
+ }
+
+ /* Pre-set actions: conf → per-slot data */
+ for (i = 0; i < n_actions; i++) {
+ actions[i].type = template_actions[i].type;
+ if (action_conf_sizes[i] > 0) {
+ actions[i].conf = data;
+ data += action_conf_sizes[i];
+ }
+ }
+
+ /* Initialize compound action types (RSS, RAW_ENCAP, etc.) */
+ init_slot_compound_actions(actions, n_actions, action_conf_sizes);
+ }
+
+ /* Allocate and initialize per-queue slot tracking */
+ res->queues =
+ aligned_alloc(RTE_CACHE_LINE_SIZE, nb_queues * sizeof(struct async_flow_queue));
+ if (res->queues == NULL) {
+ fprintf(stderr, "Failed to allocate queue structs (%u queues)\n", nb_queues);
+ ret = -ENOMEM;
+ goto free_slot_pool;
+ }
+ memset(res->queues, 0, nb_queues * sizeof(struct async_flow_queue));
+ for (s = 0; s < nb_queues; s++) {
+ res->queues[s].slots = res->slot_pool + (size_t)s * queue_size * slot_size;
+ res->queues[s].head = 0;
+ }
+
+ printf(":: Slot pool: %u slots * %u bytes = %u KB (shared masks: %u bytes)\n", num_slots,
+ slot_size, (num_slots * slot_size) / 1024, spec_data_bytes);
+
+ return 0;
+
+free_slot_pool:
+ free(res->slot_pool);
+ res->slot_pool = NULL;
+free_shared_masks:
+ if (res->shared_masks) {
+ free(res->shared_masks);
+ res->shared_masks = NULL;
+ }
+ return ret;
+}
+
+/*
+ * Hot-path: update per-flow item values through pre-set pointers.
+ * Only IPv4/IPv6 src_addr varies per flow (based on counter).
+ */
+static void
+update_item_values(struct rte_flow_item *items, uint32_t counter)
+{
+ uint8_t i;
+
+ for (i = 0; items[i].type != RTE_FLOW_ITEM_TYPE_END; i++) {
+ switch (items[i].type) {
+ case RTE_FLOW_ITEM_TYPE_IPV4: {
+ struct rte_flow_item_ipv4 *spec = RTE_CAST_PTR(typeof(spec), items[i].spec);
+ spec->hdr.src_addr = RTE_BE32(counter);
+ break;
+ }
+ case RTE_FLOW_ITEM_TYPE_IPV6: {
+ struct rte_flow_item_ipv6 *spec = RTE_CAST_PTR(typeof(spec), items[i].spec);
+ uint8_t j;
+ for (j = 0; j < 4; j++)
+ spec->hdr.src_addr.a[15 - j] = counter >> (j * 8);
+ break;
+ }
+ default:
+ break;
+ }
+ }
+}
+
+/*
+ * Hot-path: update per-flow action values through pre-set pointers.
+ */
+static void
+update_action_values(struct rte_flow_action *actions, uint32_t counter, uint16_t hairpinq,
+ uint64_t encap_data, uint64_t decap_data, __rte_unused uint8_t core_idx,
+ bool unique_data, uint8_t rx_queues_count, uint16_t dst_port)
+{
+ uint8_t i;
+
+ for (i = 0; actions[i].type != RTE_FLOW_ACTION_TYPE_END; i++) {
+ switch (actions[i].type) {
+ case RTE_FLOW_ACTION_TYPE_MARK: {
+ struct rte_flow_action_mark *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ conf->id = (counter % 255) + 1;
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_QUEUE: {
+ struct rte_flow_action_queue *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ conf->index = hairpinq ? (counter % hairpinq) + rx_queues_count :
+ counter % rx_queues_count;
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_METER: {
+ struct rte_flow_action_meter *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ conf->mtr_id = counter;
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_RSS: {
+ struct action_rss_data *conf = RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ uint16_t q;
+ if (hairpinq) {
+ conf->conf.queue_num = hairpinq;
+ for (q = 0; q < hairpinq; q++)
+ conf->queue[q] = q + rx_queues_count;
+ } else {
+ conf->conf.queue_num = rx_queues_count;
+ for (q = 0; q < rx_queues_count; q++)
+ conf->queue[q] = q;
+ }
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_MAC_DST: {
+ struct rte_flow_action_set_mac *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ uint32_t val = unique_data ? counter : 1;
+ uint8_t j;
+ for (j = 0; j < RTE_ETHER_ADDR_LEN; j++) {
+ conf->mac_addr[j] = val & 0xff;
+ val >>= 8;
+ }
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST: {
+ struct rte_flow_action_set_ipv4 *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ uint32_t ip = unique_data ? counter : 1;
+ conf->ipv4_addr = RTE_BE32(ip + 1);
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST: {
+ struct rte_flow_action_set_ipv6 *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ uint32_t val = unique_data ? counter : 1;
+ uint8_t j;
+ for (j = 0; j < 16; j++) {
+ conf->ipv6_addr.a[j] = val & 0xff;
+ val >>= 8;
+ }
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_SET_TP_SRC: {
+ struct rte_flow_action_set_tp *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ uint32_t tp = unique_data ? counter : 100;
+ tp = tp % 0xffff;
+ conf->port = RTE_BE16(tp & 0xffff);
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_SET_TP_DST: {
+ struct rte_flow_action_set_tp *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ uint32_t tp = unique_data ? counter : 100;
+ if (tp > 0xffff)
+ tp >>= 16;
+ conf->port = RTE_BE16(tp & 0xffff);
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_INC_TCP_ACK:
+ case RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK:
+ case RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ:
+ case RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ: {
+ rte_be32_t *conf = RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ uint32_t val = unique_data ? counter : 1;
+ *conf = RTE_BE32(val);
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_SET_TTL: {
+ struct rte_flow_action_set_ttl *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ uint32_t val = unique_data ? counter : 1;
+ conf->ttl_value = val % 0xff;
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP:
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP: {
+ struct rte_flow_action_set_dscp *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ uint32_t val = unique_data ? counter : 1;
+ conf->dscp = val % 0xff;
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_PORT_ID: {
+ struct rte_flow_action_port_id *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ conf->id = dst_port;
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: {
+ struct action_raw_encap_data *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ uint8_t *header = conf->data;
+ struct rte_ether_hdr eth_hdr;
+ struct rte_ipv4_hdr ipv4_hdr;
+ struct rte_udp_hdr udp_hdr;
+
+ memset(ð_hdr, 0, sizeof(eth_hdr));
+ if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH)) {
+ if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_VLAN))
+ eth_hdr.ether_type = RTE_BE16(RTE_ETHER_TYPE_VLAN);
+ else if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV4))
+ eth_hdr.ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV4);
+ else if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV6))
+ eth_hdr.ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV6);
+ memcpy(header, ð_hdr, sizeof(eth_hdr));
+ header += sizeof(eth_hdr);
+ }
+ if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV4)) {
+ uint32_t ip_dst = unique_data ? counter : 1;
+ memset(&ipv4_hdr, 0, sizeof(ipv4_hdr));
+ ipv4_hdr.src_addr = RTE_IPV4(127, 0, 0, 1);
+ ipv4_hdr.dst_addr = RTE_BE32(ip_dst);
+ ipv4_hdr.version_ihl = RTE_IPV4_VHL_DEF;
+ if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_UDP))
+ ipv4_hdr.next_proto_id = 17; /* UDP */
+ if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_GRE))
+ ipv4_hdr.next_proto_id = 47; /* GRE */
+ memcpy(header, &ipv4_hdr, sizeof(ipv4_hdr));
+ header += sizeof(ipv4_hdr);
+ }
+ if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_UDP)) {
+ memset(&udp_hdr, 0, sizeof(udp_hdr));
+ if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_VXLAN))
+ udp_hdr.dst_port = RTE_BE16(RTE_VXLAN_DEFAULT_PORT);
+ memcpy(header, &udp_hdr, sizeof(udp_hdr));
+ header += sizeof(udp_hdr);
+ }
+ conf->conf.size = header - conf->data;
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_RAW_DECAP: {
+ struct action_raw_decap_data *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ uint8_t *header = conf->data;
+ struct rte_ether_hdr eth_hdr;
+
+ memset(ð_hdr, 0, sizeof(eth_hdr));
+ if (decap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH)) {
+ if (decap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV4))
+ eth_hdr.ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV4);
+ else if (decap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV6))
+ eth_hdr.ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV6);
+ memcpy(header, ð_hdr, sizeof(eth_hdr));
+ header += sizeof(eth_hdr);
+ }
+ conf->conf.size = header - conf->data;
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: {
+ uint8_t *conf = RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ struct rte_flow_item *vitems =
+ (struct rte_flow_item
+ *)(conf + sizeof(struct rte_flow_action_vxlan_encap));
+ struct rte_flow_item_ipv4 *spec =
+ RTE_CAST_PTR(typeof(spec), vitems[1].spec);
+ uint32_t ip_dst = unique_data ? counter : 1;
+ /* vitems[1] is IPV4 */
+ spec->hdr.dst_addr = RTE_BE32(ip_dst);
+ break;
+ }
+ default:
+ break;
+ }
+ }
+}
+
+int
+async_flow_init_port(uint16_t port_id, uint32_t nb_queues, uint32_t queue_size,
+ uint64_t *flow_items, uint64_t *flow_actions, uint64_t *flow_attrs,
+ uint8_t flow_group, uint32_t rules_count)
+{
+ struct rte_flow_port_info port_info = {0};
+ struct rte_flow_queue_info queue_info = {0};
+ struct rte_flow_error error = {0};
+ struct rte_flow_port_attr port_attr = {0};
+ struct rte_flow_queue_attr queue_attr;
+ const struct rte_flow_queue_attr **queue_attr_list;
+ struct rte_flow_pattern_template_attr pt_attr = {0};
+ struct rte_flow_actions_template_attr at_attr = {0};
+ struct rte_flow_template_table_attr table_attr = {0};
+ struct rte_flow_item pattern[MAX_ITEMS_NUM];
+ struct rte_flow_action actions[MAX_ACTIONS_NUM];
+ struct rte_flow_action action_masks[MAX_ACTIONS_NUM];
+ struct async_flow_resources *res;
+ bool need_wire_orig_table = false;
+ uint32_t i;
+ int ret;
+
+ if (port_id >= MAX_PORTS)
+ return -1;
+
+ res = &port_resources[port_id];
+ memset(res, 0, sizeof(*res));
+
+ /* Query port flow info */
+ ret = rte_flow_info_get(port_id, &port_info, &queue_info, &error);
+ if (ret != 0) {
+ fprintf(stderr, "Port %u: rte_flow_info_get failed: %s\n", port_id,
+ error.message ? error.message : "(no message)");
+ return ret;
+ }
+
+ if (port_info.max_nb_queues == 0 || queue_info.max_size == 0) {
+ fprintf(stderr, "Port %u: rte_flow_info_get reports that no queues are supported\n",
+ port_id);
+ return -1;
+ }
+
+ /* Limit to device capabilities if reported */
+ if (port_info.max_nb_queues != 0 && port_info.max_nb_queues != UINT32_MAX &&
+ nb_queues > port_info.max_nb_queues)
+ nb_queues = port_info.max_nb_queues;
+ if (queue_info.max_size != 0 && queue_info.max_size != UINT32_MAX &&
+ queue_size > queue_info.max_size)
+ queue_size = queue_info.max_size;
+
+ /* Slot ring uses bitmask wrapping, so queue_size must be power of 2 */
+ queue_size = rte_align32prevpow2(queue_size);
+ if (queue_size == 0) {
+ fprintf(stderr, "Port %u: queue_size is 0 after rounding\n", port_id);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < MAX_ATTRS_NUM; i++) {
+ if (flow_attrs[i] == 0)
+ break;
+ if (flow_attrs[i] & INGRESS)
+ pt_attr.ingress = 1;
+ else if (flow_attrs[i] & EGRESS)
+ pt_attr.egress = 1;
+ else if (flow_attrs[i] & TRANSFER)
+ pt_attr.transfer = 1;
+ }
+ /* Enable relaxed matching for better performance */
+ pt_attr.relaxed_matching = 1;
+
+ memset(pattern, 0, sizeof(pattern));
+ memset(actions, 0, sizeof(actions));
+ memset(action_masks, 0, sizeof(action_masks));
+
+ /* Fill templates and gather per-item/action sizes */
+ fill_items_template(pattern, flow_items, 0, 0);
+
+ at_attr.ingress = pt_attr.ingress;
+ at_attr.egress = pt_attr.egress;
+ at_attr.transfer = pt_attr.transfer;
+
+ fill_actions_template(actions, action_masks, flow_actions, &port_attr,
+ &need_wire_orig_table);
+
+ /*
+	 * fill_actions_template counts the number of actions that require each kind of object,
+	 * so we multiply by the number of rules to get the correct totals.
+ */
+ port_attr.nb_counters *= rules_count;
+ port_attr.nb_aging_objects *= rules_count;
+ port_attr.nb_meters *= rules_count;
+ port_attr.nb_conn_tracks *= rules_count;
+ port_attr.nb_quotas *= rules_count;
+
+ table_attr.flow_attr.group = flow_group;
+ table_attr.flow_attr.priority = 0;
+ table_attr.flow_attr.ingress = pt_attr.ingress;
+ table_attr.flow_attr.egress = pt_attr.egress;
+ table_attr.flow_attr.transfer = pt_attr.transfer;
+ table_attr.nb_flows = rules_count;
+
+ if (pt_attr.transfer && need_wire_orig_table)
+ table_attr.specialize = RTE_FLOW_TABLE_SPECIALIZE_TRANSFER_WIRE_ORIG;
+
+ queue_attr_list = malloc(sizeof(*queue_attr_list) * nb_queues);
+ if (queue_attr_list == NULL) {
+ fprintf(stderr, "Port %u: failed to allocate queue_attr_list\n", port_id);
+ return -ENOMEM;
+ }
+
+ queue_attr.size = queue_size;
+ for (i = 0; i < nb_queues; i++)
+ queue_attr_list[i] = &queue_attr;
+
+ ret = rte_flow_configure(port_id, &port_attr, nb_queues, queue_attr_list, &error);
+
+ free(queue_attr_list);
+
+ if (ret != 0) {
+ fprintf(stderr, "Port %u: rte_flow_configure failed (ret=%d, type=%d): %s\n",
+ port_id, ret, error.type, error.message ? error.message : "(no message)");
+ return ret;
+ }
+
+ /* Create pattern template */
+ res->pattern_template =
+ rte_flow_pattern_template_create(port_id, &pt_attr, pattern, &error);
+ if (res->pattern_template == NULL) {
+ fprintf(stderr, "Port %u: pattern template create failed: %s\n", port_id,
+ error.message ? error.message : "(no message)");
+ return -1;
+ }
+
+ /* Create actions template */
+ res->actions_template =
+ rte_flow_actions_template_create(port_id, &at_attr, actions, action_masks, &error);
+ if (res->actions_template == NULL) {
+ fprintf(stderr, "Port %u: actions template create failed: %s\n", port_id,
+ error.message ? error.message : "(no message)");
+ ret = -1;
+ goto free_pattern;
+ }
+
+ /* Create template table */
+ res->table = rte_flow_template_table_create(port_id, &table_attr, &res->pattern_template, 1,
+ &res->actions_template, 1, &error);
+ if (res->table == NULL) {
+ fprintf(stderr, "Port %u: template table create failed: %s\n", port_id,
+ error.message ? error.message : "(no message)");
+ ret = -1;
+ goto free_actions;
+ }
+
+ /* Allocate and pre-initialize per-slot flat buffers */
+ ret = init_slot_pool(res, nb_queues, queue_size, pattern, actions);
+ if (ret != 0) {
+ fprintf(stderr, "Port %u: slot pool init failed\n", port_id);
+ goto free_table;
+ }
+
+ res->table_capacity = rules_count;
+ res->initialized = true;
+
+ printf(":: Port %u: Async flow engine initialized (queues=%u, queue_size=%u)\n", port_id,
+ nb_queues, queue_size);
+
+ return 0;
+
+free_table:
+ rte_flow_template_table_destroy(port_id, res->table, &error);
+ res->table = NULL;
+free_actions:
+ rte_flow_actions_template_destroy(port_id, res->actions_template, &error);
+ res->actions_template = NULL;
+free_pattern:
+ rte_flow_pattern_template_destroy(port_id, res->pattern_template, &error);
+ res->pattern_template = NULL;
+ return ret;
+}
+
+struct rte_flow *
+async_generate_flow(uint16_t port_id, uint32_t queue_id, uint32_t counter, uint16_t hairpinq,
+ uint64_t encap_data, uint64_t decap_data, uint16_t dst_port, uint8_t core_idx,
+ uint8_t rx_queues_count, bool unique_data, bool postpone,
+ struct rte_flow_error *error)
+{
+ struct async_flow_resources *res;
+ struct async_flow_queue *q;
+ uint8_t *slot;
+ uint32_t idx, items_array_bytes;
+ struct rte_flow_item *items;
+ struct rte_flow_action *actions;
+ struct rte_flow_op_attr op_attr = {
+ .postpone = postpone,
+ };
+
+ if (port_id >= MAX_PORTS) {
+ rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Invalid port ID");
+ return NULL;
+ }
+
+ res = &port_resources[port_id];
+ if (!res->initialized) {
+ rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Async flow resources not initialized");
+ return NULL;
+ }
+
+ if (queue_id >= res->nb_queues) {
+ rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Invalid queue ID");
+ return NULL;
+ }
+
+ /* Pick the next slot from this queue's ring */
+ q = &res->queues[queue_id];
+ idx = q->head;
+ q->head = (idx + 1) & (res->slots_per_queue - 1);
+ slot = q->slots + (size_t)idx * res->slot_size;
+ items_array_bytes = res->n_items * sizeof(struct rte_flow_item);
+ items = (struct rte_flow_item *)slot;
+ actions = (struct rte_flow_action *)(slot + items_array_bytes);
+
+ /* Update only per-flow varying values */
+ update_item_values(items, counter);
+ update_action_values(actions, counter, hairpinq, encap_data, decap_data, core_idx,
+ unique_data, rx_queues_count, dst_port);
+
+ return rte_flow_async_create(port_id, queue_id, &op_attr, res->table, items, 0, actions, 0,
+ NULL, error);
+}
+
+void
+async_flow_cleanup_port(uint16_t port_id)
+{
+ struct async_flow_resources *res;
+ struct rte_flow_error error;
+ struct rte_flow_op_result results[64];
+ int ret, i;
+
+ if (port_id >= MAX_PORTS)
+ return;
+
+ res = &port_resources[port_id];
+ if (!res->initialized)
+ return;
+
+	/* Drain pending async completions left over from flow flush (queue 0 only) */
+ for (i = 0; i < DRAIN_MAX_ITERATIONS; i++) {
+ rte_flow_push(port_id, 0, &error);
+ ret = rte_flow_pull(port_id, 0, results, 64, &error);
+ if (ret <= 0)
+ break;
+ }
+
+ if (res->table != NULL) {
+ rte_flow_template_table_destroy(port_id, res->table, &error);
+ res->table = NULL;
+ }
+
+ if (res->actions_template != NULL) {
+ rte_flow_actions_template_destroy(port_id, res->actions_template, &error);
+ res->actions_template = NULL;
+ }
+
+ if (res->pattern_template != NULL) {
+ rte_flow_pattern_template_destroy(port_id, res->pattern_template, &error);
+ res->pattern_template = NULL;
+ }
+
+ free(res->queues);
+ res->queues = NULL;
+ free(res->slot_pool);
+ res->slot_pool = NULL;
+ free(res->shared_masks);
+ res->shared_masks = NULL;
+
+ res->initialized = false;
+}
diff --git a/app/test-flow-perf/async_flow.h b/app/test-flow-perf/async_flow.h
new file mode 100644
index 0000000000..8ef8883107
--- /dev/null
+++ b/app/test-flow-perf/async_flow.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2026 Maxime Peim <maxime.peim@gmail.com>
+ *
+ * This file contains the async flow API related definitions
+ * and function declarations.
+ */
+
+#ifndef FLOW_PERF_ASYNC_FLOW
+#define FLOW_PERF_ASYNC_FLOW
+
+#include <rte_flow.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "config.h"
+
+/* Per-queue slot ring — tracks which slot to use next */
+struct async_flow_queue {
+ uint8_t *slots; /* pointer to this queue's region within slot_pool */
+ uint32_t head; /* next slot index (wraps mod slots_per_queue) */
+};
+
+/* Per-port async flow resources */
+struct async_flow_resources {
+ struct rte_flow_pattern_template *pattern_template;
+ struct rte_flow_actions_template *actions_template;
+ struct rte_flow_template_table *table;
+ uint8_t *slot_pool; /* flat buffer pool for all slots */
+ uint8_t *shared_masks; /* shared item mask data (one copy for all slots) */
+ struct async_flow_queue *queues;
+ uint32_t slot_size; /* bytes per slot (cache-line aligned) */
+ uint32_t slots_per_queue; /* = queue_size */
+ uint32_t nb_queues;
+ uint32_t table_capacity;
+ uint8_t n_items; /* item count (excl. END) */
+ uint8_t n_actions; /* action count (excl. END) */
+ bool initialized;
+};
+
+/* Initialize async flow engine for a port */
+int async_flow_init_port(uint16_t port_id, uint32_t nb_queues, uint32_t queue_size,
+ uint64_t *flow_items, uint64_t *flow_actions, uint64_t *flow_attrs,
+ uint8_t flow_group, uint32_t rules_count);
+
+/* Create a flow rule asynchronously using a pre-allocated per-queue slot */
+struct rte_flow *async_generate_flow(uint16_t port_id, uint32_t queue_id, uint32_t counter,
+ uint16_t hairpinq, uint64_t encap_data, uint64_t decap_data,
+ uint16_t dst_port, uint8_t core_idx, uint8_t rx_queues_count,
+ bool unique_data, bool postpone, struct rte_flow_error *error);
+
+/* Cleanup async flow resources for a port */
+void async_flow_cleanup_port(uint16_t port_id);
+
+#endif /* FLOW_PERF_ASYNC_FLOW */
diff --git a/app/test-flow-perf/items_gen.c b/app/test-flow-perf/items_gen.c
index c740e1838f..13af05cbb5 100644
--- a/app/test-flow-perf/items_gen.c
+++ b/app/test-flow-perf/items_gen.c
@@ -389,3 +389,59 @@ fill_items(struct rte_flow_item *items,
items[items_counter].type = RTE_FLOW_ITEM_TYPE_END;
}
+
+size_t
+item_spec_size(enum rte_flow_item_type type)
+{
+ switch (type) {
+ case RTE_FLOW_ITEM_TYPE_ETH:
+ return sizeof(struct rte_flow_item_eth);
+ case RTE_FLOW_ITEM_TYPE_VLAN:
+ return sizeof(struct rte_flow_item_vlan);
+ case RTE_FLOW_ITEM_TYPE_IPV4:
+ return sizeof(struct rte_flow_item_ipv4);
+ case RTE_FLOW_ITEM_TYPE_IPV6:
+ return sizeof(struct rte_flow_item_ipv6);
+ case RTE_FLOW_ITEM_TYPE_TCP:
+ return sizeof(struct rte_flow_item_tcp);
+ case RTE_FLOW_ITEM_TYPE_UDP:
+ return sizeof(struct rte_flow_item_udp);
+ case RTE_FLOW_ITEM_TYPE_VXLAN:
+ return sizeof(struct rte_flow_item_vxlan);
+ case RTE_FLOW_ITEM_TYPE_VXLAN_GPE:
+ return sizeof(struct rte_flow_item_vxlan_gpe);
+ case RTE_FLOW_ITEM_TYPE_GRE:
+ return sizeof(struct rte_flow_item_gre);
+ case RTE_FLOW_ITEM_TYPE_GENEVE:
+ return sizeof(struct rte_flow_item_geneve);
+ case RTE_FLOW_ITEM_TYPE_GTP:
+ return sizeof(struct rte_flow_item_gtp);
+ case RTE_FLOW_ITEM_TYPE_META:
+ return sizeof(struct rte_flow_item_meta);
+ case RTE_FLOW_ITEM_TYPE_TAG:
+ return sizeof(struct rte_flow_item_tag);
+ case RTE_FLOW_ITEM_TYPE_ICMP:
+ return sizeof(struct rte_flow_item_icmp);
+ case RTE_FLOW_ITEM_TYPE_ICMP6:
+ return sizeof(struct rte_flow_item_icmp6);
+ default:
+ return 0;
+ }
+}
+
+void
+fill_items_template(struct rte_flow_item *items, uint64_t *flow_items, uint32_t outer_ip_src,
+ uint8_t core_idx)
+{
+ uint32_t i;
+
+ fill_items(items, flow_items, outer_ip_src, core_idx);
+
+ /* For templates, set spec to NULL - only mask matters for template matching */
+ for (i = 0; items[i].type != RTE_FLOW_ITEM_TYPE_END; i++) {
+ items[i].spec = NULL;
+ }
+
+ /* END */
+ items[i].spec = NULL;
+}
diff --git a/app/test-flow-perf/items_gen.h b/app/test-flow-perf/items_gen.h
index f4b0e9a981..85d08ef35b 100644
--- a/app/test-flow-perf/items_gen.h
+++ b/app/test-flow-perf/items_gen.h
@@ -15,4 +15,10 @@
void fill_items(struct rte_flow_item *items, uint64_t *flow_items,
uint32_t outer_ip_src, uint8_t core_idx);
+/* Fill items template for async flow API (masks only, no spec values) */
+void fill_items_template(struct rte_flow_item *items, uint64_t *flow_items, uint32_t outer_ip_src,
+ uint8_t core_idx);
+
+size_t item_spec_size(enum rte_flow_item_type type);
+
#endif /* FLOW_PERF_ITEMS_GEN */
diff --git a/app/test-flow-perf/main.c b/app/test-flow-perf/main.c
index 6636d1517f..691020c719 100644
--- a/app/test-flow-perf/main.c
+++ b/app/test-flow-perf/main.c
@@ -37,11 +37,15 @@
#include <rte_mtr.h>
#include <rte_os_shim.h>
-#include "config.h"
#include "actions_gen.h"
+#include "async_flow.h"
+#include "config.h"
#include "flow_gen.h"
+#include "rte_build_config.h"
#define MAX_BATCHES_COUNT 100
+#define MAX_ASYNC_QUEUE_SIZE (1 << 14)
+#define MAX_PULL_RETRIES (1 << 20)
#define DEFAULT_RULES_COUNT 4000000
#define DEFAULT_RULES_BATCH 100000
#define DEFAULT_GROUP 0
@@ -55,7 +59,6 @@
#define HAIRPIN_TX_CONF_LOCKED_MEMORY (0x0100)
#define HAIRPIN_TX_CONF_RTE_MEMORY (0x0200)
-struct rte_flow *flow;
static uint8_t flow_group;
static uint64_t encap_data;
@@ -81,6 +84,9 @@ static bool enable_fwd;
static bool unique_data;
static bool policy_mtr;
static bool packet_mode;
+static bool async_mode;
+static uint32_t async_queue_size = 1024;
+static uint32_t async_push_batch = 256;
static uint8_t rx_queues_count;
static uint8_t tx_queues_count;
@@ -598,6 +604,13 @@ usage(char *progname)
"Encapped data is fixed with pattern: ether,ipv4,udp,vxlan\n"
"With fixed values\n");
printf(" --vxlan-decap: add vxlan_decap action to flow actions\n");
+
+ printf("\nAsync flow API options:\n");
+ printf(" --async: enable async flow API mode\n");
+ printf(" --async-queue-size=N: size of each async queue,"
+ " default is 1024\n");
+ printf(" --async-push-batch=N: flows to batch before push,"
+ " default is 256\n");
}
static void
@@ -734,6 +747,9 @@ args_parse(int argc, char **argv)
{ "policy-mtr", 1, 0, 0 },
{ "meter-profile", 1, 0, 0 },
{ "packet-mode", 0, 0, 0 },
+ { "async", 0, 0, 0 },
+ { "async-queue-size", 1, 0, 0 },
+ { "async-push-batch", 1, 0, 0 },
{ 0, 0, 0, 0 },
};
@@ -913,8 +929,7 @@ args_parse(int argc, char **argv)
rte_exit(EXIT_FAILURE, "Invalid hairpin config mask\n");
hairpin_conf_mask = hp_conf;
}
- if (strcmp(lgopts[opt_idx].name,
- "port-id") == 0) {
+ if (strcmp(lgopts[opt_idx].name, "port-id") == 0) {
uint16_t port_idx = 0;
token = strtok(optarg, ",");
@@ -981,6 +996,26 @@ args_parse(int argc, char **argv)
}
if (strcmp(lgopts[opt_idx].name, "packet-mode") == 0)
packet_mode = true;
+ if (strcmp(lgopts[opt_idx].name, "async") == 0)
+ async_mode = true;
+ if (strcmp(lgopts[opt_idx].name, "async-queue-size") == 0) {
+ n = atoi(optarg);
+ if (n >= MAX_ASYNC_QUEUE_SIZE)
+ async_queue_size = MAX_ASYNC_QUEUE_SIZE;
+ else if (n > 0)
+ async_queue_size = rte_align32prevpow2(n);
+ else
+ rte_exit(EXIT_FAILURE, "async-queue-size should be > 0\n");
+ }
+ if (strcmp(lgopts[opt_idx].name, "async-push-batch") == 0) {
+ n = atoi(optarg);
+ if (n >= MAX_ASYNC_QUEUE_SIZE >> 1)
+ async_push_batch = MAX_ASYNC_QUEUE_SIZE >> 1;
+ else if (n > 0)
+ async_push_batch = rte_align32prevpow2(n);
+ else
+ rte_exit(EXIT_FAILURE, "async-push-batch should be > 0\n");
+ }
break;
default:
usage(argv[0]);
@@ -1457,10 +1492,10 @@ query_flows(int port_id, uint8_t core_id, struct rte_flow **flows_list)
mc_pool.flows_record.query[port_id][core_id] = cpu_time_used;
}
-static struct rte_flow **
-insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id)
+static void
+insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id, struct rte_flow **flows_list)
{
- struct rte_flow **flows_list;
+ struct rte_flow *flow;
struct rte_flow_error error;
clock_t start_batch, end_batch;
double first_flow_latency;
@@ -1485,8 +1520,7 @@ insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id)
global_items[0] = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH);
global_actions[0] = FLOW_ITEM_MASK(RTE_FLOW_ACTION_TYPE_JUMP);
- flows_list = rte_zmalloc("flows_list",
- (sizeof(struct rte_flow *) * (rules_count_per_core + 1)), 0);
+ flows_list = malloc(sizeof(struct rte_flow *) * (rules_count_per_core + 1));
if (flows_list == NULL)
rte_exit(EXIT_FAILURE, "No Memory available!\n");
@@ -1524,6 +1558,11 @@ insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id)
core_id, rx_queues_count,
unique_data, max_priority, &error);
+ if (!flow) {
+ print_flow_error(error);
+ rte_exit(EXIT_FAILURE, "Error in creating flow\n");
+ }
+
if (!counter) {
first_flow_latency = (double) (rte_get_timer_cycles() - start_batch);
first_flow_latency /= rte_get_timer_hz();
@@ -1537,11 +1576,6 @@ insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id)
if (force_quit)
counter = end_counter;
- if (!flow) {
- print_flow_error(error);
- rte_exit(EXIT_FAILURE, "Error in creating flow\n");
- }
-
flows_list[flow_index++] = flow;
/*
@@ -1575,7 +1609,200 @@ insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id)
port_id, core_id, rules_count_per_core, cpu_time_used);
mc_pool.flows_record.insertion[port_id][core_id] = cpu_time_used;
- return flows_list;
+}
+
+static inline int
+push_pull_flows_async(int port_id, int core_id, uint32_t enqueued, bool empty, bool check_op_status,
+ struct rte_flow_error *error)
+{
+ static struct rte_flow_op_result results[RTE_MAX_LCORE][MAX_ASYNC_QUEUE_SIZE];
+ static uint32_t push_counter[RTE_MAX_LCORE] = {0};
+ uint32_t to_pull = (empty || async_push_batch > enqueued) ? enqueued : async_push_batch;
+ uint32_t pulled_complete = 0;
+ uint32_t retries = 0;
+ int pulled, i;
+ int ret = 0;
+
+ /* Push periodically to give HW work to do */
+ ret = rte_flow_push(port_id, core_id, error);
+ if (ret)
+ return ret;
+ push_counter[core_id]++;
+
+ /* Check if queue is getting full, if so push and drain completions */
+ if (!empty && push_counter[core_id] == 1)
+ return 0;
+
+ while (to_pull > 0) {
+ pulled = rte_flow_pull(port_id, core_id, results[core_id], to_pull, error);
+ if (pulled < 0) {
+ return -1;
+ } else if (pulled == 0) {
+ if (++retries > MAX_PULL_RETRIES) {
+ rte_flow_error_set(error, ETIMEDOUT,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Timeout waiting for async completions");
+ return -1;
+ }
+ rte_pause();
+ continue;
+ }
+ retries = 0;
+
+ to_pull -= pulled;
+ pulled_complete += pulled;
+ if (!check_op_status)
+ continue;
+
+ for (i = 0; i < pulled; i++) {
+ if (results[core_id][i].status != RTE_FLOW_OP_SUCCESS) {
+ rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL, "Some flow rule insertion failed");
+ return -1;
+ }
+ }
+ }
+
+ return pulled_complete;
+}
+
+static void
+insert_flows_async(int port_id, uint8_t core_id, uint16_t dst_port_id, struct rte_flow **flows_list)
+{
+ struct rte_flow *flow;
+ struct rte_flow_error error;
+ clock_t start_batch, end_batch;
+ double first_flow_latency;
+ double cpu_time_used;
+ double insertion_rate;
+ double cpu_time_per_batch[MAX_BATCHES_COUNT] = {0};
+ double delta;
+ uint32_t flow_index;
+ uint32_t counter, batch_counter, start_counter = 0, end_counter;
+ int rules_batch_idx;
+ int rules_count_per_core;
+ uint32_t enqueued = 0;
+ bool first_batch = true;
+ int pulled;
+
+ rules_count_per_core = rules_count / mc_pool.cores_count;
+
+ if (async_push_batch > async_queue_size >> 1)
+ async_push_batch = async_queue_size >> 1;
+
+ /* Set boundaries of rules for each core. */
+ if (core_id)
+ start_counter = core_id * rules_count_per_core;
+ end_counter = (core_id + 1) * rules_count_per_core;
+
+ cpu_time_used = 0;
+ flow_index = 0;
+
+ if (flow_group > 0 && core_id == 0) {
+ /*
+ * Create global rule to jump into flow_group,
+ * this way the app will avoid the default rules.
+ *
+ * This rule will be created only once.
+ *
+ * Global rule:
+ * group 0 eth / end actions jump group <flow_group>
+ */
+
+ uint64_t global_items[MAX_ITEMS_NUM] = {0};
+ uint64_t global_actions[MAX_ACTIONS_NUM] = {0};
+ global_items[0] = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH);
+ global_actions[0] = FLOW_ITEM_MASK(RTE_FLOW_ACTION_TYPE_JUMP);
+ flow = generate_flow(port_id, 0, flow_attrs, global_items, global_actions,
+ flow_group, 0, 0, 0, 0, dst_port_id, core_id, rx_queues_count,
+ unique_data, max_priority, &error);
+
+ if (flow == NULL) {
+ print_flow_error(error);
+ rte_exit(EXIT_FAILURE, "Error in creating flow\n");
+ }
+ flows_list[flow_index++] = flow;
+ }
+
+ start_batch = rte_get_timer_cycles();
+ for (counter = start_counter; counter < end_counter;) {
+ /* batch adding flow rules, this avoids unnecessary checks for push/pull */
+ for (batch_counter = 0; batch_counter < async_push_batch && counter < end_counter;
+ batch_counter++, counter++) {
+ /* Create flow with postpone=true to batch operations */
+ flow = async_generate_flow(port_id, core_id, counter, hairpin_queues_num,
+ encap_data, decap_data, dst_port_id, core_id,
+ rx_queues_count, unique_data, true, &error);
+
+ if (!flow) {
+ print_flow_error(error);
+ rte_exit(EXIT_FAILURE, "Error in creating async flow\n");
+ }
+
+ if (force_quit)
+ break;
+
+ flows_list[flow_index++] = flow;
+ enqueued++;
+
+		/*
+		 * Save the insertion rate for rules batch.
+		 * Check if the insertion reached the rules
+		 * batch counter, then save the insertion rate
+		 * for this batch.
+		 */
+ if (!((counter + 1) % rules_batch)) {
+ end_batch = rte_get_timer_cycles();
+ delta = (double)(end_batch - start_batch);
+ rules_batch_idx = ((counter + 1) / rules_batch) - 1;
+ cpu_time_per_batch[rules_batch_idx] = delta / rte_get_timer_hz();
+ cpu_time_used += cpu_time_per_batch[rules_batch_idx];
+ start_batch = rte_get_timer_cycles();
+ }
+ }
+
+ pulled = push_pull_flows_async(port_id, core_id, enqueued, false, true, &error);
+ if (pulled < 0) {
+ print_flow_error(error);
+ rte_exit(EXIT_FAILURE, "Error push/pull async operations\n");
+ }
+
+ enqueued -= pulled;
+
+ if (first_batch) {
+ first_flow_latency = (double)(rte_get_timer_cycles() - start_batch);
+ first_flow_latency /= rte_get_timer_hz();
+ /* In millisecond */
+ first_flow_latency *= 1000;
+ printf(":: First Flow Batch Latency (Async) :: Port %d :: First batch (%u) "
+ "installed in %f milliseconds\n",
+ port_id, async_push_batch, first_flow_latency);
+ first_batch = false;
+ }
+ }
+
+ if (push_pull_flows_async(port_id, core_id, enqueued, true, true, &error) < 0) {
+ print_flow_error(error);
+ rte_exit(EXIT_FAILURE, "Error final push/pull async operations\n");
+ }
+
+ /* Print insertion rates for all batches */
+ if (dump_iterations)
+ print_rules_batches(cpu_time_per_batch);
+
+ printf(":: Port %d :: Core %d boundaries (Async) :: start @[%d] - end @[%d]\n", port_id,
+ core_id, start_counter, end_counter - 1);
+
+ /* Insertion rate for all rules in one core */
+ if (cpu_time_used > 0) {
+ insertion_rate = ((double)rules_count_per_core / cpu_time_used) / 1000;
+ printf(":: Port %d :: Core %d :: Async rules insertion rate -> %f K Rule/Sec\n",
+ port_id, core_id, insertion_rate);
+ }
+ printf(":: Port %d :: Core %d :: The time for creating %d async rules is %f seconds\n",
+ port_id, core_id, rules_count_per_core, cpu_time_used);
+
+ mc_pool.flows_record.insertion[port_id][core_id] = cpu_time_used;
}
static void
@@ -1585,12 +1812,18 @@ flows_handler(uint8_t core_id)
uint16_t port_idx = 0;
uint16_t nr_ports;
int port_id;
+ int rules_count_per_core;
nr_ports = rte_eth_dev_count_avail();
if (rules_batch > rules_count)
rules_batch = rules_count;
+ rules_count_per_core = rules_count / mc_pool.cores_count;
+ flows_list = malloc(sizeof(struct rte_flow *) * (rules_count_per_core + 1));
+ if (flows_list == NULL)
+ rte_exit(EXIT_FAILURE, "No Memory available!\n");
+
printf(":: Rules Count per port: %d\n\n", rules_count);
for (port_id = 0; port_id < nr_ports; port_id++) {
@@ -1602,10 +1835,10 @@ flows_handler(uint8_t core_id)
mc_pool.last_alloc[core_id] = (int64_t)dump_socket_mem(stdout);
if (has_meter())
meters_handler(port_id, core_id, METER_CREATE);
- flows_list = insert_flows(port_id, core_id,
- dst_ports[port_idx++]);
- if (flows_list == NULL)
- rte_exit(EXIT_FAILURE, "Error: Insertion Failed!\n");
+ if (async_mode)
+ insert_flows_async(port_id, core_id, dst_ports[port_idx++], flows_list);
+ else
+ insert_flows(port_id, core_id, dst_ports[port_idx++], flows_list);
mc_pool.current_alloc[core_id] = (int64_t)dump_socket_mem(stdout);
if (query_flag)
@@ -2212,6 +2445,16 @@ init_port(void)
}
}
+ /* Configure async flow engine before device start */
+ if (async_mode) {
+ ret = async_flow_init_port(port_id, mc_pool.cores_count, async_queue_size,
+ flow_items, flow_actions, flow_attrs, flow_group,
+ rules_count);
+ if (ret != 0)
+ rte_exit(EXIT_FAILURE, "Failed to init async flow on port %d\n",
+ port_id);
+ }
+
ret = rte_eth_dev_start(port_id);
if (ret < 0)
rte_exit(EXIT_FAILURE,
@@ -2291,6 +2534,8 @@ main(int argc, char **argv)
RTE_ETH_FOREACH_DEV(port) {
rte_flow_flush(port, &error);
+ if (async_mode)
+ async_flow_cleanup_port(port);
if (rte_eth_dev_stop(port) != 0)
printf("Failed to stop device on port %u\n", port);
rte_eth_dev_close(port);
diff --git a/app/test-flow-perf/meson.build b/app/test-flow-perf/meson.build
index e101449e32..70d8671a54 100644
--- a/app/test-flow-perf/meson.build
+++ b/app/test-flow-perf/meson.build
@@ -1,8 +1,15 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright(c) 2020 Mellanox Technologies, Ltd
+if is_windows
+ build = false
+ reason = 'not supported on Windows'
+ subdir_done()
+endif
+
sources = files(
'actions_gen.c',
+ 'async_flow.c',
'flow_gen.c',
'items_gen.c',
'main.c',
--
2.43.0
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH v5] test/flow: add support for async API
2026-02-25 22:23 ` Stephen Hemminger
` (2 preceding siblings ...)
2026-03-02 14:35 ` [PATCH v4] " Maxime Peim
@ 2026-03-09 12:52 ` Maxime Peim
3 siblings, 0 replies; 8+ messages in thread
From: Maxime Peim @ 2026-03-09 12:52 UTC (permalink / raw)
To: stephen; +Cc: dev
Add async flow API mode to test-flow-perf application for improved
flow rule insertion performance. The async API allows batching flow
rule creation operations and processing completions in bulk, reducing
per-rule overhead.
New command line options:
--async: enable async flow API mode
--async-queue-size=N: size of async queues (default: 1024)
--async-push-batch=N: flows to batch before push (default: 256)
Signed-off-by: Maxime Peim <maxime.peim@gmail.com>
---
v2:
- Replace per-flow stack allocation with pre-allocated slot pool;
flat buffers are initialized once at init time and the hot path
only patches per-flow item/action values into a pre-set slot
- Fix alloca misuse: use heap allocation for queue_attr_list, round
queue_size to power of 2 for bitmask wrapping, add bounds checks
- Fix race on file-scope flow variable, premature latency
measurement, and integer division in rate calculation
- Drop unrelated lgopts reformatting
- Use malloc instead of rte_zmalloc for non-dataplane allocations
- Various robustness and style fixes
v3:
- Update meson.build to exclude Windows build for flow perf test
- Fix checkstyle
- Remove cast from void* to uintptr_t
- Add name to mailmap and maintainers
v4:
- Use RTE_CAST_PTR for void pointer casts (const-correctness)
- Simplify init_slot_pool, fill_items_template, and
fill_actions_template signatures; export item_spec_size and
action_conf_size helpers
- Use goto-based centralized error cleanup
- Remove redundant queue_id variable, use core_id directly
- Make push_counter a function-local static
- Narrow n_items/n_actions from uint32_t to uint8_t
- Consistent conf variable naming in action slot-fill handlers
v5:
- Fix style
.mailmap | 1 +
MAINTAINERS | 1 +
app/test-flow-perf/actions_gen.c | 276 ++++++++++-
app/test-flow-perf/actions_gen.h | 31 ++
app/test-flow-perf/async_flow.c | 783 +++++++++++++++++++++++++++++++
app/test-flow-perf/async_flow.h | 54 +++
app/test-flow-perf/items_gen.c | 55 +++
app/test-flow-perf/items_gen.h | 6 +
app/test-flow-perf/main.c | 283 ++++++++++-
app/test-flow-perf/meson.build | 7 +
10 files changed, 1457 insertions(+), 40 deletions(-)
create mode 100644 app/test-flow-perf/async_flow.c
create mode 100644 app/test-flow-perf/async_flow.h
diff --git a/.mailmap b/.mailmap
index 6c4c977dde..a0141402c3 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1044,6 +1044,7 @@ Mauro Annarumma <mauroannarumma@hotmail.it>
Maxime Coquelin <maxime.coquelin@redhat.com>
Maxime Gouin <maxime.gouin@6wind.com>
Maxime Leroy <maxime@leroys.fr> <maxime.leroy@6wind.com>
+Maxime Peim <maxime.peim@gmail.com>
Md Fahad Iqbal Polash <md.fahad.iqbal.polash@intel.com>
Megha Ajmera <megha.ajmera@intel.com>
Meijuan Zhao <meijuanx.zhao@intel.com>
diff --git a/MAINTAINERS b/MAINTAINERS
index 1b2f1ed2ba..d4c01037c8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1964,6 +1964,7 @@ F: doc/guides/tools/dmaperf.rst
Flow performance tool
M: Wisam Jaddo <wisamm@nvidia.com>
+M: Maxime Peim <maxime.peim@gmail.com>
F: app/test-flow-perf/
F: doc/guides/tools/flow-perf.rst
diff --git a/app/test-flow-perf/actions_gen.c b/app/test-flow-perf/actions_gen.c
index 9d102e3af4..624ecfdf43 100644
--- a/app/test-flow-perf/actions_gen.c
+++ b/app/test-flow-perf/actions_gen.c
@@ -36,27 +36,7 @@ struct additional_para {
bool unique_data;
};
-/* Storage for struct rte_flow_action_raw_encap including external data. */
-struct action_raw_encap_data {
- struct rte_flow_action_raw_encap conf;
- uint8_t data[128];
- uint8_t preserve[128];
- uint16_t idx;
-};
-
-/* Storage for struct rte_flow_action_raw_decap including external data. */
-struct action_raw_decap_data {
- struct rte_flow_action_raw_decap conf;
- uint8_t data[128];
- uint16_t idx;
-};
-
-/* Storage for struct rte_flow_action_rss including external data. */
-struct action_rss_data {
- struct rte_flow_action_rss conf;
- uint8_t key[40];
- uint16_t queue[128];
-};
+/* Compound action data structs defined in actions_gen.h */
static void
add_mark(struct rte_flow_action *actions,
@@ -1165,3 +1145,257 @@ fill_actions(struct rte_flow_action *actions, uint64_t *flow_actions,
free(queues);
free(hairpin_queues);
}
+
+size_t
+action_conf_size(enum rte_flow_action_type type)
+{
+ switch (type) {
+ case RTE_FLOW_ACTION_TYPE_MARK:
+ return sizeof(struct rte_flow_action_mark);
+ case RTE_FLOW_ACTION_TYPE_QUEUE:
+ return sizeof(struct rte_flow_action_queue);
+ case RTE_FLOW_ACTION_TYPE_JUMP:
+ return sizeof(struct rte_flow_action_jump);
+ case RTE_FLOW_ACTION_TYPE_RSS:
+ return sizeof(struct action_rss_data);
+ case RTE_FLOW_ACTION_TYPE_SET_META:
+ return sizeof(struct rte_flow_action_set_meta);
+ case RTE_FLOW_ACTION_TYPE_SET_TAG:
+ return sizeof(struct rte_flow_action_set_tag);
+ case RTE_FLOW_ACTION_TYPE_PORT_ID:
+ return sizeof(struct rte_flow_action_port_id);
+ case RTE_FLOW_ACTION_TYPE_COUNT:
+ return sizeof(struct rte_flow_action_count);
+ case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_MAC_DST:
+ return sizeof(struct rte_flow_action_set_mac);
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST:
+ return sizeof(struct rte_flow_action_set_ipv4);
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST:
+ return sizeof(struct rte_flow_action_set_ipv6);
+ case RTE_FLOW_ACTION_TYPE_SET_TP_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_TP_DST:
+ return sizeof(struct rte_flow_action_set_tp);
+ case RTE_FLOW_ACTION_TYPE_INC_TCP_ACK:
+ case RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK:
+ case RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ:
+ case RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ:
+ return sizeof(rte_be32_t);
+ case RTE_FLOW_ACTION_TYPE_SET_TTL:
+ return sizeof(struct rte_flow_action_set_ttl);
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP:
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP:
+ return sizeof(struct rte_flow_action_set_dscp);
+ case RTE_FLOW_ACTION_TYPE_METER:
+ return sizeof(struct rte_flow_action_meter);
+ case RTE_FLOW_ACTION_TYPE_RAW_ENCAP:
+ return sizeof(struct action_raw_encap_data);
+ case RTE_FLOW_ACTION_TYPE_RAW_DECAP:
+ return sizeof(struct action_raw_decap_data);
+ case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
+ return sizeof(struct rte_flow_action_vxlan_encap) +
+ 5 * sizeof(struct rte_flow_item) + sizeof(struct rte_flow_item_eth) +
+ sizeof(struct rte_flow_item_ipv4) + sizeof(struct rte_flow_item_udp) +
+ sizeof(struct rte_flow_item_vxlan);
+ case RTE_FLOW_ACTION_TYPE_MODIFY_FIELD:
+ return sizeof(struct rte_flow_action_modify_field);
+ /* Zero-conf types */
+ case RTE_FLOW_ACTION_TYPE_DROP:
+ case RTE_FLOW_ACTION_TYPE_FLAG:
+ case RTE_FLOW_ACTION_TYPE_DEC_TTL:
+ case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+void
+fill_actions_template(struct rte_flow_action *actions, struct rte_flow_action *masks,
+ uint64_t *flow_actions, struct rte_flow_port_attr *port_attr,
+ bool *need_wire_orig_table)
+{
+ uint8_t actions_counter = 0;
+ uint8_t i, j;
+
+ *need_wire_orig_table = false;
+ memset(port_attr, 0, sizeof(*port_attr));
+
+ /* Static configurations for actions that need them in templates */
+ static struct rte_flow_action_mark mark_conf = {
+ .id = 1,
+ };
+ static struct rte_flow_action_queue queue_conf = {
+ .index = 0,
+ };
+ static struct rte_flow_action_port_id port_id_conf = {
+ .id = 0,
+ };
+ static struct rte_flow_action_jump jump_conf = {
+ .group = 1,
+ };
+ static struct rte_flow_action_modify_field set_meta_conf = {
+ .operation = RTE_FLOW_MODIFY_SET,
+ .dst = {.field = RTE_FLOW_FIELD_META},
+ .src = {
+ .field = RTE_FLOW_FIELD_VALUE,
+ .value = {0, 0, 0, META_DATA},
+ },
+ .width = 32,
+ };
+
+ /* Static mask configurations for each action type */
+ static struct rte_flow_action_mark mark_mask = {
+ .id = UINT32_MAX,
+ };
+ static struct rte_flow_action_queue queue_mask = {
+ .index = UINT16_MAX,
+ };
+ static struct rte_flow_action_jump jump_mask = {
+ .group = UINT32_MAX,
+ };
+ static struct rte_flow_action_rss rss_mask = {
+ .level = UINT32_MAX,
+ .types = UINT64_MAX,
+ };
+ static struct rte_flow_action_set_meta set_meta_mask = {
+ .data = UINT32_MAX,
+ .mask = UINT32_MAX,
+ };
+ static struct rte_flow_action_set_tag set_tag_mask = {
+ .data = UINT32_MAX,
+ .mask = UINT32_MAX,
+ .index = UINT8_MAX,
+ };
+ static struct rte_flow_action_port_id port_id_mask = {
+ .id = UINT32_MAX,
+ };
+ static struct rte_flow_action_count count_mask;
+ static struct rte_flow_action_set_mac set_mac_mask = {
+ .mac_addr = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+ };
+ static struct rte_flow_action_set_ipv4 set_ipv4_mask = {
+ .ipv4_addr = UINT32_MAX,
+ };
+ static struct rte_flow_action_set_ipv6 set_ipv6_mask = {
+ .ipv6_addr.a = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff}};
+ static struct rte_flow_action_set_tp set_tp_mask = {
+ .port = UINT16_MAX,
+ };
+ static rte_be32_t tcp_seq_ack_mask = UINT32_MAX;
+ static struct rte_flow_action_set_ttl set_ttl_mask = {
+ .ttl_value = UINT8_MAX,
+ };
+ static struct rte_flow_action_set_dscp set_dscp_mask = {
+ .dscp = UINT8_MAX,
+ };
+ static struct rte_flow_action_meter meter_mask = {
+ .mtr_id = UINT32_MAX,
+ };
+
+ static const struct {
+ uint64_t flow_mask;
+ enum rte_flow_action_type type;
+ const void *action_conf;
+ const void *action_mask;
+ const bool need_wire_orig_table;
+ } template_actions[] = {
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_MARK), RTE_FLOW_ACTION_TYPE_MARK, &mark_conf,
+ &mark_mask, true},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_COUNT), RTE_FLOW_ACTION_TYPE_COUNT, NULL,
+ &count_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_MODIFY_FIELD),
+ RTE_FLOW_ACTION_TYPE_MODIFY_FIELD, &set_meta_conf, &set_meta_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TAG), RTE_FLOW_ACTION_TYPE_SET_TAG, NULL,
+ &set_tag_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_FLAG), RTE_FLOW_ACTION_TYPE_FLAG, NULL, NULL,
+ false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_MAC_SRC),
+ RTE_FLOW_ACTION_TYPE_SET_MAC_SRC, NULL, &set_mac_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_MAC_DST),
+ RTE_FLOW_ACTION_TYPE_SET_MAC_DST, NULL, &set_mac_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC),
+ RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC, NULL, &set_ipv4_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV4_DST),
+ RTE_FLOW_ACTION_TYPE_SET_IPV4_DST, NULL, &set_ipv4_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC),
+ RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC, NULL, &set_ipv6_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV6_DST),
+ RTE_FLOW_ACTION_TYPE_SET_IPV6_DST, NULL, &set_ipv6_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TP_SRC), RTE_FLOW_ACTION_TYPE_SET_TP_SRC,
+ NULL, &set_tp_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TP_DST), RTE_FLOW_ACTION_TYPE_SET_TP_DST,
+ NULL, &set_tp_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_INC_TCP_ACK),
+ RTE_FLOW_ACTION_TYPE_INC_TCP_ACK, NULL, &tcp_seq_ack_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK),
+ RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK, NULL, &tcp_seq_ack_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ),
+ RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ, NULL, &tcp_seq_ack_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ),
+ RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ, NULL, &tcp_seq_ack_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_TTL), RTE_FLOW_ACTION_TYPE_SET_TTL, NULL,
+ &set_ttl_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DEC_TTL), RTE_FLOW_ACTION_TYPE_DEC_TTL, NULL,
+ NULL, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP),
+ RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP, NULL, &set_dscp_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP),
+ RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP, NULL, &set_dscp_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_QUEUE), RTE_FLOW_ACTION_TYPE_QUEUE,
+ &queue_conf, &queue_mask, true},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_RSS), RTE_FLOW_ACTION_TYPE_RSS, NULL,
+ &rss_mask, true},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_JUMP), RTE_FLOW_ACTION_TYPE_JUMP, &jump_conf,
+ &jump_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_PORT_ID), RTE_FLOW_ACTION_TYPE_PORT_ID,
+ &port_id_conf, &port_id_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_DROP), RTE_FLOW_ACTION_TYPE_DROP, NULL, NULL,
+ false},
+ {HAIRPIN_QUEUE_ACTION, RTE_FLOW_ACTION_TYPE_QUEUE, &queue_conf, &queue_mask, false},
+ {HAIRPIN_RSS_ACTION, RTE_FLOW_ACTION_TYPE_RSS, NULL, &rss_mask, false},
+ {FLOW_ACTION_MASK(RTE_FLOW_ACTION_TYPE_METER), RTE_FLOW_ACTION_TYPE_METER, NULL,
+ &meter_mask, false},
+ };
+
+ for (j = 0; j < MAX_ACTIONS_NUM; j++) {
+ if (flow_actions[j] == 0)
+ break;
+ for (i = 0; i < RTE_DIM(template_actions); i++) {
+ if ((flow_actions[j] & template_actions[i].flow_mask) == 0)
+ continue;
+
+ switch (template_actions[i].type) {
+ case RTE_FLOW_ACTION_TYPE_COUNT:
+ port_attr->nb_counters++;
+ break;
+ case RTE_FLOW_ACTION_TYPE_AGE:
+ port_attr->nb_aging_objects++;
+ break;
+ case RTE_FLOW_ACTION_TYPE_METER:
+ port_attr->nb_meters++;
+ break;
+ case RTE_FLOW_ACTION_TYPE_CONNTRACK:
+ port_attr->nb_conn_tracks++;
+ break;
+ case RTE_FLOW_ACTION_TYPE_QUOTA:
+ port_attr->nb_quotas++;
+ default:
+ break;
+ }
+
+ actions[actions_counter].type = template_actions[i].type;
+ actions[actions_counter].conf = template_actions[i].action_conf;
+ masks[actions_counter].type = template_actions[i].type;
+ masks[actions_counter].conf = template_actions[i].action_mask;
+ *need_wire_orig_table |= template_actions[i].need_wire_orig_table;
+ break;
+ }
+ }
+
+ actions[actions_counter].type = RTE_FLOW_ACTION_TYPE_END;
+ masks[actions_counter].type = RTE_FLOW_ACTION_TYPE_END;
+}
diff --git a/app/test-flow-perf/actions_gen.h b/app/test-flow-perf/actions_gen.h
index 9e13b164f9..cd4d1753bc 100644
--- a/app/test-flow-perf/actions_gen.h
+++ b/app/test-flow-perf/actions_gen.h
@@ -17,9 +17,40 @@
#define RTE_VXLAN_GPE_UDP_PORT 250
#define RTE_GENEVE_UDP_PORT 6081
+/* Compound action data structures (needed by async_flow.c for slot init) */
+
+/* Storage for struct rte_flow_action_raw_encap including external data. */
+struct action_raw_encap_data {
+ struct rte_flow_action_raw_encap conf;
+ uint8_t data[128];
+ uint8_t preserve[128];
+ uint16_t idx;
+};
+
+/* Storage for struct rte_flow_action_raw_decap including external data. */
+struct action_raw_decap_data {
+ struct rte_flow_action_raw_decap conf;
+ uint8_t data[128];
+ uint16_t idx;
+};
+
+/* Storage for struct rte_flow_action_rss including external data. */
+struct action_rss_data {
+ struct rte_flow_action_rss conf;
+ uint8_t key[40];
+ uint16_t queue[128];
+};
+
void fill_actions(struct rte_flow_action *actions, uint64_t *flow_actions,
uint32_t counter, uint16_t next_table, uint16_t hairpinq,
uint64_t encap_data, uint64_t decap_data, uint8_t core_idx,
bool unique_data, uint8_t rx_queues_count, uint16_t dst_port);
+/* Fill actions template for async flow API (types only, no values) */
+void fill_actions_template(struct rte_flow_action *actions, struct rte_flow_action *masks,
+ uint64_t *flow_actions, struct rte_flow_port_attr *port_attr,
+ bool *need_wire_orig_table);
+
+size_t action_conf_size(enum rte_flow_action_type type);
+
#endif /* FLOW_PERF_ACTION_GEN */
diff --git a/app/test-flow-perf/async_flow.c b/app/test-flow-perf/async_flow.c
new file mode 100644
index 0000000000..3ead41393c
--- /dev/null
+++ b/app/test-flow-perf/async_flow.c
@@ -0,0 +1,783 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2026 Maxime Peim <maxime.peim@gmail.com>
+ *
+ * This file contains the async flow API implementation
+ * for the flow-perf application.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <rte_bitops.h>
+#include <rte_common.h>
+#include <rte_ethdev.h>
+#include <rte_flow.h>
+#include <rte_vxlan.h>
+
+#include "actions_gen.h"
+#include "async_flow.h"
+#include "flow_gen.h"
+#include "items_gen.h"
+
+/* Max iterations when draining pending async completions during cleanup */
+#define DRAIN_MAX_ITERATIONS 100
+
+/* Per-port async flow resources */
+static struct async_flow_resources port_resources[MAX_PORTS];
+
+/*
+ * Initialize compound action types within a pre-allocated slot.
+ * Called once per slot during pool init to set up internal pointers
+ * for RSS, RAW_ENCAP, RAW_DECAP and VXLAN_ENCAP actions.
+ */
+static void
+init_slot_compound_actions(struct rte_flow_action *actions, uint32_t n_actions,
+ const size_t *action_conf_sizes)
+{
+ uint32_t i;
+
+ for (i = 0; i < n_actions; i++) {
+ if (action_conf_sizes[i] == 0)
+ continue;
+
+ switch (actions[i].type) {
+ case RTE_FLOW_ACTION_TYPE_RSS: {
+ struct action_rss_data *rss = RTE_CAST_PTR(typeof(rss), actions[i].conf);
+ rss->conf.func = RTE_ETH_HASH_FUNCTION_DEFAULT;
+ rss->conf.level = 0;
+ rss->conf.types = GET_RSS_HF();
+ rss->conf.key_len = sizeof(rss->key);
+ rss->conf.key = rss->key;
+ rss->conf.queue = rss->queue;
+ rss->key[0] = 1;
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: {
+ struct action_raw_encap_data *encap =
+ RTE_CAST_PTR(typeof(encap), actions[i].conf);
+ encap->conf.data = encap->data;
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_RAW_DECAP: {
+ struct action_raw_decap_data *decap =
+ RTE_CAST_PTR(typeof(decap), actions[i].conf);
+ decap->conf.data = decap->data;
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: {
+ /*
+ * Layout within the conf area:
+ * struct rte_flow_action_vxlan_encap
+ * struct rte_flow_item[5]
+ * struct rte_flow_item_eth
+ * struct rte_flow_item_ipv4
+ * struct rte_flow_item_udp
+ * struct rte_flow_item_vxlan
+ */
+ uint8_t *base = RTE_CAST_PTR(typeof(base), actions[i].conf);
+ struct rte_flow_action_vxlan_encap *ve =
+ (struct rte_flow_action_vxlan_encap *)base;
+ struct rte_flow_item *items =
+ (struct rte_flow_item
+ *)(base + sizeof(struct rte_flow_action_vxlan_encap));
+ uint8_t *data = (uint8_t *)(items + 5);
+
+ struct rte_flow_item_eth *item_eth = (struct rte_flow_item_eth *)data;
+ data += sizeof(struct rte_flow_item_eth);
+ struct rte_flow_item_ipv4 *item_ipv4 = (struct rte_flow_item_ipv4 *)data;
+ data += sizeof(struct rte_flow_item_ipv4);
+ struct rte_flow_item_udp *item_udp = (struct rte_flow_item_udp *)data;
+ data += sizeof(struct rte_flow_item_udp);
+ struct rte_flow_item_vxlan *item_vxlan = (struct rte_flow_item_vxlan *)data;
+
+ memset(item_eth, 0, sizeof(*item_eth));
+ memset(item_ipv4, 0, sizeof(*item_ipv4));
+ memset(item_udp, 0, sizeof(*item_udp));
+ memset(item_vxlan, 0, sizeof(*item_vxlan));
+
+ item_ipv4->hdr.src_addr = RTE_IPV4(127, 0, 0, 1);
+ item_ipv4->hdr.version_ihl = RTE_IPV4_VHL_DEF;
+ item_udp->hdr.dst_port = RTE_BE16(RTE_VXLAN_DEFAULT_PORT);
+ item_vxlan->hdr.vni[2] = 1;
+
+ items[0].type = RTE_FLOW_ITEM_TYPE_ETH;
+ items[0].spec = item_eth;
+ items[0].mask = item_eth;
+ items[1].type = RTE_FLOW_ITEM_TYPE_IPV4;
+ items[1].spec = item_ipv4;
+ items[1].mask = item_ipv4;
+ items[2].type = RTE_FLOW_ITEM_TYPE_UDP;
+ items[2].spec = item_udp;
+ items[2].mask = item_udp;
+ items[3].type = RTE_FLOW_ITEM_TYPE_VXLAN;
+ items[3].spec = item_vxlan;
+ items[3].mask = item_vxlan;
+ items[4].type = RTE_FLOW_ITEM_TYPE_END;
+
+ ve->definition = items;
+ break;
+ }
+ default:
+ break;
+ }
+ }
+}
+
+/*
+ * Allocate and pre-initialize all per-slot flat buffers.
+ * Returns 0 on success.
+ */
+static int
+init_slot_pool(struct async_flow_resources *res, uint32_t nb_queues, uint32_t queue_size,
+ const struct rte_flow_item *pattern, const struct rte_flow_action *template_actions)
+{
+ size_t item_spec_sizes[MAX_ITEMS_NUM], action_conf_sizes[MAX_ACTIONS_NUM];
+ uint32_t items_array_bytes, actions_array_bytes;
+ uint32_t spec_data_bytes, conf_data_bytes;
+ uint32_t slot_size, num_slots;
+ uint32_t s, i;
+ uint8_t n_items, n_actions;
+ uint8_t *mptr;
+ int ret = 0;
+
+ /* Compute shared mask size */
+ spec_data_bytes = 0;
+ for (n_items = 0; pattern[n_items].type != RTE_FLOW_ITEM_TYPE_END; n_items++) {
+ item_spec_sizes[n_items] = item_spec_size(pattern[n_items].type);
+ spec_data_bytes += item_spec_sizes[n_items];
+ }
+
+ /* END */
+ item_spec_sizes[n_items++] = 0;
+
+ conf_data_bytes = 0;
+ for (n_actions = 0; template_actions[n_actions].type != RTE_FLOW_ACTION_TYPE_END;
+ n_actions++) {
+ action_conf_sizes[n_actions] = action_conf_size(template_actions[n_actions].type);
+ conf_data_bytes += action_conf_sizes[n_actions];
+ }
+
+ /* END */
+ action_conf_sizes[n_actions++] = 0;
+
+ /* Compute per-slot layout sizes */
+ items_array_bytes = n_items * sizeof(struct rte_flow_item);
+ actions_array_bytes = n_actions * sizeof(struct rte_flow_action);
+
+ slot_size = RTE_ALIGN_CEIL(items_array_bytes + actions_array_bytes + spec_data_bytes +
+ conf_data_bytes,
+ RTE_CACHE_LINE_SIZE);
+
+ num_slots = queue_size * nb_queues;
+
+ /* Store layout info */
+ res->slot_size = slot_size;
+ res->slots_per_queue = queue_size;
+ res->nb_queues = nb_queues;
+ res->n_items = n_items;
+ res->n_actions = n_actions;
+
+ /* Allocate shared masks */
+ if (spec_data_bytes > 0) {
+ res->shared_masks = aligned_alloc(RTE_CACHE_LINE_SIZE, spec_data_bytes);
+ if (res->shared_masks == NULL) {
+ fprintf(stderr, "Failed to allocate shared masks (%u bytes)\n",
+ spec_data_bytes);
+ return -ENOMEM;
+ }
+ memset(res->shared_masks, 0, spec_data_bytes);
+
+ /* Copy mask data from template pattern */
+ mptr = res->shared_masks;
+ for (i = 0; i < n_items; i++) {
+ if (item_spec_sizes[i] > 0 && pattern[i].mask != NULL)
+ memcpy(mptr, pattern[i].mask, item_spec_sizes[i]);
+ mptr += item_spec_sizes[i];
+ }
+ }
+
+ /* Allocate per-slot pool */
+ /* slot_size is already cache-line aligned, so total is a multiple */
+ res->slot_pool = aligned_alloc(RTE_CACHE_LINE_SIZE, (size_t)num_slots * slot_size);
+ if (res->slot_pool == NULL) {
+ fprintf(stderr, "Failed to allocate slot pool (%u slots * %u bytes)\n", num_slots,
+ slot_size);
+ ret = -ENOMEM;
+ goto free_shared_masks;
+ }
+ memset(res->slot_pool, 0, (size_t)num_slots * slot_size);
+
+ /* Pre-initialize every slot */
+ for (s = 0; s < num_slots; s++) {
+ uint8_t *slot = res->slot_pool + (size_t)s * slot_size;
+ struct rte_flow_item *items = (struct rte_flow_item *)slot;
+ struct rte_flow_action *actions =
+ (struct rte_flow_action *)(slot + items_array_bytes);
+ uint8_t *data = slot + items_array_bytes + actions_array_bytes;
+
+ /* Pre-set items: spec → per-slot data, mask → shared masks */
+ mptr = res->shared_masks;
+ for (i = 0; i < n_items; i++) {
+ items[i].type = pattern[i].type;
+ if (item_spec_sizes[i] > 0) {
+ items[i].spec = data;
+ items[i].mask = mptr;
+ data += item_spec_sizes[i];
+ mptr += item_spec_sizes[i];
+ }
+ }
+
+ /* Pre-set actions: conf → per-slot data */
+ for (i = 0; i < n_actions; i++) {
+ actions[i].type = template_actions[i].type;
+ if (action_conf_sizes[i] > 0) {
+ actions[i].conf = data;
+ data += action_conf_sizes[i];
+ }
+ }
+
+ /* Initialize compound action types (RSS, RAW_ENCAP, etc.) */
+ init_slot_compound_actions(actions, n_actions, action_conf_sizes);
+ }
+
+ /* Allocate and initialize per-queue slot tracking */
+ res->queues =
+ aligned_alloc(RTE_CACHE_LINE_SIZE, nb_queues * sizeof(struct async_flow_queue));
+ if (res->queues == NULL) {
+ fprintf(stderr, "Failed to allocate queue structs (%u queues)\n", nb_queues);
+ ret = -ENOMEM;
+ goto free_slot_pool;
+ }
+ memset(res->queues, 0, nb_queues * sizeof(struct async_flow_queue));
+ for (s = 0; s < nb_queues; s++) {
+ res->queues[s].slots = res->slot_pool + (size_t)s * queue_size * slot_size;
+ res->queues[s].head = 0;
+ }
+
+ printf(":: Slot pool: %u slots * %u bytes = %u KB (shared masks: %u bytes)\n", num_slots,
+ slot_size, (num_slots * slot_size) / 1024, spec_data_bytes);
+
+ return 0;
+
+free_slot_pool:
+ free(res->slot_pool);
+ res->slot_pool = NULL;
+free_shared_masks:
+ if (res->shared_masks) {
+ free(res->shared_masks);
+ res->shared_masks = NULL;
+ }
+ return ret;
+}
+
+/*
+ * Hot-path: update per-flow item values through pre-set pointers.
+ * Only IPv4/IPv6 src_addr varies per flow (based on counter).
+ * The spec pointers were pre-wired into per-slot buffers by
+ * init_slot_pool(), so writes here land directly in the slot memory;
+ * all other item types are left untouched.
+ */
+static void
+update_item_values(struct rte_flow_item *items, uint32_t counter)
+{
+ uint8_t i;
+
+ for (i = 0; items[i].type != RTE_FLOW_ITEM_TYPE_END; i++) {
+ switch (items[i].type) {
+ case RTE_FLOW_ITEM_TYPE_IPV4: {
+ struct rte_flow_item_ipv4 *spec = RTE_CAST_PTR(typeof(spec), items[i].spec);
+ spec->hdr.src_addr = RTE_BE32(counter);
+ break;
+ }
+ case RTE_FLOW_ITEM_TYPE_IPV6: {
+ struct rte_flow_item_ipv6 *spec = RTE_CAST_PTR(typeof(spec), items[i].spec);
+ uint8_t j;
+ /* Store the counter big-endian in the low 4 address bytes. */
+ for (j = 0; j < 4; j++)
+ spec->hdr.src_addr.a[15 - j] = counter >> (j * 8);
+ break;
+ }
+ default:
+ break;
+ }
+ }
+}
+
+/*
+ * Hot-path: update per-flow action values through pre-set pointers.
+ * The conf pointers were pre-wired into per-slot buffers by
+ * init_slot_pool(), so all writes land in the slot owned by the
+ * calling queue. Action types with no per-flow variation fall through
+ * the default branch untouched.
+ */
+static void
+update_action_values(struct rte_flow_action *actions, uint32_t counter, uint16_t hairpinq,
+ uint64_t encap_data, uint64_t decap_data, __rte_unused uint8_t core_idx,
+ bool unique_data, uint8_t rx_queues_count, uint16_t dst_port)
+{
+ uint8_t i;
+
+ for (i = 0; actions[i].type != RTE_FLOW_ACTION_TYPE_END; i++) {
+ switch (actions[i].type) {
+ case RTE_FLOW_ACTION_TYPE_MARK: {
+ struct rte_flow_action_mark *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ /* Mark ids cycle through [1, 255]. */
+ conf->id = (counter % 255) + 1;
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_QUEUE: {
+ struct rte_flow_action_queue *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ /* Hairpin queues are numbered after the regular Rx queues. */
+ conf->index = hairpinq ? (counter % hairpinq) + rx_queues_count :
+ counter % rx_queues_count;
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_METER: {
+ struct rte_flow_action_meter *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ conf->mtr_id = counter;
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_RSS: {
+ struct action_rss_data *conf = RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ uint16_t q;
+ /* Spread over either the hairpin queues or the Rx queues. */
+ if (hairpinq) {
+ conf->conf.queue_num = hairpinq;
+ for (q = 0; q < hairpinq; q++)
+ conf->queue[q] = q + rx_queues_count;
+ } else {
+ conf->conf.queue_num = rx_queues_count;
+ for (q = 0; q < rx_queues_count; q++)
+ conf->queue[q] = q;
+ }
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_MAC_DST: {
+ struct rte_flow_action_set_mac *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ uint32_t val = unique_data ? counter : 1;
+ uint8_t j;
+ /* Little-endian byte spread of val over the MAC address. */
+ for (j = 0; j < RTE_ETHER_ADDR_LEN; j++) {
+ conf->mac_addr[j] = val & 0xff;
+ val >>= 8;
+ }
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST: {
+ struct rte_flow_action_set_ipv4 *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ uint32_t ip = unique_data ? counter : 1;
+ conf->ipv4_addr = RTE_BE32(ip + 1);
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC:
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST: {
+ struct rte_flow_action_set_ipv6 *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ uint32_t val = unique_data ? counter : 1;
+ uint8_t j;
+ for (j = 0; j < 16; j++) {
+ conf->ipv6_addr.a[j] = val & 0xff;
+ val >>= 8;
+ }
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_SET_TP_SRC: {
+ struct rte_flow_action_set_tp *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ uint32_t tp = unique_data ? counter : 100;
+ /*
+ * NOTE(review): SRC reduces to 16 bits via modulo while
+ * DST below uses a conditional 16-bit shift — presumably
+ * mirroring the sync path in actions_gen.c; confirm.
+ */
+ tp = tp % 0xffff;
+ conf->port = RTE_BE16(tp & 0xffff);
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_SET_TP_DST: {
+ struct rte_flow_action_set_tp *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ uint32_t tp = unique_data ? counter : 100;
+ if (tp > 0xffff)
+ tp >>= 16;
+ conf->port = RTE_BE16(tp & 0xffff);
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_INC_TCP_ACK:
+ case RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK:
+ case RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ:
+ case RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ: {
+ /* These four actions take a bare big-endian 32-bit value. */
+ rte_be32_t *conf = RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ uint32_t val = unique_data ? counter : 1;
+ *conf = RTE_BE32(val);
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_SET_TTL: {
+ struct rte_flow_action_set_ttl *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ uint32_t val = unique_data ? counter : 1;
+ conf->ttl_value = val % 0xff;
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_SET_IPV4_DSCP:
+ case RTE_FLOW_ACTION_TYPE_SET_IPV6_DSCP: {
+ struct rte_flow_action_set_dscp *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ uint32_t val = unique_data ? counter : 1;
+ conf->dscp = val % 0xff;
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_PORT_ID: {
+ struct rte_flow_action_port_id *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ conf->id = dst_port;
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: {
+ struct action_raw_encap_data *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ /*
+ * Rebuild the encap header stack for this flow and record
+ * its final size. Assumes conf->data can hold
+ * eth + ipv4 + udp headers (sized by action_conf_size()) —
+ * TODO confirm.
+ */
+ uint8_t *header = conf->data;
+ struct rte_ether_hdr eth_hdr;
+ struct rte_ipv4_hdr ipv4_hdr;
+ struct rte_udp_hdr udp_hdr;
+
+ memset(&eth_hdr, 0, sizeof(eth_hdr));
+ if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH)) {
+ if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_VLAN))
+ eth_hdr.ether_type = RTE_BE16(RTE_ETHER_TYPE_VLAN);
+ else if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV4))
+ eth_hdr.ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV4);
+ else if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV6))
+ eth_hdr.ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV6);
+ memcpy(header, &eth_hdr, sizeof(eth_hdr));
+ header += sizeof(eth_hdr);
+ }
+ if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV4)) {
+ uint32_t ip_dst = unique_data ? counter : 1;
+ memset(&ipv4_hdr, 0, sizeof(ipv4_hdr));
+ ipv4_hdr.src_addr = RTE_IPV4(127, 0, 0, 1);
+ ipv4_hdr.dst_addr = RTE_BE32(ip_dst);
+ ipv4_hdr.version_ihl = RTE_IPV4_VHL_DEF;
+ if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_UDP))
+ ipv4_hdr.next_proto_id = 17; /* UDP */
+ if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_GRE))
+ ipv4_hdr.next_proto_id = 47; /* GRE */
+ memcpy(header, &ipv4_hdr, sizeof(ipv4_hdr));
+ header += sizeof(ipv4_hdr);
+ }
+ if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_UDP)) {
+ memset(&udp_hdr, 0, sizeof(udp_hdr));
+ if (encap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_VXLAN))
+ udp_hdr.dst_port = RTE_BE16(RTE_VXLAN_DEFAULT_PORT);
+ memcpy(header, &udp_hdr, sizeof(udp_hdr));
+ header += sizeof(udp_hdr);
+ }
+ conf->conf.size = header - conf->data;
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_RAW_DECAP: {
+ struct action_raw_decap_data *conf =
+ RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ uint8_t *header = conf->data;
+ struct rte_ether_hdr eth_hdr;
+
+ memset(&eth_hdr, 0, sizeof(eth_hdr));
+ if (decap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH)) {
+ if (decap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV4))
+ eth_hdr.ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV4);
+ else if (decap_data & FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_IPV6))
+ eth_hdr.ether_type = RTE_BE16(RTE_ETHER_TYPE_IPV6);
+ memcpy(header, &eth_hdr, sizeof(eth_hdr));
+ header += sizeof(eth_hdr);
+ }
+ conf->conf.size = header - conf->data;
+ break;
+ }
+ case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: {
+ uint8_t *conf = RTE_CAST_PTR(typeof(conf), actions[i].conf);
+ /*
+ * NOTE(review): assumes the encap item array sits directly
+ * after the conf struct and that vitems[1] is the IPv4
+ * item — this layout is produced elsewhere (presumably by
+ * fill_actions_template / init_slot_compound_actions);
+ * confirm against those.
+ */
+ struct rte_flow_item *vitems =
+ (struct rte_flow_item
+ *)(conf + sizeof(struct rte_flow_action_vxlan_encap));
+ struct rte_flow_item_ipv4 *spec =
+ RTE_CAST_PTR(typeof(spec), vitems[1].spec);
+ uint32_t ip_dst = unique_data ? counter : 1;
+ /* vitems[1] is IPV4 */
+ spec->hdr.dst_addr = RTE_BE32(ip_dst);
+ break;
+ }
+ default:
+ break;
+ }
+ }
+}
+
+/*
+ * One-time async (template) flow engine setup for a port: query flow
+ * capabilities, clamp the queue count/size to what the device reports,
+ * configure the port flow queues, create the pattern/actions templates
+ * and the template table, then pre-build the per-queue slot pool used
+ * by async_generate_flow(). Called before rte_eth_dev_start() (see
+ * init_port() in main.c).
+ * Returns 0 on success, a negative errno or -1 on failure; on failure
+ * everything created so far is torn down via the goto cleanup chain.
+ */
+int
+async_flow_init_port(uint16_t port_id, uint32_t nb_queues, uint32_t queue_size,
+ uint64_t *flow_items, uint64_t *flow_actions, uint64_t *flow_attrs,
+ uint8_t flow_group, uint32_t rules_count)
+{
+ struct rte_flow_port_info port_info = {0};
+ struct rte_flow_queue_info queue_info = {0};
+ struct rte_flow_error error = {0};
+ struct rte_flow_port_attr port_attr = {0};
+ struct rte_flow_queue_attr queue_attr;
+ const struct rte_flow_queue_attr **queue_attr_list;
+ struct rte_flow_pattern_template_attr pt_attr = {0};
+ struct rte_flow_actions_template_attr at_attr = {0};
+ struct rte_flow_template_table_attr table_attr = {0};
+ struct rte_flow_item pattern[MAX_ITEMS_NUM];
+ struct rte_flow_action actions[MAX_ACTIONS_NUM];
+ struct rte_flow_action action_masks[MAX_ACTIONS_NUM];
+ struct async_flow_resources *res;
+ bool need_wire_orig_table = false;
+ uint32_t i;
+ int ret;
+
+ if (port_id >= MAX_PORTS)
+ return -1;
+
+ res = &port_resources[port_id];
+ memset(res, 0, sizeof(*res));
+
+ /* Query port flow info */
+ ret = rte_flow_info_get(port_id, &port_info, &queue_info, &error);
+ if (ret != 0) {
+ fprintf(stderr, "Port %u: rte_flow_info_get failed: %s\n", port_id,
+ error.message ? error.message : "(no message)");
+ return ret;
+ }
+
+ if (port_info.max_nb_queues == 0 || queue_info.max_size == 0) {
+ fprintf(stderr, "Port %u: rte_flow_info_get reports that no queues are supported\n",
+ port_id);
+ return -1;
+ }
+
+ /* Limit to device capabilities if reported */
+ /* (0 / UINT32_MAX are treated as "no reported limit") */
+ if (port_info.max_nb_queues != 0 && port_info.max_nb_queues != UINT32_MAX &&
+ nb_queues > port_info.max_nb_queues)
+ nb_queues = port_info.max_nb_queues;
+ if (queue_info.max_size != 0 && queue_info.max_size != UINT32_MAX &&
+ queue_size > queue_info.max_size)
+ queue_size = queue_info.max_size;
+
+ /* Slot ring uses bitmask wrapping, so queue_size must be power of 2 */
+ queue_size = rte_align32prevpow2(queue_size);
+ if (queue_size == 0) {
+ fprintf(stderr, "Port %u: queue_size is 0 after rounding\n", port_id);
+ return -EINVAL;
+ }
+
+ /* Map the requested flow attributes onto the template direction flags. */
+ for (i = 0; i < MAX_ATTRS_NUM; i++) {
+ if (flow_attrs[i] == 0)
+ break;
+ if (flow_attrs[i] & INGRESS)
+ pt_attr.ingress = 1;
+ else if (flow_attrs[i] & EGRESS)
+ pt_attr.egress = 1;
+ else if (flow_attrs[i] & TRANSFER)
+ pt_attr.transfer = 1;
+ }
+ /* Enable relaxed matching for better performance */
+ pt_attr.relaxed_matching = 1;
+
+ memset(pattern, 0, sizeof(pattern));
+ memset(actions, 0, sizeof(actions));
+ memset(action_masks, 0, sizeof(action_masks));
+
+ /* Fill templates and gather per-item/action sizes */
+ fill_items_template(pattern, flow_items, 0, 0);
+
+ at_attr.ingress = pt_attr.ingress;
+ at_attr.egress = pt_attr.egress;
+ at_attr.transfer = pt_attr.transfer;
+
+ fill_actions_template(actions, action_masks, flow_actions, &port_attr,
+ &need_wire_orig_table);
+
+ /*
+ * fill_actions_template counts the number of actions that require each
+ * kind of object, so multiply by the number of rules to get the
+ * correct totals.
+ */
+ port_attr.nb_counters *= rules_count;
+ port_attr.nb_aging_objects *= rules_count;
+ port_attr.nb_meters *= rules_count;
+ port_attr.nb_conn_tracks *= rules_count;
+ port_attr.nb_quotas *= rules_count;
+
+ table_attr.flow_attr.group = flow_group;
+ table_attr.flow_attr.priority = 0;
+ table_attr.flow_attr.ingress = pt_attr.ingress;
+ table_attr.flow_attr.egress = pt_attr.egress;
+ table_attr.flow_attr.transfer = pt_attr.transfer;
+ table_attr.nb_flows = rules_count;
+
+ if (pt_attr.transfer && need_wire_orig_table)
+ table_attr.specialize = RTE_FLOW_TABLE_SPECIALIZE_TRANSFER_WIRE_ORIG;
+
+ /*
+ * All queues share the same attributes; the pointer array is only
+ * needed for the duration of the configure call.
+ */
+ queue_attr_list = malloc(sizeof(*queue_attr_list) * nb_queues);
+ if (queue_attr_list == NULL) {
+ fprintf(stderr, "Port %u: failed to allocate queue_attr_list\n", port_id);
+ return -ENOMEM;
+ }
+
+ queue_attr.size = queue_size;
+ for (i = 0; i < nb_queues; i++)
+ queue_attr_list[i] = &queue_attr;
+
+ ret = rte_flow_configure(port_id, &port_attr, nb_queues, queue_attr_list, &error);
+
+ free(queue_attr_list);
+
+ if (ret != 0) {
+ fprintf(stderr, "Port %u: rte_flow_configure failed (ret=%d, type=%d): %s\n",
+ port_id, ret, error.type, error.message ? error.message : "(no message)");
+ return ret;
+ }
+
+ /* Create pattern template */
+ res->pattern_template =
+ rte_flow_pattern_template_create(port_id, &pt_attr, pattern, &error);
+ if (res->pattern_template == NULL) {
+ fprintf(stderr, "Port %u: pattern template create failed: %s\n", port_id,
+ error.message ? error.message : "(no message)");
+ return -1;
+ }
+
+ /* Create actions template */
+ res->actions_template =
+ rte_flow_actions_template_create(port_id, &at_attr, actions, action_masks, &error);
+ if (res->actions_template == NULL) {
+ fprintf(stderr, "Port %u: actions template create failed: %s\n", port_id,
+ error.message ? error.message : "(no message)");
+ ret = -1;
+ goto free_pattern;
+ }
+
+ /* Create template table */
+ res->table = rte_flow_template_table_create(port_id, &table_attr, &res->pattern_template, 1,
+ &res->actions_template, 1, &error);
+ if (res->table == NULL) {
+ fprintf(stderr, "Port %u: template table create failed: %s\n", port_id,
+ error.message ? error.message : "(no message)");
+ ret = -1;
+ goto free_actions;
+ }
+
+ /* Allocate and pre-initialize per-slot flat buffers */
+ ret = init_slot_pool(res, nb_queues, queue_size, pattern, actions);
+ if (ret != 0) {
+ fprintf(stderr, "Port %u: slot pool init failed\n", port_id);
+ goto free_table;
+ }
+
+ res->table_capacity = rules_count;
+ res->initialized = true;
+
+ printf(":: Port %u: Async flow engine initialized (queues=%u, queue_size=%u)\n", port_id,
+ nb_queues, queue_size);
+
+ return 0;
+
+free_table:
+ rte_flow_template_table_destroy(port_id, res->table, &error);
+ res->table = NULL;
+free_actions:
+ rte_flow_actions_template_destroy(port_id, res->actions_template, &error);
+ res->actions_template = NULL;
+free_pattern:
+ rte_flow_pattern_template_destroy(port_id, res->pattern_template, &error);
+ res->pattern_template = NULL;
+ return ret;
+}
+
+/*
+ * Create one flow rule through the async API using the next
+ * pre-initialized slot of the given queue. Only the per-flow varying
+ * item/action fields are rewritten; the arrays and their spec/mask/conf
+ * pointers were wired once by init_slot_pool().
+ * No locking on the slot ring: assumes a single lcore drives each
+ * queue_id (the caller passes core_id as queue_id) — TODO confirm.
+ * Returns the new flow handle, or NULL with *error set.
+ */
+struct rte_flow *
+async_generate_flow(uint16_t port_id, uint32_t queue_id, uint32_t counter, uint16_t hairpinq,
+ uint64_t encap_data, uint64_t decap_data, uint16_t dst_port, uint8_t core_idx,
+ uint8_t rx_queues_count, bool unique_data, bool postpone,
+ struct rte_flow_error *error)
+{
+ struct async_flow_resources *res;
+ struct async_flow_queue *q;
+ uint8_t *slot;
+ uint32_t idx, items_array_bytes;
+ struct rte_flow_item *items;
+ struct rte_flow_action *actions;
+ struct rte_flow_op_attr op_attr = {
+ .postpone = postpone,
+ };
+
+ if (port_id >= MAX_PORTS) {
+ rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Invalid port ID");
+ return NULL;
+ }
+
+ res = &port_resources[port_id];
+ if (!res->initialized) {
+ rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Async flow resources not initialized");
+ return NULL;
+ }
+
+ if (queue_id >= res->nb_queues) {
+ rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Invalid queue ID");
+ return NULL;
+ }
+
+ /* Pick the next slot from this queue's ring */
+ /* Mask wrap is valid: slots_per_queue is forced to a power of two. */
+ q = &res->queues[queue_id];
+ idx = q->head;
+ q->head = (idx + 1) & (res->slots_per_queue - 1);
+ slot = q->slots + (size_t)idx * res->slot_size;
+ items_array_bytes = res->n_items * sizeof(struct rte_flow_item);
+ items = (struct rte_flow_item *)slot;
+ actions = (struct rte_flow_action *)(slot + items_array_bytes);
+
+ /* Update only per-flow varying values */
+ update_item_values(items, counter);
+ update_action_values(actions, counter, hairpinq, encap_data, decap_data, core_idx,
+ unique_data, rx_queues_count, dst_port);
+
+ /* Single pattern/actions template, hence index 0/0; no user_data. */
+ return rte_flow_async_create(port_id, queue_id, &op_attr, res->table, items, 0, actions, 0,
+ NULL, error);
+}
+
+/*
+ * Tear down per-port async flow resources: drain outstanding queue
+ * operations, destroy the template table and templates, then free the
+ * slot pool and bookkeeping. No-op for ports that were never
+ * initialized. Called from main() after rte_flow_flush().
+ */
+void
+async_flow_cleanup_port(uint16_t port_id)
+{
+ struct async_flow_resources *res;
+ struct rte_flow_error error;
+ struct rte_flow_op_result results[64];
+ int ret, i;
+
+ if (port_id >= MAX_PORTS)
+ return;
+
+ res = &port_resources[port_id];
+ if (!res->initialized)
+ return;
+
+ /* Drain any pending async completions from flow flush */
+ /*
+ * NOTE(review): only queue 0 is pushed/pulled here; completions
+ * pending on other queues are not drained — confirm this is
+ * sufficient after rte_flow_flush().
+ */
+ for (i = 0; i < DRAIN_MAX_ITERATIONS; i++) {
+ rte_flow_push(port_id, 0, &error);
+ ret = rte_flow_pull(port_id, 0, results, 64, &error);
+ if (ret <= 0)
+ break;
+ }
+
+ if (res->table != NULL) {
+ rte_flow_template_table_destroy(port_id, res->table, &error);
+ res->table = NULL;
+ }
+
+ if (res->actions_template != NULL) {
+ rte_flow_actions_template_destroy(port_id, res->actions_template, &error);
+ res->actions_template = NULL;
+ }
+
+ if (res->pattern_template != NULL) {
+ rte_flow_pattern_template_destroy(port_id, res->pattern_template, &error);
+ res->pattern_template = NULL;
+ }
+
+ /* free(NULL) is a no-op, so unconditional frees are safe here. */
+ free(res->queues);
+ res->queues = NULL;
+ free(res->slot_pool);
+ res->slot_pool = NULL;
+ free(res->shared_masks);
+ res->shared_masks = NULL;
+
+ res->initialized = false;
+}
diff --git a/app/test-flow-perf/async_flow.h b/app/test-flow-perf/async_flow.h
new file mode 100644
index 0000000000..8ef8883107
--- /dev/null
+++ b/app/test-flow-perf/async_flow.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2026 Maxime Peim <maxime.peim@gmail.com>
+ *
+ * This file contains the async flow API related definitions
+ * and function declarations.
+ */
+
+#ifndef FLOW_PERF_ASYNC_FLOW
+#define FLOW_PERF_ASYNC_FLOW
+
+#include <rte_flow.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "config.h"
+
+/* Per-queue slot ring — tracks which slot to use next */
+struct async_flow_queue {
+ uint8_t *slots; /* pointer to this queue's region within slot_pool */
+ uint32_t head; /* next slot index (wraps mod slots_per_queue, a power of two) */
+};
+
+/* Per-port async flow resources */
+struct async_flow_resources {
+ struct rte_flow_pattern_template *pattern_template;
+ struct rte_flow_actions_template *actions_template;
+ struct rte_flow_template_table *table;
+ uint8_t *slot_pool; /* flat buffer pool for all slots */
+ uint8_t *shared_masks; /* shared item mask data (one copy for all slots) */
+ struct async_flow_queue *queues;
+ uint32_t slot_size; /* bytes per slot (cache-line aligned) */
+ uint32_t slots_per_queue; /* = queue_size */
+ uint32_t nb_queues;
+ uint32_t table_capacity; /* nb_flows the template table was created with */
+ uint8_t n_items; /* item count (incl. END — counters are post-incremented past END) */
+ uint8_t n_actions; /* action count (incl. END) */
+ bool initialized; /* set once templates, table and slot pool are ready */
+};
+
+/* Initialize async flow engine for a port */
+int async_flow_init_port(uint16_t port_id, uint32_t nb_queues, uint32_t queue_size,
+ uint64_t *flow_items, uint64_t *flow_actions, uint64_t *flow_attrs,
+ uint8_t flow_group, uint32_t rules_count);
+
+/* Create a flow rule asynchronously using pre-allocated slot */
+struct rte_flow *async_generate_flow(uint16_t port_id, uint32_t queue_id, uint32_t counter,
+ uint16_t hairpinq, uint64_t encap_data, uint64_t decap_data,
+ uint16_t dst_port, uint8_t core_idx, uint8_t rx_queues_count,
+ bool unique_data, bool postpone, struct rte_flow_error *error);
+
+/* Cleanup async flow resources for a port */
+void async_flow_cleanup_port(uint16_t port_id);
+
+#endif /* FLOW_PERF_ASYNC_FLOW */
diff --git a/app/test-flow-perf/items_gen.c b/app/test-flow-perf/items_gen.c
index c740e1838f..bd80a2864b 100644
--- a/app/test-flow-perf/items_gen.c
+++ b/app/test-flow-perf/items_gen.c
@@ -389,3 +389,58 @@ fill_items(struct rte_flow_item *items,
items[items_counter].type = RTE_FLOW_ITEM_TYPE_END;
}
+
+/*
+ * Return the size of the spec/mask structure for a flow item type.
+ * Types with no spec (or not handled here) return 0, which makes the
+ * slot layout reserve no data space for them.
+ */
+size_t
+item_spec_size(enum rte_flow_item_type type)
+{
+ switch (type) {
+ case RTE_FLOW_ITEM_TYPE_ETH:
+ return sizeof(struct rte_flow_item_eth);
+ case RTE_FLOW_ITEM_TYPE_VLAN:
+ return sizeof(struct rte_flow_item_vlan);
+ case RTE_FLOW_ITEM_TYPE_IPV4:
+ return sizeof(struct rte_flow_item_ipv4);
+ case RTE_FLOW_ITEM_TYPE_IPV6:
+ return sizeof(struct rte_flow_item_ipv6);
+ case RTE_FLOW_ITEM_TYPE_TCP:
+ return sizeof(struct rte_flow_item_tcp);
+ case RTE_FLOW_ITEM_TYPE_UDP:
+ return sizeof(struct rte_flow_item_udp);
+ case RTE_FLOW_ITEM_TYPE_VXLAN:
+ return sizeof(struct rte_flow_item_vxlan);
+ case RTE_FLOW_ITEM_TYPE_VXLAN_GPE:
+ return sizeof(struct rte_flow_item_vxlan_gpe);
+ case RTE_FLOW_ITEM_TYPE_GRE:
+ return sizeof(struct rte_flow_item_gre);
+ case RTE_FLOW_ITEM_TYPE_GENEVE:
+ return sizeof(struct rte_flow_item_geneve);
+ case RTE_FLOW_ITEM_TYPE_GTP:
+ return sizeof(struct rte_flow_item_gtp);
+ case RTE_FLOW_ITEM_TYPE_META:
+ return sizeof(struct rte_flow_item_meta);
+ case RTE_FLOW_ITEM_TYPE_TAG:
+ return sizeof(struct rte_flow_item_tag);
+ case RTE_FLOW_ITEM_TYPE_ICMP:
+ return sizeof(struct rte_flow_item_icmp);
+ case RTE_FLOW_ITEM_TYPE_ICMP6:
+ return sizeof(struct rte_flow_item_icmp6);
+ default:
+ return 0;
+ }
+}
+
+/*
+ * Build an item array for use as an async pattern template: reuse
+ * fill_items() for types and masks, then drop the spec pointers —
+ * template matching is mask-driven, and per-flow spec values come
+ * later from the per-slot buffers set up by init_slot_pool().
+ */
+void
+fill_items_template(struct rte_flow_item *items, uint64_t *flow_items, uint32_t outer_ip_src,
+ uint8_t core_idx)
+{
+ uint32_t i;
+
+ fill_items(items, flow_items, outer_ip_src, core_idx);
+
+ /* For templates, set spec to NULL - only mask matters for template matching */
+ for (i = 0; items[i].type != RTE_FLOW_ITEM_TYPE_END; i++)
+ items[i].spec = NULL;
+
+ /* END */
+ items[i].spec = NULL;
+}
diff --git a/app/test-flow-perf/items_gen.h b/app/test-flow-perf/items_gen.h
index f4b0e9a981..85d08ef35b 100644
--- a/app/test-flow-perf/items_gen.h
+++ b/app/test-flow-perf/items_gen.h
@@ -15,4 +15,10 @@
void fill_items(struct rte_flow_item *items, uint64_t *flow_items,
uint32_t outer_ip_src, uint8_t core_idx);
+/* Fill items template for async flow API (masks only, no spec values) */
+void fill_items_template(struct rte_flow_item *items, uint64_t *flow_items, uint32_t outer_ip_src,
+ uint8_t core_idx);
+
+size_t item_spec_size(enum rte_flow_item_type type);
+
#endif /* FLOW_PERF_ITEMS_GEN */
diff --git a/app/test-flow-perf/main.c b/app/test-flow-perf/main.c
index 6636d1517f..691020c719 100644
--- a/app/test-flow-perf/main.c
+++ b/app/test-flow-perf/main.c
@@ -37,11 +37,15 @@
#include <rte_mtr.h>
#include <rte_os_shim.h>
-#include "config.h"
#include "actions_gen.h"
+#include "async_flow.h"
+#include "config.h"
#include "flow_gen.h"
+#include "rte_build_config.h"
#define MAX_BATCHES_COUNT 100
+#define MAX_ASYNC_QUEUE_SIZE (1 << 14)
+#define MAX_PULL_RETRIES (1 << 20)
#define DEFAULT_RULES_COUNT 4000000
#define DEFAULT_RULES_BATCH 100000
#define DEFAULT_GROUP 0
@@ -55,7 +59,6 @@
#define HAIRPIN_TX_CONF_LOCKED_MEMORY (0x0100)
#define HAIRPIN_TX_CONF_RTE_MEMORY (0x0200)
-struct rte_flow *flow;
static uint8_t flow_group;
static uint64_t encap_data;
@@ -81,6 +84,9 @@ static bool enable_fwd;
static bool unique_data;
static bool policy_mtr;
static bool packet_mode;
+static bool async_mode;
+static uint32_t async_queue_size = 1024;
+static uint32_t async_push_batch = 256;
static uint8_t rx_queues_count;
static uint8_t tx_queues_count;
@@ -598,6 +604,13 @@ usage(char *progname)
"Encapped data is fixed with pattern: ether,ipv4,udp,vxlan\n"
"With fixed values\n");
printf(" --vxlan-decap: add vxlan_decap action to flow actions\n");
+
+ printf("\nAsync flow API options:\n");
+ printf(" --async: enable async flow API mode\n");
+ printf(" --async-queue-size=N: size of each async queue,"
+ " default is 1024\n");
+ printf(" --async-push-batch=N: flows to batch before push,"
+ " default is 256\n");
}
static void
@@ -734,6 +747,9 @@ args_parse(int argc, char **argv)
{ "policy-mtr", 1, 0, 0 },
{ "meter-profile", 1, 0, 0 },
{ "packet-mode", 0, 0, 0 },
+ { "async", 0, 0, 0 },
+ { "async-queue-size", 1, 0, 0 },
+ { "async-push-batch", 1, 0, 0 },
{ 0, 0, 0, 0 },
};
@@ -913,8 +929,7 @@ args_parse(int argc, char **argv)
rte_exit(EXIT_FAILURE, "Invalid hairpin config mask\n");
hairpin_conf_mask = hp_conf;
}
- if (strcmp(lgopts[opt_idx].name,
- "port-id") == 0) {
+ if (strcmp(lgopts[opt_idx].name, "port-id") == 0) {
uint16_t port_idx = 0;
token = strtok(optarg, ",");
@@ -981,6 +996,26 @@ args_parse(int argc, char **argv)
}
if (strcmp(lgopts[opt_idx].name, "packet-mode") == 0)
packet_mode = true;
+ if (strcmp(lgopts[opt_idx].name, "async") == 0)
+ async_mode = true;
+ if (strcmp(lgopts[opt_idx].name, "async-queue-size") == 0) {
+ n = atoi(optarg);
+ if (n >= MAX_ASYNC_QUEUE_SIZE)
+ async_queue_size = MAX_ASYNC_QUEUE_SIZE;
+ else if (n > 0)
+ async_queue_size = rte_align32prevpow2(n);
+ else
+ rte_exit(EXIT_FAILURE, "async-queue-size should be > 0\n");
+ }
+ if (strcmp(lgopts[opt_idx].name, "async-push-batch") == 0) {
+ n = atoi(optarg);
+ if (n >= MAX_ASYNC_QUEUE_SIZE >> 1)
+ async_push_batch = MAX_ASYNC_QUEUE_SIZE >> 1;
+ else if (n > 0)
+ async_push_batch = rte_align32prevpow2(n);
+ else
+ rte_exit(EXIT_FAILURE, "async-push-batch should be > 0\n");
+ }
break;
default:
usage(argv[0]);
@@ -1457,10 +1492,10 @@ query_flows(int port_id, uint8_t core_id, struct rte_flow **flows_list)
mc_pool.flows_record.query[port_id][core_id] = cpu_time_used;
}
-static struct rte_flow **
-insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id)
+static void
+insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id, struct rte_flow **flows_list)
{
- struct rte_flow **flows_list;
+ struct rte_flow *flow;
struct rte_flow_error error;
clock_t start_batch, end_batch;
double first_flow_latency;
@@ -1485,8 +1520,7 @@ insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id)
global_items[0] = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH);
global_actions[0] = FLOW_ITEM_MASK(RTE_FLOW_ACTION_TYPE_JUMP);
- flows_list = rte_zmalloc("flows_list",
- (sizeof(struct rte_flow *) * (rules_count_per_core + 1)), 0);
+ flows_list = malloc(sizeof(struct rte_flow *) * (rules_count_per_core + 1));
if (flows_list == NULL)
rte_exit(EXIT_FAILURE, "No Memory available!\n");
@@ -1524,6 +1558,11 @@ insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id)
core_id, rx_queues_count,
unique_data, max_priority, &error);
+ if (!flow) {
+ print_flow_error(error);
+ rte_exit(EXIT_FAILURE, "Error in creating flow\n");
+ }
+
if (!counter) {
first_flow_latency = (double) (rte_get_timer_cycles() - start_batch);
first_flow_latency /= rte_get_timer_hz();
@@ -1537,11 +1576,6 @@ insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id)
if (force_quit)
counter = end_counter;
- if (!flow) {
- print_flow_error(error);
- rte_exit(EXIT_FAILURE, "Error in creating flow\n");
- }
-
flows_list[flow_index++] = flow;
/*
@@ -1575,7 +1609,200 @@ insert_flows(int port_id, uint8_t core_id, uint16_t dst_port_id)
port_id, core_id, rules_count_per_core, cpu_time_used);
mc_pool.flows_record.insertion[port_id][core_id] = cpu_time_used;
- return flows_list;
+}
+
+/*
+ * Push enqueued async operations to HW and pull completions for this
+ * core's flow queue. When @empty is set, drain everything still in
+ * flight (@enqueued ops); otherwise pull up to one push batch.
+ * When @check_op_status is set, any non-RTE_FLOW_OP_SUCCESS result is
+ * treated as failure. Returns the number of completions pulled (>= 0)
+ * or a negative value with *error set on failure/timeout.
+ * NOTE(review): the static results[] array is
+ * RTE_MAX_LCORE * MAX_ASYNC_QUEUE_SIZE op results — a sizable BSS
+ * footprint; confirm acceptable.
+ */
+static inline int
+push_pull_flows_async(int port_id, int core_id, uint32_t enqueued, bool empty, bool check_op_status,
+ struct rte_flow_error *error)
+{
+ static struct rte_flow_op_result results[RTE_MAX_LCORE][MAX_ASYNC_QUEUE_SIZE];
+ static uint32_t push_counter[RTE_MAX_LCORE] = {0};
+ uint32_t to_pull = (empty || async_push_batch > enqueued) ? enqueued : async_push_batch;
+ uint32_t pulled_complete = 0;
+ uint32_t retries = 0;
+ int pulled, i;
+ int ret = 0;
+
+ /* Push periodically to give HW work to do */
+ ret = rte_flow_push(port_id, core_id, error);
+ if (ret)
+ return ret;
+ push_counter[core_id]++;
+
+ /* Check if queue is getting full, if so push and drain completions */
+ /*
+ * NOTE(review): push_counter is never reset, so this early return
+ * fires only on the very first call per core — not "periodically"
+ * as the comments suggest. A modulo check may have been intended;
+ * confirm.
+ */
+ if (!empty && push_counter[core_id] == 1)
+ return 0;
+
+ while (to_pull > 0) {
+ pulled = rte_flow_pull(port_id, core_id, results[core_id], to_pull, error);
+ if (pulled < 0) {
+ return -1;
+ } else if (pulled == 0) {
+ /* Nothing completed yet; bounded spin, then time out. */
+ if (++retries > MAX_PULL_RETRIES) {
+ rte_flow_error_set(error, ETIMEDOUT,
+ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+ "Timeout waiting for async completions");
+ return -1;
+ }
+ rte_pause();
+ continue;
+ }
+ retries = 0;
+
+ to_pull -= pulled;
+ pulled_complete += pulled;
+ if (!check_op_status)
+ continue;
+
+ for (i = 0; i < pulled; i++) {
+ if (results[core_id][i].status != RTE_FLOW_OP_SUCCESS) {
+ rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+ NULL, "Some flow rule insertion failed");
+ return -1;
+ }
+ }
+ }
+
+ return pulled_complete;
+}
+
+/*
+ * Async counterpart of insert_flows(): enqueue rule creations with
+ * postpone=1 in batches of async_push_batch, then push/pull to drain
+ * completions, keeping the same per-batch timing records as the sync
+ * path. Each core uses its core_id as its flow queue id.
+ * Exits the application on any creation or completion failure.
+ * flows_list is caller-allocated (flows_handler) with room for
+ * rules_count_per_core + 1 entries.
+ */
+static void
+insert_flows_async(int port_id, uint8_t core_id, uint16_t dst_port_id, struct rte_flow **flows_list)
+{
+ struct rte_flow *flow;
+ struct rte_flow_error error;
+ clock_t start_batch, end_batch;
+ double first_flow_latency;
+ double cpu_time_used;
+ double insertion_rate;
+ double cpu_time_per_batch[MAX_BATCHES_COUNT] = {0};
+ double delta;
+ uint32_t flow_index;
+ uint32_t counter, batch_counter, start_counter = 0, end_counter;
+ int rules_batch_idx;
+ int rules_count_per_core;
+ uint32_t enqueued = 0;
+ bool first_batch = true;
+ int pulled;
+
+ rules_count_per_core = rules_count / mc_pool.cores_count;
+
+ /*
+ * Cap the push batch at half the queue size — presumably to keep
+ * headroom for in-flight operations; confirm.
+ */
+ if (async_push_batch > async_queue_size >> 1)
+ async_push_batch = async_queue_size >> 1;
+
+ /* Set boundaries of rules for each core. */
+ if (core_id)
+ start_counter = core_id * rules_count_per_core;
+ end_counter = (core_id + 1) * rules_count_per_core;
+
+ cpu_time_used = 0;
+ flow_index = 0;
+
+ if (flow_group > 0 && core_id == 0) {
+ /*
+ * Create global rule to jump into flow_group,
+ * this way the app will avoid the default rules.
+ *
+ * This rule will be created only once.
+ *
+ * Global rule:
+ * group 0 eth / end actions jump group <flow_group>
+ *
+ * Created through the synchronous API — only the per-rule
+ * insertions below go through the async path.
+ */
+
+ uint64_t global_items[MAX_ITEMS_NUM] = {0};
+ uint64_t global_actions[MAX_ACTIONS_NUM] = {0};
+ global_items[0] = FLOW_ITEM_MASK(RTE_FLOW_ITEM_TYPE_ETH);
+ global_actions[0] = FLOW_ITEM_MASK(RTE_FLOW_ACTION_TYPE_JUMP);
+ flow = generate_flow(port_id, 0, flow_attrs, global_items, global_actions,
+ flow_group, 0, 0, 0, 0, dst_port_id, core_id, rx_queues_count,
+ unique_data, max_priority, &error);
+
+ if (flow == NULL) {
+ print_flow_error(error);
+ rte_exit(EXIT_FAILURE, "Error in creating flow\n");
+ }
+ flows_list[flow_index++] = flow;
+ }
+
+ start_batch = rte_get_timer_cycles();
+ for (counter = start_counter; counter < end_counter;) {
+ /* batch adding flow rules, this avoids unnecessary checks for push/pull */
+ for (batch_counter = 0; batch_counter < async_push_batch && counter < end_counter;
+ batch_counter++, counter++) {
+ /* Create flow with postpone=true to batch operations */
+ flow = async_generate_flow(port_id, core_id, counter, hairpin_queues_num,
+ encap_data, decap_data, dst_port_id, core_id,
+ rx_queues_count, unique_data, true, &error);
+
+ if (!flow) {
+ print_flow_error(error);
+ rte_exit(EXIT_FAILURE, "Error in creating async flow\n");
+ }
+
+ /*
+ * NOTE(review): breaking here drops the flow just
+ * created — it is never stored in flows_list, so it
+ * cannot be destroyed later; the sync path stores the
+ * flow before handling force_quit. Confirm.
+ */
+ if (force_quit)
+ break;
+
+ flows_list[flow_index++] = flow;
+ enqueued++;
+
+ /*
+ * Save the insertion rate for rules batch.
+ * Check if the insertion reached the rules
+ * batch counter, then save the insertion rate
+ * for this batch.
+ */
+ if (!((counter + 1) % rules_batch)) {
+ end_batch = rte_get_timer_cycles();
+ delta = (double)(end_batch - start_batch);
+ rules_batch_idx = ((counter + 1) / rules_batch) - 1;
+ cpu_time_per_batch[rules_batch_idx] = delta / rte_get_timer_hz();
+ cpu_time_used += cpu_time_per_batch[rules_batch_idx];
+ start_batch = rte_get_timer_cycles();
+ }
+ }
+
+ pulled = push_pull_flows_async(port_id, core_id, enqueued, false, true, &error);
+ if (pulled < 0) {
+ print_flow_error(error);
+ rte_exit(EXIT_FAILURE, "Error push/pull async operations\n");
+ }
+
+ enqueued -= pulled;
+
+ if (first_batch) {
+ first_flow_latency = (double)(rte_get_timer_cycles() - start_batch);
+ first_flow_latency /= rte_get_timer_hz();
+ /* In millisecond */
+ first_flow_latency *= 1000;
+ printf(":: First Flow Batch Latency (Async) :: Port %d :: First batch (%u) "
+ "installed in %f milliseconds\n",
+ port_id, async_push_batch, first_flow_latency);
+ first_batch = false;
+ }
+ }
+
+ /* Final drain: empty=true pulls every remaining in-flight op. */
+ if (push_pull_flows_async(port_id, core_id, enqueued, true, true, &error) < 0) {
+ print_flow_error(error);
+ rte_exit(EXIT_FAILURE, "Error final push/pull async operations\n");
+ }
+
+ /* Print insertion rates for all batches */
+ if (dump_iterations)
+ print_rules_batches(cpu_time_per_batch);
+
+ printf(":: Port %d :: Core %d boundaries (Async) :: start @[%d] - end @[%d]\n", port_id,
+ core_id, start_counter, end_counter - 1);
+
+ /* Insertion rate for all rules in one core */
+ if (cpu_time_used > 0) {
+ insertion_rate = ((double)rules_count_per_core / cpu_time_used) / 1000;
+ printf(":: Port %d :: Core %d :: Async rules insertion rate -> %f K Rule/Sec\n",
+ port_id, core_id, insertion_rate);
+ }
+ printf(":: Port %d :: Core %d :: The time for creating %d async rules is %f seconds\n",
+ port_id, core_id, rules_count_per_core, cpu_time_used);
+
+ mc_pool.flows_record.insertion[port_id][core_id] = cpu_time_used;
+}
static void
@@ -1585,12 +1812,18 @@ flows_handler(uint8_t core_id)
uint16_t port_idx = 0;
uint16_t nr_ports;
int port_id;
+ int rules_count_per_core;
nr_ports = rte_eth_dev_count_avail();
if (rules_batch > rules_count)
rules_batch = rules_count;
+ rules_count_per_core = rules_count / mc_pool.cores_count;
+ flows_list = malloc(sizeof(struct rte_flow *) * (rules_count_per_core + 1));
+ if (flows_list == NULL)
+ rte_exit(EXIT_FAILURE, "No Memory available!\n");
+
printf(":: Rules Count per port: %d\n\n", rules_count);
for (port_id = 0; port_id < nr_ports; port_id++) {
@@ -1602,10 +1835,10 @@ flows_handler(uint8_t core_id)
mc_pool.last_alloc[core_id] = (int64_t)dump_socket_mem(stdout);
if (has_meter())
meters_handler(port_id, core_id, METER_CREATE);
- flows_list = insert_flows(port_id, core_id,
- dst_ports[port_idx++]);
- if (flows_list == NULL)
- rte_exit(EXIT_FAILURE, "Error: Insertion Failed!\n");
+ if (async_mode)
+ insert_flows_async(port_id, core_id, dst_ports[port_idx++], flows_list);
+ else
+ insert_flows(port_id, core_id, dst_ports[port_idx++], flows_list);
mc_pool.current_alloc[core_id] = (int64_t)dump_socket_mem(stdout);
if (query_flag)
@@ -2212,6 +2445,16 @@ init_port(void)
}
}
+ /* Configure async flow engine before device start */
+ if (async_mode) {
+ ret = async_flow_init_port(port_id, mc_pool.cores_count, async_queue_size,
+ flow_items, flow_actions, flow_attrs, flow_group,
+ rules_count);
+ if (ret != 0)
+ rte_exit(EXIT_FAILURE, "Failed to init async flow on port %d\n",
+ port_id);
+ }
+
ret = rte_eth_dev_start(port_id);
if (ret < 0)
rte_exit(EXIT_FAILURE,
@@ -2291,6 +2534,8 @@ main(int argc, char **argv)
RTE_ETH_FOREACH_DEV(port) {
rte_flow_flush(port, &error);
+ if (async_mode)
+ async_flow_cleanup_port(port);
if (rte_eth_dev_stop(port) != 0)
printf("Failed to stop device on port %u\n", port);
rte_eth_dev_close(port);
diff --git a/app/test-flow-perf/meson.build b/app/test-flow-perf/meson.build
index e101449e32..70d8671a54 100644
--- a/app/test-flow-perf/meson.build
+++ b/app/test-flow-perf/meson.build
@@ -1,8 +1,15 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright(c) 2020 Mellanox Technologies, Ltd
+if is_windows
+ build = false
+ reason = 'not supported on Windows'
+ subdir_done()
+endif
+
sources = files(
'actions_gen.c',
+ 'async_flow.c',
'flow_gen.c',
'items_gen.c',
'main.c',
--
2.43.0
^ permalink raw reply related [flat|nested] 8+ messages in thread
end of thread, other threads:[~2026-03-10 7:42 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-02-24 10:56 [PATCH] test/flow: add support for async API Maxime Peim
2026-02-25 22:23 ` Stephen Hemminger
2026-03-01 23:29 ` [PATCH v2] " Maxime Peim
2026-03-01 23:36 ` Maxime Peim
2026-03-02 0:52 ` Stephen Hemminger
2026-03-02 10:57 ` [PATCH v3] " Maxime Peim
2026-03-02 14:35 ` [PATCH v4] " Maxime Peim
2026-03-09 12:52 ` [PATCH v5] " Maxime Peim
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox