DPDK-dev Archive on lore.kernel.org

DPDK-dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [PATCH v8 00/23] net/sxe2: added Linkdata sxe2 ethernet driver
From: Stephen Hemminger @ 2026-06-25 22:50 UTC (permalink / raw)
  To: liujie5; +Cc: dev
In-Reply-To: <20260625131936.205040-1-liujie5@linkdatatechnology.com>

On Thu, 25 Jun 2026 21:19:36 +0800
liujie5@linkdatatechnology.com wrote:

> From: Jie Liu <liujie5@linkdatatechnology.com>
> 
> This patch set implements core functionality for the SXE2 PMD,
> including basic driver framework, data path setup, and advanced
> offload features (VLAN, RSS,TM, PTP etc.).
> 
> v8:
> 	Restore the flow-duplicate-pattern parameter
> 
> Jie Liu (23):
>   net/sxe2: remove software statistics devargs
>   net/sxe2: add Rx framework and packet types callback
>   net/sxe2: support AVX512 vectorized path for Rx and Tx
>   net/sxe2: add AVX2 vector data path for Rx and Tx
>   net/sxe2: add link update callback
>   net/sxe2: support L2 filtering and MAC config
>   drivers: support RSS feature
>   net/sxe2: support TM hierarchy and shaping
>   net/sxe2: support IPsec inline protocol offload
>   net/sxe2: support statistics and multi-process
>   drivers: interrupt handling
>   net/sxe2: add NEON vec Rx/Tx burst functions
>   drivers: add support for VF representors
>   net/sxe2: add support for custom UDP tunnel ports
>   net/sxe2: support firmware version reading
>   net/sxe2: implement get monitor address
>   common/sxe2: add shared SFP module definitions
>   net/sxe2: support SFP module info and EEPROM access
>   net/sxe2: add mbuf validation in Tx debug mode
>   common/sxe2: add callback for memory event handling
>   net/sxe2: add private devargs parsing
>   net/sxe2: implement private dump info
>   net/sxe2: update sxe2 feature matrix docs
> 
>  doc/guides/nics/features/sxe2.ini          |   56 +
>  doc/guides/nics/sxe2.rst                   |  168 ++
>  drivers/common/sxe2/sxe2_common.c          |  156 ++
>  drivers/common/sxe2/sxe2_common.h          |    4 +
>  drivers/common/sxe2/sxe2_flow_public.h     |  633 +++++++
>  drivers/common/sxe2/sxe2_ioctl_chnl.c      |  178 +-
>  drivers/common/sxe2/sxe2_ioctl_chnl_func.h |   18 +
>  drivers/common/sxe2/sxe2_msg.h             |  118 ++
>  drivers/net/sxe2/meson.build               |   52 +
>  drivers/net/sxe2/sxe2_cmd_chnl.c           | 1587 +++++++++++++++-
>  drivers/net/sxe2/sxe2_cmd_chnl.h           |  139 ++
>  drivers/net/sxe2/sxe2_drv_cmd.h            |  523 +++++-
>  drivers/net/sxe2/sxe2_dump.c               |  287 +++
>  drivers/net/sxe2/sxe2_dump.h               |   12 +
>  drivers/net/sxe2/sxe2_ethdev.c             | 1486 ++++++++++++++-
>  drivers/net/sxe2/sxe2_ethdev.h             |  111 +-
>  drivers/net/sxe2/sxe2_ethdev_repr.c        |  609 ++++++
>  drivers/net/sxe2/sxe2_ethdev_repr.h        |   32 +
>  drivers/net/sxe2/sxe2_filter.c             |  895 +++++++++
>  drivers/net/sxe2/sxe2_filter.h             |  100 +
>  drivers/net/sxe2/sxe2_flow.c               | 1394 ++++++++++++++
>  drivers/net/sxe2/sxe2_flow.h               |   30 +
>  drivers/net/sxe2/sxe2_flow_define.h        |  144 ++
>  drivers/net/sxe2/sxe2_flow_parse_action.c  | 1182 ++++++++++++
>  drivers/net/sxe2/sxe2_flow_parse_action.h  |   23 +
>  drivers/net/sxe2/sxe2_flow_parse_engine.c  |  106 ++
>  drivers/net/sxe2/sxe2_flow_parse_engine.h  |   13 +
>  drivers/net/sxe2/sxe2_flow_parse_pattern.c | 1935 +++++++++++++++++++
>  drivers/net/sxe2/sxe2_flow_parse_pattern.h |   46 +
>  drivers/net/sxe2/sxe2_ipsec.c              | 1565 ++++++++++++++++
>  drivers/net/sxe2/sxe2_ipsec.h              |  254 +++
>  drivers/net/sxe2/sxe2_irq.c                | 1026 ++++++++++
>  drivers/net/sxe2/sxe2_irq.h                |   25 +
>  drivers/net/sxe2/sxe2_mac.c                |  530 ++++++
>  drivers/net/sxe2/sxe2_mac.h                |   84 +
>  drivers/net/sxe2/sxe2_mp.c                 |  414 +++++
>  drivers/net/sxe2/sxe2_mp.h                 |   67 +
>  drivers/net/sxe2/sxe2_queue.c              |   17 +-
>  drivers/net/sxe2/sxe2_queue.h              |   15 +-
>  drivers/net/sxe2/sxe2_rss.c                |  584 ++++++
>  drivers/net/sxe2/sxe2_rss.h                |   81 +
>  drivers/net/sxe2/sxe2_rx.c                 |   93 +-
>  drivers/net/sxe2/sxe2_rx.h                 |    2 +
>  drivers/net/sxe2/sxe2_security.c           |  335 ++++
>  drivers/net/sxe2/sxe2_security.h           |   77 +
>  drivers/net/sxe2/sxe2_stats.c              |  586 ++++++
>  drivers/net/sxe2/sxe2_stats.h              |   39 +
>  drivers/net/sxe2/sxe2_switchdev.c          |  332 ++++
>  drivers/net/sxe2/sxe2_switchdev.h          |   33 +
>  drivers/net/sxe2/sxe2_tm.c                 | 1151 ++++++++++++
>  drivers/net/sxe2/sxe2_tm.h                 |   76 +
>  drivers/net/sxe2/sxe2_tx.c                 |    7 +
>  drivers/net/sxe2/sxe2_txrx.c               | 1958 +++++++++++++++++++-
>  drivers/net/sxe2/sxe2_txrx.h               |    8 +
>  drivers/net/sxe2/sxe2_txrx_check_mbuf.c    |  595 ++++++
>  drivers/net/sxe2/sxe2_txrx_check_mbuf.h    |   38 +
>  drivers/net/sxe2/sxe2_txrx_poll.c          |  284 ++-
>  drivers/net/sxe2/sxe2_txrx_vec.c           |   46 +-
>  drivers/net/sxe2/sxe2_txrx_vec.h           |   38 +-
>  drivers/net/sxe2/sxe2_txrx_vec_avx2.c      |  747 ++++++++
>  drivers/net/sxe2/sxe2_txrx_vec_avx512.c    |  867 +++++++++
>  drivers/net/sxe2/sxe2_txrx_vec_common.h    |   54 +-
>  drivers/net/sxe2/sxe2_txrx_vec_neon.c      |  689 +++++++
>  drivers/net/sxe2/sxe2_txrx_vec_sse.c       |   38 +-
>  drivers/net/sxe2/sxe2_vsi.c                |  146 ++
>  drivers/net/sxe2/sxe2_vsi.h                |   12 +-
>  drivers/net/sxe2/sxe2vf_regs.h             |   85 +
>  67 files changed, 24762 insertions(+), 273 deletions(-)
>  create mode 100644 drivers/common/sxe2/sxe2_flow_public.h
>  create mode 100644 drivers/common/sxe2/sxe2_msg.h
>  create mode 100644 drivers/net/sxe2/sxe2_dump.c
>  create mode 100644 drivers/net/sxe2/sxe2_dump.h
>  create mode 100644 drivers/net/sxe2/sxe2_ethdev_repr.c
>  create mode 100644 drivers/net/sxe2/sxe2_ethdev_repr.h
>  create mode 100644 drivers/net/sxe2/sxe2_filter.c
>  create mode 100644 drivers/net/sxe2/sxe2_filter.h
>  create mode 100644 drivers/net/sxe2/sxe2_flow.c
>  create mode 100644 drivers/net/sxe2/sxe2_flow.h
>  create mode 100644 drivers/net/sxe2/sxe2_flow_define.h
>  create mode 100644 drivers/net/sxe2/sxe2_flow_parse_action.c
>  create mode 100644 drivers/net/sxe2/sxe2_flow_parse_action.h
>  create mode 100644 drivers/net/sxe2/sxe2_flow_parse_engine.c
>  create mode 100644 drivers/net/sxe2/sxe2_flow_parse_engine.h
>  create mode 100644 drivers/net/sxe2/sxe2_flow_parse_pattern.c
>  create mode 100644 drivers/net/sxe2/sxe2_flow_parse_pattern.h
>  create mode 100644 drivers/net/sxe2/sxe2_ipsec.c
>  create mode 100644 drivers/net/sxe2/sxe2_ipsec.h
>  create mode 100644 drivers/net/sxe2/sxe2_irq.c
>  create mode 100644 drivers/net/sxe2/sxe2_mac.c
>  create mode 100644 drivers/net/sxe2/sxe2_mac.h
>  create mode 100644 drivers/net/sxe2/sxe2_mp.c
>  create mode 100644 drivers/net/sxe2/sxe2_mp.h
>  create mode 100644 drivers/net/sxe2/sxe2_rss.c
>  create mode 100644 drivers/net/sxe2/sxe2_rss.h
>  create mode 100644 drivers/net/sxe2/sxe2_security.c
>  create mode 100644 drivers/net/sxe2/sxe2_security.h
>  create mode 100644 drivers/net/sxe2/sxe2_stats.c
>  create mode 100644 drivers/net/sxe2/sxe2_stats.h
>  create mode 100644 drivers/net/sxe2/sxe2_switchdev.c
>  create mode 100644 drivers/net/sxe2/sxe2_switchdev.h
>  create mode 100644 drivers/net/sxe2/sxe2_tm.c
>  create mode 100644 drivers/net/sxe2/sxe2_tm.h
>  create mode 100644 drivers/net/sxe2/sxe2_txrx_check_mbuf.c
>  create mode 100644 drivers/net/sxe2/sxe2_txrx_check_mbuf.h
>  create mode 100644 drivers/net/sxe2/sxe2_txrx_vec_avx2.c
>  create mode 100644 drivers/net/sxe2/sxe2_txrx_vec_avx512.c
>  create mode 100644 drivers/net/sxe2/sxe2_txrx_vec_neon.c
>  create mode 100644 drivers/net/sxe2/sxe2vf_regs.h

Overall this is close, but I don't think the flow duplicate flag you
added matches mlx5 as closely as you think. Adding a new variant makes
sense, but try to keep the original values matching the other driver.
My intent here is to have drivers diverge as little as possible;
it makes things simpler for real-world applications.

The AI chat review showed:

[PATCH v8 00/23] sxe2 driver feature additions

The v6 issues are resolved cleanly, except the flow-duplicate-pattern
alignment didn't go in the direction your "align with mlx5" guidance
intended.

Resolved from v6:

- The mbox is now in correct numeric order (06 before 07). git am
  bundle.mbox applies cleanly. The v6 bundle-order break is gone.

- All 23 commits build end-to-end at every commit. The dump-info patch
  was moved from position 19 to 22 - it now runs after devargs parsing,
  which makes its references to devargs fields consistent. sxe2_dump_fc_state()
  also correctly moved out of the devargs patch into the dump-info patch
  where it belongs.

- The v6 flow_dup_pattern_mode orphan situation is fixed. The field is
  now actually written by the new sxe2_parse_flow_dup_pattern_mode()
  parser, properly defaulted to SXE2_FLOW_SW_PATTERN_LAST in init, and
  read at sxe2_flow.c:310 to populate switch_pattern_dup_allow in flow
  metadata. The struct field, its writer, its reader, and its hardware-
  programming path are all consistent now.

- sxe2_parse_no_sched_mode was simplified to call the existing
  sxe2_parse_bool helper instead of duplicating strtoul boilerplate. Nice
  cleanup.

The flow-duplicate-pattern semantics:

This is the v8 redesign in response to "align to mlx5". What it does:

  doc/guides/nics/sxe2.rst, the option:
    0 = reject duplicates with EEXIST
    1 = allow; last-added rule active (LIFO)        <-- default
    2 = allow; first-added rule active (FIFO)

mlx5's equivalent allow_duplicate_pattern is a {0,1} boolean. Its
value=1 means "all rules are inserted but only the first rule takes
effect, the next rule takes effect only if the previous rules are
deleted." That is FIFO. So:

  mlx5 allow_duplicate_pattern=0  ==  sxe2 flow-duplicate-pattern=0
  mlx5 allow_duplicate_pattern=1  ==  sxe2 flow-duplicate-pattern=2 (FIFO)
  (no mlx5 equivalent)            ==  sxe2 flow-duplicate-pattern=1 (LIFO)

This is the worst possible alignment: sxe2 picked the same value range
{0,1} mlx5 has, picked the same name root, picked similar documentation
wording, and made the same value 1 mean the opposite hardware behaviour.
A user who runs the same `-a 0000:xx:xx.x,flow-duplicate-pattern=1` on
both PMDs gets FIFO on mlx5 and LIFO on sxe2 - inverted semantics under
identical names.

If the hardware genuinely supports both LIFO and FIFO and exposing both
is valuable, then naming and defaults should not visually parallel mlx5.
Suggested resolutions, in order of preference:

  (a) Drop LIFO. Document and implement only the two modes mlx5 has:
      0 = reject, 1 = FIFO (mlx5's value=1 semantic). Default 1. The
      name allow_duplicate_pattern (mlx5's spelling) becomes valid -
      same parameter, same values, same defaults across PMDs.

  (b) Keep both modes, but rename to make the meaning explicit and
      non-overlapping with mlx5:
        flow-duplicate-policy=reject|fifo|lifo  (string-valued)
      with reject as default. mlx5 keeps its short boolean; sxe2 has
      its own tri-state with non-aliasing values. This is clear but
      slightly more typing for users.

  (c) Keep current name and values but make value=1 mean FIFO (matching
      mlx5) and value=2 mean LIFO. The name's spelling still differs
      from mlx5's, but at least the integer-to-semantic mapping matches.

Personal preference is (a). The LIFO mode is unusual enough that I would
want to see a concrete use case before accepting it into a public
devarg. If the hardware queues rules in an order applications can
observe, that ordering is an rte_flow priority concern; the devarg layer
shouldn't be encoding it.

A separate small issue in the error-message ternary - sxe2_flow.c:809
prints "Duplicate flow pattern." when mode is non-zero and "Duplicate
flow pattern is not allowed." when mode is zero. But the EEXIST error
path should only fire when the policy is to reject, which is exactly
mode 0. In modes 1 and 2, the hardware accepts the duplicate as a
shadow/active rule; this error path should be unreachable. Either the
ternary's first branch is dead code (in which case remove it and the
ternary; print only the rejection message), or the rejection can
happen in modes 1/2 too (in which case the message must explain why -
table full, action conflict, etc - and the current wording is wrong
in both branches).

Otherwise this revision is in good shape. Once flow-duplicate-pattern
alignment lands one way or another, ready to merge.

^ permalink raw reply

* Re: [PATCH v8 0/4] net/zxdh: optimize Rx/Tx path performance
From: Stephen Hemminger @ 2026-06-25 22:42 UTC (permalink / raw)
  To: Junlong Wang; +Cc: dev
In-Reply-To: <20260625120317.211780-1-wang.junlong1@zte.com.cn>

On Thu, 25 Jun 2026 20:03:12 +0800
Junlong Wang <wang.junlong1@zte.com.cn> wrote:

> v8:
>   - Add a check of the size of ZXDH_DL_NET_HDR_SIZE and RTE_PKTMBUF_HEADROOM in
>     zxdh_xmit_pkts_simple() before submitting. Add static_assert to reject builds with insufficient
>     default headroom at compile time.
> 
> v7:
>   - Add a new xmit prepare func for xmit_pkts_simple, which checks the size of
>     ZXDH_DL_NET_HDR_SIZE and RTE_PKTMBUF_HEADROOM.
> 
> v6:
>   - Remove unnecessary error checking code in submit_to_backend_simple() and
>     pkt_padding(). Since as the max dl_net_hdr_len is always less than
>     RTE_PKTMBUF_HEADROOM, rte_pktmbuf_prepend() cannot fail in the
>     simple path (single-segment mbufs).
> v5:
>   - Reorganize patch series, placing interrupt fix as the first patch
>     and fix condition check to properly enable interrupts.
>   - Fix zxdh_recv_single_pkts() not compacting rcv_pkts[] on failure,
>     which could cause use-after-free and mbuf leak.
>   - Fix tx_bunch() and tx1() missing store barrier before setting AVAIL flag,
>     preventing data race on weakly-ordered architectures.
>   - Fix submit_to_backend_simple() writing descriptors for packets that
>     failed pkt_padding(), causing mbuf leak.
> v4:
>   - fix some AI review issues.
>   - fix queue enable intr bug.
> v3:
>   - remove unnecessary NULL check in zxdh_init_queue.
>   - Split Ring: Bit[31] is unused and reserved, zxdh_queue_notify(): removing the
>     zxdh_pci_with_feature(hw, ZXDH_F_RING_PACKED) check;
>   - remove unnecessary double-free in zxdh_recv_single_pkts();
>   - used rte_pktmbuf_mtod();
>   - remove rxq_get_vq(q) macro, use q->vq and apply it consistently;
>   - Refactoring scatter and mtu check logic in zxdh_dev_mtu_set();
>   - set txdp->id = avail_idx + i in tx_bunch/tx1.
>   - add comment documenting zxdh_xmit_enqueue_append() now sets dxp->cookie = NULL for
>     the head slot and stores cookies per descriptor via dep[idx].cookie.
>   - add one-line comment noting tx_bunch() is the simple path handles single-segment.
>   - remove unnecessary Extra initialization and the uint32_t cast.
> v2:
>   - zxdh_rxtx.c, pkt_padding(): modifyed the return value of pkt_padding();
>   - zxdh_rxtx.c, zxdh_recv_single_pkts(): modifyed When zxdh_init_mbuf() fails
>     the loop does "continue" and free mbufs;
>   - zxdh_rxtx.c, refill_desc_unwrap(): Add rte_io_wmb() before writing flags
>     in the refill_que_descs();
>   - zxdh_queue.h, zxdh_queue_enable_intr(): Remove unnecessary function of zxdh_queue_enable_intr;
>   - zxdh_ethdev.c, zxdh_init_queue(): changed the hdr_mz NULL check logic;
>   - zxdh_rxtx.c, zxdh_xmit_pkts_simple()、zxdh_recv_single_pkts(): add stats.bytes count;
>   - zxdh_rxtx.c, zxdh_init_mbuf():remove  rte_pktmbuf_dump(stdout, rxm, 40);
>   - zxdh_ethdev.c, zxdh_dev_free_mbufs(): using rte_pktmbuf_free() to free mbufs;
>   - Splitting into separate patches, structure reorganization and sw_ring removal、
>     RX recv optimize、Tx xmit optimize、Tx;
> v1:
>   This patch optimizes the ZXDH PMD's receive and transmit path for better
>   performance through several improvements:
> - Add simple TX/RX burst functions (zxdh_xmit_pkts_simple and
>   zxdh_recv_single_pkts) for single-segment packet scenarios.
> - Remove RX software ring (sw_ring) to reduce memory allocation and
>   copy.
> - Optimize descriptor management with prefetching and simplified
>   cleanup.
> - Reorganize structure fields for better cache locality.
> 
>   These changes reduce CPU cycles and memory bandwidth consumption,
>   resulting in improved packet processing throughput.
> 
> Junlong Wang (4):
>   net/zxdh: fix queue enable intr issues
>   net/zxdh: optimize queue structure to improve performance
>   net/zxdh: optimize Rx recv pkts performance
>   net/zxdh: optimize Tx xmit pkts performance
> 
>  drivers/net/zxdh/zxdh_ethdev.c     |  83 ++--
>  drivers/net/zxdh/zxdh_ethdev_ops.c |  23 +-
>  drivers/net/zxdh/zxdh_ethdev_ops.h |   4 +
>  drivers/net/zxdh/zxdh_pci.c        |   2 +-
>  drivers/net/zxdh/zxdh_queue.c      |  11 +-
>  drivers/net/zxdh/zxdh_queue.h      | 122 +++---
>  drivers/net/zxdh/zxdh_rxtx.c       | 591 +++++++++++++++++++++++------
>  drivers/net/zxdh/zxdh_rxtx.h       |  29 +-
>  8 files changed, 604 insertions(+), 261 deletions(-)
> 

AI spotted one bug but it undervalued how serious the issue is.
AI also gave bad advice about this earlier. Here is my analysis
followed by the buggy AI review, then what it said after I pointed out
its mistake. Sorry, I need to not trust AI; it's good until it isn't.

The semantics of how DPDK drivers are supposed to handle bad packets
in transmit is not well documented. The correct way to handle a packet
that cannot be sent is for the driver to consume it (free in tx_burst),
increment an error counter, and keep going. Suppose the application
gave a single buggy mbuf (index 3) in a burst of 16 packets (assuming
there was space in the transmit ring).
The driver should return 16 (all packets consumed) but the
statistics should reflect what happened: 15 packets transmitted, 1 tx error.

The problem with a short return is that the application cannot tell
the difference between backpressure (transmit ring full) and a bad packet.
An application may think it sees backpressure and retransmit the bad packet.

--- AI review (buggy) ---

Series review: net/zxdh Rx/Tx optimization (v8)

The v7 issue is resolved. The simple Tx burst no longer depends on the
optional tx_prepare for memory safety: zxdh_xmit_pkts_simple() now
filters mbufs with data_off < hw->dl_net_hdr_len before any packet
reaches pkt_padding(), stopping at the first short-headroom packet and
returning the count enqueued so far, so the caller keeps ownership of
the rest. The threshold matches exactly what pkt_padding() prepends,
and submit_to_backend_simple()/pkt_padding() are only reachable through
that filter, so the open-coded prepend can no longer underflow data_off
or write before buf_addr. The build-time backstop

	static_assert(RTE_PKTMBUF_HEADROOM >= ZXDH_DL_NET_HDR_SIZE, ...)

is also in place (static_assert resolves via rte_common.h).

Patches 1-3 are unchanged from v7 and remain correct: the enable_intr
fix is first in the series against the original field name so it
backports cleanly; the Tx AVAIL flag is published through
zxdh_queue_store_flags_packed() (rte_io_wmb); multi-segment Tx frees
each segment once via free_seg with the head descriptor's NULL cookie
holding the hardware header; and zxdh_recv_single_pkts() compacts
surviving mbufs.

Info (non-blocking, no need to respin):

In the new headroom filter, the whole returned tail is counted as
errors:

	if (unlikely(tx_pkts[i]->data_off < hdr_len)) {
		txvq->stats.errors += nb_pkts - i;
		nb_pkts = i;
		break;
	}

Only tx_pkts[i] is known to be short; the packets after it are not
examined, just deferred, and the application will resubmit them on the
next burst. Counting the entire tail as errors inflates the error stat
(and a packet that is eventually sent fine can be counted as an error
on an earlier burst where it sat behind a short-headroom packet).
Counting only the rejected packet, or tracking the deferred tail
separately from errors, would be more accurate. Not worth a respin on
its own.

--- AI review (corrected) ---

Series review: net/zxdh Rx/Tx optimization (v8)

The v6/v7 out-of-bounds write is fixed: zxdh_xmit_pkts_simple() now
checks data_off < hw->dl_net_hdr_len before any packet reaches the
open-coded prepend in pkt_padding(), and the static_assert backstop is
in place. Memory safety is no longer the issue. How the unsendable
packet is handled is.

Patches 1-3 are unchanged from v7 and remain correct.

[PATCH v8 4/4] net/zxdh: optimize Tx xmit pkts performance

Error: the simple Tx burst signals a bad packet with a short return,
which the application cannot distinguish from backpressure.

	for (i = 0; i < nb_pkts; i++) {
		rte_prefetch0(tx_pkts[i]);
		if (unlikely(tx_pkts[i]->data_off < hdr_len)) {
			txvq->stats.errors += nb_pkts - i;
			nb_pkts = i;
			break;
		}
	}

A short return from tx_burst is the backpressure signal (transmit ring
full, retry later). Here it is also used to mean "packet i is bad",
and the bad mbuf is left owned by the caller. The application has no
way to tell the two apart: the usual

	for (sent = 0; sent < n; )
		sent += rte_eth_tx_burst(port, q, &pkts[sent], n - sent);

loop treats the short return as backpressure and resubmits pkts[i],
which fails again every time -- head-of-line blocking, and the good
packets after i (which had ring space) never go out.

A packet that cannot be sent must be consumed by the driver, not
handed back. Free it in tx_burst, increment the tx error counter, and
continue with the rest of the burst. For a burst of 16 where only
index 3 is bad and the ring has room, tx_burst should return 16, with
stats showing 15 transmitted and 1 tx error. A short return is then
reserved for the one case the application is entitled to retry: ring
full.

Concretely for the simple path: walk the burst, and for a packet with
data_off < hdr_len free it and bump the error counter rather than
breaking; compact the survivors (as zxdh_recv_single_pkts() already
does on Rx) and submit the compacted run. Reserve the short return for
the vq_free_cnt limit.

Better still, avoid dropping the packet at all: route a short-headroom
mbuf through the append path the packed burst already uses
(zxdh_xmit_enqueue_append copies the header into the reserved txr
region) instead of prepending in place. Then nothing is lost and the
simple path matches the packed path's handling.

^ permalink raw reply

* [PATCH v3 3/3] dma/ae4dma: add data path operations
From: Raghavendra Ningoji @ 2026-06-25 18:47 UTC (permalink / raw)
  To: dev
  Cc: david.marchand, bruce.richardson, fengchengwen, Selwin.Sebastian,
	bhagyada.modali, rjarry, thomas, Raghavendra Ningoji
In-Reply-To: <20260625184728.1678328-1-raghavendra.ningoji@amd.com>

Implement the dmadev fast path for the AMD AE4DMA PMD.

This commit adds:
 - copy enqueue (rte_dma_copy): write an AE4DMA descriptor for a
   memory-to-memory transfer; on RTE_DMA_OP_FLAG_SUBMIT the doorbell
   is rung immediately.
 - submit (rte_dma_submit): advance the per-queue write_idx
   register to expose pending descriptors to the hardware.
 - completion (rte_dma_completed / rte_dma_completed_status):
   completion is detected via the hardware's per-queue read_idx
   register, which the engine advances as it processes descriptors.
   The descriptor status / err_code bytes are read only to classify
   each drained slot as success or failure, and HW error codes are
   translated to the dmadev RTE_DMA_STATUS_* enumeration.
 - burst capacity (rte_dma_burst_capacity): report the number of
   free descriptor slots, taking into account the one slot reserved
   to distinguish full from empty on the power-of-two ring.

The fast path entry points are wired through fp_obj in
ae4dma_dmadev_create(). The fill capability is not advertised;
fp_obj->fill is left zero-initialised.

Signed-off-by: Raghavendra Ningoji <raghavendra.ningoji@amd.com>
---
 doc/guides/dmadevs/ae4dma.rst      |  22 +++
 drivers/dma/ae4dma/ae4dma_dmadev.c | 287 +++++++++++++++++++++++++++++
 2 files changed, 309 insertions(+)

diff --git a/doc/guides/dmadevs/ae4dma.rst b/doc/guides/dmadevs/ae4dma.rst
index a85c1d92ca..37a2096ccf 100644
--- a/doc/guides/dmadevs/ae4dma.rst
+++ b/doc/guides/dmadevs/ae4dma.rst
@@ -51,3 +51,25 @@ On probe the PMD performs the following steps for each PCI function:
   IOVA-contiguous memory, programs the queue base address and ring
   depth into the per-queue registers, and enables the queue.
 * Interrupts are masked; completion is polled by the application.
+
+Usage
+-----
+
+Once a dmadev has been started, copies are submitted with
+``rte_dma_copy()`` and completions are reaped with ``rte_dma_completed()``
+or ``rte_dma_completed_status()``. See the
+:ref:`Enqueue / Dequeue API <dmadev_enqueue_dequeue>` section of the
+dmadev library documentation for details.
+
+Limitations
+-----------
+
+* Only memory-to-memory copies are supported. Fill, scatter-gather and
+  any other operation types are not advertised in
+  ``rte_dma_info::dev_capa``.
+* The maximum number of descriptors per virtual channel is fixed by
+  hardware at 32. The PMD rounds the requested ring size up to a
+  power of two and clamps it to 32.
+* Only a single virtual channel per dmadev is supported; use the 16
+  per-PCI-function dmadevs to obtain channel-level parallelism.
+* Interrupt-driven completion is not supported.
diff --git a/drivers/dma/ae4dma/ae4dma_dmadev.c b/drivers/dma/ae4dma/ae4dma_dmadev.c
index 607f288623..da3ec42233 100644
--- a/drivers/dma/ae4dma/ae4dma_dmadev.c
+++ b/drivers/dma/ae4dma/ae4dma_dmadev.c
@@ -158,6 +158,72 @@ ae4dma_dev_close(struct rte_dma_dev *dev)
 	return 0;
 }
 
+/* trigger h/w to process enqued desc:doorbell - by next_write */
+static inline void
+__submit(struct ae4dma_dmadev *ae4dma)
+{
+	struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q;
+	uint16_t write_idx = cmd_q->next_write;
+	uint16_t nb = cmd_q->qcfg.nb_desc;
+
+	AE4DMA_WRITE_REG(&cmd_q->hwq_regs->write_idx, write_idx);
+	if (nb != 0)
+		cmd_q->stats.submitted += (uint16_t)((cmd_q->next_write - cmd_q->last_write +
+				nb) % nb);
+	cmd_q->last_write = cmd_q->next_write;
+}
+
+static int
+ae4dma_submit(void *dev_private, uint16_t vchan __rte_unused)
+{
+	struct ae4dma_dmadev *ae4dma = dev_private;
+
+	__submit(ae4dma);
+	return 0;
+}
+
+/* Write descriptor for enqueue (copy only). */
+static inline int
+__write_desc_copy(void *dev_private, rte_iova_t src, rte_iova_t dst,
+		uint32_t len, uint64_t flags)
+{
+	struct ae4dma_dmadev *ae4dma = dev_private;
+	struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q;
+	struct ae4dma_desc *dma_desc;
+	uint16_t ret;
+	uint16_t nb = cmd_q->qcfg.nb_desc;
+	uint16_t write = cmd_q->next_write;
+
+	if (nb == 0)
+		return -EINVAL;
+
+	/* Reserve one slot to distinguish full from empty (power-of-two ring). */
+	if ((uint32_t)cmd_q->ring_buff_count >= (uint32_t)(nb - 1))
+		return -ENOSPC;
+
+	dma_desc = &cmd_q->qbase_desc[write];
+	memset(dma_desc, 0, sizeof(*dma_desc));
+	dma_desc->length = len;
+	dma_desc->src_hi = upper_32_bits(src);
+	dma_desc->src_lo = lower_32_bits(src);
+	dma_desc->dst_hi = upper_32_bits(dst);
+	dma_desc->dst_lo = lower_32_bits(dst);
+	cmd_q->ring_buff_count++;
+	cmd_q->next_write = (uint16_t)((write + 1) % nb);
+	ret = write;
+	if (flags & RTE_DMA_OP_FLAG_SUBMIT)
+		__submit(ae4dma);
+	return ret;
+}
+
+/* Enqueue a copy operation onto the ae4dma device. */
+static int
+ae4dma_enqueue_copy(void *dev_private, uint16_t vchan __rte_unused,
+		rte_iova_t src, rte_iova_t dst, uint32_t length, uint64_t flags)
+{
+	return __write_desc_copy(dev_private, src, dst, length, flags);
+}
+
 static int
 ae4dma_dev_dump(const struct rte_dma_dev *dev, FILE *f)
 {
@@ -187,6 +253,220 @@ ae4dma_dev_dump(const struct rte_dma_dev *dev, FILE *f)
 		cmd_q->stats.errors);
 	return 0;
 }
+
+/* Translates AE4DMA ChanERRs to DMA error codes. */
+static inline enum rte_dma_status_code
+__translate_status_ae4dma_to_dma(enum ae4dma_dma_err status)
+{
+	AE4DMA_PMD_DEBUG("ae4dma desc status = %d", status);
+
+	switch (status) {
+	case AE4DMA_DMA_ERR_NO_ERR:
+		return RTE_DMA_STATUS_SUCCESSFUL;
+	case AE4DMA_DMA_ERR_INV_LEN:
+		return RTE_DMA_STATUS_INVALID_LENGTH;
+	case AE4DMA_DMA_ERR_INV_SRC:
+		return RTE_DMA_STATUS_INVALID_SRC_ADDR;
+	case AE4DMA_DMA_ERR_INV_DST:
+		return RTE_DMA_STATUS_INVALID_DST_ADDR;
+	case AE4DMA_DMA_ERR_INV_ALIGN:
+		/* Name matches DPDK public enum spelling. */
+		return RTE_DMA_STATUS_DATA_POISION;
+	case AE4DMA_DMA_ERR_INV_HEADER:
+	case AE4DMA_DMA_ERR_INV_STATUS:
+		return RTE_DMA_STATUS_ERROR_UNKNOWN;
+	default:
+		return RTE_DMA_STATUS_ERROR_UNKNOWN;
+	}
+}
+
+/*
+ * Scan HW queue for completed descriptors (non-blocking).
+ *
+ * The AE4DMA engine signals completion by advancing the per-queue
+ * `read_idx` register; it does not (reliably) write a status value
+ * back into the descriptor. We therefore use the HW `read_idx`
+ * register as the source of truth and only inspect the descriptor's
+ * `dw1.err_code` byte to classify each completion as success or
+ * failure.
+ *
+ * @param cmd_q
+ *   The AE4DMA command queue.
+ * @param max_ops
+ *   Maximum descriptors to process this call.
+ * @param[out] failed_count
+ *   Number of completed descriptors that did not report success.
+ * @return
+ *   Number of descriptors completed (success + failure), <= max_ops.
+ */
+static inline uint16_t
+ae4dma_scan_hwq(struct ae4dma_cmd_queue *cmd_q, uint16_t max_ops,
+		uint16_t *failed_count)
+{
+	volatile struct ae4dma_desc *hw_desc;
+	uint16_t events_count = 0, fails = 0;
+	uint16_t tail;
+	uint16_t nb = cmd_q->qcfg.nb_desc;
+	uint16_t mask;
+	uint16_t hw_read_idx;
+	uint16_t in_flight;
+	uint16_t scan_cap;
+
+	if (nb == 0 || cmd_q->ring_buff_count == 0) {
+		*failed_count = 0;
+		return 0;
+	}
+	mask = nb - 1;
+
+	hw_read_idx = (uint16_t)(AE4DMA_READ_REG(&cmd_q->hwq_regs->read_idx) & mask);
+	tail = cmd_q->next_read;
+
+	/*
+	 * Descriptors completed since our last visit live in the
+	 * half-open ring range [tail, hw_read_idx). If HW hasn't
+	 * moved we have nothing to do.
+	 */
+	in_flight = (uint16_t)((hw_read_idx - tail) & mask);
+	if (in_flight == 0) {
+		*failed_count = 0;
+		return 0;
+	}
+
+	scan_cap = max_ops;
+	if (scan_cap > AE4DMA_DESCRIPTORS_PER_CMDQ)
+		scan_cap = AE4DMA_DESCRIPTORS_PER_CMDQ;
+	if (scan_cap > in_flight)
+		scan_cap = in_flight;
+	if (scan_cap > cmd_q->ring_buff_count)
+		scan_cap = (uint16_t)cmd_q->ring_buff_count;
+
+	while (events_count < scan_cap) {
+		uint8_t hw_status;
+		uint8_t hw_err;
+
+		hw_desc = &cmd_q->qbase_desc[tail];
+		hw_status = hw_desc->dw1.status;
+		hw_err = hw_desc->dw1.err_code;
+
+		/*
+		 * read_idx advancing is the definitive completion
+		 * signal. The per-descriptor status byte is informational
+		 * and may not yet be written when we observe it:
+		 *
+		 *   AE4DMA_DMA_DESC_ERROR (4)
+		 *     Hard failure - err_code names the precise cause.
+		 *   AE4DMA_DMA_DESC_COMPLETED (3) or 0
+		 *     Success.
+		 *   AE4DMA_DMA_DESC_VALIDATED (1) / _PROCESSED (2)
+		 *     Benign race: HW had not finished updating the
+		 *     status byte at the instant we read it. Since
+		 *     read_idx has moved past this slot, treat it as
+		 *     success unless err_code says otherwise.
+		 *
+		 * A non-zero err_code is treated as a failure regardless
+		 * of the observed status value.
+		 */
+		if (hw_status == AE4DMA_DMA_DESC_ERROR ||
+				hw_err != AE4DMA_DMA_ERR_NO_ERR) {
+			fails++;
+			AE4DMA_PMD_WARN("Desc failed: status=%u err=%u",
+					hw_status, hw_err);
+		}
+		cmd_q->status[events_count] = (enum ae4dma_dma_err)hw_err;
+		cmd_q->ring_buff_count--;
+		events_count++;
+		tail = (tail + 1) & mask;
+	}
+
+	cmd_q->stats.completed += events_count;
+	cmd_q->stats.errors += fails;
+	cmd_q->next_read = tail;
+	*failed_count = fails;
+	return events_count;
+}
+
+/* Returns successful operations count and sets error flag if any errors. */
+static uint16_t
+ae4dma_completed(void *dev_private, uint16_t vchan __rte_unused,
+		const uint16_t max_ops, uint16_t *last_idx, bool *has_error)
+{
+	struct ae4dma_dmadev *ae4dma = dev_private;
+	struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q;
+	uint16_t cpl_count, sl_count;
+	uint16_t err_count = 0;
+	uint16_t nb = cmd_q->qcfg.nb_desc;
+
+	*has_error = false;
+
+	cpl_count = ae4dma_scan_hwq(cmd_q, max_ops, &err_count);
+
+	if (cpl_count > max_ops)
+		cpl_count = max_ops;
+
+	if (cpl_count > 0 && last_idx != NULL)
+		*last_idx = (uint16_t)((cmd_q->next_read - 1 + nb) % nb);
+
+	sl_count = cpl_count - err_count;
+	if (err_count)
+		*has_error = true;
+
+	return sl_count;
+}
+
+static uint16_t
+ae4dma_completed_status(void *dev_private, uint16_t vchan __rte_unused,
+		uint16_t max_ops, uint16_t *last_idx,
+		enum rte_dma_status_code *status)
+{
+	struct ae4dma_dmadev *ae4dma = dev_private;
+	struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q;
+	uint16_t cpl_count;
+	uint16_t i;
+	uint16_t err_count = 0;
+	uint16_t nb = cmd_q->qcfg.nb_desc;
+
+	cpl_count = ae4dma_scan_hwq(cmd_q, max_ops, &err_count);
+
+	if (cpl_count > max_ops)
+		cpl_count = max_ops;
+
+	if (cpl_count > 0 && last_idx != NULL)
+		*last_idx = (uint16_t)((cmd_q->next_read - 1 + nb) % nb);
+
+	if (likely(err_count == 0)) {
+		for (i = 0; i < cpl_count; i++)
+			status[i] = RTE_DMA_STATUS_SUCCESSFUL;
+	} else {
+		for (i = 0; i < cpl_count; i++)
+			status[i] = __translate_status_ae4dma_to_dma(cmd_q->status[i]);
+	}
+
+	return cpl_count;
+}
+
+/* Get the remaining capacity of the ring. */
+static uint16_t
+ae4dma_burst_capacity(const void *dev_private, uint16_t vchan __rte_unused)
+{
+	const struct ae4dma_dmadev *ae4dma = dev_private;
+	const struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q;
+	uint16_t nb = cmd_q->qcfg.nb_desc;
+	uint16_t mask;
+	uint16_t read_idx = cmd_q->next_read;
+	uint16_t write_idx = cmd_q->next_write;
+	uint16_t used;
+
+	if (nb < 2 || !rte_is_power_of_2(nb))
+		return 0;
+
+	mask = nb - 1;
+	used = (uint16_t)((write_idx - read_idx) & mask);
+	/* One slot reserved (same rule as enqueue). */
+	if (used >= nb - 1)
+		return 0;
+	return (uint16_t)(nb - 1 - used);
+}
+
 static int
 ae4dma_stats_get(const struct rte_dma_dev *dev, uint16_t vchan __rte_unused,
 		struct rte_dma_stats *rte_stats, uint32_t size)
@@ -342,6 +622,13 @@ ae4dma_dmadev_create(const char *name, struct rte_pci_device *dev, uint8_t qn)
 	dmadev->fp_obj->dev_private = dmadev->data->dev_private;
 	dmadev->dev_ops = &ae4dma_dmadev_ops;
 
+	dmadev->fp_obj->burst_capacity = ae4dma_burst_capacity;
+	dmadev->fp_obj->completed = ae4dma_completed;
+	dmadev->fp_obj->completed_status = ae4dma_completed_status;
+	dmadev->fp_obj->copy = ae4dma_enqueue_copy;
+	dmadev->fp_obj->submit = ae4dma_submit;
+	/* fill capability not advertised: leave fp_obj->fill as zero-initialised. */
+
 	ae4dma = dmadev->data->dev_private;
 
 	if (ae4dma_add_queue(ae4dma, dev, qn, name) != 0)
-- 
2.34.1


^ permalink raw reply related

* [PATCH v3 2/3] dma/ae4dma: add control path operations
From: Raghavendra Ningoji @ 2026-06-25 18:47 UTC (permalink / raw)
  To: dev
  Cc: david.marchand, bruce.richardson, fengchengwen, Selwin.Sebastian,
	bhagyada.modali, rjarry, thomas, Raghavendra Ningoji
In-Reply-To: <20260625184728.1678328-1-raghavendra.ningoji@amd.com>

Implement the dmadev control path for the AMD AE4DMA PMD.

This commit adds:
 - dev_configure / vchan_setup: accept a single virtual channel per
   dmadev and clamp the requested ring size to the hardware maximum
   of 32 descriptors (rounded up to a power of two).
 - dev_start / dev_stop / dev_close: dev_start programs the ring depth
   (max_idx) for the hardware queue; dev_stop and dev_close write the
   per-queue control register to disable the queue, and dev_close also
   releases the descriptor ring memzone.
 - dev_info_get: advertise RTE_DMA_CAPA_MEM_TO_MEM and the fixed
   ring depth.
 - dev_dump: print the queue identifiers, ring layout and software
   completion counters.
 - stats_get / stats_reset: expose submitted / completed / errors
   counters maintained by the driver.
 - vchan_status: report IDLE / ACTIVE based on hardware read_idx vs
   write_idx, and HALTED_ERROR when the queue is not enabled.

The dmadev framework is wired through dev_ops in ae4dma_dmadev_create().

Signed-off-by: Raghavendra Ningoji <raghavendra.ningoji@amd.com>
---
 drivers/dma/ae4dma/ae4dma_dmadev.c | 211 +++++++++++++++++++++++++++++
 1 file changed, 211 insertions(+)

diff --git a/drivers/dma/ae4dma/ae4dma_dmadev.c b/drivers/dma/ae4dma/ae4dma_dmadev.c
index 3d82f86906..607f288623 100644
--- a/drivers/dma/ae4dma/ae4dma_dmadev.c
+++ b/drivers/dma/ae4dma/ae4dma_dmadev.c
@@ -53,6 +53,203 @@ ae4dma_queue_dma_zone_reserve(const char *queue_name,
 			socket_id, RTE_MEMZONE_IOVA_CONTIG, queue_size);
 }
 
+static int
+ae4dma_dev_configure(struct rte_dma_dev *dev __rte_unused,
+		const struct rte_dma_conf *dev_conf,
+		uint32_t conf_sz)
+{
+	if (sizeof(struct rte_dma_conf) != conf_sz)
+		return -EINVAL;
+
+	if (dev_conf->nb_vchans != 1)
+		return -EINVAL;
+
+	return 0;
+}
+
+/* Setup a virtual channel for AE4DMA, only 1 vchan is supported per dmadev. */
+static int
+ae4dma_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan __rte_unused,
+		const struct rte_dma_vchan_conf *qconf, uint32_t qconf_sz)
+{
+	struct ae4dma_dmadev *ae4dma = dev->fp_obj->dev_private;
+	struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q;
+	uint16_t max_desc = qconf->nb_desc;
+
+	if (sizeof(struct rte_dma_vchan_conf) != qconf_sz)
+		return -EINVAL;
+
+	if (max_desc < 2)
+		return -EINVAL;
+
+	if (!rte_is_power_of_2(max_desc))
+		max_desc = rte_align32pow2(max_desc);
+
+	if (max_desc > AE4DMA_DESCRIPTORS_PER_CMDQ) {
+		AE4DMA_PMD_DEBUG("DMA dev %u nb_desc clamped to %u",
+				dev->data->dev_id, AE4DMA_DESCRIPTORS_PER_CMDQ);
+		max_desc = AE4DMA_DESCRIPTORS_PER_CMDQ;
+	}
+
+	cmd_q->qcfg = *qconf;
+	cmd_q->qcfg.nb_desc = max_desc;
+
+	/* Ensure all counters are reset, if reconfiguring/restarting device. */
+	memset(&cmd_q->stats, 0, sizeof(cmd_q->stats));
+	return 0;
+}
+
+static int
+ae4dma_dev_start(struct rte_dma_dev *dev)
+{
+	struct ae4dma_dmadev *ae4dma = dev->fp_obj->dev_private;
+	struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q;
+	uint16_t nb = cmd_q->qcfg.nb_desc;
+
+	if (nb == 0)
+		return -EBUSY;
+
+	/* Program ring depth expected by hardware. */
+	AE4DMA_WRITE_REG(&cmd_q->hwq_regs->max_idx, nb);
+	return 0;
+}
+
+static int
+ae4dma_dev_stop(struct rte_dma_dev *dev)
+{
+	struct ae4dma_dmadev *ae4dma = dev->fp_obj->dev_private;
+	struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q;
+
+	if (cmd_q->hwq_regs != NULL)
+		AE4DMA_WRITE_REG(&cmd_q->hwq_regs->control_reg.control_raw,
+				AE4DMA_CMD_QUEUE_DISABLE);
+	return 0;
+}
+
+static int
+ae4dma_dev_info_get(const struct rte_dma_dev *dev __rte_unused,
+		struct rte_dma_info *info, uint32_t size)
+{
+	if (size < sizeof(*info))
+		return -EINVAL;
+	info->dev_capa = RTE_DMA_CAPA_MEM_TO_MEM;
+	info->max_vchans = 1;
+	info->min_desc = 2;
+	info->max_desc = AE4DMA_DESCRIPTORS_PER_CMDQ;
+	info->nb_vchans = 1;
+	return 0;
+}
+
+static int
+ae4dma_dev_close(struct rte_dma_dev *dev)
+{
+	struct ae4dma_dmadev *ae4dma = dev->fp_obj->dev_private;
+	struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q;
+
+	if (cmd_q->hwq_regs != NULL)
+		AE4DMA_WRITE_REG(&cmd_q->hwq_regs->control_reg.control_raw,
+				AE4DMA_CMD_QUEUE_DISABLE);
+
+	rte_memzone_free(cmd_q->mz);
+	cmd_q->mz = NULL;
+	cmd_q->qbase_desc = NULL;
+	cmd_q->qbase_addr = NULL;
+	cmd_q->qbase_phys_addr = 0;
+	return 0;
+}
+
+static int
+ae4dma_dev_dump(const struct rte_dma_dev *dev, FILE *f)
+{
+	struct ae4dma_dmadev *ae4dma = dev->fp_obj->dev_private;
+	struct ae4dma_cmd_queue *cmd_q;
+	void *ae4dma_mmio_base_addr = (uint8_t *)ae4dma->io_regs;
+
+	cmd_q = &ae4dma->cmd_q;
+	fprintf(f, "cmd_q->id              = %" PRIx64 "\n", cmd_q->id);
+	fprintf(f, "cmd_q->qidx            = %" PRIx64 "\n", cmd_q->qidx);
+	fprintf(f, "cmd_q->qsize           = %" PRIx64 "\n", cmd_q->qsize);
+	fprintf(f, "mmio_base_addr	= %p\n", ae4dma_mmio_base_addr);
+	fprintf(f, "queues per ae4dma engine     = %d\n", AE4DMA_READ_REG_OFFSET(
+				ae4dma_mmio_base_addr, AE4DMA_COMMON_CONFIG_OFFSET));
+	fprintf(f, "== Private Data ==\n");
+	fprintf(f, "  Config: { ring_size: %u }\n", cmd_q->qcfg.nb_desc);
+	fprintf(f, "  Ring virt: %p\tphys: %#" PRIx64 "\n",
+			(void *)cmd_q->qbase_desc,
+			(uint64_t)cmd_q->qbase_phys_addr);
+	fprintf(f, "  Next write: %u\n", cmd_q->next_write);
+	fprintf(f, "  Next read: %u\n", cmd_q->next_read);
+	fprintf(f, "  current queue depth: %u\n", cmd_q->ring_buff_count);
+	fprintf(f, "  }\n");
+	fprintf(f, "  Key Stats { submitted: %" PRIu64 ", comp: %" PRIu64 ", failed: %" PRIu64 " }\n",
+		cmd_q->stats.submitted,
+		cmd_q->stats.completed,
+		cmd_q->stats.errors);
+	return 0;
+}
+static int
+ae4dma_stats_get(const struct rte_dma_dev *dev, uint16_t vchan __rte_unused,
+		struct rte_dma_stats *rte_stats, uint32_t size)
+{
+	const struct ae4dma_dmadev *ae4dma = dev->fp_obj->dev_private;
+	const struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q;
+	const struct rte_dma_stats *stats = &cmd_q->stats;
+
+	if (size < sizeof(*rte_stats))
+		return -EINVAL;
+	if (rte_stats == NULL)
+		return -EINVAL;
+
+	*rte_stats = *stats;
+	return 0;
+}
+
+static int
+ae4dma_stats_reset(struct rte_dma_dev *dev, uint16_t vchan __rte_unused)
+{
+	struct ae4dma_dmadev *ae4dma = dev->fp_obj->dev_private;
+	struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q;
+
+	memset(&cmd_q->stats, 0, sizeof(cmd_q->stats));
+	return 0;
+}
+
+/*
+ * Report channel state to the dmadev framework.
+ *
+ *   RTE_DMA_VCHAN_HALTED_ERROR - HW queue is disabled (never started, or
+ *                                stopped via dev_stop()).
+ *   RTE_DMA_VCHAN_IDLE         - HW has caught up: read_idx == write_idx,
+ *                                no descriptors in flight.
+ *   RTE_DMA_VCHAN_ACTIVE       - HW still has descriptors to process.
+ */
+static int
+ae4dma_vchan_status(const struct rte_dma_dev *dev, uint16_t vchan __rte_unused,
+		enum rte_dma_vchan_status *status)
+{
+	const struct ae4dma_dmadev *ae4dma = dev->fp_obj->dev_private;
+	const struct ae4dma_cmd_queue *cmd_q = &ae4dma->cmd_q;
+	uint32_t ctrl, hw_read, hw_write;
+
+	if (cmd_q->hwq_regs == NULL) {
+		*status = RTE_DMA_VCHAN_HALTED_ERROR;
+		return 0;
+	}
+
+	ctrl = AE4DMA_READ_REG(&cmd_q->hwq_regs->control_reg.control_raw);
+	if ((ctrl & AE4DMA_CMD_QUEUE_ENABLE) == 0) {
+		*status = RTE_DMA_VCHAN_HALTED_ERROR;
+		return 0;
+	}
+
+	hw_read  = AE4DMA_READ_REG(&cmd_q->hwq_regs->read_idx);
+	hw_write = AE4DMA_READ_REG(&cmd_q->hwq_regs->write_idx);
+
+	*status = (hw_read == hw_write) ? RTE_DMA_VCHAN_IDLE
+					: RTE_DMA_VCHAN_ACTIVE;
+	return 0;
+}
+
 static int
 ae4dma_add_queue(struct ae4dma_dmadev *dev, struct rte_pci_device *pci,
 		uint8_t qn, const char *pci_name)
@@ -115,6 +312,19 @@ ae4dma_channel_dev_name(char *out, size_t outlen, const char *pci_name,
 static int
 ae4dma_dmadev_create(const char *name, struct rte_pci_device *dev, uint8_t qn)
 {
+	static const struct rte_dma_dev_ops ae4dma_dmadev_ops = {
+		.dev_close = ae4dma_dev_close,
+		.dev_configure = ae4dma_dev_configure,
+		.dev_dump = ae4dma_dev_dump,
+		.dev_info_get = ae4dma_dev_info_get,
+		.dev_start = ae4dma_dev_start,
+		.dev_stop = ae4dma_dev_stop,
+		.stats_get = ae4dma_stats_get,
+		.stats_reset = ae4dma_stats_reset,
+		.vchan_status = ae4dma_vchan_status,
+		.vchan_setup = ae4dma_vchan_setup,
+	};
+
 	struct rte_dma_dev *dmadev;
 	struct ae4dma_dmadev *ae4dma;
 	char hwq_dev_name[RTE_DEV_NAME_MAX_LEN];
@@ -130,6 +340,7 @@ ae4dma_dmadev_create(const char *name, struct rte_pci_device *dev, uint8_t qn)
 	}
 	dmadev->device = &dev->device;
 	dmadev->fp_obj->dev_private = dmadev->data->dev_private;
+	dmadev->dev_ops = &ae4dma_dmadev_ops;
 
 	ae4dma = dmadev->data->dev_private;
 
-- 
2.34.1


^ permalink raw reply related

* [PATCH v3 1/3] dma/ae4dma: introduce AMD AE4DMA DMA PMD
From: Raghavendra Ningoji @ 2026-06-25 18:47 UTC (permalink / raw)
  To: dev
  Cc: david.marchand, bruce.richardson, fengchengwen, Selwin.Sebastian,
	bhagyada.modali, rjarry, thomas, Raghavendra Ningoji
In-Reply-To: <20260625184728.1678328-1-raghavendra.ningoji@amd.com>

Add the skeleton of a new dmadev poll-mode driver for the AMD AE4DMA
hardware DMA engine, providing only PCI probe/remove and per-queue
hardware initialisation. An AE4DMA engine exposes 16 hardware command
queues, each with a 32-entry descriptor ring; the PMD maps each
hardware channel to its own dmadev with a single virtual channel,
so a PCI function appears as 16 dmadevs named "<pci-bdf>-ch0" ..
"<pci-bdf>-ch15".

This patch only registers the PCI driver, allocates the dmadev
objects, reserves the per-queue descriptor rings and programs the
hardware queue base addresses. Control and data path operations are
added in subsequent patches.

Signed-off-by: Raghavendra Ningoji <raghavendra.ningoji@amd.com>
---
 .mailmap                               |   1 +
 MAINTAINERS                            |   5 +
 doc/guides/dmadevs/ae4dma.rst          |  53 ++++++
 doc/guides/dmadevs/index.rst           |   1 +
 doc/guides/rel_notes/release_26_07.rst |   7 +
 drivers/dma/ae4dma/ae4dma_dmadev.c     | 220 +++++++++++++++++++++++++
 drivers/dma/ae4dma/ae4dma_hw_defs.h    | 154 +++++++++++++++++
 drivers/dma/ae4dma/ae4dma_internal.h   |  97 +++++++++++
 drivers/dma/ae4dma/meson.build         |   7 +
 drivers/dma/meson.build                |   1 +
 usertools/dpdk-devbind.py              |   5 +-
 11 files changed, 550 insertions(+), 1 deletion(-)
 create mode 100644 doc/guides/dmadevs/ae4dma.rst
 create mode 100644 drivers/dma/ae4dma/ae4dma_dmadev.c
 create mode 100644 drivers/dma/ae4dma/ae4dma_hw_defs.h
 create mode 100644 drivers/dma/ae4dma/ae4dma_internal.h
 create mode 100644 drivers/dma/ae4dma/meson.build

diff --git a/.mailmap b/.mailmap
index 89ba6ffccc..71a62564fa 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1329,6 +1329,7 @@ Radu Bulie <radu-andrei.bulie@nxp.com>
 Radu Nicolau <radu.nicolau@intel.com>
 Rafael Ávila de Espíndola <espindola@scylladb.com>
 Rafal Kozik <rk@semihalf.com>
+Raghavendra Ningoji <raghavendra.ningoji@amd.com>
 Ragothaman Jayaraman <rjayaraman@caviumnetworks.com>
 Rahul Bhansali <rbhansali@marvell.com>
 Rahul Gupta <rahul.gupta@broadcom.com>
diff --git a/MAINTAINERS b/MAINTAINERS
index 9143d028bc..2e27af49f4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1361,6 +1361,11 @@ F: doc/guides/compressdevs/features/zsda.ini
 DMAdev Drivers
 --------------
 
+AMD AE4DMA
+M: Bhagyada Modali <bhagyada.modali@amd.com>
+F: drivers/dma/ae4dma/
+F: doc/guides/dmadevs/ae4dma.rst
+
 Intel IDXD - EXPERIMENTAL
 M: Bruce Richardson <bruce.richardson@intel.com>
 M: Kevin Laatz <kevin.laatz@intel.com>
diff --git a/doc/guides/dmadevs/ae4dma.rst b/doc/guides/dmadevs/ae4dma.rst
new file mode 100644
index 0000000000..a85c1d92ca
--- /dev/null
+++ b/doc/guides/dmadevs/ae4dma.rst
@@ -0,0 +1,53 @@
+..  SPDX-License-Identifier: BSD-3-Clause
+    Copyright(c) 2025 Advanced Micro Devices, Inc.
+
+.. include:: <isonum.txt>
+
+AMD AE4DMA DMA Device Driver
+============================
+
+The ``ae4dma`` dmadev driver is a poll-mode driver (PMD) for the
+AMD AE4DMA hardware DMA engine. The engine exposes 16 independent
+hardware command queues, each with a ring of 32 descriptors. The PMD
+maps each hardware command queue to a separate DPDK dmadev with a
+single virtual channel, so a single PCI function appears as 16 dmadevs
+named ``<pci-bdf>-ch0`` through ``<pci-bdf>-ch15``.
+
+The driver supports memory-to-memory copy operations only.
+
+Hardware Requirements
+---------------------
+
+The ``dpdk-devbind.py`` script can be used to list AE4DMA devices on
+the system::
+
+   dpdk-devbind.py --status-dev dma
+
+AE4DMA devices appear with vendor ID ``0x1022`` and device ID
+``0x149b``.
+
+Compilation
+-----------
+
+The driver is built as part of the standard DPDK build on x86 platforms
+using ``meson`` and ``ninja``; no extra configuration is required.
+
+Device Setup
+------------
+
+The AE4DMA device must be bound to a DPDK-compatible kernel module such
+as ``vfio-pci`` before it can be used::
+
+   dpdk-devbind.py -b vfio-pci <pci-bdf>
+
+Initialization
+~~~~~~~~~~~~~~
+
+On probe the PMD performs the following steps for each PCI function:
+
+* Reads BAR0 and programs the common configuration register with the
+  number of hardware queues to enable (16).
+* For each hardware queue it allocates a 32-entry descriptor ring in
+  IOVA-contiguous memory, programs the queue base address and ring
+  depth into the per-queue registers, and enables the queue.
+* Interrupts are masked; completion is polled by the application.
diff --git a/doc/guides/dmadevs/index.rst b/doc/guides/dmadevs/index.rst
index 56beb1733f..97399590f6 100644
--- a/doc/guides/dmadevs/index.rst
+++ b/doc/guides/dmadevs/index.rst
@@ -11,6 +11,7 @@ an application through DMA API.
    :maxdepth: 1
    :numbered:
 
+   ae4dma
    cnxk
    dpaa
    dpaa2
diff --git a/doc/guides/rel_notes/release_26_07.rst b/doc/guides/rel_notes/release_26_07.rst
index f012d47a4b..9a78a7ef62 100644
--- a/doc/guides/rel_notes/release_26_07.rst
+++ b/doc/guides/rel_notes/release_26_07.rst
@@ -63,6 +63,13 @@ New Features
     ``rte_eal_init`` and the application is responsible for probing each device,
   * ``--auto-probing`` enables the initial bus probing, which is the current default behavior.
 
+* **Added AMD AE4DMA DMA PMD.**
+
+  Added a new ``dma/ae4dma`` driver for the AMD AE4DMA hardware DMA engine.
+  Each PCI function exposes 16 hardware command queues; the PMD registers one
+  dmadev per channel with a single virtual channel and supports
+  memory-to-memory copy operations.
+
 
 Removed Items
 -------------
diff --git a/drivers/dma/ae4dma/ae4dma_dmadev.c b/drivers/dma/ae4dma/ae4dma_dmadev.c
new file mode 100644
index 0000000000..3d82f86906
--- /dev/null
+++ b/drivers/dma/ae4dma/ae4dma_dmadev.c
@@ -0,0 +1,220 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <rte_bus_pci.h>
+#include <bus_pci_driver.h>
+#include <rte_dmadev_pmd.h>
+#include <rte_malloc.h>
+
+#include "ae4dma_internal.h"
+
+/*
+ * One dmadev per AE4DMA hardware channel; each dmadev has exactly one
+ * virtual channel. The HW's per-queue register block must be densely
+ * packed right after the engine-common config register at BAR0+0; the
+ * build-time check below catches an accidental layout change.
+ */
+static_assert(sizeof(struct ae4dma_hwq_regs) == 32,
+		"ae4dma_hwq_regs stride changed; per-queue offset math will break");
+
+RTE_LOG_REGISTER_DEFAULT(ae4dma_pmd_logtype, INFO);
+
+#define AE4DMA_PMD_NAME dmadev_ae4dma
+
+static const struct rte_memzone *
+ae4dma_queue_dma_zone_reserve(const char *queue_name,
+		uint32_t queue_size, int socket_id)
+{
+	const struct rte_memzone *mz;
+
+	mz = rte_memzone_lookup(queue_name);
+	if (mz != NULL) {
+		if (((size_t)queue_size <= mz->len) &&
+				((socket_id == SOCKET_ID_ANY) ||
+				 (socket_id == mz->socket_id))) {
+			AE4DMA_PMD_INFO("reuse memzone already "
+					"allocated for %s", queue_name);
+			return mz;
+		}
+		AE4DMA_PMD_ERR("Incompatible memzone already "
+				"allocated %s, size %u, socket %d. "
+				"Requested size %u, socket %u",
+				queue_name, (uint32_t)mz->len,
+				mz->socket_id, queue_size, socket_id);
+		return NULL;
+	}
+	return rte_memzone_reserve_aligned(queue_name, queue_size,
+			socket_id, RTE_MEMZONE_IOVA_CONTIG, queue_size);
+}
+
+static int
+ae4dma_add_queue(struct ae4dma_dmadev *dev, struct rte_pci_device *pci,
+		uint8_t qn, const char *pci_name)
+{
+	uint32_t dma_addr_lo, dma_addr_hi;
+	struct ae4dma_cmd_queue *cmd_q;
+	const struct rte_memzone *q_mz;
+
+	dev->io_regs = pci->mem_resource[AE4DMA_PCIE_BAR].addr;
+
+	cmd_q = &dev->cmd_q;
+	cmd_q->id = qn;
+	cmd_q->qidx = 0;
+	cmd_q->qsize = AE4DMA_QUEUE_SIZE(AE4DMA_QUEUE_DESC_SIZE);
+	cmd_q->hwq_regs = (volatile struct ae4dma_hwq_regs *)dev->io_regs + (qn + 1);
+
+	/*
+	 * Memzone name must be globally unique. Embed PCI BDF so multiple
+	 * PCI functions probed concurrently don't collide.
+	 */
+	snprintf(cmd_q->memz_name, sizeof(cmd_q->memz_name),
+			"ae4dma_%s_q%u", pci_name, (unsigned int)qn);
+
+	q_mz = ae4dma_queue_dma_zone_reserve(cmd_q->memz_name,
+			cmd_q->qsize, rte_socket_id());
+	if (q_mz == NULL) {
+		AE4DMA_PMD_ERR("memzone reserve failed for %s", cmd_q->memz_name);
+		return -ENOMEM;
+	}
+
+	cmd_q->mz = q_mz;
+	cmd_q->qbase_addr = q_mz->addr;
+	cmd_q->qbase_desc = q_mz->addr;
+	cmd_q->qbase_phys_addr = q_mz->iova;
+
+	AE4DMA_WRITE_REG(&cmd_q->hwq_regs->max_idx, AE4DMA_DESCRIPTORS_PER_CMDQ);
+	AE4DMA_WRITE_REG(&cmd_q->hwq_regs->control_reg.control_raw,
+			AE4DMA_CMD_QUEUE_ENABLE);
+	AE4DMA_WRITE_REG(&cmd_q->hwq_regs->intr_status_reg.intr_status_raw,
+			AE4DMA_DISABLE_INTR);
+	cmd_q->next_write = AE4DMA_READ_REG(&cmd_q->hwq_regs->write_idx);
+	cmd_q->next_read = AE4DMA_READ_REG(&cmd_q->hwq_regs->read_idx);
+	cmd_q->ring_buff_count = 0;
+
+	dma_addr_lo = lower_32_bits(cmd_q->qbase_phys_addr);
+	AE4DMA_WRITE_REG(&cmd_q->hwq_regs->qbase_lo, dma_addr_lo);
+	dma_addr_hi = upper_32_bits(cmd_q->qbase_phys_addr);
+	AE4DMA_WRITE_REG(&cmd_q->hwq_regs->qbase_hi, dma_addr_hi);
+
+	return 0;
+}
+
+static void
+ae4dma_channel_dev_name(char *out, size_t outlen, const char *pci_name,
+		unsigned int ch)
+{
+	snprintf(out, outlen, "%s-ch%u", pci_name, ch);
+}
+
+static int
+ae4dma_dmadev_create(const char *name, struct rte_pci_device *dev, uint8_t qn)
+{
+	struct rte_dma_dev *dmadev;
+	struct ae4dma_dmadev *ae4dma;
+	char hwq_dev_name[RTE_DEV_NAME_MAX_LEN];
+
+	memset(hwq_dev_name, 0, sizeof(hwq_dev_name));
+	ae4dma_channel_dev_name(hwq_dev_name, sizeof(hwq_dev_name), name, qn);
+
+	dmadev = rte_dma_pmd_allocate(hwq_dev_name, dev->device.numa_node,
+			sizeof(struct ae4dma_dmadev));
+	if (dmadev == NULL) {
+		AE4DMA_PMD_ERR("Unable to allocate dma device");
+		return -ENOMEM;
+	}
+	dmadev->device = &dev->device;
+	dmadev->fp_obj->dev_private = dmadev->data->dev_private;
+
+	ae4dma = dmadev->data->dev_private;
+
+	if (ae4dma_add_queue(ae4dma, dev, qn, name) != 0)
+		goto init_error;
+	return 0;
+
+init_error:
+	AE4DMA_PMD_ERR("failed");
+	rte_dma_pmd_release(hwq_dev_name);
+	return -ENOMEM;
+}
+
+static int
+ae4dma_dmadev_probe(struct rte_pci_driver *drv __rte_unused,
+		struct rte_pci_device *dev)
+{
+	char name[32];
+	char chname[RTE_DEV_NAME_MAX_LEN];
+	void *mmio_base;
+	uint32_t q_per_eng;
+	int ret = 0;
+	uint8_t i;
+
+	rte_pci_device_name(&dev->addr, name, sizeof(name));
+	AE4DMA_PMD_INFO("Init %s on NUMA node %d", name, dev->device.numa_node);
+
+	mmio_base = dev->mem_resource[AE4DMA_PCIE_BAR].addr;
+	if (mmio_base == NULL) {
+		AE4DMA_PMD_ERR("%s: BAR%d not mapped", name, AE4DMA_PCIE_BAR);
+		return -ENODEV;
+	}
+
+	/* Program the per-engine HW queue count once. */
+	AE4DMA_WRITE_REG_OFFSET(mmio_base, AE4DMA_COMMON_CONFIG_OFFSET,
+			AE4DMA_MAX_HW_QUEUES);
+	q_per_eng = AE4DMA_READ_REG_OFFSET(mmio_base, AE4DMA_COMMON_CONFIG_OFFSET);
+	AE4DMA_PMD_INFO("%s: AE4DMA queues per engine = %u", name, q_per_eng);
+
+	for (i = 0; i < AE4DMA_MAX_HW_QUEUES; i++) {
+		ret = ae4dma_dmadev_create(name, dev, i);
+		if (ret != 0) {
+			AE4DMA_PMD_ERR("%s create dmadev %u failed!", name, i);
+			while (i > 0) {
+				i--;
+				ae4dma_channel_dev_name(chname, sizeof(chname), name, i);
+				rte_dma_pmd_release(chname);
+			}
+			break;
+		}
+	}
+	return ret;
+}
+
+static int
+ae4dma_dmadev_remove(struct rte_pci_device *dev)
+{
+	char name[32];
+	char chname[RTE_DEV_NAME_MAX_LEN];
+	unsigned int i;
+
+	rte_pci_device_name(&dev->addr, name, sizeof(name));
+
+	AE4DMA_PMD_INFO("Closing %s on NUMA node %d",
+			name, dev->device.numa_node);
+
+	for (i = 0; i < AE4DMA_MAX_HW_QUEUES; i++) {
+		ae4dma_channel_dev_name(chname, sizeof(chname), name, i);
+		rte_dma_pmd_release(chname);
+	}
+	return 0;
+}
+
+static const struct rte_pci_id pci_id_ae4dma_map[] = {
+	{ RTE_PCI_DEVICE(AMD_VENDOR_ID, AE4DMA_DEVICE_ID) },
+	{ .vendor_id = 0, /* sentinel */ },
+};
+
+static struct rte_pci_driver ae4dma_pmd_drv = {
+	.id_table = pci_id_ae4dma_map,
+	.drv_flags = RTE_PCI_DRV_NEED_MAPPING,
+	.probe = ae4dma_dmadev_probe,
+	.remove = ae4dma_dmadev_remove,
+};
+
+RTE_PMD_REGISTER_PCI(AE4DMA_PMD_NAME, ae4dma_pmd_drv);
+RTE_PMD_REGISTER_PCI_TABLE(AE4DMA_PMD_NAME, pci_id_ae4dma_map);
+RTE_PMD_REGISTER_KMOD_DEP(AE4DMA_PMD_NAME, "* igb_uio | uio_pci_generic | vfio-pci");
diff --git a/drivers/dma/ae4dma/ae4dma_hw_defs.h b/drivers/dma/ae4dma/ae4dma_hw_defs.h
new file mode 100644
index 0000000000..e7798be09b
--- /dev/null
+++ b/drivers/dma/ae4dma/ae4dma_hw_defs.h
@@ -0,0 +1,154 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ */
+
+#ifndef __AE4DMA_HW_DEFS_H__
+#define __AE4DMA_HW_DEFS_H__
+
+#include <stdint.h>
+
+#include <rte_bus_pci.h>
+#include <rte_byteorder.h>
+#include <rte_io.h>
+#include <rte_pci.h>
+#include <rte_memzone.h>
+
+#define AE4DMA_BIT(nr)			(1UL << (nr))
+
+/* ae4dma device details */
+#define AMD_VENDOR_ID	0x1022
+#define AE4DMA_DEVICE_ID	0x149b
+#define AE4DMA_PCIE_BAR 0
+
+/*
+ * An AE4DMA engine has 16 DMA queues. Each queue supports 32 descriptors.
+ */
+#define AE4DMA_MAX_HW_QUEUES        16
+#define AE4DMA_QUEUE_START_INDEX    0
+#define AE4DMA_CMD_QUEUE_ENABLE		0x1
+#define AE4DMA_CMD_QUEUE_DISABLE	0x0
+
+/* Common to all queues */
+#define AE4DMA_COMMON_CONFIG_OFFSET 0x00
+
+#define AE4DMA_DISABLE_INTR 0x01
+
+/* Descriptor status */
+enum ae4dma_dma_status {
+	AE4DMA_DMA_DESC_SUBMITTED = 0,
+	AE4DMA_DMA_DESC_VALIDATED = 1,
+	AE4DMA_DMA_DESC_PROCESSED = 2,
+	AE4DMA_DMA_DESC_COMPLETED = 3,
+	AE4DMA_DMA_DESC_ERROR = 4,
+};
+
+/* Descriptor error-code */
+enum ae4dma_dma_err {
+	AE4DMA_DMA_ERR_NO_ERR = 0,
+	AE4DMA_DMA_ERR_INV_HEADER = 1,
+	AE4DMA_DMA_ERR_INV_STATUS = 2,
+	AE4DMA_DMA_ERR_INV_LEN = 3,
+	AE4DMA_DMA_ERR_INV_SRC = 4,
+	AE4DMA_DMA_ERR_INV_DST = 5,
+	AE4DMA_DMA_ERR_INV_ALIGN = 6,
+	AE4DMA_DMA_ERR_UNKNOWN = 7,
+};
+
+/* HW Queue status */
+enum ae4dma_hwqueue_status {
+	AE4DMA_HWQUEUE_EMPTY = 0,
+	AE4DMA_HWQUEUE_FULL = 1,
+	AE4DMA_HWQUEUE_NOT_EMPTY = 4,
+};
+/*
+ * descriptor for AE4DMA commands
+ * 8 32-bit words:
+ * word 0: source memory type; destination memory type ; control bits
+ * word 1: desc_id; error code; status
+ * word 2: length
+ * word 3: reserved
+ * word 4: upper 32 bits of source pointer
+ * word 5: low 32 bits of source pointer
+ * word 6: upper 32 bits of destination pointer
+ * word 7: low 32 bits of destination pointer
+ */
+
+/* AE4DMA Descriptor - DWORD0 - Control bits: Reserved for future use */
+#define AE4DMA_DWORD0_STOP_ON_COMPLETION	AE4DMA_BIT(0)
+#define AE4DMA_DWORD0_INTERRUPT_ON_COMPLETION	AE4DMA_BIT(1)
+#define AE4DMA_DWORD0_START_OF_MESSAGE		AE4DMA_BIT(3)
+#define AE4DMA_DWORD0_END_OF_MESSAGE		AE4DMA_BIT(4)
+#define AE4DMA_DWORD0_DESTINATION_MEMORY_TYPE	RTE_GENMASK64(5, 4)
+#define AE4DMA_DWORD0_SOURCE_MEMEORY_TYPE	RTE_GENMASK64(7, 6)
+
+#define AE4DMA_DWORD0_DESTINATION_MEMORY_TYPE_MEMORY    (0x0)
+#define AE4DMA_DWORD0_DESTINATION_MEMORY_TYPE_IOMEMORY  (1<<4)
+#define AE4DMA_DWORD0_SOURCE_MEMEORY_TYPE_MEMORY    (0x0)
+#define AE4DMA_DWORD0_SOURCE_MEMEORY_TYPE_IOMEMORY  (1<<6)
+
+struct ae4dma_desc_dword0 {
+	uint8_t byte0;
+	uint8_t byte1;
+	uint16_t timestamp;
+};
+
+struct ae4dma_desc_dword1 {
+	uint8_t status;
+	uint8_t err_code;
+	uint16_t desc_id;
+};
+
+struct ae4dma_desc {
+	struct ae4dma_desc_dword0 dw0;
+	struct ae4dma_desc_dword1 dw1;
+	uint32_t length;
+	uint32_t reserved;
+	uint32_t src_lo;
+	uint32_t src_hi;
+	uint32_t dst_lo;
+	uint32_t dst_hi;
+};
+
+/*
+ * Per-queue registers; each register is 4 bytes long.
+ * Effective address: offset + reg
+ */
+struct ae4dma_hwq_regs {
+	union {
+		uint32_t control_raw;
+		struct {
+			uint32_t queue_enable: 1;
+			uint32_t reserved_internal: 31;
+		} control;
+	} control_reg;
+
+	union {
+		uint32_t status_raw;
+		struct {
+			uint32_t reserved0: 1;
+			/* 0–empty, 1–full, 2–stopped, 3–error , 4–Not Empty */
+			uint32_t queue_status: 2;
+			uint32_t reserved1: 21;
+			uint32_t interrupt_type: 4;
+			uint32_t reserved2: 4;
+		} status;
+	} status_reg;
+
+	uint32_t max_idx;
+	uint32_t read_idx;
+	uint32_t write_idx;
+
+	union {
+		uint32_t intr_status_raw;
+		struct {
+			uint32_t intr_status: 1;
+			uint32_t reserved: 31;
+		} intr_status;
+	} intr_status_reg;
+
+	uint32_t qbase_lo;
+	uint32_t qbase_hi;
+
+};
+
+#endif /* __AE4DMA_HW_DEFS_H__ */
diff --git a/drivers/dma/ae4dma/ae4dma_internal.h b/drivers/dma/ae4dma/ae4dma_internal.h
new file mode 100644
index 0000000000..7f149c97b5
--- /dev/null
+++ b/drivers/dma/ae4dma/ae4dma_internal.h
@@ -0,0 +1,97 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ */
+
+#ifndef _AE4DMA_INTERNAL_H_
+#define _AE4DMA_INTERNAL_H_
+
+#include <stdint.h>
+
+#include "ae4dma_hw_defs.h"
+
+/* Return bits 32-63 of a 64-bit number. */
+#define upper_32_bits(n) ((uint32_t)(((n) >> 16) >> 16))
+
+/* Return bits 0-31 of a 64-bit number. */
+#define lower_32_bits(n) ((uint32_t)((n) & 0xffffffff))
+
+/* Hardware ring depth (slots per queue); must be power of two. */
+#define AE4DMA_DESCRIPTORS_PER_CMDQ	32
+#define AE4DMA_QUEUE_DESC_SIZE		sizeof(struct ae4dma_desc)
+#define AE4DMA_QUEUE_SIZE(n)		(AE4DMA_DESCRIPTORS_PER_CMDQ * (n))
+
+
+/* AE4DMA registers Write/Read */
+static inline void ae4dma_pci_reg_write(void *base, int offset,
+		uint32_t value)
+{
+	volatile void *reg_addr = ((uint8_t *)base + offset);
+
+	rte_write32((rte_cpu_to_le_32(value)), reg_addr);
+}
+
+static inline uint32_t ae4dma_pci_reg_read(void *base, int offset)
+{
+	volatile void *reg_addr = ((uint8_t *)base + offset);
+
+	return rte_le_to_cpu_32(rte_read32(reg_addr));
+}
+
+#define AE4DMA_READ_REG_OFFSET(hw_addr, reg_offset) \
+	ae4dma_pci_reg_read(hw_addr, reg_offset)
+
+#define AE4DMA_WRITE_REG_OFFSET(hw_addr, reg_offset, value) \
+	ae4dma_pci_reg_write(hw_addr, reg_offset, value)
+
+
+#define AE4DMA_READ_REG(hw_addr) \
+	ae4dma_pci_reg_read((void *)(uintptr_t)(hw_addr), 0)
+
+#define AE4DMA_WRITE_REG(hw_addr, value) \
+	ae4dma_pci_reg_write((void *)(uintptr_t)(hw_addr), 0, value)
+
+/* A structure describing an AE4DMA command queue. */
+struct __rte_cache_aligned ae4dma_cmd_queue {
+	char memz_name[RTE_MEMZONE_NAMESIZE];
+	const struct rte_memzone *mz;
+	volatile struct ae4dma_hwq_regs *hwq_regs;
+
+	struct rte_dma_vchan_conf qcfg;
+	struct rte_dma_stats stats;
+	/* Queue address */
+	struct ae4dma_desc *qbase_desc;
+	void *qbase_addr;
+	rte_iova_t qbase_phys_addr;
+	enum ae4dma_dma_err status[AE4DMA_DESCRIPTORS_PER_CMDQ];
+	/* Queue identifier */
+	uint64_t id;    /* queue id */
+	uint64_t qidx;  /* queue index */
+	uint64_t qsize; /* queue size */
+	uint32_t ring_buff_count;
+	uint16_t next_read;
+	uint16_t next_write;
+	uint16_t last_write; /* Used to compute submitted count. */
+};
+
+/*
+ * One dmadev per AE4DMA hardware channel: probe creates AE4DMA_MAX_HW_QUEUES
+ * dmadevs per PCI function, each owning a single HW command queue.
+ */
+struct ae4dma_dmadev {
+	void *io_regs;
+	struct ae4dma_cmd_queue cmd_q; /* single HW queue owned by this dmadev */
+};
+
+
+extern int ae4dma_pmd_logtype;
+#define RTE_LOGTYPE_AE4DMA_PMD ae4dma_pmd_logtype
+
+#define AE4DMA_PMD_LOG(level, ...) \
+	RTE_LOG_LINE_PREFIX(level, AE4DMA_PMD, "%s(): ", __func__, __VA_ARGS__)
+
+#define AE4DMA_PMD_DEBUG(...)  AE4DMA_PMD_LOG(DEBUG, __VA_ARGS__)
+#define AE4DMA_PMD_INFO(...)   AE4DMA_PMD_LOG(INFO, __VA_ARGS__)
+#define AE4DMA_PMD_ERR(...)    AE4DMA_PMD_LOG(ERR, __VA_ARGS__)
+#define AE4DMA_PMD_WARN(...)   AE4DMA_PMD_LOG(WARNING, __VA_ARGS__)
+
+#endif /* _AE4DMA_INTERNAL_H_ */
diff --git a/drivers/dma/ae4dma/meson.build b/drivers/dma/ae4dma/meson.build
new file mode 100644
index 0000000000..e48ab0d561
--- /dev/null
+++ b/drivers/dma/ae4dma/meson.build
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2024 Advanced Micro Devices, Inc. All rights reserved.
+
+build = dpdk_conf.has('RTE_ARCH_X86')
+reason = 'only supported on x86'
+sources = files('ae4dma_dmadev.c')
+deps += ['bus_pci', 'dmadev']
diff --git a/drivers/dma/meson.build b/drivers/dma/meson.build
index e0d94db967..c230ac5a06 100644
--- a/drivers/dma/meson.build
+++ b/drivers/dma/meson.build
@@ -2,6 +2,7 @@
 # Copyright 2021 HiSilicon Limited
 
 drivers = [
+        'ae4dma',
         'cnxk',
         'dpaa',
         'dpaa2',
diff --git a/usertools/dpdk-devbind.py b/usertools/dpdk-devbind.py
index 93f2383dff..7d09f155de 100755
--- a/usertools/dpdk-devbind.py
+++ b/usertools/dpdk-devbind.py
@@ -86,6 +86,9 @@
 cn9k_ree = {'Class': '08', 'Vendor': '177d', 'Device': 'a0f4',
             'SVendor': None, 'SDevice': None}
 
+amd_ae4dma = {'Class': '08', 'Vendor': '1022', 'Device': '149b',
+              'SVendor': None, 'SDevice': None}
+
 virtio_blk = {'Class': '01', 'Vendor': "1af4", 'Device': '1001,1042',
               'SVendor': None, 'SDevice': None}
 
@@ -95,7 +98,7 @@
 network_devices = [network_class, cavium_pkx, avp_vnic, ifpga_class]
 baseband_devices = [acceleration_class]
 crypto_devices = [encryption_class, intel_processor_class]
-dma_devices = [cnxk_dma, hisilicon_dma,
+dma_devices = [amd_ae4dma, cnxk_dma, hisilicon_dma,
                intel_idxd_gnrd, intel_idxd_dmr, intel_idxd_spr,
                intel_ioat_bdw, intel_ioat_icx, intel_ioat_skx,
                odm_dma]
-- 
2.34.1


^ permalink raw reply related

* [PATCH v3 0/3] dma/ae4dma: add AMD AE4DMA DMA PMD
From: Raghavendra Ningoji @ 2026-06-25 18:47 UTC (permalink / raw)
  To: dev
  Cc: david.marchand, bruce.richardson, fengchengwen, Selwin.Sebastian,
	bhagyada.modali, rjarry, thomas, Raghavendra Ningoji
In-Reply-To: <20260525184244.1758825-1-raghavendra.ningoji@amd.com>

This series adds a new dmadev poll-mode driver for the AMD AE4DMA
hardware DMA engine. An AE4DMA engine exposes 16 hardware command
queues, each with a 32-entry descriptor ring; the PMD maps each
hardware channel to its own dmadev with a single virtual channel,
so a PCI function appears as 16 dmadevs named "<pci-bdf>-ch0" ..
"<pci-bdf>-ch15".

Driver characteristics:

 - Memory-to-memory copy operations only (RTE_DMA_CAPA_MEM_TO_MEM).
 - Completion is detected via the hardware's per-queue read_idx
   register, which the engine advances as it processes descriptors.
   The descriptor status / err_code bytes are read only to classify
   each drained slot as success or failure.
 - vchan_status reports IDLE/ACTIVE based on HW read_idx vs write_idx
   and HALTED_ERROR when the queue is not enabled.
 - Depends on bus_pci and dmadev.

The driver is introduced in three logical patches, following the
pattern of the recent hisi_acc dmadev driver:

  1/3 - introduce driver (probe, remove, per-queue HW init)
  2/3 - add control path operations (dev_ops)
  3/3 - add data path operations (copy, submit, completion)

Changes in v3:
 - Address review comments from David Marchand on patch 1/3 and 2/3:
     * Track the descriptor-ring memzone in the queue structure and
       free it directly in dev_close() instead of re-resolving it by
       name (also fixes the potential leak noted on probe/unplug).
     * Drop the unused back-references (ae4dma->dmadev, ae4dma->pci);
       pass the rte_pci_device to ae4dma_add_queue() instead.
     * Stop setting dev->device.driver in probe(); EAL sets it on
       probe() success since commit f282771a04ef.
     * Remove the redundant NULL name check in the single-caller
       helper, the needless NULL initialisers and the __func__ in the
       error log (already added by the log macro), and the NULL test
       before rte_memzone_free().
     * Remove the info->dev_name assignment (set by rte_dma_info_get()).
     * Replace the unsigned-long low32_value()/high32_value() helpers
       with lower_32_bits()/upper_32_bits() and drop the redundant
       index casts.
     * ae4dma_hw_defs.h: include <stdint.h>, drop the C++ guards and
       add the missing trailing enum comma; ae4dma_internal.h: convert
       doxygen comments to plain comments.
     * Remove trivial "what" comments throughout.
 - Reorder the .mailmap entry into alphabetical position.
 - Naming/architecture (16 dmadevs per PCI function, "<bdf>-chX")
   acknowledged as acceptable by Chengwen Feng and Bruce Richardson;
   kept unchanged.

Changes in v2:
 - Split the monolithic v1 patch into three logical patches
   (introduce / control path / data path), mirroring the
   structure used by drivers/dma/hisi_acc.
 - Fix checkpatches.sh warnings in ae4dma_internal.h (RTE_LOG_LINE_PREFIX,
   C99 __VA_ARGS__, __rte_cache_aligned placement).

v1: https://patches.dpdk.org/project/dpdk/patch/20260518181856.1228373-1-raghavendra.ningoji@amd.com/
v2: https://patches.dpdk.org/project/dpdk/patch/20260525184244.1758825-1-raghavendra.ningoji@amd.com/

Raghavendra Ningoji (3):
  dma/ae4dma: introduce AMD AE4DMA DMA PMD
  dma/ae4dma: add control path operations
  dma/ae4dma: add data path operations

 .mailmap                               |   1 +
 MAINTAINERS                            |   5 +
 doc/guides/dmadevs/ae4dma.rst          |  75 +++
 doc/guides/dmadevs/index.rst           |   1 +
 doc/guides/rel_notes/release_26_07.rst |   7 +
 drivers/dma/ae4dma/ae4dma_dmadev.c     | 718 +++++++++++++++++++++++++
 drivers/dma/ae4dma/ae4dma_hw_defs.h    | 154 ++++++
 drivers/dma/ae4dma/ae4dma_internal.h   |  97 ++++
 drivers/dma/ae4dma/meson.build         |   7 +
 drivers/dma/meson.build                |   1 +
 usertools/dpdk-devbind.py              |   5 +-
 11 files changed, 1070 insertions(+), 1 deletion(-)
 create mode 100644 doc/guides/dmadevs/ae4dma.rst
 create mode 100644 drivers/dma/ae4dma/ae4dma_dmadev.c
 create mode 100644 drivers/dma/ae4dma/ae4dma_hw_defs.h
 create mode 100644 drivers/dma/ae4dma/ae4dma_internal.h
 create mode 100644 drivers/dma/ae4dma/meson.build


base-commit: f724d1c0d1c1636b9c171c34db3f17c3defaa2f3
-- 
2.34.1


^ permalink raw reply

* Re: [PATCH v2 1/3] dma/ae4dma: introduce AMD AE4DMA DMA PMD
From: Raghavendra Ningoji @ 2026-06-25 18:43 UTC (permalink / raw)
  To: David Marchand
  Cc: dev, Thomas Monjalon, Bhagyada Modali, Robin Jarry,
	Selwin.Sebastian, Bruce Richardson
In-Reply-To: <CAJFAV8xPs4KiHJ5koucQyfEUk0S77zGQ1jM3LxtQvT2qxyX=nw@mail.gmail.com>

On Mon, 22 Jun 2026 at 14:26, David Marchand <david.marchand@redhat.com> wrote:
>
> > +Raghavendra Ningoji <raghavendra.ningoji@amd.com>
>
> Almost missed this.
> Alphabetical order please.

Fixed in v3 using devtools/mailmap-ctl.py (thanks Bruce for the
pointer); the entry now sits between "Rafal Kozik" and "Ragothaman
Jayaraman".

Thanks,
Raghavendra

^ permalink raw reply

* Re: [PATCH v2 2/3] dma/ae4dma: add control path operations
From: Raghavendra Ningoji @ 2026-06-25 18:42 UTC (permalink / raw)
  To: David Marchand
  Cc: dev, Thomas Monjalon, Bhagyada Modali, Robin Jarry,
	Selwin.Sebastian
In-Reply-To: <CAJFAV8w6YVzh6QEThWAq50SccApMtAzC=okPV7Hph5EvD7cYgA@mail.gmail.com>

On Mon, 22 Jun 2026 at 14:15, David Marchand <david.marchand@redhat.com> wrote:
>
> > +       info->dev_name = dev->device->name;
>
> The dmadev library sets this field in rte_dma_info_get().
> Please remove.

Removed.

> > +               const struct rte_memzone *mz = rte_memzone_lookup(cmd_q->memz_name);
>
> Rather than resolve again, can't you store the reference to the
> memzone in the priv pointer at probe time?

Done. The memzone reference is stored in cmd_q->mz at probe time (in
patch 1/3) and dev_close() now frees cmd_q->mz directly without a
lookup.

> > +               if (mz != NULL)
> > +                       rte_memzone_free(mz);
>
> No need to test for NULL.

Removed; rte_memzone_free(cmd_q->mz) is called unconditionally.

Sent as v3.

Thanks,
Raghavendra

^ permalink raw reply

* Re: [PATCH v2 1/3] dma/ae4dma: introduce AMD AE4DMA DMA PMD
From: Raghavendra Ningoji @ 2026-06-25 18:41 UTC (permalink / raw)
  To: David Marchand
  Cc: dev, Thomas Monjalon, Bhagyada Modali, Robin Jarry,
	Selwin.Sebastian, Chengwen Feng, Bruce Richardson
In-Reply-To: <CAJFAV8w_67sp9iGW9+Gpwxx0ZkDYc4Zc2JKDtsPFFccU0UHePg@mail.gmail.com>

On Mon, 22 Jun 2026 at 14:06, David Marchand <david.marchand@redhat.com> wrote:
>
> Here is a superficial review.
>
> Many places are fishy when it comes to integer/pointer casts: I only
> raised a few comments on this topic.

Thanks for the review. I went through the cast usage as well; the
low32_value()/high32_value() helpers (which took an unsigned long and
were therefore broken on LLP64) are gone in v3, replaced by
lower_32_bits()/upper_32_bits() on the rte_iova_t value, and the
redundant index casts are removed. Replies inline.

> > +       q_mz = ae4dma_queue_dma_zone_reserve(cmd_q->memz_name,
> > +                       cmd_q->qsize, rte_socket_id());
>
> I see no tracking of q_mz, so I suspect this memzone is leaked on
> device probing failure, and/or unplugging.

The memzone is now stored in cmd_q->mz at probe time and freed directly
in dev_close(). dev_close() is reached on the unplug path too
(remove() -> rte_dma_pmd_release() -> rte_dma_close()), so the ring is
no longer leaked.

> > +       cmd_q->next_write = (uint16_t)AE4DMA_READ_REG(...);
>
> Strange that you need to cast.

Removed; next_read/next_write/last_write are uint16_t and the registers
are read into them without an explicit cast in v3.

> > +/* Create a dmadev(dpdk DMA device) */
>
> This is a general comment for the patch: let's avoid Lapalissade /
> trivial comments that adds nothing.

Removed the trivial "what" comments across the series.

> > +       struct rte_dma_dev *dmadev = NULL;
> > +       struct ae4dma_dmadev *ae4dma = NULL;
>
> Those variables do not need any explicit setting to NULL [...]

Done.

> > +       if (!name) {
>
> [...] This is a static helper called internally from a single
> location, remove the check.

Removed.

> > +       ae4dma->dmadev = dmadev;
>
> [...] this field is never used in the series. Please remove.

Removed the field and the assignment.

> > +       ae4dma->pci = dev;
>
> [...] no user of this field in the series, please remove.

Removed. ae4dma_add_queue() now takes the rte_pci_device pointer as an
argument instead.

> > +init_error:
> > +       AE4DMA_PMD_ERR("driver %s(): failed", __func__);
>
> __func__ is already part of AE4DMA_PMD_LOG.

Dropped __func__ from the message.

> > +       dev->device.driver = &drv->driver;
>
> Setting the driver pointer in the device object is not the driver
> responsibility anymore [...]. EAL will set this field on probe()
> success.

Removed; the drv argument is now __rte_unused.

> > +#ifndef __AE4DMA_HW_DEFS_H__
>
> Is this header autosufficient ? I see references to uint32_t below,
> so this header probably depends on stdint.h.

Added #include <stdint.h>.

> > +#ifdef __cplusplus
> > +extern "C" {
>
> Do we really need C++ guards?

Removed (internal header).

> > +       AE4DMA_HWQUEUE_NOT_EMPTY = 4
>
> For consistency with other enums, add a comma.

Done.

> > +/**
>
> This is an internal header, we don't need doxygen style comments,
> simple comments are enough.

Converted the doxygen comments to plain comments.

Sent as v3.

Thanks,
Raghavendra

^ permalink raw reply

* [PATCH v4 2/2] dts: add build arguments to test run configuration
From: Koushik Bhargav Nimoji @ 2026-06-25 18:15 UTC (permalink / raw)
  To: luca.vizzarro, patrickrobb1997
  Cc: dev, abailey, ahassick, lylavoie, Koushik Bhargav Nimoji
In-Reply-To: <20260625181557.2331771-1-knimoji@iol.unh.edu>

This patch adds the ability to specify build arguments when building DPDK
through DTS. Doing so allows users to build DPDK with the desired build
arguments, which allows for a more configurable DTS run.

Signed-off-by: Koushik Bhargav Nimoji <knimoji@iol.unh.edu>
---
 dts/configurations/test_run.example.yaml | 13 +++++++++++++
 dts/framework/config/test_run.py         |  2 ++
 dts/framework/remote_session/dpdk.py     | 12 ++++++++----
 dts/framework/utils.py                   | 21 ++++++++++++++++++++-
 4 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/dts/configurations/test_run.example.yaml b/dts/configurations/test_run.example.yaml
index ee641f5dce..0bd5151801 100644
--- a/dts/configurations/test_run.example.yaml
+++ b/dts/configurations/test_run.example.yaml
@@ -16,6 +16,8 @@
 #       `precompiled_build_dir` or `build_options` can be defined, but not both.
 #   `compiler_wrapper`:
 #       Optional, adds a compiler wrapper if present.
+#   `build_args`:
+#       The additional build arguments to be used when building DPDK.
 #   `func_traffic_generator` & `perf_traffic_generator`:
 #       Define `func_traffic_generator` when `func` set to true.
 #       Define `perf_traffic_generator` when `perf` set to true.
@@ -40,6 +42,17 @@ dpdk:
       # the combination of the following two makes CC="ccache gcc"
       compiler: gcc
       compiler_wrapper: ccache # see `Optional Fields`
+      # arguments to be used when building DPDK
+      # build_args:
+      #   c_args:
+      #     - O3
+      #     - g
+      #   b_coverage:
+      #     - "true"
+      #   buildtype:
+      #     - release
+      #   flags:
+      #     - strip
 func_traffic_generator:
   type: SCAPY
 # perf_traffic_generator:
diff --git a/dts/framework/config/test_run.py b/dts/framework/config/test_run.py
index 76e24d1785..eab12041fc 100644
--- a/dts/framework/config/test_run.py
+++ b/dts/framework/config/test_run.py
@@ -191,6 +191,8 @@ class DPDKBuildOptionsConfiguration(FrozenModel):
     #: This string will be put in front of the compiler when executing the build. Useful for adding
     #: wrapper commands, such as ``ccache``.
     compiler_wrapper: str = ""
+    #: The build arguments to build dpdk with
+    build_args: dict[str, list[str]] = {}
 
 
 class DPDKUncompiledBuildConfiguration(BaseDPDKBuildConfiguration):
diff --git a/dts/framework/remote_session/dpdk.py b/dts/framework/remote_session/dpdk.py
index 1e6d8a01b5..b8485b9c3c 100644
--- a/dts/framework/remote_session/dpdk.py
+++ b/dts/framework/remote_session/dpdk.py
@@ -108,8 +108,8 @@ def setup(self) -> None:
                         "Cannot create code coverage report using a precompiled build directory."
                     )
                 self._set_remote_dpdk_build_dir(build_dir)
-            case DPDKUncompiledBuildConfiguration(build_options=build_options):
-                self._configure_dpdk_build(build_options)
+            case DPDKUncompiledBuildConfiguration():
+                self._configure_dpdk_build(self.config.build_options)
                 self._build_dpdk()
 
     def teardown(self) -> None:
@@ -285,16 +285,20 @@ def _build_dpdk(self) -> None:
         `remote_dpdk_tree_path` has already been set on the SUT node.
         """
         ctx = get_ctx()
+        build_options = getattr(self.config, "build_options")
         # If the SUT is an ice driver device, make sure to build with 16B descriptors.
         if (
             ctx.topology.sut_port_ingress
             and ctx.topology.sut_port_ingress.config.os_driver == "ice"
         ):
             meson_args = MesonArgs(
-                default_library="static", libdir="lib", c_args="-DRTE_NET_INTEL_USE_16BYTE_DESC"
+                build_options.build_args,
+                default_library="static",
+                libdir="lib",
+                c_args="-DRTE_NET_INTEL_USE_16BYTE_DESC",
             )
         else:
-            meson_args = MesonArgs(default_library="static", libdir="lib")
+            meson_args = MesonArgs(build_options.build_args, default_library="static", libdir="lib")
 
         if SETTINGS.code_coverage:
             meson_args._add_arg("-Db_coverage=true")
diff --git a/dts/framework/utils.py b/dts/framework/utils.py
index fb6b95271e..be528bb4da 100644
--- a/dts/framework/utils.py
+++ b/dts/framework/utils.py
@@ -99,10 +99,16 @@ class MesonArgs:
 
     _default_library: str
 
-    def __init__(self, default_library: str | None = None, **dpdk_args: str | bool):
+    def __init__(
+        self,
+        dpdk_build_args: dict[str, list[str]],
+        default_library: str | None = None,
+        **dpdk_args: str | bool,
+    ):
         """Initialize the meson arguments.
 
         Args:
+            dpdk_build_args: The DPDK build arguments specified in the test run configuration file.
             default_library: The default library type, Meson supports ``shared``, ``static`` and
                 ``both``. Defaults to :data:`None`, in which case the argument won't be used.
             dpdk_args: The arguments found in ``meson_options.txt`` in root DPDK directory.
@@ -121,6 +127,19 @@ def __init__(self, default_library: str | None = None, **dpdk_args: str | bool):
             )
         )
 
+        arguments = []
+        for option, value in dpdk_build_args.items():
+            if option == "c_args":
+                values = " ".join(f"-{val}" for val in value)
+                arguments.append(f'-D{option}="{values}"')
+            elif option == "flags":
+                values = " ".join(f"--{val}" for val in value)
+                arguments.append(values)
+            else:
+                arguments.append(f" -D{option}={value[0]}")
+
+        self._dpdk_args = " ".join(arguments)
+
     def __str__(self) -> str:
         """The actual args."""
         return " ".join(f"{self._default_library} {self._dpdk_args}".split())
-- 
2.54.0


^ permalink raw reply related

* [PATCH v4 1/2] dts: add code coverage reporting to DTS
From: Koushik Bhargav Nimoji @ 2026-06-25 18:15 UTC (permalink / raw)
  To: luca.vizzarro, patrickrobb1997
  Cc: dev, abailey, ahassick, lylavoie, Koushik Bhargav Nimoji
In-Reply-To: <20260522154637.952588-1-knimoji@iol.unh.edu>

Previously, DTS had no code coverage. This patch adds a command line
argument in order to build DPDK with code coverage enabled. This allows
users to create and view code coverage reports of what code and functions
were called during a DTS run.

Signed-off-by: Koushik Bhargav Nimoji <knimoji@iol.unh.edu>
---
v2:
    *Fixed error in lcov/gcov tool detection
v3:
    *Fixed type hints and error message typos
v4:
    *Fixed documentation and docstring comments
    *Added a check to make sure code coverage is
     not enabled on a DTS run with a precompiled
     build directory    
---
 .mailmap                                      |  1 +
 doc/guides/tools/dts.rst                      | 18 +++++++++++++
 dts/README.md                                 |  5 ++++
 dts/framework/remote_session/dpdk.py          | 27 +++++++++++++++++++
 .../remote_session/remote_session.py          |  5 +++-
 dts/framework/settings.py                     | 10 +++++++
 dts/framework/testbed_model/os_session.py     | 10 +++++++
 dts/framework/testbed_model/posix_session.py  | 22 +++++++++++++++
 dts/framework/utils.py                        |  8 ++++++
 9 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/.mailmap b/.mailmap
index e052b85213..a1209150ad 100644
--- a/.mailmap
+++ b/.mailmap
@@ -877,6 +877,7 @@ Klaus Degner <kd@allegro-packets.com>
 Kommula Shiva Shankar <kshankar@marvell.com>
 Konstantin Ananyev <konstantin.ananyev@huawei.com> <konstantin.v.ananyev@yandex.ru>
 Konstantin Ananyev <konstantin.ananyev@huawei.com> <konstantin.ananyev@intel.com>
+Koushik Bhargav Nimoji <knimoji@iol.unh.edu>
 Krishna Murthy <krishna.j.murthy@intel.com>
 Krzysztof Galazka <krzysztof.galazka@intel.com>
 Krzysztof Kanas <kkanas@marvell.com> <krzysztof.kanas@caviumnetworks.com>
diff --git a/doc/guides/tools/dts.rst b/doc/guides/tools/dts.rst
index 5b9a348016..0ffebdc713 100644
--- a/doc/guides/tools/dts.rst
+++ b/doc/guides/tools/dts.rst
@@ -352,6 +352,10 @@ DTS is run with ``main.py`` located in the ``dts`` directory using the ``poetry
      --precompiled-build-dir DIR_NAME
                            [DTS_PRECOMPILED_BUILD_DIR] Define the subdirectory under the DPDK tree root directory or tarball where the pre-
                            compiled binaries are located. (default: None)
+     --code-coverage       Builds DPDK on the SUT node with code coverage enabled. Generates a code coverage report which can be found on
+                           the DTS execution host's local filesystem at dts/output/coverage_reports/meson-logs/coveragereport/index.html,
+                           or the specified output directory. To use code coverage, please ensure lcov v1.15 and gcov v8.0 or higher
+                           (included in gcc package) are installed on the SUT node.
 
 
 The brackets contain the names of environment variables that set the same thing.
@@ -367,6 +371,20 @@ Results are stored in the output dir by default
 which be changed with the ``--output-dir`` command line argument.
 The results contain basic statistics of passed/failed test cases and DPDK version.
 
+Code Coverage
+~~~~~~~~~~~~~
+
+DTS has the ability to track code usage during test runs, and generate an HTML
+coverage report which shows the coverage percentage for the various DPDK
+libraries and drivers utilized during execution. The DPDK build directory must
+be compiled on the SUT node, as a pre-built build directory may not be properly
+configured for code coverage. Code coverage can be enabled by using the
+"--code-coverage" CLI parameter when running DTS.
+
+To use code coverage, please make sure the following dependencies are available
+on the SUT node:
+- lcov v1.15 or greater
+- gcov v8.0 or greater (included in gcc package)
 
 Contributing to DTS
 -------------------
diff --git a/dts/README.md b/dts/README.md
index d257b7a167..51f824e077 100644
--- a/dts/README.md
+++ b/dts/README.md
@@ -64,6 +64,11 @@ $ poetry run ./main.py
 These commands will give you a bash shell inside a docker container
 with all DTS Python dependencies installed.
 
+# Code Coverage
+
+To generate code coverage reports, ensure the SUT has lcov v1.15 and gcov v8.0 or greater
+installed, and that DTS is run using the '--code-coverage' argument.
+
 ## Visual Studio Code
 
 Usage of VScode devcontainers is NOT required for developing on DTS and running DTS,
diff --git a/dts/framework/remote_session/dpdk.py b/dts/framework/remote_session/dpdk.py
index c3575cfcaf..1e6d8a01b5 100644
--- a/dts/framework/remote_session/dpdk.py
+++ b/dts/framework/remote_session/dpdk.py
@@ -29,6 +29,7 @@
 from framework.logger import DTSLogger, get_dts_logger
 from framework.params.eal import EalParams
 from framework.remote_session.remote_session import CommandResult
+from framework.settings import SETTINGS
 from framework.testbed_model.cpu import LogicalCore, LogicalCoreCount, LogicalCoreList, lcore_filter
 from framework.testbed_model.node import Node
 from framework.testbed_model.os_session import OSSession
@@ -80,6 +81,10 @@ def setup(self) -> None:
         DPDK setup includes setting all internals needed for the build, the copying of DPDK
         sources and then building DPDK or using the exist ones from the `dpdk_location`. The drivers
         are bound to those that DPDK needs.
+
+        Raises:
+            ConfigurationError: When DTS is run with code coverage enabled, but is also provided
+            a precompiled build directory.
         """
         if not isinstance(self.config.dpdk_location, RemoteDPDKTreeLocation):
             self._node.main_session.create_directory(self.remote_dpdk_tree_path)
@@ -98,6 +103,10 @@ def setup(self) -> None:
 
         match self.config:
             case DPDKPrecompiledBuildConfiguration(precompiled_build_dir=build_dir):
+                if SETTINGS.code_coverage:
+                    raise ConfigurationError(
+                        "Cannot create code coverage report using a precompiled build directory."
+                    )
                 self._set_remote_dpdk_build_dir(build_dir)
             case DPDKUncompiledBuildConfiguration(build_options=build_options):
                 self._configure_dpdk_build(build_options)
@@ -107,7 +116,22 @@ def teardown(self) -> None:
         """Teardown the DPDK build on the target node.
 
         Removes the DPDK tree and/or build directory/tarball depending on the configuration.
+        If code coverage is enabled, the coverage report and .info file are generated and
+        copied onto the local filesystem before teardown.
         """
+        if SETTINGS.code_coverage:
+            report_folder = PurePath(self.remote_dpdk_build_dir / "meson-logs")
+            output_dir = SETTINGS.output_dir
+            Path(output_dir).mkdir(parents=True, exist_ok=True)
+
+            coverage_status = self._session.generate_coverage_report(self.remote_dpdk_build_dir)
+            if coverage_status:
+                self._session.copy_dir_from(report_folder, output_dir)
+                self._logger.info(
+                    "Coverage HTML report generated, "
+                    f"available at {output_dir}/meson-logs/coveragereports/index.html"
+                )
+
         match self.config.dpdk_location:
             case LocalDPDKTreeLocation():
                 self._node.main_session.remove_remote_dir(self.remote_dpdk_tree_path)
@@ -272,6 +296,9 @@ def _build_dpdk(self) -> None:
         else:
             meson_args = MesonArgs(default_library="static", libdir="lib")
 
+        if SETTINGS.code_coverage:
+            meson_args._add_arg("-Db_coverage=true")
+
         self._session.build_dpdk(
             self._env_vars,
             meson_args,
diff --git a/dts/framework/remote_session/remote_session.py b/dts/framework/remote_session/remote_session.py
index 158325bb7f..d2440dc2d8 100644
--- a/dts/framework/remote_session/remote_session.py
+++ b/dts/framework/remote_session/remote_session.py
@@ -252,7 +252,10 @@ def copy_from(self, source_file: str | PurePath, destination_dir: str | Path) ->
             destination_dir: The directory path on the local filesystem where the `source_file`
                 will be saved.
         """
-        self.session.get(str(source_file), str(destination_dir))
+        source_file = PurePath(source_file)
+        destination_dir = Path(destination_dir)
+        local_path = destination_dir / source_file.name
+        self.session.get(str(source_file), str(local_path))
 
     def copy_to(self, source_file: str | Path, destination_dir: str | PurePath) -> None:
         """Copy a file from local filesystem to the remote Node.
diff --git a/dts/framework/settings.py b/dts/framework/settings.py
index b08373b7ea..7df535bd84 100644
--- a/dts/framework/settings.py
+++ b/dts/framework/settings.py
@@ -159,6 +159,8 @@ class Settings:
     re_run: int = 0
     #:
     random_seed: int | None = None
+    #:
+    code_coverage: bool = False
 
 
 SETTINGS: Settings = Settings()
@@ -489,6 +491,14 @@ def _get_parser() -> _DTSArgumentParser:
     )
     _add_env_var_to_action(action)
 
+    action = parser.add_argument(
+        "--code-coverage",
+        action="store_true",
+        default=False,
+        help="Used to build DPDK with code coverage enabled.",
+    )
+    _add_env_var_to_action(action)
+
     return parser
 
 
diff --git a/dts/framework/testbed_model/os_session.py b/dts/framework/testbed_model/os_session.py
index f2dc9b20a9..c2874051a7 100644
--- a/dts/framework/testbed_model/os_session.py
+++ b/dts/framework/testbed_model/os_session.py
@@ -480,6 +480,16 @@ def build_dpdk(
             timeout: Wait at most this long in seconds for the build execution to complete.
         """
 
+    @abstractmethod
+    def generate_coverage_report(self, remote_build_dir: PurePath | None) -> bool:
+        """Generates a code coverage report for a DTS run.
+
+        Args:
+            remote_build_dir: The remote DPDK build directory
+        Returns:
+            Whether the coverage report was able to be created or not.
+        """
+
     @abstractmethod
     def get_dpdk_version(self, version_path: str | PurePath) -> str:
         """Inspect the DPDK version on the remote node.
diff --git a/dts/framework/testbed_model/posix_session.py b/dts/framework/testbed_model/posix_session.py
index dec952685a..d18ce27de2 100644
--- a/dts/framework/testbed_model/posix_session.py
+++ b/dts/framework/testbed_model/posix_session.py
@@ -295,6 +295,28 @@ def build_dpdk(
         except RemoteCommandExecutionError as e:
             raise DPDKBuildError(f"DPDK build failed when doing '{e.command}'.")
 
+    def generate_coverage_report(self, remote_build_dir: PurePath | None) -> bool:
+        """Overrides :meth:`~.os_session.OSSession.generate_coverage_report`."""
+        command_result = self.send_command(r"lcov --version | grep -oP '\d+\.\d+'")
+        lcov_version = float(
+            command_result.stdout if command_result.return_code == 0 and command_result else -1
+        )
+        command_result = self.send_command(
+            r"gcov --version | head -n 1 | grep -oP '\d+\.\d+' | tail -n 1"
+        )
+        gcov_version = float(
+            command_result.stdout if command_result.return_code == 0 and command_result else -1
+        )
+
+        if lcov_version >= 1.15 and gcov_version >= 8.0:
+            self.send_command(f"ninja -C {remote_build_dir} coverage-html", timeout=600)
+            return True
+        else:
+            self._logger.info(
+                "Unable to generate code coverage report, ensure lcov v1.15 and at least gcov v8.0"
+            )
+            return False
+
     def get_dpdk_version(self, build_dir: str | PurePath) -> str:
         """Overrides :meth:`~.os_session.OSSession.get_dpdk_version`."""
         out = self.send_command(f"cat {self.join_remote_path(build_dir, 'VERSION')}", verify=True)
diff --git a/dts/framework/utils.py b/dts/framework/utils.py
index 9917ffbfaa..fb6b95271e 100644
--- a/dts/framework/utils.py
+++ b/dts/framework/utils.py
@@ -125,6 +125,14 @@ def __str__(self) -> str:
         """The actual args."""
         return " ".join(f"{self._default_library} {self._dpdk_args}".split())
 
+    def _add_arg(self, arg: str):
+        """Adds an argument to the meson setup command.
+
+        Args:
+            arg: The meson build argument to be added.
+        """
+        self._dpdk_args = self._dpdk_args + " " + arg
+
 
 class TarCompressionFormat(StrEnum):
     """Compression formats that tar can use.
-- 
2.54.0


^ permalink raw reply related

* [PATCH v3 6/6] examples: remove unused define
From: Stephen Hemminger @ 2026-06-25 17:46 UTC (permalink / raw)
  To: dev
  Cc: Stephen Hemminger, Akhil Goyal, Fan Zhang, Sunil Kumar Kori,
	Pavan Nikhilesh, Bruce Richardson, Anatoly Burakov,
	Sivaprasad Tummala
In-Reply-To: <20260625174908.278408-1-stephen@networkplumber.org>

The #define MAX_TX_QUEUE_PER_PORT was copy/pasted across
all the examples but never used.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 examples/l2fwd-crypto/main.c          | 1 -
 examples/l2fwd-event/l2fwd_common.h   | 1 -
 examples/l2fwd-jobstats/main.c        | 1 -
 examples/l2fwd-keepalive/main.c       | 1 -
 examples/l2fwd-macsec/main.c          | 1 -
 examples/l2fwd/main.c                 | 2 +-
 examples/l3fwd-power/main.c           | 1 -
 examples/link_status_interrupt/main.c | 1 -
 8 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/examples/l2fwd-crypto/main.c b/examples/l2fwd-crypto/main.c
index ff189b5fab..aab356aa6a 100644
--- a/examples/l2fwd-crypto/main.c
+++ b/examples/l2fwd-crypto/main.c
@@ -100,7 +100,6 @@ struct op_buffer {
 };
 
 #define MAX_RX_QUEUE_PER_LCORE 16
-#define MAX_TX_QUEUE_PER_PORT 16
 
 enum l2fwd_crypto_xform_chain {
 	L2FWD_CRYPTO_CIPHER_HASH,
diff --git a/examples/l2fwd-event/l2fwd_common.h b/examples/l2fwd-event/l2fwd_common.h
index f4f1c45cd1..53774e0fe6 100644
--- a/examples/l2fwd-event/l2fwd_common.h
+++ b/examples/l2fwd-event/l2fwd_common.h
@@ -43,7 +43,6 @@
 
 #define MAX_PKT_BURST 32
 #define MAX_RX_QUEUE_PER_LCORE 16
-#define MAX_TX_QUEUE_PER_PORT 16
 
 #define RX_DESC_DEFAULT 1024
 #define TX_DESC_DEFAULT 1024
diff --git a/examples/l2fwd-jobstats/main.c b/examples/l2fwd-jobstats/main.c
index f2c284cebb..856914e9f2 100644
--- a/examples/l2fwd-jobstats/main.c
+++ b/examples/l2fwd-jobstats/main.c
@@ -64,7 +64,6 @@ static uint32_t l2fwd_dst_ports[RTE_MAX_ETHPORTS];
 static unsigned int l2fwd_rx_queue_per_lcore = 1;
 
 #define MAX_RX_QUEUE_PER_LCORE 16
-#define MAX_TX_QUEUE_PER_PORT 16
 /* List of queues to be polled for given lcore. 8< */
 struct __rte_cache_aligned lcore_queue_conf {
 	unsigned n_rx_port;
diff --git a/examples/l2fwd-keepalive/main.c b/examples/l2fwd-keepalive/main.c
index 12ca60c3e4..fde997f778 100644
--- a/examples/l2fwd-keepalive/main.c
+++ b/examples/l2fwd-keepalive/main.c
@@ -65,7 +65,6 @@ static uint32_t l2fwd_dst_ports[RTE_MAX_ETHPORTS];
 static unsigned int l2fwd_rx_queue_per_lcore = 1;
 
 #define MAX_RX_QUEUE_PER_LCORE 16
-#define MAX_TX_QUEUE_PER_PORT 16
 struct __rte_cache_aligned lcore_queue_conf {
 	unsigned n_rx_port;
 	unsigned rx_port_list[MAX_RX_QUEUE_PER_LCORE];
diff --git a/examples/l2fwd-macsec/main.c b/examples/l2fwd-macsec/main.c
index 98763440bc..e40e5b6284 100644
--- a/examples/l2fwd-macsec/main.c
+++ b/examples/l2fwd-macsec/main.c
@@ -83,7 +83,6 @@ static uint16_t nb_port_pair_params;
 static unsigned int l2fwd_rx_queue_per_lcore = 1;
 
 #define MAX_RX_QUEUE_PER_LCORE 16
-#define MAX_TX_QUEUE_PER_PORT 16
 /* List of queues to be polled for a given lcore. 8< */
 struct __rte_cache_aligned lcore_queue_conf {
 	unsigned int n_rx_port;
diff --git a/examples/l2fwd/main.c b/examples/l2fwd/main.c
index 59ea3172ae..1c4a89ae90 100644
--- a/examples/l2fwd/main.c
+++ b/examples/l2fwd/main.c
@@ -80,7 +80,7 @@ static uint16_t nb_port_pair_params;
 static unsigned int l2fwd_rx_queue_per_lcore = 1;
 
 #define MAX_RX_QUEUE_PER_LCORE 16
-#define MAX_TX_QUEUE_PER_PORT 16
+
 /* List of queues to be polled for a given lcore. 8< */
 struct __rte_cache_aligned lcore_queue_conf {
 	unsigned n_rx_port;
diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c
index ff0e61e639..0915ed5658 100644
--- a/examples/l3fwd-power/main.c
+++ b/examples/l3fwd-power/main.c
@@ -221,7 +221,6 @@ struct __rte_cache_aligned lcore_rx_queue {
 };
 
 #define MAX_RX_QUEUE_PER_LCORE 16
-#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
 #define MAX_RX_QUEUE_PER_PORT 128
 
 #define MAX_RX_QUEUE_INTERRUPT_PER_PORT 16
diff --git a/examples/link_status_interrupt/main.c b/examples/link_status_interrupt/main.c
index aa33e71d7a..52bfb3c35e 100644
--- a/examples/link_status_interrupt/main.c
+++ b/examples/link_status_interrupt/main.c
@@ -63,7 +63,6 @@ static unsigned lsi_dst_ports[RTE_MAX_ETHPORTS] = {0};
 #define MAX_PKT_BURST 32
 
 #define MAX_RX_QUEUE_PER_LCORE 16
-#define MAX_TX_QUEUE_PER_PORT 16
 /* List of queues must be polled for a give lcore. 8< */
 struct __rte_cache_aligned lcore_queue_conf {
 	unsigned n_rx_port;
-- 
2.53.0


^ permalink raw reply related

* [PATCH v3 5/6] examples/ipv4_multicast: remove redundant Tx queue limit
From: Stephen Hemminger @ 2026-06-25 17:46 UTC (permalink / raw)
  To: dev; +Cc: Sivaprasad Tummala, stable
In-Reply-To: <20260625174908.278408-1-stephen@networkplumber.org>

From: Sivaprasad Tummala <sivaprasad.tummala@amd.com>

In `ipv4_multicast` application, Tx queues are configured per lcore
to enable a lockless design and achieve optimal performance.

The `MAX_TX_QUEUE_PER_PORT` macro, defined as 16,
introduced an artificial constraint on the number of Tx queues
and limited core-scaling performance.

This patch removes the unused `MAX_TX_QUEUE_PER_PORT` macro and
redundant Tx queue check, allowing Tx queues to scale directly
with the no. of lcores.

Fixes: af75078fece3 ("first public release")
Cc: stable@dpdk.org

Signed-off-by: Sivaprasad Tummala <sivaprasad.tummala@amd.com>
---
 examples/ipv4_multicast/main.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/examples/ipv4_multicast/main.c b/examples/ipv4_multicast/main.c
index bd4c3f335b..1cb621cb8b 100644
--- a/examples/ipv4_multicast/main.c
+++ b/examples/ipv4_multicast/main.c
@@ -96,7 +96,6 @@ struct mbuf_table {
 };
 
 #define MAX_RX_QUEUE_PER_LCORE 16
-#define MAX_TX_QUEUE_PER_PORT 16
 struct __rte_cache_aligned lcore_queue_conf {
 	uint64_t tx_tsc;
 	uint16_t n_rx_queue;
@@ -735,8 +734,6 @@ main(int argc, char **argv)
 		fflush(stdout);
 
 		n_tx_queue = nb_lcores;
-		if (n_tx_queue > MAX_TX_QUEUE_PER_PORT)
-			n_tx_queue = MAX_TX_QUEUE_PER_PORT;
 
 		ret = rte_eth_dev_configure(portid, 1, (uint16_t)n_tx_queue,
 					    &local_port_conf);
-- 
2.53.0


^ permalink raw reply related

* [PATCH v3 4/6] examples/ip_reassembly: remove redundant Tx queue limit
From: Stephen Hemminger @ 2026-06-25 17:46 UTC (permalink / raw)
  To: dev; +Cc: Sivaprasad Tummala, stable, Konstantin Ananyev
In-Reply-To: <20260625174908.278408-1-stephen@networkplumber.org>

From: Sivaprasad Tummala <sivaprasad.tummala@amd.com>

In `ip_reassembly` application, Tx queues are configured per lcore
to enable a lockless design and achieve optimal performance.

The `MAX_TX_QUEUE_PER_PORT` macro, defined as 16,
introduced an artificial constraint on the number of Tx queues
and limited core-scaling performance.

This patch removes the unused `MAX_TX_QUEUE_PER_PORT` macro and
redundant Tx queue check, allowing Tx queues to scale directly
with the no. of lcores.

Fixes: cc8f4d020c0b ("examples/ip_reassembly: initial import")
Cc: stable@dpdk.org

Signed-off-by: Sivaprasad Tummala <sivaprasad.tummala@amd.com>
---
 examples/ip_reassembly/main.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/examples/ip_reassembly/main.c b/examples/ip_reassembly/main.c
index 520fbea1c2..0814d47a66 100644
--- a/examples/ip_reassembly/main.c
+++ b/examples/ip_reassembly/main.c
@@ -144,7 +144,6 @@ struct tx_lcore_stat {
 };
 
 #define MAX_RX_QUEUE_PER_LCORE 16
-#define MAX_TX_QUEUE_PER_PORT 16
 #define MAX_RX_QUEUE_PER_PORT 128
 
 struct __rte_cache_aligned lcore_queue_conf {
@@ -1097,8 +1096,6 @@ main(int argc, char **argv)
 		fflush(stdout);
 
 		n_tx_queue = nb_lcores;
-		if (n_tx_queue > MAX_TX_QUEUE_PER_PORT)
-			n_tx_queue = MAX_TX_QUEUE_PER_PORT;
 		if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
 			local_port_conf.txmode.offloads |=
 				RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
-- 
2.53.0


^ permalink raw reply related

* [PATCH v3 3/6] examples/ip_fragmentation: remove redundant Tx queue limit
From: Stephen Hemminger @ 2026-06-25 17:46 UTC (permalink / raw)
  To: dev; +Cc: Sivaprasad Tummala, stable, Konstantin Ananyev
In-Reply-To: <20260625174908.278408-1-stephen@networkplumber.org>

From: Sivaprasad Tummala <sivaprasad.tummala@amd.com>

In `ip_fragmentation` application, Tx queues are configured per lcore
to enable a lockless design and achieve optimal performance.

The `MAX_TX_QUEUE_PER_PORT` macro, defined as 16,
introduced an artificial constraint on the number of Tx queues
and limited core-scaling performance.

This patch removes the unused `MAX_TX_QUEUE_PER_PORT` macro and
redundant Tx queue check, allowing Tx queues to scale directly
with the no. of lcores.

Fixes: af75078fece3 ("first public release")
Cc: stable@dpdk.org

Signed-off-by: Sivaprasad Tummala <sivaprasad.tummala@amd.com>
---
 examples/ip_fragmentation/main.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/examples/ip_fragmentation/main.c b/examples/ip_fragmentation/main.c
index 2180682373..132550e497 100644
--- a/examples/ip_fragmentation/main.c
+++ b/examples/ip_fragmentation/main.c
@@ -133,7 +133,6 @@ struct rx_queue {
 };
 
 #define MAX_RX_QUEUE_PER_LCORE 16
-#define MAX_TX_QUEUE_PER_PORT 16
 struct __rte_cache_aligned lcore_queue_conf {
 	uint16_t n_rx_queue;
 	uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
@@ -947,8 +946,6 @@ main(int argc, char **argv)
 		fflush(stdout);
 
 		n_tx_queue = nb_lcores;
-		if (n_tx_queue > MAX_TX_QUEUE_PER_PORT)
-			n_tx_queue = MAX_TX_QUEUE_PER_PORT;
 		ret = rte_eth_dev_configure(portid, 1, (uint16_t)n_tx_queue,
 					    &local_port_conf);
 		if (ret < 0) {
-- 
2.53.0


^ permalink raw reply related

* [PATCH v3 2/6] examples/l3fwd: remove redundant Tx queue limit
From: Stephen Hemminger @ 2026-06-25 17:46 UTC (permalink / raw)
  To: dev; +Cc: Sivaprasad Tummala
In-Reply-To: <20260625174908.278408-1-stephen@networkplumber.org>

From: Sivaprasad Tummala <sivaprasad.tummala@amd.com>

In `l3fwd` application, Tx queues are configured per lcore
to enable a lockless design and achieve optimal performance.

The `MAX_TX_QUEUE_PER_PORT` macro is defined as `RTE_MAX_LCORE`,
which makes the Tx queue limit check dead code.

Signed-off-by: Sivaprasad Tummala <sivaprasad.tummala@amd.com>
---
 examples/l3fwd/main.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index df035b508c..6866811526 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -48,7 +48,6 @@
 #include "l3fwd_event.h"
 #include "l3fwd_route.h"
 
-#define MAX_TX_QUEUE_PER_PORT RTE_MAX_LCORE
 #define MAX_RX_QUEUE_PER_PORT 128
 
 #define MAX_LCORE_PARAMS 1024
@@ -1373,8 +1372,6 @@ l3fwd_poll_resource_setup(void)
 
 		nb_rx_queue = get_port_n_rx_queues(portid);
 		n_tx_queue = nb_lcores;
-		if (n_tx_queue > MAX_TX_QUEUE_PER_PORT)
-			n_tx_queue = MAX_TX_QUEUE_PER_PORT;
 		printf("Creating queues: nb_rxq=%d nb_txq=%u... ",
 			nb_rx_queue, (unsigned)n_tx_queue );
 
-- 
2.53.0


^ permalink raw reply related

* [PATCH v3 1/6] examples/l3fwd-graph: remove redundant Tx queue limit
From: Stephen Hemminger @ 2026-06-25 17:46 UTC (permalink / raw)
  To: dev
  Cc: Sivaprasad Tummala, ndabilpuram, stable, Jerin Jacob,
	Kiran Kumar K, Zhirun Yan
In-Reply-To: <20260625174908.278408-1-stephen@networkplumber.org>

From: Sivaprasad Tummala <sivaprasad.tummala@amd.com>

In `l3fwd-graph` application, Tx queues are configured per lcore
to enable a lockless design and achieve optimal performance.

The `MAX_TX_QUEUE_PER_PORT` macro, defined as `RTE_MAX_ETHPORTS`,
introduced an artificial constraint on the number of Tx queues
and limited core-scaling performance.

This patch removes the unused `MAX_TX_QUEUE_PER_PORT` macro and
redundant Tx queue check, allowing Tx queues to scale directly
with the no. of lcores.

Fixes: 08bd1a174461 ("examples/l3fwd-graph: add graph-based l3fwd skeleton")
Cc: ndabilpuram@marvell.com
Cc: stable@dpdk.org

Signed-off-by: Sivaprasad Tummala <sivaprasad.tummala@amd.com>
---
 examples/l3fwd-graph/main.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/examples/l3fwd-graph/main.c b/examples/l3fwd-graph/main.c
index 01c65b0abd..5f89286dce 100644
--- a/examples/l3fwd-graph/main.c
+++ b/examples/l3fwd-graph/main.c
@@ -49,7 +49,6 @@
 #define RX_DESC_DEFAULT 1024
 #define TX_DESC_DEFAULT 1024
 
-#define MAX_TX_QUEUE_PER_PORT RTE_MAX_ETHPORTS
 #define MAX_RX_QUEUE_PER_PORT 128
 
 #define MAX_RX_QUEUE_PER_LCORE 16
@@ -1076,8 +1075,6 @@ main(int argc, char **argv)
 
 		nb_rx_queue = get_port_n_rx_queues(portid);
 		n_tx_queue = nb_lcores;
-		if (n_tx_queue > MAX_TX_QUEUE_PER_PORT)
-			n_tx_queue = MAX_TX_QUEUE_PER_PORT;
 		printf("Creating queues: nb_rxq=%d nb_txq=%u... ",
 		       nb_rx_queue, n_tx_queue);
 
-- 
2.53.0


^ permalink raw reply related

* [PATCH v3 0/6] examples: remove MAX_TX_QUEUE_PER_PORT
From: Stephen Hemminger @ 2026-06-25 17:46 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger
In-Reply-To: <20250901154400.2333310-1-sivaprasad.tummala@amd.com>

This is an old set of patches that removes the use of MAX_TX_QUEUE_PER_PORT
in all the examples.

Sivaprasad Tummala (5):
  examples/l3fwd-graph: remove redundant Tx queue limit
  examples/l3fwd: remove redundant Tx queue limit
  examples/ip_fragmentation: remove redundant Tx queue limit
  examples/ip_reassembly: remove redundant Tx queue limit
  examples/ipv4_multicast: remove redundant Tx queue limit

Stephen Hemminger (1):
  examples: remove unused define

 examples/ip_fragmentation/main.c      | 3 ---
 examples/ip_reassembly/main.c         | 3 ---
 examples/ipv4_multicast/main.c        | 3 ---
 examples/l2fwd-crypto/main.c          | 1 -
 examples/l2fwd-event/l2fwd_common.h   | 1 -
 examples/l2fwd-jobstats/main.c        | 1 -
 examples/l2fwd-keepalive/main.c       | 1 -
 examples/l2fwd-macsec/main.c          | 1 -
 examples/l2fwd/main.c                 | 2 +-
 examples/l3fwd-graph/main.c           | 3 ---
 examples/l3fwd-power/main.c           | 1 -
 examples/l3fwd/main.c                 | 3 ---
 examples/link_status_interrupt/main.c | 1 -
 13 files changed, 1 insertion(+), 23 deletions(-)

-- 
2.53.0


^ permalink raw reply

* Re: [PATCH v3 1/2] dts: add code coverage reporting to DTS
From: Koushik Bhargav Nimoji @ 2026-06-25 17:41 UTC (permalink / raw)
  To: Patrick Robb; +Cc: luca.vizzarro, dev, abailey, ahassick, lylavoie
In-Reply-To: <CAK6DuxvoMGyBmGSu-6WYnC2bQqV=SeqoOME=m=Xs-5UgRv+=hg@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 14221 bytes --]

On Wed, Jun 24, 2026 at 10:34 PM Patrick Robb <patrickrobb1997@gmail.com>
wrote:

>
>
> On Mon, Jun 22, 2026 at 1:02 PM Koushik Bhargav Nimoji <
> knimoji@iol.unh.edu> wrote:
>
>> Previously, DTS had no code coverage. This patch adds a command line
>> argument in order to build DPDK with code coverage enabled. This allows
>> users to create and view code coverage reports of what code and functions
>> were called during a DTS run.
>>
>> Signed-off-by: Koushik Bhargav Nimoji <knimoji@iol.unh.edu>
>> ---
>> v2:
>>     *Fixed error in lcov/gcov tool detection
>> v3:
>>     *Fixed type hints and error message typos
>> ---
>>  .mailmap                                      |  1 +
>>  doc/guides/tools/dts.rst                      | 15 +++++++++++++
>>  dts/README.md                                 |  5 +++++
>>  dts/framework/remote_session/dpdk.py          | 19 ++++++++++++++++
>>  .../remote_session/remote_session.py          |  5 ++++-
>>  dts/framework/settings.py                     | 10 +++++++++
>>  dts/framework/testbed_model/os_session.py     | 10 +++++++++
>>  dts/framework/testbed_model/posix_session.py  | 22 +++++++++++++++++++
>>  dts/framework/utils.py                        |  8 +++++++
>>  9 files changed, 94 insertions(+), 1 deletion(-)
>>
>> diff --git a/.mailmap b/.mailmap
>> index e052b85213..a1209150ad 100644
>> --- a/.mailmap
>> +++ b/.mailmap
>> @@ -877,6 +877,7 @@ Klaus Degner <kd@allegro-packets.com>
>>  Kommula Shiva Shankar <kshankar@marvell.com>
>>  Konstantin Ananyev <konstantin.ananyev@huawei.com> <
>> konstantin.v.ananyev@yandex.ru>
>>  Konstantin Ananyev <konstantin.ananyev@huawei.com> <
>> konstantin.ananyev@intel.com>
>> +Koushik Bhargav Nimoji <knimoji@iol.unh.edu>
>>  Krishna Murthy <krishna.j.murthy@intel.com>
>>  Krzysztof Galazka <krzysztof.galazka@intel.com>
>>  Krzysztof Kanas <kkanas@marvell.com> <krzysztof.kanas@caviumnetworks.com
>> >
>> diff --git a/doc/guides/tools/dts.rst b/doc/guides/tools/dts.rst
>> index 5b9a348016..a838a317ee 100644
>> --- a/doc/guides/tools/dts.rst
>> +++ b/doc/guides/tools/dts.rst
>> @@ -352,6 +352,10 @@ DTS is run with ``main.py`` located in the ``dts``
>> directory using the ``poetry
>>       --precompiled-build-dir DIR_NAME
>>                             [DTS_PRECOMPILED_BUILD_DIR] Define the
>> subdirectory under the DPDK tree root directory or tarball where the pre-
>>                             compiled binaries are located. (default: None)
>> +     --code-coverage       Builds DPDK on the SUT node with code
>> coverage enabled. Generates a code coverage report which can be found on
>> +                           the local filesystem at
>> dts/output/coverage_reports/meson-logs/coveragereport/index.html, or the
>> specified output
>>
>
> at the DTS execution host's local filesystem
>
> I realize you are presumably concating what gcov/lcov gives you but can
> the dts/output/coverage_reports/meson-logs/coveragereport/index.html,
> path be shortened? Seems like 2-3 of those middle dir names can be dropped
> hah. Not a big deal if left as is for any reason.
>

I looked into shortening the path initially, but based on the structure of
the report and its components, I believe it would be simplest to keep it
as is.


>
>> +                           directory. To use code coverage, please
>> ensure lcov v1.15 and gcov v8.0 or higher (included in gcc package) are
>> +                           installed on the SUT node.
>>
>>
>>  The brackets contain the names of environment variables that set the
>> same thing.
>> @@ -367,6 +371,17 @@ Results are stored in the output dir by default
>>  which be changed with the ``--output-dir`` command line argument.
>>  The results contain basic statistics of passed/failed test cases and
>> DPDK version.
>>
>> +Code Coverage
>> +~~~~~~~~~~~~~
>> +
>> +DTS has the ability to track code usage during test runs, and generate
>> an HTML
>>
>
> I'm sure it's obvious to most readers what coverage we are talking about
> here, but why not just explicitly say DTS can generate coverage reports
> which show the code coverage % for DPDK libraries and drivers touched
> during the testsuite(s) execution? It never hurts to be extra clear. :)
>
> VERY IMPORTANT: You need to explain what the code coverage behavior is. Is
> it a code coverage report per testrun? or per testsuite?
>
> If it is per testrun, what happens if we use a prebuilt DPDK dir? Then do
> coverage stats bleed over between runs because the build dir is preserved?
> (happy to talk about this tomorrow if I'm not phrasing it clearly).
>
> +coverage report with that data. This can be done by using the
>> "--code-coverage"
>> +CLI parameter when running DTS.
>> +
>> +To use code coverage, please make sure the following dependencies are
>> available
>> +on the SUT node:
>> +- lcov v1.15
>>
>
> code says 1.15 or greater
>
>
>> +- gcov v8.0 or greater (included in gcc package)
>>
>>  Contributing to DTS
>>  -------------------
>> diff --git a/dts/README.md b/dts/README.md
>> index d257b7a167..51f824e077 100644
>> --- a/dts/README.md
>> +++ b/dts/README.md
>> @@ -64,6 +64,11 @@ $ poetry run ./main.py
>>  These commands will give you a bash shell inside a docker container
>>  with all DTS Python dependencies installed.
>>
>> +# Code Coverage
>> +
>> +To generate code coverage reports, ensure the SUT has lcov v1.15 and
>> gcov v8.0 or greater
>> +installed, and that DTS is run using the '--code-coverage' argument.
>>
>
> Not that I'm opposed, but it is interesting to me that we are exposing
> this toggle as a flag but not as a test_run.yaml option. I was about to
> suggest adding a test_run.yaml boolean field for it but... maybe we need to
> relax on the amount of fields we put in there. It might be better for some
> of the more "infrequently used" options to be flag only, for
> readability reasons. Happy to defer to your judgement here.
>

I agree: code coverage is more of an "add-on" to a DTS run, whereas the
fields in test_run.yaml are required for the test run itself, so
it would be better to keep it as a flag.


> +
>>  ## Visual Studio Code
>>
>>  Usage of VScode devcontainers is NOT required for developing on DTS and
>> running DTS,
>> diff --git a/dts/framework/remote_session/dpdk.py
>> b/dts/framework/remote_session/dpdk.py
>> index c3575cfcaf..865f97f6ca 100644
>> --- a/dts/framework/remote_session/dpdk.py
>> +++ b/dts/framework/remote_session/dpdk.py
>> @@ -29,6 +29,7 @@
>>  from framework.logger import DTSLogger, get_dts_logger
>>  from framework.params.eal import EalParams
>>  from framework.remote_session.remote_session import CommandResult
>> +from framework.settings import SETTINGS
>>  from framework.testbed_model.cpu import LogicalCore, LogicalCoreCount,
>> LogicalCoreList, lcore_filter
>>  from framework.testbed_model.node import Node
>>  from framework.testbed_model.os_session import OSSession
>> @@ -107,7 +108,22 @@ def teardown(self) -> None:
>>          """Teardown the DPDK build on the target node.
>>
>>          Removes the DPDK tree and/or build directory/tarball depending
>> on the configuration.
>> +        If code coverage is enabled, the coverage report and .info file
>> are generated and
>> +        copied onto the local filesystem before teardown.
>>          """
>> +        if SETTINGS.code_coverage:
>> +            report_folder = PurePath(self.remote_dpdk_build_dir /
>> "meson-logs")
>> +            output_dir = SETTINGS.output_dir
>> +            Path(output_dir).mkdir(parents=True, exist_ok=True)
>> +
>> +            coverage_status =
>> self._session.generate_coverage_report(self.remote_dpdk_build_dir)
>> +            if coverage_status:
>> +                self._session.copy_dir_from(report_folder, output_dir)
>> +                self._logger.info(
>> +                    "Coverage HTML report generated, "
>> +                    f"available at
>> {output_dir}/meson-logs/coveragereports/index.html"
>> +                )
>> +
>>          match self.config.dpdk_location:
>>              case LocalDPDKTreeLocation():
>>
>>  self._node.main_session.remove_remote_dir(self.remote_dpdk_tree_path)
>> @@ -272,6 +288,9 @@ def _build_dpdk(self) -> None:
>>          else:
>>              meson_args = MesonArgs(default_library="static",
>> libdir="lib")
>>
>> +        if SETTINGS.code_coverage:
>> +            meson_args._add_arg("-Db_coverage=true")
>> +
>>          self._session.build_dpdk(
>>              self._env_vars,
>>              meson_args,
>> diff --git a/dts/framework/remote_session/remote_session.py
>> b/dts/framework/remote_session/remote_session.py
>> index 158325bb7f..d2440dc2d8 100644
>> --- a/dts/framework/remote_session/remote_session.py
>> +++ b/dts/framework/remote_session/remote_session.py
>> @@ -252,7 +252,10 @@ def copy_from(self, source_file: str | PurePath,
>> destination_dir: str | Path) ->
>>              destination_dir: The directory path on the local filesystem
>> where the `source_file`
>>                  will be saved.
>>          """
>> -        self.session.get(str(source_file), str(destination_dir))
>> +        source_file = PurePath(source_file)
>> +        destination_dir = Path(destination_dir)
>> +        local_path = destination_dir / source_file.name
>> +        self.session.get(str(source_file), str(local_path))
>>
>>      def copy_to(self, source_file: str | Path, destination_dir: str |
>> PurePath) -> None:
>>          """Copy a file from local filesystem to the remote Node.
>> diff --git a/dts/framework/settings.py b/dts/framework/settings.py
>> index b08373b7ea..7df535bd84 100644
>> --- a/dts/framework/settings.py
>> +++ b/dts/framework/settings.py
>> @@ -159,6 +159,8 @@ class Settings:
>>      re_run: int = 0
>>      #:
>>      random_seed: int | None = None
>> +    #:
>> +    code_coverage: bool = False
>>
>>
>>  SETTINGS: Settings = Settings()
>> @@ -489,6 +491,14 @@ def _get_parser() -> _DTSArgumentParser:
>>      )
>>      _add_env_var_to_action(action)
>>
>> +    action = parser.add_argument(
>> +        "--code-coverage",
>> +        action="store_true",
>> +        default=False,
>> +        help="Used to build DPDK with code coverage enabled.",
>> +    )
>> +    _add_env_var_to_action(action)
>> +
>>      return parser
>>
>>
>> diff --git a/dts/framework/testbed_model/os_session.py
>> b/dts/framework/testbed_model/os_session.py
>> index 2c267afed1..742b074948 100644
>> --- a/dts/framework/testbed_model/os_session.py
>> +++ b/dts/framework/testbed_model/os_session.py
>> @@ -480,6 +480,16 @@ def build_dpdk(
>>              timeout: Wait at most this long in seconds for the build
>> execution to complete.
>>          """
>>
>> +    @abstractmethod
>> +    def generate_coverage_report(self, remote_build_dir: PurePath |
>> None) -> bool:
>> +        """Generates a code coverage report for a DTS run.
>> +
>> +        Args:
>> +            remote_build_dir: The remote DPDK build directory
>> +        Returns:
>> +            Whether the coverage report was able to be created or not.
>> +        """
>> +
>>      @abstractmethod
>>      def get_dpdk_version(self, version_path: str | PurePath) -> str:
>>          """Inspect the DPDK version on the remote node.
>> diff --git a/dts/framework/testbed_model/posix_session.py
>> b/dts/framework/testbed_model/posix_session.py
>> index dec952685a..d18ce27de2 100644
>> --- a/dts/framework/testbed_model/posix_session.py
>> +++ b/dts/framework/testbed_model/posix_session.py
>> @@ -295,6 +295,28 @@ def build_dpdk(
>>          except RemoteCommandExecutionError as e:
>>              raise DPDKBuildError(f"DPDK build failed when doing
>> '{e.command}'.")
>>
>> +    def generate_coverage_report(self, remote_build_dir: PurePath |
>> None) -> bool:
>> +        """Overrides
>> :meth:`~.os_session.OSSession.generate_coverage_report`."""
>> +        command_result = self.send_command(r"lcov --version | grep -oP
>> '\d+\.\d+'")
>> +        lcov_version = float(
>> +            command_result.stdout if command_result.return_code == 0 and
>> command_result else -1
>> +        )
>> +        command_result = self.send_command(
>> +            r"gcov --version | head -n 1 | grep -oP '\d+\.\d+' | tail -n
>> 1"
>> +        )
>> +        gcov_version = float(
>> +            command_result.stdout if command_result.return_code == 0 and
>> command_result else -1
>> +        )
>> +
>> +        if lcov_version >= 1.15 and gcov_version >= 8.0:
>> +            self.send_command(f"ninja -C {remote_build_dir}
>> coverage-html", timeout=600)
>> +            return True
>> +        else:
>> +            self._logger.info(
>> +                "Unable to generate code coverage report, ensure lcov
>> v1.15 and at least gcov v8.0"
>> +            )
>> +            return False
>> +
>>      def get_dpdk_version(self, build_dir: str | PurePath) -> str:
>>          """Overrides :meth:`~.os_session.OSSession.get_dpdk_version`."""
>>          out = self.send_command(f"cat {self.join_remote_path(build_dir,
>> 'VERSION')}", verify=True)
>> diff --git a/dts/framework/utils.py b/dts/framework/utils.py
>> index 9917ffbfaa..38da88cd9c 100644
>> --- a/dts/framework/utils.py
>> +++ b/dts/framework/utils.py
>> @@ -125,6 +125,14 @@ def __str__(self) -> str:
>>          """The actual args."""
>>          return " ".join(f"{self._default_library}
>> {self._dpdk_args}".split())
>>
>> +    def _add_arg(self, arg: str):
>> +        """Used to add a meson build argument to the DPDK build.
>>
>
> Nit but rephrase to "Adds an argument to the Meson setup command"
>
>
>> +
>> +        Args:
>> +            arg: The meson build argument to be added.
>> +        """
>> +        self._dpdk_args = self._dpdk_args + " " + arg
>> +
>>
>>  class TarCompressionFormat(StrEnum):
>>      """Compression formats that tar can use.
>> --
>> 2.54.0
>>
>>
> Reviewed-by: Patrick Robb <patrickrobb1997@gmail.com>
>

[-- Attachment #2: Type: text/html, Size: 18808 bytes --]

^ permalink raw reply

* [PATCH v6 9/9] test/bpf: check that bpf_convert can be JIT'd
From: Stephen Hemminger @ 2026-06-25 17:30 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger, Marat Khalili, Konstantin Ananyev
In-Reply-To: <20260625173231.216074-1-stephen@networkplumber.org>

Run each converted filter through both the interpreter and the JIT and
check they agree, catching JIT miscompiles.

test_bpf_filter and test_bpf_match did nearly the same thing: compile,
load and run a filter against the dummy packet. Combine them into
test_bpf_match, which now builds the packet itself and returns whether
the filter matched. Callers run it for both load methods.

The dummy packet is a UDP packet to a fixed destination MAC, source
and destination ports, so the filter results are deterministic. None
of the sample filters should match it, so assert that; a convert or
JIT bug that flips a result is then caught. The destination MAC and
source port are chosen so the negative ethernet and port filters do
not match, and "port not 53 and not arp" is dropped as it matches
any non-ARP packet that lacks port 53.

Reduce log output to make it easier to match which expression might be
causing issues.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Acked-by: Marat Khalili <marat.khalili@huawei.com>
---
 app/test/test_bpf.c | 171 ++++++++++++++++++++++++++------------------
 1 file changed, 100 insertions(+), 71 deletions(-)

diff --git a/app/test/test_bpf.c b/app/test/test_bpf.c
index 16a1004e51..f56f6c7d7e 100644
--- a/app/test/test_bpf.c
+++ b/app/test/test_bpf.c
@@ -32,6 +32,7 @@ test_bpf(void)
 #include <rte_bpf.h>
 #include <rte_ether.h>
 #include <rte_ip.h>
+#include <rte_udp.h>
 
 
 /* Tests of most simple BPF programs (no instructions, one instruction etc.) */
@@ -4756,11 +4757,13 @@ load_cbpf_program_convert(struct bpf_program *cbpf_program, const char *str)
 		return NULL;
 	}
 
+#ifdef DEBUG
 	printf("bpf convert(\"%s\") produced:\n", str);
 	rte_bpf_dump(stdout, prm->ins, prm->nb_ins);
 
 	printf("%s \"%s\"\n", __func__, str);
 	test_bpf_dump(cbpf_program, prm);
+#endif
 
 	bpf = rte_bpf_load(prm);
 	rte_free(prm);
@@ -4785,18 +4788,65 @@ load_cbpf_program_direct(struct bpf_program *cbpf_program, const char *str __rte
 	});
 }
 
+static const load_cbpf_program_t cbpf_program_loaders[] = {
+	load_cbpf_program_convert,
+	load_cbpf_program_direct,
+};
+
+/* Setup Ethernet/IP/UDP headers in a dummy packet buffer for filter tests */
+static void
+dummy_ip_prep(void *data, uint16_t plen)
+{
+	struct {
+		struct rte_ether_hdr eth_hdr;
+		struct rte_ipv4_hdr ip_hdr;
+		struct rte_udp_hdr udp_hdr;
+	} *hdr = data;
+
+	hdr->eth_hdr = (struct rte_ether_hdr) {
+		.dst_addr.addr_bytes = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x0e },
+		.ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4),
+	};
+	hdr->ip_hdr = (struct rte_ipv4_hdr) {
+		.version_ihl = RTE_IPV4_VHL_DEF,
+		.total_length = rte_cpu_to_be_16(plen),
+		.time_to_live = IPDEFTTL,
+		.next_proto_id = IPPROTO_UDP,
+		.src_addr = rte_cpu_to_be_32(RTE_IPV4_LOOPBACK),
+		.dst_addr = rte_cpu_to_be_32(RTE_IPV4_BROADCAST),
+	};
+	hdr->udp_hdr = (struct rte_udp_hdr) {
+		.src_port = rte_cpu_to_be_16(49152),	/* fixed, avoids filter ports */
+		.dst_port = rte_cpu_to_be_16(9),	/* discard port */
+		.dgram_len = rte_cpu_to_be_16(plen - sizeof(struct rte_ipv4_hdr)),
+		.dgram_cksum = 0,
+	};
+}
+
+/*
+ * Compile a pcap filter, load it with the given loader, then run it against
+ * a standard dummy packet with both the interpreter and (when available) the
+ * JIT, checking the two agree.
+ *
+ * Returns 1 if the filter matched, 0 if it did not, and -1 on any error
+ * (compile, load, or interpreter/JIT mismatch).
+ */
 static int
-test_bpf_match(pcap_t *pcap, const char *str, struct rte_mbuf *mb,
+test_bpf_match(pcap_t *pcap, const char *str,
 	load_cbpf_program_t load_cbpf_program)
 {
+	uint8_t tbuf[RTE_MBUF_DEFAULT_BUF_SIZE];
+	const uint32_t plen = 100;
 	struct bpf_program fcode;
-	struct rte_bpf *bpf;
+	struct rte_mbuf mb = { 0 };
+	struct rte_bpf *bpf = NULL;
 	int ret = -1;
 	uint64_t rc;
 
+	printf("%s '%s'\n", __func__, str);
 	if (pcap_compile(pcap, &fcode, str, 1, PCAP_NETMASK_UNKNOWN)) {
 		printf("%s@%d: pcap_compile(\"%s\") failed: %s;\n",
-		       __func__, __LINE__,  str, pcap_geterr(pcap));
+		       __func__, __LINE__, str, pcap_geterr(pcap));
 		return -1;
 	}
 
@@ -4804,15 +4854,41 @@ test_bpf_match(pcap_t *pcap, const char *str, struct rte_mbuf *mb,
 	if (bpf == NULL) {
 		printf("%s@%d: failed to load cbpf program for \"%s\", error=%d(%s);\n",
 			__func__, __LINE__, str, rte_errno, strerror(rte_errno));
+		test_bpf_dump(&fcode, NULL);
 		goto error;
 	}
 
-	rc = rte_bpf_exec(bpf, mb);
-	/* The return code from bpf capture filter is non-zero if matched */
-	ret = (rc == 0);
+	dummy_mbuf_prep(&mb, tbuf, sizeof(tbuf), plen);
+	dummy_ip_prep(rte_pktmbuf_mtod(&mb, void *), plen);
+
+	rc = rte_bpf_exec(bpf, &mb);
+
+	/* Verify the JIT, when available, produces the same result. */
+	{
+		struct rte_bpf_jit jit;
+
+		rte_bpf_get_jit(bpf, &jit);
+		if (jit.func != NULL) {
+			fflush(stdout);
+			if (jit.func(&mb) != rc) {
+				printf("%s@%d: JIT return code does not match\n",
+				       __func__, __LINE__);
+				goto error;
+			}
+		}
+#ifdef RTE_BPF_JIT_SUPPORTED
+		else {
+			printf("%s@%d: no JIT code generated\n",
+			       __func__, __LINE__);
+			goto error;
+		}
+#endif
+	}
+
+	/* The return code from a bpf capture filter is non-zero if matched. */
+	ret = (rc != 0);
 error:
-	if (bpf)
-		rte_bpf_destroy(bpf);
+	rte_bpf_destroy(bpf);
 	pcap_freecode(&fcode);
 	return ret;
 }
@@ -4821,44 +4897,13 @@ test_bpf_match(pcap_t *pcap, const char *str, struct rte_mbuf *mb,
 static int
 test_bpf_filter_sanity(pcap_t *pcap)
 {
-	static const load_cbpf_program_t cbpf_program_loaders[] = {
-		load_cbpf_program_convert,
-		load_cbpf_program_direct,
-	};
-
-	const uint32_t plen = 100;
-	struct rte_mbuf mb, *m;
-	uint8_t tbuf[RTE_MBUF_DEFAULT_BUF_SIZE];
-	struct {
-		struct rte_ether_hdr eth_hdr;
-		struct rte_ipv4_hdr ip_hdr;
-	} *hdr;
-
-	memset(&mb, 0, sizeof(mb));
-	dummy_mbuf_prep(&mb, tbuf, sizeof(tbuf), plen);
-	m = &mb;
-
-	hdr = rte_pktmbuf_mtod(m, typeof(hdr));
-	hdr->eth_hdr = (struct rte_ether_hdr) {
-		.dst_addr.addr_bytes = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
-		.ether_type = rte_cpu_to_be_16(RTE_ETHER_TYPE_IPV4),
-	};
-	hdr->ip_hdr = (struct rte_ipv4_hdr) {
-		.version_ihl = RTE_IPV4_VHL_DEF,
-		.total_length = rte_cpu_to_be_16(plen),
-		.time_to_live = IPDEFTTL,
-		.next_proto_id = IPPROTO_RAW,
-		.src_addr = rte_cpu_to_be_32(RTE_IPV4_LOOPBACK),
-		.dst_addr = rte_cpu_to_be_32(RTE_IPV4_BROADCAST),
-	};
-
-	for (int li = 0; li != RTE_DIM(cbpf_program_loaders); ++li) {
-		if (test_bpf_match(pcap, "ip", m, cbpf_program_loaders[li]) != 0) {
+	for (unsigned int li = 0; li != RTE_DIM(cbpf_program_loaders); ++li) {
+		if (test_bpf_match(pcap, "ip", cbpf_program_loaders[li]) != 1) {
 			printf("%s@%d: filter \"ip\" doesn't match test data\n",
 			       __func__, __LINE__);
 			return -1;
 		}
-		if (test_bpf_match(pcap, "not ip", m, cbpf_program_loaders[li]) == 0) {
+		if (test_bpf_match(pcap, "not ip", cbpf_program_loaders[li]) != 0) {
 			printf("%s@%d: filter \"not ip\" does match test data\n",
 			       __func__, __LINE__);
 			return -1;
@@ -4882,7 +4927,6 @@ static const char * const sample_filters[] = {
 	"port 53",
 	"host 192.0.2.1 and not (port 80 or port 25)",
 	"host 2001:4b98:db0::8 and not port 80 and not port 25",
-	"port not 53 and not arp",
 	"(tcp[0:2] > 1500 and tcp[0:2] < 1550) or (tcp[2:2] > 1500 and tcp[2:2] < 1550)",
 	"ether proto 0x888e",
 	"ether[0] & 1 = 0 and ip[16] >= 224",
@@ -4909,35 +4953,10 @@ static const char * const sample_filters[] = {
 	"or host 192.0.2.1 or host 192.0.2.100 or host 192.0.2.200"),
 };
 
-static int
-test_bpf_filter(pcap_t *pcap, const char *s, load_cbpf_program_t load_cbpf_program)
-{
-	struct bpf_program fcode;
-	struct rte_bpf *bpf;
-
-	if (pcap_compile(pcap, &fcode, s, 1, PCAP_NETMASK_UNKNOWN)) {
-		printf("%s@%d: pcap_compile(\"%s\") failed: %s;\n",
-		       __func__, __LINE__, s, pcap_geterr(pcap));
-		return -1;
-	}
-
-	bpf = load_cbpf_program(&fcode, s);
-	if (bpf == NULL) {
-		printf("%s@%d: failed to load cbpf program for \"%s\", error=%d(%s);\n",
-			__func__, __LINE__, s, rte_errno, strerror(rte_errno));
-		test_bpf_dump(&fcode, NULL);
-	}
-
-	rte_bpf_destroy(bpf);
-
-	pcap_freecode(&fcode);
-	return (bpf == NULL) ? -1 : 0;
-}
-
 static int
 test_bpf_convert(void)
 {
-	unsigned int i;
+	unsigned int i, li;
 	pcap_t *pcap;
 	int rc;
 
@@ -4949,8 +4968,18 @@ test_bpf_convert(void)
 
 	rc = test_bpf_filter_sanity(pcap);
 	for (i = 0; i < RTE_DIM(sample_filters); i++) {
-		rc |= test_bpf_filter(pcap, sample_filters[i], load_cbpf_program_convert);
-		rc |= test_bpf_filter(pcap, sample_filters[i], load_cbpf_program_direct);
+		for (li = 0; li < RTE_DIM(cbpf_program_loaders); li++) {
+			int m = test_bpf_match(pcap, sample_filters[i],
+					       cbpf_program_loaders[li]);
+
+			/* None of the sample filters match the dummy packet. */
+			if (m != 0) {
+				if (m > 0)
+					printf("%s@%d: filter \"%s\" unexpectedly matched\n",
+					       __func__, __LINE__, sample_filters[i]);
+				rc = -1;
+			}
+		}
 	}
 
 	pcap_close(pcap);
-- 
2.53.0


^ permalink raw reply related

* [PATCH v6 8/9] test/bpf: check that JIT was generated
From: Stephen Hemminger @ 2026-06-25 17:30 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger, Marat Khalili, Konstantin Ananyev
In-Reply-To: <20260625173231.216074-1-stephen@networkplumber.org>

Avoid silently ignoring JIT failures. The test cases should
all succeed JIT compilation; if not, it is a bug in the JIT
implementation and should be reported.

Introduce a configuration setting RTE_BPF_JIT_SUPPORTED
which is cleaner than using an ARCH specific #ifdef.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Acked-by: Marat Khalili <marat.khalili@huawei.com>
---
 app/test/test_bpf.c | 8 ++++++++
 lib/bpf/meson.build | 2 ++
 2 files changed, 10 insertions(+)

diff --git a/app/test/test_bpf.c b/app/test/test_bpf.c
index 0e5894a532..16a1004e51 100644
--- a/app/test/test_bpf.c
+++ b/app/test/test_bpf.c
@@ -3649,6 +3649,14 @@ run_test(const struct bpf_test *tst)
 				rv, strerror(rv));
 		}
 	}
+#ifdef RTE_BPF_JIT_SUPPORTED
+	else {
+		/* a JIT backend exists for this arch, so it must compile */
+		printf("%s@%d: %s: no JIT code generated;\n",
+			__func__, __LINE__, tst->name);
+		ret = -1;
+	}
+#endif
 
 	rte_bpf_destroy(bpf);
 	return ret;
diff --git a/lib/bpf/meson.build b/lib/bpf/meson.build
index 7e8a300e3f..04ede96689 100644
--- a/lib/bpf/meson.build
+++ b/lib/bpf/meson.build
@@ -27,8 +27,10 @@ sources = files(
 )
 
 if arch_subdir == 'x86' and dpdk_conf.get('RTE_ARCH_64')
+    dpdk_conf.set('RTE_BPF_JIT_SUPPORTED', 1)
     sources += files('bpf_jit_x86.c')
 elif dpdk_conf.has('RTE_ARCH_ARM64')
+    dpdk_conf.set('RTE_BPF_JIT_SUPPORTED', 1)
     sources += files('bpf_jit_arm64.c')
 endif
 
-- 
2.53.0


^ permalink raw reply related

* [PATCH v6 7/9] bpf/arm64: add BPF_ABS/BPF_IND packet load support
From: Stephen Hemminger @ 2026-06-25 17:30 UTC (permalink / raw)
  To: dev
  Cc: Stephen Hemminger, Wathsala Vithanage, Konstantin Ananyev,
	Marat Khalili
In-Reply-To: <20260625173231.216074-1-stephen@networkplumber.org>

The arm64 JIT rejected BPF_LD | BPF_ABS and BPF_LD | BPF_IND with
"invalid opcode", so cBPF programs converted by rte_bpf_convert() could
not be JITed. Add these opcodes, mirroring the x86 JIT: a fast path for
data held in the first mbuf segment, and a __rte_pktmbuf_read() slow
path for everything else.

The forward branches over the call cannot use fixed distances:
emit_call() materializes the helper address with a variable number of
mov/movk instructions, so the block sizes are not known up front. Size
the three blocks (fast path, slow path, common tail) in a dry run, then
emit for real with the branches resolved from the measured offsets.

The effective offset is validated before use: src is a runtime value for
BPF_IND, so a negative offset is routed to the slow path rather than
read from the first segment, and the offset is bounded to UINT32_MAX
before __rte_pktmbuf_read(), whose off argument is uint32_t.

Programs using these opcodes use the call register layout, since the
slow path makes a function call.

For example, BPF_LD | BPF_IND | BPF_W (4-byte indirect load, mbuf in
R6/x19, effective offset kept in x9) emits:

	mov	x9, #imm		// off  = imm
	add	x9, x9, src		// off += src		(BPF_IND)
	cmp	x9, xzr			// reject negative
	b.mi	slow			//   effective offset
	mov	x10, #data_len_ofs
	ldrh	w10, [x19, x10]		// mbuf->data_len
	sub	x10, x10, x9		// data_len - off
	mov	x11, #sz
	cmp	x10, x11
	b.lt	slow			// not in first segment
	mov	x10, #data_off_ofs
	ldrh	w10, [x19, x10]		// mbuf->data_off
	mov	x7, #buf_addr_ofs
	ldr	x7, [x19, x7]		// mbuf->buf_addr
	add	x7, x7, x10
	add	x7, x7, x9		// ptr = buf_addr + data_off + off
	b	load
slow:
	mov	x10, #UINT32_MAX
	cmp	x9, x10
	b.ls	1f			// off fits uint32_t ...
	mov	x7, #0			//   else return 0
	b	epilogue
1:	mov	x1, x9			// __rte_pktmbuf_read(mbuf, off, sz, buf)
	mov	x0, x19
	mov	w2, #sz
	sub	x3, x25, #stack_ofs
	mov	x9, #<helper lo>
	movk	x9, #<helper hi>
	blr	x9
	mov	x7, x0			// ptr = return value
	cbnz	x7, load		// non-NULL -> common tail
	mov	x7, #0			//   else return 0
	b	epilogue
load:
	ldr	w7, [x7, xzr]		// *(uint32_t *)ptr	(size varies)
	rev32	x7, x7			// ntoh	(size varies; omitted for BPF_B)

For BPF_ABS the "add x9, x9, src" is omitted; the final load/byte-swap
vary with the access size.

Bugzilla ID: 1427

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 lib/bpf/bpf_jit_arm64.c | 169 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 168 insertions(+), 1 deletion(-)

diff --git a/lib/bpf/bpf_jit_arm64.c b/lib/bpf/bpf_jit_arm64.c
index 51906c7f0d..6d531dc83d 100644
--- a/lib/bpf/bpf_jit_arm64.c
+++ b/lib/bpf/bpf_jit_arm64.c
@@ -1133,6 +1133,155 @@ emit_branch(struct a64_jit_ctx *ctx, uint8_t op, uint32_t i, int16_t off)
 	emit_b_cond(ctx, ebpf_to_a64_cond(op), jump_offset_get(ctx, i, off));
 }
 
+/* LD_ABS/LD_IND code block offsets (in arm64 instructions) */
+enum {
+	LDMB_FAST_OFS, /* fast path */
+	LDMB_SLOW_OFS, /* slow path */
+	LDMB_FIN_OFS,  /* common tail */
+	LDMB_OFS_NUM
+};
+
+/*
+ * Helper for emit_ld_mbuf(): fast path.
+ * Compute the packet offset; if it lies inside the first segment leave the
+ * data pointer in R0, otherwise branch to the slow path.
+ */
+static void
+emit_ldmb_fast_path(struct a64_jit_ctx *ctx, uint8_t src, uint8_t mode,
+		    uint32_t sz, int32_t imm, const uint32_t ofs[LDMB_OFS_NUM])
+{
+	uint8_t r0 = ebpf_to_a64_reg(ctx, EBPF_REG_0);
+	uint8_t r6 = ebpf_to_a64_reg(ctx, EBPF_REG_6);
+	uint8_t tmp1 = ebpf_to_a64_reg(ctx, TMP_REG_1);
+	uint8_t tmp2 = ebpf_to_a64_reg(ctx, TMP_REG_2);
+	uint8_t tmp3 = ebpf_to_a64_reg(ctx, TMP_REG_3);
+
+	/* off = imm (+ src for BPF_IND) */
+	emit_mov_imm(ctx, 1, tmp1, imm);
+	if (mode == BPF_IND)
+		emit_add(ctx, 1, tmp1, src);
+
+	/*
+	 * A negative effective offset (src can be < 0 for BPF_IND) would pass
+	 * the signed check below and read before the segment, so route it to
+	 * the slow path, which rejects it via the uint32_t bound on off.
+	 */
+	emit_cmp(ctx, 1, tmp1, A64_ZR);
+	emit_b_cond(ctx, A64_MI, (int32_t)(ofs[LDMB_SLOW_OFS] - ctx->idx));
+
+	/* if ((int64_t)(mbuf->data_len - off) < sz) goto slow_path */
+	emit_mov_imm(ctx, 1, tmp2, offsetof(struct rte_mbuf, data_len));
+	emit_ldr(ctx, BPF_H, tmp2, r6, tmp2);
+	emit_sub(ctx, 1, tmp2, tmp1);
+	emit_mov_imm(ctx, 1, tmp3, sz);
+	emit_cmp(ctx, 1, tmp2, tmp3);
+	emit_b_cond(ctx, A64_LT, (int32_t)(ofs[LDMB_SLOW_OFS] - ctx->idx));
+
+	/* R0 = mbuf->buf_addr + mbuf->data_off + off */
+	emit_mov_imm(ctx, 1, tmp2, offsetof(struct rte_mbuf, data_off));
+	emit_ldr(ctx, BPF_H, tmp2, r6, tmp2);
+	emit_mov_imm(ctx, 1, r0, offsetof(struct rte_mbuf, buf_addr));
+	emit_ldr(ctx, EBPF_DW, r0, r6, r0);
+	emit_add(ctx, 1, r0, tmp2);
+	emit_add(ctx, 1, r0, tmp1);
+
+	emit_b(ctx, (int32_t)(ofs[LDMB_FIN_OFS] - ctx->idx));
+}
+
+/*
+ * Helper for emit_ld_mbuf(): slow path.
+ * R0 = __rte_pktmbuf_read(mbuf, off, sz, buf); return 0 if NULL.
+ * The scratch buffer is the space reserved by __rte_bpf_validate() at the
+ * bottom of the eBPF stack frame, i.e. (frame_pointer - stack_ofs).
+ */
+static void
+emit_ldmb_slow_path(struct a64_jit_ctx *ctx, uint32_t sz, uint32_t stack_ofs)
+{
+	uint8_t r0 = ebpf_to_a64_reg(ctx, EBPF_REG_0);
+	uint8_t r6 = ebpf_to_a64_reg(ctx, EBPF_REG_6);
+	uint8_t fp = ebpf_to_a64_reg(ctx, EBPF_FP);
+	uint8_t tmp1 = ebpf_to_a64_reg(ctx, TMP_REG_1);
+	uint8_t tmp2 = ebpf_to_a64_reg(ctx, TMP_REG_2);
+
+	/*
+	 * __rte_pktmbuf_read() takes a uint32_t off, so a 64-bit off that does
+	 * not fit would be silently truncated.  Return 0 if it is out of range;
+	 * this also catches the negative off routed here by the fast path.
+	 */
+	emit_mov_imm(ctx, 1, tmp2, UINT32_MAX);
+	emit_cmp(ctx, 1, tmp1, tmp2);
+	emit_b_cond(ctx, A64_LS, 3);		/* off <= UINT32_MAX: do the call */
+	emit_mov_imm(ctx, 1, r0, 0);
+	emit_b(ctx, (ctx->program_start + ctx->program_sz) - ctx->idx);
+
+	/* arguments of __rte_pktmbuf_read(mbuf, off, len, buf) */
+	emit_mov_64(ctx, A64_R(1), tmp1);		/* off (held in tmp1) */
+	emit_mov_64(ctx, A64_R(0), r6);			/* mbuf */
+	emit_mov_imm(ctx, 0, A64_R(2), sz);		/* len */
+	emit_sub_imm_64(ctx, A64_R(3), fp, stack_ofs);	/* buf */
+
+	emit_call(ctx, tmp1, (void *)(uintptr_t)__rte_pktmbuf_read);
+	emit_return_zero_if_src_zero(ctx, 1, r0);
+}
+
+/*
+ * Helper for emit_ld_mbuf(): common tail.
+ * Load the value pointed to by R0 and convert from network byte order.
+ */
+static void
+emit_ldmb_fin(struct a64_jit_ctx *ctx, uint8_t opsz, uint32_t sz)
+{
+	uint8_t r0 = ebpf_to_a64_reg(ctx, EBPF_REG_0);
+
+	emit_ldr(ctx, opsz, r0, r0, A64_ZR);
+	if (opsz != BPF_B)
+		emit_be(ctx, r0, sz * 8);
+}
+
+/*
+ * Emit code for BPF_LD | BPF_ABS and BPF_LD | BPF_IND packet loads:
+ *
+ *	off = imm (+ src for BPF_IND)
+ *	if (off >= 0 && mbuf->data_len - off >= sz)	    -- fast path
+ *		ptr = mbuf->buf_addr + mbuf->data_off + off;
+ *	else						    -- slow path
+ *		if ((uint64_t)off > UINT32_MAX)
+ *			return 0;
+ *		ptr = __rte_pktmbuf_read(mbuf, off, sz, buf);
+ *		if (ptr == NULL)
+ *			return 0;
+ *	R0 = ntoh(*(size *)ptr);			    -- common tail
+ *
+ * The three blocks are sized in a dry run so the forward branches can be
+ * resolved, then emitted for real (arm64 instructions are fixed width, so
+ * the dry run reproduces the real instruction count exactly).
+ */
+static void
+emit_ld_mbuf(struct a64_jit_ctx *ctx, uint8_t op, uint8_t src, int32_t imm,
+	     uint32_t stack_ofs)
+{
+	uint8_t mode = BPF_MODE(op);
+	uint8_t opsz = BPF_SIZE(op);
+	uint32_t sz = bpf_size(opsz);
+	uint32_t ofs[LDMB_OFS_NUM];
+
+	/* seed offsets so the dry-run branches stay in range */
+	ofs[LDMB_FAST_OFS] = ofs[LDMB_SLOW_OFS] = ofs[LDMB_FIN_OFS] = ctx->idx;
+
+	/* dry run to record block offsets */
+	emit_ldmb_fast_path(ctx, src, mode, sz, imm, ofs);
+	ofs[LDMB_SLOW_OFS] = ctx->idx;
+	emit_ldmb_slow_path(ctx, sz, stack_ofs);
+	ofs[LDMB_FIN_OFS] = ctx->idx;
+	emit_ldmb_fin(ctx, opsz, sz);
+
+	/* rewind and emit for real with resolved offsets */
+	ctx->idx = ofs[LDMB_FAST_OFS];
+	emit_ldmb_fast_path(ctx, src, mode, sz, imm, ofs);
+	emit_ldmb_slow_path(ctx, sz, stack_ofs);
+	emit_ldmb_fin(ctx, opsz, sz);
+}
+
 static void
 check_program_has_call(struct a64_jit_ctx *ctx, struct rte_bpf *bpf)
 {
@@ -1145,8 +1294,17 @@ check_program_has_call(struct a64_jit_ctx *ctx, struct rte_bpf *bpf)
 		op = ins->code;
 
 		switch (op) {
-		/* Call imm */
+		/*
+		 * BPF_ABS/BPF_IND can fall through to __rte_pktmbuf_read(),
+		 * so they need the call-clobbered register layout as well.
+		 */
 		case (BPF_JMP | EBPF_CALL):
+		case (BPF_LD | BPF_ABS | BPF_B):
+		case (BPF_LD | BPF_ABS | BPF_H):
+		case (BPF_LD | BPF_ABS | BPF_W):
+		case (BPF_LD | BPF_IND | BPF_B):
+		case (BPF_LD | BPF_IND | BPF_H):
+		case (BPF_LD | BPF_IND | BPF_W):
 			ctx->foundcall = 1;
 			return;
 		}
@@ -1348,6 +1506,15 @@ emit(struct a64_jit_ctx *ctx, struct rte_bpf *bpf)
 			emit_mov_imm(ctx, 1, dst, u64);
 			i++;
 			break;
+		/* R0 = ntoh(*(size *)(mbuf data + (src) + imm)) */
+		case (BPF_LD | BPF_ABS | BPF_B):
+		case (BPF_LD | BPF_ABS | BPF_H):
+		case (BPF_LD | BPF_ABS | BPF_W):
+		case (BPF_LD | BPF_IND | BPF_B):
+		case (BPF_LD | BPF_IND | BPF_H):
+		case (BPF_LD | BPF_IND | BPF_W):
+			emit_ld_mbuf(ctx, op, src, imm, bpf->stack_sz);
+			break;
 		/* *(size *)(dst + off) = src */
 		case (BPF_STX | BPF_MEM | BPF_B):
 		case (BPF_STX | BPF_MEM | BPF_H):
-- 
2.53.0


^ permalink raw reply related

* [PATCH v6 6/9] bpf/arm64: fix offset type to allow a negative jump
From: Stephen Hemminger @ 2026-06-25 17:30 UTC (permalink / raw)
  To: dev
  Cc: Christophe Fontaine, stable, Stephen Hemminger, Marat Khalili,
	Wathsala Vithanage, Konstantin Ananyev, Jerin Jacob
In-Reply-To: <20260625173231.216074-1-stephen@networkplumber.org>

From: Christophe Fontaine <cfontain@redhat.com>

The DPDK BPF JIT standalone test test_ld_mbuf1 fails on arm64.
It does:
	r6 = r1                    // mbuf
	r0 = *(u8 *)pkt[0]         // BPF_ABS
	if ((r0 & 0xf0) == 0x40)
		goto parse
	r0 = 0
	exit                       // epilogue E0
parse:
	r0 = *(u8 *)pkt[r0 + 3]    // BPF_IND
	...
	exit

emit_return_zero_if_src_zero() returns 0 by branching to a function
epilogue. The target may be an earlier epilogue, so the branch may be
backwards; in that case the offset is negative.

The offset was stored in a uint16_t, so a negative value wrapped to a
large positive number; emit_b() then branched past the end of the
program and faulted at run time.

Fixes: 111e2a747a4f ("bpf/arm: add basic arithmetic operations")
Cc: stable@dpdk.org

Signed-off-by: Christophe Fontaine <cfontain@redhat.com>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Acked-by: Marat Khalili <marat.khalili@huawei.com>
---
 lib/bpf/bpf_jit_arm64.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/bpf/bpf_jit_arm64.c b/lib/bpf/bpf_jit_arm64.c
index 7582370062..51906c7f0d 100644
--- a/lib/bpf/bpf_jit_arm64.c
+++ b/lib/bpf/bpf_jit_arm64.c
@@ -965,10 +965,12 @@ static void
 emit_return_zero_if_src_zero(struct a64_jit_ctx *ctx, bool is64, uint8_t src)
 {
 	uint8_t r0 = ebpf_to_a64_reg(ctx, EBPF_REG_0);
-	uint16_t jump_to_epilogue;
+	int32_t jump_to_epilogue;
 
 	emit_cbnz(ctx, is64, src, 3);
 	emit_mov_imm(ctx, is64, r0, 0);
+
+	/* maybe backwards branch to earlier epilogue */
 	jump_to_epilogue = (ctx->program_start + ctx->program_sz) - ctx->idx;
 	emit_b(ctx, jump_to_epilogue);
 }
-- 
2.53.0


^ permalink raw reply related

* [PATCH v6 5/9] test/bpf: add test for large shift
From: Stephen Hemminger @ 2026-06-25 17:30 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger, Marat Khalili, Konstantin Ananyev
In-Reply-To: <20260625173231.216074-1-stephen@networkplumber.org>

There were multiple bugs with immediate values in shift instructions.
The code was not masking the shift count as required by RFC 9669.

Add new tests that cover these instructions.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Acked-by: Marat Khalili <marat.khalili@huawei.com>
---
 app/test/test_bpf.c | 59 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/app/test/test_bpf.c b/app/test/test_bpf.c
index 232e9e2a98..0e5894a532 100644
--- a/app/test/test_bpf.c
+++ b/app/test/test_bpf.c
@@ -2005,6 +2005,51 @@ test_div1_check(uint64_t rc, const void *arg)
 	return cmp_res(__func__, 0, rc, dve.out, dvt->out, sizeof(dve.out));
 }
 
+/*
+ * Shift counts are masked to the operand width (RFC 9669: 0x3f for 64-bit,
+ * 0x1f for 32-bit). Counts >= 128 also exercise the x86 imm_size() path that
+ * used to desync the stream, and the arm64 UBFM/SBFM immediate encoding.
+ */
+static const struct ebpf_insn test_shift_big_imm_prog[] = {
+	{
+		.code = (EBPF_ALU64 | EBPF_MOV | BPF_K),
+		.dst_reg = EBPF_REG_0,
+		.imm = 1
+	},
+	{
+		.code = (EBPF_ALU64 | BPF_LSH | BPF_K),
+		.dst_reg = EBPF_REG_0,
+		.imm = 191
+	},
+	{
+		.code = (EBPF_ALU64 | EBPF_ARSH | BPF_K),
+		.dst_reg = EBPF_REG_0,
+		.imm = 200
+	},
+	{
+		.code = (EBPF_ALU64 | BPF_RSH | BPF_K),
+		.dst_reg = EBPF_REG_0,
+		.imm = 130
+	},
+	{
+		.code = (BPF_JMP | EBPF_EXIT)
+	},
+};
+
+static void
+test_shift_big_imm_prepare(void *arg)
+{
+	memset(arg, 0, sizeof(struct dummy_offset));
+}
+
+static int
+test_shift_big_imm_check(uint64_t rc, const void *arg)
+{
+	uint64_t expect = 0x3FE0000000000000ULL;
+
+	return cmp_res(__func__, expect, rc, arg, arg, 0);
+}
+
 /* call test-cases */
 static const struct ebpf_insn test_call1_prog[] = {
 
@@ -3409,6 +3454,20 @@ static const struct bpf_test tests[] = {
 		.prepare = test_mul1_prepare,
 		.check_result = test_div1_check,
 	},
+	{
+		.name = "test_shift_big_imm",
+		.arg_sz = sizeof(struct dummy_offset),
+		.prm = {
+			.ins = test_shift_big_imm_prog,
+			.nb_ins = RTE_DIM(test_shift_big_imm_prog),
+			.prog_arg = {
+				.type = RTE_BPF_ARG_PTR,
+				.size = sizeof(struct dummy_offset),
+			},
+		},
+		.prepare = test_shift_big_imm_prepare,
+		.check_result = test_shift_big_imm_check,
+	},
 	{
 		.name = "test_call1",
 		.arg_sz = sizeof(struct dummy_offset),
-- 
2.53.0


^ permalink raw reply related

* [PATCH v6 4/9] bpf/arm64: mask shift count per RFC 9669
From: Stephen Hemminger @ 2026-06-25 17:30 UTC (permalink / raw)
  To: dev
  Cc: Stephen Hemminger, stable, Marat Khalili, Wathsala Vithanage,
	Konstantin Ananyev, Jerin Jacob
In-Reply-To: <20260625173231.216074-1-stephen@networkplumber.org>

The ARM JIT was not masking the shift count as required by RFC 9669
(0x3f for 64-bit, 0x1f for 32-bit), so large immediate shift counts
overflowed the UBFM/SBFM encoding and failed the JIT. Mask the
immediate in emit_lsl/emit_lsr/emit_asr.

Fixes: 9f4469d9e83a ("bpf/arm: add logical operations")
Cc: stable@dpdk.org

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Acked-by: Marat Khalili <marat.khalili@huawei.com>
---
 lib/bpf/bpf_jit_arm64.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/lib/bpf/bpf_jit_arm64.c b/lib/bpf/bpf_jit_arm64.c
index ba7ae4d680..7582370062 100644
--- a/lib/bpf/bpf_jit_arm64.c
+++ b/lib/bpf/bpf_jit_arm64.c
@@ -545,12 +545,14 @@ emit_bitfield(struct a64_jit_ctx *ctx, bool is64, uint8_t rd, uint8_t rn,
 	emit_insn(ctx, insn, check_reg(rd) || check_reg(rn) ||
 		  check_immr_imms(is64, immr, imms));
 }
+
 static void
 emit_lsl(struct a64_jit_ctx *ctx, bool is64, uint8_t rd, uint8_t imm)
 {
 	const unsigned int width = is64 ? 64 : 32;
 	uint8_t imms, immr;
 
+	imm &= width - 1;
 	immr = (width - imm) & (width - 1);
 	imms = width - 1 - imm;
 
@@ -560,13 +562,19 @@ emit_lsl(struct a64_jit_ctx *ctx, bool is64, uint8_t rd, uint8_t imm)
 static void
 emit_lsr(struct a64_jit_ctx *ctx, bool is64, uint8_t rd, uint8_t imm)
 {
-	emit_bitfield(ctx, is64, rd, rd, imm, is64 ? 63 : 31, A64_UBFM);
+	const unsigned int width = is64 ? 64 : 32;
+
+	imm &= width - 1;
+	emit_bitfield(ctx, is64, rd, rd, imm, width - 1, A64_UBFM);
 }
 
 static void
 emit_asr(struct a64_jit_ctx *ctx, bool is64, uint8_t rd, uint8_t imm)
 {
-	emit_bitfield(ctx, is64, rd, rd, imm, is64 ? 63 : 31, A64_SBFM);
+	const unsigned int width = is64 ? 64 : 32;
+
+	imm &= width - 1;
+	emit_bitfield(ctx, is64, rd, rd, imm, width - 1, A64_SBFM);
 }
 
 #define A64_AND 0
-- 
2.53.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox