From: Stephen Hemminger <stephen@networkplumber.org>
To: dev@dpdk.org
Cc: Stephen Hemminger <stephen@networkplumber.org>
Subject: [PATCH v5 15/19] net/tap: fix Rx descriptor vs scatter segment confusion
Date: Sun, 22 Feb 2026 09:30:50 -0800 [thread overview]
Message-ID: <20260222173225.522754-16-stephen@networkplumber.org> (raw)
In-Reply-To: <20260222173225.522754-1-stephen@networkplumber.org>
The TAP PMD was reusing the nb_rx_desc queue setup parameter to control
the maximum number of scatter segments per received packet. Since the
driver reads one packet at a time from the kernel fd via readv(), it has
no descriptor ring, and nb_rx_desc as a queue depth has no meaning here.
This meant applications could inadvertently affect scatter receive
behavior by changing the queue size parameter.
Compute the required number of scatter segments from the MTU and the
mempool buffer size instead, capping at TAP_MAX_RX_SEGS (128). The
nb_rx_desc parameter is now ignored.
While here, convert the iovecs pointer-to-VLA in struct rx_queue to a
flexible array member. This eliminates the separate iovec allocation,
removes the on-stack VLA from queue setup, reduces pointer indirection
on the receive path, and simplifies all cleanup paths to a single free.
Replace the runtime sysconf(_SC_IOV_MAX) check with a compile-time
static_assert against IOV_MAX from limits.h.
This is a bug fix, but too big a change to backport to earlier
releases.
Fixes: 0781f5762cfe ("net/tap: support segmented mbufs")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
drivers/net/tap/rte_eth_tap.c | 111 ++++++++++++++++++++--------------
drivers/net/tap/rte_eth_tap.h | 4 +-
2 files changed, 66 insertions(+), 49 deletions(-)
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 75fa517ae2..f2cf7da736 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -35,6 +35,7 @@
#include <linux/if_tun.h>
#include <linux/sched.h>
#include <fcntl.h>
+#include <limits.h>
#include <tap_rss.h>
#include <rte_eth_tap.h>
@@ -69,7 +70,17 @@
static_assert(RTE_PMD_TAP_MAX_QUEUES <= RTE_MP_MAX_FD_NUM, "TAP max queues exceeds MP fd limit");
-#define TAP_IOV_DEFAULT_MAX 1024
+/*
+ * Upper bound on the number of scatter segments per received packet.
+ * Actual value is computed at queue setup from MTU and mbuf data size.
+ * One extra iovec slot is reserved for the tun_pi header, so the total
+ * iovec count passed to readv() is max_rx_segs + 1, which must not
+ * exceed IOV_MAX.
+ */
+#define TAP_MAX_RX_SEGS 128
+
+static_assert(TAP_MAX_RX_SEGS + 1 <= IOV_MAX,
+ "TAP_MAX_RX_SEGS + 1 (for tun_pi) must not exceed IOV_MAX");
#define TAP_RX_OFFLOAD (RTE_ETH_RX_OFFLOAD_SCATTER | \
RTE_ETH_RX_OFFLOAD_IPV4_CKSUM | \
@@ -444,9 +455,7 @@ pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
int len;
len = readv(process_private->fds[rxq->queue_id],
- *rxq->iovecs,
- 1 + (rxq->rxmode->offloads & RTE_ETH_RX_OFFLOAD_SCATTER ?
- rxq->nb_rx_desc : 1));
+ rxq->iovecs, 1 + rxq->max_rx_segs);
if (len < (int)sizeof(struct tun_pi))
break;
@@ -483,9 +492,9 @@ pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
new_tail->next = seg->next;
/* iovecs[0] is reserved for packet info (pi) */
- (*rxq->iovecs)[mbuf->nb_segs].iov_len =
+ rxq->iovecs[mbuf->nb_segs].iov_len =
buf->buf_len - data_off;
- (*rxq->iovecs)[mbuf->nb_segs].iov_base =
+ rxq->iovecs[mbuf->nb_segs].iov_base =
(char *)buf->buf_addr + data_off;
seg->data_len = RTE_MIN(seg->buf_len - data_off, len);
@@ -1049,7 +1058,6 @@ tap_dev_close(struct rte_eth_dev *dev)
if (rxq != NULL) {
tap_rxq_pool_free(rxq->pool);
- rte_free(rxq->iovecs);
rte_free(rxq);
dev->data->rx_queues[i] = NULL;
}
@@ -1115,7 +1123,6 @@ tap_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
process_private = rte_eth_devices[rxq->in_port].process_private;
tap_rxq_pool_free(rxq->pool);
- rte_free(rxq->iovecs);
if (dev->data->tx_queues[qid] == NULL)
tap_queue_close(process_private, qid);
@@ -1495,7 +1502,7 @@ tap_setup_queue(struct rte_eth_dev *dev,
static int
tap_rx_queue_setup(struct rte_eth_dev *dev,
uint16_t rx_queue_id,
- uint16_t nb_rx_desc,
+ uint16_t nb_rx_desc __rte_unused,
unsigned int socket_id,
const struct rte_eth_rxconf *rx_conf __rte_unused,
struct rte_mempool *mp)
@@ -1504,30 +1511,51 @@ tap_rx_queue_setup(struct rte_eth_dev *dev,
struct pmd_process_private *process_private = dev->process_private;
struct rx_queue *rxq;
struct rte_mbuf **tmp;
- long iov_max = sysconf(_SC_IOV_MAX);
+ uint16_t max_rx_segs;
+ const uint32_t max_rx_pktlen = dev->data->mtu + RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN;
+ const uint16_t seg_size = rte_pktmbuf_data_room_size(mp);
+ uint16_t data_off = RTE_PKTMBUF_HEADROOM;
+ int ret = 0;
- if (iov_max <= 0) {
- TAP_LOG(WARNING,
- "_SC_IOV_MAX is not defined. Using %d as default",
- TAP_IOV_DEFAULT_MAX);
- iov_max = TAP_IOV_DEFAULT_MAX;
+ /*
+ * The nb_rx_desc parameter is ignored: the TAP PMD reads one packet
+ * at a time from the kernel fd, so there is no descriptor ring.
+ *
+ * Compute the number of scatter segments needed to receive the
+ * largest possible packet (MTU + L2 overhead). The first segment
+ * has headroom reserved, subsequent segments use the full data area.
+ */
+ if (seg_size <= RTE_PKTMBUF_HEADROOM) {
+ TAP_LOG(ERR, "%s: mbuf pool has no usable data room", dev->device->name);
+ return -EINVAL;
}
- uint16_t nb_desc = RTE_MIN(nb_rx_desc, iov_max - 1);
- struct iovec (*iovecs)[nb_desc + 1];
- int data_off = RTE_PKTMBUF_HEADROOM;
- int ret = 0;
- int fd;
- int i;
- if (rx_queue_id >= dev->data->nb_rx_queues || !mp) {
+ const uint16_t first_seg_size = seg_size - RTE_PKTMBUF_HEADROOM;
+ if (max_rx_pktlen > first_seg_size)
+ max_rx_segs = 1 + (max_rx_pktlen - first_seg_size + seg_size - 1) /
+ seg_size;
+ else
+ max_rx_segs = 1;
+
+ if (max_rx_segs > TAP_MAX_RX_SEGS) {
+ TAP_LOG(ERR,
+ "%s: MTU %u requires %u scatter segments, max is %u",
+ dev->device->name, dev->data->mtu,
+ max_rx_segs, TAP_MAX_RX_SEGS);
+ return -EINVAL;
+ }
+
+ if (max_rx_segs > 1 &&
+ !(dev->data->dev_conf.rxmode.offloads & RTE_ETH_RX_OFFLOAD_SCATTER)) {
+ /* non-fatal since applications might be doing it wrong now */
TAP_LOG(WARNING,
- "nb_rx_queues %d too small or mempool NULL",
- dev->data->nb_rx_queues);
- return -1;
+ "%s: MTU %u requires %u scatter segments, but offload not set",
+ dev->device->name, dev->data->mtu, max_rx_segs);
}
- rxq = rte_zmalloc_socket(dev->device->name, sizeof(*rxq), 0,
- socket_id);
+ rxq = rte_zmalloc_socket(dev->device->name,
+ sizeof(*rxq) + (max_rx_segs + 1) * sizeof(struct iovec),
+ RTE_CACHE_LINE_SIZE, socket_id);
if (!rxq) {
TAP_LOG(ERR,
"%s: Couldn't allocate rx queue structure",
@@ -1540,30 +1568,20 @@ tap_rx_queue_setup(struct rte_eth_dev *dev,
rxq->trigger_seen = 1; /* force initial burst */
rxq->in_port = dev->data->port_id;
rxq->queue_id = rx_queue_id;
- rxq->nb_rx_desc = nb_desc;
+ rxq->max_rx_segs = max_rx_segs;
rxq->rxmode = &dev->data->dev_conf.rxmode;
- iovecs = rte_zmalloc_socket(dev->device->name, sizeof(*iovecs), 0,
- socket_id);
- if (!iovecs) {
- TAP_LOG(WARNING,
- "%s: Couldn't allocate %d RX descriptors",
- dev->device->name, nb_desc);
- rte_free(rxq);
- return -ENOMEM;
- }
- rxq->iovecs = iovecs;
dev->data->rx_queues[rx_queue_id] = rxq;
- fd = tap_setup_queue(dev, rx_queue_id, 1);
+ int fd = tap_setup_queue(dev, rx_queue_id, 1);
if (fd == -1) {
ret = fd;
goto error;
}
- (*rxq->iovecs)[0].iov_len = sizeof(struct tun_pi);
- (*rxq->iovecs)[0].iov_base = &rxq->pi;
+ rxq->iovecs[0].iov_len = sizeof(struct tun_pi);
+ rxq->iovecs[0].iov_base = &rxq->pi;
- for (i = 1; i <= nb_desc; i++) {
+ for (uint16_t i = 1; i <= max_rx_segs; i++) {
*tmp = rte_pktmbuf_alloc(rxq->mp);
if (!*tmp) {
TAP_LOG(WARNING,
@@ -1572,8 +1590,8 @@ tap_rx_queue_setup(struct rte_eth_dev *dev,
ret = -ENOMEM;
goto error;
}
- (*rxq->iovecs)[i].iov_len = (*tmp)->buf_len - data_off;
- (*rxq->iovecs)[i].iov_base =
+ rxq->iovecs[i].iov_len = (*tmp)->buf_len - data_off;
+ rxq->iovecs[i].iov_base =
(char *)(*tmp)->buf_addr + data_off;
data_off = 0;
tmp = &(*tmp)->next;
@@ -1592,7 +1610,6 @@ tap_rx_queue_setup(struct rte_eth_dev *dev,
error:
tap_rxq_pool_free(rxq->pool);
- rte_free(rxq->iovecs);
rte_free(rxq);
dev->data->rx_queues[rx_queue_id] = NULL;
return ret;
@@ -1614,8 +1631,8 @@ tap_tx_queue_setup(struct rte_eth_dev *dev,
if (tx_queue_id >= dev->data->nb_tx_queues)
return -1;
- txq = rte_zmalloc_socket(dev->device->name, sizeof(*txq), 0,
- socket_id);
+ txq = rte_zmalloc_socket(dev->device->name, sizeof(*txq),
+ RTE_CACHE_LINE_SIZE, socket_id);
if (!txq) {
TAP_LOG(ERR,
"%s: Couldn't allocate tx queue structure",
diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
index 365d5a5fe1..78182b1185 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -49,11 +49,11 @@ struct rx_queue {
uint16_t in_port; /* Port ID */
uint16_t queue_id; /* queue ID*/
struct pkt_stats stats; /* Stats for this RX queue */
- uint16_t nb_rx_desc; /* max number of mbufs available */
+ uint16_t max_rx_segs; /* max scatter segments per packet */
struct rte_eth_rxmode *rxmode; /* RX features */
struct rte_mbuf *pool; /* mbufs pool for this queue */
- struct iovec (*iovecs)[]; /* descriptors for this queue */
struct tun_pi pi; /* packet info for iovecs */
+ struct iovec iovecs[]; /* iov[0] = pi, iov[1..max_rx_segs] = data */
};
struct tx_queue {
--
2.51.0
next prev parent reply other threads:[~2026-02-22 17:34 UTC|newest]
Thread overview: 82+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-02-15 19:52 [PATCH 00/10] net/tap: tests, cleanups, and error path fixes Stephen Hemminger
2026-02-15 19:52 ` [PATCH 01/10] test: add unit tests for TAP PMD Stephen Hemminger
2026-02-16 0:46 ` Stephen Hemminger
2026-02-15 19:52 ` [PATCH 02/10] net/tap: replace runtime speed capability with constant Stephen Hemminger
2026-02-15 19:52 ` [PATCH 03/10] net/tap: clarify TUN/TAP flag assignment Stephen Hemminger
2026-02-15 21:45 ` Morten Brørup
2026-02-16 0:47 ` Stephen Hemminger
2026-02-16 5:56 ` Morten Brørup
2026-02-15 19:52 ` [PATCH 04/10] net/tap: extend fixed MAC range to 16 bits Stephen Hemminger
2026-02-15 19:52 ` [PATCH 05/10] net/tap: skip checksum on truncated L4 headers Stephen Hemminger
2026-02-15 19:52 ` [PATCH 06/10] net/tap: fix resource leaks in tap create error path Stephen Hemminger
2026-02-15 19:52 ` [PATCH 07/10] net/tap: fix resource leaks in secondary process probe Stephen Hemminger
2026-02-15 19:52 ` [PATCH 08/10] net/tap: free IPC reply buffer on queue count mismatch Stephen Hemminger
2026-02-15 19:52 ` [PATCH 09/10] net/tap: fix use-after-free on remote flow creation failure Stephen Hemminger
2026-02-15 19:52 ` [PATCH 10/10] net/tap: free remote flow when implicit rule already exists Stephen Hemminger
2026-02-16 23:02 ` [PATCH v2 00/11] net/tap: test, cleanups and error path fixes Stephen Hemminger
2026-02-16 23:02 ` [PATCH v2 01/11] test: add unit tests for TAP PMD Stephen Hemminger
2026-02-16 23:02 ` [PATCH v2 02/11] net/tap: replace runtime speed capability with constant Stephen Hemminger
2026-02-16 23:02 ` [PATCH v2 03/11] net/tap: clarify TUN/TAP flag assignment Stephen Hemminger
2026-02-16 23:02 ` [PATCH v2 04/11] net/tap: extend fixed MAC range to 16 bits Stephen Hemminger
2026-02-16 23:02 ` [PATCH v2 05/11] net/tap: skip checksum on truncated L4 headers Stephen Hemminger
2026-02-16 23:02 ` [PATCH v2 06/11] net/tap: fix resource leaks in tap create error path Stephen Hemminger
2026-02-16 23:02 ` [PATCH v2 07/11] net/tap: fix resource leaks in secondary process probe Stephen Hemminger
2026-02-16 23:02 ` [PATCH v2 08/11] net/tap: free IPC reply buffer on queue count mismatch Stephen Hemminger
2026-02-16 23:02 ` [PATCH v2 09/11] net/tap: fix use-after-free on remote flow creation failure Stephen Hemminger
2026-02-16 23:02 ` [PATCH v2 10/11] net/tap: free remote flow when implicit rule already exists Stephen Hemminger
2026-02-16 23:02 ` [PATCH v2 11/11] net/tap: track device by ifindex instead of name Stephen Hemminger
2026-02-17 1:28 ` Stephen Hemminger
2026-02-17 15:04 ` [RFT 0/4] net/mlx5: fix several correctness bugs Stephen Hemminger
2026-02-17 15:04 ` [RFT 1/4] net/mlx5: fix NULL dereference in Tx queue start Stephen Hemminger
2026-02-26 9:55 ` Dariusz Sosnowski
2026-02-17 15:05 ` [RFT 2/4] net/mlx5: fix counter leak in hairpin queue setup Stephen Hemminger
2026-02-26 9:57 ` Dariusz Sosnowski
2026-02-17 15:05 ` [RFT 3/4] net/mlx5: fix use-after-free in ASO management init Stephen Hemminger
2026-02-26 9:57 ` Dariusz Sosnowski
2026-02-17 15:05 ` [RFT 4/4] net/mlx5: fix counter truncation in queue counter read Stephen Hemminger
2026-02-26 9:58 ` Dariusz Sosnowski
2026-03-01 10:33 ` [RFT 0/4] net/mlx5: fix several correctness bugs Raslan Darawsheh
2026-02-20 5:02 ` [PATCH v3 00/10] net/tap: bug fixes and add test Stephen Hemminger
2026-02-20 5:02 ` [PATCH v3 01/10] test: add unit tests for TAP PMD Stephen Hemminger
2026-02-20 5:02 ` [PATCH v3 02/10] net/tap: replace runtime speed capability with constant Stephen Hemminger
2026-02-20 5:02 ` [PATCH v3 03/10] net/tap: clarify TUN/TAP flag assignment Stephen Hemminger
2026-02-20 5:02 ` [PATCH v3 04/10] net/tap: extend fixed MAC range to 16 bits Stephen Hemminger
2026-02-20 5:02 ` [PATCH v3 05/10] net/tap: skip checksum on truncated L4 headers Stephen Hemminger
2026-02-20 5:02 ` [PATCH v3 06/10] net/tap: fix resource leaks in tap create error path Stephen Hemminger
2026-02-20 5:02 ` [PATCH v3 07/10] net/tap: fix resource leaks in secondary process probe Stephen Hemminger
2026-02-20 5:02 ` [PATCH v3 08/10] net/tap: free IPC reply buffer on queue count mismatch Stephen Hemminger
2026-02-20 5:02 ` [PATCH v3 09/10] net/tap: fix use-after-free on remote flow creation failure Stephen Hemminger
2026-02-20 5:02 ` [PATCH v3 10/10] net/tap: free remote flow when implicit rule already exists Stephen Hemminger
2026-02-20 17:02 ` [PATCH 00/10] net/tap: cleanups and bug fixes Stephen Hemminger
2026-02-20 17:02 ` [PATCH v4 01/10] net/tap: replace runtime speed capability with constant Stephen Hemminger
2026-02-20 17:02 ` [PATCH v4 02/10] net/tap: clarify TUN/TAP flag assignment Stephen Hemminger
2026-02-20 17:02 ` [PATCH v4 03/10] net/tap: extend fixed MAC range to 16 bits Stephen Hemminger
2026-02-20 17:02 ` [PATCH v4 04/10] net/tap: skip checksum on truncated L4 headers Stephen Hemminger
2026-02-20 17:02 ` [PATCH v4 05/10] net/tap: fix resource leaks in tap create error path Stephen Hemminger
2026-02-20 17:02 ` [PATCH v4 06/10] net/tap: fix resource leaks in secondary process probe Stephen Hemminger
2026-02-20 17:02 ` [PATCH v4 07/10] net/tap: free IPC reply buffer on queue count mismatch Stephen Hemminger
2026-02-20 17:02 ` [PATCH v4 08/10] net/tap: fix use-after-free on remote flow creation failure Stephen Hemminger
2026-02-20 17:02 ` [PATCH v4 09/10] net/tap: free remote flow when implicit rule already exists Stephen Hemminger
2026-02-20 17:02 ` [PATCH v4 10/10] test: add unit tests for TAP PMD Stephen Hemminger
2026-02-22 17:30 ` [PATCH v5 00/19] net/tap: cleanups, bug fixes, and VLA removal Stephen Hemminger
2026-02-22 17:30 ` [PATCH v5 01/19] net/tap: fix handling of queue stats Stephen Hemminger
2026-02-22 17:30 ` [PATCH v5 02/19] doc: update tap features Stephen Hemminger
2026-02-22 17:30 ` [PATCH v5 03/19] net/tap: use correct length for interface names Stephen Hemminger
2026-02-22 17:30 ` [PATCH v5 04/19] net/tap: replace runtime speed capability with constant Stephen Hemminger
2026-02-22 17:30 ` [PATCH v5 05/19] net/tap: clarify TUN/TAP flag assignment Stephen Hemminger
2026-02-22 17:30 ` [PATCH v5 06/19] net/tap: extend fixed MAC range to 16 bits Stephen Hemminger
2026-02-22 17:30 ` [PATCH v5 07/19] net/tap: skip checksum on truncated L4 headers Stephen Hemminger
2026-02-22 17:30 ` [PATCH v5 08/19] net/tap: fix resource leaks in tap create error path Stephen Hemminger
2026-02-22 17:30 ` [PATCH v5 09/19] net/tap: fix resource leaks in secondary process probe Stephen Hemminger
2026-02-22 17:30 ` [PATCH v5 10/19] net/tap: free IPC reply buffer on queue count mismatch Stephen Hemminger
2026-02-22 17:30 ` [PATCH v5 11/19] net/tap: fix use-after-free on remote flow creation failure Stephen Hemminger
2026-02-22 17:30 ` [PATCH v5 12/19] net/tap: free remote flow when implicit rule already exists Stephen Hemminger
2026-02-22 17:30 ` [PATCH v5 13/19] net/tap: dynamically allocate queue structures Stephen Hemminger
2026-02-22 17:30 ` [PATCH v5 14/19] net/tap: remove VLA in flow item validation Stephen Hemminger
2026-02-22 17:30 ` Stephen Hemminger [this message]
2026-02-22 17:30 ` [PATCH v5 16/19] net/tap: replace use of VLA in transmit burst Stephen Hemminger
2026-02-22 17:30 ` [PATCH v5 17/19] net/tap: consolidate queue statistics Stephen Hemminger
2026-02-22 17:30 ` [PATCH v5 18/19] net/tap: enable VLA warnings Stephen Hemminger
2026-02-22 17:30 ` [PATCH v5 19/19] test: add unit tests for TAP PMD Stephen Hemminger
2026-03-16 8:03 ` David Marchand
2026-02-25 23:18 ` [PATCH v5 00/19] net/tap: cleanups, bug fixes, and VLA removal Stephen Hemminger
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260222173225.522754-16-stephen@networkplumber.org \
--to=stephen@networkplumber.org \
--cc=dev@dpdk.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox