* [PATCH v4] r8169: migrate Rx path to page_pool, prepare for XDP
@ 2026-07-04 8:06 Atharva Potdar
0 siblings, 0 replies; only message in thread
From: Atharva Potdar @ 2026-07-04 8:06 UTC (permalink / raw)
To: Heiner Kallweit, nic_swsd, Andrew Lunn, David S . Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni
Cc: Francois Romieu, netdev, atharvapotdar07
Migrate the Rx path to the page_pool API. On MACs newer than
RTL_GIGA_MAC_VER_06, this replaces the alloc_pages() +
skb_copy_to_linear_data() model with napi_build_skb(), giving
zero-copy Rx delivery and laying the groundwork for XDP support.
These MACs are initialized with XDP_PACKET_HEADROOM reserved at the
start of each buffer. This space is needed for skb_push() during
bridged or routed workloads; without it, such workloads fall back to
pskb_expand_head() reallocations on every packet.
MACs up to and including RTL_GIGA_MAC_VER_06 (the original PCI
RTL8169/RTL8110 generation) are kept on the existing copy-based path,
with headroom set to 0. This is a conservative choice rather than a
response to a known, version-specific erratum: this driver's oldest
supported silicon predates the DMA-mapped shared-page model that
zero-copy Rx relies on, and there is comparatively little test
coverage for it.
The pool's allocation order is fixed at get_order(R8169_RX_BUF_SIZE) on
all MACs (order-2, i.e. SZ_16K, with 4 KiB pages), matching the existing
R8169_RX_BUF_SIZE, so this migration introduces no change in
per-descriptor allocation size or jumbo-frame behavior relative to the
current tree.
The Rx consumption loop (cur_rx) is decoupled from the buffer refill
loop (dirty_rx). A page is only replaced once napi_build_skb() has
taken ownership of it; under allocation failure, or on the copy path
where the page is reused as-is, the descriptor is left in place for
the NIC to reuse on the next pass. This avoids Tx watchdog timeouts
under memory pressure that the previous unconditional-realloc model
was prone to.
DMA mapping and sync-for-device are handled by page_pool via
PP_FLAG_DMA_MAP and PP_FLAG_DMA_SYNC_DEV; the sync-for-CPU before a
received frame is read remains an explicit dma_sync_single_for_cpu()
call in rtl_rx(). The explicit dma_sync_single_for_device() calls on
the copy and failed-build_skb paths are still required: those pages are
returned to the ring directly rather than through page_pool_put_page(),
so the pool's automatic put-time sync is never invoked for them.
Signed-off-by: Atharva Potdar <atharvapotdar07@gmail.com>
---
Testing:
- Rx path: Verified napi_build_skb() utilization via eBPF. 0 fallback allocations observed at 1Gbps line rate.
- XDP headroom: Confirmed 256B reservation via bpf_xdp_adjust_head(-14). No pskb_expand_head() calls triggered during routing.
- Error path: Injected order-2 allocation failures in the refill loop. Dropped packets as expected without triggering Tx watchdog timeouts.
- Tx ring: Toggled TSO/GSO/SG via ethtool under bidirectional TCP load. No ring stalls or watchdog timeouts.
- skmem: Ran 9k jumbo frame TCP streams. nstat showed no TcpExtTCPRcvQDrop or OfoPrune regressions.
v4:
- Bifurcated headroom by MAC version (0 for legacy, 256 for modern).
Older MACs (<= RTL_GIGA_MAC_VER_06) are kept on the copy path as a
conservative choice given limited test coverage on that hardware,
not in response to a known version-specific erratum.
- Locked pool to order-2 allocations to prevent memory fragmentation
and Tx watchdog timeouts caused by order-3 churn.
- Fixed memory leak in error path by retaining pages for hardware reuse.
- Fixed prefetch to target the start of the payload (buffer base +
rx_headroom) instead of the buffer base.
v3:
- Added XDP_PACKET_HEADROOM to prevent pskb_expand_head() routing overhead.
- Decoupled Rx consumption (cur_rx) from buffer refill (dirty_rx).
- Registered xdp_rxq_info mem model at pool creation time.
v2:
- Reverted buffer size to SZ_16K to prevent MTU regression.
- Replaced skb_add_rx_frag() with napi_build_skb().
v1:
- Initial page_pool migration.
drivers/net/ethernet/realtek/r8169_main.c | 141 ++++++++++++++++++----
1 file changed, 115 insertions(+), 26 deletions(-)
diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index ec4fc21fa..5b13880c7 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -9,6 +9,7 @@
* See MAINTAINERS file for support contact information.
*/
+#include <linux/if_link.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/netdevice.h>
@@ -31,7 +32,9 @@
#include <linux/unaligned.h>
#include <net/ip6_checksum.h>
#include <net/netdev_queues.h>
+#include <net/page_pool/helpers.h>
#include <net/phy/realtek_phy.h>
+#include <net/xdp.h>
#include "r8169.h"
#include "r8169_firmware.h"
@@ -729,6 +732,9 @@ enum rtl_dash_type {
};
struct rtl8169_private {
+ struct page_pool *rx_pool;
+ struct xdp_rxq_info xdp_rxq;
+ u32 rx_headroom;
void __iomem *mmio_addr; /* memory map physical address */
struct pci_dev *pci_dev;
struct net_device *dev;
@@ -739,6 +745,7 @@ struct rtl8169_private {
u32 cur_rx; /* Index into the Rx descriptor buffer of next Rx pkt. */
u32 cur_tx; /* Index into the Tx descriptor buffer of next Rx pkt. */
u32 dirty_tx;
+ u32 dirty_rx;
struct TxDesc *TxDescArray; /* 256-aligned Tx descriptor ring */
struct RxDesc *RxDescArray; /* 256-aligned Rx descriptor ring */
dma_addr_t TxPhyAddr;
@@ -2622,6 +2629,7 @@ static void rtl_init_rxcfg(struct rtl8169_private *tp)
static void rtl8169_init_ring_indexes(struct rtl8169_private *tp)
{
tp->dirty_tx = tp->cur_tx = tp->cur_rx = 0;
+ tp->dirty_rx = 0;
}
static void rtl_jumbo_config(struct rtl8169_private *tp)
@@ -4161,21 +4169,14 @@ static void rtl8169_mark_to_asic(struct RxDesc *desc)
static struct page *rtl8169_alloc_rx_data(struct rtl8169_private *tp,
struct RxDesc *desc)
{
- struct device *d = tp_to_dev(tp);
- int node = dev_to_node(d);
dma_addr_t mapping;
struct page *data;
- data = alloc_pages_node(node, GFP_KERNEL, get_order(R8169_RX_BUF_SIZE));
+ data = page_pool_dev_alloc_pages(tp->rx_pool);
if (!data)
return NULL;
- mapping = dma_map_page(d, data, 0, R8169_RX_BUF_SIZE, DMA_FROM_DEVICE);
- if (unlikely(dma_mapping_error(d, mapping))) {
- netdev_err(tp->dev, "Failed to map RX DMA!\n");
- __free_pages(data, get_order(R8169_RX_BUF_SIZE));
- return NULL;
- }
+ mapping = page_pool_get_dma_addr(data) + tp->rx_headroom;
desc->addr = cpu_to_le64(mapping);
rtl8169_mark_to_asic(desc);
@@ -4188,14 +4189,18 @@ static void rtl8169_rx_clear(struct rtl8169_private *tp)
int i;
for (i = 0; i < NUM_RX_DESC && tp->Rx_databuff[i]; i++) {
- dma_unmap_page(tp_to_dev(tp),
- le64_to_cpu(tp->RxDescArray[i].addr),
- R8169_RX_BUF_SIZE, DMA_FROM_DEVICE);
- __free_pages(tp->Rx_databuff[i], get_order(R8169_RX_BUF_SIZE));
+ page_pool_put_full_page(tp->rx_pool, tp->Rx_databuff[i], false);
tp->Rx_databuff[i] = NULL;
tp->RxDescArray[i].addr = 0;
tp->RxDescArray[i].opts1 = 0;
}
+
+ if (tp->rx_pool) {
+ if (xdp_rxq_info_is_reg(&tp->xdp_rxq))
+ xdp_rxq_info_unreg(&tp->xdp_rxq);
+ page_pool_destroy(tp->rx_pool);
+ tp->rx_pool = NULL;
+ }
}
static int rtl8169_rx_fill(struct rtl8169_private *tp)
@@ -4221,12 +4226,52 @@ static int rtl8169_rx_fill(struct rtl8169_private *tp)
static int rtl8169_init_ring(struct rtl8169_private *tp)
{
+ struct page_pool_params params = {0};
+ int err;
+
rtl8169_init_ring_indexes(tp);
+ if (tp->mac_version <= RTL_GIGA_MAC_VER_06)
+ tp->rx_headroom = 0;
+ else
+ tp->rx_headroom = XDP_PACKET_HEADROOM;
+
+ params.order = get_order(R8169_RX_BUF_SIZE);
+ params.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
+ params.pool_size = NUM_RX_DESC;
+ params.nid = dev_to_node(tp_to_dev(tp));
+ params.dev = tp_to_dev(tp);
+ params.napi = &tp->napi;
+ params.dma_dir = DMA_FROM_DEVICE;
+ params.offset = tp->rx_headroom;
+ params.max_len = R8169_RX_BUF_SIZE - tp->rx_headroom;
+
+ tp->rx_pool = page_pool_create(&params);
+ if (IS_ERR(tp->rx_pool)) {
+ err = PTR_ERR(tp->rx_pool);
+ tp->rx_pool = NULL;
+ return err;
+ }
+
+ err = xdp_rxq_info_reg(&tp->xdp_rxq, tp->dev, 0, tp->napi.napi_id);
+ if (err)
+ goto err_free_pool;
+
+ err = xdp_rxq_info_reg_mem_model(&tp->xdp_rxq, MEM_TYPE_PAGE_POOL, tp->rx_pool);
+ if (err)
+ goto err_unreg_rxq;
+
memset(tp->tx_skb, 0, sizeof(tp->tx_skb));
memset(tp->Rx_databuff, 0, sizeof(tp->Rx_databuff));
return rtl8169_rx_fill(tp);
+
+err_unreg_rxq:
+ xdp_rxq_info_unreg(&tp->xdp_rxq);
+err_free_pool:
+ page_pool_destroy(tp->rx_pool);
+ tp->rx_pool = NULL;
+ return err;
}
static void rtl8169_unmap_tx_skb(struct rtl8169_private *tp, unsigned int entry)
@@ -4768,16 +4813,39 @@ static inline void rtl8169_rx_csum(struct sk_buff *skb, u32 opts1)
skb_checksum_none_assert(skb);
}
+static void rtl8169_rx_refill(struct rtl8169_private *tp)
+{
+ while (tp->dirty_rx != tp->cur_rx) {
+ unsigned int entry = tp->dirty_rx % NUM_RX_DESC;
+ struct RxDesc *desc = tp->RxDescArray + entry;
+
+ if (!tp->Rx_databuff[entry]) {
+ struct page *new_page = page_pool_dev_alloc_pages(tp->rx_pool);
+
+ if (unlikely(!new_page))
+ break;
+
+ tp->Rx_databuff[entry] = new_page;
+
+ desc->addr = cpu_to_le64(page_pool_get_dma_addr(new_page) +
+ tp->rx_headroom);
+ }
+ rtl8169_mark_to_asic(desc);
+
+ tp->dirty_rx++;
+ }
+}
+
static int rtl_rx(struct net_device *dev, struct rtl8169_private *tp, int budget)
{
struct device *d = tp_to_dev(tp);
int count;
- for (count = 0; count < budget; count++, tp->cur_rx++) {
+ for (count = 0; count < budget;) {
unsigned int pkt_size, entry = tp->cur_rx % NUM_RX_DESC;
struct RxDesc *desc = tp->RxDescArray + entry;
struct sk_buff *skb;
- const void *rx_buf;
+ void *rx_buf;
dma_addr_t addr;
u32 status;
@@ -4820,21 +4888,39 @@ static int rtl_rx(struct net_device *dev, struct rtl8169_private *tp, int budget
goto release_descriptor;
}
- skb = napi_alloc_skb(&tp->napi, pkt_size);
- if (unlikely(!skb)) {
- dev->stats.rx_dropped++;
- goto release_descriptor;
- }
+ if (unlikely(!tp->Rx_databuff[entry]))
+ break;
addr = le64_to_cpu(desc->addr);
rx_buf = page_address(tp->Rx_databuff[entry]);
dma_sync_single_for_cpu(d, addr, pkt_size, DMA_FROM_DEVICE);
- prefetch(rx_buf);
- skb_copy_to_linear_data(skb, rx_buf, pkt_size);
- skb->tail += pkt_size;
- skb->len = pkt_size;
- dma_sync_single_for_device(d, addr, pkt_size, DMA_FROM_DEVICE);
+ prefetch(rx_buf + tp->rx_headroom);
+
+ if (unlikely(tp->rx_headroom == 0)) {
+ skb = napi_alloc_skb(&tp->napi, pkt_size);
+ if (likely(skb)) {
+ skb_copy_to_linear_data(skb, rx_buf, pkt_size);
+ skb_put(skb, pkt_size);
+ }
+ dma_sync_single_for_device(d, addr, pkt_size, DMA_FROM_DEVICE);
+ } else {
+ skb = napi_build_skb(rx_buf, R8169_RX_BUF_SIZE);
+ if (likely(skb)) {
+ skb_reserve(skb, tp->rx_headroom);
+ skb_put(skb, pkt_size);
+ skb_mark_for_recycle(skb);
+
+ tp->Rx_databuff[entry] = NULL;
+ } else {
+ dma_sync_single_for_device(d, addr, pkt_size, DMA_FROM_DEVICE);
+ }
+ }
+
+ if (unlikely(!skb)) {
+ dev->stats.rx_dropped++;
+ goto release_descriptor;
+ }
rtl8169_rx_csum(skb, status);
skb->protocol = eth_type_trans(skb, dev);
@@ -4848,9 +4934,12 @@ static int rtl_rx(struct net_device *dev, struct rtl8169_private *tp, int budget
dev_sw_netstats_rx_add(dev, pkt_size);
release_descriptor:
- rtl8169_mark_to_asic(desc);
+ tp->cur_rx++;
+ count++;
}
+ rtl8169_rx_refill(tp);
+
return count;
}
--
2.55.0
^ permalink raw reply related [flat|nested] only message in thread
only message in thread, other threads:[~2026-07-04 8:07 UTC | newest]
Thread overview: (only message) (download: mbox.gz follow: Atom feed)
-- links below jump to the message on this page --
2026-07-04 8:06 [PATCH v4] r8169: migrate Rx path to page_pool, prepare for XDP Atharva Potdar
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox