From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
To: Tony Nguyen <anthony.l.nguyen@intel.com>
Cc: <davem@davemloft.net>, <kuba@kernel.org>, <pabeni@redhat.com>,
<edumazet@google.com>, <netdev@vger.kernel.org>,
Sriram Yagnaraman <sriram.yagnaraman@est.tech>,
<magnus.karlsson@intel.com>, <ast@kernel.org>,
<daniel@iogearbox.net>, <hawk@kernel.org>,
<john.fastabend@gmail.com>, <bpf@vger.kernel.org>,
<kurt@linutronix.de>, <sriram.yagnaraman@ericsson.com>,
<richardcochran@gmail.com>, <benjamin.steinke@woks-audio.com>,
<bigeasy@linutronix.de>,
"Chandan Kumar Rout" <chandanx.rout@intel.com>
Subject: Re: [PATCH net-next 4/4] igb: add AF_XDP zero-copy Tx support
Date: Sat, 10 Aug 2024 16:10:06 +0200 [thread overview]
Message-ID: <Zrd0vnsU2l0OTsvj@boxer> (raw)
In-Reply-To: <20240808183556.386397-5-anthony.l.nguyen@intel.com>
On Thu, Aug 08, 2024 at 11:35:54AM -0700, Tony Nguyen wrote:
> From: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
>
> Add support for AF_XDP zero-copy transmit path.
>
> A new TX buffer type IGB_TYPE_XSK is introduced to indicate that the Tx
> frame was allocated from the xsk buff pool, so igb_clean_tx_ring and
> igb_clean_tx_irq can clean the buffers correctly based on type.
>
> igb_xmit_zc performs the actual packet transmit when AF_XDP zero-copy is
> enabled. We share the TX ring between slow path, XDP and AF_XDP
> zero-copy, so we use the netdev queue lock to ensure mutual exclusion.
>
> Signed-off-by: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
> [Kurt: Set olinfo_status in igb_xmit_zc() so that frames are transmitted]
> Signed-off-by: Kurt Kanzenbach <kurt@linutronix.de>
> Tested-by: Chandan Kumar Rout <chandanx.rout@intel.com> (A Contingent Worker at Intel)
> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
> ---
> drivers/net/ethernet/intel/igb/igb.h | 2 +
> drivers/net/ethernet/intel/igb/igb_main.c | 56 +++++++++++++++++++----
> drivers/net/ethernet/intel/igb/igb_xsk.c | 53 +++++++++++++++++++++
> 3 files changed, 102 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
> index 4983a6ec718e..9ee18ac1ba47 100644
> --- a/drivers/net/ethernet/intel/igb/igb.h
> +++ b/drivers/net/ethernet/intel/igb/igb.h
> @@ -257,6 +257,7 @@ enum igb_tx_flags {
> enum igb_tx_buf_type {
> IGB_TYPE_SKB = 0,
> IGB_TYPE_XDP,
> + IGB_TYPE_XSK
> };
>
> /* wrapper around a pointer to a socket buffer,
> @@ -836,6 +837,7 @@ int igb_xsk_pool_setup(struct igb_adapter *adapter,
> bool igb_alloc_rx_buffers_zc(struct igb_ring *rx_ring, u16 count);
> void igb_clean_rx_ring_zc(struct igb_ring *rx_ring);
> int igb_clean_rx_irq_zc(struct igb_q_vector *q_vector, const int budget);
> +bool igb_xmit_zc(struct igb_ring *tx_ring);
> int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags);
>
> #endif /* _IGB_H_ */
> diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
> index 0b779b2ca9ea..1ebd67981978 100644
> --- a/drivers/net/ethernet/intel/igb/igb_main.c
> +++ b/drivers/net/ethernet/intel/igb/igb_main.c
> @@ -2996,6 +2996,9 @@ static int igb_xdp_xmit(struct net_device *dev, int n,
> if (unlikely(!tx_ring))
> return -ENXIO;
>
> + if (unlikely(test_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags)))
> + return -ENXIO;
> +
> nq = txring_txq(tx_ring);
> __netif_tx_lock(nq, cpu);
>
> @@ -4917,15 +4920,20 @@ void igb_clean_tx_ring(struct igb_ring *tx_ring)
> {
> u16 i = tx_ring->next_to_clean;
> struct igb_tx_buffer *tx_buffer = &tx_ring->tx_buffer_info[i];
> + u32 xsk_frames = 0;
>
> while (i != tx_ring->next_to_use) {
> union e1000_adv_tx_desc *eop_desc, *tx_desc;
>
> /* Free all the Tx ring sk_buffs or xdp frames */
> - if (tx_buffer->type == IGB_TYPE_SKB)
> + if (tx_buffer->type == IGB_TYPE_SKB) {
> dev_kfree_skb_any(tx_buffer->skb);
> - else
> + } else if (tx_buffer->type == IGB_TYPE_XDP) {
> xdp_return_frame(tx_buffer->xdpf);
> + } else if (tx_buffer->type == IGB_TYPE_XSK) {
> + xsk_frames++;
> + goto skip_for_xsk;
> + }
>
> /* unmap skb header data */
> dma_unmap_single(tx_ring->dev,
> @@ -4956,6 +4964,7 @@ void igb_clean_tx_ring(struct igb_ring *tx_ring)
> DMA_TO_DEVICE);
> }
>
> +skip_for_xsk:
> tx_buffer->next_to_watch = NULL;
>
> /* move us one more past the eop_desc for start of next pkt */
> @@ -4970,6 +4979,9 @@ void igb_clean_tx_ring(struct igb_ring *tx_ring)
> /* reset BQL for queue */
> netdev_tx_reset_queue(txring_txq(tx_ring));
>
> + if (tx_ring->xsk_pool && xsk_frames)
> + xsk_tx_completed(tx_ring->xsk_pool, xsk_frames);
> +
> /* reset next_to_use and next_to_clean */
> tx_ring->next_to_use = 0;
> tx_ring->next_to_clean = 0;
> @@ -6503,6 +6515,9 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb,
> return NETDEV_TX_BUSY;
> }
>
> + if (unlikely(test_bit(IGB_RING_FLAG_TX_DISABLED, &tx_ring->flags)))
> + return NETDEV_TX_BUSY;
> +
> /* record the location of the first descriptor for this packet */
> first = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
> first->type = IGB_TYPE_SKB;
> @@ -8263,13 +8278,17 @@ static int igb_poll(struct napi_struct *napi, int budget)
> **/
> static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
> {
> - struct igb_adapter *adapter = q_vector->adapter;
> - struct igb_ring *tx_ring = q_vector->tx.ring;
> - struct igb_tx_buffer *tx_buffer;
> - union e1000_adv_tx_desc *tx_desc;
> unsigned int total_bytes = 0, total_packets = 0;
> + struct igb_adapter *adapter = q_vector->adapter;
> unsigned int budget = q_vector->tx.work_limit;
> + struct igb_ring *tx_ring = q_vector->tx.ring;
> unsigned int i = tx_ring->next_to_clean;
> + union e1000_adv_tx_desc *tx_desc;
> + struct igb_tx_buffer *tx_buffer;
> + int cpu = smp_processor_id();
> + bool xsk_xmit_done = true;
> + struct netdev_queue *nq;
> + u32 xsk_frames = 0;
>
> if (test_bit(__IGB_DOWN, &adapter->state))
> return true;
> @@ -8300,10 +8319,14 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
> total_packets += tx_buffer->gso_segs;
>
> /* free the skb */
> - if (tx_buffer->type == IGB_TYPE_SKB)
> + if (tx_buffer->type == IGB_TYPE_SKB) {
> napi_consume_skb(tx_buffer->skb, napi_budget);
> - else
> + } else if (tx_buffer->type == IGB_TYPE_XDP) {
> xdp_return_frame(tx_buffer->xdpf);
> + } else if (tx_buffer->type == IGB_TYPE_XSK) {
> + xsk_frames++;
> + goto skip_for_xsk;
> + }
>
> /* unmap skb header data */
> dma_unmap_single(tx_ring->dev,
> @@ -8335,6 +8358,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
> }
> }
>
> +skip_for_xsk:
> /* move us one more past the eop_desc for start of next pkt */
> tx_buffer++;
> tx_desc++;
> @@ -8363,6 +8387,20 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
> q_vector->tx.total_bytes += total_bytes;
> q_vector->tx.total_packets += total_packets;
>
> + if (tx_ring->xsk_pool) {
tx_ring->xsk_pool should be read with READ_ONCE() here.
> + if (xsk_frames)
> + xsk_tx_completed(tx_ring->xsk_pool, xsk_frames);
> + if (xsk_uses_need_wakeup(tx_ring->xsk_pool))
> + xsk_set_tx_need_wakeup(tx_ring->xsk_pool);
> +
> + nq = txring_txq(tx_ring);
> + __netif_tx_lock(nq, cpu);
> + /* Avoid transmit queue timeout since we share it with the slow path */
> + txq_trans_cond_update(nq);
> + xsk_xmit_done = igb_xmit_zc(tx_ring);
> + __netif_tx_unlock(nq);
> + }
> +
> if (test_bit(IGB_RING_FLAG_TX_DETECT_HANG, &tx_ring->flags)) {
> struct e1000_hw *hw = &adapter->hw;
>
> @@ -8425,7 +8463,7 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget)
> }
> }
>
> - return !!budget;
> + return !!budget && xsk_xmit_done;
> }
>
> /**
> diff --git a/drivers/net/ethernet/intel/igb/igb_xsk.c b/drivers/net/ethernet/intel/igb/igb_xsk.c
> index 66cdc30e9b6e..4e530e1eb3c0 100644
> --- a/drivers/net/ethernet/intel/igb/igb_xsk.c
> +++ b/drivers/net/ethernet/intel/igb/igb_xsk.c
> @@ -431,6 +431,59 @@ int igb_clean_rx_irq_zc(struct igb_q_vector *q_vector, const int budget)
> return failure ? budget : (int)total_packets;
> }
>
> +bool igb_xmit_zc(struct igb_ring *tx_ring)
> +{
> + unsigned int budget = igb_desc_unused(tx_ring);
> + struct xsk_buff_pool *pool = tx_ring->xsk_pool;
> + u32 cmd_type, olinfo_status, nb_pkts, i = 0;
> + struct xdp_desc *descs = pool->tx_descs;
> + union e1000_adv_tx_desc *tx_desc = NULL;
> + struct igb_tx_buffer *tx_buffer_info;
> + unsigned int total_bytes = 0;
> + dma_addr_t dma;
Shouldn't this check IGB_RING_FLAG_TX_DISABLED as well, like the checks
added to igb_xdp_xmit() and igb_xmit_frame_ring() above?
> +
> + nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget);
> + if (!nb_pkts)
> + return true;
> +
> + while (nb_pkts-- > 0) {
> + dma = xsk_buff_raw_get_dma(pool, descs[i].addr);
> + xsk_buff_raw_dma_sync_for_device(pool, dma, descs[i].len);
> +
> + tx_buffer_info = &tx_ring->tx_buffer_info[tx_ring->next_to_use];
> + tx_buffer_info->bytecount = descs[i].len;
> + tx_buffer_info->type = IGB_TYPE_XSK;
> + tx_buffer_info->xdpf = NULL;
> + tx_buffer_info->gso_segs = 1;
> + tx_buffer_info->time_stamp = jiffies;
> +
> + tx_desc = IGB_TX_DESC(tx_ring, tx_ring->next_to_use);
> + tx_desc->read.buffer_addr = cpu_to_le64(dma);
> +
> + /* put descriptor type bits */
> + cmd_type = E1000_ADVTXD_DTYP_DATA | E1000_ADVTXD_DCMD_DEXT |
> + E1000_ADVTXD_DCMD_IFCS;
> + olinfo_status = descs[i].len << E1000_ADVTXD_PAYLEN_SHIFT;
> +
> + cmd_type |= descs[i].len | IGB_TXD_DCMD;
This is also sub-optimal, as you are setting the RS bit on each Tx
descriptor, which will in turn raise a lot of irqs. See how ice sets the RS
bit only on the last desc of a batch and then, on the cleaning side, how it
finds the descriptor that is supposed to have the DD bit written by HW.
> + tx_desc->read.cmd_type_len = cpu_to_le32(cmd_type);
> + tx_desc->read.olinfo_status = cpu_to_le32(olinfo_status);
> +
> + total_bytes += descs[i].len;
> +
> + i++;
> + tx_ring->next_to_use++;
> + tx_buffer_info->next_to_watch = tx_desc;
> + if (tx_ring->next_to_use == tx_ring->count)
> + tx_ring->next_to_use = 0;
> + }
> +
> + netdev_tx_sent_queue(txring_txq(tx_ring), total_bytes);
> + igb_xdp_ring_update_tail(tx_ring);
> +
> + return nb_pkts < budget;
> +}
> +
> int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
> {
> struct igb_adapter *adapter = netdev_priv(dev);
> --
> 2.42.0
>
next prev parent reply other threads:[~2024-08-10 14:10 UTC|newest]
Thread overview: 20+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-08-08 18:35 [PATCH net-next 0/4][pull request] igb: Add support for AF_XDP zero-copy Tony Nguyen
2024-08-08 18:35 ` [PATCH net-next 1/4] igb: prepare for AF_XDP zero-copy support Tony Nguyen
2024-08-08 20:38 ` Maciej Fijalkowski
2024-08-09 13:05 ` Kurt Kanzenbach
2024-08-09 13:14 ` Fijalkowski, Maciej
2024-08-09 13:19 ` Kurt Kanzenbach
2024-08-08 18:35 ` [PATCH net-next 2/4] igb: Introduce XSK data structures and helpers Tony Nguyen
2024-08-10 13:35 ` Maciej Fijalkowski
2024-08-08 18:35 ` [PATCH net-next 3/4] igb: add AF_XDP zero-copy Rx support Tony Nguyen
2024-08-10 13:55 ` Maciej Fijalkowski
2024-08-10 14:12 ` Maciej Fijalkowski
2024-08-14 8:29 ` Kurt Kanzenbach
2024-08-14 8:55 ` Maciej Fijalkowski
2024-08-08 18:35 ` [PATCH net-next 4/4] igb: add AF_XDP zero-copy Tx support Tony Nguyen
2024-08-10 14:10 ` Maciej Fijalkowski [this message]
2024-08-14 8:36 ` Kurt Kanzenbach
2024-08-14 8:55 ` Maciej Fijalkowski
2024-08-14 9:12 ` Kurt Kanzenbach
2024-08-14 10:26 ` Maciej Fijalkowski
2024-08-14 12:51 ` Kurt Kanzenbach
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=Zrd0vnsU2l0OTsvj@boxer \
--to=maciej.fijalkowski@intel.com \
--cc=anthony.l.nguyen@intel.com \
--cc=ast@kernel.org \
--cc=benjamin.steinke@woks-audio.com \
--cc=bigeasy@linutronix.de \
--cc=bpf@vger.kernel.org \
--cc=chandanx.rout@intel.com \
--cc=daniel@iogearbox.net \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=hawk@kernel.org \
--cc=john.fastabend@gmail.com \
--cc=kuba@kernel.org \
--cc=kurt@linutronix.de \
--cc=magnus.karlsson@intel.com \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=richardcochran@gmail.com \
--cc=sriram.yagnaraman@ericsson.com \
--cc=sriram.yagnaraman@est.tech \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.