All of lore.kernel.org
 help / color / mirror / Atom feed
From: Kurt Kanzenbach <kurt@linutronix.de>
To: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Cc: Tony Nguyen <anthony.l.nguyen@intel.com>,
	Przemek Kitszel <przemyslaw.kitszel@intel.com>,
	"David S. Miller" <davem@davemloft.net>,
	Eric Dumazet <edumazet@google.com>,
	Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
	Alexei Starovoitov <ast@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	Jesper Dangaard Brouer <hawk@kernel.org>,
	John Fastabend <john.fastabend@gmail.com>,
	Richard Cochran <richardcochran@gmail.com>,
	Sriram Yagnaraman <sriram.yagnaraman@ericsson.com>,
	Benjamin Steinke <benjamin.steinke@woks-audio.com>,
	Sebastian Andrzej Siewior <bigeasy@linutronix.de>,
	intel-wired-lan@lists.osuosl.org, netdev@vger.kernel.org,
	bpf@vger.kernel.org,
	Sriram Yagnaraman <sriram.yagnaraman@est.tech>
Subject: Re: [PATCH iwl-next v7 4/5] igb: Add AF_XDP zero-copy Rx support
Date: Mon, 07 Oct 2024 15:45:47 +0200	[thread overview]
Message-ID: <87jzek3u2c.fsf@kurt.kurt.home> (raw)
In-Reply-To: <ZwPdOxJrk04D9FKn@boxer>

[-- Attachment #1: Type: text/plain, Size: 14752 bytes --]

Hi Maciej,

On Mon Oct 07 2024, Maciej Fijalkowski wrote:
> On Mon, Oct 07, 2024 at 02:31:26PM +0200, Kurt Kanzenbach wrote:
>> From: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
>> 
>> Add support for AF_XDP zero-copy receive path.
>> 
>> When AF_XDP zero-copy is enabled, the rx buffers are allocated from the
>> xsk buff pool using igb_alloc_rx_buffers_zc().
>> 
>> Use xsk_pool_get_rx_frame_size() to set SRRCTL rx buf size when zero-copy
>> is enabled.
>> 
>> Signed-off-by: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
>> [Kurt: Port to v6.10 and provide napi_id for xdp_rxq_info_reg(),
>>        RCT, remove NETDEV_XDP_ACT_XSK_ZEROCOPY, update NTC handling,
>>        move stats update and xdp finalize to common functions,
>>        READ_ONCE() xsk_pool, likelyfy for XDP_REDIRECT case]
>> Signed-off-by: Kurt Kanzenbach <kurt@linutronix.de>
>
> Hi Kurt,
>
> Sorry, but I still have comments :< see below.

No worries :)

>
>> ---
>>  drivers/net/ethernet/intel/igb/igb.h      |   8 +
>>  drivers/net/ethernet/intel/igb/igb_main.c | 132 +++++++++----
>>  drivers/net/ethernet/intel/igb/igb_xsk.c  | 296 +++++++++++++++++++++++++++++-
>>  3 files changed, 398 insertions(+), 38 deletions(-)
>> 
>> diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
>> index c30d6f9708f8..ea3977b313fc 100644
>> --- a/drivers/net/ethernet/intel/igb/igb.h
>> +++ b/drivers/net/ethernet/intel/igb/igb.h
>> @@ -88,6 +88,7 @@ struct igb_adapter;
>>  #define IGB_XDP_CONSUMED	BIT(0)
>>  #define IGB_XDP_TX		BIT(1)
>>  #define IGB_XDP_REDIR		BIT(2)
>> +#define IGB_XDP_EXIT		BIT(3)
>>  
>>  struct vf_data_storage {
>>  	unsigned char vf_mac_addresses[ETH_ALEN];
>> @@ -740,6 +741,9 @@ void igb_clean_tx_ring(struct igb_ring *tx_ring);
>>  void igb_clean_rx_ring(struct igb_ring *rx_ring);
>>  void igb_configure_tx_ring(struct igb_adapter *, struct igb_ring *);
>>  void igb_configure_rx_ring(struct igb_adapter *, struct igb_ring *);
>> +void igb_finalize_xdp(struct igb_adapter *adapter, unsigned int status);
>> +void igb_update_rx_stats(struct igb_q_vector *q_vector, unsigned int packets,
>> +			 unsigned int bytes);
>>  void igb_setup_tctl(struct igb_adapter *);
>>  void igb_setup_rctl(struct igb_adapter *);
>>  void igb_setup_srrctl(struct igb_adapter *, struct igb_ring *);
>> @@ -850,6 +854,10 @@ struct xsk_buff_pool *igb_xsk_pool(struct igb_adapter *adapter,
>>  int igb_xsk_pool_setup(struct igb_adapter *adapter,
>>  		       struct xsk_buff_pool *pool,
>>  		       u16 qid);
>> +bool igb_alloc_rx_buffers_zc(struct igb_ring *rx_ring, u16 count);
>> +void igb_clean_rx_ring_zc(struct igb_ring *rx_ring);
>> +int igb_clean_rx_irq_zc(struct igb_q_vector *q_vector,
>> +			struct xsk_buff_pool *xsk_pool, const int budget);
>>  int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags);
>>  
>>  #endif /* _IGB_H_ */
>> diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
>> index bdba5c5861be..449ee794b3c9 100644
>> --- a/drivers/net/ethernet/intel/igb/igb_main.c
>> +++ b/drivers/net/ethernet/intel/igb/igb_main.c
>> @@ -472,12 +472,17 @@ static void igb_dump(struct igb_adapter *adapter)
>>  
>>  		for (i = 0; i < rx_ring->count; i++) {
>>  			const char *next_desc;
>> -			struct igb_rx_buffer *buffer_info;
>> -			buffer_info = &rx_ring->rx_buffer_info[i];
>> +			dma_addr_t dma = (dma_addr_t)0;
>> +			struct igb_rx_buffer *buffer_info = NULL;
>>  			rx_desc = IGB_RX_DESC(rx_ring, i);
>>  			u0 = (struct my_u0 *)rx_desc;
>>  			staterr = le32_to_cpu(rx_desc->wb.upper.status_error);
>>  
>> +			if (!rx_ring->xsk_pool) {
>> +				buffer_info = &rx_ring->rx_buffer_info[i];
>> +				dma = buffer_info->dma;
>> +			}
>> +
>>  			if (i == rx_ring->next_to_use)
>>  				next_desc = " NTU";
>>  			else if (i == rx_ring->next_to_clean)
>> @@ -497,11 +502,11 @@ static void igb_dump(struct igb_adapter *adapter)
>>  					"R  ", i,
>>  					le64_to_cpu(u0->a),
>>  					le64_to_cpu(u0->b),
>> -					(u64)buffer_info->dma,
>> +					(u64)dma,
>>  					next_desc);
>>  
>>  				if (netif_msg_pktdata(adapter) &&
>> -				    buffer_info->dma && buffer_info->page) {
>> +				    buffer_info && dma && buffer_info->page) {
>>  					print_hex_dump(KERN_INFO, "",
>>  					  DUMP_PREFIX_ADDRESS,
>>  					  16, 1,
>> @@ -1983,7 +1988,10 @@ static void igb_configure(struct igb_adapter *adapter)
>>  	 */
>>  	for (i = 0; i < adapter->num_rx_queues; i++) {
>>  		struct igb_ring *ring = adapter->rx_ring[i];
>> -		igb_alloc_rx_buffers(ring, igb_desc_unused(ring));
>> +		if (ring->xsk_pool)
>> +			igb_alloc_rx_buffers_zc(ring, igb_desc_unused(ring));
>> +		else
>> +			igb_alloc_rx_buffers(ring, igb_desc_unused(ring));
>>  	}
>>  }
>>  
>> @@ -4405,7 +4413,8 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring)
>>  	if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq))
>>  		xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
>>  	res = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
>> -			       rx_ring->queue_index, 0);
>> +			       rx_ring->queue_index,
>> +			       rx_ring->q_vector->napi.napi_id);
>>  	if (res < 0) {
>>  		dev_err(dev, "Failed to register xdp_rxq index %u\n",
>>  			rx_ring->queue_index);
>> @@ -4701,12 +4710,17 @@ void igb_setup_srrctl(struct igb_adapter *adapter, struct igb_ring *ring)
>>  	struct e1000_hw *hw = &adapter->hw;
>>  	int reg_idx = ring->reg_idx;
>>  	u32 srrctl = 0;
>> +	u32 buf_size;
>>  
>> -	srrctl = IGB_RX_HDR_LEN << E1000_SRRCTL_BSIZEHDRSIZE_SHIFT;
>> -	if (ring_uses_large_buffer(ring))
>> -		srrctl |= IGB_RXBUFFER_3072 >> E1000_SRRCTL_BSIZEPKT_SHIFT;
>> +	if (ring->xsk_pool)
>> +		buf_size = xsk_pool_get_rx_frame_size(ring->xsk_pool);
>> +	else if (ring_uses_large_buffer(ring))
>> +		buf_size = IGB_RXBUFFER_3072;
>>  	else
>> -		srrctl |= IGB_RXBUFFER_2048 >> E1000_SRRCTL_BSIZEPKT_SHIFT;
>> +		buf_size = IGB_RXBUFFER_2048;
>> +
>> +	srrctl = IGB_RX_HDR_LEN << E1000_SRRCTL_BSIZEHDRSIZE_SHIFT;
>> +	srrctl |= buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT;
>>  	srrctl |= E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
>>  	if (hw->mac.type >= e1000_82580)
>>  		srrctl |= E1000_SRRCTL_TIMESTAMP;
>> @@ -4738,9 +4752,17 @@ void igb_configure_rx_ring(struct igb_adapter *adapter,
>>  	u32 rxdctl = 0;
>>  
>>  	xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
>> -	WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
>> -					   MEM_TYPE_PAGE_SHARED, NULL));
>>  	WRITE_ONCE(ring->xsk_pool, igb_xsk_pool(adapter, ring));
>> +	if (ring->xsk_pool) {
>> +		WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
>> +						   MEM_TYPE_XSK_BUFF_POOL,
>> +						   NULL));
>> +		xsk_pool_set_rxq_info(ring->xsk_pool, &ring->xdp_rxq);
>> +	} else {
>> +		WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
>> +						   MEM_TYPE_PAGE_SHARED,
>> +						   NULL));
>> +	}
>>  
>>  	/* disable the queue */
>>  	wr32(E1000_RXDCTL(reg_idx), 0);
>> @@ -4767,9 +4789,12 @@ void igb_configure_rx_ring(struct igb_adapter *adapter,
>>  	rxdctl |= IGB_RX_HTHRESH << 8;
>>  	rxdctl |= IGB_RX_WTHRESH << 16;
>>  
>> -	/* initialize rx_buffer_info */
>> -	memset(ring->rx_buffer_info, 0,
>> -	       sizeof(struct igb_rx_buffer) * ring->count);
>> +	if (ring->xsk_pool)
>> +		memset(ring->rx_buffer_info_zc, 0,
>> +		       sizeof(*ring->rx_buffer_info_zc) * ring->count);
>> +	else
>> +		memset(ring->rx_buffer_info, 0,
>> +		       sizeof(*ring->rx_buffer_info) * ring->count);
>>  
>>  	/* initialize Rx descriptor 0 */
>>  	rx_desc = IGB_RX_DESC(ring, 0);
>> @@ -4957,8 +4982,13 @@ void igb_free_rx_resources(struct igb_ring *rx_ring)
>>  
>>  	rx_ring->xdp_prog = NULL;
>>  	xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
>> -	vfree(rx_ring->rx_buffer_info);
>> -	rx_ring->rx_buffer_info = NULL;
>> +	if (rx_ring->xsk_pool) {
>> +		vfree(rx_ring->rx_buffer_info_zc);
>> +		rx_ring->rx_buffer_info_zc = NULL;
>> +	} else {
>> +		vfree(rx_ring->rx_buffer_info);
>> +		rx_ring->rx_buffer_info = NULL;
>> +	}
>>  
>>  	/* if not set, then don't free */
>>  	if (!rx_ring->desc)
>> @@ -4996,6 +5026,11 @@ void igb_clean_rx_ring(struct igb_ring *rx_ring)
>>  	dev_kfree_skb(rx_ring->skb);
>>  	rx_ring->skb = NULL;
>>  
>> +	if (rx_ring->xsk_pool) {
>> +		igb_clean_rx_ring_zc(rx_ring);
>> +		goto skip_for_xsk;
>> +	}
>> +
>>  	/* Free all the Rx ring sk_buffs */
>>  	while (i != rx_ring->next_to_alloc) {
>>  		struct igb_rx_buffer *buffer_info = &rx_ring->rx_buffer_info[i];
>> @@ -5023,6 +5058,7 @@ void igb_clean_rx_ring(struct igb_ring *rx_ring)
>>  			i = 0;
>>  	}
>>  
>> +skip_for_xsk:
>>  	rx_ring->next_to_alloc = 0;
>>  	rx_ring->next_to_clean = 0;
>>  	rx_ring->next_to_use = 0;
>> @@ -8177,6 +8213,7 @@ static int igb_poll(struct napi_struct *napi, int budget)
>>  	struct igb_q_vector *q_vector = container_of(napi,
>>  						     struct igb_q_vector,
>>  						     napi);
>> +	struct xsk_buff_pool *xsk_pool;
>>  	bool clean_complete = true;
>>  	int work_done = 0;
>>  
>> @@ -8188,7 +8225,12 @@ static int igb_poll(struct napi_struct *napi, int budget)
>>  		clean_complete = igb_clean_tx_irq(q_vector, budget);
>>  
>>  	if (q_vector->rx.ring) {
>> -		int cleaned = igb_clean_rx_irq(q_vector, budget);
>> +		int cleaned;
>> +
>> +		xsk_pool = READ_ONCE(q_vector->rx.ring->xsk_pool);
>> +		cleaned = xsk_pool ?
>> +			igb_clean_rx_irq_zc(q_vector, xsk_pool, budget) :
>> +			igb_clean_rx_irq(q_vector, budget);
>>  
>>  		work_done += cleaned;
>>  		if (cleaned >= budget)
>> @@ -8852,6 +8894,38 @@ static void igb_put_rx_buffer(struct igb_ring *rx_ring,
>>  	rx_buffer->page = NULL;
>>  }
>>  
>> +void igb_finalize_xdp(struct igb_adapter *adapter, unsigned int status)
>> +{
>> +	int cpu = smp_processor_id();
>> +	struct netdev_queue *nq;
>> +
>> +	if (status & IGB_XDP_REDIR)
>> +		xdp_do_flush();
>> +
>> +	if (status & IGB_XDP_TX) {
>> +		struct igb_ring *tx_ring = igb_xdp_tx_queue_mapping(adapter);
>> +
>> +		nq = txring_txq(tx_ring);
>> +		__netif_tx_lock(nq, cpu);
>> +		igb_xdp_ring_update_tail(tx_ring);
>> +		__netif_tx_unlock(nq);
>> +	}
>> +}
>> +
>> +void igb_update_rx_stats(struct igb_q_vector *q_vector, unsigned int packets,
>> +			 unsigned int bytes)
>> +{
>> +	struct igb_ring *ring = q_vector->rx.ring;
>> +
>> +	u64_stats_update_begin(&ring->rx_syncp);
>> +	ring->rx_stats.packets += packets;
>> +	ring->rx_stats.bytes += bytes;
>> +	u64_stats_update_end(&ring->rx_syncp);
>> +
>> +	q_vector->rx.total_packets += packets;
>> +	q_vector->rx.total_bytes += bytes;
>> +}
>> +
>>  static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
>>  {
>>  	unsigned int total_bytes = 0, total_packets = 0;
>> @@ -8859,9 +8933,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
>>  	struct igb_ring *rx_ring = q_vector->rx.ring;
>>  	u16 cleaned_count = igb_desc_unused(rx_ring);
>>  	struct sk_buff *skb = rx_ring->skb;
>> -	int cpu = smp_processor_id();
>>  	unsigned int xdp_xmit = 0;
>> -	struct netdev_queue *nq;
>>  	struct xdp_buff xdp;
>>  	u32 frame_sz = 0;
>>  	int rx_buf_pgcnt;
>> @@ -8983,24 +9055,10 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
>>  	/* place incomplete frames back on ring for completion */
>>  	rx_ring->skb = skb;
>>  
>> -	if (xdp_xmit & IGB_XDP_REDIR)
>> -		xdp_do_flush();
>> -
>> -	if (xdp_xmit & IGB_XDP_TX) {
>> -		struct igb_ring *tx_ring = igb_xdp_tx_queue_mapping(adapter);
>> -
>> -		nq = txring_txq(tx_ring);
>> -		__netif_tx_lock(nq, cpu);
>> -		igb_xdp_ring_update_tail(tx_ring);
>> -		__netif_tx_unlock(nq);
>> -	}
>> +	if (xdp_xmit)
>> +		igb_finalize_xdp(adapter, xdp_xmit);
>
> Nit: given that you will be sending a next revision, IMHO this is a
> candidate for a separate patch. Not a big deal, but it would reduce the
> noise in this one.

Yes, makes sense.

>
>>  
>> -	u64_stats_update_begin(&rx_ring->rx_syncp);
>> -	rx_ring->rx_stats.packets += total_packets;
>> -	rx_ring->rx_stats.bytes += total_bytes;
>> -	u64_stats_update_end(&rx_ring->rx_syncp);
>> -	q_vector->rx.total_packets += total_packets;
>> -	q_vector->rx.total_bytes += total_bytes;
>> +	igb_update_rx_stats(q_vector, total_packets, total_bytes);
>
> This also.
>
>>  
>>  	if (cleaned_count)
>>  		igb_alloc_rx_buffers(rx_ring, cleaned_count);
>> diff --git a/drivers/net/ethernet/intel/igb/igb_xsk.c b/drivers/net/ethernet/intel/igb/igb_xsk.c
>> index 7b632be3e7e3..9fd094a799fa 100644
>> --- a/drivers/net/ethernet/intel/igb/igb_xsk.c
>> +++ b/drivers/net/ethernet/intel/igb/igb_xsk.c
>> @@ -70,7 +70,10 @@ static void igb_txrx_ring_enable(struct igb_adapter *adapter, u16 qid)
>>  	 * at least 1 descriptor unused to make sure
>>  	 * next_to_use != next_to_clean
>>  	 */
>> -	igb_alloc_rx_buffers(rx_ring, igb_desc_unused(rx_ring));
>> +	if (rx_ring->xsk_pool)
>> +		igb_alloc_rx_buffers_zc(rx_ring, igb_desc_unused(rx_ring));
>> +	else
>> +		igb_alloc_rx_buffers(rx_ring, igb_desc_unused(rx_ring));
>>  
>>  	/* Rx/Tx share the same napi context. */
>>  	napi_enable(&rx_ring->q_vector->napi);
>> @@ -169,6 +172,297 @@ int igb_xsk_pool_setup(struct igb_adapter *adapter,
>>  		igb_xsk_pool_disable(adapter, qid);
>>  }
>>  
>> +static u16 igb_fill_rx_descs(struct xsk_buff_pool *pool, struct xdp_buff **xdp,
>> +			     union e1000_adv_rx_desc *rx_desc, u16 count)
>> +{
>> +	dma_addr_t dma;
>> +	u16 buffs;
>> +	int i;
>> +
>> +	/* nothing to do */
>> +	if (!count)
>> +		return 0;
>> +
>> +	buffs = xsk_buff_alloc_batch(pool, xdp, count);
>> +	for (i = 0; i < buffs; i++) {
>> +		dma = xsk_buff_xdp_get_dma(*xdp);
>> +		rx_desc->read.pkt_addr = cpu_to_le64(dma);
>> +		rx_desc->wb.upper.length = 0;
>> +
>> +		rx_desc++;
>> +		xdp++;
>> +	}
>> +
>> +	return buffs;
>> +}
>> +
>> +bool igb_alloc_rx_buffers_zc(struct igb_ring *rx_ring, u16 count)
>> +{
>> +	u32 nb_buffs_extra = 0, nb_buffs = 0;
>> +	union e1000_adv_rx_desc *rx_desc;
>> +	u16 ntu = rx_ring->next_to_use;
>> +	u16 total_count = count;
>> +	struct xdp_buff **xdp;
>> +
>> +	rx_desc = IGB_RX_DESC(rx_ring, ntu);
>> +	xdp = &rx_ring->rx_buffer_info_zc[ntu];
>> +
>> +	if (ntu + count >= rx_ring->count) {
>> +		nb_buffs_extra = igb_fill_rx_descs(rx_ring->xsk_pool, xdp,
>> +						   rx_desc,
>> +						   rx_ring->count - ntu);
>
> Ehh, I wanted to ack this finally, but I believe that here you need to
> work on the pool pointer that was read with READ_ONCE() in igb_poll() on
> the hot path, and in igb_configure() pass rx_ring->xsk_pool as an
> argument.

Good catch, thanks!

Thanks,
Kurt

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 861 bytes --]

WARNING: multiple messages have this Message-ID (diff)
From: Kurt Kanzenbach <kurt@linutronix.de>
To: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Cc: Jesper Dangaard Brouer <hawk@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	Sriram Yagnaraman <sriram.yagnaraman@est.tech>,
	Przemek Kitszel <przemyslaw.kitszel@intel.com>,
	Richard Cochran <richardcochran@gmail.com>,
	John Fastabend <john.fastabend@gmail.com>,
	Alexei Starovoitov <ast@kernel.org>,
	Sriram Yagnaraman <sriram.yagnaraman@ericsson.com>,
	Benjamin Steinke <benjamin.steinke@woks-audio.com>,
	Eric Dumazet <edumazet@google.com>,
	netdev@vger.kernel.org, Tony Nguyen <anthony.l.nguyen@intel.com>,
	intel-wired-lan@lists.osuosl.org,
	Jakub Kicinski <kuba@kernel.org>,
	bpf@vger.kernel.org, Paolo Abeni <pabeni@redhat.com>,
	"David S. Miller" <davem@davemloft.net>,
	Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Subject: Re: [Intel-wired-lan] [PATCH iwl-next v7 4/5] igb: Add AF_XDP zero-copy Rx support
Date: Mon, 07 Oct 2024 15:45:47 +0200	[thread overview]
Message-ID: <87jzek3u2c.fsf@kurt.kurt.home> (raw)
In-Reply-To: <ZwPdOxJrk04D9FKn@boxer>

[-- Attachment #1: Type: text/plain, Size: 14752 bytes --]

Hi Maciej,

On Mon Oct 07 2024, Maciej Fijalkowski wrote:
> On Mon, Oct 07, 2024 at 02:31:26PM +0200, Kurt Kanzenbach wrote:
>> From: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
>> 
>> Add support for AF_XDP zero-copy receive path.
>> 
>> When AF_XDP zero-copy is enabled, the rx buffers are allocated from the
>> xsk buff pool using igb_alloc_rx_buffers_zc().
>> 
>> Use xsk_pool_get_rx_frame_size() to set SRRCTL rx buf size when zero-copy
>> is enabled.
>> 
>> Signed-off-by: Sriram Yagnaraman <sriram.yagnaraman@est.tech>
>> [Kurt: Port to v6.10 and provide napi_id for xdp_rxq_info_reg(),
>>        RCT, remove NETDEV_XDP_ACT_XSK_ZEROCOPY, update NTC handling,
>>        move stats update and xdp finalize to common functions,
>>        READ_ONCE() xsk_pool, likelyfy for XDP_REDIRECT case]
>> Signed-off-by: Kurt Kanzenbach <kurt@linutronix.de>
>
> Hi Kurt,
>
> Sorry, but I still have comments :< see below.

No worries :)

>
>> ---
>>  drivers/net/ethernet/intel/igb/igb.h      |   8 +
>>  drivers/net/ethernet/intel/igb/igb_main.c | 132 +++++++++----
>>  drivers/net/ethernet/intel/igb/igb_xsk.c  | 296 +++++++++++++++++++++++++++++-
>>  3 files changed, 398 insertions(+), 38 deletions(-)
>> 
>> diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
>> index c30d6f9708f8..ea3977b313fc 100644
>> --- a/drivers/net/ethernet/intel/igb/igb.h
>> +++ b/drivers/net/ethernet/intel/igb/igb.h
>> @@ -88,6 +88,7 @@ struct igb_adapter;
>>  #define IGB_XDP_CONSUMED	BIT(0)
>>  #define IGB_XDP_TX		BIT(1)
>>  #define IGB_XDP_REDIR		BIT(2)
>> +#define IGB_XDP_EXIT		BIT(3)
>>  
>>  struct vf_data_storage {
>>  	unsigned char vf_mac_addresses[ETH_ALEN];
>> @@ -740,6 +741,9 @@ void igb_clean_tx_ring(struct igb_ring *tx_ring);
>>  void igb_clean_rx_ring(struct igb_ring *rx_ring);
>>  void igb_configure_tx_ring(struct igb_adapter *, struct igb_ring *);
>>  void igb_configure_rx_ring(struct igb_adapter *, struct igb_ring *);
>> +void igb_finalize_xdp(struct igb_adapter *adapter, unsigned int status);
>> +void igb_update_rx_stats(struct igb_q_vector *q_vector, unsigned int packets,
>> +			 unsigned int bytes);
>>  void igb_setup_tctl(struct igb_adapter *);
>>  void igb_setup_rctl(struct igb_adapter *);
>>  void igb_setup_srrctl(struct igb_adapter *, struct igb_ring *);
>> @@ -850,6 +854,10 @@ struct xsk_buff_pool *igb_xsk_pool(struct igb_adapter *adapter,
>>  int igb_xsk_pool_setup(struct igb_adapter *adapter,
>>  		       struct xsk_buff_pool *pool,
>>  		       u16 qid);
>> +bool igb_alloc_rx_buffers_zc(struct igb_ring *rx_ring, u16 count);
>> +void igb_clean_rx_ring_zc(struct igb_ring *rx_ring);
>> +int igb_clean_rx_irq_zc(struct igb_q_vector *q_vector,
>> +			struct xsk_buff_pool *xsk_pool, const int budget);
>>  int igb_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags);
>>  
>>  #endif /* _IGB_H_ */
>> diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
>> index bdba5c5861be..449ee794b3c9 100644
>> --- a/drivers/net/ethernet/intel/igb/igb_main.c
>> +++ b/drivers/net/ethernet/intel/igb/igb_main.c
>> @@ -472,12 +472,17 @@ static void igb_dump(struct igb_adapter *adapter)
>>  
>>  		for (i = 0; i < rx_ring->count; i++) {
>>  			const char *next_desc;
>> -			struct igb_rx_buffer *buffer_info;
>> -			buffer_info = &rx_ring->rx_buffer_info[i];
>> +			dma_addr_t dma = (dma_addr_t)0;
>> +			struct igb_rx_buffer *buffer_info = NULL;
>>  			rx_desc = IGB_RX_DESC(rx_ring, i);
>>  			u0 = (struct my_u0 *)rx_desc;
>>  			staterr = le32_to_cpu(rx_desc->wb.upper.status_error);
>>  
>> +			if (!rx_ring->xsk_pool) {
>> +				buffer_info = &rx_ring->rx_buffer_info[i];
>> +				dma = buffer_info->dma;
>> +			}
>> +
>>  			if (i == rx_ring->next_to_use)
>>  				next_desc = " NTU";
>>  			else if (i == rx_ring->next_to_clean)
>> @@ -497,11 +502,11 @@ static void igb_dump(struct igb_adapter *adapter)
>>  					"R  ", i,
>>  					le64_to_cpu(u0->a),
>>  					le64_to_cpu(u0->b),
>> -					(u64)buffer_info->dma,
>> +					(u64)dma,
>>  					next_desc);
>>  
>>  				if (netif_msg_pktdata(adapter) &&
>> -				    buffer_info->dma && buffer_info->page) {
>> +				    buffer_info && dma && buffer_info->page) {
>>  					print_hex_dump(KERN_INFO, "",
>>  					  DUMP_PREFIX_ADDRESS,
>>  					  16, 1,
>> @@ -1983,7 +1988,10 @@ static void igb_configure(struct igb_adapter *adapter)
>>  	 */
>>  	for (i = 0; i < adapter->num_rx_queues; i++) {
>>  		struct igb_ring *ring = adapter->rx_ring[i];
>> -		igb_alloc_rx_buffers(ring, igb_desc_unused(ring));
>> +		if (ring->xsk_pool)
>> +			igb_alloc_rx_buffers_zc(ring, igb_desc_unused(ring));
>> +		else
>> +			igb_alloc_rx_buffers(ring, igb_desc_unused(ring));
>>  	}
>>  }
>>  
>> @@ -4405,7 +4413,8 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring)
>>  	if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq))
>>  		xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
>>  	res = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
>> -			       rx_ring->queue_index, 0);
>> +			       rx_ring->queue_index,
>> +			       rx_ring->q_vector->napi.napi_id);
>>  	if (res < 0) {
>>  		dev_err(dev, "Failed to register xdp_rxq index %u\n",
>>  			rx_ring->queue_index);
>> @@ -4701,12 +4710,17 @@ void igb_setup_srrctl(struct igb_adapter *adapter, struct igb_ring *ring)
>>  	struct e1000_hw *hw = &adapter->hw;
>>  	int reg_idx = ring->reg_idx;
>>  	u32 srrctl = 0;
>> +	u32 buf_size;
>>  
>> -	srrctl = IGB_RX_HDR_LEN << E1000_SRRCTL_BSIZEHDRSIZE_SHIFT;
>> -	if (ring_uses_large_buffer(ring))
>> -		srrctl |= IGB_RXBUFFER_3072 >> E1000_SRRCTL_BSIZEPKT_SHIFT;
>> +	if (ring->xsk_pool)
>> +		buf_size = xsk_pool_get_rx_frame_size(ring->xsk_pool);
>> +	else if (ring_uses_large_buffer(ring))
>> +		buf_size = IGB_RXBUFFER_3072;
>>  	else
>> -		srrctl |= IGB_RXBUFFER_2048 >> E1000_SRRCTL_BSIZEPKT_SHIFT;
>> +		buf_size = IGB_RXBUFFER_2048;
>> +
>> +	srrctl = IGB_RX_HDR_LEN << E1000_SRRCTL_BSIZEHDRSIZE_SHIFT;
>> +	srrctl |= buf_size >> E1000_SRRCTL_BSIZEPKT_SHIFT;
>>  	srrctl |= E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
>>  	if (hw->mac.type >= e1000_82580)
>>  		srrctl |= E1000_SRRCTL_TIMESTAMP;
>> @@ -4738,9 +4752,17 @@ void igb_configure_rx_ring(struct igb_adapter *adapter,
>>  	u32 rxdctl = 0;
>>  
>>  	xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
>> -	WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
>> -					   MEM_TYPE_PAGE_SHARED, NULL));
>>  	WRITE_ONCE(ring->xsk_pool, igb_xsk_pool(adapter, ring));
>> +	if (ring->xsk_pool) {
>> +		WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
>> +						   MEM_TYPE_XSK_BUFF_POOL,
>> +						   NULL));
>> +		xsk_pool_set_rxq_info(ring->xsk_pool, &ring->xdp_rxq);
>> +	} else {
>> +		WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
>> +						   MEM_TYPE_PAGE_SHARED,
>> +						   NULL));
>> +	}
>>  
>>  	/* disable the queue */
>>  	wr32(E1000_RXDCTL(reg_idx), 0);
>> @@ -4767,9 +4789,12 @@ void igb_configure_rx_ring(struct igb_adapter *adapter,
>>  	rxdctl |= IGB_RX_HTHRESH << 8;
>>  	rxdctl |= IGB_RX_WTHRESH << 16;
>>  
>> -	/* initialize rx_buffer_info */
>> -	memset(ring->rx_buffer_info, 0,
>> -	       sizeof(struct igb_rx_buffer) * ring->count);
>> +	if (ring->xsk_pool)
>> +		memset(ring->rx_buffer_info_zc, 0,
>> +		       sizeof(*ring->rx_buffer_info_zc) * ring->count);
>> +	else
>> +		memset(ring->rx_buffer_info, 0,
>> +		       sizeof(*ring->rx_buffer_info) * ring->count);
>>  
>>  	/* initialize Rx descriptor 0 */
>>  	rx_desc = IGB_RX_DESC(ring, 0);
>> @@ -4957,8 +4982,13 @@ void igb_free_rx_resources(struct igb_ring *rx_ring)
>>  
>>  	rx_ring->xdp_prog = NULL;
>>  	xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
>> -	vfree(rx_ring->rx_buffer_info);
>> -	rx_ring->rx_buffer_info = NULL;
>> +	if (rx_ring->xsk_pool) {
>> +		vfree(rx_ring->rx_buffer_info_zc);
>> +		rx_ring->rx_buffer_info_zc = NULL;
>> +	} else {
>> +		vfree(rx_ring->rx_buffer_info);
>> +		rx_ring->rx_buffer_info = NULL;
>> +	}
>>  
>>  	/* if not set, then don't free */
>>  	if (!rx_ring->desc)
>> @@ -4996,6 +5026,11 @@ void igb_clean_rx_ring(struct igb_ring *rx_ring)
>>  	dev_kfree_skb(rx_ring->skb);
>>  	rx_ring->skb = NULL;
>>  
>> +	if (rx_ring->xsk_pool) {
>> +		igb_clean_rx_ring_zc(rx_ring);
>> +		goto skip_for_xsk;
>> +	}
>> +
>>  	/* Free all the Rx ring sk_buffs */
>>  	while (i != rx_ring->next_to_alloc) {
>>  		struct igb_rx_buffer *buffer_info = &rx_ring->rx_buffer_info[i];
>> @@ -5023,6 +5058,7 @@ void igb_clean_rx_ring(struct igb_ring *rx_ring)
>>  			i = 0;
>>  	}
>>  
>> +skip_for_xsk:
>>  	rx_ring->next_to_alloc = 0;
>>  	rx_ring->next_to_clean = 0;
>>  	rx_ring->next_to_use = 0;
>> @@ -8177,6 +8213,7 @@ static int igb_poll(struct napi_struct *napi, int budget)
>>  	struct igb_q_vector *q_vector = container_of(napi,
>>  						     struct igb_q_vector,
>>  						     napi);
>> +	struct xsk_buff_pool *xsk_pool;
>>  	bool clean_complete = true;
>>  	int work_done = 0;
>>  
>> @@ -8188,7 +8225,12 @@ static int igb_poll(struct napi_struct *napi, int budget)
>>  		clean_complete = igb_clean_tx_irq(q_vector, budget);
>>  
>>  	if (q_vector->rx.ring) {
>> -		int cleaned = igb_clean_rx_irq(q_vector, budget);
>> +		int cleaned;
>> +
>> +		xsk_pool = READ_ONCE(q_vector->rx.ring->xsk_pool);
>> +		cleaned = xsk_pool ?
>> +			igb_clean_rx_irq_zc(q_vector, xsk_pool, budget) :
>> +			igb_clean_rx_irq(q_vector, budget);
>>  
>>  		work_done += cleaned;
>>  		if (cleaned >= budget)
>> @@ -8852,6 +8894,38 @@ static void igb_put_rx_buffer(struct igb_ring *rx_ring,
>>  	rx_buffer->page = NULL;
>>  }
>>  
>> +void igb_finalize_xdp(struct igb_adapter *adapter, unsigned int status)
>> +{
>> +	int cpu = smp_processor_id();
>> +	struct netdev_queue *nq;
>> +
>> +	if (status & IGB_XDP_REDIR)
>> +		xdp_do_flush();
>> +
>> +	if (status & IGB_XDP_TX) {
>> +		struct igb_ring *tx_ring = igb_xdp_tx_queue_mapping(adapter);
>> +
>> +		nq = txring_txq(tx_ring);
>> +		__netif_tx_lock(nq, cpu);
>> +		igb_xdp_ring_update_tail(tx_ring);
>> +		__netif_tx_unlock(nq);
>> +	}
>> +}
>> +
>> +void igb_update_rx_stats(struct igb_q_vector *q_vector, unsigned int packets,
>> +			 unsigned int bytes)
>> +{
>> +	struct igb_ring *ring = q_vector->rx.ring;
>> +
>> +	u64_stats_update_begin(&ring->rx_syncp);
>> +	ring->rx_stats.packets += packets;
>> +	ring->rx_stats.bytes += bytes;
>> +	u64_stats_update_end(&ring->rx_syncp);
>> +
>> +	q_vector->rx.total_packets += packets;
>> +	q_vector->rx.total_bytes += bytes;
>> +}
>> +
>>  static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
>>  {
>>  	unsigned int total_bytes = 0, total_packets = 0;
>> @@ -8859,9 +8933,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
>>  	struct igb_ring *rx_ring = q_vector->rx.ring;
>>  	u16 cleaned_count = igb_desc_unused(rx_ring);
>>  	struct sk_buff *skb = rx_ring->skb;
>> -	int cpu = smp_processor_id();
>>  	unsigned int xdp_xmit = 0;
>> -	struct netdev_queue *nq;
>>  	struct xdp_buff xdp;
>>  	u32 frame_sz = 0;
>>  	int rx_buf_pgcnt;
>> @@ -8983,24 +9055,10 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
>>  	/* place incomplete frames back on ring for completion */
>>  	rx_ring->skb = skb;
>>  
>> -	if (xdp_xmit & IGB_XDP_REDIR)
>> -		xdp_do_flush();
>> -
>> -	if (xdp_xmit & IGB_XDP_TX) {
>> -		struct igb_ring *tx_ring = igb_xdp_tx_queue_mapping(adapter);
>> -
>> -		nq = txring_txq(tx_ring);
>> -		__netif_tx_lock(nq, cpu);
>> -		igb_xdp_ring_update_tail(tx_ring);
>> -		__netif_tx_unlock(nq);
>> -	}
>> +	if (xdp_xmit)
>> +		igb_finalize_xdp(adapter, xdp_xmit);
>
> Nit: given that you will be sending a next revision, IMHO this is a
> candidate for a separate patch. Not a big deal, but it would reduce the
> noise in this one.

Yes, makes sense.

>
>>  
>> -	u64_stats_update_begin(&rx_ring->rx_syncp);
>> -	rx_ring->rx_stats.packets += total_packets;
>> -	rx_ring->rx_stats.bytes += total_bytes;
>> -	u64_stats_update_end(&rx_ring->rx_syncp);
>> -	q_vector->rx.total_packets += total_packets;
>> -	q_vector->rx.total_bytes += total_bytes;
>> +	igb_update_rx_stats(q_vector, total_packets, total_bytes);
>
> This also.
>
>>  
>>  	if (cleaned_count)
>>  		igb_alloc_rx_buffers(rx_ring, cleaned_count);
>> diff --git a/drivers/net/ethernet/intel/igb/igb_xsk.c b/drivers/net/ethernet/intel/igb/igb_xsk.c
>> index 7b632be3e7e3..9fd094a799fa 100644
>> --- a/drivers/net/ethernet/intel/igb/igb_xsk.c
>> +++ b/drivers/net/ethernet/intel/igb/igb_xsk.c
>> @@ -70,7 +70,10 @@ static void igb_txrx_ring_enable(struct igb_adapter *adapter, u16 qid)
>>  	 * at least 1 descriptor unused to make sure
>>  	 * next_to_use != next_to_clean
>>  	 */
>> -	igb_alloc_rx_buffers(rx_ring, igb_desc_unused(rx_ring));
>> +	if (rx_ring->xsk_pool)
>> +		igb_alloc_rx_buffers_zc(rx_ring, igb_desc_unused(rx_ring));
>> +	else
>> +		igb_alloc_rx_buffers(rx_ring, igb_desc_unused(rx_ring));
>>  
>>  	/* Rx/Tx share the same napi context. */
>>  	napi_enable(&rx_ring->q_vector->napi);
>> @@ -169,6 +172,297 @@ int igb_xsk_pool_setup(struct igb_adapter *adapter,
>>  		igb_xsk_pool_disable(adapter, qid);
>>  }
>>  
>> +static u16 igb_fill_rx_descs(struct xsk_buff_pool *pool, struct xdp_buff **xdp,
>> +			     union e1000_adv_rx_desc *rx_desc, u16 count)
>> +{
>> +	dma_addr_t dma;
>> +	u16 buffs;
>> +	int i;
>> +
>> +	/* nothing to do */
>> +	if (!count)
>> +		return 0;
>> +
>> +	buffs = xsk_buff_alloc_batch(pool, xdp, count);
>> +	for (i = 0; i < buffs; i++) {
>> +		dma = xsk_buff_xdp_get_dma(*xdp);
>> +		rx_desc->read.pkt_addr = cpu_to_le64(dma);
>> +		rx_desc->wb.upper.length = 0;
>> +
>> +		rx_desc++;
>> +		xdp++;
>> +	}
>> +
>> +	return buffs;
>> +}
>> +
>> +bool igb_alloc_rx_buffers_zc(struct igb_ring *rx_ring, u16 count)
>> +{
>> +	u32 nb_buffs_extra = 0, nb_buffs = 0;
>> +	union e1000_adv_rx_desc *rx_desc;
>> +	u16 ntu = rx_ring->next_to_use;
>> +	u16 total_count = count;
>> +	struct xdp_buff **xdp;
>> +
>> +	rx_desc = IGB_RX_DESC(rx_ring, ntu);
>> +	xdp = &rx_ring->rx_buffer_info_zc[ntu];
>> +
>> +	if (ntu + count >= rx_ring->count) {
>> +		nb_buffs_extra = igb_fill_rx_descs(rx_ring->xsk_pool, xdp,
>> +						   rx_desc,
>> +						   rx_ring->count - ntu);
>
> Ehh, I wanted to ack this finally, but I believe that here you need to
> work on the pool pointer that was read with READ_ONCE() in igb_poll() on
> the hot path, and in igb_configure() pass rx_ring->xsk_pool as an
> argument.

Good catch, thanks!

Thanks,
Kurt

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 861 bytes --]

  reply	other threads:[~2024-10-07 13:45 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-10-07 12:31 [PATCH iwl-next v7 0/5] igb: Add support for AF_XDP zero-copy Kurt Kanzenbach
2024-10-07 12:31 ` [Intel-wired-lan] " Kurt Kanzenbach
2024-10-07 12:31 ` [PATCH iwl-next v7 1/5] igb: Remove static qualifiers Kurt Kanzenbach
2024-10-07 12:31   ` [Intel-wired-lan] " Kurt Kanzenbach
2024-10-07 13:09   ` Maciej Fijalkowski
2024-10-07 13:09     ` [Intel-wired-lan] " Maciej Fijalkowski
2024-10-07 12:31 ` [PATCH iwl-next v7 2/5] igb: Introduce igb_xdp_is_enabled() Kurt Kanzenbach
2024-10-07 12:31   ` [Intel-wired-lan] " Kurt Kanzenbach
2024-10-07 12:31 ` [PATCH iwl-next v7 3/5] igb: Introduce XSK data structures and helpers Kurt Kanzenbach
2024-10-07 12:31   ` [Intel-wired-lan] " Kurt Kanzenbach
2024-10-07 12:31 ` [PATCH iwl-next v7 4/5] igb: Add AF_XDP zero-copy Rx support Kurt Kanzenbach
2024-10-07 12:31   ` [Intel-wired-lan] " Kurt Kanzenbach
2024-10-07 13:08   ` Maciej Fijalkowski
2024-10-07 13:08     ` [Intel-wired-lan] " Maciej Fijalkowski
2024-10-07 13:45     ` Kurt Kanzenbach [this message]
2024-10-07 13:45       ` Kurt Kanzenbach
2024-10-07 12:31 ` [PATCH iwl-next v7 5/5] igb: Add AF_XDP zero-copy Tx support Kurt Kanzenbach
2024-10-07 12:31   ` [Intel-wired-lan] " Kurt Kanzenbach
2024-10-07 13:15   ` Maciej Fijalkowski
2024-10-07 13:15     ` [Intel-wired-lan] " Maciej Fijalkowski
2024-10-07 13:48     ` Kurt Kanzenbach
2024-10-07 13:48       ` [Intel-wired-lan] " Kurt Kanzenbach

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87jzek3u2c.fsf@kurt.kurt.home \
    --to=kurt@linutronix.de \
    --cc=anthony.l.nguyen@intel.com \
    --cc=ast@kernel.org \
    --cc=benjamin.steinke@woks-audio.com \
    --cc=bigeasy@linutronix.de \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=hawk@kernel.org \
    --cc=intel-wired-lan@lists.osuosl.org \
    --cc=john.fastabend@gmail.com \
    --cc=kuba@kernel.org \
    --cc=maciej.fijalkowski@intel.com \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=przemyslaw.kitszel@intel.com \
    --cc=richardcochran@gmail.com \
    --cc=sriram.yagnaraman@ericsson.com \
    --cc=sriram.yagnaraman@est.tech \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.