From mboxrd@z Thu Jan  1 00:00:00 1970
From: Vlad Zolotarov <vladz-RmZWMc9puTNJc61us3aD9laTQe2KTcn/@public.gmane.org>
Subject: Re: [PATCH v6 3/3] ixgbe: Add LRO support
Date: Tue, 10 Mar 2015 23:36:11 +0200
Message-ID: <54FF63CB.4040506@cloudius-systems.com>
References: <1425928037-28732-1-git-send-email-vladz@cloudius-systems.com>
 <1425928037-28732-4-git-send-email-vladz@cloudius-systems.com>
 <2601191342CEEE43887BDE71AB977258213F5039@irsmsx105.ger.corp.intel.com>
 <54FEF011.6010205@cloudius-systems.com>
 <2601191342CEEE43887BDE71AB977258213F5475@irsmsx105.ger.corp.intel.com>
Mime-Version: 1.0
Content-Type: text/plain; charset=windows-1252; format=flowed
Content-Transfer-Encoding: quoted-printable
To: "Ananyev, Konstantin" <konstantin.ananyev-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>,
 "dev-VfR2kkLFssw@public.gmane.org" <dev-VfR2kkLFssw@public.gmane.org>
Return-path: <dev-bounces-VfR2kkLFssw@public.gmane.org>
In-Reply-To: <2601191342CEEE43887BDE71AB977258213F5475-pww93C2UFcwu0RiL9chJVbfspsVTdybXVpNB7YpNyf8@public.gmane.org>
List-Id: patches and discussions about DPDK <dev.dpdk.org>
List-Unsubscribe: <http://dpdk.org/ml/options/dev>,
 <mailto:dev-request-VfR2kkLFssw@public.gmane.org?subject=unsubscribe>
List-Archive: <http://dpdk.org/ml/archives/dev/>
List-Post: <mailto:dev-VfR2kkLFssw@public.gmane.org>
List-Help: <mailto:dev-request-VfR2kkLFssw@public.gmane.org?subject=help>
List-Subscribe: <http://dpdk.org/ml/listinfo/dev>,
 <mailto:dev-request-VfR2kkLFssw@public.gmane.org?subject=subscribe>
Errors-To: dev-bounces-VfR2kkLFssw@public.gmane.org
Sender: "dev" <dev-bounces-VfR2kkLFssw@public.gmane.org>



On 03/10/15 22:09, Ananyev, Konstantin wrote:
>>> Hi Vlad,
>>>
>>>> -----Original Message-----
>>>> From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Vlad Zolotar=
ov
>>>> Sent: Monday, March 09, 2015 7:07 PM
>>>> To: dev at dpdk.org
>>>> Subject: [dpdk-dev] [PATCH v6 3/3] ixgbe: Add LRO support
>>>>
>>>>       - Only x540 and 82599 devices support LRO.
>>>>       - Add the appropriate HW configuration.
>>>>       - Add RSC aware rx_pkt_burst() handlers:
>>>>          - Implemented bulk allocation and non-bulk allocation versi=
ons.
>>>>          - Add LRO-specific fields to rte_eth_rxmode, to rte_eth_dev=
_data
>>>>            and to igb_rx_queue.
>>>>          - Use the appropriate handler when LRO is requested.
>>>>
>>>> Signed-off-by: Vlad Zolotarov <vladz at cloudius-systems.com>
>>>> ---
>>>> New in v5:
>>>>      - Put the RTE_ETHDEV_HAS_LRO_SUPPORT definition at the beginnin=
g of rte_ethdev.h.
>>>>      - Removed the "TODO: Remove me" comment near RTE_ETHDEV_HAS_LRO=
_SUPPORT.
>>>>
>>>> New in v4:
>>>>      - Define RTE_ETHDEV_HAS_LRO_SUPPORT in rte_ethdev.h instead of
>>>>        RTE_ETHDEV_LRO_SUPPORT defined in config/common_linuxapp.
>>>>
>>>> New in v2:
>>>>      - Removed rte_eth_dev_data.lro_bulk_alloc.
>>>>      - Fixed a few styling and spelling issues.
>>>> ---
>>>>    lib/librte_ether/rte_ethdev.h       |   9 +-
>>>>    lib/librte_pmd_ixgbe/ixgbe_ethdev.c |   6 +
>>>>    lib/librte_pmd_ixgbe/ixgbe_ethdev.h |   5 +
>>>>    lib/librte_pmd_ixgbe/ixgbe_rxtx.c   | 562 +++++++++++++++++++++++=
++++++++++++-
>>>>    lib/librte_pmd_ixgbe/ixgbe_rxtx.h   |   6 +
>>>>    5 files changed, 581 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_et=
hdev.h
>>>> index 8db3127..44f081f 100644
>>>> --- a/lib/librte_ether/rte_ethdev.h
>>>> +++ b/lib/librte_ether/rte_ethdev.h
>>>> @@ -172,6 +172,9 @@ extern "C" {
>>>>
>>>>    #include <stdint.h>
>>>>
>>>> +/* Use this macro to check if LRO API is supported */
>>>> +#define RTE_ETHDEV_HAS_LRO_SUPPORT
>>>> +
>>>>    #include <rte_log.h>
>>>>    #include <rte_interrupts.h>
>>>>    #include <rte_pci.h>
>>>> @@ -320,14 +323,15 @@ struct rte_eth_rxmode {
>>>>    	enum rte_eth_rx_mq_mode mq_mode;
>>>>    	uint32_t max_rx_pkt_len;  /**< Only used if jumbo_frame enabled.=
 */
>>>>    	uint16_t split_hdr_size;  /**< hdr buf size (header_split enable=
d).*/
>>>> -	uint8_t header_split : 1, /**< Header Split enable. */
>>>> +	uint16_t header_split : 1, /**< Header Split enable. */
>>>>    		hw_ip_checksum   : 1, /**< IP/UDP/TCP checksum offload enable. =
*/
>>>>    		hw_vlan_filter   : 1, /**< VLAN filter enable. */
>>>>    		hw_vlan_strip    : 1, /**< VLAN strip enable. */
>>>>    		hw_vlan_extend   : 1, /**< Extended VLAN enable. */
>>>>    		jumbo_frame      : 1, /**< Jumbo Frame Receipt enable. */
>>>>    		hw_strip_crc     : 1, /**< Enable CRC stripping by hardware. */
>>>> -		enable_scatter   : 1; /**< Enable scatter packets rx handler */
>>>> +		enable_scatter   : 1, /**< Enable scatter packets rx handler */
>>>> +		enable_lro       : 1; /**< Enable LRO */
>>>>    };
>>>>
>>>>    /**
>>>> @@ -1515,6 +1519,7 @@ struct rte_eth_dev_data {
>>>>    	uint8_t port_id;           /**< Device [external] port identifie=
r. */
>>>>    	uint8_t promiscuous   : 1, /**< RX promiscuous mode ON(1) / OFF(=
0). */
>>>>    		scattered_rx : 1,  /**< RX of scattered packets is ON(1) / OFF(=
0) */
>>>> +		lro          : 1,  /**< RX LRO is ON(1) / OFF(0) */
>>>>    		all_multicast : 1, /**< RX all multicast mode ON(1) / OFF(0). *=
/
>>>>    		dev_started : 1;   /**< Device state: STARTED(1) / STOPPED(0). =
*/
>>>>    };
>>>> diff --git a/lib/librte_pmd_ixgbe/ixgbe_ethdev.c b/lib/librte_pmd_ix=
gbe/ixgbe_ethdev.c
>>>> index 9d3de1a..765174d 100644
>>>> --- a/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
>>>> +++ b/lib/librte_pmd_ixgbe/ixgbe_ethdev.c
>>>> @@ -1648,6 +1648,7 @@ ixgbe_dev_stop(struct rte_eth_dev *dev)
>>>>
>>>>    	/* Clear stored conf */
>>>>    	dev->data->scattered_rx =3D 0;
>>>> +	dev->data->lro =3D 0;
>>>>    	hw->rx_bulk_alloc_allowed =3D false;
>>>>    	hw->rx_vec_allowed =3D false;
>>>>
>>>> @@ -2018,6 +2019,11 @@ ixgbe_dev_info_get(struct rte_eth_dev *dev, s=
truct rte_eth_dev_info *dev_info)
>>>>    		DEV_RX_OFFLOAD_IPV4_CKSUM |
>>>>    		DEV_RX_OFFLOAD_UDP_CKSUM  |
>>>>    		DEV_RX_OFFLOAD_TCP_CKSUM;
>>>> +
>>>> +	if (hw->mac.type =3D=3D ixgbe_mac_82599EB ||
>>>> +	    hw->mac.type =3D=3D ixgbe_mac_X540)
>>>> +		dev_info->rx_offload_capa |=3D DEV_RX_OFFLOAD_TCP_LRO;
>>>> +
>>>>    	dev_info->tx_offload_capa =3D
>>>>    		DEV_TX_OFFLOAD_VLAN_INSERT |
>>>>    		DEV_TX_OFFLOAD_IPV4_CKSUM  |
>>>> diff --git a/lib/librte_pmd_ixgbe/ixgbe_ethdev.h b/lib/librte_pmd_ix=
gbe/ixgbe_ethdev.h
>>>> index a549f5c..e206584 100644
>>>> --- a/lib/librte_pmd_ixgbe/ixgbe_ethdev.h
>>>> +++ b/lib/librte_pmd_ixgbe/ixgbe_ethdev.h
>>>> @@ -349,6 +349,11 @@ uint16_t ixgbe_recv_pkts_bulk_alloc(void *rx_qu=
eue, struct rte_mbuf **rx_pkts,
>>>>    uint16_t ixgbe_recv_scattered_pkts(void *rx_queue,
>>>>    		struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
>>>>
>>>> +uint16_t ixgbe_recv_pkts_lro(void *rx_queue,
>>>> +		struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
>>>> +uint16_t ixgbe_recv_pkts_lro_bulk_alloc(void *rx_queue,
>>>> +		struct rte_mbuf **rx_pkts, uint16_t nb_pkts);
>>>> +
>>>>    uint16_t ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkt=
s,
>>>>    		uint16_t nb_pkts);
>>>>
>>>> diff --git a/lib/librte_pmd_ixgbe/ixgbe_rxtx.c b/lib/librte_pmd_ixgb=
e/ixgbe_rxtx.c
>>>> index 58e619b..944c662 100644
>>>> --- a/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
>>>> +++ b/lib/librte_pmd_ixgbe/ixgbe_rxtx.c
>>>> @@ -1366,6 +1366,15 @@ ixgbe_recv_pkts(void *rx_queue, struct rte_mb=
uf **rx_pkts,
>>>>    }
>>>>
>>>>    /**
>>>> + * Detect an RSC descriptor.
>>>> + */
>>>> +static inline uint32_t ixgbe_rsc_count(union ixgbe_adv_rx_desc *rx)
>>>> +{
>>>> +	return (rte_le_to_cpu_32(rx->wb.lower.lo_dword.data) &
>>>> +		IXGBE_RXDADV_RSCCNT_MASK) >> IXGBE_RXDADV_RSCCNT_SHIFT;
>>>> +}
>>>> +
>>>> +/**
>>>>     * Initialize the first mbuf of the returned packet:
>>>>     *    - RX port identifier,
>>>>     *    - hardware offload data, if any:
>>>> @@ -1410,6 +1419,291 @@ static inline void ixgbe_fill_cluster_head_b=
uf(
>>>>    	}
>>>>    }
>>>>
>>>> +/**
>>>> + * Bulk receive handler for and LRO case.
>>>> + *
>>>> + * @rx_queue Rx queue handle
>>>> + * @rx_pkts table of received packets
>>>> + * @nb_pkts size of rx_pkts table
>>>> + * @bulk_alloc if TRUE bulk allocation is used for a HW ring refill=
ing
>>>> + *
>>>> + * Handles the Rx HW ring completions when RSC feature is configure=
d. Uses an
>>>> + * additional ring of igb_rsc_entry's that will hold the relevant R=
SC info.
>>>> + *
>>>> + * We use the same logic as in Lunux and in FreeBSD ixgbe drivers:
>>>> + * 1) When non-EOP RSC completion arrives:
>>>> + *    a) Update the HEAD of the current RSC aggregation cluster wit=
h the new
>>>> + *       segment's data length.
>>>> + *    b) Set the "next" pointer of the current segment to point to =
the segment
>>>> + *       at the NEXTP index.
>>>> + *    c) Pass the HEAD of RSC aggregation cluster on to the next NE=
XTP entry
>>>> + *       in the sw_rsc_ring.
>>>> + * 2) When EOP arrives we just update the cluster's total length an=
d offload
>>>> + *    flags and deliver the cluster up to the upper layers. In our =
case - put it
>>>> + *    in the rx_pkts table.
>>>> + *
>>>> + * Returns the number of received packets/clusters (according to th=
e "bulk
>>>> + * receive" interface).
>>>> + */
>>>> +static inline uint16_t
>>>> +_recv_pkts_lro(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t =
nb_pkts,
>>>> +	       bool bulk_alloc)
>>>> +{
>>>> +	struct igb_rx_queue *rxq =3D rx_queue;
>>>> +	volatile union ixgbe_adv_rx_desc *rx_ring =3D rxq->rx_ring;
>>>> +	struct igb_rx_entry *sw_ring =3D rxq->sw_ring;
>>>> +	struct igb_rsc_entry *sw_rsc_ring =3D rxq->sw_rsc_ring;
>>>> +	uint16_t rx_id =3D rxq->rx_tail;
>>>> +	uint16_t nb_rx =3D 0;
>>>> +	uint16_t nb_hold =3D rxq->nb_rx_hold;
>>>> +	uint16_t prev_id =3D rxq->rx_tail;
>>>> +
>>>> +	while (nb_rx < nb_pkts) {
>>>> +		bool eop;
>>>> +		struct igb_rx_entry *rxe;
>>>> +		struct igb_rsc_entry *rsc_entry;
>>>> +		struct igb_rsc_entry *next_rsc_entry;
>>>> +		struct igb_rx_entry *next_rxe;
>>>> +		struct rte_mbuf *first_seg;
>>>> +		struct rte_mbuf *rxm;
>>>> +		struct rte_mbuf *nmb;
>>>> +		union ixgbe_adv_rx_desc rxd;
>>>> +		uint16_t data_len;
>>>> +		uint16_t next_id;
>>>> +		volatile union ixgbe_adv_rx_desc *rxdp;
>>>> +		uint32_t staterr;
>>>> +
>>>> +next_desc:
>>>> +		/*
>>>> +		 * The code in this whole file uses the volatile pointer to
>>>> +		 * ensure the read ordering of the status and the rest of the
>>>> +		 * descriptor fields (on the compiler level only!!!). This is so
>>>> +		 * UGLY - why not to just use the compiler barrier instead? DPDK
>>>> +		 * even has the rte_compiler_barrier() for that.
>>>> +		 *
>>>> +		 * But most importantly this is just wrong because this doesn't
>>>> +		 * ensure memory ordering in a general case at all. For
>>>> +		 * instance, DPDK is supposed to work on Power CPUs where
>>>> +		 * compiler barrier may just not be enough!
>>>> +		 *
>>>> +		 * I tried to write only this function properly to have a
>>>> +		 * starting point (as a part of an LRO/RSC series) but the
>>>> +		 * compiler cursed at me when I tried to cast away the
>>>> +		 * "volatile" from rx_ring (yes, it's volatile too!!!). So, I'm
>>>> +		 * keeping it the way it is for now.
>>>> +		 *
>>>> +		 * The code in this file is broken in so many other places and
>>>> +		 * will just not work on a big endian CPU anyway therefore the
>>>> +		 * lines below will have to be revisited together with the rest
>>>> +		 * of the ixgbe PMD.
>>>> +		 *
>>>> +		 * TODO:
>>>> +		 *    - Get rid of "volatile" crap and let the compiler do its
>>>> +		 *      job.
>>>> +		 *    - Use the proper memory barrier (rte_rmb()) to ensure the
>>>> +		 *      memory ordering below.
>>> Ok, so you wanted to put rte_rmb(), straight after:
>>> staterr =3D rte_le_to_cpu_32(rxdp->wb.upper.status_error);
>>> correct?
>>> I agree that for machines with relaxed memory model (PPC) we do need =
it here.
>>> So why not just put it there, instead of complaining about in in comm=
ents? ;)
>> Because it's not a proper fix and I don't like workarounds.
> Why not? For machines with relaxed memory model you would need  rmb() h=
ere no matter does rxdp points to volatile or not.
>
>>> About rxdp being pointer to volatile, why it bothers you that much?
>> Because using of "volatile" prevent the compiler from optimizing every
>> code piece where the "volatile" variable is participating and that's a
>> shame.
>> Read this
>> https://www.kernel.org/doc/Documentation/volatile-considered-harmful.t=
xt
>> for a more detailed explanation.
>>
>>> You copy the whole RXD to the local variable anyway, and then referen=
ce it only to setup new addresses.
>> The fact that we have to copy the whole descriptor while we may not ne=
ed
>> all the data from it at the end is one problem.
> I understand that, but I don't think that the difference would that cri=
tical.
> Though I don't have any data in hand to compare.
>
>> The proper solution in Rx ring context should go as follows:
>>
>>   1. Remove the "volatile" qualifier from rx_ring (HW Rx descriptors r=
ing).
>>   2. Remove "volatile" at all places where rx_ring is accessed.
>>   3. Adjust the code in (2):
>>       1. Remove the descriptor copy u've mentioned and access the
>>          descriptor data directly.
>>       2. Ensure the proper ordering by using the proper memory barrier=
s,
>>          which are missing in the DPDK SDK at the moment (see a small
>>          discussion about this with Stephen and Avi on "[dpdk-dev]
>>          [PATCH v1 5/5] ixgbe: Add LRO support" thread).
> I think you are mixing 2 different issues here:
>
> 1.  For architectures with relaxed memory model we do need rmb() after =
that line:
> staterr =3D rte_le_to_cpu_32(rxdp->wb.upper.status_error);
> We do need it *always*, not depending on is rx_ring a volatile or not.
> If we really plan to support PPC and other architectures that allow rea=
d reordering  -
> not having an 'rmb()' or similar sync primitive here is a bug.
> Same thing applies to 'wmb()' before updating RDT.
>
> 2. volatile rx_ring vs non-volatile with explicit memory ordering instr=
incts.
> Actually I think that using volatile rx_ring is not a real bug on itsel=
f.
> Code with volatile rx_ring and fix for #1 in place would work correctly=
 on all architectures.
> It might be slower than non-volatile approach, but nothing would be bro=
ken.
>
> About the existing RX/TX functions and PPC support:
> Note that all of them were created before PPC support for DPDK was intr=
oduced.
> At that moment only IA was supported.
> That's why in some places where you would expect to see 'mb()' there ar=
e 'volatile' and/or ' rte_compiler_barrier' instead.
> Why all that places wasn't updated when PPC support was added - that's =
another question.
>  From my understanding - with current implementation some of DPDK PMDs =
RX/TX functions and  rte_ring wouldn't work correctly on PPC.
> So, I suppose we need to decide for ourselves - do we really want to su=
pport PPC and other architectures with non-IA memory model or not?
> If not, then I think we don't need any mb()s inside recv_pkts_lro() - j=
ust rte_compiler_barrier seems enough, and no point to complain about
> it in comments.
> If yes - then why to introduce a new function with a known potential bu=
g?

In order to introduce a new function with the proper implementation, or=20
to fix any other places with a similar weakness, I would need proper=20
tools, such as platform-dependent barrier macros similar to the Linux=20
smp_Xmb() macros, which reduce to a compiler barrier where appropriate=20
and to a proper memory fence where needed.

Unfortunately, DPDK doesn't have such macros at the moment. That's why I=20
put a big fat comment at the place that has to be fixed once they are=20
introduced.

You are right, though, that the "volatile" thing is not a bug in itself,=20
but it would be strange to keep it after the barriers are properly=20
placed. That's why I think these 2 changes should go together.

About the "decision" we have to make - I think it has been decided=20
already since PPC is one of the official DPDK targets. Therefore the=20
only thing to decide here is when and who gets to fix these things. One=20
thing is obvious - this patch is not the right place to do it. ;)

>
>> As it sounds this is going to be a VERY sensitive patchset.
>> That's why it should go separately from this patchwork (or from any
>> other patchwork).
> For that patch, I am not suggesting you to change any other functions, =
just one that you introducing.

I don't think that putting an lfence there on x86 is a good idea. As=20
I've just explained above - once DPDK has proper platform-dependent=20
rmb() macros I'll gladly revisit these lines. Frankly, the same could be=20
said about the rte_wmb() before the RDT update, but it is much less=20
harmful than an lfence so I didn't raise it... ;)

>
>>>> +		 */
>>>> +		rxdp =3D &rx_ring[rx_id];
>>>> +		staterr =3D rte_le_to_cpu_32(rxdp->wb.upper.status_error);
>>>> +
>>>> +		if (!(staterr & IXGBE_RXDADV_STAT_DD))
>>>> +			break;
>>>> +
>>>> +		rxd =3D *rxdp;
>>>> +
>>>> +		PMD_RX_LOG(DEBUG, "port_id=3D%u queue_id=3D%u rx_id=3D%u "
>>>> +				  "staterr=3D0x%x data_len=3D%u",
>>>> +			   rxq->port_id, rxq->queue_id, rx_id, staterr,
>>>> +			   rte_le_to_cpu_16(rxd.wb.upper.length));
>>>> +
>>>> +		if (!bulk_alloc) {
>>>> +			nmb =3D rte_rxmbuf_alloc(rxq->mb_pool);
>>>> +			if (nmb =3D=3D NULL) {
>>>> +				PMD_RX_LOG(DEBUG, "RX mbuf alloc failed "
>>>> +						  "port_id=3D%u queue_id=3D%u",
>>>> +					   rxq->port_id, rxq->queue_id);
>>>> +
>>>> +				rte_eth_devices[rxq->port_id].data->
>>>> +							rx_mbuf_alloc_failed++;
>>>> +				break;
>>>> +			}
>>>> +		} else if (nb_hold > rxq->rx_free_thresh) {
>>>> +			uint16_t next_rdt =3D rxq->rx_free_trigger;
>>>> +
>>>> +			if (!ixgbe_rx_alloc_bufs(rxq, false)) {
>>>> +				rte_wmb();
>>>> +				IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr,
>>>> +						    next_rdt);
>>>> +				nb_hold -=3D rxq->rx_free_thresh;
>>>> +			} else {
>>>> +				PMD_RX_LOG(DEBUG, "RX bulk alloc failed "
>>>> +						  "port_id=3D%u queue_id=3D%u",
>>>> +					   rxq->port_id, rxq->queue_id);
>>>> +
>>>> +				rte_eth_devices[rxq->port_id].data->
>>>> +							rx_mbuf_alloc_failed++;
>>>> +				break;
>>>> +			}
>>>> +		}
>>>> +
>>>> +		nb_hold++;
>>>> +		rxe =3D &sw_ring[rx_id];
>>>> +		eop =3D staterr & IXGBE_RXDADV_STAT_EOP;
>>>> +
>>>> +		next_id =3D rx_id + 1;
>>>> +		if (next_id =3D=3D rxq->nb_rx_desc)
>>>> +			next_id =3D 0;
>>>> +
>>>> +		/* Prefetch next mbuf while processing current one. */
>>>> +		rte_ixgbe_prefetch(sw_ring[next_id].mbuf);
>>>> +
>>>> +		/*
>>>> +		 * When next RX descriptor is on a cache-line boundary,
>>>> +		 * prefetch the next 4 RX descriptors and the next 4 pointers
>>>> +		 * to mbufs.
>>>> +		 */
>>>> +		if ((next_id & 0x3) =3D=3D 0) {
>>>> +			rte_ixgbe_prefetch(&rx_ring[next_id]);
>>>> +			rte_ixgbe_prefetch(&sw_ring[next_id]);
>>>> +		}
>>>> +
>>>> +		rxm =3D rxe->mbuf;
>>>> +
>>>> +		if (!bulk_alloc) {
>>>> +			__le64 dma =3D
>>>> +			  rte_cpu_to_le_64(RTE_MBUF_DATA_DMA_ADDR_DEFAULT(nmb));
>>>> +			/*
>>>> +			 * Update RX descriptor with the physical address of the
>>>> +			 * new data buffer of the new allocated mbuf.
>>>> +			 */
>>>> +			rxe->mbuf =3D nmb;
>>>> +
>>>> +			rxm->data_off =3D RTE_PKTMBUF_HEADROOM;
>>>> +			rxdp->read.hdr_addr =3D dma;
>>>> +			rxdp->read.pkt_addr =3D dma;
>>>> +		}
>>>> +		/*
>>>> +		 * Set data length & data buffer address of mbuf.
>>>> +		 */
>>>> +		data_len =3D rte_le_to_cpu_16(rxd.wb.upper.length);
>>>> +		rxm->data_len =3D data_len;
>>>> +
>>>> +		if (!eop) {
>>>> +			uint16_t nextp_id;
>>>> +			/*
>>>> +			 * Get next descriptor index:
>>>> +			 *  - For RSC it's in the NEXTP field.
>>>> +			 *  - For a scattered packet - it's just a following
>>>> +			 *    descriptor.
>>>> +			 */
>>>> +			if (ixgbe_rsc_count(&rxd))
>>>> +				nextp_id =3D
>>>> +					(staterr & IXGBE_RXDADV_NEXTP_MASK) >>
>>>> +						       IXGBE_RXDADV_NEXTP_SHIFT;
>>>> +			else
>>>> +				nextp_id =3D next_id;
>>>> +
>>>> +			next_rsc_entry =3D &sw_rsc_ring[nextp_id];
>>>> +			next_rxe =3D &sw_ring[nextp_id];
>>>> +			rte_ixgbe_prefetch(next_rxe);
>>>> +		}
>>>> +
>>>> +		rsc_entry =3D &sw_rsc_ring[rx_id];
>>>> +		first_seg =3D rsc_entry->fbuf;
>>>> +		rsc_entry->fbuf =3D NULL;
>>>> +
>>>> +		/*
>>>> +		 * If this is the first buffer of the received packet,
>>>> +		 * set the pointer to the first mbuf of the packet and
>>>> +		 * initialize its context.
>>>> +		 * Otherwise, update the total length and the number of segments
>>>> +		 * of the current scattered packet, and update the pointer to
>>>> +		 * the last mbuf of the current packet.
>>>> +		 */
>>>> +		if (first_seg =3D=3D NULL) {
>>>> +			first_seg =3D rxm;
>>>> +			first_seg->pkt_len =3D data_len;
>>>> +			first_seg->nb_segs =3D 1;
>>>> +		} else {
>>>> +			first_seg->pkt_len +=3D data_len;
>>>> +			first_seg->nb_segs++;
>>>> +		}
>>>> +
>>>> +		prev_id =3D rx_id;
>>>> +		rx_id =3D next_id;
>>>> +
>>>> +		/*
>>>> +		 * If this is not the last buffer of the received packet, update
>>>> +		 * the pointer to the first mbuf at the NEXTP entry in the
>>>> +		 * sw_rsc_ring and continue to parse the RX ring.
>>>> +		 */
>>>> +		if (!eop) {
>>>> +			rxm->next =3D next_rxe->mbuf;
>>>> +			next_rsc_entry->fbuf =3D first_seg;
>>>> +			goto next_desc;
>>> So _recv_pkts_lro() can return with one of rxq->rsc_entry[i] !=3D NUL=
L, correct?
>>> If so, then I think you need at ixgbe_rx_queue_release_mbufs() to add=
 the code, that would go through
>>> all rsc_entry[] to find one whose fbuf  is !=3D NULL, call rte_pktmbu=
f_free() for it and reset to NULL.
>>>    To handle the case:
>>> recv_pkts_lro(rxq, ...);
>>> rte_eth_dev_stop();
>>> rte_eth_dev_start();
>>> recv_pkts_lro(rxq, ...);
>> Right. I've missed that part.
>>
>>> BTW, that also means that you can't do:
>>> rxm->next =3D next_rxe->mbuf;
>>> above, and
>>> rxm->next =3D NULL;
>>> should be done before 'goto next_desc;' too
>> Your proposal will cost cycles in the fast path on account of saving
>> cycles in the slow path: we'll have to add another pointer to the
>> igb_rsc_entry to hold the last mbuf in the current cluster that we'll
>> have to read and update for every new completed RSC descriptor.
>>
>> The easier way would be to just reset the next-pointer of the last
>> descriptor in the RSC cluster to NULL (according to nb_segs) before
>> calling for rte_pktmbuf_free() in ixgbe_rx_queue_release_mbufs().
> Should work too, I think.

The final solution is even nicer - see v7. And it works like a charm=20
too... ;)

>
>>>> +		}
>>>> +
>>>> +		/*
>>>> +		 * This is the last buffer of the received packet - return
>>>> +		 * the current cluster to the user.
>>>> +		 */
>>>> +		rxm->next =3D NULL;
>>>> +
>>>> +		/* Initialize the first mbuf of the returned packet */
>>>> +		ixgbe_fill_cluster_head_buf(first_seg, &rxd, rxq->port_id,
>>>> +					    staterr);
>>>> +
>>>> +		/* Prefetch data of first segment, if configured to do so. */
>>>> +		rte_packet_prefetch((char *)first_seg->buf_addr +
>>>> +			first_seg->data_off);
>>>> +
>>>> +		/*
>>>> +		 * Store the mbuf address into the next entry of the array
>>>> +		 * of returned packets.
>>>> +		 */
>>>> +		rx_pkts[nb_rx++] =3D first_seg;
>>>> +	}
>>>> +
>>>> +	/*
>>>> +	 * Record index of the next RX descriptor to probe.
>>>> +	 */
>>>> +	rxq->rx_tail =3D rx_id;
>>>> +
>>>> +	/*
>>>> +	 * If the number of free RX descriptors is greater than the RX fre=
e
>>>> +	 * threshold of the queue, advance the Receive Descriptor Tail (RD=
T)
>>>> +	 * register.
>>>> +	 * Update the RDT with the value of the last processed RX descript=
or
>>>> +	 * minus 1, to guarantee that the RDT register is never equal to t=
he
>>>> +	 * RDH register, which creates a "full" ring situtation from the
>>>> +	 * hardware point of view...
>>>> +	 */
>>>> +	if (!bulk_alloc && nb_hold > rxq->rx_free_thresh) {
>>>> +		PMD_RX_LOG(DEBUG, "port_id=3D%u queue_id=3D%u rx_tail=3D%u "
>>>> +			   "nb_hold=3D%u nb_rx=3D%u",
>>>> +			   rxq->port_id, rxq->queue_id, rx_id, nb_hold, nb_rx);
>>>> +
>>> I suppose if you do wmb() after rte_rxmbuf_alloc(), you'd better do i=
t here too.
>> Right! Missed that when copied this code from
>> ixgbe_recv_scattered_pkts()... ;) Note that the barrier is missing the=
re
>> too...
>> These are the examples of the code that works on x86 only because of
>> that "volatile" thing and will break once it's removed. On PPC it is
>> broken even with "volatile".
> Yep, as I said above -for IA we don't need mb() here - using 'volatile'=
 or compiler barrier seems enough to me.
> For PPC - I think we do.
>
>>>> +		IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, prev_id);
>>>> +		nb_hold =3D 0;
>>>> +	}
>>>> +
>>>> +	rxq->nb_rx_hold =3D nb_hold;
>>>> +	return nb_rx;
>>>> +}
>>>> +
>>>> +uint16_t
>>>> +ixgbe_recv_pkts_lro(void *rx_queue, struct rte_mbuf **rx_pkts, uint=
16_t nb_pkts)
>>>> +{
>>>> +	return _recv_pkts_lro(rx_queue, rx_pkts, nb_pkts, false);
>>>> +}
>>>> +
>>>> +uint16_t
>>>> +ixgbe_recv_pkts_lro_bulk_alloc(void *rx_queue, struct rte_mbuf **rx=
_pkts,
>>>> +			       uint16_t nb_pkts)
>>>> +{
>>>> +	return _recv_pkts_lro(rx_queue, rx_pkts, nb_pkts, true);
>>>> +}
>>>> +
>>>>    uint16_t
>>>>    ixgbe_recv_scattered_pkts(void *rx_queue, struct rte_mbuf **rx_pk=
ts,
>>>>    			  uint16_t nb_pkts)
>>>> @@ -2024,6 +2318,7 @@ ixgbe_rx_queue_release(struct igb_rx_queue *rx=
q)
>>>>    	if (rxq !=3D NULL) {
>>>>    		ixgbe_rx_queue_release_mbufs(rxq);
>>>>    		rte_free(rxq->sw_ring);
>>>> +		rte_free(rxq->sw_rsc_ring);
>>>>    		rte_free(rxq);
>>>>    	}
>>>>    }
>>>> @@ -2146,6 +2441,7 @@ ixgbe_reset_rx_queue(struct ixgbe_hw *hw, stru=
ct igb_rx_queue *rxq)
>>>>    	rxq->nb_rx_hold =3D 0;
>>>>    	rxq->pkt_first_seg =3D NULL;
>>>>    	rxq->pkt_last_seg =3D NULL;
>>>> +	rxq->rsc_en =3D 0;
>>>>    }
>>>>
>>>>    int
>>>> @@ -2160,6 +2456,14 @@ ixgbe_dev_rx_queue_setup(struct rte_eth_dev *=
dev,
>>>>    	struct igb_rx_queue *rxq;
>>>>    	struct ixgbe_hw     *hw;
>>>>    	uint16_t len;
>>>> +	struct rte_eth_dev_info dev_info =3D { 0 };
>>>> +	struct rte_eth_rxmode *dev_rx_mode =3D &dev->data->dev_conf.rxmode=
;
>>>> +	bool rsc_requested =3D false;
>>>> +
>>>> +	dev->dev_ops->dev_infos_get(dev, &dev_info);
>>>> +	if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO) &&
>>>> +	    dev_rx_mode->enable_lro)
>>>> +		rsc_requested =3D true;
>>>>
>>>>    	PMD_INIT_FUNC_TRACE();
>>>>    	hw =3D IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
>>>> @@ -2265,12 +2569,28 @@ ixgbe_dev_rx_queue_setup(struct rte_eth_dev =
*dev,
>>>>    	rxq->sw_ring =3D rte_zmalloc_socket("rxq->sw_ring",
>>>>    					  sizeof(struct igb_rx_entry) * len,
>>>>    					  RTE_CACHE_LINE_SIZE, socket_id);
>>>> -	if (rxq->sw_ring =3D=3D NULL) {
>>>> +	if (!rxq->sw_ring) {
>>> Wonder what was wrong with that one? :)
>> Nothing - just aligned it with the lines I've added below. ;)
>>
>>>>    		ixgbe_rx_queue_release(rxq);
>>>>    		return (-ENOMEM);
>>>>    	}
>>>> -	PMD_INIT_LOG(DEBUG, "sw_ring=3D%p hw_ring=3D%p dma_addr=3D0x%"PRIx=
64,
>>>> -		     rxq->sw_ring, rxq->rx_ring, rxq->rx_ring_phys_addr);
>>>> +
>>>> +	if (rsc_requested) {
>>>> +		rxq->sw_rsc_ring =3D
>>>> +			rte_zmalloc_socket("rxq->sw_rsc_ring",
>>>> +					   sizeof(struct igb_rsc_entry) * len,
>>>> +					   RTE_CACHE_LINE_SIZE, socket_id);
>>>> +		if (!rxq->sw_rsc_ring) {
>>>> +			ixgbe_rx_queue_release(rxq);
>>>> +			return (-ENOMEM);
>>>> +		}
>>>> +	} else {
>>>> +		rxq->sw_rsc_ring =3D NULL;
>>>> +	}
>>>> +
>>>> +	PMD_INIT_LOG(DEBUG, "sw_ring=3D%p sw_rsc_ring=3D%p hw_ring=3D%p "
>>>> +			    "dma_addr=3D0x%"PRIx64,
>>>> +		     rxq->sw_ring, rxq->sw_rsc_ring, rxq->rx_ring,
>>>> +		     rxq->rx_ring_phys_addr);
>>>>
>>>>    	if (!rte_is_power_of_2(nb_desc)) {
>>>>    		PMD_INIT_LOG(DEBUG, "queue[%d] doesn't meet Vector Rx "
>>>> @@ -3515,6 +3835,84 @@ ixgbe_dev_mq_tx_configure(struct rte_eth_dev =
*dev)
>>>>    	return 0;
>>>>    }
>>>>
>>>> +/**
>>>> + * get_rscctl_maxdesc - Calculate the RSCCTL[n].MAXDESC for PF
>>>> + *
>>>> + * Return the RSCCTL[n].MAXDESC for 82599 and x540 PF devices accor=
ding to the
>>>> + * spec rev. 3.0 chapter 8.2.3.8.13.
>>>> + *
>>>> + * @pool Memory pool of the Rx queue
>>>> + */
>>>> +static inline uint32_t get_rscctl_maxdesc(struct rte_mempool *pool)
>>>> +{
>>>> +	struct rte_pktmbuf_pool_private *mp_priv =3D rte_mempool_get_priv(=
pool);
>>>> +
>>>> +	/* MAXDESC * SRRCTL.BSIZEPKT must not exceed 64 KB minus one */
>>>> +	uint16_t maxdesc =3D
>>>> +		65535 / (mp_priv->mbuf_data_room_size - RTE_PKTMBUF_HEADROOM);
>>> A  nit: use some macro (UINt16_MAX?) instead of hardcoded constant if=
 possible.
>> Using UINT16_MAX here would be very confusing. The value here just lik=
e
>> values below (16, 8, 4) are values that are explicitly stated in the
>> RSCCTL[n].MAXDESC description in the spec and this code piece is
>> implementing what spec is demanding. Therefore IMHO using the
>> explicit values from the spec here is the most readable way considerin=
g
>> the reader that will try to compare this code to the spec section
>> mentioned above and check that the code is correct.
> Ok, if you think UINT16_MAX is confusing, then just add a new one: IXGB=
E_RSC_MAX_PACKET_SIZE or something.
> As I understand, that's sort of upper limit for the RSC packet size sup=
ported, right?

Why define a macro for a value that is not used anywhere else but=20
here and that is never going to be changed? How does it make the code=20
more readable or robust?

>
>>
>>>> +
>>>> +	if (maxdesc >=3D 16)
>>>> +		return IXGBE_RSCCTL_MAXDESC_16;
>>>> +	else if (maxdesc >=3D 8)
>>>> +		return IXGBE_RSCCTL_MAXDESC_8;
>>>> +	else if (maxdesc >=3D 4)
>>>> +		return IXGBE_RSCCTL_MAXDESC_4;
>>>> +	else
>>>> +		return IXGBE_RSCCTL_MAXDESC_1;
>>>> +}
>>>> +
>>>> +/* (Taken from FreeBSD tree)
>>>> +** Setup the correct IVAR register for a particular MSIX interrupt
>>>> +**   (yes this is all very magic and confusing :)
>>>> +**  - entry is the register array entry
>>>> +**  - vector is the MSIX vector for this queue
>>>> +**  - type is RX/TX/MISC
>>>> +*/
>>>> +static void
>>>> +ixgbe_set_ivar(struct rte_eth_dev *dev, u8 entry, u8 vector, s8 typ=
e)
>>>> +{
>>>> +	struct ixgbe_hw *hw =3D IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_pri=
vate);
>>>> +	u32 ivar, index;
>>>> +
>>>> +	vector |=3D IXGBE_IVAR_ALLOC_VAL;
>>>> +
>>>> +	switch (hw->mac.type) {
>>>> +
>>>> +	case ixgbe_mac_82598EB:
>>>> +		if (type =3D=3D -1)
>>>> +			entry =3D IXGBE_IVAR_OTHER_CAUSES_INDEX;
>>>> +		else
>>>> +			entry +=3D (type * 64);
>>>> +		index =3D (entry >> 2) & 0x1F;
>>>> +		ivar =3D IXGBE_READ_REG(hw, IXGBE_IVAR(index));
>>>> +		ivar &=3D ~(0xFF << (8 * (entry & 0x3)));
>>>> +		ivar |=3D (vector << (8 * (entry & 0x3)));
>>>> +		IXGBE_WRITE_REG(hw, IXGBE_IVAR(index), ivar);
>>>> +		break;
>>>> +
>>>> +	case ixgbe_mac_82599EB:
>>>> +	case ixgbe_mac_X540:
>>>> +		if (type =3D=3D -1) { /* MISC IVAR */
>>>> +			index =3D (entry & 1) * 8;
>>>> +			ivar =3D IXGBE_READ_REG(hw, IXGBE_IVAR_MISC);
>>>> +			ivar &=3D ~(0xFF << index);
>>>> +			ivar |=3D (vector << index);
>>>> +			IXGBE_WRITE_REG(hw, IXGBE_IVAR_MISC, ivar);
>>>> +		} else {	/* RX/TX IVARS */
>>>> +			index =3D (16 * (entry & 1)) + (8 * type);
>>>> +			ivar =3D IXGBE_READ_REG(hw, IXGBE_IVAR(entry >> 1));
>>>> +			ivar &=3D ~(0xFF << index);
>>>> +			ivar |=3D (vector << index);
>>>> +			IXGBE_WRITE_REG(hw, IXGBE_IVAR(entry >> 1), ivar);
>>>> +		}
>>>> +
>>>> +		break;
>>>> +
>>>> +	default:
>>>> +		break;
>>>> +	}
>>>> +}
>>>> +
>>>>    void set_rx_function(struct rte_eth_dev *dev)
>>>>    {
>>>>    	struct ixgbe_hw *hw =3D IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_p=
rivate);
>>>> @@ -3565,6 +3963,25 @@ void set_rx_function(struct rte_eth_dev *dev)
>>>>    			dev->rx_pkt_burst =3D ixgbe_recv_scattered_pkts;
>>>>    		}
>>>>    	}
>>>> +
>>>> +	/*
>>>> +	 * Initialize the appropriate LRO callback.
>>>> +	 *
>>>> +	 * If all queues satisfy the bulk allocation preconditions
>>>> +	 * (hw->rx_bulk_alloc_allowed is TRUE) then we may use bulk alloca=
tion.
>>>> +	 * Otherwise use a single allocation version.
>>>> +	 */
>>>> +	if (dev->data->lro) {
>>>> +		if (hw->rx_bulk_alloc_allowed) {
>>>> +			PMD_INIT_LOG(INFO, "LRO is requested. Using a bulk "
>>>> +					   "allocation version");
>>>> +			dev->rx_pkt_burst =3D ixgbe_recv_pkts_lro_bulk_alloc;
>>>> +		} else {
>>>> +			PMD_INIT_LOG(INFO, "LRO is requested. Using a single "
>>>> +					   "allocation version");
>>>> +			dev->rx_pkt_burst =3D ixgbe_recv_pkts_lro;
>>>> +		}
>>>> +	}
>>>>    }
>>> As I understand, ixgbe_recv_pkts_lro() can handle both LRO and normal=
 scattered packets?
>> Not as it is now. It may be easily patched to do so though.
>>
>>> If that is so, then can we remove ixgbe_recv_scattered_pkts() altogether
>>> and use ixgbe_recv_pkts_lro() for both cases?
>> This was explicitly requested from me by Bruce Richardson (see
>> "[dpdk-dev] : ixgbe: why bulk allocation is not used for a scattered R=
x
>> flow?" thread) to separate the complicated handling from the simple hi=
gh
>> performance one. The handling in the RSC routine is more generic and
>> thus is a bit of overkill for the simple scattered case: e.g. there is
>> no need to a sw_rsc_ring.
> I think Bruce meant ixgbe_recv_pkts_bulk_alloc() not ixgbe_recv_scatter=
ed_pkts()
> when he told about simple and high performance RX path.
>
>> Therefore I preferred to advance with small steps here. And if there
>> will be a decision to join these flows - it may be done with a rather
>> small patch in the future.
> Ok, that's understandable and I wouldn't insist to do that in the same =
patch.
> It just worries me that number of our ixgbe RX functions keeps increasi=
ng.

Let's get this series into master first, and then I'll send a follow-up
series that kills the non-vector scatter callback. Agreed? ;)

>  =20
>
>>>>    /*
>>>> @@ -3583,10 +4000,26 @@ ixgbe_dev_rx_init(struct rte_eth_dev *dev)
>>>>    	uint32_t maxfrs;
>>>>    	uint32_t srrctl;
>>>>    	uint32_t rdrxctl;
>>>> +	uint32_t rscctl;
>>>> +	uint32_t psrtype;
>>>> +	uint32_t rfctl;
>>>>    	uint32_t rxcsum;
>>>>    	uint16_t buf_size;
>>>>    	uint16_t i;
>>>>    	struct rte_eth_rxmode *rx_conf =3D &dev->data->dev_conf.rxmode;
>>>> +	struct rte_eth_dev_info dev_info =3D { 0 };
>>>> +	bool rsc_capable =3D false;
>>>> +
>>>> +	/* Sanity check */
>>>> +	dev->dev_ops->dev_infos_get(dev, &dev_info);
>>>> +	if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO)
>>>> +		rsc_capable =3D true;
>>> @ 7.11.1 82599 spec says:
>>> " Note that in SR-IOV mode the RSC must be disabled globally by setti=
ng the RFCTL.RSC_DIS bit."
>>> Add a check?
>> Good catch! Will add a check. Thanks.
>>
>>>> +
>>>> +	if (!rsc_capable && rx_conf->enable_lro) {
>>>> +		PMD_INIT_LOG(CRIT, "LRO is requested on HW that doesn't "
>>>> +				   "support it");
>>>> +		return -EINVAL;
>>>> +	}
>>>>
>>>>    	PMD_INIT_FUNC_TRACE();
>>>>    	hw =3D IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
>>>> @@ -3606,13 +4039,44 @@ ixgbe_dev_rx_init(struct rte_eth_dev *dev)
>>>>    	IXGBE_WRITE_REG(hw, IXGBE_FCTRL, fctrl);
>>>>
>>>>    	/*
>>>> +	 * RFCTL configuration
>>>> +	 *
>>>> +	 * Since NFS packets coalescing is not supported - clear RFCTL.NFS=
W_DIS
>>>> +	 * and RFCTL.NFSR_DIS when RSC is enabled.
>>>> +	 */
>>>> +	if (rsc_capable) {
>>>> +		rfctl =3D IXGBE_READ_REG(hw, IXGBE_RFCTL);
>>>> +		if (rx_conf->enable_lro) {
>>>> +			rfctl &=3D ~(IXGBE_RFCTL_RSC_DIS | IXGBE_RFCTL_NFSW_DIS |
>>>> +				   IXGBE_RFCTL_NFSR_DIS);
>>>> +		} else {
>>>> +			rfctl |=3D IXGBE_RFCTL_RSC_DIS;
>>>> +		}
>>>> +
>>>> +		IXGBE_WRITE_REG(hw, IXGBE_RFCTL, rfctl);
>>>> +	}
>>>> +
>>>> +
>>>> +	/*
>>>>    	 * Configure CRC stripping, if any.
>>>>    	 */
>>>>    	hlreg0 =3D IXGBE_READ_REG(hw, IXGBE_HLREG0);
>>>>    	if (rx_conf->hw_strip_crc)
>>>>    		hlreg0 |=3D IXGBE_HLREG0_RXCRCSTRP;
>>>> -	else
>>>> +	else {
>>>>    		hlreg0 &=3D ~IXGBE_HLREG0_RXCRCSTRP;
>>>> +		if (rx_conf->enable_lro) {
>>>> +			/*
>>>> +			 * According to chapter of 4.6.7.2.1 of the Spec Rev.
>>>> +			 * 3.0 RSC configuration requires HW CRC stripping being
>>>> +			 * enabled. If user requested both HW CRC stripping off
>>>> +			 * and RSC on - return an error.
>>>> +			 */
>>>> +			PMD_INIT_LOG(CRIT, "LRO can't be enabled when HW CRC "
>>>> +					    "is disabled");
>>>> +			return -EINVAL;
>>>> +		}
>>>> +	}
>>>>
>>>>    	/*
>>>>    	 * Configure jumbo frame support, if any.
>>>> @@ -3664,9 +4128,18 @@ ixgbe_dev_rx_init(struct rte_eth_dev *dev)
>>>>    		 * Configure Header Split
>>>>    		 */
>>>>    		if (rx_conf->header_split) {
>>>> +			/*
>>>> +			 * Print a warning if split_hdr_size is less
>>>> +			 * than 128 bytes when RSC is requested.
>>>> +			 */
>>>> +			if (rx_conf->enable_lro &&
>>>> +			    rx_conf->split_hdr_size < 128)
>>>> +				PMD_INIT_LOG(INFO, "split_hdr_size less than "
>>>> +						   "128 bytes (%d)!",
>>>> +					     rx_conf->split_hdr_size);
>>>> +
>>>>    			if (hw->mac.type =3D=3D ixgbe_mac_82599EB) {
>>>>    				/* Must setup the PSRTYPE register */
>>>> -				uint32_t psrtype;
>>>>    				psrtype =3D IXGBE_PSRTYPE_TCPHDR |
>>>>    					IXGBE_PSRTYPE_UDPHDR   |
>>>>    					IXGBE_PSRTYPE_IPV4HDR  |
>>>> @@ -3679,7 +4152,20 @@ ixgbe_dev_rx_init(struct rte_eth_dev *dev)
>>>>    			srrctl |=3D IXGBE_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS;
>>>>    		} else
>>>>    #endif
>>>> +		{
>>>>    			srrctl =3D IXGBE_SRRCTL_DESCTYPE_ADV_ONEBUF;
>>>> +			/*
>>>> +			 * Following the 4.6.7.2.1 chapter of the 82599/x540
>>>> +			 * Spec if RSC is enabled the SRRCTL[n].BSIZEHEADER
>>>> +			 * should be configured even if header split is not
>>>> +			 * enabled. In the later case we will configure it 128
>>>> +			 * bytes following the recommendation in the spec.
>>>> +			 */
>>>> +			if (rx_conf->enable_lro)
>>>> +				srrctl |=3D
>>>> +				     ((128 << IXGBE_SRRCTL_BSIZEHDRSIZE_SHIFT) &
>>>> +						    IXGBE_SRRCTL_BSIZEHDR_MASK);
>>>> +		}
>>>>
>>>>    		/* Set if packets are dropped when no descriptors available */
>>>>    		if (rxq->drop_en)
>>>> @@ -3696,6 +4182,13 @@ ixgbe_dev_rx_init(struct rte_eth_dev *dev)
>>>>    				       RTE_PKTMBUF_HEADROOM);
>>>>    		srrctl |=3D ((buf_size >> IXGBE_SRRCTL_BSIZEPKT_SHIFT) &
>>>>    			   IXGBE_SRRCTL_BSIZEPKT_MASK);
>>>> +
>>>> +		/*
>>>> +		 * TODO: Consider setting the Receive Descriptor Minimum
>>>> +		 * Threshold Size for and RSC case. This is not an obviously
>>>> +		 * beneficiary option but the one worth considering...
>>>> +		 */
>>>> +
>>>>    		IXGBE_WRITE_REG(hw, IXGBE_SRRCTL(rxq->reg_idx), srrctl);
>>>>
>>>>    		buf_size =3D (uint16_t) ((srrctl & IXGBE_SRRCTL_BSIZEPKT_MASK) =
<<
>>>> @@ -3705,11 +4198,57 @@ ixgbe_dev_rx_init(struct rte_eth_dev *dev)
>>>>    		if (dev->data->dev_conf.rxmode.max_rx_pkt_len +
>>>>    					    2 * IXGBE_VLAN_TAG_SIZE > buf_size)
>>>>    			dev->data->scattered_rx =3D 1;
>>>> +
>>>> +		/* RSC per-queue configuration */
>>>> +		if (rx_conf->enable_lro) {
>>>> +			uint32_t eitr;
>>>> +
>>>> +			rscctl =3D
>>>> +				IXGBE_READ_REG(hw, IXGBE_RSCCTL(rxq->reg_idx));
>>>> +			psrtype =3D
>>>> +				IXGBE_READ_REG(hw, IXGBE_PSRTYPE(rxq->reg_idx));
>>>> +			eitr =3D IXGBE_READ_REG(hw, IXGBE_EITR(rxq->reg_idx));
>>>> +
>>>> +			rscctl |=3D IXGBE_RSCCTL_RSCEN;
>>>> +			rscctl |=3D get_rscctl_maxdesc(rxq->mb_pool);
>>>> +			psrtype |=3D IXGBE_PSRTYPE_TCPHDR;
>>>> +
>>>> +			/*
>>>> +			 * RSC: Set ITR interval corresponding to 2K ints/s.
>>>> +			 *
>>>> +			 * Full-sized RSC aggregations for a 10Gb/s link will
>>>> +			 * arrive at about 20K aggregation/s rate.
>>>> +			 *
>>>> +			 * 2K inst/s rate will make only 10% of the
>>>> +			 * aggregations to be closed due to the interrupt timer
>>>> +			 * expiration for a streaming at wire-speed case.
>>>> +			 *
>>>> +			 * For a sparse streaming case this setting will yield
>>>> +			 * at most 500us latency for a single RSC aggregation.
>>>> +			 */
>>>> +			eitr   |=3D (2000 | IXGBE_EITR_CNT_WDIS);
>>> Again probably create some macro for ITR Interval default value here.
>> Well, again - it's the only place where it's used and I've extensively
>> explained it in the comments in the code. Therefore I think it's the
>> most readable way to write this.
>> If it would be used in at least two places - then I would have put it =
in
>> a macro...
> I think it is a good practise to use macros instead of raw numbers in s=
uch places.
> You probably can make these macros self-explanatory:
> /* EITR interval in 2us units for 1G and 10G. */
> #define IXGBE_EITR_INTERVAL_US	2
>
> #define IXGBE_EITR_INTERVAL_SHIFT	3
>
> #define IXGBE_EITR_INTERVAL(us)	((us) / IXGBE_EITR_INTERVAL_US << IXGBE=
_EITR_INTERVAL_SHIFT)
>
> /* at most 500us latency for a single RSC aggregation */
> #define IXGHE_EITR_INTERVAL_DEFAULT  IXGBE_EITR_INTERVAL(500)

If this value had the potential to be changed one day, or if it were
going to be used somewhere else in the code, I would immediately agree.
But here you've added 9 long lines of something that nobody would ever
care about. The only thing anybody would care about is the actual
implications of this value for the RSC functionality. For understanding
that, macros like the ones you propose don't help much compared with a
proper comment like the one I propose, because this is not just about the
EITR interval and the maximum latency. And if we keep my comment, then we
don't need any additional self-explanatory macros, because everything has
already been explained in the comment.

If one day this parameter becomes configurable from the outside, then I
agree that there would be a place for macros like the ones above. With the
current API, I think they would just bloat the code with useless lines.

>
>>>> +
>>>> +			IXGBE_WRITE_REG(hw, IXGBE_RSCCTL(rxq->reg_idx), rscctl);
>>>> +			IXGBE_WRITE_REG(hw, IXGBE_PSRTYPE(rxq->reg_idx),
>>>> +								       psrtype);
>>>> +			IXGBE_WRITE_REG(hw, IXGBE_EITR(rxq->reg_idx), eitr);
>>>> +
>>>> +			/*
>>>> +			 * RSC requires the mapping of the queue to the
>>>> +			 * interrupt vector.
>>>> +			 */
>>>> +			ixgbe_set_ivar(dev, rxq->reg_idx, i, 0);
>>> Hm, wonder why do we need to setup IVAR for RSC?
>>> Wouldn't just setting EITR be enough?
>> Nope. See 82599 spec chapter 4.6.7.2.2.
> I read it, though it doesn't say 'IVAR must be set up' like it does for
> EITR.Interval.

82599 Spec, Chapter 4.6.7.2.2 ("RSC Enablement" -> "Per Queue Setting"),
the last bullet:

"— Map the relevant Rx queues to an interrupt by setting the relevant IVAR
registers."

> That made me thought that it might be optional.
>
>> I think I even tried not to map
>> the queues to IVAR and it didn't work... ;)
> Pity, but not much we can in that case, I suppose.
>
>>>> +
>>>> +			rxq->rsc_en =3D 1;
>>>> +		}
>>>>    	}
>>>>
>>>>    	if (rx_conf->enable_scatter)
>>>>    		dev->data->scattered_rx =3D 1;
>>>>
>>>> +	if (rx_conf->enable_lro)
>>>> +		dev->data->lro =3D 1;
>>>> +
>>>>    	set_rx_function(dev);
>>>>
>>>>    	/*
>>>> @@ -3742,6 +4281,19 @@ ixgbe_dev_rx_init(struct rte_eth_dev *dev)
>>>>    		IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rdrxctl);
>>>>    	}
>>>>
>>>> +	/* Finalize RSC configuration  */
>>>> +	if (rx_conf->enable_lro) {
>>>> +		/*
>>>> +		 * Follow the instructions in the 4.6.7.2.1 of the Spec Rev. 3.0
>>>> +		 */
>>>> +		rdrxctl =3D IXGBE_READ_REG(hw, IXGBE_RDRXCTL);
>>>> +		rdrxctl |=3D IXGBE_RDRXCTL_RSCACKC;
>>>> +		IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rdrxctl);
>>>> +
>>>> +		PMD_INIT_LOG(INFO, "enabling LRO mode");
>>>> +	}
>>>> +
>>>> +
>>>>    	return 0;
>>>>    }
>>>>
>>>> diff --git a/lib/librte_pmd_ixgbe/ixgbe_rxtx.h b/lib/librte_pmd_ixgb=
e/ixgbe_rxtx.h
>>>> index bbe5ff3..389173f 100644
>>>> --- a/lib/librte_pmd_ixgbe/ixgbe_rxtx.h
>>>> +++ b/lib/librte_pmd_ixgbe/ixgbe_rxtx.h
>>>> @@ -79,6 +79,10 @@ struct igb_rx_entry {
>>>>    	struct rte_mbuf *mbuf; /**< mbuf associated with RX descriptor. =
*/
>>>>    };
>>>>
>>>> +struct igb_rsc_entry {
>>>> +	struct rte_mbuf *fbuf; /**< First segment of the fragmented packet=
. */
>>>> +};
>>>> +
>>>>    /**
>>>>     * Structure associated with each descriptor of the TX ring of a =
TX queue.
>>>>     */
>>>> @@ -105,6 +109,7 @@ struct igb_rx_queue {
>>>>    	volatile uint32_t   *rdt_reg_addr; /**< RDT register address. */
>>>>    	volatile uint32_t   *rdh_reg_addr; /**< RDH register address. */
>>>>    	struct igb_rx_entry *sw_ring; /**< address of RX software ring. =
*/
>>>> +	struct igb_rsc_entry *sw_rsc_ring; /**< address of RSC software ri=
ng. */
>>>>    	struct rte_mbuf *pkt_first_seg; /**< First segment of current pa=
cket. */
>>>>    	struct rte_mbuf *pkt_last_seg; /**< Last segment of current pack=
et. */
>>>>    	uint64_t            mbuf_initializer; /**< value to init mbufs *=
/
>>>> @@ -126,6 +131,7 @@ struct igb_rx_queue {
>>>>    	uint8_t             port_id;  /**< Device port identifier. */
>>>>    	uint8_t             crc_len;  /**< 0 if CRC stripped, 4 otherwis=
e. */
>>>>    	uint8_t             drop_en;  /**< If not 0, set SRRCTL.Drop_En.=
 */
>>>> +	uint8_t             rsc_en;   /**< If not 0, RSC is enabled. */
>>>>    	uint8_t             rx_deferred_start; /**< not in global dev st=
art. */
>>>>    #ifdef RTE_LIBRTE_IXGBE_RX_ALLOW_BULK_ALLOC
>>>>    	/** need to alloc dummy mbuf, for wraparound when scanning hw ri=
ng */
>>>> --
>>>> 2.1.0