Re: [PATCH 5/5] examples/l3fwd: add neon support for l3fwd

All of lore.kernel.org
 help / color / mirror / Atom feed

From: "Sekhar, Ashwin" <Ashwin.Sekhar@cavium.com>
To: "tomasz.kantecki@intel.com" <tomasz.kantecki@intel.com>,
	"Jacob,  Jerin" <Jerin.JacobKollanukkaran@cavium.com>,
	"jianbo.liu@linaro.org" <jianbo.liu@linaro.org>,
	"dev@dpdk.org" <dev@dpdk.org>
Subject: Re: [PATCH 5/5] examples/l3fwd: add neon support for l3fwd
Date: Tue, 2 May 2017 11:20:35 +0000	[thread overview]
Message-ID: <1493724035.3602.14.camel@caviumnetworks.com> (raw)
In-Reply-To: <1493709255-8887-5-git-send-email-jianbo.liu@linaro.org>

Hi,

Please find comments inline.

On Tue, 2017-05-02 at 15:14 +0800, Jianbo Liu wrote:
> Use ARM NEON intrinsics to accelerate l3 forwarding.
> 
> Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
> ---
>  examples/l3fwd/l3fwd.h             |   4 -
>  examples/l3fwd/l3fwd_em.c          |   4 +-
>  examples/l3fwd/l3fwd_em_hlm.h      |   5 +
>  examples/l3fwd/l3fwd_em_hlm_neon.h |  74 +++++++++++
>  examples/l3fwd/l3fwd_em_single.h   |   4 +
>  examples/l3fwd/l3fwd_lpm.c         |   4 +-
>  examples/l3fwd/l3fwd_lpm_neon.h    | 157 ++++++++++++++++++++++
>  examples/l3fwd/l3fwd_neon.h        | 259
> +++++++++++++++++++++++++++++++++++++
>  8 files changed, 504 insertions(+), 7 deletions(-)
>  create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
>  create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
>  create mode 100644 examples/l3fwd/l3fwd_neon.h
> 
> diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h
> index 011ba14..c45589a 100644
> --- a/examples/l3fwd/l3fwd.h
> +++ b/examples/l3fwd/l3fwd.h
> @@ -40,10 +40,6 @@
>  
>  #define RTE_LOGTYPE_L3FWD RTE_LOGTYPE_USER1
>  
> -#if !defined(NO_HASH_MULTI_LOOKUP) &&
> defined(RTE_MACHINE_CPUFLAG_NEON)
> -#define NO_HASH_MULTI_LOOKUP 1
> -#endif
> -
>  #define MAX_PKT_BURST     32
>  #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
>  
> diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
> index cccf797..ac1e2e0 100644
> --- a/examples/l3fwd/l3fwd_em.c
> +++ b/examples/l3fwd/l3fwd_em.c
> @@ -328,7 +328,7 @@ struct ipv6_l3fwd_em_route {
>  	return (uint8_t)((ret < 0) ? portid :
> ipv6_l3fwd_out_if[ret]);
>  }
>  
> -#if defined(__SSE4_1__)
> +#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
>  #if defined(NO_HASH_MULTI_LOOKUP)
>  #include "l3fwd_em_single.h"
>  #else
> @@ -709,7 +709,7 @@ struct ipv6_l3fwd_em_route {
>  			if (nb_rx == 0)
>  				continue;
>  
> -#if defined(__SSE4_1__)
> +#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
>  			l3fwd_em_send_packets(nb_rx, pkts_burst,
>  							portid,
> qconf);
>  #else
> diff --git a/examples/l3fwd/l3fwd_em_hlm.h
> b/examples/l3fwd/l3fwd_em_hlm.h
> index 636dea4..3329c1a 100644
> --- a/examples/l3fwd/l3fwd_em_hlm.h
> +++ b/examples/l3fwd/l3fwd_em_hlm.h
> @@ -35,8 +35,13 @@
>  #ifndef __L3FWD_EM_HLM_H__
>  #define __L3FWD_EM_HLM_H__
>  
> +#if defined(__SSE4_1__)
>  #include "l3fwd_sse.h"
>  #include "l3fwd_em_hlm_sse.h"
> +#elif defined(RTE_MACHINE_CPUFLAG_NEON)
> +#include "l3fwd_neon.h"
> +#include "l3fwd_em_hlm_neon.h"
> +#endif
>  
>  static inline __attribute__((always_inline)) void
>  em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf
> *m[8],
> diff --git a/examples/l3fwd/l3fwd_em_hlm_neon.h
> b/examples/l3fwd/l3fwd_em_hlm_neon.h
> new file mode 100644
> index 0000000..dae1acf
> --- /dev/null
> +++ b/examples/l3fwd/l3fwd_em_hlm_neon.h
> @@ -0,0 +1,74 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2016 Intel Corporation. All rights reserved.
> + *   Copyright(c) 2017, Linaro Limited
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or
> without
> + *   modification, are permitted provided that the following
> conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above
> copyright
> + *       notice, this list of conditions and the following
> disclaimer.
> + *     * Redistributions in binary form must reproduce the above
> copyright
> + *       notice, this list of conditions and the following
> disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products
> derived
> + *       from this software without specific prior written
> permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
> ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
> TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +
> +#ifndef __L3FWD_EM_HLM_NEON_H__
> +#define __L3FWD_EM_HLM_NEON_H__
> +
> +#include <arm_neon.h>
> +
> +static inline void
> +get_ipv4_5tuple(struct rte_mbuf *m0, int32x4_t mask0,
> +		union ipv4_5tuple_host *key)
> +{
> +	int32x4_t tmpdata0 = vld1q_s32(rte_pktmbuf_mtod_offset(m0,
> int32_t *,
> +				sizeof(struct ether_hdr) +
> +				offsetof(struct ipv4_hdr,
> time_to_live)));
> +
> +	key->xmm = vandq_s32(tmpdata0, mask0);
> +}
> +
> +static inline void
> +get_ipv6_5tuple(struct rte_mbuf *m0, int32x4_t mask0,
> +		int32x4_t mask1, union ipv6_5tuple_host *key)
> +{
> +	int32x4_t tmpdata0 = vld1q_s32(
> +			rte_pktmbuf_mtod_offset(m0, int *,
> +				sizeof(struct ether_hdr) +
> +				offsetof(struct ipv6_hdr,
> payload_len)));
> +
> +	int32x4_t tmpdata1 = vld1q_s32(
> +			rte_pktmbuf_mtod_offset(m0, int *,
> +				sizeof(struct ether_hdr) +
> +				offsetof(struct ipv6_hdr,
> payload_len) + 8));
> +
> +	int32x4_t tmpdata2 = vld1q_s32(
> +			rte_pktmbuf_mtod_offset(m0, int *,
> +				sizeof(struct ether_hdr) +
> +				offsetof(struct ipv6_hdr,
> payload_len) + 16));
> +
> +	key->xmm[0] = vandq_s32(tmpdata0, mask0);
> +	key->xmm[1] = tmpdata1;
> +	key->xmm[2] = vandq_s32(tmpdata2, mask1);
> +}
> +#endif /* __L3FWD_EM_HLM_NEON_H__ */
> diff --git a/examples/l3fwd/l3fwd_em_single.h
> b/examples/l3fwd/l3fwd_em_single.h
> index c0a9725..8604571 100644
> --- a/examples/l3fwd/l3fwd_em_single.h
> +++ b/examples/l3fwd/l3fwd_em_single.h
> @@ -43,7 +43,11 @@
>   * compilation time.
>   */
>  
> +#if defined(__SSE4_1__)
>  #include "l3fwd_sse.h"
> +#elif defined(RTE_MACHINE_CPUFLAG_NEON)
> +#include "l3fwd_neon.h"
> +#endif
>  
>  static inline __attribute__((always_inline)) uint16_t
>  em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf
> *pkt,
> diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
> index fc554fc..ddef250 100644
> --- a/examples/l3fwd/l3fwd_lpm.c
> +++ b/examples/l3fwd/l3fwd_lpm.c
> @@ -189,6 +189,8 @@ static inline __attribute__((always_inline))
> uint16_t
>  
>  #if defined(__SSE4_1__)
>  #include "l3fwd_lpm_sse.h"
> +#elif defined(RTE_MACHINE_CPUFLAG_NEON)
> +#include "l3fwd_lpm_neon.h"
>  #else
>  #include "l3fwd_lpm.h"
>  #endif
> @@ -261,7 +263,7 @@ static inline __attribute__((always_inline))
> uint16_t
>  			if (nb_rx == 0)
>  				continue;
>  
> -#if defined(__SSE4_1__)
> +#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
>  			l3fwd_lpm_send_packets(nb_rx, pkts_burst,
>  						portid, qconf);
>  #else
> diff --git a/examples/l3fwd/l3fwd_lpm_neon.h
> b/examples/l3fwd/l3fwd_lpm_neon.h
> new file mode 100644
> index 0000000..772e54b
> --- /dev/null
> +++ b/examples/l3fwd/l3fwd_lpm_neon.h
> @@ -0,0 +1,157 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
> + *   Copyright(c) 2017, Linaro Limited
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or
> without
> + *   modification, are permitted provided that the following
> conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above
> copyright
> + *       notice, this list of conditions and the following
> disclaimer.
> + *     * Redistributions in binary form must reproduce the above
> copyright
> + *       notice, this list of conditions and the following
> disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products
> derived
> + *       from this software without specific prior written
> permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
> ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
> TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +
> +#ifndef __L3FWD_LPM_NEON_H__
> +#define __L3FWD_LPM_NEON_H__
> +
> +#include <arm_neon.h>
> +
> +#include "l3fwd_neon.h"
> +
> +/*
> + * Read packet_type and destination IPV4 addresses from 4 mbufs.
> + */
> +static inline void
> +processx4_step1(struct rte_mbuf *pkt[FWDSTEP],
> +		int32x4_t *dip,
> +		uint32_t *ipv4_flag)
> +{
> +	struct ipv4_hdr *ipv4_hdr;
> +	struct ether_hdr *eth_hdr;
> +	int32_t dst[FWDSTEP];
> +
> +	eth_hdr = rte_pktmbuf_mtod(pkt[0], struct ether_hdr *);
> +	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
> +	dst[0] = ipv4_hdr->dst_addr;
> +	ipv4_flag[0] = pkt[0]->packet_type & RTE_PTYPE_L3_IPV4;
> +
> +	eth_hdr = rte_pktmbuf_mtod(pkt[1], struct ether_hdr *);
> +	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
> +	dst[1] = ipv4_hdr->dst_addr;
> +	ipv4_flag[0] &= pkt[1]->packet_type;
> +
> +	eth_hdr = rte_pktmbuf_mtod(pkt[2], struct ether_hdr *);
> +	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
> +	dst[2] = ipv4_hdr->dst_addr;
> +	ipv4_flag[0] &= pkt[2]->packet_type;
> +
> +	eth_hdr = rte_pktmbuf_mtod(pkt[3], struct ether_hdr *);
> +	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
> +	dst[3] = ipv4_hdr->dst_addr;
> +	ipv4_flag[0] &= pkt[3]->packet_type;
> +
> +	dip[0] = vld1q_s32(dst);
> +}
> +
> +/*
> + * Lookup into LPM for destination port.
> + * If lookup fails, use incoming port (portid) as destination port.
> + */
> +static inline void
> +processx4_step2(const struct lcore_conf *qconf,
> +		int32x4_t dip,
> +		uint32_t ipv4_flag,
> +		uint8_t portid,
> +		struct rte_mbuf *pkt[FWDSTEP],
> +		uint16_t dprt[FWDSTEP])
> +{
> +	rte_xmm_t dst;
> +	uint8x16_t bswap_mask = {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9,
> 8,
> +				 15, 14, 13, 12};
> +
> +	/* Byte swap 4 IPV4 addresses. */
> +	dip =
> vreinterpretq_s32_u8(vqtbl1q_u8(vreinterpretq_u8_s32(dip),
> +					      bswap_mask));
> +
This byte swap can be done more simply with vrev32q_u8, which removes
the need for bswap_mask. The TBL instruction also has higher latency
than the REV32 instruction on thunderx, thunderx2t99 and cortexa57.

> +	/* if all 4 packets are IPV4. */
> +	if (likely(ipv4_flag)) {
> +		rte_lpm_lookupx4(qconf->ipv4_lookup_struct, dip,
> dst.u32,
> +			portid);
> +		/* get rid of unused upper 16 bit for each dport. */
> +		vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
> +	} else {
> +		dst.x = dip;
> +		dprt[0] = lpm_get_dst_port_with_ipv4(qconf, pkt[0],
> +						     dst.u32[0],
> portid);
> +		dprt[1] = lpm_get_dst_port_with_ipv4(qconf, pkt[1],
> +						     dst.u32[1],
> portid);
> +		dprt[2] = lpm_get_dst_port_with_ipv4(qconf, pkt[2],
> +						     dst.u32[2],
> portid);
> +		dprt[3] = lpm_get_dst_port_with_ipv4(qconf, pkt[3],
> +						     dst.u32[3],
> portid);
> +	}
> +}
> +
> +/*
> + * Buffer optimized handling of packets, invoked
> + * from main_loop.
> + */
> +static inline void
> +l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
> +			uint8_t portid, struct lcore_conf *qconf)
> +{
> +	int32_t j;
> +	uint16_t dst_port[MAX_PKT_BURST];
> +	int32x4_t dip[MAX_PKT_BURST / FWDSTEP];
> +	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
> +	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
> +
> +	for (j = 0; j != k; j += FWDSTEP)
> +		processx4_step1(&pkts_burst[j], &dip[j / FWDSTEP],
> +				&ipv4_flag[j / FWDSTEP]);
> +
> +	for (j = 0; j != k; j += FWDSTEP)
> +		processx4_step2(qconf, dip[j / FWDSTEP],
> +				ipv4_flag[j / FWDSTEP], portid,
> &pkts_burst[j],
> +				&dst_port[j]);
> +
> +	/* Classify last up to 3 packets one by one */
> +	switch (nb_rx % FWDSTEP) {
> +	case 3:
> +		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
> portid);
> +		j++;
> +		/* fallthrough */
> +	case 2:
> +		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
> portid);
> +		j++;
> +		/* fallthrough */
> +	case 1:
> +		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
> portid);
> +		j++;
> +	}
> +
> +	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
> +}
> +
> +#endif /* __L3FWD_LPM_NEON_H__ */
> diff --git a/examples/l3fwd/l3fwd_neon.h
> b/examples/l3fwd/l3fwd_neon.h
> new file mode 100644
> index 0000000..75c8976
> --- /dev/null
> +++ b/examples/l3fwd/l3fwd_neon.h
> @@ -0,0 +1,259 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2016 Intel Corporation. All rights reserved.
> + *   Copyright(c) 2017, Linaro Limited
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or
> without
> + *   modification, are permitted provided that the following
> conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above
> copyright
> + *       notice, this list of conditions and the following
> disclaimer.
> + *     * Redistributions in binary form must reproduce the above
> copyright
> + *       notice, this list of conditions and the following
> disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products
> derived
> + *       from this software without specific prior written
> permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
> ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
> TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +
> +
> +#ifndef _L3FWD_NEON_H_
> +#define _L3FWD_NEON_H_
> +
> +#include "l3fwd.h"
> +#include "l3fwd_common.h"
> +
> +/*
> + * Update source and destination MAC addresses in the ethernet
> header.
> + * Perform RFC1812 checks and updates for IPV4 packets.
> + */
> +static inline void
> +processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t
> dst_port[FWDSTEP])
> +{
> +	uint32x4_t te[FWDSTEP];
> +	uint32x4_t ve[FWDSTEP];
> +	uint32_t *p[FWDSTEP];
> +
> +	p[0] = rte_pktmbuf_mtod(pkt[0], uint32_t *);
> +	p[1] = rte_pktmbuf_mtod(pkt[1], uint32_t *);
> +	p[2] = rte_pktmbuf_mtod(pkt[2], uint32_t *);
> +	p[3] = rte_pktmbuf_mtod(pkt[3], uint32_t *);
> +
> +	ve[0] = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
> +	te[0] = vld1q_u32(p[0]);
> +
> +	ve[1] = vreinterpretq_u32_s32(val_eth[dst_port[1]]);
> +	te[1] = vld1q_u32(p[1]);
> +
> +	ve[2] = vreinterpretq_u32_s32(val_eth[dst_port[2]]);
> +	te[2] = vld1q_u32(p[2]);
> +
> +	ve[3] = vreinterpretq_u32_s32(val_eth[dst_port[3]]);
> +	te[3] = vld1q_u32(p[3]);
> +
> +	/* Update last 4 bytes */
> +	ve[0] = vsetq_lane_u32(vgetq_lane_u32(te[0], 3), ve[0], 3);
> +	ve[1] = vsetq_lane_u32(vgetq_lane_u32(te[1], 3), ve[1], 3);
> +	ve[2] = vsetq_lane_u32(vgetq_lane_u32(te[2], 3), ve[2], 3);
> +	ve[3] = vsetq_lane_u32(vgetq_lane_u32(te[3], 3), ve[3], 3);
> +
> +	vst1q_u32(p[0], ve[0]);
> +	vst1q_u32(p[1], ve[1]);
> +	vst1q_u32(p[2], ve[2]);
> +	vst1q_u32(p[3], ve[3]);
> +
> +	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[0]
> + 1),
> +		&dst_port[0], pkt[0]->packet_type);
> +	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[1]
> + 1),
> +		&dst_port[1], pkt[1]->packet_type);
> +	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[2]
> + 1),
> +		&dst_port[2], pkt[2]->packet_type);
> +	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[3]
> + 1),
> +		&dst_port[3], pkt[3]->packet_type);
> +}
> +
> +/*
> + * Group consecutive packets with the same destination port in
> bursts of 4.
> + * Suppose we have an array of destination ports:
> + * dst_port[] = {a, b, c, d, e, ... }
> + * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
> + * We do 4 comparisons at once and the result is a 4 bit mask.
> + * This mask is used as an index into a prebuilt array of pnum values.
> + */
> +static inline uint16_t *
> +port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
> +	     uint16x8_t dp2)
> +{
> +	union {
> +		uint16_t u16[FWDSTEP + 1];
> +		uint64_t u64;
> +	} *pnum = (void *)pn;
> +
> +	int32_t v;
> +	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
> +
> +	dp1 = vceqq_u16(dp1, dp2);
> +	dp1 = vandq_u16(dp1, mask);
> +	v = vaddvq_u16(dp1);
> +
> +	/* update last port counter. */
> +	lp[0] += gptbl[v].lpv;
> +
> +	/* if dest port value has changed. */
> +	if (v != GRPMSK) {
> +		pnum->u64 = gptbl[v].pnum;
> +		pnum->u16[FWDSTEP] = 1;
> +		lp = pnum->u16 + gptbl[v].idx;
> +	}
> +
> +	return lp;
> +}
> +
> +/**
> + * Process one packet:
> + * Update source and destination MAC addresses in the ethernet
> header.
> + * Perform RFC1812 checks and updates for IPV4 packets.
> + */
> +static inline void
> +process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)
> +{
> +	struct ether_hdr *eth_hdr;
> +	uint32x4_t te, ve;
> +
> +	eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
> +
> +	te = vld1q_u32((uint32_t *)eth_hdr);
> +	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
> +
> +
> +	rfc1812_process((struct ipv4_hdr *)(eth_hdr + 1), dst_port,
> +			pkt->packet_type);
> +
> +	ve = vsetq_lane_u32(vgetq_lane_u32(te, 3), ve, 3);
> +	vst1q_u32((uint32_t *)eth_hdr, ve);
> +}
> +
> +/**
> + * Send packets burst from pkts_burst to the ports in dst_port array
> + */
> +static inline __attribute__((always_inline)) void
> +send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf
> **pkts_burst,
> +		uint16_t dst_port[MAX_PKT_BURST], int nb_rx)
> +{
> +	int32_t k;
> +	int j = 0;
> +	uint16_t dlp;
> +	uint16_t *lp;
> +	uint16_t pnum[MAX_PKT_BURST + 1];
> +
> +	/*
> +	 * Finish packet processing and group consecutive
> +	 * packets with the same destination port.
> +	 */
> +	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
> +	if (k != 0) {
> +		uint16x8_t dp1, dp2;
> +
> +		lp = pnum;
> +		lp[0] = 1;
> +
> +		processx4_step3(pkts_burst, dst_port);
> +
> +		/* dp1: <d[0], d[1], d[2], d[3], ... > */
> +		dp1 = vld1q_u16(dst_port);
> +
> +		for (j = FWDSTEP; j != k; j += FWDSTEP) {
> +			processx4_step3(&pkts_burst[j],
> &dst_port[j]);
> +
> +			/*
> +			 * dp2:
> +			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
> +			 */
> +			dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);
> +			lp  = port_groupx4(&pnum[j - FWDSTEP], lp,
> dp1, dp2);
> +
> +			/*
> +			 * dp1:
> +			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
> +			 */
> +			dp1 = vextq_u16(dp1, dp1, FWDSTEP - 1);
> +		}
> +
> +		/*
> +		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
> +		 */
> +		dp2 = vextq_u16(dp1, dp1, 1);
> +		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2,
> 3);
> +		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1,
> dp2);
> +
> +		/*
> +		 * remove values added by the last repeated
> +		 * dst port.
> +		 */
> +		lp[0]--;
> +		dlp = dst_port[j - 1];
> +	} else {
> +		/* set dlp and lp to the never used values. */
> +		dlp = BAD_PORT - 1;
> +		lp = pnum + MAX_PKT_BURST;
> +	}
> +
> +	/* Process up to last 3 packets one by one. */
> +	switch (nb_rx % FWDSTEP) {
> +	case 3:
> +		process_packet(pkts_burst[j], dst_port + j);
> +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
> +		j++;
> +		/* fallthrough */
> +	case 2:
> +		process_packet(pkts_burst[j], dst_port + j);
> +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
> +		j++;
> +		/* fallthrough */
> +	case 1:
> +		process_packet(pkts_burst[j], dst_port + j);
> +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
> +		j++;
> +	}
> +
> +	/*
> +	 * Send packets out, through destination port.
> +	 * Consecutive packets with the same destination port
> +	 * are already grouped together.
> +	 * If destination port for the packet equals BAD_PORT,
> +	 * then free the packet without sending it out.
> +	 */
> +	for (j = 0; j < nb_rx; j += k) {
> +
> +		int32_t m;
> +		uint16_t pn;
> +
> +		pn = dst_port[j];
> +		k = pnum[j];
> +
> +		if (likely(pn != BAD_PORT))
> +			send_packetsx4(qconf, pn, pkts_burst + j,
> k);
> +		else
> +			for (m = j; m != j + k; m++)
> +				rte_pktmbuf_free(pkts_burst[m]);
> +
> +	}
> +}
> +
> +#endif /* _L3FWD_NEON_H_ */

Thanks and Regards
Ashwin

next prev parent reply	other threads:[~2017-05-02 11:20 UTC|newest]

Thread overview: 62+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-05-02  7:14 [PATCH 1/5] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
2017-05-02  7:14 ` [PATCH 2/5] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_single.h Jianbo Liu
2017-05-02  9:40   ` Sekhar, Ashwin
2017-05-02  7:14 ` [PATCH 3/5] examples/l3fwd: extract common code from multi packet send Jianbo Liu
2017-05-02  7:14 ` [PATCH 4/5] examples/l3fwd: rearrange the code for lpm_l3fwd Jianbo Liu
2017-05-02  7:14 ` [PATCH 5/5] examples/l3fwd: add neon support for l3fwd Jianbo Liu
2017-05-02 11:20   ` Sekhar, Ashwin [this message]
2017-05-02 11:47   ` Sekhar, Ashwin
2017-05-03  5:24     ` Jianbo Liu
2017-05-04  8:42       ` Jianbo Liu
2017-05-05  4:24         ` Sekhar, Ashwin
2017-05-05  5:43           ` Jianbo Liu
2017-05-09  8:10             ` Sekhar, Ashwin
2017-05-10  2:39               ` Jianbo Liu
2017-05-10  2:30 ` [PATCH v2 0/7] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
2017-05-10  2:30   ` [PATCH v2 1/7] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
2017-05-10  2:30   ` [PATCH v2 2/7] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h Jianbo Liu
2017-05-10  2:30   ` [PATCH v2 3/7] examples/l3fwd: extract common code from multi packet send Jianbo Liu
2017-05-10  2:30   ` [PATCH v2 4/7] examples/l3fwd: rearrange the code for lpm_l3fwd Jianbo Liu
2017-05-10  2:30   ` [PATCH v2 5/7] examples/l3fwd: add neon support for l3fwd Jianbo Liu
2017-05-10 15:00     ` Sekhar, Ashwin
2017-05-11  3:16       ` Jianbo Liu
2017-05-11  4:14         ` Sekhar, Ashwin
2017-05-11  4:27           ` Sekhar, Ashwin
2017-05-11  6:11             ` Jianbo Liu
2017-05-10  2:30   ` [PATCH v2 6/7] examples/l3fwd: add the times of hash multi-lookup for different Archs Jianbo Liu
2017-05-10  2:30   ` [PATCH v2 7/7] examples/l3fwd: change the guard micro name for header file Jianbo Liu
2017-05-10 11:57     ` Sekhar, Ashwin
2017-05-11  9:25 ` [PATCH v3 0/7] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
2017-05-11  9:25   ` [PATCH v3 1/7] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
2017-05-11  9:25   ` [PATCH v3 2/7] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h Jianbo Liu
2017-05-11  9:25   ` [PATCH v3 3/7] examples/l3fwd: extract common code from multi packet send Jianbo Liu
2017-05-11  9:25   ` [PATCH v3 4/7] examples/l3fwd: rearrange the code for lpm_l3fwd Jianbo Liu
2017-05-11  9:25   ` [PATCH v3 5/7] examples/l3fwd: add neon support for l3fwd Jianbo Liu
2017-05-11  9:49     ` Sekhar, Ashwin
2017-05-11 10:01       ` Jianbo Liu
2017-05-11 10:27         ` Sekhar, Ashwin
2017-05-12  2:40           ` Jianbo Liu
2017-05-11  9:25   ` [PATCH v3 6/7] examples/l3fwd: add the times of hash multi-lookup for different Archs Jianbo Liu
2017-05-11  9:25   ` [PATCH v3 7/7] examples/l3fwd: change the guard macro name for header file Jianbo Liu
2017-05-15  3:34 ` [PATCH v4 0/8] accelerate examples/l3fwd with NEON on ARM64 platform Jianbo Liu
2017-05-15  3:34   ` [PATCH v4 1/8] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
2017-05-15  3:34   ` [PATCH v4 2/8] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h Jianbo Liu
2017-05-15  3:34   ` [PATCH v4 3/8] examples/l3fwd: extract common code from multi packet send Jianbo Liu
2017-05-15  3:34   ` [PATCH v4 4/8] examples/l3fwd: rearrange the code for lpm_l3fwd Jianbo Liu
2017-05-15  3:34   ` [PATCH v4 5/8] arch/arm: add vcopyq_laneq_u32 for old version of gcc Jianbo Liu
2017-05-15  4:01     ` Jerin Jacob
2017-05-15  3:34   ` [PATCH v4 6/8] examples/l3fwd: add neon support for l3fwd Jianbo Liu
2017-05-15  5:22     ` Sekhar, Ashwin
2017-05-15  3:34   ` [PATCH v4 7/8] examples/l3fwd: add the times of hash multi-lookup for different Archs Jianbo Liu
2017-05-15  3:34   ` [PATCH v4 8/8] examples/l3fwd: change the guard macro name for header file Jianbo Liu
2017-07-03 21:02   ` [PATCH v4 0/8] accelerate examples/l3fwd with NEON on ARM64 platform Thomas Monjalon
2017-07-04 10:23 ` [PATCH v5 " Jianbo Liu
2017-07-04 10:23   ` [PATCH v5 1/8] examples/l3fwd: extract arch independent code from multi hash lookup Jianbo Liu
2017-07-04 10:23   ` [PATCH v5 2/8] examples/l3fwd: rename l3fwd_em_sse.h to l3fwd_em_sequential.h Jianbo Liu
2017-07-04 10:24   ` [PATCH v5 3/8] examples/l3fwd: extract common code from multi packet send Jianbo Liu
2017-07-04 10:24   ` [PATCH v5 4/8] examples/l3fwd: rearrange the code for lpm_l3fwd Jianbo Liu
2017-07-04 10:24   ` [PATCH v5 5/8] arch/arm: add vcopyq_laneq_u32 for old version of gcc Jianbo Liu
2017-07-04 10:24   ` [PATCH v5 6/8] examples/l3fwd: add neon support for l3fwd Jianbo Liu
2017-07-04 10:24   ` [PATCH v5 7/8] examples/l3fwd: add the times of hash multi-lookup for different Archs Jianbo Liu
2017-07-04 10:24   ` [PATCH v5 8/8] examples/l3fwd: change the guard macro name for header file Jianbo Liu
2017-07-04 15:11   ` [PATCH v5 0/8] accelerate examples/l3fwd with NEON on ARM64 platform Thomas Monjalon

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1493724035.3602.14.camel@caviumnetworks.com \
    --to=ashwin.sekhar@cavium.com \
    --cc=Jerin.JacobKollanukkaran@cavium.com \
    --cc=dev@dpdk.org \
    --cc=jianbo.liu@linaro.org \
    --cc=tomasz.kantecki@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.