From: Jianbo Liu
To: Yongseok Koh
Cc: adrien.mazarguil@6wind.com, nelio.laranjeiro@6wind.com,
	jerin.jacob@caviumnetworks.com, dev@dpdk.org, stable@dpdk.org
Subject: Re: [PATCH v2 8/8] net/mlx5: fix synchronization on polling Rx completions
Date: Tue, 16 Jan 2018 11:53:05 +0800
Message-ID: <20180116035304.GA14809@arm.com>
In-Reply-To: <20180116011050.18866-9-yskoh@mellanox.com>
References: <20171227042824.33373-1-yskoh@mellanox.com>
	<20180116011050.18866-1-yskoh@mellanox.com>
	<20180116011050.18866-9-yskoh@mellanox.com>
List-Id: DPDK patches and discussions

On 01/15/2018 17:10, Yongseok Koh wrote:
> Polling for a new packet is basically sensing the generation bit in a
> completion entry. On processors that do not have a strongly-ordered
> memory model, there has to be an IO memory barrier between reading the
> generation bit and reading the other fields of the entry, in order to
> guarantee the data is not stale.
>
> Fixes: 570acdb1da8a ("net/mlx5: add vectorized Rx/Tx burst for ARM")
> Cc: stable@dpdk.org
>
> Signed-off-by: Yongseok Koh
> Acked-by: Shahaf Shuler
> Acked-by: Nelio Laranjeiro

Acked-by: Jianbo Liu
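
To make the ordering requirement concrete, here is a minimal sketch of
the polling pattern this fix targets. All names (struct cqe, io_rmb(),
poll_one(), sw_owner) are illustrative stand-ins, not mlx5 code; io_rmb()
plays the role of the rte_dma_rmb() used by the patch.

    #include <stdint.h>

    /* Illustrative 64-byte CQE; the real struct mlx5_cqe is more elaborate. */
    struct cqe {
            uint8_t data[63];
            uint8_t op_own;         /* ownership/generation bit is bit 0 */
    };

    /*
     * io_rmb() stands in for rte_dma_rmb(): a read barrier strong enough
     * to order loads from DMA-able memory. On AArch64 this needs a dmb;
     * on x86 a compiler barrier is sufficient.
     */
    #if defined(__aarch64__)
    #define io_rmb()        __asm__ volatile("dmb oshld" ::: "memory")
    #else
    #define io_rmb()        __asm__ volatile("" ::: "memory")
    #endif

    /* Poll one CQE; return 1 and copy it out once HW has written it. */
    static int
    poll_one(const volatile struct cqe *c, uint8_t sw_owner, struct cqe *out)
    {
            unsigned int i;

            /* 1. Sense the generation bit. */
            if ((c->op_own & 0x1) != sw_owner)
                    return 0;       /* HW has not completed this entry yet. */
            /*
             * 2. Without this barrier, a weakly-ordered CPU may perform
             * the loads below before the op_own load, i.e. observe the
             * new generation bit together with stale entry data.
             */
            io_rmb();
            /* 3. Only now is the rest of the entry safe to read. */
            for (i = 0; i < sizeof(out->data); i++)
                    out->data[i] = c->data[i];
            out->op_own = c->op_own;
            return 1;
    }

The diff then applies this rule to the scalar path and to both
vectorized Rx paths: nothing in a CQE other than the block holding
op_own may be loaded before the barrier.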
> ---
>  drivers/net/mlx5/mlx5_rxtx.c          |  1 +
>  drivers/net/mlx5/mlx5_rxtx_vec_neon.h | 53 ++++++++++++++++++++---------------
>  drivers/net/mlx5/mlx5_rxtx_vec_sse.h  |  2 +-
>  3 files changed, 32 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
> index 99a5f8681..8065d9d0b 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.c
> +++ b/drivers/net/mlx5/mlx5_rxtx.c
> @@ -1669,6 +1669,7 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
>  			return 0;
>  		++rxq->cq_ci;
>  		op_own = cqe->op_own;
> +		rte_dma_rmb();
>  		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
>  			volatile struct mlx5_mini_cqe8 (*mc)[8] =
>  				(volatile struct mlx5_mini_cqe8 (*)[8])
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> index e11565f69..29ae933e7 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> @@ -814,6 +814,7 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
>  		uint16x4_t mask;
>  		uint16x4_t byte_cnt;
>  		uint32x4_t ptype_info, flow_tag;
> +		register uint64x2_t c0, c1, c2, c3;
>  		uint8_t *p0, *p1, *p2, *p3;
>  		uint8_t *e0 = (void *)&elts[pos]->pkt_len;
>  		uint8_t *e1 = (void *)&elts[pos + 1]->pkt_len;
> @@ -830,6 +831,16 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
>  		p1 = p0 + (pkts_n - pos > 1) * sizeof(struct mlx5_cqe);
>  		p2 = p1 + (pkts_n - pos > 2) * sizeof(struct mlx5_cqe);
>  		p3 = p2 + (pkts_n - pos > 3) * sizeof(struct mlx5_cqe);
> +		/* B.0 (CQE 3) load a block having op_own. */
> +		c3 = vld1q_u64((uint64_t *)(p3 + 48));
> +		/* B.0 (CQE 2) load a block having op_own. */
> +		c2 = vld1q_u64((uint64_t *)(p2 + 48));
> +		/* B.0 (CQE 1) load a block having op_own. */
> +		c1 = vld1q_u64((uint64_t *)(p1 + 48));
> +		/* B.0 (CQE 0) load a block having op_own. */
> +		c0 = vld1q_u64((uint64_t *)(p0 + 48));
> +		/* Synchronize for loading the rest of blocks. */
> +		rte_dma_rmb();
>  		/* Prefetch next 4 CQEs. */
>  		if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
>  			unsigned int next = pos + MLX5_VPMD_DESCS_PER_LOOP;
>  			rte_prefetch_non_temporal(&cq[next]);
>  			rte_prefetch_non_temporal(&cq[next + 1]);
>  			rte_prefetch_non_temporal(&cq[next + 2]);
>  			rte_prefetch_non_temporal(&cq[next + 3]);
>  		}
>  		__asm__ volatile (
> -		/* B.1 (CQE 3) load a block having op_own. */
> -		"ld1 {v19.16b}, [%[p3]] \n\t"
> -		"sub %[p3], %[p3], #48 \n\t"
> -		/* B.2 (CQE 3) load the rest blocks. */
> +		/* B.1 (CQE 3) load the rest of blocks. */
>  		"ld1 {v16.16b - v18.16b}, [%[p3]] \n\t"
> +		/* B.2 (CQE 3) move the block having op_own. */
> +		"mov v19.16b, %[c3].16b \n\t"
>  		/* B.3 (CQE 3) extract 16B fields. */
>  		"tbl v23.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
> +		/* B.1 (CQE 2) load the rest of blocks. */
> +		"ld1 {v16.16b - v18.16b}, [%[p2]] \n\t"
>  		/* B.4 (CQE 3) adjust CRC length. */
>  		"sub v23.8h, v23.8h, %[crc_adj].8h \n\t"
> -		/* B.1 (CQE 2) load a block having op_own. */
> -		"ld1 {v19.16b}, [%[p2]] \n\t"
> -		"sub %[p2], %[p2], #48 \n\t"
>  		/* C.1 (CQE 3) generate final structure for mbuf. */
>  		"tbl v15.16b, {v23.16b}, %[mb_shuf_m].16b \n\t"
> -		/* B.2 (CQE 2) load the rest blocks. */
> -		"ld1 {v16.16b - v18.16b}, [%[p2]] \n\t"
> +		/* B.2 (CQE 2) move the block having op_own. */
> +		"mov v19.16b, %[c2].16b \n\t"
>  		/* B.3 (CQE 2) extract 16B fields. */
>  		"tbl v22.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
> +		/* B.1 (CQE 1) load the rest of blocks. */
> +		"ld1 {v16.16b - v18.16b}, [%[p1]] \n\t"
>  		/* B.4 (CQE 2) adjust CRC length. */
>  		"sub v22.8h, v22.8h, %[crc_adj].8h \n\t"
> -		/* B.1 (CQE 1) load a block having op_own. */
> -		"ld1 {v19.16b}, [%[p1]] \n\t"
> -		"sub %[p1], %[p1], #48 \n\t"
>  		/* C.1 (CQE 2) generate final structure for mbuf. */
>  		"tbl v14.16b, {v22.16b}, %[mb_shuf_m].16b \n\t"
> -		/* B.2 (CQE 1) load the rest blocks. */
> -		"ld1 {v16.16b - v18.16b}, [%[p1]] \n\t"
> +		/* B.2 (CQE 1) move the block having op_own. */
> +		"mov v19.16b, %[c1].16b \n\t"
>  		/* B.3 (CQE 1) extract 16B fields. */
>  		"tbl v21.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
> +		/* B.1 (CQE 0) load the rest of blocks. */
> +		"ld1 {v16.16b - v18.16b}, [%[p0]] \n\t"
>  		/* B.4 (CQE 1) adjust CRC length. */
>  		"sub v21.8h, v21.8h, %[crc_adj].8h \n\t"
> -		/* B.1 (CQE 0) load a block having op_own. */
> -		"ld1 {v19.16b}, [%[p0]] \n\t"
> -		"sub %[p0], %[p0], #48 \n\t"
>  		/* C.1 (CQE 1) generate final structure for mbuf. */
>  		"tbl v13.16b, {v21.16b}, %[mb_shuf_m].16b \n\t"
> -		/* B.2 (CQE 0) load the rest blocks. */
> -		"ld1 {v16.16b - v18.16b}, [%[p0]] \n\t"
> +		/* B.2 (CQE 0) move the block having op_own. */
> +		"mov v19.16b, %[c0].16b \n\t"
> +		/* A.1 load mbuf pointers. */
> +		"ld1 {v24.2d - v25.2d}, [%[elts_p]] \n\t"
>  		/* B.3 (CQE 0) extract 16B fields. */
>  		"tbl v20.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
>  		/* B.4 (CQE 0) adjust CRC length. */
>  		"sub v20.8h, v20.8h, %[crc_adj].8h \n\t"
> -		/* A.1 load mbuf pointers. */
> -		"ld1 {v24.2d - v25.2d}, [%[elts_p]] \n\t"
>  		/* D.1 extract op_own byte. */
>  		"tbl %[op_own].8b, {v20.16b - v23.16b}, %[owner_shuf_m].8b \n\t"
>  		/* C.2 (CQE 3) adjust flow mark. */
> @@ -917,9 +924,9 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
>  		 [byte_cnt]"=&w"(byte_cnt),
>  		 [ptype_info]"=&w"(ptype_info),
>  		 [flow_tag]"=&w"(flow_tag)
> -		:[p3]"r"(p3 + 48), [p2]"r"(p2 + 48),
> -		 [p1]"r"(p1 + 48), [p0]"r"(p0 + 48),
> +		:[p3]"r"(p3), [p2]"r"(p2), [p1]"r"(p1), [p0]"r"(p0),
>  		 [e3]"r"(e3), [e2]"r"(e2), [e1]"r"(e1), [e0]"r"(e0),
> +		 [c3]"w"(c3), [c2]"w"(c2), [c1]"w"(c1), [c0]"w"(c0),
>  		 [elts_p]"r"(elts_p),
>  		 [pkts_p]"r"(pkts_p),
>  		 [cqe_shuf_m]"w"(cqe_shuf_m),
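
The new load schedule in the NEON path, written with intrinsics instead
of inline asm, looks roughly like the sketch below. It is illustrative
only: load_cqes(), blk[][] and IO_RMB() are made-up names, with IO_RMB()
standing in for rte_dma_rmb(). The point is that the 16B block holding
op_own is loaded before the barrier and everything else after it.

    #include <arm_neon.h>
    #include <stdint.h>

    #define IO_RMB()        __asm__ volatile("dmb oshld" ::: "memory")

    /*
     * Load four 64B CQEs so that no data block can be older than the
     * op_own block that validates it. blk[i][3] holds CQE i's last 16B
     * (the block containing op_own); blk[i][0..2] hold the rest.
     */
    static inline void
    load_cqes(const uint8_t *p0, const uint8_t *p1,
              const uint8_t *p2, const uint8_t *p3,
              uint64x2_t blk[4][4])
    {
            int i;

            /* B.0: blocks having op_own, loaded first. */
            blk[3][3] = vld1q_u64((const uint64_t *)(p3 + 48));
            blk[2][3] = vld1q_u64((const uint64_t *)(p2 + 48));
            blk[1][3] = vld1q_u64((const uint64_t *)(p1 + 48));
            blk[0][3] = vld1q_u64((const uint64_t *)(p0 + 48));
            /* Keep the loads below from being reordered above this point. */
            IO_RMB();
            /* B.1: the rest of each CQE is now safe to read. */
            for (i = 0; i < 3; i++) {
                    blk[3][i] = vld1q_u64((const uint64_t *)(p3 + 16 * i));
                    blk[2][i] = vld1q_u64((const uint64_t *)(p2 + 16 * i));
                    blk[1][i] = vld1q_u64((const uint64_t *)(p1 + 16 * i));
                    blk[0][i] = vld1q_u64((const uint64_t *)(p0 + 16 * i));
            }
    }

The SSE path below gets the same treatment, with the barrier replacing
what was previously only a compiler barrier: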
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> index 559b0237e..6c4d1c3d5 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> @@ -833,7 +833,7 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
>  		/* B.2 copy mbuf pointers. */
>  		_mm_storeu_si128((__m128i *)&pkts[pos], mbp1);
>  		_mm_storeu_si128((__m128i *)&pkts[pos + 2], mbp2);
> -		rte_compiler_barrier();
> +		rte_dma_rmb();
>  		/* C.1 load remained CQE data and extract necessary fields. */
>  		cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p3]);
>  		cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos + p2]);
> --
> 2.11.0