From: Jianbo Liu
To: Yongseok Koh
Cc: adrien.mazarguil@6wind.com, nelio.laranjeiro@6wind.com,
	jerin.jacob@caviumnetworks.com, dev@dpdk.org, stable@dpdk.org
Subject: Re: [PATCH v2 8/8] net/mlx5: fix synchronization on polling Rx completions
Date: Tue, 16 Jan 2018 11:53:05 +0800
Message-ID: <20180116035304.GA14809@arm.com>
In-Reply-To: <20180116011050.18866-9-yskoh@mellanox.com>
References: <20171227042824.33373-1-yskoh@mellanox.com>
	<20180116011050.18866-1-yskoh@mellanox.com>
	<20180116011050.18866-9-yskoh@mellanox.com>
List-Id: DPDK patches and discussions

On 01/15/2018 17:10, Yongseok Koh wrote:
> Polling for a new packet is basically sensing the generation bit in a
> completion entry. On processors that do not have a strongly-ordered
> memory model, there has to be an IO memory barrier between reading the
> generation bit and reading the other fields of the entry, in order to
> guarantee the data is not stale.
>
> Fixes: 570acdb1da8a ("net/mlx5: add vectorized Rx/Tx burst for ARM")
> Cc: stable@dpdk.org
>
> Signed-off-by: Yongseok Koh
> Acked-by: Shahaf Shuler
> Acked-by: Nelio Laranjeiro

Acked-by: Jianbo Liu
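
To make the ordering requirement concrete, here is a minimal sketch of
the polling pattern this fix targets. All names (struct cqe, io_rmb(),
poll_one(), sw_owner) are illustrative stand-ins, not mlx5 code; io_rmb()
plays the role of the rte_dma_rmb() used by the patch.

    #include <stdint.h>

    /* Illustrative 64-byte CQE; the real struct mlx5_cqe is more elaborate. */
    struct cqe {
            uint8_t data[63];
            uint8_t op_own;         /* ownership/generation bit is bit 0 */
    };

    /*
     * io_rmb() stands in for rte_dma_rmb(): a read barrier strong enough
     * to order loads from DMA-able memory. On AArch64 this needs a dmb;
     * on x86 a compiler barrier is sufficient.
     */
    #if defined(__aarch64__)
    #define io_rmb()        __asm__ volatile("dmb oshld" ::: "memory")
    #else
    #define io_rmb()        __asm__ volatile("" ::: "memory")
    #endif

    /* Poll one CQE; return 1 and copy it out once HW has written it. */
    static int
    poll_one(const volatile struct cqe *c, uint8_t sw_owner, struct cqe *out)
    {
            unsigned int i;

            /* 1. Sense the generation bit. */
            if ((c->op_own & 0x1) != sw_owner)
                    return 0;       /* HW has not completed this entry yet. */
            /*
             * 2. Without this barrier, a weakly-ordered CPU may perform
             * the loads below before the op_own load, i.e. observe the
             * new generation bit together with stale entry data.
             */
            io_rmb();
            /* 3. Only now is the rest of the entry safe to read. */
            for (i = 0; i < sizeof(out->data); i++)
                    out->data[i] = c->data[i];
            out->op_own = c->op_own;
            return 1;
    }

The diff then applies this rule to the scalar path and to both
vectorized Rx paths: nothing in a CQE other than the block holding
op_own may be loaded before the barrier.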
> ---
>  drivers/net/mlx5/mlx5_rxtx.c          |  1 +
>  drivers/net/mlx5/mlx5_rxtx_vec_neon.h | 53 ++++++++++++++++++++---------------
>  drivers/net/mlx5/mlx5_rxtx_vec_sse.h  |  2 +-
>  3 files changed, 32 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
> index 99a5f8681..8065d9d0b 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.c
> +++ b/drivers/net/mlx5/mlx5_rxtx.c
> @@ -1669,6 +1669,7 @@ mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
>  			return 0;
>  		++rxq->cq_ci;
>  		op_own = cqe->op_own;
> +		rte_dma_rmb();
>  		if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
>  			volatile struct mlx5_mini_cqe8 (*mc)[8] =
>  				(volatile struct mlx5_mini_cqe8 (*)[8])
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> index e11565f69..29ae933e7 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> @@ -814,6 +814,7 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
>  		uint16x4_t mask;
>  		uint16x4_t byte_cnt;
>  		uint32x4_t ptype_info, flow_tag;
> +		register uint64x2_t c0, c1, c2, c3;
>  		uint8_t *p0, *p1, *p2, *p3;
>  		uint8_t *e0 = (void *)&elts[pos]->pkt_len;
>  		uint8_t *e1 = (void *)&elts[pos + 1]->pkt_len;
> @@ -830,6 +831,16 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
>  		p1 = p0 + (pkts_n - pos > 1) * sizeof(struct mlx5_cqe);
>  		p2 = p1 + (pkts_n - pos > 2) * sizeof(struct mlx5_cqe);
>  		p3 = p2 + (pkts_n - pos > 3) * sizeof(struct mlx5_cqe);
> +		/* B.0 (CQE 3) load a block having op_own. */
> +		c3 = vld1q_u64((uint64_t *)(p3 + 48));
> +		/* B.0 (CQE 2) load a block having op_own. */
> +		c2 = vld1q_u64((uint64_t *)(p2 + 48));
> +		/* B.0 (CQE 1) load a block having op_own. */
> +		c1 = vld1q_u64((uint64_t *)(p1 + 48));
> +		/* B.0 (CQE 0) load a block having op_own. */
> +		c0 = vld1q_u64((uint64_t *)(p0 + 48));
> +		/* Synchronize for loading the rest of blocks. */
> +		rte_dma_rmb();
>  		/* Prefetch next 4 CQEs. */
>  		if (pkts_n - pos >= 2 * MLX5_VPMD_DESCS_PER_LOOP) {
>  			unsigned int next = pos + MLX5_VPMD_DESCS_PER_LOOP;
>  			rte_prefetch_non_temporal(&cq[next]);
>  			rte_prefetch_non_temporal(&cq[next + 1]);
>  			rte_prefetch_non_temporal(&cq[next + 2]);
>  			rte_prefetch_non_temporal(&cq[next + 3]);
>  		}
>  		__asm__ volatile (
> -		/* B.1 (CQE 3) load a block having op_own. */
> -		"ld1 {v19.16b}, [%[p3]] \n\t"
> -		"sub %[p3], %[p3], #48 \n\t"
> -		/* B.2 (CQE 3) load the rest blocks. */
> +		/* B.1 (CQE 3) load the rest of blocks. */
>  		"ld1 {v16.16b - v18.16b}, [%[p3]] \n\t"
> +		/* B.2 (CQE 3) move the block having op_own. */
> +		"mov v19.16b, %[c3].16b \n\t"
>  		/* B.3 (CQE 3) extract 16B fields. */
>  		"tbl v23.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
> +		/* B.1 (CQE 2) load the rest of blocks. */
> +		"ld1 {v16.16b - v18.16b}, [%[p2]] \n\t"
>  		/* B.4 (CQE 3) adjust CRC length. */
>  		"sub v23.8h, v23.8h, %[crc_adj].8h \n\t"
> -		/* B.1 (CQE 2) load a block having op_own. */
> -		"ld1 {v19.16b}, [%[p2]] \n\t"
> -		"sub %[p2], %[p2], #48 \n\t"
>  		/* C.1 (CQE 3) generate final structure for mbuf. */
>  		"tbl v15.16b, {v23.16b}, %[mb_shuf_m].16b \n\t"
> -		/* B.2 (CQE 2) load the rest blocks. */
> -		"ld1 {v16.16b - v18.16b}, [%[p2]] \n\t"
> +		/* B.2 (CQE 2) move the block having op_own. */
> +		"mov v19.16b, %[c2].16b \n\t"
>  		/* B.3 (CQE 2) extract 16B fields. */
>  		"tbl v22.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
> +		/* B.1 (CQE 1) load the rest of blocks. */
> +		"ld1 {v16.16b - v18.16b}, [%[p1]] \n\t"
>  		/* B.4 (CQE 2) adjust CRC length. */
>  		"sub v22.8h, v22.8h, %[crc_adj].8h \n\t"
> -		/* B.1 (CQE 1) load a block having op_own. */
> -		"ld1 {v19.16b}, [%[p1]] \n\t"
> -		"sub %[p1], %[p1], #48 \n\t"
>  		/* C.1 (CQE 2) generate final structure for mbuf. */
>  		"tbl v14.16b, {v22.16b}, %[mb_shuf_m].16b \n\t"
> -		/* B.2 (CQE 1) load the rest blocks. */
> -		"ld1 {v16.16b - v18.16b}, [%[p1]] \n\t"
> +		/* B.2 (CQE 1) move the block having op_own. */
> +		"mov v19.16b, %[c1].16b \n\t"
>  		/* B.3 (CQE 1) extract 16B fields. */
>  		"tbl v21.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
> +		/* B.1 (CQE 0) load the rest of blocks. */
> +		"ld1 {v16.16b - v18.16b}, [%[p0]] \n\t"
>  		/* B.4 (CQE 1) adjust CRC length. */
>  		"sub v21.8h, v21.8h, %[crc_adj].8h \n\t"
> -		/* B.1 (CQE 0) load a block having op_own. */
> -		"ld1 {v19.16b}, [%[p0]] \n\t"
> -		"sub %[p0], %[p0], #48 \n\t"
>  		/* C.1 (CQE 1) generate final structure for mbuf. */
>  		"tbl v13.16b, {v21.16b}, %[mb_shuf_m].16b \n\t"
> -		/* B.2 (CQE 0) load the rest blocks. */
> -		"ld1 {v16.16b - v18.16b}, [%[p0]] \n\t"
> +		/* B.2 (CQE 0) move the block having op_own. */
> +		"mov v19.16b, %[c0].16b \n\t"
> +		/* A.1 load mbuf pointers. */
> +		"ld1 {v24.2d - v25.2d}, [%[elts_p]] \n\t"
>  		/* B.3 (CQE 0) extract 16B fields. */
>  		"tbl v20.16b, {v16.16b - v19.16b}, %[cqe_shuf_m].16b \n\t"
>  		/* B.4 (CQE 0) adjust CRC length. */
>  		"sub v20.8h, v20.8h, %[crc_adj].8h \n\t"
> -		/* A.1 load mbuf pointers. */
> -		"ld1 {v24.2d - v25.2d}, [%[elts_p]] \n\t"
>  		/* D.1 extract op_own byte. */
>  		"tbl %[op_own].8b, {v20.16b - v23.16b}, %[owner_shuf_m].8b \n\t"
>  		/* C.2 (CQE 3) adjust flow mark. */
> @@ -917,9 +924,9 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
>  		 [byte_cnt]"=&w"(byte_cnt),
>  		 [ptype_info]"=&w"(ptype_info),
>  		 [flow_tag]"=&w"(flow_tag)
> -		:[p3]"r"(p3 + 48), [p2]"r"(p2 + 48),
> -		 [p1]"r"(p1 + 48), [p0]"r"(p0 + 48),
> +		:[p3]"r"(p3), [p2]"r"(p2), [p1]"r"(p1), [p0]"r"(p0),
>  		 [e3]"r"(e3), [e2]"r"(e2), [e1]"r"(e1), [e0]"r"(e0),
> +		 [c3]"w"(c3), [c2]"w"(c2), [c1]"w"(c1), [c0]"w"(c0),
>  		 [elts_p]"r"(elts_p),
>  		 [pkts_p]"r"(pkts_p),
>  		 [cqe_shuf_m]"w"(cqe_shuf_m),
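
The new load schedule in the NEON path, written with intrinsics instead
of inline asm, looks roughly like the sketch below. It is illustrative
only: load_cqes(), blk[][] and IO_RMB() are made-up names, with IO_RMB()
standing in for rte_dma_rmb(). The point is that the 16B block holding
op_own is loaded before the barrier and everything else after it.

    #include <arm_neon.h>
    #include <stdint.h>

    #define IO_RMB()        __asm__ volatile("dmb oshld" ::: "memory")

    /*
     * Load four 64B CQEs so that no data block can be older than the
     * op_own block that validates it. blk[i][3] holds CQE i's last 16B
     * (the block containing op_own); blk[i][0..2] hold the rest.
     */
    static inline void
    load_cqes(const uint8_t *p0, const uint8_t *p1,
              const uint8_t *p2, const uint8_t *p3,
              uint64x2_t blk[4][4])
    {
            int i;

            /* B.0: blocks having op_own, loaded first. */
            blk[3][3] = vld1q_u64((const uint64_t *)(p3 + 48));
            blk[2][3] = vld1q_u64((const uint64_t *)(p2 + 48));
            blk[1][3] = vld1q_u64((const uint64_t *)(p1 + 48));
            blk[0][3] = vld1q_u64((const uint64_t *)(p0 + 48));
            /* Keep the loads below from being reordered above this point. */
            IO_RMB();
            /* B.1: the rest of each CQE is now safe to read. */
            for (i = 0; i < 3; i++) {
                    blk[3][i] = vld1q_u64((const uint64_t *)(p3 + 16 * i));
                    blk[2][i] = vld1q_u64((const uint64_t *)(p2 + 16 * i));
                    blk[1][i] = vld1q_u64((const uint64_t *)(p1 + 16 * i));
                    blk[0][i] = vld1q_u64((const uint64_t *)(p0 + 16 * i));
            }
    }

The SSE path below gets the same treatment, with the barrier replacing
what was previously only a compiler barrier: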
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> index 559b0237e..6c4d1c3d5 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> @@ -833,7 +833,7 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
>  		/* B.2 copy mbuf pointers. */
>  		_mm_storeu_si128((__m128i *)&pkts[pos], mbp1);
>  		_mm_storeu_si128((__m128i *)&pkts[pos + 2], mbp2);
> -		rte_compiler_barrier();
> +		rte_dma_rmb();
>  		/* C.1 load remained CQE data and extract necessary fields. */
>  		cqe_tmp2 = _mm_load_si128((__m128i *)&cq[pos + p3]);
>  		cqe_tmp1 = _mm_load_si128((__m128i *)&cq[pos + p2]);
> --
> 2.11.0