Netdev List
 help / color / mirror / Atom feed
* [net-next 13/17] net/mlx5e: RX, Split WQ objects for different RQ types
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Tariq Toukan, Saeed Mahameed
In-Reply-To: <20180602000544.18717-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Replace the common RQ WQ object with two separate ones for the
different RQ types.
This is in preparation for switching to using a cyclic WQ type
in Legacy RQ.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |   4 +-
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 128 ++++++++++++------
 .../net/ethernet/mellanox/mlx5/core/en_rx.c   |  35 +++--
 3 files changed, 110 insertions(+), 57 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 9b4ed83783e4..f2f2dcf6b23c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -498,10 +498,9 @@ enum mlx5e_rq_flag {
 
 struct mlx5e_rq {
 	/* data path */
-	struct mlx5_wq_ll      wq;
-
 	union {
 		struct {
+			struct mlx5_wq_ll      wq;
 			struct mlx5e_wqe_frag_info *frag_info;
 			u32 frag_sz;	/* max possible skb frag_sz */
 			union {
@@ -509,6 +508,7 @@ struct mlx5e_rq {
 			};
 		} wqe;
 		struct {
+			struct mlx5_wq_ll      wq;
 			struct mlx5e_umr_wqe   umr_wqe;
 			struct mlx5e_mpw_info *info;
 			mlx5e_fp_skb_from_cqe_mpwrq skb_from_cqe_mpwrq;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 3f1f0552843c..3a007717cba5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -319,10 +319,30 @@ static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq,
 	ucseg->mkey_mask     = cpu_to_be64(MLX5_MKEY_MASK_FREE);
 }
 
+static u32 mlx5e_rqwq_get_size(struct mlx5e_rq *rq)
+{
+	switch (rq->wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		return mlx5_wq_ll_get_size(&rq->mpwqe.wq);
+	default:
+		return mlx5_wq_ll_get_size(&rq->wqe.wq);
+	}
+}
+
+static u32 mlx5e_rqwq_get_cur_sz(struct mlx5e_rq *rq)
+{
+	switch (rq->wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		return rq->mpwqe.wq.cur_sz;
+	default:
+		return rq->wqe.wq.cur_sz;
+	}
+}
+
 static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq,
 				     struct mlx5e_channel *c)
 {
-	int wq_sz = mlx5_wq_ll_get_size(&rq->wq);
+	int wq_sz = mlx5_wq_ll_get_size(&rq->mpwqe.wq);
 
 	rq->mpwqe.info = kzalloc_node(wq_sz * sizeof(*rq->mpwqe.info),
 				      GFP_KERNEL, cpu_to_node(c->cpu));
@@ -370,7 +390,7 @@ static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
 
 static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq *rq)
 {
-	u64 num_mtts = MLX5E_REQUIRED_MTTS(mlx5_wq_ll_get_size(&rq->wq));
+	u64 num_mtts = MLX5E_REQUIRED_MTTS(mlx5_wq_ll_get_size(&rq->mpwqe.wq));
 
 	return mlx5e_create_umr_mkey(mdev, num_mtts, PAGE_SHIFT, &rq->umr_mkey);
 }
@@ -397,15 +417,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 	rqp->wq.db_numa_node = cpu_to_node(c->cpu);
 
-	err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wq,
-				&rq->wq_ctrl);
-	if (err)
-		return err;
-
-	rq->wq.db = &rq->wq.db[MLX5_RCV_DBR];
-
-	wq_sz = mlx5_wq_ll_get_size(&rq->wq);
-
 	rq->wq_type = params->rq_wq_type;
 	rq->pdev    = c->pdev;
 	rq->netdev  = c->netdev;
@@ -434,8 +445,17 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 	switch (rq->wq_type) {
 	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->mpwqe.wq,
+					&rq->wq_ctrl);
+		if (err)
+			return err;
+
+		rq->mpwqe.wq.db = &rq->mpwqe.wq.db[MLX5_RCV_DBR];
+
+		wq_sz = mlx5_wq_ll_get_size(&rq->mpwqe.wq);
 
 		pool_size = MLX5_MPWRQ_PAGES_PER_WQE << mlx5e_mpwqe_get_log_rq_size(params);
+
 		rq->post_wqes = mlx5e_post_rx_mpwqes;
 		rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe;
 
@@ -472,6 +492,15 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 			goto err_destroy_umr_mkey;
 		break;
 	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+		err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wqe.wq,
+					&rq->wq_ctrl);
+		if (err)
+			return err;
+
+		rq->wqe.wq.db = &rq->wqe.wq.db[MLX5_RCV_DBR];
+
+		wq_sz = mlx5_wq_ll_get_size(&rq->wqe.wq);
+
 		rq->wqe.frag_info =
 			kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info),
 				     GFP_KERNEL, cpu_to_node(c->cpu));
@@ -538,16 +567,21 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		goto err_rq_wq_destroy;
 
 	for (i = 0; i < wq_sz; i++) {
-		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
-
 		if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
+			struct mlx5e_rx_wqe *wqe =
+				mlx5_wq_ll_get_wqe(&rq->mpwqe.wq, i);
 			u64 dma_offset = mlx5e_get_mpwqe_offset(rq, i);
 
 			wqe->data.addr = cpu_to_be64(dma_offset + rq->buff.headroom);
-		}
+			wqe->data.byte_count = cpu_to_be32(byte_count);
+			wqe->data.lkey = rq->mkey_be;
+		} else {
+			struct mlx5e_rx_wqe *wqe =
+				mlx5_wq_ll_get_wqe(&rq->wqe.wq, i);
 
-		wqe->data.byte_count = cpu_to_be32(byte_count);
-		wqe->data.lkey = rq->mkey_be;
+			wqe->data.byte_count = cpu_to_be32(byte_count);
+			wqe->data.lkey = rq->mkey_be;
+		}
 	}
 
 	INIT_WORK(&rq->dim.work, mlx5e_rx_dim_work);
@@ -744,51 +778,65 @@ static int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq, int wait_time)
 	unsigned long exp_time = jiffies + msecs_to_jiffies(wait_time);
 	struct mlx5e_channel *c = rq->channel;
 
-	struct mlx5_wq_ll *wq = &rq->wq;
-	u16 min_wqes = mlx5_min_rx_wqes(rq->wq_type, mlx5_wq_ll_get_size(wq));
+	u16 min_wqes = mlx5_min_rx_wqes(rq->wq_type, mlx5e_rqwq_get_size(rq));
 
 	do {
-		if (wq->cur_sz >= min_wqes)
+		if (mlx5e_rqwq_get_cur_sz(rq) >= min_wqes)
 			return 0;
 
 		msleep(20);
 	} while (time_before(jiffies, exp_time));
 
 	netdev_warn(c->netdev, "Failed to get min RX wqes on Channel[%d] RQN[0x%x] wq cur_sz(%d) min_rx_wqes(%d)\n",
-		    c->ix, rq->rqn, wq->cur_sz, min_wqes);
+		    c->ix, rq->rqn, mlx5e_rqwq_get_cur_sz(rq), min_wqes);
 
 	return -ETIMEDOUT;
 }
 
 static void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
 {
-	struct mlx5_wq_ll *wq = &rq->wq;
-	struct mlx5e_rx_wqe *wqe;
 	__be16 wqe_ix_be;
 	u16 wqe_ix;
 
-	/* UMR WQE (if in progress) is always at wq->head */
-	if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ &&
-	    rq->mpwqe.umr_in_progress)
-		mlx5e_free_rx_mpwqe(rq, &rq->mpwqe.info[wq->head]);
-
-	while (!mlx5_wq_ll_is_empty(wq)) {
-		wqe_ix_be = *wq->tail_next;
-		wqe_ix    = be16_to_cpu(wqe_ix_be);
-		wqe       = mlx5_wq_ll_get_wqe(&rq->wq, wqe_ix);
-		rq->dealloc_wqe(rq, wqe_ix);
-		mlx5_wq_ll_pop(&rq->wq, wqe_ix_be,
-			       &wqe->next.next_wqe_index);
-	}
+	if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
+		struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
+
+		if (rq->mpwqe.umr_in_progress)
+			mlx5e_free_rx_mpwqe(rq, &rq->mpwqe.info[wq->head]);
+
+		while (!mlx5_wq_ll_is_empty(wq)) {
+			struct mlx5e_rx_wqe *wqe;
+
+			wqe_ix_be = *wq->tail_next;
+			wqe_ix    = be16_to_cpu(wqe_ix_be);
+			wqe       = mlx5_wq_ll_get_wqe(wq, wqe_ix);
+			rq->dealloc_wqe(rq, wqe_ix);
+			mlx5_wq_ll_pop(wq, wqe_ix_be,
+				       &wqe->next.next_wqe_index);
+		}
+	} else {
+		struct mlx5_wq_ll *wq = &rq->wqe.wq;
+
+		while (!mlx5_wq_ll_is_empty(wq)) {
+			struct mlx5e_rx_wqe *wqe;
+
+			wqe_ix_be = *wq->tail_next;
+			wqe_ix    = be16_to_cpu(wqe_ix_be);
+			wqe       = mlx5_wq_ll_get_wqe(wq, wqe_ix);
+			rq->dealloc_wqe(rq, wqe_ix);
+			mlx5_wq_ll_pop(wq, wqe_ix_be,
+				       &wqe->next.next_wqe_index);
+		}
 
-	if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST && rq->wqe.page_reuse) {
 		/* Clean outstanding pages on handled WQEs that decided to do page-reuse,
 		 * but yet to be re-posted.
 		 */
-		int wq_sz = mlx5_wq_ll_get_size(&rq->wq);
+		if (rq->wqe.page_reuse) {
+			int wq_sz = mlx5_wq_ll_get_size(wq);
 
-		for (wqe_ix = 0; wqe_ix < wq_sz; wqe_ix++)
-			rq->dealloc_wqe(rq, wqe_ix);
+			for (wqe_ix = 0; wqe_ix < wq_sz; wqe_ix++)
+				rq->dealloc_wqe(rq, wqe_ix);
+		}
 	}
 }
 
@@ -2809,7 +2857,7 @@ static int mlx5e_alloc_drop_rq(struct mlx5_core_dev *mdev,
 
 	param->wq.db_numa_node = param->wq.buf_numa_node;
 
-	err = mlx5_wq_ll_create(mdev, &param->wq, rqc_wq, &rq->wq,
+	err = mlx5_wq_ll_create(mdev, &param->wq, rqc_wq, &rq->wqe.wq,
 				&rq->wq_ctrl);
 	if (err)
 		return err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 634540afdcfc..3b12d4de5b98 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -113,7 +113,7 @@ static inline void mlx5e_decompress_cqe(struct mlx5e_rq *rq,
 			mpwrq_get_cqe_consumed_strides(&cq->title);
 	else
 		cq->decmprs_wqe_counter =
-			mlx5_wq_ll_ctr2ix(&rq->wq, cq->decmprs_wqe_counter + 1);
+			mlx5_wq_ll_ctr2ix(&rq->wqe.wq, cq->decmprs_wqe_counter + 1);
 }
 
 static inline void mlx5e_decompress_cqe_no_hash(struct mlx5e_rq *rq,
@@ -369,7 +369,7 @@ void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi)
 
 static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq)
 {
-	struct mlx5_wq_ll *wq = &rq->wq;
+	struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
 	struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(wq, wq->head);
 
 	rq->mpwqe.umr_in_progress = false;
@@ -470,7 +470,7 @@ void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 
 bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 {
-	struct mlx5_wq_ll *wq = &rq->wq;
+	struct mlx5_wq_ll *wq = &rq->wqe.wq;
 	int err;
 
 	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
@@ -546,7 +546,7 @@ static void mlx5e_poll_ico_cq(struct mlx5e_cq *cq, struct mlx5e_rq *rq)
 
 bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
 {
-	struct mlx5_wq_ll *wq = &rq->wq;
+	struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
 
 	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
 		return false;
@@ -987,6 +987,7 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
 
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
+	struct mlx5_wq_ll *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
 	struct mlx5e_rx_wqe *wqe;
 	__be16 wqe_counter_be;
@@ -996,7 +997,7 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	wqe_counter_be = cqe->wqe_counter;
 	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
+	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
 	wi             = &rq->wqe.frag_info[wqe_counter];
 	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
 
@@ -1018,7 +1019,7 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	mlx5e_free_rx_wqe_reuse(rq, wi);
 wq_ll_pop:
-	mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
+	mlx5_wq_ll_pop(wq, wqe_counter_be,
 		       &wqe->next.next_wqe_index);
 }
 
@@ -1029,6 +1030,7 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 	struct mlx5e_rep_priv *rpriv  = priv->ppriv;
 	struct mlx5_eswitch_rep *rep = rpriv->rep;
+	struct mlx5_wq_ll *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
 	struct mlx5e_rx_wqe *wqe;
 	struct sk_buff *skb;
@@ -1038,7 +1040,7 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	wqe_counter_be = cqe->wqe_counter;
 	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
+	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
 	wi             = &rq->wqe.frag_info[wqe_counter];
 	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
 
@@ -1063,7 +1065,7 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	mlx5e_free_rx_wqe_reuse(rq, wi);
 wq_ll_pop:
-	mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
+	mlx5_wq_ll_pop(wq, wqe_counter_be,
 		       &wqe->next.next_wqe_index);
 }
 #endif
@@ -1164,6 +1166,7 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	u32 head_offset    = wqe_offset & (PAGE_SIZE - 1);
 	u32 page_idx       = wqe_offset >> PAGE_SHIFT;
 	struct mlx5e_rx_wqe *wqe;
+	struct mlx5_wq_ll *wq;
 	struct sk_buff *skb;
 	u16 cqe_bcnt;
 
@@ -1193,9 +1196,10 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	if (likely(wi->consumed_strides < rq->mpwqe.num_strides))
 		return;
 
-	wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_id);
+	wq  = &rq->mpwqe.wq;
+	wqe = mlx5_wq_ll_get_wqe(wq, wqe_id);
 	mlx5e_free_rx_mpwqe(rq, wi);
-	mlx5_wq_ll_pop(&rq->wq, cqe->wqe_id, &wqe->next.next_wqe_index);
+	mlx5_wq_ll_pop(wq, cqe->wqe_id, &wqe->next.next_wqe_index);
 }
 
 int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
@@ -1399,6 +1403,7 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq,
 
 void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
+	struct mlx5_wq_ll *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
 	struct mlx5e_rx_wqe *wqe;
 	__be16 wqe_counter_be;
@@ -1408,7 +1413,7 @@ void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	wqe_counter_be = cqe->wqe_counter;
 	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
+	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
 	wi             = &rq->wqe.frag_info[wqe_counter];
 	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
 
@@ -1425,7 +1430,7 @@ void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 wq_free_wqe:
 	mlx5e_free_rx_wqe_reuse(rq, wi);
-	mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
+	mlx5_wq_ll_pop(wq, wqe_counter_be,
 		       &wqe->next.next_wqe_index);
 }
 
@@ -1435,6 +1440,7 @@ void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
+	struct mlx5_wq_ll *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
 	struct mlx5e_rx_wqe *wqe;
 	__be16 wqe_counter_be;
@@ -1444,7 +1450,7 @@ void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	wqe_counter_be = cqe->wqe_counter;
 	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
+	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
 	wi             = &rq->wqe.frag_info[wqe_counter];
 	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
 
@@ -1465,8 +1471,7 @@ void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	mlx5e_free_rx_wqe_reuse(rq, wi);
 wq_ll_pop:
-	mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
-		       &wqe->next.next_wqe_index);
+	mlx5_wq_ll_pop(wq, wqe_counter_be, &wqe->next.next_wqe_index);
 }
 
 #endif /* CONFIG_MLX5_EN_IPSEC */
-- 
2.17.0

^ permalink raw reply related

* [net-next 12/17] net/mlx5e: RX, Remove HW LRO support in legacy RQ
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Tariq Toukan, Saeed Mahameed
In-Reply-To: <20180602000544.18717-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Current LRO implementation in Legacy RQ uses high-order pages.
In downstream patches of this series we complete the transition
to using only order-0 pages in RX datapath (which was already done
in Striding RQ).

Unlike the more advanced Striding RQ, Legacy RQ does not reuse
any non-consumed buffers of non-full LRO sessions, and combining
it with order-0 pages has many performance drawbacks.

Hence, here we totally remove LRO support in Legacy RQ.
This guarantees having no out-of-order completions, which allows using
a cyclic work queue (instead of a linked-list) in a downstream patch.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../ethernet/mellanox/mlx5/core/en_ethtool.c  |  7 ++++
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 33 +++++++++++--------
 2 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 42bd256e680d..fffe514ba855 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -1515,6 +1515,9 @@ static int set_pflag_rx_striding_rq(struct net_device *netdev, bool enable)
 			return -EOPNOTSUPP;
 		if (!mlx5e_striding_rq_possible(mdev, &priv->channels.params))
 			return -EINVAL;
+	} else if (priv->channels.params.lro_en) {
+		netdev_warn(netdev, "Can't set legacy RQ with LRO, disable LRO first\n");
+		return -EINVAL;
 	}
 
 	new_channels.params = priv->channels.params;
@@ -1589,6 +1592,10 @@ static int mlx5e_set_priv_flags(struct net_device *netdev, u32 pflags)
 
 out:
 	mutex_unlock(&priv->state_lock);
+
+	/* Need to fix some features.. */
+	netdev_update_features(netdev);
+
 	return err;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index ab7b2a4e6edc..3f1f0552843c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -182,14 +182,6 @@ void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
 	params->log_rq_mtu_frames = is_kdump_kernel() ?
 		MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE :
 		MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE;
-	switch (params->rq_wq_type) {
-	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
-		break;
-	default: /* MLX5_WQ_TYPE_LINKED_LIST */
-		/* Extra room needed for build_skb */
-		params->lro_wqe_sz -= mlx5e_get_rq_headroom(mdev, params) +
-			SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	}
 
 	mlx5_core_info(mdev, "MLX5E: StrdRq(%d) RqSz(%ld) StrdSz(%ld) RxCqeCmprss(%d)\n",
 		       params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ,
@@ -503,14 +495,12 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 			goto err_rq_wq_destroy;
 		}
 
-		byte_count = params->lro_en  ?
-				params->lro_wqe_sz :
-				MLX5E_SW2HW_MTU(params, params->sw_mtu);
+		byte_count = MLX5E_SW2HW_MTU(params, params->sw_mtu);
 #ifdef CONFIG_MLX5_EN_IPSEC
 		if (MLX5_IPSEC_DEV(mdev))
 			byte_count += MLX5E_METADATA_ETHER_LEN;
 #endif
-		rq->wqe.page_reuse = !params->xdp_prog && !params->lro_en;
+		rq->wqe.page_reuse = !params->xdp_prog;
 
 		/* calc the required page order */
 		rq->wqe.frag_sz = MLX5_SKB_FRAG_SZ(rq->buff.headroom + byte_count);
@@ -3311,6 +3301,12 @@ static int set_feature_lro(struct net_device *netdev, bool enable)
 	mutex_lock(&priv->state_lock);
 
 	old_params = &priv->channels.params;
+	if (enable && !MLX5E_GET_PFLAG(old_params, MLX5E_PFLAG_RX_STRIDING_RQ)) {
+		netdev_warn(netdev, "can't set LRO with legacy RQ\n");
+		err = -EINVAL;
+		goto out;
+	}
+
 	reset = test_bit(MLX5E_STATE_OPENED, &priv->state);
 
 	new_channels.params = *old_params;
@@ -3480,16 +3476,24 @@ static netdev_features_t mlx5e_fix_features(struct net_device *netdev,
 					    netdev_features_t features)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_params *params;
 
 	mutex_lock(&priv->state_lock);
+	params = &priv->channels.params;
 	if (!bitmap_empty(priv->fs.vlan.active_svlans, VLAN_N_VID)) {
 		/* HW strips the outer C-tag header, this is a problem
 		 * for S-tag traffic.
 		 */
 		features &= ~NETIF_F_HW_VLAN_CTAG_RX;
-		if (!priv->channels.params.vlan_strip_disable)
+		if (!params->vlan_strip_disable)
 			netdev_warn(netdev, "Dropping C-tag vlan stripping offload due to S-tag vlan\n");
 	}
+	if (!MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ)) {
+		features &= ~NETIF_F_LRO;
+		if (params->lro_en)
+			netdev_warn(netdev, "Disabling LRO, not supported in legacy RQ\n");
+	}
+
 	mutex_unlock(&priv->state_lock);
 
 	return features;
@@ -4328,7 +4332,8 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
 	netdev->hw_enc_features  |= NETIF_F_HW_VLAN_CTAG_TX;
 	netdev->hw_enc_features  |= NETIF_F_HW_VLAN_CTAG_RX;
 
-	if (!!MLX5_CAP_ETH(mdev, lro_cap))
+	if (!!MLX5_CAP_ETH(mdev, lro_cap) &&
+	    mlx5e_check_fragmented_striding_rq_cap(mdev))
 		netdev->vlan_features    |= NETIF_F_LRO;
 
 	netdev->hw_features       = netdev->vlan_features;
-- 
2.17.0

^ permalink raw reply related

* [net-next 16/17] net/mlx5e: RX, Always prefer Linear SKB configuration
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Tariq Toukan, Saeed Mahameed
In-Reply-To: <20180602000544.18717-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Prefer the linear SKB configuration of Legacy RQ over the
non-linear one of Striding RQ.

This implies that ConnectX-4 LX now uses legacy RQ by default,
as it does not support the linear configuration of Striding RQ.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 2c634e50d051..333d4ed52b94 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4405,9 +4405,16 @@ void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
 	MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS, params->rx_cqe_compress_def);
 
 	/* RQ */
-	if (mlx5e_striding_rq_possible(mdev, params))
-		MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ,
-				!slow_pci_heuristic(mdev));
+	/* Prefer Striding RQ, unless any of the following holds:
+	 * - Striding RQ configuration is not possible/supported.
+	 * - Slow PCI heuristic.
+	 * - Legacy RQ would use linear SKB while Striding RQ would use non-linear.
+	 */
+	if (!slow_pci_heuristic(mdev) &&
+	    mlx5e_striding_rq_possible(mdev, params) &&
+	    (mlx5e_rx_mpwqe_is_linear_skb(mdev, params) ||
+	     !mlx5e_rx_is_linear_skb(mdev, params)))
+		MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ, true);
 	mlx5e_set_rq_type(mdev, params);
 	mlx5e_init_rq_type_params(mdev, params);
 
-- 
2.17.0

^ permalink raw reply related

* [net-next 15/17] net/mlx5e: RX, Enhance legacy Receive Queue memory scheme
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Tariq Toukan, Saeed Mahameed
In-Reply-To: <20180602000544.18717-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Enhance the memory scheme of the legacy RQ, such that
only order-0 pages are used.

Whenever possible, prefer using a linear SKB, and build it
wrapping the WQE buffer.

Otherwise (for example, jumbo frames on x86), use non-linear SKB,
with as many frags as needed. In this case, multiple WQE
scatter entries are used, up to a maximum of 4 frags and 10KB of MTU.

This made it necessary to remove HW LRO support in legacy RQ, as LRO
would require a large number of page allocations and scatter entries per
WQE on archs with PAGE_SIZE = 4KB, yielding bad performance.

In earlier patches, we guaranteed that all completions are in-order,
and that we use a cyclic WQ.
This creates an opportunity for a performance optimization:
The mapping between a "struct mlx5e_dma_info", and the
WQEs (struct mlx5e_wqe_frag_info) pointing to it, is constant
across different cycles of a WQ. This allows initializing
the mapping in the time of RQ creation, and not handle it
in datapath.

A struct mlx5e_dma_info that is shared between different WQEs
is allocated by the first WQE, and freed by the last one.
This implies an important requirement: WQEs that share the same
struct mlx5e_dma_info must be posted within the same NAPI.
Otherwise, upon completion, struct mlx5e_wqe_frag_info would mistakenly
point to the new struct mlx5e_dma_info, not the one that was posted
(and the HW wrote to).
This bulking requirement is actually also good for performance reasons,
hence we extend the bulk beyond the minimal requirement above.

With this memory scheme, the RQs memory footprint is reduced by a
factor of 2 on x86, and by a factor of 32 on PowerPC.
Same factors apply for the number of pages in a GRO session.

Performance tests:
ConnectX-4, single core, single RX ring, default MTU.

x86:
CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz

Packet rate (early drop in TC): no degradation
TCP streams: ~5% improvement

PowerPC:
CPU: POWER8 (raw), altivec supported

Packet rate (early drop in TC): 20% gain
TCP streams: 25% gain

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  44 +++-
 .../net/ethernet/mellanox/mlx5/core/en_main.c | 238 +++++++++++++-----
 .../net/ethernet/mellanox/mlx5/core/en_rx.c   | 207 ++++++++++-----
 3 files changed, 362 insertions(+), 127 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index af521dd52993..eb9eb7aa953a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -101,11 +101,15 @@ struct page_pool;
 	(MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW + \
 	 (MLX5_MPWRQ_LOG_WQE_SZ - MLX5E_ORDER2_MAX_PACKET_MTU))
 
+#define MLX5E_MIN_SKB_FRAG_SZ		(MLX5_SKB_FRAG_SZ(MLX5_RX_HEADROOM))
+#define MLX5E_LOG_MAX_RX_WQE_BULK	\
+	(ilog2(PAGE_SIZE / roundup_pow_of_two(MLX5E_MIN_SKB_FRAG_SZ)))
+
 #define MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE                0x6
 #define MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE                0xa
 #define MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE                0xd
 
-#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE                0x1
+#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE (1 + MLX5E_LOG_MAX_RX_WQE_BULK)
 #define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE                0xa
 #define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE min_t(u8, 0xd,	\
 					       MLX5E_LOG_MAX_RQ_NUM_PACKETS_MPW)
@@ -462,8 +466,9 @@ struct mlx5e_dma_info {
 };
 
 struct mlx5e_wqe_frag_info {
-	struct mlx5e_dma_info di;
+	struct mlx5e_dma_info *di;
 	u32 offset;
+	bool last_in_page;
 };
 
 struct mlx5e_umr_dma_info {
@@ -476,6 +481,8 @@ struct mlx5e_mpw_info {
 	DECLARE_BITMAP(xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE);
 };
 
+#define MLX5E_MAX_RX_FRAGS 4
+
 /* a single cache unit is capable to serve one napi call (for non-striding rq)
  * or a MPWQE (for striding rq).
  */
@@ -493,6 +500,9 @@ typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq*, struct mlx5_cqe64*);
 typedef struct sk_buff *
 (*mlx5e_fp_skb_from_cqe_mpwrq)(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
 			       u16 cqe_bcnt, u32 head_offset, u32 page_idx);
+typedef struct sk_buff *
+(*mlx5e_fp_skb_from_cqe)(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+			 struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt);
 typedef bool (*mlx5e_fp_post_rx_wqes)(struct mlx5e_rq *rq);
 typedef void (*mlx5e_fp_dealloc_wqe)(struct mlx5e_rq*, u16);
 
@@ -500,16 +510,27 @@ enum mlx5e_rq_flag {
 	MLX5E_RQ_FLAG_XDP_XMIT = BIT(0),
 };
 
+struct mlx5e_rq_frag_info {
+	int frag_size;
+	int frag_stride;
+};
+
+struct mlx5e_rq_frags_info {
+	struct mlx5e_rq_frag_info arr[MLX5E_MAX_RX_FRAGS];
+	u8 num_frags;
+	u8 log_num_frags;
+	u8 wqe_bulk;
+};
+
 struct mlx5e_rq {
 	/* data path */
 	union {
 		struct {
-			struct mlx5_wq_cyc     wq;
-			struct mlx5e_wqe_frag_info *frag_info;
-			u32 frag_sz;	/* max possible skb frag_sz */
-			union {
-				bool page_reuse;
-			};
+			struct mlx5_wq_cyc          wq;
+			struct mlx5e_wqe_frag_info *frags;
+			struct mlx5e_dma_info      *di;
+			struct mlx5e_rq_frags_info  info;
+			mlx5e_fp_skb_from_cqe       skb_from_cqe;
 		} wqe;
 		struct {
 			struct mlx5_wq_ll      wq;
@@ -523,7 +544,6 @@ struct mlx5e_rq {
 	};
 	struct {
 		u16            headroom;
-		u8             page_order;
 		u8             map_dir;   /* dma map direction */
 	} buff;
 
@@ -879,6 +899,12 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
 struct sk_buff *
 mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
 				   u16 cqe_bcnt, u32 head_offset, u32 page_idx);
+struct sk_buff *
+mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+			  struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt);
+struct sk_buff *
+mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+			     struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt);
 
 void mlx5e_update_stats(struct mlx5e_priv *priv);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 7fd2d736fbb1..2c634e50d051 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -51,6 +51,7 @@
 struct mlx5e_rq_param {
 	u32			rqc[MLX5_ST_SZ_DW(rqc)];
 	struct mlx5_wq_param	wq;
+	struct mlx5e_rq_frags_info frags_info;
 };
 
 struct mlx5e_sq_param {
@@ -93,7 +94,7 @@ bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev)
 	return true;
 }
 
-static u32 mlx5e_mpwqe_get_linear_frag_sz(struct mlx5e_params *params)
+static u32 mlx5e_rx_get_linear_frag_sz(struct mlx5e_params *params)
 {
 	if (!params->xdp_prog) {
 		u16 hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu);
@@ -107,19 +108,27 @@ static u32 mlx5e_mpwqe_get_linear_frag_sz(struct mlx5e_params *params)
 
 static u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5e_params *params)
 {
-	u32 linear_frag_sz = mlx5e_mpwqe_get_linear_frag_sz(params);
+	u32 linear_frag_sz = mlx5e_rx_get_linear_frag_sz(params);
 
 	return MLX5_MPWRQ_LOG_WQE_SZ - order_base_2(linear_frag_sz);
 }
 
+static bool mlx5e_rx_is_linear_skb(struct mlx5_core_dev *mdev,
+				   struct mlx5e_params *params)
+{
+	u32 frag_sz = mlx5e_rx_get_linear_frag_sz(params);
+
+	return !params->lro_en && frag_sz <= PAGE_SIZE;
+}
+
 static bool mlx5e_rx_mpwqe_is_linear_skb(struct mlx5_core_dev *mdev,
 					 struct mlx5e_params *params)
 {
-	u32 frag_sz = mlx5e_mpwqe_get_linear_frag_sz(params);
+	u32 frag_sz = mlx5e_rx_get_linear_frag_sz(params);
 	s8 signed_log_num_strides_param;
 	u8 log_num_strides;
 
-	if (params->lro_en || frag_sz > PAGE_SIZE)
+	if (!mlx5e_rx_is_linear_skb(mdev, params))
 		return false;
 
 	if (MLX5_CAP_GEN(mdev, ext_stride_num_range))
@@ -145,7 +154,7 @@ static u8 mlx5e_mpwqe_get_log_stride_size(struct mlx5_core_dev *mdev,
 					  struct mlx5e_params *params)
 {
 	if (mlx5e_rx_mpwqe_is_linear_skb(mdev, params))
-		return order_base_2(mlx5e_mpwqe_get_linear_frag_sz(params));
+		return order_base_2(mlx5e_rx_get_linear_frag_sz(params));
 
 	return MLX5E_MPWQE_STRIDE_SZ(mdev,
 		MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS));
@@ -163,16 +172,15 @@ static u16 mlx5e_get_rq_headroom(struct mlx5_core_dev *mdev,
 {
 	u16 linear_rq_headroom = params->xdp_prog ?
 		XDP_PACKET_HEADROOM : MLX5_RX_HEADROOM;
+	bool is_linear_skb;
 
 	linear_rq_headroom += NET_IP_ALIGN;
 
-	if (params->rq_wq_type == MLX5_WQ_TYPE_CYCLIC)
-		return linear_rq_headroom;
-
-	if (mlx5e_rx_mpwqe_is_linear_skb(mdev, params))
-		return linear_rq_headroom;
+	is_linear_skb = (params->rq_wq_type == MLX5_WQ_TYPE_CYCLIC) ?
+		mlx5e_rx_is_linear_skb(mdev, params) :
+		mlx5e_rx_mpwqe_is_linear_skb(mdev, params);
 
-	return 0;
+	return is_linear_skb ? linear_rq_headroom : 0;
 }
 
 void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
@@ -400,6 +408,61 @@ static inline u64 mlx5e_get_mpwqe_offset(struct mlx5e_rq *rq, u16 wqe_ix)
 	return (wqe_ix << MLX5E_LOG_ALIGNED_MPWQE_PPW) << PAGE_SHIFT;
 }
 
+static void mlx5e_init_frags_partition(struct mlx5e_rq *rq)
+{
+	struct mlx5e_wqe_frag_info next_frag, *prev;
+	int i;
+
+	next_frag.di = &rq->wqe.di[0];
+	next_frag.offset = 0;
+	prev = NULL;
+
+	for (i = 0; i < mlx5_wq_cyc_get_size(&rq->wqe.wq); i++) {
+		struct mlx5e_rq_frag_info *frag_info = &rq->wqe.info.arr[0];
+		struct mlx5e_wqe_frag_info *frag =
+			&rq->wqe.frags[i << rq->wqe.info.log_num_frags];
+		int f;
+
+		for (f = 0; f < rq->wqe.info.num_frags; f++, frag++) {
+			if (next_frag.offset + frag_info[f].frag_stride > PAGE_SIZE) {
+				next_frag.di++;
+				next_frag.offset = 0;
+				if (prev)
+					prev->last_in_page = true;
+			}
+			*frag = next_frag;
+
+			/* prepare next */
+			next_frag.offset += frag_info[f].frag_stride;
+			prev = frag;
+		}
+	}
+
+	if (prev)
+		prev->last_in_page = true;
+}
+
+static int mlx5e_init_di_list(struct mlx5e_rq *rq,
+			      struct mlx5e_params *params,
+			      int wq_sz, int cpu)
+{
+	int len = wq_sz << rq->wqe.info.log_num_frags;
+
+	rq->wqe.di = kvzalloc_node(len * sizeof(*rq->wqe.di),
+				   GFP_KERNEL, cpu_to_node(cpu));
+	if (!rq->wqe.di)
+		return -ENOMEM;
+
+	mlx5e_init_frags_partition(rq);
+
+	return 0;
+}
+
+static void mlx5e_free_di_list(struct mlx5e_rq *rq)
+{
+	kvfree(rq->wqe.di);
+}
+
 static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 			  struct mlx5e_params *params,
 			  struct mlx5e_rq_param *rqp,
@@ -409,8 +472,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 	struct mlx5_core_dev *mdev = c->mdev;
 	void *rqc = rqp->rqc;
 	void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
-	u32 byte_count, pool_size;
-	int npages;
+	u32 pool_size;
 	int wq_sz;
 	int err;
 	int i;
@@ -480,8 +542,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		rq->mpwqe.log_stride_sz = mlx5e_mpwqe_get_log_stride_size(mdev, params);
 		rq->mpwqe.num_strides = BIT(mlx5e_mpwqe_get_log_num_strides(mdev, params));
 
-		byte_count = rq->mpwqe.num_strides << rq->mpwqe.log_stride_sz;
-
 		err = mlx5e_create_rq_umr_mkey(mdev, rq);
 		if (err)
 			goto err_rq_wq_destroy;
@@ -489,7 +549,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 		err = mlx5e_rq_alloc_mpwqe_info(rq, c);
 		if (err)
-			goto err_destroy_umr_mkey;
+			goto err_free;
 		break;
 	default: /* MLX5_WQ_TYPE_CYCLIC */
 		err = mlx5_wq_cyc_create(mdev, &rqp->wq, rqc_wq, &rq->wqe.wq,
@@ -501,13 +561,17 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 		wq_sz = mlx5_wq_cyc_get_size(&rq->wqe.wq);
 
-		rq->wqe.frag_info =
-			kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info),
-				     GFP_KERNEL, cpu_to_node(c->cpu));
-		if (!rq->wqe.frag_info) {
-			err = -ENOMEM;
-			goto err_rq_wq_destroy;
-		}
+		rq->wqe.info = rqp->frags_info;
+		rq->wqe.frags =
+			kvzalloc_node((wq_sz << rq->wqe.info.log_num_frags) *
+				      sizeof(*rq->wqe.frags),
+				      GFP_KERNEL, cpu_to_node(c->cpu));
+		if (!rq->wqe.frags)
+			goto err_free;
+
+		err = mlx5e_init_di_list(rq, params, wq_sz, c->cpu);
+		if (err)
+			goto err_free;
 		rq->post_wqes = mlx5e_post_rx_wqes;
 		rq->dealloc_wqe = mlx5e_dealloc_rx_wqe;
 
@@ -518,30 +582,19 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 #endif
 			rq->handle_rx_cqe = c->priv->profile->rx_handlers.handle_rx_cqe;
 		if (!rq->handle_rx_cqe) {
-			kfree(rq->wqe.frag_info);
 			err = -EINVAL;
 			netdev_err(c->netdev, "RX handler of RQ is not set, err %d\n", err);
-			goto err_rq_wq_destroy;
+			goto err_free;
 		}
 
-		byte_count = MLX5E_SW2HW_MTU(params, params->sw_mtu);
-#ifdef CONFIG_MLX5_EN_IPSEC
-		if (MLX5_IPSEC_DEV(mdev))
-			byte_count += MLX5E_METADATA_ETHER_LEN;
-#endif
-		rq->wqe.page_reuse = !params->xdp_prog;
-
-		/* calc the required page order */
-		rq->wqe.frag_sz = MLX5_SKB_FRAG_SZ(rq->buff.headroom + byte_count);
-		npages = DIV_ROUND_UP(rq->wqe.frag_sz, PAGE_SIZE);
-		rq->buff.page_order = order_base_2(npages);
-
-		byte_count |= MLX5_HW_START_PADDING;
+		rq->wqe.skb_from_cqe = mlx5e_rx_is_linear_skb(mdev, params) ?
+			mlx5e_skb_from_cqe_linear :
+			mlx5e_skb_from_cqe_nonlinear;
 		rq->mkey_be = c->mkey_be;
 	}
 
 	/* Create a page_pool and register it with rxq */
-	pp_params.order     = rq->buff.page_order;
+	pp_params.order     = 0;
 	pp_params.flags     = 0; /* No-internal DMA mapping in page_pool */
 	pp_params.pool_size = pool_size;
 	pp_params.nid       = cpu_to_node(c->cpu);
@@ -555,21 +608,21 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 	 */
 	rq->page_pool = page_pool_create(&pp_params);
 	if (IS_ERR(rq->page_pool)) {
-		if (rq->wq_type != MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ)
-			kfree(rq->wqe.frag_info);
 		err = PTR_ERR(rq->page_pool);
 		rq->page_pool = NULL;
-		goto err_rq_wq_destroy;
+		goto err_free;
 	}
 	err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
 					 MEM_TYPE_PAGE_POOL, rq->page_pool);
 	if (err)
-		goto err_rq_wq_destroy;
+		goto err_free;
 
 	for (i = 0; i < wq_sz; i++) {
 		if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
 			struct mlx5e_rx_wqe_ll *wqe =
 				mlx5_wq_ll_get_wqe(&rq->mpwqe.wq, i);
+			u32 byte_count =
+				rq->mpwqe.num_strides << rq->mpwqe.log_stride_sz;
 			u64 dma_offset = mlx5e_get_mpwqe_offset(rq, i);
 
 			wqe->data[0].addr = cpu_to_be64(dma_offset + rq->buff.headroom);
@@ -578,9 +631,21 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		} else {
 			struct mlx5e_rx_wqe_cyc *wqe =
 				mlx5_wq_cyc_get_wqe(&rq->wqe.wq, i);
+			int f;
 
-			wqe->data[0].byte_count = cpu_to_be32(byte_count);
-			wqe->data[0].lkey = rq->mkey_be;
+			for (f = 0; f < rq->wqe.info.num_frags; f++) {
+				u32 frag_size = rq->wqe.info.arr[f].frag_size |
+					MLX5_HW_START_PADDING;
+
+				wqe->data[f].byte_count = cpu_to_be32(frag_size);
+				wqe->data[f].lkey = rq->mkey_be;
+			}
+			/* check if num_frags is not a pow of two */
+			if (rq->wqe.info.num_frags < (1 << rq->wqe.info.log_num_frags)) {
+				wqe->data[f].byte_count = 0;
+				wqe->data[f].lkey = cpu_to_be32(MLX5_INVALID_LKEY);
+				wqe->data[f].addr = 0;
+			}
 		}
 	}
 
@@ -600,8 +665,16 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 	return 0;
 
-err_destroy_umr_mkey:
-	mlx5_core_destroy_mkey(mdev, &rq->umr_mkey);
+err_free:
+	switch (rq->wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		kfree(rq->mpwqe.info);
+		mlx5_core_destroy_mkey(mdev, &rq->umr_mkey);
+		break;
+	default: /* MLX5_WQ_TYPE_CYCLIC */
+		kvfree(rq->wqe.frags);
+		mlx5e_free_di_list(rq);
+	}
 
 err_rq_wq_destroy:
 	if (rq->xdp_prog)
@@ -631,7 +704,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
 		mlx5_core_destroy_mkey(rq->mdev, &rq->umr_mkey);
 		break;
 	default: /* MLX5_WQ_TYPE_CYCLIC */
-		kfree(rq->wqe.frag_info);
+		kvfree(rq->wqe.frags);
+		mlx5e_free_di_list(rq);
 	}
 
 	for (i = rq->page_cache.head; i != rq->page_cache.tail;
@@ -823,17 +897,8 @@ static void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
 			rq->dealloc_wqe(rq, wqe_ix);
 			mlx5_wq_cyc_pop(wq);
 		}
-
-		/* Clean outstanding pages on handled WQEs that decided to do page-reuse,
-		 * but yet to be re-posted.
-		 */
-		if (rq->wqe.page_reuse) {
-			int wq_sz = mlx5_wq_cyc_get_size(wq);
-
-			for (wqe_ix = 0; wqe_ix < wq_sz; wqe_ix++)
-				rq->dealloc_wqe(rq, wqe_ix);
-		}
 	}
+
 }
 
 static int mlx5e_open_rq(struct mlx5e_channel *c,
@@ -1954,6 +2019,61 @@ static void mlx5e_close_channel(struct mlx5e_channel *c)
 	kfree(c);
 }
 
+#define DEFAULT_FRAG_SIZE (2048)
+
+static void mlx5e_build_rq_frags_info(struct mlx5_core_dev *mdev,
+				      struct mlx5e_params *params,
+				      struct mlx5e_rq_frags_info *info)
+{
+	u32 byte_count = MLX5E_SW2HW_MTU(params, params->sw_mtu);
+	int frag_size_max = DEFAULT_FRAG_SIZE;
+	u32 buf_size = 0;
+	int i;
+
+#ifdef CONFIG_MLX5_EN_IPSEC
+	if (MLX5_IPSEC_DEV(mdev))
+		byte_count += MLX5E_METADATA_ETHER_LEN;
+#endif
+
+	if (mlx5e_rx_is_linear_skb(mdev, params)) {
+		int frag_stride;
+
+		frag_stride = mlx5e_rx_get_linear_frag_sz(params);
+		frag_stride = roundup_pow_of_two(frag_stride);
+
+		info->arr[0].frag_size = byte_count;
+		info->arr[0].frag_stride = frag_stride;
+		info->num_frags = 1;
+		info->wqe_bulk = PAGE_SIZE / frag_stride;
+		goto out;
+	}
+
+	if (byte_count > PAGE_SIZE +
+	    (MLX5E_MAX_RX_FRAGS - 1) * frag_size_max)
+		frag_size_max = PAGE_SIZE;
+
+	i = 0;
+	while (buf_size < byte_count) {
+		int frag_size = byte_count - buf_size;
+
+		if (i < MLX5E_MAX_RX_FRAGS - 1)
+			frag_size = min(frag_size, frag_size_max);
+
+		info->arr[i].frag_size = frag_size;
+		info->arr[i].frag_stride = roundup_pow_of_two(frag_size);
+
+		buf_size += frag_size;
+		i++;
+	}
+	info->num_frags = i;
+	/* number of different wqes sharing a page */
+	info->wqe_bulk = 1 + (info->num_frags % 2);
+
+out:
+	info->wqe_bulk = max_t(u8, info->wqe_bulk, 8);
+	info->log_num_frags = order_base_2(info->num_frags);
+}
+
 static inline u8 mlx5e_get_rqwq_log_stride(u8 wq_type, int ndsegs)
 {
 	int sz = sizeof(struct mlx5_wqe_data_seg) * ndsegs;
@@ -1990,6 +2110,8 @@ static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
 		break;
 	default: /* MLX5_WQ_TYPE_CYCLIC */
 		MLX5_SET(wq, wq, log_wq_sz, params->log_rq_mtu_frames);
+		mlx5e_build_rq_frags_info(mdev, params, &param->frags_info);
+		ndsegs = param->frags_info.num_frags;
 	}
 
 	MLX5_SET(wq, wq, wq_type,          params->rq_wq_type);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 3cdf2c097356..d3a1dd20e41d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -164,8 +164,6 @@ static inline u32 mlx5e_decompress_cqes_start(struct mlx5e_rq *rq,
 	return mlx5e_decompress_cqes_cont(rq, cq, 1, budget_rem) - 1;
 }
 
-#define RQ_PAGE_SIZE(rq) ((1 << rq->buff.page_order) << PAGE_SHIFT)
-
 static inline bool mlx5e_page_is_reserved(struct page *page)
 {
 	return page_is_pfmemalloc(page) || page_to_nid(page) != numa_mem_id();
@@ -214,7 +212,7 @@ static inline bool mlx5e_rx_cache_get(struct mlx5e_rq *rq,
 	stats->cache_reuse++;
 
 	dma_sync_single_for_device(rq->pdev, dma_info->addr,
-				   RQ_PAGE_SIZE(rq),
+				   PAGE_SIZE,
 				   DMA_FROM_DEVICE);
 	return true;
 }
@@ -230,7 +228,7 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
 		return -ENOMEM;
 
 	dma_info->addr = dma_map_page(rq->pdev, dma_info->page, 0,
-				      RQ_PAGE_SIZE(rq), rq->buff.map_dir);
+				      PAGE_SIZE, rq->buff.map_dir);
 	if (unlikely(dma_mapping_error(rq->pdev, dma_info->addr))) {
 		put_page(dma_info->page);
 		dma_info->page = NULL;
@@ -243,8 +241,7 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
 static void mlx5e_page_dma_unmap(struct mlx5e_rq *rq,
 					struct mlx5e_dma_info *dma_info)
 {
-	dma_unmap_page(rq->pdev, dma_info->addr, RQ_PAGE_SIZE(rq),
-		       rq->buff.map_dir);
+	dma_unmap_page(rq->pdev, dma_info->addr, PAGE_SIZE, rq->buff.map_dir);
 }
 
 void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
@@ -262,58 +259,96 @@ void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
 	}
 }
 
-static inline bool mlx5e_page_reuse(struct mlx5e_rq *rq,
-				    struct mlx5e_wqe_frag_info *wi)
+static inline int mlx5e_get_rx_frag(struct mlx5e_rq *rq,
+				    struct mlx5e_wqe_frag_info *frag)
+{
+	int err = 0;
+
+	if (!frag->offset)
+		/* On first frag (offset == 0), replenish page (dma_info actually).
+		 * Other frags that point to the same dma_info (with a different
+		 * offset) should just use the new one without replenishing again
+		 * by themselves.
+		 */
+		err = mlx5e_page_alloc_mapped(rq, frag->di);
+
+	return err;
+}
+
+static inline void mlx5e_put_rx_frag(struct mlx5e_rq *rq,
+				     struct mlx5e_wqe_frag_info *frag)
 {
-	return rq->wqe.page_reuse && wi->di.page &&
-		(wi->offset + rq->wqe.frag_sz <= RQ_PAGE_SIZE(rq)) &&
-		!mlx5e_page_is_reserved(wi->di.page);
+	if (frag->last_in_page)
+		mlx5e_page_release(rq, frag->di, true);
 }
 
 static inline struct mlx5e_wqe_frag_info *get_frag(struct mlx5e_rq *rq, u16 ix)
 {
-	return &rq->wqe.frag_info[ix];
+	return &rq->wqe.frags[ix << rq->wqe.info.log_num_frags];
 }
 
-static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe_cyc *wqe, u16 ix)
+static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe_cyc *wqe,
+			      u16 ix)
 {
-	struct mlx5e_wqe_frag_info *wi = &rq->wqe.frag_info[ix];
+	struct mlx5e_wqe_frag_info *frag = get_frag(rq, ix);
+	int err;
+	int i;
 
-	/* check if page exists, hence can be reused */
-	if (!wi->di.page) {
-		if (unlikely(mlx5e_page_alloc_mapped(rq, &wi->di)))
-			return -ENOMEM;
-		wi->offset = 0;
+	for (i = 0; i < rq->wqe.info.num_frags; i++, frag++) {
+		err = mlx5e_get_rx_frag(rq, frag);
+		if (unlikely(err))
+			goto free_frags;
+
+		wqe->data[i].addr = cpu_to_be64(frag->di->addr +
+						frag->offset + rq->buff.headroom);
 	}
 
-	wqe->data[0].addr = cpu_to_be64(wi->di.addr + wi->offset + rq->buff.headroom);
 	return 0;
+
+free_frags:
+	while (--i >= 0)
+		mlx5e_put_rx_frag(rq, --frag);
+
+	return err;
 }
 
 static inline void mlx5e_free_rx_wqe(struct mlx5e_rq *rq,
 				     struct mlx5e_wqe_frag_info *wi)
 {
-	mlx5e_page_release(rq, &wi->di, true);
-	wi->di.page = NULL;
+	int i;
+
+	for (i = 0; i < rq->wqe.info.num_frags; i++, wi++)
+		mlx5e_put_rx_frag(rq, wi);
 }
 
-static inline void mlx5e_free_rx_wqe_reuse(struct mlx5e_rq *rq,
-					   struct mlx5e_wqe_frag_info *wi)
+void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix)
 {
-	if (mlx5e_page_reuse(rq, wi)) {
-		rq->stats->page_reuse++;
-		return;
-	}
+	struct mlx5e_wqe_frag_info *wi = get_frag(rq, ix);
 
 	mlx5e_free_rx_wqe(rq, wi);
 }
 
-void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix)
+static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, u8 wqe_bulk)
 {
-	struct mlx5e_wqe_frag_info *wi = &rq->wqe.frag_info[ix];
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
+	int err;
+	int i;
 
-	if (wi->di.page)
-		mlx5e_free_rx_wqe(rq, wi);
+	for (i = 0; i < wqe_bulk; i++) {
+		struct mlx5e_rx_wqe_cyc *wqe = mlx5_wq_cyc_get_wqe(wq, ix + i);
+
+		err = mlx5e_alloc_rx_wqe(rq, wqe, ix + i);
+		if (unlikely(err))
+			goto free_wqes;
+	}
+
+	return 0;
+
+free_wqes:
+	while (--i >= 0)
+		mlx5e_dealloc_rx_wqe(rq, ix + i);
+
+	return err;
 }
 
 static inline void
@@ -476,26 +511,28 @@ void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 {
 	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
+	u8 wqe_bulk;
 	int err;
 
 	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
 		return false;
 
-	if (mlx5_wq_cyc_is_full(wq))
+	wqe_bulk = rq->wqe.info.wqe_bulk;
+
+	if (mlx5_wq_cyc_missing(wq) < wqe_bulk)
 		return false;
 
 	do {
 		u16 head = mlx5_wq_cyc_get_head(wq);
-		struct mlx5e_rx_wqe_cyc *wqe = mlx5_wq_cyc_get_wqe(wq, head);
 
-		err = mlx5e_alloc_rx_wqe(rq, wqe, head);
+		err = mlx5e_alloc_rx_wqes(rq, head, wqe_bulk);
 		if (unlikely(err)) {
 			rq->stats->buff_alloc_err++;
 			break;
 		}
 
-		mlx5_wq_cyc_push(wq);
-	} while (!mlx5_wq_cyc_is_full(wq));
+		mlx5_wq_cyc_push_n(wq, wqe_bulk);
+	} while (mlx5_wq_cyc_missing(wq) >= wqe_bulk);
 
 	/* ensure wqes are visible to device before updating doorbell record */
 	dma_wmb();
@@ -949,11 +986,11 @@ struct sk_buff *mlx5e_build_linear_skb(struct mlx5e_rq *rq, void *va,
 	return skb;
 }
 
-static inline
-struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
-			     struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt)
+struct sk_buff *
+mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+			  struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt)
 {
-	struct mlx5e_dma_info *di = &wi->di;
+	struct mlx5e_dma_info *di = wi->di;
 	u16 rx_headroom = rq->buff.headroom;
 	struct sk_buff *skb;
 	void *va, *data;
@@ -968,7 +1005,6 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
 				      frag_size, DMA_FROM_DEVICE);
 	prefetchw(va); /* xdp_frame data area */
 	prefetch(data);
-	wi->offset += frag_size;
 
 	if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
 		rq->stats->wqe_err++;
@@ -991,6 +1027,56 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
 	return skb;
 }
 
+struct sk_buff *
+mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
+			     struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt)
+{
+	struct mlx5e_rq_frag_info *frag_info = &rq->wqe.info.arr[0];
+	struct mlx5e_wqe_frag_info *head_wi = wi;
+	u16 headlen      = min_t(u32, MLX5E_RX_MAX_HEAD, cqe_bcnt);
+	u16 frag_headlen = headlen;
+	u16 byte_cnt     = cqe_bcnt - headlen;
+	struct sk_buff *skb;
+
+	if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
+		rq->stats->wqe_err++;
+		return NULL;
+	}
+
+	/* XDP is not supported in this configuration, as incoming packets
+	 * might spread among multiple pages.
+	 */
+	skb = napi_alloc_skb(rq->cq.napi,
+			     ALIGN(MLX5E_RX_MAX_HEAD, sizeof(long)));
+	if (unlikely(!skb)) {
+		rq->stats->buff_alloc_err++;
+		return NULL;
+	}
+
+	prefetchw(skb->data);
+
+	while (byte_cnt) {
+		u16 frag_consumed_bytes =
+			min_t(u16, frag_info->frag_size - frag_headlen, byte_cnt);
+
+		mlx5e_add_skb_frag(rq, skb, wi->di, wi->offset + frag_headlen,
+				   frag_consumed_bytes, frag_info->frag_stride);
+		byte_cnt -= frag_consumed_bytes;
+		frag_headlen = 0;
+		frag_info++;
+		wi++;
+	}
+
+	/* copy header */
+	mlx5e_copy_skb_header(rq->pdev, skb, head_wi->di, head_wi->offset,
+			      0, headlen);
+	/* skb linear part was allocated with headlen and aligned to long */
+	skb->tail += headlen;
+	skb->len  += headlen;
+
+	return skb;
+}
+
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
 	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
@@ -1003,23 +1089,23 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	wi       = get_frag(rq, ci);
 	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
-	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
+	skb = rq->wqe.skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (!skb) {
 		/* probably for XDP */
 		if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
-			wi->di.page = NULL;
-			/* do not return page to cache, it will be returned on XDP_TX completion */
+			/* do not return page to cache,
+			 * it will be returned on XDP_TX completion.
+			 */
 			goto wq_cyc_pop;
 		}
-		/* probably an XDP_DROP, save the page-reuse checks */
-		mlx5e_free_rx_wqe(rq, wi);
-		goto wq_cyc_pop;
+		goto free_wqe;
 	}
 
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
 	napi_gro_receive(rq->cq.napi, skb);
 
-	mlx5e_free_rx_wqe_reuse(rq, wi);
+free_wqe:
+	mlx5e_free_rx_wqe(rq, wi);
 wq_cyc_pop:
 	mlx5_wq_cyc_pop(wq);
 }
@@ -1041,16 +1127,16 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	wi       = get_frag(rq, ci);
 	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
-	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
+	skb = rq->wqe.skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (!skb) {
+		/* probably for XDP */
 		if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
-			wi->di.page = NULL;
-			/* do not return page to cache, it will be returned on XDP_TX completion */
+			/* do not return page to cache,
+			 * it will be returned on XDP_TX completion.
+			 */
 			goto wq_cyc_pop;
 		}
-		/* probably an XDP_DROP, save the page-reuse checks */
-		mlx5e_free_rx_wqe(rq, wi);
-		goto wq_cyc_pop;
+		goto free_wqe;
 	}
 
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
@@ -1060,7 +1146,8 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 	napi_gro_receive(rq->cq.napi, skb);
 
-	mlx5e_free_rx_wqe_reuse(rq, wi);
+free_wqe:
+	mlx5e_free_rx_wqe(rq, wi);
 wq_cyc_pop:
 	mlx5_wq_cyc_pop(wq);
 }
@@ -1409,7 +1496,7 @@ void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	wi       = get_frag(rq, ci);
 	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
-	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
+	skb = rq->wqe.skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (!skb)
 		goto wq_free_wqe;
 
@@ -1421,7 +1508,7 @@ void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	napi_gro_receive(rq->cq.napi, skb);
 
 wq_free_wqe:
-	mlx5e_free_rx_wqe_reuse(rq, wi);
+	mlx5e_free_rx_wqe(rq, wi);
 	mlx5_wq_cyc_pop(wq);
 }
 
@@ -1441,7 +1528,7 @@ void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	wi       = get_frag(rq, ci);
 	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
-	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
+	skb = rq->wqe.skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (unlikely(!skb)) {
 		/* a DROP, save the page-reuse checks */
 		mlx5e_free_rx_wqe(rq, wi);
@@ -1456,7 +1543,7 @@ void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
 	napi_gro_receive(rq->cq.napi, skb);
 
-	mlx5e_free_rx_wqe_reuse(rq, wi);
+	mlx5e_free_rx_wqe(rq, wi);
 wq_cyc_pop:
 	mlx5_wq_cyc_pop(wq);
 }
-- 
2.17.0

^ permalink raw reply related

* [net-next 14/17] net/mlx5e: RX, Use cyclic WQ in legacy RQ
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Tariq Toukan, Saeed Mahameed
In-Reply-To: <20180602000544.18717-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Now that LRO is not supported for Legacy RQ, there is no source of
out-of-order completions in the WQ, and we can use a cyclic one.
This has multiple advantages:
- reduces the WQE size (smaller PCI transactions).
- lower overhead in datapath (no handling of 'next' pointers).
- no reserved WQE for the WQ head (was needed in linked-list).
- allows using a constant map between frag and dma_info struct, in downstream patch.

Performance tests:
ConnectX-4, single core, single RX ring.
Major gain in packet rate of single ring XDP drop.
Bottleneck is shifted from HW (at 16Mpps) to SW (at 20Mpps).

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  10 +-
 .../net/ethernet/mellanox/mlx5/core/en_main.c |  89 ++++++++------
 .../net/ethernet/mellanox/mlx5/core/en_rep.c  |   2 +-
 .../net/ethernet/mellanox/mlx5/core/en_rx.c   | 115 ++++++++----------
 drivers/net/ethernet/mellanox/mlx5/core/wq.c  |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/wq.h  |  55 ++++++++-
 6 files changed, 161 insertions(+), 111 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index f2f2dcf6b23c..af521dd52993 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -186,9 +186,13 @@ struct mlx5e_tx_wqe {
 	struct mlx5_wqe_data_seg data[0];
 };
 
-struct mlx5e_rx_wqe {
+struct mlx5e_rx_wqe_ll {
 	struct mlx5_wqe_srq_next_seg  next;
-	struct mlx5_wqe_data_seg      data;
+	struct mlx5_wqe_data_seg      data[0];
+};
+
+struct mlx5e_rx_wqe_cyc {
+	struct mlx5_wqe_data_seg      data[0];
 };
 
 struct mlx5e_umr_wqe {
@@ -500,7 +504,7 @@ struct mlx5e_rq {
 	/* data path */
 	union {
 		struct {
-			struct mlx5_wq_ll      wq;
+			struct mlx5_wq_cyc     wq;
 			struct mlx5e_wqe_frag_info *frag_info;
 			u32 frag_sz;	/* max possible skb frag_sz */
 			union {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 3a007717cba5..7fd2d736fbb1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -166,7 +166,7 @@ static u16 mlx5e_get_rq_headroom(struct mlx5_core_dev *mdev,
 
 	linear_rq_headroom += NET_IP_ALIGN;
 
-	if (params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST)
+	if (params->rq_wq_type == MLX5_WQ_TYPE_CYCLIC)
 		return linear_rq_headroom;
 
 	if (mlx5e_rx_mpwqe_is_linear_skb(mdev, params))
@@ -205,7 +205,7 @@ void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params)
 	params->rq_wq_type = mlx5e_striding_rq_possible(mdev, params) &&
 		MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_STRIDING_RQ) ?
 		MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ :
-		MLX5_WQ_TYPE_LINKED_LIST;
+		MLX5_WQ_TYPE_CYCLIC;
 }
 
 static void mlx5e_update_carrier(struct mlx5e_priv *priv)
@@ -325,7 +325,7 @@ static u32 mlx5e_rqwq_get_size(struct mlx5e_rq *rq)
 	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
 		return mlx5_wq_ll_get_size(&rq->mpwqe.wq);
 	default:
-		return mlx5_wq_ll_get_size(&rq->wqe.wq);
+		return mlx5_wq_cyc_get_size(&rq->wqe.wq);
 	}
 }
 
@@ -491,15 +491,15 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		if (err)
 			goto err_destroy_umr_mkey;
 		break;
-	default: /* MLX5_WQ_TYPE_LINKED_LIST */
-		err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wqe.wq,
-					&rq->wq_ctrl);
+	default: /* MLX5_WQ_TYPE_CYCLIC */
+		err = mlx5_wq_cyc_create(mdev, &rqp->wq, rqc_wq, &rq->wqe.wq,
+					 &rq->wq_ctrl);
 		if (err)
 			return err;
 
 		rq->wqe.wq.db = &rq->wqe.wq.db[MLX5_RCV_DBR];
 
-		wq_sz = mlx5_wq_ll_get_size(&rq->wqe.wq);
+		wq_sz = mlx5_wq_cyc_get_size(&rq->wqe.wq);
 
 		rq->wqe.frag_info =
 			kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info),
@@ -568,19 +568,19 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 	for (i = 0; i < wq_sz; i++) {
 		if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
-			struct mlx5e_rx_wqe *wqe =
+			struct mlx5e_rx_wqe_ll *wqe =
 				mlx5_wq_ll_get_wqe(&rq->mpwqe.wq, i);
 			u64 dma_offset = mlx5e_get_mpwqe_offset(rq, i);
 
-			wqe->data.addr = cpu_to_be64(dma_offset + rq->buff.headroom);
-			wqe->data.byte_count = cpu_to_be32(byte_count);
-			wqe->data.lkey = rq->mkey_be;
+			wqe->data[0].addr = cpu_to_be64(dma_offset + rq->buff.headroom);
+			wqe->data[0].byte_count = cpu_to_be32(byte_count);
+			wqe->data[0].lkey = rq->mkey_be;
 		} else {
-			struct mlx5e_rx_wqe *wqe =
-				mlx5_wq_ll_get_wqe(&rq->wqe.wq, i);
+			struct mlx5e_rx_wqe_cyc *wqe =
+				mlx5_wq_cyc_get_wqe(&rq->wqe.wq, i);
 
-			wqe->data.byte_count = cpu_to_be32(byte_count);
-			wqe->data.lkey = rq->mkey_be;
+			wqe->data[0].byte_count = cpu_to_be32(byte_count);
+			wqe->data[0].lkey = rq->mkey_be;
 		}
 	}
 
@@ -630,7 +630,7 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
 		kfree(rq->mpwqe.info);
 		mlx5_core_destroy_mkey(rq->mdev, &rq->umr_mkey);
 		break;
-	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+	default: /* MLX5_WQ_TYPE_CYCLIC */
 		kfree(rq->wqe.frag_info);
 	}
 
@@ -801,11 +801,12 @@ static void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
 	if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
 		struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
 
+		/* UMR WQE (if in progress) is always at wq->head */
 		if (rq->mpwqe.umr_in_progress)
 			mlx5e_free_rx_mpwqe(rq, &rq->mpwqe.info[wq->head]);
 
 		while (!mlx5_wq_ll_is_empty(wq)) {
-			struct mlx5e_rx_wqe *wqe;
+			struct mlx5e_rx_wqe_ll *wqe;
 
 			wqe_ix_be = *wq->tail_next;
 			wqe_ix    = be16_to_cpu(wqe_ix_be);
@@ -815,24 +816,19 @@ static void mlx5e_free_rx_descs(struct mlx5e_rq *rq)
 				       &wqe->next.next_wqe_index);
 		}
 	} else {
-		struct mlx5_wq_ll *wq = &rq->wqe.wq;
-
-		while (!mlx5_wq_ll_is_empty(wq)) {
-			struct mlx5e_rx_wqe *wqe;
+		struct mlx5_wq_cyc *wq = &rq->wqe.wq;
 
-			wqe_ix_be = *wq->tail_next;
-			wqe_ix    = be16_to_cpu(wqe_ix_be);
-			wqe       = mlx5_wq_ll_get_wqe(wq, wqe_ix);
+		while (!mlx5_wq_cyc_is_empty(wq)) {
+			wqe_ix = mlx5_wq_cyc_get_tail(wq);
 			rq->dealloc_wqe(rq, wqe_ix);
-			mlx5_wq_ll_pop(wq, wqe_ix_be,
-				       &wqe->next.next_wqe_index);
+			mlx5_wq_cyc_pop(wq);
 		}
 
 		/* Clean outstanding pages on handled WQEs that decided to do page-reuse,
 		 * but yet to be re-posted.
 		 */
 		if (rq->wqe.page_reuse) {
-			int wq_sz = mlx5_wq_ll_get_size(wq);
+			int wq_sz = mlx5_wq_cyc_get_size(wq);
 
 			for (wqe_ix = 0; wqe_ix < wq_sz; wqe_ix++)
 				rq->dealloc_wqe(rq, wqe_ix);
@@ -1958,6 +1954,21 @@ static void mlx5e_close_channel(struct mlx5e_channel *c)
 	kfree(c);
 }
 
+static inline u8 mlx5e_get_rqwq_log_stride(u8 wq_type, int ndsegs)
+{
+	int sz = sizeof(struct mlx5_wqe_data_seg) * ndsegs;
+
+	switch (wq_type) {
+	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		sz += sizeof(struct mlx5e_rx_wqe_ll);
+		break;
+	default: /* MLX5_WQ_TYPE_CYCLIC */
+		sz += sizeof(struct mlx5e_rx_wqe_cyc);
+	}
+
+	return order_base_2(sz);
+}
+
 static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
 				 struct mlx5e_params *params,
 				 struct mlx5e_rq_param *param)
@@ -1965,6 +1976,7 @@ static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
 	struct mlx5_core_dev *mdev = priv->mdev;
 	void *rqc = param->rqc;
 	void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
+	int ndsegs = 1;
 
 	switch (params->rq_wq_type) {
 	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
@@ -1974,16 +1986,16 @@ static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
 		MLX5_SET(wq, wq, log_wqe_stride_size,
 			 mlx5e_mpwqe_get_log_stride_size(mdev, params) -
 			 MLX5_MPWQE_LOG_STRIDE_SZ_BASE);
-		MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ);
 		MLX5_SET(wq, wq, log_wq_sz, mlx5e_mpwqe_get_log_rq_size(params));
 		break;
-	default: /* MLX5_WQ_TYPE_LINKED_LIST */
-		MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST);
+	default: /* MLX5_WQ_TYPE_CYCLIC */
 		MLX5_SET(wq, wq, log_wq_sz, params->log_rq_mtu_frames);
 	}
 
+	MLX5_SET(wq, wq, wq_type,          params->rq_wq_type);
 	MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
-	MLX5_SET(wq, wq, log_wq_stride,    ilog2(sizeof(struct mlx5e_rx_wqe)));
+	MLX5_SET(wq, wq, log_wq_stride,
+		 mlx5e_get_rqwq_log_stride(params->rq_wq_type, ndsegs));
 	MLX5_SET(wq, wq, pd,               mdev->mlx5e_res.pdn);
 	MLX5_SET(rqc, rqc, counter_set_id, priv->q_counter);
 	MLX5_SET(rqc, rqc, vsd,            params->vlan_strip_disable);
@@ -1999,8 +2011,9 @@ static void mlx5e_build_drop_rq_param(struct mlx5e_priv *priv,
 	void *rqc = param->rqc;
 	void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
 
-	MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST);
-	MLX5_SET(wq, wq, log_wq_stride,    ilog2(sizeof(struct mlx5e_rx_wqe)));
+	MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC);
+	MLX5_SET(wq, wq, log_wq_stride,
+		 mlx5e_get_rqwq_log_stride(MLX5_WQ_TYPE_CYCLIC, 1));
 	MLX5_SET(rqc, rqc, counter_set_id, priv->drop_rq_q_counter);
 
 	param->wq.buf_numa_node = dev_to_node(&mdev->pdev->dev);
@@ -2051,7 +2064,7 @@ static void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv,
 		log_cq_size = mlx5e_mpwqe_get_log_rq_size(params) +
 			mlx5e_mpwqe_get_log_num_strides(mdev, params);
 		break;
-	default: /* MLX5_WQ_TYPE_LINKED_LIST */
+	default: /* MLX5_WQ_TYPE_CYCLIC */
 		log_cq_size = params->log_rq_mtu_frames;
 	}
 
@@ -2857,8 +2870,8 @@ static int mlx5e_alloc_drop_rq(struct mlx5_core_dev *mdev,
 
 	param->wq.db_numa_node = param->wq.buf_numa_node;
 
-	err = mlx5_wq_ll_create(mdev, &param->wq, rqc_wq, &rq->wqe.wq,
-				&rq->wq_ctrl);
+	err = mlx5_wq_cyc_create(mdev, &param->wq, rqc_wq, &rq->wqe.wq,
+				 &rq->wq_ctrl);
 	if (err)
 		return err;
 
@@ -3360,7 +3373,7 @@ static int set_feature_lro(struct net_device *netdev, bool enable)
 	new_channels.params = *old_params;
 	new_channels.params.lro_en = enable;
 
-	if (old_params->rq_wq_type != MLX5_WQ_TYPE_LINKED_LIST) {
+	if (old_params->rq_wq_type != MLX5_WQ_TYPE_CYCLIC) {
 		if (mlx5e_rx_mpwqe_is_linear_skb(mdev, old_params) ==
 		    mlx5e_rx_mpwqe_is_linear_skb(mdev, &new_channels.params))
 			reset = false;
@@ -3566,7 +3579,7 @@ int mlx5e_change_mtu(struct net_device *netdev, int new_mtu,
 	new_channels.params = *params;
 	new_channels.params.sw_mtu = new_mtu;
 
-	if (params->rq_wq_type != MLX5_WQ_TYPE_LINKED_LIST) {
+	if (params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
 		u8 ppw_old = mlx5e_mpwqe_log_pkts_per_wqe(params);
 		u8 ppw_new = mlx5e_mpwqe_log_pkts_per_wqe(&new_channels.params);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 8ab4c96b7f7c..3857f22b5500 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -927,7 +927,7 @@ static void mlx5e_build_rep_params(struct mlx5_core_dev *mdev,
 	params->hard_mtu    = MLX5E_ETH_HARD_MTU;
 	params->sw_mtu      = mtu;
 	params->log_sq_size = MLX5E_REP_PARAMS_LOG_SQ_SIZE;
-	params->rq_wq_type  = MLX5_WQ_TYPE_LINKED_LIST;
+	params->rq_wq_type  = MLX5_WQ_TYPE_CYCLIC;
 	params->log_rq_mtu_frames = MLX5E_REP_PARAMS_LOG_RQ_SIZE;
 
 	params->rx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 3b12d4de5b98..3cdf2c097356 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -113,7 +113,7 @@ static inline void mlx5e_decompress_cqe(struct mlx5e_rq *rq,
 			mpwrq_get_cqe_consumed_strides(&cq->title);
 	else
 		cq->decmprs_wqe_counter =
-			mlx5_wq_ll_ctr2ix(&rq->wqe.wq, cq->decmprs_wqe_counter + 1);
+			mlx5_wq_cyc_ctr2ix(&rq->wqe.wq, cq->decmprs_wqe_counter + 1);
 }
 
 static inline void mlx5e_decompress_cqe_no_hash(struct mlx5e_rq *rq,
@@ -270,7 +270,12 @@ static inline bool mlx5e_page_reuse(struct mlx5e_rq *rq,
 		!mlx5e_page_is_reserved(wi->di.page);
 }
 
-static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix)
+static inline struct mlx5e_wqe_frag_info *get_frag(struct mlx5e_rq *rq, u16 ix)
+{
+	return &rq->wqe.frag_info[ix];
+}
+
+static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe_cyc *wqe, u16 ix)
 {
 	struct mlx5e_wqe_frag_info *wi = &rq->wqe.frag_info[ix];
 
@@ -281,7 +286,7 @@ static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16
 		wi->offset = 0;
 	}
 
-	wqe->data.addr = cpu_to_be64(wi->di.addr + wi->offset + rq->buff.headroom);
+	wqe->data[0].addr = cpu_to_be64(wi->di.addr + wi->offset + rq->buff.headroom);
 	return 0;
 }
 
@@ -370,7 +375,7 @@ void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi)
 static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq)
 {
 	struct mlx5_wq_ll *wq = &rq->mpwqe.wq;
-	struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(wq, wq->head);
+	struct mlx5e_rx_wqe_ll *wqe = mlx5_wq_ll_get_wqe(wq, wq->head);
 
 	rq->mpwqe.umr_in_progress = false;
 
@@ -470,31 +475,32 @@ void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 
 bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 {
-	struct mlx5_wq_ll *wq = &rq->wqe.wq;
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
 	int err;
 
 	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
 		return false;
 
-	if (mlx5_wq_ll_is_full(wq))
+	if (mlx5_wq_cyc_is_full(wq))
 		return false;
 
 	do {
-		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(wq, wq->head);
+		u16 head = mlx5_wq_cyc_get_head(wq);
+		struct mlx5e_rx_wqe_cyc *wqe = mlx5_wq_cyc_get_wqe(wq, head);
 
-		err = mlx5e_alloc_rx_wqe(rq, wqe, wq->head);
+		err = mlx5e_alloc_rx_wqe(rq, wqe, head);
 		if (unlikely(err)) {
 			rq->stats->buff_alloc_err++;
 			break;
 		}
 
-		mlx5_wq_ll_push(wq, be16_to_cpu(wqe->next.next_wqe_index));
-	} while (!mlx5_wq_ll_is_full(wq));
+		mlx5_wq_cyc_push(wq);
+	} while (!mlx5_wq_cyc_is_full(wq));
 
 	/* ensure wqes are visible to device before updating doorbell record */
 	dma_wmb();
 
-	mlx5_wq_ll_update_db_record(wq);
+	mlx5_wq_cyc_update_db_record(wq);
 
 	return !!err;
 }
@@ -987,19 +993,15 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
 
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
-	struct mlx5_wq_ll *wq = &rq->wqe.wq;
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
-	struct mlx5e_rx_wqe *wqe;
-	__be16 wqe_counter_be;
 	struct sk_buff *skb;
-	u16 wqe_counter;
 	u32 cqe_bcnt;
+	u16 ci;
 
-	wqe_counter_be = cqe->wqe_counter;
-	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
-	wi             = &rq->wqe.frag_info[wqe_counter];
-	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
+	ci       = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter));
+	wi       = get_frag(rq, ci);
+	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
 	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (!skb) {
@@ -1007,20 +1009,19 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 		if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
 			wi->di.page = NULL;
 			/* do not return page to cache, it will be returned on XDP_TX completion */
-			goto wq_ll_pop;
+			goto wq_cyc_pop;
 		}
 		/* probably an XDP_DROP, save the page-reuse checks */
 		mlx5e_free_rx_wqe(rq, wi);
-		goto wq_ll_pop;
+		goto wq_cyc_pop;
 	}
 
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
 	napi_gro_receive(rq->cq.napi, skb);
 
 	mlx5e_free_rx_wqe_reuse(rq, wi);
-wq_ll_pop:
-	mlx5_wq_ll_pop(wq, wqe_counter_be,
-		       &wqe->next.next_wqe_index);
+wq_cyc_pop:
+	mlx5_wq_cyc_pop(wq);
 }
 
 #ifdef CONFIG_MLX5_ESWITCH
@@ -1030,30 +1031,26 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 	struct mlx5e_rep_priv *rpriv  = priv->ppriv;
 	struct mlx5_eswitch_rep *rep = rpriv->rep;
-	struct mlx5_wq_ll *wq = &rq->wqe.wq;
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
-	struct mlx5e_rx_wqe *wqe;
 	struct sk_buff *skb;
-	__be16 wqe_counter_be;
-	u16 wqe_counter;
 	u32 cqe_bcnt;
+	u16 ci;
 
-	wqe_counter_be = cqe->wqe_counter;
-	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
-	wi             = &rq->wqe.frag_info[wqe_counter];
-	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
+	ci       = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter));
+	wi       = get_frag(rq, ci);
+	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
 	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (!skb) {
 		if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
 			wi->di.page = NULL;
 			/* do not return page to cache, it will be returned on XDP_TX completion */
-			goto wq_ll_pop;
+			goto wq_cyc_pop;
 		}
 		/* probably an XDP_DROP, save the page-reuse checks */
 		mlx5e_free_rx_wqe(rq, wi);
-		goto wq_ll_pop;
+		goto wq_cyc_pop;
 	}
 
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
@@ -1064,9 +1061,8 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	napi_gro_receive(rq->cq.napi, skb);
 
 	mlx5e_free_rx_wqe_reuse(rq, wi);
-wq_ll_pop:
-	mlx5_wq_ll_pop(wq, wqe_counter_be,
-		       &wqe->next.next_wqe_index);
+wq_cyc_pop:
+	mlx5_wq_cyc_pop(wq);
 }
 #endif
 
@@ -1165,7 +1161,7 @@ void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 	u32 wqe_offset     = stride_ix << rq->mpwqe.log_stride_sz;
 	u32 head_offset    = wqe_offset & (PAGE_SIZE - 1);
 	u32 page_idx       = wqe_offset >> PAGE_SHIFT;
-	struct mlx5e_rx_wqe *wqe;
+	struct mlx5e_rx_wqe_ll *wqe;
 	struct mlx5_wq_ll *wq;
 	struct sk_buff *skb;
 	u16 cqe_bcnt;
@@ -1403,19 +1399,15 @@ static inline void mlx5i_complete_rx_cqe(struct mlx5e_rq *rq,
 
 void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
-	struct mlx5_wq_ll *wq = &rq->wqe.wq;
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
-	struct mlx5e_rx_wqe *wqe;
-	__be16 wqe_counter_be;
 	struct sk_buff *skb;
-	u16 wqe_counter;
 	u32 cqe_bcnt;
+	u16 ci;
 
-	wqe_counter_be = cqe->wqe_counter;
-	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
-	wi             = &rq->wqe.frag_info[wqe_counter];
-	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
+	ci       = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter));
+	wi       = get_frag(rq, ci);
+	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
 	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (!skb)
@@ -1430,8 +1422,7 @@ void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 wq_free_wqe:
 	mlx5e_free_rx_wqe_reuse(rq, wi);
-	mlx5_wq_ll_pop(wq, wqe_counter_be,
-		       &wqe->next.next_wqe_index);
+	mlx5_wq_cyc_pop(wq);
 }
 
 #endif /* CONFIG_MLX5_CORE_IPOIB */
@@ -1440,38 +1431,34 @@ void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 
 void mlx5e_ipsec_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
-	struct mlx5_wq_ll *wq = &rq->wqe.wq;
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
 	struct mlx5e_wqe_frag_info *wi;
-	struct mlx5e_rx_wqe *wqe;
-	__be16 wqe_counter_be;
 	struct sk_buff *skb;
-	u16 wqe_counter;
 	u32 cqe_bcnt;
+	u16 ci;
 
-	wqe_counter_be = cqe->wqe_counter;
-	wqe_counter    = be16_to_cpu(wqe_counter_be);
-	wqe            = mlx5_wq_ll_get_wqe(wq, wqe_counter);
-	wi             = &rq->wqe.frag_info[wqe_counter];
-	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
+	ci       = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter));
+	wi       = get_frag(rq, ci);
+	cqe_bcnt = be32_to_cpu(cqe->byte_cnt);
 
 	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (unlikely(!skb)) {
 		/* a DROP, save the page-reuse checks */
 		mlx5e_free_rx_wqe(rq, wi);
-		goto wq_ll_pop;
+		goto wq_cyc_pop;
 	}
 	skb = mlx5e_ipsec_handle_rx_skb(rq->netdev, skb);
 	if (unlikely(!skb)) {
 		mlx5e_free_rx_wqe(rq, wi);
-		goto wq_ll_pop;
+		goto wq_cyc_pop;
 	}
 
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
 	napi_gro_receive(rq->cq.napi, skb);
 
 	mlx5e_free_rx_wqe_reuse(rq, wi);
-wq_ll_pop:
-	mlx5_wq_ll_pop(wq, wqe_counter_be, &wqe->next.next_wqe_index);
+wq_cyc_pop:
+	mlx5_wq_cyc_pop(wq);
 }
 
 #endif /* CONFIG_MLX5_EN_IPSEC */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
index 5b8b35392025..b97bb72b4db4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c
@@ -85,6 +85,7 @@ int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param,
 	mlx5_fill_fbc(MLX5_GET(wq, wqc, log_wq_stride),
 		      MLX5_GET(wq, wqc, log_wq_sz),
 		      fbc);
+	wq->sz    = wq->fbc.sz_m1 + 1;
 
 	err = mlx5_db_alloc_node(mdev, &wq_ctrl->db, param->db_numa_node);
 	if (err) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.h b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
index b9d7c01fc7cb..0b47126815b6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
@@ -51,6 +51,9 @@ struct mlx5_wq_ctrl {
 struct mlx5_wq_cyc {
 	struct mlx5_frag_buf_ctrl fbc;
 	__be32			*db;
+	u16			sz;
+	u16			wqe_ctr;
+	u16			cur_sz;
 };
 
 struct mlx5_wq_qp {
@@ -95,6 +98,43 @@ u32 mlx5_wq_ll_get_size(struct mlx5_wq_ll *wq);
 
 void mlx5_wq_destroy(struct mlx5_wq_ctrl *wq_ctrl);
 
+static inline int mlx5_wq_cyc_is_full(struct mlx5_wq_cyc *wq)
+{
+	return wq->cur_sz == wq->sz;
+}
+
+static inline int mlx5_wq_cyc_missing(struct mlx5_wq_cyc *wq)
+{
+	return wq->sz - wq->cur_sz;
+}
+
+static inline int mlx5_wq_cyc_is_empty(struct mlx5_wq_cyc *wq)
+{
+	return !wq->cur_sz;
+}
+
+static inline void mlx5_wq_cyc_push(struct mlx5_wq_cyc *wq)
+{
+	wq->wqe_ctr++;
+	wq->cur_sz++;
+}
+
+static inline void mlx5_wq_cyc_push_n(struct mlx5_wq_cyc *wq, u8 n)
+{
+	wq->wqe_ctr += n;
+	wq->cur_sz += n;
+}
+
+static inline void mlx5_wq_cyc_pop(struct mlx5_wq_cyc *wq)
+{
+	wq->cur_sz--;
+}
+
+static inline void mlx5_wq_cyc_update_db_record(struct mlx5_wq_cyc *wq)
+{
+	*wq->db = cpu_to_be32(wq->wqe_ctr);
+}
+
 static inline u16 mlx5_wq_cyc_ctr2ix(struct mlx5_wq_cyc *wq, u16 ctr)
 {
 	return ctr & wq->fbc.sz_m1;
@@ -105,6 +145,16 @@ static inline u16 mlx5_wq_cyc_ctr2fragix(struct mlx5_wq_cyc *wq, u16 ctr)
 	return ctr & wq->fbc.frag_sz_m1;
 }
 
+static inline u16 mlx5_wq_cyc_get_head(struct mlx5_wq_cyc *wq)
+{
+	return mlx5_wq_cyc_ctr2ix(wq, wq->wqe_ctr);
+}
+
+static inline u16 mlx5_wq_cyc_get_tail(struct mlx5_wq_cyc *wq)
+{
+	return mlx5_wq_cyc_ctr2ix(wq, wq->wqe_ctr - wq->cur_sz);
+}
+
 static inline void *mlx5_wq_cyc_get_wqe(struct mlx5_wq_cyc *wq, u16 ix)
 {
 	return mlx5_frag_buf_get_wqe(&wq->fbc, ix);
@@ -179,11 +229,6 @@ static inline int mlx5_wq_ll_is_empty(struct mlx5_wq_ll *wq)
 	return !wq->cur_sz;
 }
 
-static inline u16 mlx5_wq_ll_ctr2ix(struct mlx5_wq_ll *wq, u16 ctr)
-{
-	return ctr & wq->fbc.sz_m1;
-}
-
 static inline void *mlx5_wq_ll_get_wqe(struct mlx5_wq_ll *wq, u16 ix)
 {
 	return mlx5_frag_buf_get_wqe(&wq->fbc, ix);
-- 
2.17.0

^ permalink raw reply related

* [net-next 17/17] net/mlx5e: TX, Separate cachelines of xmit and completion stats
From: Saeed Mahameed @ 2018-06-02  0:05 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Tariq Toukan, Saeed Mahameed
In-Reply-To: <20180602000544.18717-1-saeedm@mellanox.com>

From: Tariq Toukan <tariqt@mellanox.com>

Avoid false sharing of cachelines by separating the cachelines of
TX stats that are dirtied in xmit flow and in completion flow.

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 8 ++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h | 9 +++++----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index 697dc7397ba2..1646859974ce 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -64,11 +64,11 @@ static const struct counter_desc sw_stats_desc[] = {
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_partial) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_csum_partial_inner) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_stopped) },
-	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_wake) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_dropped) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_xmit_more) },
-	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_cqe_err) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_recover) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_queue_wake) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, tx_cqe_err) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_wqe_err) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_mpwqe_filler) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_buff_alloc_err) },
@@ -1137,11 +1137,11 @@ static const struct counter_desc sq_stats_desc[] = {
 	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, nop) },
 	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, csum_none) },
 	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, stopped) },
-	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, wake) },
 	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, dropped) },
 	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, xmit_more) },
-	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, cqe_err) },
 	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, recover) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, wake) },
+	{ MLX5E_DECLARE_TX_STAT(struct mlx5e_sq_stats, cqe_err) },
 };
 
 static const struct counter_desc ch_stats_desc[] = {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index 390c7afa5188..643153bb3607 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -75,11 +75,11 @@ struct mlx5e_sw_stats {
 	u64 tx_csum_partial;
 	u64 tx_csum_partial_inner;
 	u64 tx_queue_stopped;
-	u64 tx_queue_wake;
 	u64 tx_queue_dropped;
 	u64 tx_xmit_more;
-	u64 tx_cqe_err;
 	u64 tx_recover;
+	u64 tx_queue_wake;
+	u64 tx_cqe_err;
 	u64 rx_wqe_err;
 	u64 rx_mpwqe_filler;
 	u64 rx_buff_alloc_err;
@@ -203,10 +203,11 @@ struct mlx5e_sq_stats {
 	/* less likely accessed in data path */
 	u64 csum_none;
 	u64 stopped;
-	u64 wake;
 	u64 dropped;
-	u64 cqe_err;
 	u64 recover;
+	/* dirtied @completion */
+	u64 wake ____cacheline_aligned_in_smp;
+	u64 cqe_err;
 };
 
 struct mlx5e_ch_stats {
-- 
2.17.0

^ permalink raw reply related

* [net-next:master 379/381] drivers/net/ethernet/marvell/mvpp2/mvpp2.h:553:2: warning: overflow in implicit constant conversion
From: kbuild test robot @ 2018-06-02  1:31 UTC (permalink / raw)
  To: Maxime Chevallier; +Cc: kbuild-all, netdev

[-- Attachment #1: Type: text/plain, Size: 4566 bytes --]

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master
head:   21ad1173589ef63a93f94e05c879393e2c27588c
commit: db9d7d36eecc8926f03a8f2e46781887577b3353 [379/381] net: mvpp2: Split the PPv2 driver to a dedicated directory
config: s390-allmodconfig (attached as .config)
compiler: s390x-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        git checkout db9d7d36eecc8926f03a8f2e46781887577b3353
        # save the attached .config to linux build tree
        make.cross ARCH=s390 

All warnings (new ones prefixed by >>):

   In file included from drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c:43:0:
   drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c: In function 'mvpp2_setup_bm_pool':
>> drivers/net/ethernet/marvell/mvpp2/mvpp2.h:553:2: warning: overflow in implicit constant conversion [-Woverflow]
     ((total_size) - NET_SKB_PAD - MVPP2_SKB_SHINFO_SIZE)
     ^
>> drivers/net/ethernet/marvell/mvpp2/mvpp2.h:604:33: note: in expansion of macro 'MVPP2_RX_MAX_PKT_SIZE'
    #define MVPP2_BM_SHORT_PKT_SIZE MVPP2_RX_MAX_PKT_SIZE(MVPP2_BM_SHORT_FRAME_SIZE)
                                    ^~~~~~~~~~~~~~~~~~~~~
>> drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c:543:41: note: in expansion of macro 'MVPP2_BM_SHORT_PKT_SIZE'
     mvpp2_pools[MVPP2_BM_SHORT].pkt_size = MVPP2_BM_SHORT_PKT_SIZE;
                                            ^~~~~~~~~~~~~~~~~~~~~~~

vim +553 drivers/net/ethernet/marvell/mvpp2/mvpp2.h

   532	
   533	/* TX FIFO constants */
   534	#define MVPP22_TX_FIFO_DATA_SIZE_10KB		0xa
   535	#define MVPP22_TX_FIFO_DATA_SIZE_3KB		0x3
   536	#define MVPP2_TX_FIFO_THRESHOLD_MIN		256
   537	#define MVPP2_TX_FIFO_THRESHOLD_10KB	\
   538		(MVPP22_TX_FIFO_DATA_SIZE_10KB * 1024 - MVPP2_TX_FIFO_THRESHOLD_MIN)
   539	#define MVPP2_TX_FIFO_THRESHOLD_3KB	\
   540		(MVPP22_TX_FIFO_DATA_SIZE_3KB * 1024 - MVPP2_TX_FIFO_THRESHOLD_MIN)
   541	
   542	/* RX buffer constants */
   543	#define MVPP2_SKB_SHINFO_SIZE \
   544		SKB_DATA_ALIGN(sizeof(struct skb_shared_info))
   545	
   546	#define MVPP2_RX_PKT_SIZE(mtu) \
   547		ALIGN((mtu) + MVPP2_MH_SIZE + MVPP2_VLAN_TAG_LEN + \
   548		      ETH_HLEN + ETH_FCS_LEN, cache_line_size())
   549	
   550	#define MVPP2_RX_BUF_SIZE(pkt_size)	((pkt_size) + NET_SKB_PAD)
   551	#define MVPP2_RX_TOTAL_SIZE(buf_size)	((buf_size) + MVPP2_SKB_SHINFO_SIZE)
   552	#define MVPP2_RX_MAX_PKT_SIZE(total_size) \
 > 553		((total_size) - NET_SKB_PAD - MVPP2_SKB_SHINFO_SIZE)
   554	
   555	#define MVPP2_BIT_TO_BYTE(bit)		((bit) / 8)
   556	
   557	/* IPv6 max L3 address size */
   558	#define MVPP2_MAX_L3_ADDR_SIZE		16
   559	
   560	/* Port flags */
   561	#define MVPP2_F_LOOPBACK		BIT(0)
   562	
   563	/* Marvell tag types */
   564	enum mvpp2_tag_type {
   565		MVPP2_TAG_TYPE_NONE = 0,
   566		MVPP2_TAG_TYPE_MH   = 1,
   567		MVPP2_TAG_TYPE_DSA  = 2,
   568		MVPP2_TAG_TYPE_EDSA = 3,
   569		MVPP2_TAG_TYPE_VLAN = 4,
   570		MVPP2_TAG_TYPE_LAST = 5
   571	};
   572	
   573	/* L2 cast enum */
   574	enum mvpp2_prs_l2_cast {
   575		MVPP2_PRS_L2_UNI_CAST,
   576		MVPP2_PRS_L2_MULTI_CAST,
   577	};
   578	
   579	/* L3 cast enum */
   580	enum mvpp2_prs_l3_cast {
   581		MVPP2_PRS_L3_UNI_CAST,
   582		MVPP2_PRS_L3_MULTI_CAST,
   583		MVPP2_PRS_L3_BROAD_CAST
   584	};
   585	
   586	/* BM constants */
   587	#define MVPP2_BM_JUMBO_BUF_NUM		512
   588	#define MVPP2_BM_LONG_BUF_NUM		1024
   589	#define MVPP2_BM_SHORT_BUF_NUM		2048
   590	#define MVPP2_BM_POOL_SIZE_MAX		(16*1024 - MVPP2_BM_POOL_PTR_ALIGN/4)
   591	#define MVPP2_BM_POOL_PTR_ALIGN		128
   592	
   593	/* BM cookie (32 bits) definition */
   594	#define MVPP2_BM_COOKIE_POOL_OFFS	8
   595	#define MVPP2_BM_COOKIE_CPU_OFFS	24
   596	
   597	#define MVPP2_BM_SHORT_FRAME_SIZE		512
   598	#define MVPP2_BM_LONG_FRAME_SIZE		2048
   599	#define MVPP2_BM_JUMBO_FRAME_SIZE		10240
   600	/* BM short pool packet size
   601	 * These value assure that for SWF the total number
   602	 * of bytes allocated for each buffer will be 512
   603	 */
 > 604	#define MVPP2_BM_SHORT_PKT_SIZE	MVPP2_RX_MAX_PKT_SIZE(MVPP2_BM_SHORT_FRAME_SIZE)
   605	#define MVPP2_BM_LONG_PKT_SIZE	MVPP2_RX_MAX_PKT_SIZE(MVPP2_BM_LONG_FRAME_SIZE)
   606	#define MVPP2_BM_JUMBO_PKT_SIZE	MVPP2_RX_MAX_PKT_SIZE(MVPP2_BM_JUMBO_FRAME_SIZE)
   607	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 49422 bytes --]

^ permalink raw reply

* ANNOUNCE: Enhanced IP v1.4
From: Sam Patton @ 2018-06-02  1:48 UTC (permalink / raw)
  To: netdev

Hello!

If you do not know what Enhanced IP is, read this post on netdev first:

https://www.spinics.net/lists/netdev/msg327242.html


The Enhanced IP project presents:

             Enhanced IP v1.4

The Enhanced IP (EnIP) code has been updated.  It now builds with OpenWRT barrier breaker (for 148 different devices). We've been testing with the Western Digital N600 and N750 wireless home routers.

Interested in seeing Enhanced IP in the Linux kernel? Read on.  Not interested in seeing Enhanced IP in the Linux kernel? Read on.

Here's the value proposition: if the Internet community wants to, it can pass IP option 26 in the fast path (a single scriptable command enables this for Juniper and Cisco ISPs). Because this is easier than repeering globally (IPv6), this could in a relatively short period of time become a widespread option for IP option 26 WAN support.  Further, for experimentation to begin with EnIP, it is not necessary for IP option 26 to be passed in the fast path.  Quickly enabling WAN support is something IPv6 cannot do.  Yes, NATs have to be upgraded.  This is not optimal.  All that being said, the protocol is much, much simpler to understand than IPv6.  It is an evolution (similar to how CIDR and NAT became pervasive over time in the 1990's).  We have a lot of users who want to try the technology out, but most of them are not patch-your-kernel kinds of people.  It would work a lot better if Enhanced IP was an option available in standard kernels.

We are new to the Linux kernel community and would take advice on the code.  If you look at our repository you will find that every file we update has a comment with the phrase "enhanced ip" so it is easy to see where we have made changes.  We are maintaining separate patches across five kernels at the moment.  Each new kernel requires a separate porting process.  It takes 30 minutes-1 hour to get all the patch points reinstalled and a little more time to test.  We are especially interested in ideas people might have about integrating Enhanced IP with the IPv4 ICMP code.  I have been able to shim the rest of the code into the existing tcp/udp/ipv4 code base without much trouble.  However, the ICMP code is a different animal and I would welcome help from another developer (e.g. code patches).  At present you can do TCP and UDP sockets only. If a message such as an ICMP port unreachable needs to be sent to an EnIP address it does not work.  We are also working on the process by which we will accept patches to EnIP.  We were thinking about adopting the OpenWRT process but would certainly be willing to listen to other arguments based on experience.  Enhanced IP is a hobby project, so we're not looking to come up with our own process.  Rather, we would happily adopt a simple/terse process.

We've used Apache, samba, and ssh/scp over EnIP.  Basically, anything that uses getaddrinfo followed by connect should work fine.  For example, we think all the code that has already been ported to use the getaddrinfo pattern for IPv6 compatibility should also work with EnIP.

Our web site:
http://www.enhancedip.org/

Our git:
git clone https://github.com/EnIP/enhancedip.git

Our test virtual machines (write kernel code with VMs to simulate end to
end EnIP connections):
http://www.enhancedip.org/ENIP/

Thank you for your time.

-Sam Patton

^ permalink raw reply

* [PATCH bpf] bpf: fix uapi hole for 32 bit compat applications
From: Daniel Borkmann @ 2018-06-02  3:21 UTC (permalink / raw)
  To: ast
  Cc: netdev, jakub.kicinski, kafai, songliubraving, ldv, esyr,
	Daniel Borkmann

In 64 bit, we have a 4 byte hole between ifindex and netns_dev in the
case of struct bpf_map_info but also struct bpf_prog_info. In net-next
commit b85fab0e67b ("bpf: Add gpl_compatible flag to struct bpf_prog_info")
added a bitfield into it to expose some flags related to programs. Thus,
add an unnamed __u32 bitfield for both so that alignment stays the same
in both 32 and 64 bit cases, and can be naturally extended from there
as in b85fab0e67b.

Before:

  # file test.o
  test.o: ELF 32-bit LSB relocatable, Intel 80386, version 1 (SYSV), not stripped
  # pahole test.o
  struct bpf_map_info {
	__u32                      type;                 /*     0     4 */
	__u32                      id;                   /*     4     4 */
	__u32                      key_size;             /*     8     4 */
	__u32                      value_size;           /*    12     4 */
	__u32                      max_entries;          /*    16     4 */
	__u32                      map_flags;            /*    20     4 */
	char                       name[16];             /*    24    16 */
	__u32                      ifindex;              /*    40     4 */
	__u64                      netns_dev;            /*    44     8 */
	__u64                      netns_ino;            /*    52     8 */

	/* size: 64, cachelines: 1, members: 10 */
	/* padding: 4 */
  };

After (same as on 64 bit):

  # file test.o
  test.o: ELF 32-bit LSB relocatable, Intel 80386, version 1 (SYSV), not stripped
  # pahole test.o
  struct bpf_map_info {
	__u32                      type;                 /*     0     4 */
	__u32                      id;                   /*     4     4 */
	__u32                      key_size;             /*     8     4 */
	__u32                      value_size;           /*    12     4 */
	__u32                      max_entries;          /*    16     4 */
	__u32                      map_flags;            /*    20     4 */
	char                       name[16];             /*    24    16 */
	__u32                      ifindex;              /*    40     4 */

	/* XXX 4 bytes hole, try to pack */

	__u64                      netns_dev;            /*    48     8 */
	__u64                      netns_ino;            /*    56     8 */
	/* --- cacheline 1 boundary (64 bytes) --- */

	/* size: 64, cachelines: 1, members: 10 */
	/* sum members: 60, holes: 1, sum holes: 4 */
  };

Reported-by: Dmitry V. Levin <ldv@altlinux.org>
Reported-by: Eugene Syromiatnikov <esyr@redhat.com>
Fixes: 52775b33bb507 ("bpf: offload: report device information about offloaded maps")
Fixes: 675fc275a3a2d ("bpf: offload: report device information for offloaded programs")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h       | 2 ++
 tools/include/uapi/linux/bpf.h | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c5ec897..8c31773 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1017,6 +1017,7 @@ struct bpf_prog_info {
 	__aligned_u64 map_ids;
 	char name[BPF_OBJ_NAME_LEN];
 	__u32 ifindex;
+	__u32 :32;
 	__u64 netns_dev;
 	__u64 netns_ino;
 } __attribute__((aligned(8)));
@@ -1030,6 +1031,7 @@ struct bpf_map_info {
 	__u32 map_flags;
 	char  name[BPF_OBJ_NAME_LEN];
 	__u32 ifindex;
+	__u32 :32;
 	__u64 netns_dev;
 	__u64 netns_ino;
 } __attribute__((aligned(8)));
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c5ec897..8c31773 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1017,6 +1017,7 @@ struct bpf_prog_info {
 	__aligned_u64 map_ids;
 	char name[BPF_OBJ_NAME_LEN];
 	__u32 ifindex;
+	__u32 :32;
 	__u64 netns_dev;
 	__u64 netns_ino;
 } __attribute__((aligned(8)));
@@ -1030,6 +1031,7 @@ struct bpf_map_info {
 	__u32 map_flags;
 	char  name[BPF_OBJ_NAME_LEN];
 	__u32 ifindex;
+	__u32 :32;
 	__u64 netns_dev;
 	__u64 netns_ino;
 } __attribute__((aligned(8)));
-- 
2.9.5

^ permalink raw reply related

* Re: [PATCH bpf 1/2] bpf: fix alignment of netns_dev/netns_ino fields in bpf_{map,prog}_info
From: Daniel Borkmann @ 2018-06-02  3:28 UTC (permalink / raw)
  To: Song Liu, Eugene Syromiatnikov
  Cc: netdev, open list, Martin KaFai Lau, Alexei Starovoitov,
	David S. Miller, Jiri Olsa, Ingo Molnar, Lawrence Brakmo,
	Andrey Ignatov, Jakub Kicinski, John Fastabend, Dmitry V. Levin
In-Reply-To: <CAPhsuW4v=V0mbz07xe_shd68WP3BoV8iofxyVGApi+sL9kgzWQ@mail.gmail.com>

On 05/29/2018 07:17 PM, Song Liu wrote:
> On Sun, May 27, 2018 at 4:28 AM, Eugene Syromiatnikov <esyr@redhat.com> wrote:
>> Recent introduction of netns_dev/netns_ino to bpf_map_info/bpf_prog info
>> has broken compat, as offsets of these fields are different in 32-bit
>> and 64-bit ABIs.  One fix (other than implementing compat support in
>> syscall in order to handle this discrepancy) is to use __aligned_u64
>> instead of __u64 for these fields.
>>
>> Reported-by: Dmitry V. Levin <ldv@altlinux.org>
>> Fixes: 52775b33bb507 ("bpf: offload: report device information about
>> offloaded maps")
>> Fixes: 675fc275a3a2d ("bpf: offload: report device information for
>> offloaded programs")
>>
>> Signed-off-by: Eugene Syromiatnikov <esyr@redhat.com>
>> ---
>>  include/uapi/linux/bpf.h       | 8 ++++----
>>  tools/include/uapi/linux/bpf.h | 8 ++++----
>>  2 files changed, 8 insertions(+), 8 deletions(-)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index c5ec897..903010a 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -1017,8 +1017,8 @@ struct bpf_prog_info {
>>         __aligned_u64 map_ids;
>>         char name[BPF_OBJ_NAME_LEN];
>>         __u32 ifindex;
>> -       __u64 netns_dev;
>> -       __u64 netns_ino;
>> +       __aligned_u64 netns_dev;
>> +       __aligned_u64 netns_ino;
>>  } __attribute__((aligned(8)));
> 
> Shall we add a __u32 padding variable before netns_dev? We can use it
> in the future.

Agree with Song, and definitely prefer that approach since we already use the hole
as a bitfield in net-next; like this https://patchwork.ozlabs.org/patch/924415/.

^ permalink raw reply

* Re: [PATCH bpf] bpf: fix uapi hole for 32 bit compat applications
From: Alexei Starovoitov @ 2018-06-02  3:50 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: ast, netdev, jakub.kicinski, kafai, songliubraving, ldv, esyr
In-Reply-To: <20180602032159.21199-1-daniel@iogearbox.net>

On Sat, Jun 02, 2018 at 05:21:59AM +0200, Daniel Borkmann wrote:
> In 64 bit, we have a 4 byte hole between ifindex and netns_dev in the
> case of struct bpf_map_info but also struct bpf_prog_info. In net-next
> commit b85fab0e67b ("bpf: Add gpl_compatible flag to struct bpf_prog_info")
> added a bitfield into it to expose some flags related to programs. Thus,
> add an unnamed __u32 bitfield for both so that alignment keeps the same
> in both 32 and 64 bit cases, and can be naturally extended from there
> as in b85fab0e67b.
> 
> 
> Reported-by: Dmitry V. Levin <ldv@altlinux.org>
> Reported-by: Eugene Syromiatnikov <esyr@redhat.com>
> Fixes: 52775b33bb507 ("bpf: offload: report device information about offloaded maps")
> Fixes: 675fc275a3a2d ("bpf: offload: report device information for offloaded programs")
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Acked-by: Alexei Starovoitov <ast@kernel.org>

Applied, Thanks.

^ permalink raw reply

* Re: [PATCH bpf-next v4 0/5] fix test_sockmap
From: Daniel Borkmann @ 2018-06-02  4:16 UTC (permalink / raw)
  To: Prashant Bhole, Alexei Starovoitov, John Fastabend
  Cc: David S . Miller, Shuah Khan, netdev, linux-kselftest
In-Reply-To: <20180531044240.796-1-bhole_prashant_q7@lab.ntt.co.jp>

On 05/31/2018 06:42 AM, Prashant Bhole wrote:
> test_sockmap was originally written only to exercise kernel code
> paths, so there was no strict checking of errors. When the code was
> modified to run as selftests, due to lack of error handling it was not
> able to detect test failures.
> 
> In order to improve, this series fixes error handling, test run time
> and data verification.
> 
> Also slightly improved test output by printing parameter values (cork,
> apply, start, end) so that parameters for all tests are displayed.
> 
> Changes in v4:
>   - patch1: Ignore RX timeout error only for corked tests
>   - patch3: Setting different timeout for corked tests and reduce
>       run time by reducing number of iterations in some tests
> 
> Changes in v3:
>   - Skipped error checking for corked tests
> 
> Prashant Bhole (5):
>   selftests/bpf: test_sockmap, check test failure
>   selftests/bpf: test_sockmap, join cgroup in selftest mode
>   selftests/bpf: test_sockmap, timing improvements
>   selftests/bpf: test_sockmap, fix data verification
>   selftests/bpf: test_sockmap, print additional test options
> 
>  tools/testing/selftests/bpf/test_sockmap.c | 87 +++++++++++++++++-----
>  1 file changed, 67 insertions(+), 20 deletions(-)
> 

Applied to bpf-next, thanks Prashant!

^ permalink raw reply

* Re: [PATCH] samples/bpf: Add xdp_sample_pkts example
From: Daniel Borkmann @ 2018-06-02  4:22 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, Song Liu; +Cc: Networking
In-Reply-To: <87a7sgcg7i.fsf@toke.dk>

On 05/31/2018 11:44 AM, Toke Høiland-Jørgensen wrote:
> Song Liu <liu.song.a23@gmail.com> writes:
> 
>> On Wed, May 30, 2018 at 9:45 AM, Toke Høiland-Jørgensen <toke@toke.dk> wrote:
>>> This adds an example program showing how to sample packets from XDP using
>>> the perf event buffer. The example userspace program just prints the
>>> ethernet header for every packet sampled.
>>>
>>> Most of the userspace code is borrowed from other examples, most notably
>>> trace_output.
>>>
>>> Note that the example only works when everything runs on CPU0; so
>>> suitable smp_affinity needs to be set on the device. Some drivers seem
>>> to reset smp_affinity when loading an XDP program, so it may be
>>> necessary to change it after starting the example userspace program.
>>
>> Why does this only work when everything runs on CPU0? Is this
>> something we can improve?
> 
> Yeah, good question. Basically, the call from XDP to
> bpf_perf_event_output() will fail with -EOPNOTSUPP. I tracked this down
> to this if statement in __bpf_perf_event_output() in bpf_trace.c:
> 
>> 	if (unlikely(event->oncpu != cpu))
>> 		return -EOPNOTSUPP;
> 
> I *think* that the way to fix this is for the userspace program to open
> a perf file descriptor for each CPU in the system and poll all of them,
> in which case the XDP program can pass the BPF_F_CURRENT_CPU flag to
> access the right one.
That is correct, you need one perf fd per cpu, and map them accordingly
into the map slots when you use BPF_F_CURRENT_CPU.

^ permalink raw reply

* Re: [PATCH 15/18] rhashtable: use bit_spin_locks to protect hash bucket.
From: Herbert Xu @ 2018-06-02  5:03 UTC (permalink / raw)
  To: NeilBrown
  Cc: Thomas Graf, netdev, linux-kernel, Eric Dumazet, David S. Miller
In-Reply-To: <152782824984.30340.1634082820568216846.stgit@noble>

On Fri, Jun 01, 2018 at 02:44:09PM +1000, NeilBrown wrote:
> This patch changes rhashtables to use a bit_spin_lock (BIT(1)) in
> the bucket pointer to lock the hash chain for that bucket.
> 
> The benefits of a bit spin_lock are:
>  - no need to allocate a separate array of locks.
>  - no need to have a configuration option to guide the
>    choice of the size of this array
>  - locking cost is often a single test-and-set in a cache line
>    that will have to be loaded anyway.  When inserting at, or removing
>    from, the head of the chain, the unlock is free - writing the new
>    address in the bucket head implicitly clears the lock bit.
>  - even when locking costs 2 updates (lock and unlock), they are
>    in a cacheline that needs to be read anyway.
> 
> The cost of using a bit spin_lock is a little bit of code complexity,
> which I think is quite manageable.
> 
> Bit spin_locks are sometimes inappropriate because they are not fair -
> if multiple CPUs repeatedly contend for the same lock, one CPU can
> easily be starved.  This is not a credible situation with rhashtable.
> Multiple CPUs may want to repeatedly add or remove objects, but they
> will typically do so at different buckets, so they will attempt to
> acquire different locks.
> 
> As we have more bit-locks than we previously had spinlocks (by at
> least a factor of two) we can expect slightly less contention to
> go with the slightly better cache behavior and reduced memory
> consumption.
> 
> Signed-off-by: NeilBrown <neilb@suse.com>

...

> @@ -74,6 +71,61 @@ struct bucket_table {
>  	struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp;
>  };
>  
> +/*
> + * We lock a bucket by setting BIT(1) in the pointer - this is always
> + * zero in real pointers and in the nulls marker.
> + * bit_spin_locks do not handle contention well, but the whole point
> + * of the hashtable design is to achieve minimum per-bucket contention.
> + * A nested hash table might not have a bucket pointer.  In that case
> + * we cannot get a lock.  For remove and replace the bucket cannot be
> + * interesting and doesn't need locking.
> + * For insert we allocate the bucket if this is the last bucket_table,
> + * and then take the lock.
> + * Sometimes we unlock a bucket by writing a new pointer there.  In that
> + * case we don't need to unlock, but we do need to reset state such as
> + * local_bh. For that we have rht_unlocked().  This doesn't include
> + * the memory barrier that bit_spin_unlock() provides, but rcu_assign_pointer()
> + * will have provided that.
> + */

Yes the concept looks good to me.  But I would like to hear from
Eric/Dave as to whether this would be acceptable for existing
network hash tables such as the ones in inet.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH rdma-next v3 00/14] Verbs flow counters support
From: Leon Romanovsky @ 2018-06-02  5:04 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Doug Ledford, RDMA mailing list, Boris Pismenny, Matan Barak,
	Michael J . Ruhl, Or Gerlitz, Raed Salem, Yishai Hadas,
	Saeed Mahameed, linux-netdev
In-Reply-To: <20180601211149.GA24028@ziepe.ca>

[-- Attachment #1: Type: text/plain, Size: 1791 bytes --]

On Fri, Jun 01, 2018 at 03:11:49PM -0600, Jason Gunthorpe wrote:
> On Thu, May 31, 2018 at 04:43:27PM +0300, Leon Romanovsky wrote:
> > From: Leon Romanovsky <leonro@mellanox.com>
> >
> > Changelog:
> > v2->v3:
> >  * Change function mlx5_fc_query signature to hide the details of
> >    internal core driver struct mlx5_fc
> >  * Add comment to data[] field at struct mlx5_ib_flow_counters_data (mlx5-abi.h)
> >  * Use array of struct mlx5_ib_flow_counters_desc to clarify the output
> > v1->v2:
> >  * Removed conversion from struct mlx5_fc* to void*
> >  * Fixed one place with double space in it
> >  * Balanced release of hardware handler in case of counters allocation failure
> >  * Added Tested-by
> >  * Minimize time spent holding mutex lock
> >  * Fixed deadlock caused by nested lock in error path
> >  * Protect from handler pointer dereference in the error paths
>
> Okay,
>
> Acked-by: Jason Gunthorpe <jgg@mellanox.com>
>
> I've revised some of the commit messages, fixed the two bad
> check-patch warnings, and fixed the patch ordering..
>
> https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/log/?h=wip/jgg-counters
>
> Please send a PR with the mlx-core bits and above commits.

Hi,

I applied two mlx5-next commits to the relevant tree:
https://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git/commit/?h=mlx5-next&id=930821e39d0a5f91ed58fea1692afe04f0fe0e1f
https://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git/commit/?h=mlx5-next&id=5f9bf63ae80c4d0e5e986b6c1280bf8174978545

In first commit, I dropped the words "as used to be", per-Saeed's request.

The proper signed tag for the whole series is: verbs_flow_counters
git://git.kernel.org/pub/scm/linux/kernel/git/leon/linux-rdma.git tags/verbs_flow_counters

Thanks

>
> Thanks,
> Jason

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* Re: [PATCH rdma-next v3 05/14] IB/uverbs: Add create/destroy counters support
From: Leon Romanovsky @ 2018-06-02  5:05 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Doug Ledford, RDMA mailing list, Boris Pismenny, Matan Barak,
	Michael J . Ruhl, Or Gerlitz, Raed Salem, Yishai Hadas,
	Saeed Mahameed, linux-netdev
In-Reply-To: <20180601205436.GA20377@ziepe.ca>

[-- Attachment #1: Type: text/plain, Size: 2984 bytes --]

On Fri, Jun 01, 2018 at 02:54:36PM -0600, Jason Gunthorpe wrote:
> > diff --git a/drivers/infiniband/core/uverbs_std_types_counters.c b/drivers/infiniband/core/uverbs_std_types_counters.c
> > new file mode 100644
> > index 000000000000..a5bc50ceee13
> > +++ b/drivers/infiniband/core/uverbs_std_types_counters.c
> > @@ -0,0 +1,100 @@
> > +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */
>
> Check patch tells me this is malformed should be:
>
> // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
>
> Apparently the WITH Linux-syscall-note is only used in uapi header
> files.
>
> > +/*
> > + * Copyright (c) 2018, Mellanox Technologies inc.  All rights reserved.
> > + *
> > + * This software is available to you under a choice of one of two
> > + * licenses.  You may choose to be licensed under the terms of the GNU
> > + * General Public License (GPL) Version 2, available from the file
> > + * COPYING in the main directory of this source tree, or the
> > + * OpenIB.org BSD license below:
> > + *
> > + *     Redistribution and use in source and binary forms, with or
> > + *     without modification, are permitted provided that the following
> > + *     conditions are met:
> > + *
> > + *      - Redistributions of source code must retain the above
> > + *        copyright notice, this list of conditions and the following
> > + *        disclaimer.
> > + *
> > + *      - Redistributions in binary form must reproduce the above
> > + *        copyright notice, this list of conditions and the following
> > + *        disclaimer in the documentation and/or other materials
> > + *        provided with the distribution.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> > + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> > + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> > + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> > + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> > + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> > + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> > + * SOFTWARE.
> > + */
>
> And this is not a SPDX BSD-2-Clause license, this is the SPDX
> Linux-OpenIB license.
>
> Please be careful to use the correct tag with SPDX..
>
> Also can you check if these SPDX tags are what are intended:
>
> include/rdma/restrack.h:/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */
> drivers/infiniband/core/restrack.c:/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */
> drivers/infiniband/hw/mlx5/ib_rep.c:/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */
> drivers/infiniband/hw/mlx5/ib_rep.h:/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */
>
> I'm not super excited about the license proliferation, so if they
> should have been OR Linux-OpenIB as well then please send a patch.

They are supposed to be OpenIB.

Thanks

>
> Jason

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* pull-request: bpf 2018-06-02
From: Daniel Borkmann @ 2018-06-02  5:07 UTC (permalink / raw)
  To: davem; +Cc: daniel, ast, netdev

Hi David,

The following pull-request contains BPF updates for your *net* tree.

The main changes are:

1) BPF uapi fix in struct bpf_prog_info and struct bpf_map_info in
   order to fix offsets on 32 bit archs.

Please consider pulling these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git

This will have a minor merge conflict with net-next which has the
__u32 gpl_compatible:1 bitfield in struct bpf_prog_info at this
location. Resolution is to use the gpl_compatible member.

Thanks a lot!

----------------------------------------------------------------

The following changes since commit 8005b09d99fac78e6f5fb9da30b5ae94840af03b:

  net: ethernet: davinci_emac: fix error handling in probe() (2018-05-31 16:12:00 -0400)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git 

for you to fetch changes up to 36f9814a494a874d5a0f44843544b4b2539022db:

  bpf: fix uapi hole for 32 bit compat applications (2018-06-01 20:41:35 -0700)

----------------------------------------------------------------
Daniel Borkmann (1):
      bpf: fix uapi hole for 32 bit compat applications

 include/uapi/linux/bpf.h       | 2 ++
 tools/include/uapi/linux/bpf.h | 2 ++
 2 files changed, 4 insertions(+)

^ permalink raw reply

* Re: ANNOUNCE: Enhanced IP v1.4
From: Willy Tarreau @ 2018-06-02  5:57 UTC (permalink / raw)
  To: Sam Patton; +Cc: netdev
In-Reply-To: <d5377a99-da2d-583f-2a8c-72b58223c14a@enhancedip.org>

Hello Sam,

On Fri, Jun 01, 2018 at 09:48:28PM -0400, Sam Patton wrote:
> Hello!
> 
> If you do not know what Enhanced IP is, read this post on netdev first:
> 
> https://www.spinics.net/lists/netdev/msg327242.html
> 
> 
> The Enhanced IP project presents:
> 
>              Enhanced IP v1.4
> 
> The Enhanced IP (EnIP) code has been updated.  It now builds with OpenWRT barrier breaker (for 148 different devices). We've been testing with the Western Digital N600 and N750 wireless home routers.
(...) First note, please think about breaking your lines if you want your
mails to be read by the widest audience, as for some of us here, reading
lines wider than a terminal is really annoying, and often not considered
worth spending time on them considering there are so many easier ones
left to read.

> Interested in seeing Enhanced IP in the Linux kernel, read on.  Not
> interested in seeing Enhanced IP in the Linux kernel read on.
(...)

So I personally find the concept quite interesting. It reminds me of the
previous IPv5/IPv7/IPv8 initiatives, which in my opinion were a bit hopeless.
Here the fact that you decide to consider the IPv4 address as a network opens
new perspectives. For containerized environments it could be considered that
each server, with one IPv4, can host 2^32 guests and that NAT is not needed
anymore for example. It could also open the possibility that enthusiasts
can more easily host some services at home behind their ADSL line without
having to run on strange ports.

However I think your approach is not the most efficient to encourage adoption.
It's important to understand that there will be little incentive for people
to patch their kernels to run some code if they don't have the applications
on top of it. The kernel is not the end goal for most users, the kernel is
just the lower layer needed to run applications on top. I looked at your site
and the github repo, and all I could find was a pre-patched openssh, no simple
explanation of what to change in an application.

What you need to do first is to *explain* how to modify userland applications
to support En-IP, provide an echo server and show the parts which have to be
changed. Write a simple client and do the same. Provide your changes to
existing programs as patches, not as pre-patched code. This way anyone can
use your patches on top of other versions, and can use these patches to
understand what has to be modified in their applications.

Once applications are easy to patch, the incentive to install patched kernels
everywhere will be higher. For many enthusiasts, knowing that they only have
to modify the ADSL router to automatically make their internal IoT stuff
accessible from outside indeed becomes appealing.

Then you'll need to provide patches for well known applications like curl,
wget, DNS servers (bind...), then browsers.

In my case I could be interested in adding support for En-ip into haproxy,
and only once I don't see any showstopper in doing this, I'd be willing to
patch my kernel to support it.

Last advice, provide links to your drafts in future e-mails, they are not
easy to find on your site, we have to navigate through various pages to
finally find them.

Regards,
Willy

^ permalink raw reply

* [PATCH net v2] ipv6: omit traffic class when calculating flow hash
From: Michal Kubecek @ 2018-06-02  7:40 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, linux-kernel, Nicolas Dichtel, Tom Herbert, David Ahern,
	Ido Schimmel

Some of the code paths calculating flow hash for IPv6 use flowlabel member
of struct flowi6 which, despite its name, encodes both flow label and
traffic class. If traffic class changes within a TCP connection (as e.g.
ssh does), ECMP route can switch between paths. It's also inconsistent with
other code paths where ip6_flowlabel() (returning only flow label) is used
to feed the key.

Use only flow label everywhere, including one place where hash key is set
using ip6_flowinfo().

Fixes: 51ebd3181572 ("ipv6: add support of equal cost multipath (ECMP)")
Fixes: f70ea018da06 ("net: Add functions to get skb->hash based on flow structures")
Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
---
v2: introduce and use an inline helper as suggested by David Ahern

 include/net/ipv6.h        | 5 +++++
 net/core/flow_dissector.c | 2 +-
 net/ipv6/route.c          | 4 ++--
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 836f31af1369..7fbdc3e9e25d 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -906,6 +906,11 @@ static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel)
 	return htonl(tclass << IPV6_TCLASS_SHIFT) | flowlabel;
 }
 
+static inline u32 flowi6_get_flowlabel(const struct flowi6 *fl6)
+{
+	return (__force u32)(fl6->flowlabel & IPV6_FLOWLABEL_MASK);
+}
+
 /*
  *	Prototypes exported by ipv6
  */
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index d29f09bc5ff9..64e5cb8c0b3e 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1334,7 +1334,7 @@ __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
 	keys->ports.src = fl6->fl6_sport;
 	keys->ports.dst = fl6->fl6_dport;
 	keys->keyid.keyid = fl6->fl6_gre_key;
-	keys->tags.flow_label = (__force u32)fl6->flowlabel;
+	keys->tags.flow_label = flowi6_get_flowlabel(fl6);
 	keys->basic.ip_proto = fl6->flowi6_proto;
 
 	return flow_hash_from_keys(keys);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f4d61736c41a..b208cd597510 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1868,7 +1868,7 @@ static void ip6_multipath_l3_keys(const struct sk_buff *skb,
 	} else {
 		keys->addrs.v6addrs.src = key_iph->saddr;
 		keys->addrs.v6addrs.dst = key_iph->daddr;
-		keys->tags.flow_label = ip6_flowinfo(key_iph);
+		keys->tags.flow_label = ip6_flowlabel(key_iph);
 		keys->basic.ip_proto = key_iph->nexthdr;
 	}
 }
@@ -1889,7 +1889,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
 		} else {
 			hash_keys.addrs.v6addrs.src = fl6->saddr;
 			hash_keys.addrs.v6addrs.dst = fl6->daddr;
-			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
+			hash_keys.tags.flow_label = flowi6_get_flowlabel(fl6);
 			hash_keys.basic.ip_proto = fl6->flowi6_proto;
 		}
 		break;
-- 
2.17.1

^ permalink raw reply related

* re
From: Ms. Ella Golan @ 2018-06-02  6:53 UTC (permalink / raw)
  To: Recipients

I am Ms.Ella Golan, I am the Executive Vice President Banking Division with FIRST INTERNATIONAL BANK OF ISRAEL LTD (FIBI). I am getting in touch with you regarding an extremely important and urgent matter. If you would oblige me the opportunity, I shall provide you with details upon your response.

Faithfully,
Ms.Ella Golan

^ permalink raw reply

* Re: [PATCH net] ipv6: omit traffic class when calculating flow hash
From: Michal Kubecek @ 2018-06-02  7:54 UTC (permalink / raw)
  To: Ido Schimmel
  Cc: David S. Miller, netdev, linux-kernel, Nicolas Dichtel,
	Tom Herbert, David Ahern
In-Reply-To: <20180601181929.GA16452@splinter>

On Fri, Jun 01, 2018 at 09:19:29PM +0300, Ido Schimmel wrote:
> On Fri, Jun 01, 2018 at 12:34:41PM +0200, Michal Kubecek wrote:
> > Some of the code paths calculating flow hash for IPv6 use flowlabel member
> > of struct flowi6 which, despite its name, encodes both flow label and
> > traffic class. If traffic class changes within a TCP connection (as e.g.
> > ssh does), ECMP route can switch between paths. It's also inconsistent with
> > other code paths where ip6_flowlabel() (returning only flow label) is used
> > to feed the key.
> > 
> > Use only flow label everywhere, including one place where hash key is set
> > using ip6_flowinfo().
> > 
> > Fixes: 51ebd3181572 ("ipv6: add support of equal cost multipath (ECMP)")
> > Fixes: f70ea018da06 ("net: Add functions to get skb->hash based on flow structures")
> > Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
> 
> Please consider adding a test case to
> tools/testing/selftests/net/fib_tests.sh
> 
> Personally, I tested the patch by looping over different values of 'tos'
> for 'ip route get' and confirmed that the same nexthop is selected.

Thanks for the tip, I'll look into it next week.

Michal Kubecek

^ permalink raw reply

* Lucrative Business Proposal
From: Adrien Saif @ 2018-06-02  9:04 UTC (permalink / raw)




-- 
Dear Friend,

I would like to discuss a very important issue with you. I am writing 
to find out if this is your valid email. Please, let me know if this 
email is valid

Kind regards
Adrien Saif
Attorney to Quatif Group of Companies

^ permalink raw reply

* Re: [PATCH net v2] ipv6: omit traffic class when calculating flow hash
From: Ido Schimmel @ 2018-06-02  9:39 UTC (permalink / raw)
  To: Michal Kubecek
  Cc: David S. Miller, netdev, linux-kernel, Nicolas Dichtel,
	Tom Herbert, David Ahern
In-Reply-To: <20180602080528.54B27A0C48@unicorn.suse.cz>

On Sat, Jun 02, 2018 at 09:40:34AM +0200, Michal Kubecek wrote:
> Some of the code paths calculating flow hash for IPv6 use flowlabel member
> of struct flowi6 which, despite its name, encodes both flow label and
> traffic class. If traffic class changes within a TCP connection (as e.g.
> ssh does), ECMP route can switch between paths. It's also inconsistent with
> other code paths where ip6_flowlabel() (returning only flow label) is used
> to feed the key.
> 
> Use only flow label everywhere, including one place where hash key is set
> using ip6_flowinfo().
> 
> Fixes: 51ebd3181572 ("ipv6: add support of equal cost multipath (ECMP)")
> Fixes: f70ea018da06 ("net: Add functions to get skb->hash based on flow structures")
> Signed-off-by: Michal Kubecek <mkubecek@suse.cz>

Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Tested-by: Ido Schimmel <idosch@mellanox.com>

^ permalink raw reply

* Re: [PATCH 15/18] rhashtable: use bit_spin_locks to protect hash bucket.
From: Eric Dumazet @ 2018-06-02  9:53 UTC (permalink / raw)
  To: Herbert Xu, NeilBrown
  Cc: Thomas Graf, netdev, linux-kernel, Eric Dumazet, David S. Miller
In-Reply-To: <20180602050322.liesw324q5kawcue@gondor.apana.org.au>



On 06/02/2018 01:03 AM, Herbert Xu wrote:
 
> Yes the concept looks good to me.  But I would like to hear from
> Eric/Dave as to whether this would be acceptable for existing
> network hash tables such as the ones in inet.


What about lockdep support ?

^ permalink raw reply

* Re: [net-next][PATCH] tcp: probe timer MUST not less than 5 minuter for tcp PMTU
From: Eric Dumazet @ 2018-06-02 10:19 UTC (permalink / raw)
  To: Li RongQing, netdev
In-Reply-To: <1527851039-6626-1-git-send-email-lirongqing@baidu.com>



On 06/01/2018 07:03 AM, Li RongQing wrote:
> RFC4821 says: The value for this timer MUST NOT be less than
> 5 minutes and is recommended to be 10 minutes, per RFC 1981.
> 
> Signed-off-by: Li RongQing <lirongqing@baidu.com>
> ---
>  net/ipv4/sysctl_net_ipv4.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index d2eed3ddcb0a..ed8952bb6874 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -47,6 +47,7 @@ static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
>  static int ip_ping_group_range_min[] = { 0, 0 };
>  static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
>  static int comp_sack_nr_max = 255;
> +static int tcp_probe_interval_min = 300;
>  
>  /* obsolete */
>  static int sysctl_tcp_low_latency __read_mostly;
> @@ -711,7 +712,8 @@ static struct ctl_table ipv4_net_table[] = {
>  		.data		= &init_net.ipv4.sysctl_tcp_probe_interval,
>  		.maxlen		= sizeof(int),
>  		.mode		= 0644,
> -		.proc_handler	= proc_dointvec,
> +		.proc_handler	= proc_dointvec_minmax,
> +		.extra1		= &tcp_probe_interval_min,
>  	},
>  	{
>  		.procname	= "igmp_link_local_mcast_reports",
> 

Note that this change would stop people from being able to have packetdrill
tests which would run in a reasonable amount of time.

I do not believe linux kernel must enforce such a limit.

It is up to the admin to set a value here really, depending on the environment
the host is running in.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox