Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v2 net-next 09/11] qede: Better utilize the qede_[rt]x_queue
From: Yuval Mintz @ 2016-11-29 14:47 UTC (permalink / raw)
  To: davem, netdev; +Cc: Yuval Mintz
In-Reply-To: <1480430830-17671-1-git-send-email-Yuval.Mintz@cavium.com>

Improve the cacheline usage of both queues by reordering -
This reduces the cachelines required for egress datapath processing
from 3 to 2 and those required by ingress datapath processing by 2.

It also changes a couple of datapath related functions that currently
require either the fastpath or the qede_dev, changing them to be based
on the tx/rx queue instead.

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
---
 drivers/net/ethernet/qlogic/qede/qede.h         |  79 ++++++-----
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c |   4 +-
 drivers/net/ethernet/qlogic/qede/qede_main.c    | 166 +++++++++++-------------
 3 files changed, 124 insertions(+), 125 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index 4ebc638..21e8533 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -243,27 +243,33 @@ struct qede_agg_info {
 };
 
 struct qede_rx_queue {
-	__le16			*hw_cons_ptr;
-	struct sw_rx_data	*sw_rx_ring;
-	u16			sw_rx_cons;
-	u16			sw_rx_prod;
-	struct qed_chain	rx_bd_ring;
-	struct qed_chain	rx_comp_ring;
-	void __iomem		*hw_rxq_prod_addr;
+	__le16 *hw_cons_ptr;
+	void __iomem *hw_rxq_prod_addr;
 
-	/* GRO */
-	struct qede_agg_info	tpa_info[ETH_TPA_MAX_AGGS_NUM];
+	/* Required for the allocation of replacement buffers */
+	struct device *dev;
+
+	u16 sw_rx_cons;
+	u16 sw_rx_prod;
 
-	int			rx_buf_size;
-	unsigned int		rx_buf_seg_size;
+	u16 num_rx_buffers; /* Slowpath */
+	u8 rxq_id;
 
-	u16			num_rx_buffers;
-	u16			rxq_id;
+	u32 rx_buf_size;
+	u32 rx_buf_seg_size;
 
-	u64			rcv_pkts;
-	u64			rx_hw_errors;
-	u64			rx_alloc_errors;
-	u64			rx_ip_frags;
+	u64 rcv_pkts;
+
+	struct sw_rx_data *sw_rx_ring;
+	struct qed_chain rx_bd_ring;
+	struct qed_chain rx_comp_ring ____cacheline_aligned;
+
+	/* GRO */
+	struct qede_agg_info tpa_info[ETH_TPA_MAX_AGGS_NUM];
+
+	u64 rx_hw_errors;
+	u64 rx_alloc_errors;
+	u64 rx_ip_frags;
 
 	void *handle;
 };
@@ -281,22 +287,28 @@ struct sw_tx_bd {
 };
 
 struct qede_tx_queue {
-	int			index; /* Queue index */
-	__le16			*hw_cons_ptr;
-	struct sw_tx_bd		*sw_tx_ring;
-	u16			sw_tx_cons;
-	u16			sw_tx_prod;
-	struct qed_chain	tx_pbl;
-	void __iomem		*doorbell_addr;
-	union db_prod		tx_db;
-
-	u16			num_tx_buffers;
-	u64			xmit_pkts;
-	u64			stopped_cnt;
-
-	bool			is_legacy;
-	void *handle;
+	bool is_legacy;
+	u16 sw_tx_cons;
+	u16 sw_tx_prod;
+	u16 num_tx_buffers; /* Slowpath only */
 
+	u64 xmit_pkts;
+	u64 stopped_cnt;
+
+	__le16 *hw_cons_ptr;
+
+	/* Needed for the mapping of packets */
+	struct device *dev;
+
+	void __iomem *doorbell_addr;
+	union db_prod tx_db;
+	int index; /* Slowpath only */
+
+	struct sw_tx_bd *sw_tx_ring;
+	struct qed_chain tx_pbl;
+
+	/* Slowpath; Should be kept in end [unless missing padding] */
+	void *handle;
 };
 
 #define BD_UNMAP_ADDR(bd)		HILO_U64(le32_to_cpu((bd)->addr.hi), \
@@ -363,8 +375,7 @@ void qede_reload(struct qede_dev *edev,
 void __qede_unlock(struct qede_dev *edev);
 bool qede_has_rx_work(struct qede_rx_queue *rxq);
 int qede_txq_has_work(struct qede_tx_queue *txq);
-void qede_recycle_rx_bd_ring(struct qede_rx_queue *rxq, struct qede_dev *edev,
-			     u8 count);
+void qede_recycle_rx_bd_ring(struct qede_rx_queue *rxq, u8 count);
 void qede_update_rx_prod(struct qede_dev *edev, struct qede_rx_queue *rxq);
 
 #define RX_RING_SIZE_POW	13
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index ef8c327..60a2e58 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -1337,13 +1337,13 @@ static int qede_selftest_receive_traffic(struct qede_dev *edev)
 					break;
 				}
 
-			qede_recycle_rx_bd_ring(rxq, edev, 1);
+			qede_recycle_rx_bd_ring(rxq, 1);
 			qed_chain_recycle_consumed(&rxq->rx_comp_ring);
 			break;
 		}
 
 		DP_INFO(edev, "Not the transmitted packet\n");
-		qede_recycle_rx_bd_ring(rxq, edev, 1);
+		qede_recycle_rx_bd_ring(rxq, 1);
 		qed_chain_recycle_consumed(&rxq->rx_comp_ring);
 	}
 
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index ec01de2..15520bf 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -96,8 +96,6 @@ enum qede_pci_private {
 
 static void qede_remove(struct pci_dev *pdev);
 static void qede_shutdown(struct pci_dev *pdev);
-static int qede_alloc_rx_buffer(struct qede_dev *edev,
-				struct qede_rx_queue *rxq);
 static void qede_link_update(void *dev, struct qed_link_output *link);
 
 /* The qede lock is used to protect driver state change and driver flows that
@@ -355,8 +353,7 @@ static int qede_free_tx_pkt(struct qede_dev *edev,
 }
 
 /* Unmap the data and free skb when mapping failed during start_xmit */
-static void qede_free_failed_tx_pkt(struct qede_dev *edev,
-				    struct qede_tx_queue *txq,
+static void qede_free_failed_tx_pkt(struct qede_tx_queue *txq,
 				    struct eth_tx_1st_bd *first_bd,
 				    int nbd, bool data_split)
 {
@@ -378,7 +375,7 @@ static void qede_free_failed_tx_pkt(struct qede_dev *edev,
 		nbd--;
 	}
 
-	dma_unmap_single(&edev->pdev->dev, BD_UNMAP_ADDR(first_bd),
+	dma_unmap_single(txq->dev, BD_UNMAP_ADDR(first_bd),
 			 BD_UNMAP_LEN(first_bd) + split_bd_len, DMA_TO_DEVICE);
 
 	/* Unmap the data of the skb frags */
@@ -386,7 +383,7 @@ static void qede_free_failed_tx_pkt(struct qede_dev *edev,
 		tx_data_bd = (struct eth_tx_bd *)
 			qed_chain_produce(&txq->tx_pbl);
 		if (tx_data_bd->nbytes)
-			dma_unmap_page(&edev->pdev->dev,
+			dma_unmap_page(txq->dev,
 				       BD_UNMAP_ADDR(tx_data_bd),
 				       BD_UNMAP_LEN(tx_data_bd), DMA_TO_DEVICE);
 	}
@@ -401,8 +398,7 @@ static void qede_free_failed_tx_pkt(struct qede_dev *edev,
 	txq->sw_tx_ring[idx].flags = 0;
 }
 
-static u32 qede_xmit_type(struct qede_dev *edev,
-			  struct sk_buff *skb, int *ipv6_ext)
+static u32 qede_xmit_type(struct sk_buff *skb, int *ipv6_ext)
 {
 	u32 rc = XMIT_L4_CSUM;
 	__be16 l3_proto;
@@ -469,18 +465,16 @@ static void qede_set_params_for_ipv6_ext(struct sk_buff *skb,
 	second_bd->data.bitfields2 = cpu_to_le16(bd2_bits2);
 }
 
-static int map_frag_to_bd(struct qede_dev *edev,
+static int map_frag_to_bd(struct qede_tx_queue *txq,
 			  skb_frag_t *frag, struct eth_tx_bd *bd)
 {
 	dma_addr_t mapping;
 
 	/* Map skb non-linear frag data for DMA */
-	mapping = skb_frag_dma_map(&edev->pdev->dev, frag, 0,
+	mapping = skb_frag_dma_map(txq->dev, frag, 0,
 				   skb_frag_size(frag), DMA_TO_DEVICE);
-	if (unlikely(dma_mapping_error(&edev->pdev->dev, mapping))) {
-		DP_NOTICE(edev, "Unable to map frag - dropping packet\n");
+	if (unlikely(dma_mapping_error(txq->dev, mapping)))
 		return -ENOMEM;
-	}
 
 	/* Setup the data pointer of the frag data */
 	BD_SET_UNMAP_ADDR_LEN(bd, mapping, skb_frag_size(frag));
@@ -500,8 +494,7 @@ static u16 qede_get_skb_hlen(struct sk_buff *skb, bool is_encap_pkt)
 
 /* +2 for 1st BD for headers and 2nd BD for headlen (if required) */
 #if ((MAX_SKB_FRAGS + 2) > ETH_TX_MAX_BDS_PER_NON_LSO_PACKET)
-static bool qede_pkt_req_lin(struct qede_dev *edev, struct sk_buff *skb,
-			     u8 xmit_type)
+static bool qede_pkt_req_lin(struct sk_buff *skb, u8 xmit_type)
 {
 	int allowed_frags = ETH_TX_MAX_BDS_PER_NON_LSO_PACKET - 1;
 
@@ -565,10 +558,10 @@ static netdev_tx_t qede_start_xmit(struct sk_buff *skb,
 
 	WARN_ON(qed_chain_get_elem_left(&txq->tx_pbl) < (MAX_SKB_FRAGS + 1));
 
-	xmit_type = qede_xmit_type(edev, skb, &ipv6_ext);
+	xmit_type = qede_xmit_type(skb, &ipv6_ext);
 
 #if ((MAX_SKB_FRAGS + 2) > ETH_TX_MAX_BDS_PER_NON_LSO_PACKET)
-	if (qede_pkt_req_lin(edev, skb, xmit_type)) {
+	if (qede_pkt_req_lin(skb, xmit_type)) {
 		if (skb_linearize(skb)) {
 			DP_NOTICE(edev,
 				  "SKB linearization failed - silently dropping this SKB\n");
@@ -588,11 +581,11 @@ static netdev_tx_t qede_start_xmit(struct sk_buff *skb,
 		1 << ETH_TX_1ST_BD_FLAGS_START_BD_SHIFT;
 
 	/* Map skb linear data for DMA and set in the first BD */
-	mapping = dma_map_single(&edev->pdev->dev, skb->data,
+	mapping = dma_map_single(txq->dev, skb->data,
 				 skb_headlen(skb), DMA_TO_DEVICE);
-	if (unlikely(dma_mapping_error(&edev->pdev->dev, mapping))) {
+	if (unlikely(dma_mapping_error(txq->dev, mapping))) {
 		DP_NOTICE(edev, "SKB mapping failed\n");
-		qede_free_failed_tx_pkt(edev, txq, first_bd, 0, false);
+		qede_free_failed_tx_pkt(txq, first_bd, 0, false);
 		qede_update_tx_producer(txq);
 		return NETDEV_TX_OK;
 	}
@@ -716,12 +709,11 @@ static netdev_tx_t qede_start_xmit(struct sk_buff *skb,
 	/* Handle fragmented skb */
 	/* special handle for frags inside 2nd and 3rd bds.. */
 	while (tx_data_bd && frag_idx < skb_shinfo(skb)->nr_frags) {
-		rc = map_frag_to_bd(edev,
+		rc = map_frag_to_bd(txq,
 				    &skb_shinfo(skb)->frags[frag_idx],
 				    tx_data_bd);
 		if (rc) {
-			qede_free_failed_tx_pkt(edev, txq, first_bd, nbd,
-						data_split);
+			qede_free_failed_tx_pkt(txq, first_bd, nbd, data_split);
 			qede_update_tx_producer(txq);
 			return NETDEV_TX_OK;
 		}
@@ -741,12 +733,11 @@ static netdev_tx_t qede_start_xmit(struct sk_buff *skb,
 
 		memset(tx_data_bd, 0, sizeof(*tx_data_bd));
 
-		rc = map_frag_to_bd(edev,
+		rc = map_frag_to_bd(txq,
 				    &skb_shinfo(skb)->frags[frag_idx],
 				    tx_data_bd);
 		if (rc) {
-			qede_free_failed_tx_pkt(edev, txq, first_bd, nbd,
-						data_split);
+			qede_free_failed_tx_pkt(txq, first_bd, nbd, data_split);
 			qede_update_tx_producer(txq);
 			return NETDEV_TX_OK;
 		}
@@ -903,8 +894,7 @@ static inline void qede_rx_bd_ring_consume(struct qede_rx_queue *rxq)
 /* This function reuses the buffer(from an offset) from
  * consumer index to producer index in the bd ring
  */
-static inline void qede_reuse_page(struct qede_dev *edev,
-				   struct qede_rx_queue *rxq,
+static inline void qede_reuse_page(struct qede_rx_queue *rxq,
 				   struct sw_rx_data *curr_cons)
 {
 	struct eth_rx_bd *rx_bd_prod = qed_chain_produce(&rxq->rx_bd_ring);
@@ -926,27 +916,62 @@ static inline void qede_reuse_page(struct qede_dev *edev,
 /* In case of allocation failures reuse buffers
  * from consumer index to produce buffers for firmware
  */
-void qede_recycle_rx_bd_ring(struct qede_rx_queue *rxq,
-			     struct qede_dev *edev, u8 count)
+void qede_recycle_rx_bd_ring(struct qede_rx_queue *rxq, u8 count)
 {
 	struct sw_rx_data *curr_cons;
 
 	for (; count > 0; count--) {
 		curr_cons = &rxq->sw_rx_ring[rxq->sw_rx_cons & NUM_RX_BDS_MAX];
-		qede_reuse_page(edev, rxq, curr_cons);
+		qede_reuse_page(rxq, curr_cons);
 		qede_rx_bd_ring_consume(rxq);
 	}
 }
 
-static inline int qede_realloc_rx_buffer(struct qede_dev *edev,
-					 struct qede_rx_queue *rxq,
+static int qede_alloc_rx_buffer(struct qede_rx_queue *rxq)
+{
+	struct sw_rx_data *sw_rx_data;
+	struct eth_rx_bd *rx_bd;
+	dma_addr_t mapping;
+	struct page *data;
+
+	data = alloc_pages(GFP_ATOMIC, 0);
+	if (unlikely(!data))
+		return -ENOMEM;
+
+	/* Map the entire page as it would be used
+	 * for multiple RX buffer segment size mapping.
+	 */
+	mapping = dma_map_page(rxq->dev, data, 0,
+			       PAGE_SIZE, DMA_FROM_DEVICE);
+	if (unlikely(dma_mapping_error(rxq->dev, mapping))) {
+		__free_page(data);
+		return -ENOMEM;
+	}
+
+	sw_rx_data = &rxq->sw_rx_ring[rxq->sw_rx_prod & NUM_RX_BDS_MAX];
+	sw_rx_data->page_offset = 0;
+	sw_rx_data->data = data;
+	sw_rx_data->mapping = mapping;
+
+	/* Advance PROD and get BD pointer */
+	rx_bd = (struct eth_rx_bd *)qed_chain_produce(&rxq->rx_bd_ring);
+	WARN_ON(!rx_bd);
+	rx_bd->addr.hi = cpu_to_le32(upper_32_bits(mapping));
+	rx_bd->addr.lo = cpu_to_le32(lower_32_bits(mapping));
+
+	rxq->sw_rx_prod++;
+
+	return 0;
+}
+
+static inline int qede_realloc_rx_buffer(struct qede_rx_queue *rxq,
 					 struct sw_rx_data *curr_cons)
 {
 	/* Move to the next segment in the page */
 	curr_cons->page_offset += rxq->rx_buf_seg_size;
 
 	if (curr_cons->page_offset == PAGE_SIZE) {
-		if (unlikely(qede_alloc_rx_buffer(edev, rxq))) {
+		if (unlikely(qede_alloc_rx_buffer(rxq))) {
 			/* Since we failed to allocate new buffer
 			 * current buffer can be used again.
 			 */
@@ -955,7 +980,7 @@ static inline int qede_realloc_rx_buffer(struct qede_dev *edev,
 			return -ENOMEM;
 		}
 
-		dma_unmap_page(&edev->pdev->dev, curr_cons->mapping,
+		dma_unmap_page(rxq->dev, curr_cons->mapping,
 			       PAGE_SIZE, DMA_FROM_DEVICE);
 	} else {
 		/* Increment refcount of the page as we don't want
@@ -963,7 +988,7 @@ static inline int qede_realloc_rx_buffer(struct qede_dev *edev,
 		 * which can be recycled multiple times by the driver.
 		 */
 		page_ref_inc(curr_cons->data);
-		qede_reuse_page(edev, rxq, curr_cons);
+		qede_reuse_page(rxq, curr_cons);
 	}
 
 	return 0;
@@ -1026,6 +1051,7 @@ static void qede_set_skb_csum(struct sk_buff *skb, u8 csum_flag)
 
 static inline void qede_skb_receive(struct qede_dev *edev,
 				    struct qede_fastpath *fp,
+				    struct qede_rx_queue *rxq,
 				    struct sk_buff *skb, u16 vlan_tag)
 {
 	if (vlan_tag)
@@ -1068,7 +1094,7 @@ static int qede_fill_frag_skb(struct qede_dev *edev,
 			   current_bd->data, current_bd->page_offset,
 			   len_on_bd);
 
-	if (unlikely(qede_realloc_rx_buffer(edev, rxq, current_bd))) {
+	if (unlikely(qede_realloc_rx_buffer(rxq, current_bd))) {
 		/* Incr page ref count to reuse on allocation failure
 		 * so that it doesn't get freed while freeing SKB.
 		 */
@@ -1087,7 +1113,8 @@ static int qede_fill_frag_skb(struct qede_dev *edev,
 
 out:
 	tpa_info->state = QEDE_AGG_STATE_ERROR;
-	qede_recycle_rx_bd_ring(rxq, edev, 1);
+	qede_recycle_rx_bd_ring(rxq, 1);
+
 	return -ENOMEM;
 }
 
@@ -1239,7 +1266,7 @@ static void qede_gro_receive(struct qede_dev *edev,
 
 send_skb:
 	skb_record_rx_queue(skb, fp->rxq->rxq_id);
-	qede_skb_receive(edev, fp, skb, vlan_tag);
+	qede_skb_receive(edev, fp, fp->rxq, skb, vlan_tag);
 }
 
 static inline void qede_tpa_cont(struct qede_dev *edev,
@@ -1414,7 +1441,7 @@ static struct sk_buff *qede_rx_allocate_skb(struct qede_dev *edev,
 	if (len + pad <= edev->rx_copybreak) {
 		memcpy(skb_put(skb, len),
 		       page_address(page) + pad + offset, len);
-		qede_reuse_page(edev, rxq, bd);
+		qede_reuse_page(rxq, bd);
 		goto out;
 	}
 
@@ -1435,7 +1462,7 @@ static struct sk_buff *qede_rx_allocate_skb(struct qede_dev *edev,
 	skb->data_len -= pull_len;
 	skb->tail += pull_len;
 
-	if (unlikely(qede_realloc_rx_buffer(edev, rxq, bd))) {
+	if (unlikely(qede_realloc_rx_buffer(rxq, bd))) {
 		/* Incr page ref count to reuse on allocation failure so
 		 * that it doesn't get freed while freeing SKB [as its
 		 * already mapped there].
@@ -1477,7 +1504,7 @@ static int qede_rx_build_jumbo(struct qede_dev *edev,
 		}
 
 		/* We need a replacement buffer for each BD */
-		if (unlikely(qede_alloc_rx_buffer(edev, rxq)))
+		if (unlikely(qede_alloc_rx_buffer(rxq)))
 			goto out;
 
 		/* Now that we've allocated the replacement buffer,
@@ -1487,7 +1514,7 @@ static int qede_rx_build_jumbo(struct qede_dev *edev,
 		bd = &rxq->sw_rx_ring[bd_cons_idx];
 		qede_rx_bd_ring_consume(rxq);
 
-		dma_unmap_page(&edev->pdev->dev, bd->mapping,
+		dma_unmap_page(rxq->dev, bd->mapping,
 			       PAGE_SIZE, DMA_FROM_DEVICE);
 
 		skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags++,
@@ -1582,7 +1609,7 @@ static int qede_rx_process_cqe(struct qede_dev *edev,
 				  "CQE has error, flags = %x, dropping incoming packet\n",
 				  parse_flag);
 			rxq->rx_hw_errors++;
-			qede_recycle_rx_bd_ring(rxq, edev, fp_cqe->bd_num);
+			qede_recycle_rx_bd_ring(rxq, fp_cqe->bd_num);
 			return 0;
 		}
 	}
@@ -1593,7 +1620,7 @@ static int qede_rx_process_cqe(struct qede_dev *edev,
 	skb = qede_rx_allocate_skb(edev, rxq, bd, len, pad);
 	if (!skb) {
 		rxq->rx_alloc_errors++;
-		qede_recycle_rx_bd_ring(rxq, edev, fp_cqe->bd_num);
+		qede_recycle_rx_bd_ring(rxq, fp_cqe->bd_num);
 		return 0;
 	}
 
@@ -1605,7 +1632,7 @@ static int qede_rx_process_cqe(struct qede_dev *edev,
 							 fp_cqe, len);
 
 		if (unlikely(unmapped_frags > 0)) {
-			qede_recycle_rx_bd_ring(rxq, edev, unmapped_frags);
+			qede_recycle_rx_bd_ring(rxq, unmapped_frags);
 			dev_kfree_skb_any(skb);
 			return 0;
 		}
@@ -1618,7 +1645,7 @@ static int qede_rx_process_cqe(struct qede_dev *edev,
 	skb_record_rx_queue(skb, rxq->rxq_id);
 
 	/* SKB is prepared - pass it to stack */
-	qede_skb_receive(edev, fp, skb, le16_to_cpu(fp_cqe->vlan_tag));
+	qede_skb_receive(edev, fp, rxq, skb, le16_to_cpu(fp_cqe->vlan_tag));
 
 	return 1;
 }
@@ -2875,47 +2902,6 @@ static void qede_free_mem_rxq(struct qede_dev *edev, struct qede_rx_queue *rxq)
 	edev->ops->common->chain_free(edev->cdev, &rxq->rx_comp_ring);
 }
 
-static int qede_alloc_rx_buffer(struct qede_dev *edev,
-				struct qede_rx_queue *rxq)
-{
-	struct sw_rx_data *sw_rx_data;
-	struct eth_rx_bd *rx_bd;
-	dma_addr_t mapping;
-	struct page *data;
-
-	data = alloc_pages(GFP_ATOMIC, 0);
-	if (unlikely(!data)) {
-		DP_NOTICE(edev, "Failed to allocate Rx data [page]\n");
-		return -ENOMEM;
-	}
-
-	/* Map the entire page as it would be used
-	 * for multiple RX buffer segment size mapping.
-	 */
-	mapping = dma_map_page(&edev->pdev->dev, data, 0,
-			       PAGE_SIZE, DMA_FROM_DEVICE);
-	if (unlikely(dma_mapping_error(&edev->pdev->dev, mapping))) {
-		__free_page(data);
-		DP_NOTICE(edev, "Failed to map Rx buffer\n");
-		return -ENOMEM;
-	}
-
-	sw_rx_data = &rxq->sw_rx_ring[rxq->sw_rx_prod & NUM_RX_BDS_MAX];
-	sw_rx_data->page_offset = 0;
-	sw_rx_data->data = data;
-	sw_rx_data->mapping = mapping;
-
-	/* Advance PROD and get BD pointer */
-	rx_bd = (struct eth_rx_bd *)qed_chain_produce(&rxq->rx_bd_ring);
-	WARN_ON(!rx_bd);
-	rx_bd->addr.hi = cpu_to_le32(upper_32_bits(mapping));
-	rx_bd->addr.lo = cpu_to_le32(lower_32_bits(mapping));
-
-	rxq->sw_rx_prod++;
-
-	return 0;
-}
-
 static int qede_alloc_sge_mem(struct qede_dev *edev, struct qede_rx_queue *rxq)
 {
 	dma_addr_t mapping;
@@ -3010,7 +2996,7 @@ static int qede_alloc_mem_rxq(struct qede_dev *edev, struct qede_rx_queue *rxq)
 
 	/* Allocate buffers for the Rx ring */
 	for (i = 0; i < rxq->num_rx_buffers; i++) {
-		rc = qede_alloc_rx_buffer(edev, rxq);
+		rc = qede_alloc_rx_buffer(rxq);
 		if (rc) {
 			DP_ERR(edev,
 			       "Rx buffers allocation failed at index %d\n", i);
@@ -3151,12 +3137,14 @@ static void qede_init_fp(struct qede_dev *edev)
 
 		if (fp->type & QEDE_FASTPATH_RX) {
 			fp->rxq->rxq_id = rxq_index++;
+			fp->rxq->dev = &edev->pdev->dev;
 		}
 
 		if (fp->type & QEDE_FASTPATH_TX) {
 			fp->txq->index = txq_index++;
 			if (edev->dev_info.is_legacy)
 				fp->txq->is_legacy = 1;
+			fp->txq->dev = &edev->pdev->dev;
 		}
 
 		snprintf(fp->name, sizeof(fp->name), "%s-fp-%d",
-- 
1.9.3

^ permalink raw reply related

* [PATCH v2 net-next 08/11] qede: Don't check netdevice for rx-hash
From: Yuval Mintz @ 2016-11-29 14:47 UTC (permalink / raw)
  To: davem, netdev; +Cc: Yuval Mintz
In-Reply-To: <1480430830-17671-1-git-send-email-Yuval.Mintz@cavium.com>

Receive-hashing is a fixed feature, so there's no need to check
during the ingress datapath whether it's set or not.

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
---
 drivers/net/ethernet/qlogic/qede/qede_main.c | 34 ++++++++++------------------
 1 file changed, 12 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 615202f..ec01de2 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -997,22 +997,20 @@ void qede_update_rx_prod(struct qede_dev *edev, struct qede_rx_queue *rxq)
 	mmiowb();
 }
 
-static u32 qede_get_rxhash(struct qede_dev *edev,
-			   u8 bitfields,
-			   __le32 rss_hash, enum pkt_hash_types *rxhash_type)
+static void qede_get_rxhash(struct sk_buff *skb, u8 bitfields, __le32 rss_hash)
 {
+	enum pkt_hash_types hash_type = PKT_HASH_TYPE_NONE;
 	enum rss_hash_type htype;
+	u32 hash = 0;
 
 	htype = GET_FIELD(bitfields, ETH_FAST_PATH_RX_REG_CQE_RSS_HASH_TYPE);
-
-	if ((edev->ndev->features & NETIF_F_RXHASH) && htype) {
-		*rxhash_type = ((htype == RSS_HASH_TYPE_IPV4) ||
-				(htype == RSS_HASH_TYPE_IPV6)) ?
-				PKT_HASH_TYPE_L3 : PKT_HASH_TYPE_L4;
-		return le32_to_cpu(rss_hash);
+	if (htype) {
+		hash_type = ((htype == RSS_HASH_TYPE_IPV4) ||
+			     (htype == RSS_HASH_TYPE_IPV6)) ?
+			    PKT_HASH_TYPE_L3 : PKT_HASH_TYPE_L4;
+		hash = le32_to_cpu(rss_hash);
 	}
-	*rxhash_type = PKT_HASH_TYPE_NONE;
-	return 0;
+	skb_set_hash(skb, hash, hash_type);
 }
 
 static void qede_set_skb_csum(struct sk_buff *skb, u8 csum_flag)
@@ -1104,8 +1102,6 @@ static void qede_tpa_start(struct qede_dev *edev,
 	dma_addr_t mapping = tpa_info->buffer_mapping;
 	struct sw_rx_data *sw_rx_data_cons;
 	struct sw_rx_data *sw_rx_data_prod;
-	enum pkt_hash_types rxhash_type;
-	u32 rxhash;
 
 	sw_rx_data_cons = &rxq->sw_rx_ring[rxq->sw_rx_cons & NUM_RX_BDS_MAX];
 	sw_rx_data_prod = &rxq->sw_rx_ring[rxq->sw_rx_prod & NUM_RX_BDS_MAX];
@@ -1150,10 +1146,6 @@ static void qede_tpa_start(struct qede_dev *edev,
 	tpa_info->frag_id = 0;
 	tpa_info->state = QEDE_AGG_STATE_START;
 
-	rxhash = qede_get_rxhash(edev, cqe->bitfields,
-				 cqe->rss_hash, &rxhash_type);
-	skb_set_hash(tpa_info->skb, rxhash, rxhash_type);
-
 	/* Store some information from first CQE */
 	tpa_info->start_cqe_placement_offset = cqe->placement_offset;
 	tpa_info->start_cqe_bd_len = le16_to_cpu(cqe->len_on_first_bd);
@@ -1164,6 +1156,8 @@ static void qede_tpa_start(struct qede_dev *edev,
 	else
 		tpa_info->vlan_tag = 0;
 
+	qede_get_rxhash(tpa_info->skb, cqe->bitfields, cqe->rss_hash);
+
 	/* This is needed in order to enable forwarding support */
 	qede_set_gro_params(edev, tpa_info->skb, cqe);
 
@@ -1541,14 +1535,12 @@ static int qede_rx_process_cqe(struct qede_dev *edev,
 {
 	struct eth_fast_path_rx_reg_cqe *fp_cqe;
 	u16 len, pad, bd_cons_idx, parse_flag;
-	enum pkt_hash_types rxhash_type;
 	enum eth_rx_cqe_type cqe_type;
 	union eth_rx_cqe *cqe;
 	struct sw_rx_data *bd;
 	struct sk_buff *skb;
 	__le16 flags;
 	u8 csum_flag;
-	u32 rx_hash;
 
 	/* Get the CQE from the completion ring */
 	cqe = (union eth_rx_cqe *)qed_chain_consume(&rxq->rx_comp_ring);
@@ -1621,9 +1613,7 @@ static int qede_rx_process_cqe(struct qede_dev *edev,
 
 	/* The SKB contains all the data. Now prepare meta-magic */
 	skb->protocol = eth_type_trans(skb, edev->ndev);
-	rx_hash = qede_get_rxhash(edev, fp_cqe->bitfields,
-				  fp_cqe->rss_hash, &rxhash_type);
-	skb_set_hash(skb, rx_hash, rxhash_type);
+	qede_get_rxhash(skb, fp_cqe->bitfields, fp_cqe->rss_hash);
 	qede_set_skb_csum(skb, csum_flag);
 	skb_record_rx_queue(skb, rxq->rxq_id);
 
-- 
1.9.3

^ permalink raw reply related

* [PATCH v2 net-next 07/11] qed*: Handle-based L2-queues.
From: Yuval Mintz @ 2016-11-29 14:47 UTC (permalink / raw)
  To: davem, netdev; +Cc: Yuval Mintz
In-Reply-To: <1480430830-17671-1-git-send-email-Yuval.Mintz@cavium.com>

The driver needs to maintain several FW/HW-indices for each one of
its queues. Currently, that mapping is done by the QED where it uses
an rx/tx array of so-called hw-cids, populating them whenever a new
queue is opened and clearing them upon destruction of said queues.

This maintenance is far from ideal - there's no real reason why
QED needs to maintain such a data-structure. It becomes even worse
when considering the fact that the PF's queues and its child VFs' queues
are all mapped into the same data-structure.
As a by-product, the set of parameters an interface needs to supply for
queue APIs is non-trivial, and some of the variables in the API
structures have different meaning depending on their exact place
in the configuration flow.

This patch re-organizes the way L2 queues are configured and maintained.
In short:
  - Required parameters for queue init are now well-defined.
  - Qed would allocate a queue-cid based on parameters.
    Upon initialization success, it would return a handle to caller.
  - Queue-handle would be maintained by entity requesting queue-init,
    not necessarily qed.
  - All further queue-APIs [update, destroy] would use the opaque
    handle as reference for the queue instead of various indices.

The possible owners of such handles:
  - PF queues [qede] - complete handles based on provided configuration.
  - VF queues [qede] - fw-context-less handles, containing only relative
    information; Only the PF-side would need the absolute indices
    for configuration, so they're omitted here.
  - VF queues [qed, PF-side] - complete handles based on VF initialization.

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
---
 drivers/net/ethernet/qlogic/qed/qed.h        |  12 -
 drivers/net/ethernet/qlogic/qed/qed_dev.c    |  26 --
 drivers/net/ethernet/qlogic/qed/qed_l2.c     | 595 +++++++++++++++------------
 drivers/net/ethernet/qlogic/qed/qed_l2.h     | 133 ++++--
 drivers/net/ethernet/qlogic/qed/qed_sriov.c  | 275 +++++++++----
 drivers/net/ethernet/qlogic/qed/qed_sriov.h  |  21 +-
 drivers/net/ethernet/qlogic/qed/qed_vf.c     |  90 ++--
 drivers/net/ethernet/qlogic/qed/qed_vf.h     |  40 +-
 drivers/net/ethernet/qlogic/qede/qede.h      |   4 +
 drivers/net/ethernet/qlogic/qede/qede_main.c | 109 ++---
 include/linux/qed/qed_eth_if.h               |  56 +--
 11 files changed, 791 insertions(+), 570 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h
index 50b8a01..244dd40 100644
--- a/drivers/net/ethernet/qlogic/qed/qed.h
+++ b/drivers/net/ethernet/qlogic/qed/qed.h
@@ -241,15 +241,6 @@ struct qed_hw_info {
 	enum qed_wol_support b_wol_support;
 };
 
-struct qed_hw_cid_data {
-	u32	cid;
-	bool	b_cid_allocated;
-
-	/* Additional identifiers */
-	u16	opaque_fid;
-	u8	vport_id;
-};
-
 /* maximun size of read/write commands (HW limit) */
 #define DMAE_MAX_RW_SIZE        0x2000
 
@@ -416,9 +407,6 @@ struct qed_hwfn {
 
 	struct qed_dcbx_info		*p_dcbx_info;
 
-	struct qed_hw_cid_data		*p_tx_cids;
-	struct qed_hw_cid_data		*p_rx_cids;
-
 	struct qed_dmae_info		dmae_info;
 
 	/* QM init */
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 80162ee..00b9a67 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -137,15 +137,6 @@ void qed_resc_free(struct qed_dev *cdev)
 	for_each_hwfn(cdev, i) {
 		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 
-		kfree(p_hwfn->p_tx_cids);
-		p_hwfn->p_tx_cids = NULL;
-		kfree(p_hwfn->p_rx_cids);
-		p_hwfn->p_rx_cids = NULL;
-	}
-
-	for_each_hwfn(cdev, i) {
-		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
-
 		qed_cxt_mngr_free(p_hwfn);
 		qed_qm_info_free(p_hwfn);
 		qed_spq_free(p_hwfn);
@@ -425,23 +416,6 @@ int qed_resc_alloc(struct qed_dev *cdev)
 	if (!cdev->fw_data)
 		return -ENOMEM;
 
-	/* Allocate Memory for the Queue->CID mapping */
-	for_each_hwfn(cdev, i) {
-		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
-		int tx_size = sizeof(struct qed_hw_cid_data) *
-				     RESC_NUM(p_hwfn, QED_L2_QUEUE);
-		int rx_size = sizeof(struct qed_hw_cid_data) *
-				     RESC_NUM(p_hwfn, QED_L2_QUEUE);
-
-		p_hwfn->p_tx_cids = kzalloc(tx_size, GFP_KERNEL);
-		if (!p_hwfn->p_tx_cids)
-			goto alloc_no_mem;
-
-		p_hwfn->p_rx_cids = kzalloc(rx_size, GFP_KERNEL);
-		if (!p_hwfn->p_rx_cids)
-			goto alloc_no_mem;
-	}
-
 	for_each_hwfn(cdev, i) {
 		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
 		u32 n_eqes, num_cons;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 900b253..d40b167 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -23,6 +23,7 @@
 #include <linux/workqueue.h>
 #include <linux/bitops.h>
 #include <linux/bug.h>
+#include <linux/vmalloc.h>
 #include "qed.h"
 #include <linux/qed/qed_chain.h>
 #include "qed_cxt.h"
@@ -41,6 +42,124 @@
 #define QED_MAX_SGES_NUM 16
 #define CRC32_POLY 0x1edc6f41
 
+void qed_eth_queue_cid_release(struct qed_hwfn *p_hwfn,
+			       struct qed_queue_cid *p_cid)
+{
+	/* VFs' CIDs are 0-based in PF-view, and uninitialized on VF */
+	if (!p_cid->is_vf && IS_PF(p_hwfn->cdev))
+		qed_cxt_release_cid(p_hwfn, p_cid->cid);
+	vfree(p_cid);
+}
+
+/* The internal is only meant to be directly called by PFs initializeing CIDs
+ * for their VFs.
+ */
+struct qed_queue_cid *
+_qed_eth_queue_to_cid(struct qed_hwfn *p_hwfn,
+		      u16 opaque_fid,
+		      u32 cid,
+		      u8 vf_qid,
+		      struct qed_queue_start_common_params *p_params)
+{
+	bool b_is_same = (p_hwfn->hw_info.opaque_fid == opaque_fid);
+	struct qed_queue_cid *p_cid;
+	int rc;
+
+	p_cid = vmalloc(sizeof(*p_cid));
+	if (!p_cid)
+		return NULL;
+	memset(p_cid, 0, sizeof(*p_cid));
+
+	p_cid->opaque_fid = opaque_fid;
+	p_cid->cid = cid;
+	p_cid->vf_qid = vf_qid;
+	p_cid->rel = *p_params;
+
+	/* Don't try calculating the absolute indices for VFs */
+	if (IS_VF(p_hwfn->cdev)) {
+		p_cid->abs = p_cid->rel;
+		goto out;
+	}
+
+	/* Calculate the engine-absolute indices of the resources.
+	 * This would guarantee they're valid later on.
+	 * In some cases [SBs] we already have the right values.
+	 */
+	rc = qed_fw_vport(p_hwfn, p_cid->rel.vport_id, &p_cid->abs.vport_id);
+	if (rc)
+		goto fail;
+
+	rc = qed_fw_l2_queue(p_hwfn, p_cid->rel.queue_id, &p_cid->abs.queue_id);
+	if (rc)
+		goto fail;
+
+	/* In case of a PF configuring its VF's queues, the stats-id is already
+	 * absolute [since there's a single index that's suitable per-VF].
+	 */
+	if (b_is_same) {
+		rc = qed_fw_vport(p_hwfn, p_cid->rel.stats_id,
+				  &p_cid->abs.stats_id);
+		if (rc)
+			goto fail;
+	} else {
+		p_cid->abs.stats_id = p_cid->rel.stats_id;
+	}
+
+	/* SBs relevant information was already provided as absolute */
+	p_cid->abs.sb = p_cid->rel.sb;
+	p_cid->abs.sb_idx = p_cid->rel.sb_idx;
+
+	/* This is tricky - we're actually interested in whehter this is a PF
+	 * entry meant for the VF.
+	 */
+	if (!b_is_same)
+		p_cid->is_vf = true;
+out:
+	DP_VERBOSE(p_hwfn,
+		   QED_MSG_SP,
+		   "opaque_fid: %04x CID %08x vport %02x [%02x] qzone %04x [%04x] stats %02x [%02x] SB %04x PI %02x\n",
+		   p_cid->opaque_fid,
+		   p_cid->cid,
+		   p_cid->rel.vport_id,
+		   p_cid->abs.vport_id,
+		   p_cid->rel.queue_id,
+		   p_cid->abs.queue_id,
+		   p_cid->rel.stats_id,
+		   p_cid->abs.stats_id, p_cid->abs.sb, p_cid->abs.sb_idx);
+
+	return p_cid;
+
+fail:
+	vfree(p_cid);
+	return NULL;
+}
+
+static struct qed_queue_cid *qed_eth_queue_to_cid(struct qed_hwfn *p_hwfn,
+						  u16 opaque_fid, struct
+						  qed_queue_start_common_params
+						  *p_params)
+{
+	struct qed_queue_cid *p_cid;
+	u32 cid = 0;
+
+	/* Get a unique firmware CID for this queue, in case it's a PF.
+	 * VF's don't need a CID as the queue configuration will be done
+	 * by PF.
+	 */
+	if (IS_PF(p_hwfn->cdev)) {
+		if (qed_cxt_acquire_cid(p_hwfn, PROTOCOLID_ETH, &cid)) {
+			DP_NOTICE(p_hwfn, "Failed to acquire cid\n");
+			return NULL;
+		}
+	}
+
+	p_cid = _qed_eth_queue_to_cid(p_hwfn, opaque_fid, cid, 0, p_params);
+	if (!p_cid && IS_PF(p_hwfn->cdev))
+		qed_cxt_release_cid(p_hwfn, cid);
+
+	return p_cid;
+}
+
 int qed_sp_eth_vport_start(struct qed_hwfn *p_hwfn,
 			   struct qed_sp_vport_start_params *p_params)
 {
@@ -496,61 +615,26 @@ static int qed_filter_accept_cmd(struct qed_dev *cdev,
 	return 0;
 }
 
-static int qed_sp_release_queue_cid(
-	struct qed_hwfn *p_hwfn,
-	struct qed_hw_cid_data *p_cid_data)
-{
-	if (!p_cid_data->b_cid_allocated)
-		return 0;
-
-	qed_cxt_release_cid(p_hwfn, p_cid_data->cid);
-
-	p_cid_data->b_cid_allocated = false;
-
-	return 0;
-}
-
-int qed_sp_eth_rxq_start_ramrod(struct qed_hwfn *p_hwfn,
-				u16 opaque_fid,
-				u32 cid,
-				struct qed_queue_start_common_params *p_params,
-				u8 stats_id,
-				u16 bd_max_bytes,
-				dma_addr_t bd_chain_phys_addr,
-				dma_addr_t cqe_pbl_addr,
-				u16 cqe_pbl_size, bool b_use_zone_a_prod)
+int qed_eth_rxq_start_ramrod(struct qed_hwfn *p_hwfn,
+			     struct qed_queue_cid *p_cid,
+			     u16 bd_max_bytes,
+			     dma_addr_t bd_chain_phys_addr,
+			     dma_addr_t cqe_pbl_addr, u16 cqe_pbl_size)
 {
 	struct rx_queue_start_ramrod_data *p_ramrod = NULL;
 	struct qed_spq_entry *p_ent = NULL;
 	struct qed_sp_init_data init_data;
-	struct qed_hw_cid_data *p_rx_cid;
-	u16 abs_rx_q_id = 0;
-	u8 abs_vport_id = 0;
 	int rc = -EINVAL;
 
-	/* Store information for the stop */
-	p_rx_cid = &p_hwfn->p_rx_cids[p_params->queue_id];
-	p_rx_cid->cid = cid;
-	p_rx_cid->opaque_fid = opaque_fid;
-	p_rx_cid->vport_id = p_params->vport_id;
-
-	rc = qed_fw_vport(p_hwfn, p_params->vport_id, &abs_vport_id);
-	if (rc)
-		return rc;
-
-	rc = qed_fw_l2_queue(p_hwfn, p_params->queue_id, &abs_rx_q_id);
-	if (rc)
-		return rc;
-
 	DP_VERBOSE(p_hwfn, QED_MSG_SP,
-		   "opaque_fid=0x%x, cid=0x%x, rx_qid=0x%x, vport_id=0x%x, sb_id=0x%x\n",
-		   opaque_fid,
-		   cid, p_params->queue_id, p_params->vport_id, p_params->sb);
+		   "opaque_fid=0x%x, cid=0x%x, rx_qzone=0x%x, vport_id=0x%x, sb_id=0x%x\n",
+		   p_cid->opaque_fid, p_cid->cid,
+		   p_cid->abs.queue_id, p_cid->abs.vport_id, p_cid->abs.sb);
 
 	/* Get SPQ entry */
 	memset(&init_data, 0, sizeof(init_data));
-	init_data.cid = cid;
-	init_data.opaque_fid = opaque_fid;
+	init_data.cid = p_cid->cid;
+	init_data.opaque_fid = p_cid->opaque_fid;
 	init_data.comp_mode = QED_SPQ_MODE_EBLOCK;
 
 	rc = qed_sp_init_request(p_hwfn, &p_ent,
@@ -561,11 +645,11 @@ int qed_sp_eth_rxq_start_ramrod(struct qed_hwfn *p_hwfn,
 
 	p_ramrod = &p_ent->ramrod.rx_queue_start;
 
-	p_ramrod->sb_id = cpu_to_le16(p_params->sb);
-	p_ramrod->sb_index = p_params->sb_idx;
-	p_ramrod->vport_id = abs_vport_id;
-	p_ramrod->stats_counter_id = stats_id;
-	p_ramrod->rx_queue_id = cpu_to_le16(abs_rx_q_id);
+	p_ramrod->sb_id = cpu_to_le16(p_cid->abs.sb);
+	p_ramrod->sb_index = p_cid->abs.sb_idx;
+	p_ramrod->vport_id = p_cid->abs.vport_id;
+	p_ramrod->stats_counter_id = p_cid->abs.stats_id;
+	p_ramrod->rx_queue_id = cpu_to_le16(p_cid->abs.queue_id);
 	p_ramrod->complete_cqe_flg = 0;
 	p_ramrod->complete_event_flg = 1;
 
@@ -575,85 +659,85 @@ int qed_sp_eth_rxq_start_ramrod(struct qed_hwfn *p_hwfn,
 	p_ramrod->num_of_pbl_pages = cpu_to_le16(cqe_pbl_size);
 	DMA_REGPAIR_LE(p_ramrod->cqe_pbl_addr, cqe_pbl_addr);
 
-	if (p_params->vf_qid || b_use_zone_a_prod) {
-		p_ramrod->vf_rx_prod_index = p_params->vf_qid;
+	if (p_cid->is_vf) {
+		p_ramrod->vf_rx_prod_index = p_cid->vf_qid;
 		DP_VERBOSE(p_hwfn, QED_MSG_SP,
 			   "Queue%s is meant for VF rxq[%02x]\n",
-			   b_use_zone_a_prod ? " [legacy]" : "",
-			   p_params->vf_qid);
-		p_ramrod->vf_rx_prod_use_zone_a = b_use_zone_a_prod;
+			   !!p_cid->b_legacy_vf ? " [legacy]" : "",
+			   p_cid->vf_qid);
+		p_ramrod->vf_rx_prod_use_zone_a = !!p_cid->b_legacy_vf;
 	}
 
 	return qed_spq_post(p_hwfn, p_ent, NULL);
 }
 
 static int
-qed_sp_eth_rx_queue_start(struct qed_hwfn *p_hwfn,
-			  u16 opaque_fid,
-			  struct qed_queue_start_common_params *p_params,
+qed_eth_pf_rx_queue_start(struct qed_hwfn *p_hwfn,
+			  struct qed_queue_cid *p_cid,
 			  u16 bd_max_bytes,
 			  dma_addr_t bd_chain_phys_addr,
 			  dma_addr_t cqe_pbl_addr,
 			  u16 cqe_pbl_size, void __iomem **pp_prod)
 {
-	struct qed_hw_cid_data *p_rx_cid;
 	u32 init_prod_val = 0;
-	u16 abs_l2_queue = 0;
-	u8 abs_stats_id = 0;
-	int rc;
 
-	if (IS_VF(p_hwfn->cdev)) {
-		return qed_vf_pf_rxq_start(p_hwfn,
-					   p_params->queue_id,
-					   p_params->sb,
-					   (u8)p_params->sb_idx,
-					   bd_max_bytes,
-					   bd_chain_phys_addr,
-					   cqe_pbl_addr, cqe_pbl_size, pp_prod);
-	}
-
-	rc = qed_fw_l2_queue(p_hwfn, p_params->queue_id, &abs_l2_queue);
-	if (rc)
-		return rc;
-
-	rc = qed_fw_vport(p_hwfn, p_params->vport_id, &abs_stats_id);
-	if (rc)
-		return rc;
-
-	*pp_prod = (u8 __iomem *)p_hwfn->regview +
-				 GTT_BAR0_MAP_REG_MSDM_RAM +
-				 MSTORM_ETH_PF_PRODS_OFFSET(abs_l2_queue);
+	*pp_prod = p_hwfn->regview +
+		   GTT_BAR0_MAP_REG_MSDM_RAM +
+		    MSTORM_ETH_PF_PRODS_OFFSET(p_cid->abs.queue_id);
 
 	/* Init the rcq, rx bd and rx sge (if valid) producers to 0 */
 	__internal_ram_wr(p_hwfn, *pp_prod, sizeof(u32),
 			  (u32 *)(&init_prod_val));
 
+	return qed_eth_rxq_start_ramrod(p_hwfn, p_cid,
+					bd_max_bytes,
+					bd_chain_phys_addr,
+					cqe_pbl_addr, cqe_pbl_size);
+}
+
+static int
+qed_eth_rx_queue_start(struct qed_hwfn *p_hwfn,
+		       u16 opaque_fid,
+		       struct qed_queue_start_common_params *p_params,
+		       u16 bd_max_bytes,
+		       dma_addr_t bd_chain_phys_addr,
+		       dma_addr_t cqe_pbl_addr,
+		       u16 cqe_pbl_size,
+		       struct qed_rxq_start_ret_params *p_ret_params)
+{
+	struct qed_queue_cid *p_cid;
+	int rc;
+
 	/* Allocate a CID for the queue */
-	p_rx_cid = &p_hwfn->p_rx_cids[p_params->queue_id];
-	rc = qed_cxt_acquire_cid(p_hwfn, PROTOCOLID_ETH, &p_rx_cid->cid);
-	if (rc) {
-		DP_NOTICE(p_hwfn, "Failed to acquire cid\n");
-		return rc;
-	}
-	p_rx_cid->b_cid_allocated = true;
+	p_cid = qed_eth_queue_to_cid(p_hwfn, opaque_fid, p_params);
+	if (!p_cid)
+		return -ENOMEM;
 
-	rc = qed_sp_eth_rxq_start_ramrod(p_hwfn,
-					 opaque_fid,
-					 p_rx_cid->cid,
-					 p_params,
-					 abs_stats_id,
+	if (IS_PF(p_hwfn->cdev)) {
+		rc = qed_eth_pf_rx_queue_start(p_hwfn, p_cid,
+					       bd_max_bytes,
+					       bd_chain_phys_addr,
+					       cqe_pbl_addr, cqe_pbl_size,
+					       &p_ret_params->p_prod);
+	} else {
+		rc = qed_vf_pf_rxq_start(p_hwfn, p_cid,
 					 bd_max_bytes,
 					 bd_chain_phys_addr,
-					 cqe_pbl_addr, cqe_pbl_size, false);
+					 cqe_pbl_addr,
+					 cqe_pbl_size, &p_ret_params->p_prod);
+	}
 
+	/* Provide the caller with a reference to as handler */
 	if (rc)
-		qed_sp_release_queue_cid(p_hwfn, p_rx_cid);
+		qed_eth_queue_cid_release(p_hwfn, p_cid);
+	else
+		p_ret_params->p_handle = (void *)p_cid;
 
 	return rc;
 }
 
 int qed_sp_eth_rx_queues_update(struct qed_hwfn *p_hwfn,
-				u16 rx_queue_id,
+				void **pp_rxq_handles,
 				u8 num_rxqs,
 				u8 complete_cqe_flg,
 				u8 complete_event_flg,
@@ -663,8 +747,7 @@ int qed_sp_eth_rx_queues_update(struct qed_hwfn *p_hwfn,
 	struct rx_queue_update_ramrod_data *p_ramrod = NULL;
 	struct qed_spq_entry *p_ent = NULL;
 	struct qed_sp_init_data init_data;
-	struct qed_hw_cid_data *p_rx_cid;
-	u16 qid, abs_rx_q_id = 0;
+	struct qed_queue_cid *p_cid;
 	int rc = -EINVAL;
 	u8 i;
 
@@ -673,12 +756,11 @@ int qed_sp_eth_rx_queues_update(struct qed_hwfn *p_hwfn,
 	init_data.p_comp_data = p_comp_data;
 
 	for (i = 0; i < num_rxqs; i++) {
-		qid = rx_queue_id + i;
-		p_rx_cid = &p_hwfn->p_rx_cids[qid];
+		p_cid = ((struct qed_queue_cid **)pp_rxq_handles)[i];
 
 		/* Get SPQ entry */
-		init_data.cid = p_rx_cid->cid;
-		init_data.opaque_fid = p_rx_cid->opaque_fid;
+		init_data.cid = p_cid->cid;
+		init_data.opaque_fid = p_cid->opaque_fid;
 
 		rc = qed_sp_init_request(p_hwfn, &p_ent,
 					 ETH_RAMROD_RX_QUEUE_UPDATE,
@@ -687,10 +769,9 @@ int qed_sp_eth_rx_queues_update(struct qed_hwfn *p_hwfn,
 			return rc;
 
 		p_ramrod = &p_ent->ramrod.rx_queue_update;
+		p_ramrod->vport_id = p_cid->abs.vport_id;
 
-		qed_fw_vport(p_hwfn, p_rx_cid->vport_id, &p_ramrod->vport_id);
-		qed_fw_l2_queue(p_hwfn, qid, &abs_rx_q_id);
-		p_ramrod->rx_queue_id = cpu_to_le16(abs_rx_q_id);
+		p_ramrod->rx_queue_id = cpu_to_le16(p_cid->abs.queue_id);
 		p_ramrod->complete_cqe_flg = complete_cqe_flg;
 		p_ramrod->complete_event_flg = complete_event_flg;
 
@@ -702,24 +783,19 @@ int qed_sp_eth_rx_queues_update(struct qed_hwfn *p_hwfn,
 	return rc;
 }
 
-int qed_sp_eth_rx_queue_stop(struct qed_hwfn *p_hwfn,
-			     u16 rx_queue_id,
-			     bool eq_completion_only, bool cqe_completion)
+static int
+qed_eth_pf_rx_queue_stop(struct qed_hwfn *p_hwfn,
+			 struct qed_queue_cid *p_cid,
+			 bool b_eq_completion_only, bool b_cqe_completion)
 {
-	struct qed_hw_cid_data *p_rx_cid = &p_hwfn->p_rx_cids[rx_queue_id];
 	struct rx_queue_stop_ramrod_data *p_ramrod = NULL;
 	struct qed_spq_entry *p_ent = NULL;
 	struct qed_sp_init_data init_data;
-	u16 abs_rx_q_id = 0;
-	int rc = -EINVAL;
-
-	if (IS_VF(p_hwfn->cdev))
-		return qed_vf_pf_rxq_stop(p_hwfn, rx_queue_id, cqe_completion);
+	int rc;
 
-	/* Get SPQ entry */
 	memset(&init_data, 0, sizeof(init_data));
-	init_data.cid = p_rx_cid->cid;
-	init_data.opaque_fid = p_rx_cid->opaque_fid;
+	init_data.cid = p_cid->cid;
+	init_data.opaque_fid = p_cid->opaque_fid;
 	init_data.comp_mode = QED_SPQ_MODE_EBLOCK;
 
 	rc = qed_sp_init_request(p_hwfn, &p_ent,
@@ -729,62 +805,53 @@ int qed_sp_eth_rx_queue_stop(struct qed_hwfn *p_hwfn,
 		return rc;
 
 	p_ramrod = &p_ent->ramrod.rx_queue_stop;
-
-	qed_fw_vport(p_hwfn, p_rx_cid->vport_id, &p_ramrod->vport_id);
-	qed_fw_l2_queue(p_hwfn, rx_queue_id, &abs_rx_q_id);
-	p_ramrod->rx_queue_id = cpu_to_le16(abs_rx_q_id);
+	p_ramrod->vport_id = p_cid->abs.vport_id;
+	p_ramrod->rx_queue_id = cpu_to_le16(p_cid->abs.queue_id);
 
 	/* Cleaning the queue requires the completion to arrive there.
 	 * In addition, VFs require the answer to come as eqe to PF.
 	 */
-	p_ramrod->complete_cqe_flg =
-		(!!(p_rx_cid->opaque_fid == p_hwfn->hw_info.opaque_fid) &&
-		 !eq_completion_only) || cqe_completion;
-	p_ramrod->complete_event_flg =
-		!(p_rx_cid->opaque_fid == p_hwfn->hw_info.opaque_fid) ||
-		eq_completion_only;
+	p_ramrod->complete_cqe_flg = (!p_cid->is_vf &&
+				      !b_eq_completion_only) ||
+				     b_cqe_completion;
+	p_ramrod->complete_event_flg = p_cid->is_vf || b_eq_completion_only;
 
-	rc = qed_spq_post(p_hwfn, p_ent, NULL);
-	if (rc)
-		return rc;
+	return qed_spq_post(p_hwfn, p_ent, NULL);
+}
+
+int qed_eth_rx_queue_stop(struct qed_hwfn *p_hwfn,
+			  void *p_rxq,
+			  bool eq_completion_only, bool cqe_completion)
+{
+	struct qed_queue_cid *p_cid = (struct qed_queue_cid *)p_rxq;
+	int rc = -EINVAL;
 
-	return qed_sp_release_queue_cid(p_hwfn, p_rx_cid);
+	if (IS_PF(p_hwfn->cdev))
+		rc = qed_eth_pf_rx_queue_stop(p_hwfn, p_cid,
+					      eq_completion_only,
+					      cqe_completion);
+	else
+		rc = qed_vf_pf_rxq_stop(p_hwfn, p_cid, cqe_completion);
+
+	if (!rc)
+		qed_eth_queue_cid_release(p_hwfn, p_cid);
+	return rc;
 }
 
-int qed_sp_eth_txq_start_ramrod(struct qed_hwfn  *p_hwfn,
-				u16  opaque_fid,
-				u32  cid,
-				struct qed_queue_start_common_params *p_params,
-				u8  stats_id,
-				dma_addr_t pbl_addr,
-				u16 pbl_size,
-				union qed_qm_pq_params *p_pq_params)
+int
+qed_eth_txq_start_ramrod(struct qed_hwfn *p_hwfn,
+			 struct qed_queue_cid *p_cid,
+			 dma_addr_t pbl_addr, u16 pbl_size, u16 pq_id)
 {
 	struct tx_queue_start_ramrod_data *p_ramrod = NULL;
 	struct qed_spq_entry *p_ent = NULL;
 	struct qed_sp_init_data init_data;
-	struct qed_hw_cid_data *p_tx_cid;
-	u16 pq_id, abs_tx_q_id = 0;
 	int rc = -EINVAL;
-	u8 abs_vport_id;
-
-	/* Store information for the stop */
-	p_tx_cid = &p_hwfn->p_tx_cids[p_params->queue_id];
-	p_tx_cid->cid		= cid;
-	p_tx_cid->opaque_fid	= opaque_fid;
-
-	rc = qed_fw_vport(p_hwfn, p_params->vport_id, &abs_vport_id);
-	if (rc)
-		return rc;
-
-	rc = qed_fw_l2_queue(p_hwfn, p_params->queue_id, &abs_tx_q_id);
-	if (rc)
-		return rc;
 
 	/* Get SPQ entry */
 	memset(&init_data, 0, sizeof(init_data));
-	init_data.cid = cid;
-	init_data.opaque_fid = opaque_fid;
+	init_data.cid = p_cid->cid;
+	init_data.opaque_fid = p_cid->opaque_fid;
 	init_data.comp_mode = QED_SPQ_MODE_EBLOCK;
 
 	rc = qed_sp_init_request(p_hwfn, &p_ent,
@@ -794,96 +861,92 @@ int qed_sp_eth_txq_start_ramrod(struct qed_hwfn  *p_hwfn,
 		return rc;
 
 	p_ramrod = &p_ent->ramrod.tx_queue_start;
-	p_ramrod->vport_id = abs_vport_id;
+	p_ramrod->vport_id = p_cid->abs.vport_id;
 
-	p_ramrod->sb_id = cpu_to_le16(p_params->sb);
-	p_ramrod->sb_index = p_params->sb_idx;
-	p_ramrod->stats_counter_id = stats_id;
+	p_ramrod->sb_id = cpu_to_le16(p_cid->abs.sb);
+	p_ramrod->sb_index = p_cid->abs.sb_idx;
+	p_ramrod->stats_counter_id = p_cid->abs.stats_id;
 
-	p_ramrod->queue_zone_id = cpu_to_le16(abs_tx_q_id);
+	p_ramrod->queue_zone_id = cpu_to_le16(p_cid->abs.queue_id);
+	p_ramrod->same_as_last_id = cpu_to_le16(p_cid->abs.queue_id);
 
 	p_ramrod->pbl_size = cpu_to_le16(pbl_size);
 	DMA_REGPAIR_LE(p_ramrod->pbl_base_addr, pbl_addr);
 
-	pq_id = qed_get_qm_pq(p_hwfn, PROTOCOLID_ETH, p_pq_params);
 	p_ramrod->qm_pq_id = cpu_to_le16(pq_id);
 
 	return qed_spq_post(p_hwfn, p_ent, NULL);
 }
 
 static int
-qed_sp_eth_tx_queue_start(struct qed_hwfn *p_hwfn,
-			  u16 opaque_fid,
-			  struct qed_queue_start_common_params *p_params,
+qed_eth_pf_tx_queue_start(struct qed_hwfn *p_hwfn,
+			  struct qed_queue_cid *p_cid,
+			  u8 tc,
 			  dma_addr_t pbl_addr,
 			  u16 pbl_size, void __iomem **pp_doorbell)
 {
-	struct qed_hw_cid_data *p_tx_cid;
 	union qed_qm_pq_params pq_params;
-	u8 abs_stats_id = 0;
 	int rc;
 
-	if (IS_VF(p_hwfn->cdev)) {
-		return qed_vf_pf_txq_start(p_hwfn,
-					   p_params->queue_id,
-					   p_params->sb,
-					   p_params->sb_idx,
-					   pbl_addr, pbl_size, pp_doorbell);
-	}
+	memset(&pq_params, 0, sizeof(pq_params));
 
-	rc = qed_fw_vport(p_hwfn, p_params->vport_id, &abs_stats_id);
+	rc = qed_eth_txq_start_ramrod(p_hwfn, p_cid,
+				      pbl_addr, pbl_size,
+				      qed_get_qm_pq(p_hwfn, PROTOCOLID_ETH,
+						    &pq_params));
 	if (rc)
 		return rc;
 
-	p_tx_cid = &p_hwfn->p_tx_cids[p_params->queue_id];
-	memset(p_tx_cid, 0, sizeof(*p_tx_cid));
-	memset(&pq_params, 0, sizeof(pq_params));
+	/* Provide the caller with the necessary return values */
+	*pp_doorbell = p_hwfn->doorbells +
+		       qed_db_addr(p_cid->cid, DQ_DEMS_LEGACY);
 
-	/* Allocate a CID for the queue */
-	rc = qed_cxt_acquire_cid(p_hwfn, PROTOCOLID_ETH, &p_tx_cid->cid);
-	if (rc) {
-		DP_NOTICE(p_hwfn, "Failed to acquire cid\n");
-		return rc;
-	}
-	p_tx_cid->b_cid_allocated = true;
+	return 0;
+}
 
-	DP_VERBOSE(p_hwfn, QED_MSG_SP,
-		   "opaque_fid=0x%x, cid=0x%x, tx_qid=0x%x, vport_id=0x%x, sb_id=0x%x\n",
-		   opaque_fid, p_tx_cid->cid,
-		   p_params->queue_id, p_params->vport_id, p_params->sb);
-
-	rc = qed_sp_eth_txq_start_ramrod(p_hwfn,
-					 opaque_fid,
-					 p_tx_cid->cid,
-					 p_params,
-					 abs_stats_id,
-					 pbl_addr,
-					 pbl_size,
-					 &pq_params);
-
-	*pp_doorbell = (u8 __iomem *)p_hwfn->doorbells +
-				     qed_db_addr(p_tx_cid->cid, DQ_DEMS_LEGACY);
+static int
+qed_eth_tx_queue_start(struct qed_hwfn *p_hwfn,
+		       u16 opaque_fid,
+		       struct qed_queue_start_common_params *p_params,
+		       u8 tc,
+		       dma_addr_t pbl_addr,
+		       u16 pbl_size,
+		       struct qed_txq_start_ret_params *p_ret_params)
+{
+	struct qed_queue_cid *p_cid;
+	int rc;
+
+	p_cid = qed_eth_queue_to_cid(p_hwfn, opaque_fid, p_params);
+	if (!p_cid)
+		return -EINVAL;
+
+	if (IS_PF(p_hwfn->cdev))
+		rc = qed_eth_pf_tx_queue_start(p_hwfn, p_cid, tc,
+					       pbl_addr, pbl_size,
+					       &p_ret_params->p_doorbell);
+	else
+		rc = qed_vf_pf_txq_start(p_hwfn, p_cid,
+					 pbl_addr, pbl_size,
+					 &p_ret_params->p_doorbell);
 
 	if (rc)
-		qed_sp_release_queue_cid(p_hwfn, p_tx_cid);
+		qed_eth_queue_cid_release(p_hwfn, p_cid);
+	else
+		p_ret_params->p_handle = (void *)p_cid;
 
 	return rc;
 }
 
-int qed_sp_eth_tx_queue_stop(struct qed_hwfn *p_hwfn, u16 tx_queue_id)
+static int
+qed_eth_pf_tx_queue_stop(struct qed_hwfn *p_hwfn, struct qed_queue_cid *p_cid)
 {
-	struct qed_hw_cid_data *p_tx_cid = &p_hwfn->p_tx_cids[tx_queue_id];
 	struct qed_spq_entry *p_ent = NULL;
 	struct qed_sp_init_data init_data;
-	int rc = -EINVAL;
-
-	if (IS_VF(p_hwfn->cdev))
-		return qed_vf_pf_txq_stop(p_hwfn, tx_queue_id);
+	int rc;
 
-	/* Get SPQ entry */
 	memset(&init_data, 0, sizeof(init_data));
-	init_data.cid = p_tx_cid->cid;
-	init_data.opaque_fid = p_tx_cid->opaque_fid;
+	init_data.cid = p_cid->cid;
+	init_data.opaque_fid = p_cid->opaque_fid;
 	init_data.comp_mode = QED_SPQ_MODE_EBLOCK;
 
 	rc = qed_sp_init_request(p_hwfn, &p_ent,
@@ -892,11 +955,22 @@ int qed_sp_eth_tx_queue_stop(struct qed_hwfn *p_hwfn, u16 tx_queue_id)
 	if (rc)
 		return rc;
 
-	rc = qed_spq_post(p_hwfn, p_ent, NULL);
-	if (rc)
-		return rc;
+	return qed_spq_post(p_hwfn, p_ent, NULL);
+}
+
+int qed_eth_tx_queue_stop(struct qed_hwfn *p_hwfn, void *p_handle)
+{
+	struct qed_queue_cid *p_cid = (struct qed_queue_cid *)p_handle;
+	int rc;
+
+	if (IS_PF(p_hwfn->cdev))
+		rc = qed_eth_pf_tx_queue_stop(p_hwfn, p_cid);
+	else
+		rc = qed_vf_pf_txq_stop(p_hwfn, p_cid);
 
-	return qed_sp_release_queue_cid(p_hwfn, p_tx_cid);
+	if (!rc)
+		qed_eth_queue_cid_release(p_hwfn, p_cid);
+	return rc;
 }
 
 static enum eth_filter_action qed_filter_action(enum qed_filter_opcode opcode)
@@ -1880,58 +1954,53 @@ static int qed_update_vport(struct qed_dev *cdev,
 }
 
 static int qed_start_rxq(struct qed_dev *cdev,
-			 struct qed_queue_start_common_params *params,
+			 u8 rss_num,
+			 struct qed_queue_start_common_params *p_params,
 			 u16 bd_max_bytes,
 			 dma_addr_t bd_chain_phys_addr,
 			 dma_addr_t cqe_pbl_addr,
 			 u16 cqe_pbl_size,
-			 void __iomem **pp_prod)
+			 struct qed_rxq_start_ret_params *ret_params)
 {
 	struct qed_hwfn *p_hwfn;
 	int rc, hwfn_index;
 
-	hwfn_index = params->rss_id % cdev->num_hwfns;
+	hwfn_index = rss_num % cdev->num_hwfns;
 	p_hwfn = &cdev->hwfns[hwfn_index];
 
-	/* Fix queue ID in 100g mode */
-	params->queue_id /= cdev->num_hwfns;
-
-	rc = qed_sp_eth_rx_queue_start(p_hwfn,
-				       p_hwfn->hw_info.opaque_fid,
-				       params,
-				       bd_max_bytes,
-				       bd_chain_phys_addr,
-				       cqe_pbl_addr,
-				       cqe_pbl_size,
-				       pp_prod);
+	p_params->queue_id = p_params->queue_id / cdev->num_hwfns;
+	p_params->stats_id = p_params->vport_id;
 
+	rc = qed_eth_rx_queue_start(p_hwfn,
+				    p_hwfn->hw_info.opaque_fid,
+				    p_params,
+				    bd_max_bytes,
+				    bd_chain_phys_addr,
+				    cqe_pbl_addr, cqe_pbl_size, ret_params);
 	if (rc) {
-		DP_ERR(cdev, "Failed to start RXQ#%d\n", params->queue_id);
+		DP_ERR(cdev, "Failed to start RXQ#%d\n", p_params->queue_id);
 		return rc;
 	}
 
 	DP_VERBOSE(cdev, (QED_MSG_SPQ | NETIF_MSG_IFUP),
-		   "Started RX-Q %d [rss %d] on V-PORT %d and SB %d\n",
-		   params->queue_id, params->rss_id, params->vport_id,
-		   params->sb);
+		   "Started RX-Q %d [rss_num %d] on V-PORT %d and SB %d\n",
+		   p_params->queue_id, rss_num, p_params->vport_id,
+		   p_params->sb);
 
 	return 0;
 }
 
-static int qed_stop_rxq(struct qed_dev *cdev,
-			struct qed_stop_rxq_params *params)
+static int qed_stop_rxq(struct qed_dev *cdev, u8 rss_id, void *handle)
 {
 	int rc, hwfn_index;
 	struct qed_hwfn *p_hwfn;
 
-	hwfn_index	= params->rss_id % cdev->num_hwfns;
-	p_hwfn		= &cdev->hwfns[hwfn_index];
+	hwfn_index = rss_id % cdev->num_hwfns;
+	p_hwfn = &cdev->hwfns[hwfn_index];
 
-	rc = qed_sp_eth_rx_queue_stop(p_hwfn,
-				      params->rx_queue_id / cdev->num_hwfns,
-				      params->eq_completion_only, false);
+	rc = qed_eth_rx_queue_stop(p_hwfn, handle, false, false);
 	if (rc) {
-		DP_ERR(cdev, "Failed to stop RXQ#%d\n", params->rx_queue_id);
+		DP_ERR(cdev, "Failed to stop RXQ#%02x\n", rss_id);
 		return rc;
 	}
 
@@ -1939,26 +2008,24 @@ static int qed_stop_rxq(struct qed_dev *cdev,
 }
 
 static int qed_start_txq(struct qed_dev *cdev,
+			 u8 rss_num,
 			 struct qed_queue_start_common_params *p_params,
 			 dma_addr_t pbl_addr,
 			 u16 pbl_size,
-			 void __iomem **pp_doorbell)
+			 struct qed_txq_start_ret_params *ret_params)
 {
 	struct qed_hwfn *p_hwfn;
 	int rc, hwfn_index;
 
-	hwfn_index	= p_params->rss_id % cdev->num_hwfns;
-	p_hwfn		= &cdev->hwfns[hwfn_index];
-
-	/* Fix queue ID in 100g mode */
-	p_params->queue_id /= cdev->num_hwfns;
+	hwfn_index = rss_num % cdev->num_hwfns;
+	p_hwfn = &cdev->hwfns[hwfn_index];
+	p_params->queue_id = p_params->queue_id / cdev->num_hwfns;
+	p_params->stats_id = p_params->vport_id;
 
-	rc = qed_sp_eth_tx_queue_start(p_hwfn,
-				       p_hwfn->hw_info.opaque_fid,
-				       p_params,
-				       pbl_addr,
-				       pbl_size,
-				       pp_doorbell);
+	rc = qed_eth_tx_queue_start(p_hwfn,
+				    p_hwfn->hw_info.opaque_fid,
+				    p_params, 0,
+				    pbl_addr, pbl_size, ret_params);
 
 	if (rc) {
 		DP_ERR(cdev, "Failed to start TXQ#%d\n", p_params->queue_id);
@@ -1966,8 +2033,8 @@ static int qed_start_txq(struct qed_dev *cdev,
 	}
 
 	DP_VERBOSE(cdev, (QED_MSG_SPQ | NETIF_MSG_IFUP),
-		   "Started TX-Q %d [rss %d] on V-PORT %d and SB %d\n",
-		   p_params->queue_id, p_params->rss_id, p_params->vport_id,
+		   "Started TX-Q %d [rss_num %d] on V-PORT %d and SB %d\n",
+		   p_params->queue_id, rss_num, p_params->vport_id,
 		   p_params->sb);
 
 	return 0;
@@ -1981,19 +2048,17 @@ static int qed_fastpath_stop(struct qed_dev *cdev)
 	return 0;
 }
 
-static int qed_stop_txq(struct qed_dev *cdev,
-			struct qed_stop_txq_params *params)
+static int qed_stop_txq(struct qed_dev *cdev, u8 rss_id, void *handle)
 {
 	struct qed_hwfn *p_hwfn;
 	int rc, hwfn_index;
 
-	hwfn_index	= params->rss_id % cdev->num_hwfns;
-	p_hwfn		= &cdev->hwfns[hwfn_index];
+	hwfn_index = rss_id % cdev->num_hwfns;
+	p_hwfn = &cdev->hwfns[hwfn_index];
 
-	rc = qed_sp_eth_tx_queue_stop(p_hwfn,
-				      params->tx_queue_id / cdev->num_hwfns);
+	rc = qed_eth_tx_queue_stop(p_hwfn, handle);
 	if (rc) {
-		DP_ERR(cdev, "Failed to stop TXQ#%d\n", params->tx_queue_id);
+		DP_ERR(cdev, "Failed to stop TXQ#%02x\n", rss_id);
 		return rc;
 	}
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.h b/drivers/net/ethernet/qlogic/qed/qed_l2.h
index e495d62..48c9bfc 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.h
@@ -78,11 +78,34 @@ struct qed_filter_mcast {
 	unsigned char mac[QED_MAX_MC_ADDRS][ETH_ALEN];
 };
 
-int qed_sp_eth_rx_queue_stop(struct qed_hwfn *p_hwfn,
-			     u16 rx_queue_id,
-			     bool eq_completion_only, bool cqe_completion);
+/**
+ * @brief qed_eth_rx_queue_stop - This ramrod closes an Rx queue
+ *
+ * @param p_hwfn
+ * @param p_rxq			Handler of queue to close
+ * @param eq_completion_only	If True completion will be on
+ *				EQe, if False completion will be
+ *				on EQe if p_hwfn opaque
+ *				different from the RXQ opaque
+ *				otherwise on CQe.
+ * @param cqe_completion	If True completion will be
+ *				receive on CQe.
+ * @return int
+ */
+int
+qed_eth_rx_queue_stop(struct qed_hwfn *p_hwfn,
+		      void *p_rxq,
+		      bool eq_completion_only, bool cqe_completion);
 
-int qed_sp_eth_tx_queue_stop(struct qed_hwfn *p_hwfn, u16 tx_queue_id);
+/**
+ * @brief qed_eth_tx_queue_stop - closes a Tx queue
+ *
+ * @param p_hwfn
+ * @param p_txq - handle to Tx queue needed to be closed
+ *
+ * @return int
+ */
+int qed_eth_tx_queue_stop(struct qed_hwfn *p_hwfn, void *p_txq);
 
 enum qed_tpa_mode {
 	QED_TPA_MODE_NONE,
@@ -196,19 +219,19 @@ int qed_sp_eth_filter_ucast(struct qed_hwfn *p_hwfn,
  * @note At the moment - only used by non-linux VFs.
  *
  * @param p_hwfn
- * @param rx_queue_id		RX Queue ID
- * @param num_rxqs		Allow to update multiple rx
- *				queues, from rx_queue_id to
- *				(rx_queue_id + num_rxqs)
+ * @param pp_rxq_handlers	An array of queue handlers to be updated.
+ * @param num_rxqs              number of queues to update.
  * @param complete_cqe_flg	Post completion to the CQE Ring if set
  * @param complete_event_flg	Post completion to the Event Ring if set
+ * @param comp_mode
+ * @param p_comp_data
  *
  * @return int
  */
 
 int
 qed_sp_eth_rx_queues_update(struct qed_hwfn *p_hwfn,
-			    u16 rx_queue_id,
+			    void **pp_rxq_handlers,
 			    u8 num_rxqs,
 			    u8 complete_cqe_flg,
 			    u8 complete_event_flg,
@@ -217,27 +240,79 @@ int qed_sp_eth_filter_ucast(struct qed_hwfn *p_hwfn,
 
 void qed_get_vport_stats(struct qed_dev *cdev, struct qed_eth_stats *stats);
 
-int qed_sp_eth_vport_start(struct qed_hwfn *p_hwfn,
-			   struct qed_sp_vport_start_params *p_params);
+void qed_reset_vport_stats(struct qed_dev *cdev);
+
+struct qed_queue_cid {
+	/* 'Relative' is a relative term ;-). Usually the indices [not counting
+	 * SBs] would be PF-relative, but there are some cases where that isn't
+	 * the case - specifically for a PF configuring its VF indices it's
+	 * possible some fields [E.g., stats-id] in 'rel' would already be abs.
+	 */
+	struct qed_queue_start_common_params rel;
+	struct qed_queue_start_common_params abs;
+	u32 cid;
+	u16 opaque_fid;
+
+	/* VFs queues are mapped differently, so we need to know the
+	 * relative queue associated with them [0-based].
+	 * Notice this is relevant on the *PF* queue-cid of its VF's queues,
+	 * and not on the VF itself.
+	 */
+	bool is_vf;
+	u8 vf_qid;
+
+	/* Legacy VFs might have Rx producer located elsewhere */
+	bool b_legacy_vf;
+};
 
-int qed_sp_eth_rxq_start_ramrod(struct qed_hwfn *p_hwfn,
-				u16 opaque_fid,
-				u32 cid,
-				struct qed_queue_start_common_params *params,
-				u8 stats_id,
-				u16 bd_max_bytes,
-				dma_addr_t bd_chain_phys_addr,
-				dma_addr_t cqe_pbl_addr,
-				u16 cqe_pbl_size, bool b_use_zone_a_prod);
-
-int qed_sp_eth_txq_start_ramrod(struct qed_hwfn  *p_hwfn,
-				u16  opaque_fid,
-				u32  cid,
-				struct qed_queue_start_common_params *p_params,
-				u8  stats_id,
-				dma_addr_t pbl_addr,
-				u16 pbl_size,
-				union qed_qm_pq_params *p_pq_params);
+void qed_eth_queue_cid_release(struct qed_hwfn *p_hwfn,
+			       struct qed_queue_cid *p_cid);
+
+struct qed_queue_cid *_qed_eth_queue_to_cid(struct qed_hwfn *p_hwfn,
+					    u16 opaque_fid,
+					    u32 cid,
+					    u8 vf_qid,
+					    struct qed_queue_start_common_params
+					    *p_params);
+
+int
+qed_sp_eth_vport_start(struct qed_hwfn *p_hwfn,
+		       struct qed_sp_vport_start_params *p_params);
+
+/**
+ * @brief - Starts an Rx queue, when queue_cid is already prepared
+ *
+ * @param p_hwfn
+ * @param p_cid
+ * @param bd_max_bytes
+ * @param bd_chain_phys_addr
+ * @param cqe_pbl_addr
+ * @param cqe_pbl_size
+ *
+ * @return int
+ */
+int
+qed_eth_rxq_start_ramrod(struct qed_hwfn *p_hwfn,
+			 struct qed_queue_cid *p_cid,
+			 u16 bd_max_bytes,
+			 dma_addr_t bd_chain_phys_addr,
+			 dma_addr_t cqe_pbl_addr, u16 cqe_pbl_size);
+
+/**
+ * @brief - Starts a Tx queue, where queue_cid is already prepared
+ *
+ * @param p_hwfn
+ * @param p_cid
+ * @param pbl_addr
+ * @param pbl_size
+ * @param p_pq_params - parameters for choosing the PQ for this Tx queue
+ *
+ * @return int
+ */
+int
+qed_eth_txq_start_ramrod(struct qed_hwfn *p_hwfn,
+			 struct qed_queue_cid *p_cid,
+			 dma_addr_t pbl_addr, u16 pbl_size, u16 pq_id);
 
 u8 qed_mcast_bin_from_mac(u8 *mac);
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index f3f742a..85b09dd 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -808,37 +808,70 @@ static void qed_iov_free_vf_igu_sbs(struct qed_hwfn *p_hwfn,
 
 static int qed_iov_init_hw_for_vf(struct qed_hwfn *p_hwfn,
 				  struct qed_ptt *p_ptt,
-				  u16 rel_vf_id, u16 num_rx_queues)
+				  struct qed_iov_vf_init_params *p_params)
 {
 	u8 num_of_vf_avaiable_chains = 0;
 	struct qed_vf_info *vf = NULL;
+	u16 qid, num_irqs;
 	int rc = 0;
 	u32 cids;
 	u8 i;
 
-	vf = qed_iov_get_vf_info(p_hwfn, rel_vf_id, false);
+	vf = qed_iov_get_vf_info(p_hwfn, p_params->rel_vf_id, false);
 	if (!vf) {
 		DP_ERR(p_hwfn, "qed_iov_init_hw_for_vf : vf is NULL\n");
 		return -EINVAL;
 	}
 
 	if (vf->b_init) {
-		DP_NOTICE(p_hwfn, "VF[%d] is already active.\n", rel_vf_id);
+		DP_NOTICE(p_hwfn, "VF[%d] is already active.\n",
+			  p_params->rel_vf_id);
 		return -EINVAL;
 	}
 
+	/* Perform sanity checking on the requested queue_id */
+	for (i = 0; i < p_params->num_queues; i++) {
+		u16 min_vf_qzone = FEAT_NUM(p_hwfn, QED_PF_L2_QUE);
+		u16 max_vf_qzone = min_vf_qzone +
+		    FEAT_NUM(p_hwfn, QED_VF_L2_QUE) - 1;
+
+		qid = p_params->req_rx_queue[i];
+		if (qid < min_vf_qzone || qid > max_vf_qzone) {
+			DP_NOTICE(p_hwfn,
+				  "Can't enable Rx qid [%04x] for VF[%d]: qids [0x%04x,...,0x%04x] available\n",
+				  qid,
+				  p_params->rel_vf_id,
+				  min_vf_qzone, max_vf_qzone);
+			return -EINVAL;
+		}
+
+		qid = p_params->req_tx_queue[i];
+		if (qid > max_vf_qzone) {
+			DP_NOTICE(p_hwfn,
+				  "Can't enable Tx qid [%04x] for VF[%d]: max qid 0x%04x\n",
+				  qid, p_params->rel_vf_id, max_vf_qzone);
+			return -EINVAL;
+		}
+
+		/* If client *really* wants, Tx qid can be shared with PF */
+		if (qid < min_vf_qzone)
+			DP_VERBOSE(p_hwfn,
+				   QED_MSG_IOV,
+				   "VF[%d] is using PF qid [0x%04x] for Txq[0x%02x]\n",
+				   p_params->rel_vf_id, qid, i);
+	}
+
 	/* Limit number of queues according to number of CIDs */
 	qed_cxt_get_proto_cid_count(p_hwfn, PROTOCOLID_ETH, &cids);
 	DP_VERBOSE(p_hwfn,
 		   QED_MSG_IOV,
 		   "VF[%d] - requesting to initialize for 0x%04x queues [0x%04x CIDs available]\n",
-		   vf->relative_vf_id, num_rx_queues, (u16) cids);
-	num_rx_queues = min_t(u16, num_rx_queues, ((u16) cids));
+		   vf->relative_vf_id, p_params->num_queues, (u16)cids);
+	num_irqs = min_t(u16, p_params->num_queues, ((u16)cids));
 
 	num_of_vf_avaiable_chains = qed_iov_alloc_vf_igu_sbs(p_hwfn,
 							     p_ptt,
-							     vf,
-							     num_rx_queues);
+							     vf, num_irqs);
 	if (!num_of_vf_avaiable_chains) {
 		DP_ERR(p_hwfn, "no available igu sbs\n");
 		return -ENOMEM;
@@ -849,25 +882,22 @@ static int qed_iov_init_hw_for_vf(struct qed_hwfn *p_hwfn,
 	vf->num_txqs = num_of_vf_avaiable_chains;
 
 	for (i = 0; i < vf->num_rxqs; i++) {
-		u16 queue_id = qed_int_queue_id_from_sb_id(p_hwfn,
-							   vf->igu_sbs[i]);
+		struct qed_vf_q_info *p_queue = &vf->vf_queues[i];
 
-		if (queue_id > RESC_NUM(p_hwfn, QED_L2_QUEUE)) {
-			DP_NOTICE(p_hwfn,
-				  "VF[%d] will require utilizing of out-of-bounds queues - %04x\n",
-				  vf->relative_vf_id, queue_id);
-			return -EINVAL;
-		}
+		p_queue->fw_rx_qid = p_params->req_rx_queue[i];
+		p_queue->fw_tx_qid = p_params->req_tx_queue[i];
 
 		/* CIDs are per-VF, so no problem having them 0-based. */
-		vf->vf_queues[i].fw_rx_qid = queue_id;
-		vf->vf_queues[i].fw_tx_qid = queue_id;
-		vf->vf_queues[i].fw_cid = i;
+		p_queue->fw_cid = i;
 
 		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
-			   "VF[%d] - [%d] SB %04x, Tx/Rx queue %04x CID %04x\n",
-			   vf->relative_vf_id, i, vf->igu_sbs[i], queue_id, i);
+			   "VF[%d] - Q[%d] SB %04x, qid [Rx %04x Tx %04x]  CID %04x\n",
+			   vf->relative_vf_id,
+			   i, vf->igu_sbs[i],
+			   p_queue->fw_rx_qid,
+			   p_queue->fw_tx_qid, p_queue->fw_cid);
 	}
+
 	rc = qed_iov_enable_vf_access(p_hwfn, p_ptt, vf);
 	if (!rc) {
 		vf->b_init = true;
@@ -1187,8 +1217,19 @@ static void qed_iov_vf_cleanup(struct qed_hwfn *p_hwfn,
 
 	p_vf->num_active_rxqs = 0;
 
-	for (i = 0; i < QED_MAX_VF_CHAINS_PER_PF; i++)
-		p_vf->vf_queues[i].rxq_active = 0;
+	for (i = 0; i < QED_MAX_VF_CHAINS_PER_PF; i++) {
+		struct qed_vf_q_info *p_queue = &p_vf->vf_queues[i];
+
+		if (p_queue->p_rx_cid) {
+			qed_eth_queue_cid_release(p_hwfn, p_queue->p_rx_cid);
+			p_queue->p_rx_cid = NULL;
+		}
+
+		if (p_queue->p_tx_cid) {
+			qed_eth_queue_cid_release(p_hwfn, p_queue->p_tx_cid);
+			p_queue->p_tx_cid = NULL;
+		}
+	}
 
 	memset(&p_vf->shadow_config, 0, sizeof(p_vf->shadow_config));
 	memset(&p_vf->acquire, 0, sizeof(p_vf->acquire));
@@ -1594,21 +1635,21 @@ static int qed_iov_configure_vport_forced(struct qed_hwfn *p_hwfn,
 
 		/* Update all the Rx queues */
 		for (i = 0; i < QED_MAX_VF_CHAINS_PER_PF; i++) {
-			u16 qid;
+			struct qed_queue_cid *p_cid;
 
-			if (!p_vf->vf_queues[i].rxq_active)
+			p_cid = p_vf->vf_queues[i].p_rx_cid;
+			if (!p_cid)
 				continue;
 
-			qid = p_vf->vf_queues[i].fw_rx_qid;
-
-			rc = qed_sp_eth_rx_queues_update(p_hwfn, qid,
+			rc = qed_sp_eth_rx_queues_update(p_hwfn,
+							 (void **)&p_cid,
 							 1, 0, 1,
 							 QED_SPQ_MODE_EBLOCK,
 							 NULL);
 			if (rc) {
 				DP_NOTICE(p_hwfn,
 					  "Failed to send Rx update fo queue[0x%04x]\n",
-					  qid);
+					  p_cid->rel.queue_id);
 				return rc;
 			}
 		}
@@ -1782,23 +1823,34 @@ static void qed_iov_vf_mbx_start_rxq(struct qed_hwfn *p_hwfn,
 	struct qed_queue_start_common_params params;
 	struct qed_iov_vf_mbx *mbx = &vf->vf_mbx;
 	u8 status = PFVF_STATUS_NO_RESOURCE;
+	struct qed_vf_q_info *p_queue;
 	struct vfpf_start_rxq_tlv *req;
 	bool b_legacy_vf = false;
 	int rc;
 
-	memset(&params, 0, sizeof(params));
 	req = &mbx->req_virt->start_rxq;
 
 	if (!qed_iov_validate_rxq(p_hwfn, vf, req->rx_qid) ||
 	    !qed_iov_validate_sb(p_hwfn, vf, req->hw_sb))
 		goto out;
 
-	params.queue_id =  vf->vf_queues[req->rx_qid].fw_rx_qid;
-	params.vf_qid = req->rx_qid;
+	/* Acquire a new queue-cid */
+	p_queue = &vf->vf_queues[req->rx_qid];
+
+	memset(&params, 0, sizeof(params));
+	params.queue_id = p_queue->fw_rx_qid;
 	params.vport_id = vf->vport_id;
+	params.stats_id = vf->abs_vf_id + 0x10;
 	params.sb = req->hw_sb;
 	params.sb_idx = req->sb_index;
 
+	p_queue->p_rx_cid = _qed_eth_queue_to_cid(p_hwfn,
+						  vf->opaque_fid,
+						  p_queue->fw_cid,
+						  req->rx_qid, &params);
+	if (!p_queue->p_rx_cid)
+		goto out;
+
 	/* Legacy VFs have their Producers in a different location, which they
 	 * calculate on their own and clean the producer prior to this.
 	 */
@@ -1811,21 +1863,19 @@ static void qed_iov_vf_mbx_start_rxq(struct qed_hwfn *p_hwfn,
 		       MSTORM_ETH_VF_PRODS_OFFSET(vf->abs_vf_id, req->rx_qid),
 		       0);
 	}
+	p_queue->p_rx_cid->b_legacy_vf = b_legacy_vf;
 
-	rc = qed_sp_eth_rxq_start_ramrod(p_hwfn, vf->opaque_fid,
-					 vf->vf_queues[req->rx_qid].fw_cid,
-					 &params,
-					 vf->abs_vf_id + 0x10,
-					 req->bd_max_bytes,
-					 req->rxq_addr,
-					 req->cqe_pbl_addr, req->cqe_pbl_size,
-					 b_legacy_vf);
-
+	rc = qed_eth_rxq_start_ramrod(p_hwfn,
+				      p_queue->p_rx_cid,
+				      req->bd_max_bytes,
+				      req->rxq_addr,
+				      req->cqe_pbl_addr, req->cqe_pbl_size);
 	if (rc) {
 		status = PFVF_STATUS_FAILURE;
+		qed_eth_queue_cid_release(p_hwfn, p_queue->p_rx_cid);
+		p_queue->p_rx_cid = NULL;
 	} else {
 		status = PFVF_STATUS_SUCCESS;
-		vf->vf_queues[req->rx_qid].rxq_active = true;
 		vf->num_active_rxqs++;
 	}
 
@@ -1882,7 +1932,9 @@ static void qed_iov_vf_mbx_start_txq(struct qed_hwfn *p_hwfn,
 	u8 status = PFVF_STATUS_NO_RESOURCE;
 	union qed_qm_pq_params pq_params;
 	struct vfpf_start_txq_tlv *req;
+	struct qed_vf_q_info *p_queue;
 	int rc;
+	u16 pq;
 
 	/* Prepare the parameters which would choose the right PQ */
 	memset(&pq_params, 0, sizeof(pq_params));
@@ -1896,24 +1948,31 @@ static void qed_iov_vf_mbx_start_txq(struct qed_hwfn *p_hwfn,
 	    !qed_iov_validate_sb(p_hwfn, vf, req->hw_sb))
 		goto out;
 
-	params.queue_id =  vf->vf_queues[req->tx_qid].fw_tx_qid;
+	/* Acquire a new queue-cid */
+	p_queue = &vf->vf_queues[req->tx_qid];
+
+	params.queue_id = p_queue->fw_tx_qid;
 	params.vport_id = vf->vport_id;
+	params.stats_id = vf->abs_vf_id + 0x10;
 	params.sb = req->hw_sb;
 	params.sb_idx = req->sb_index;
 
-	rc = qed_sp_eth_txq_start_ramrod(p_hwfn,
-					 vf->opaque_fid,
-					 vf->vf_queues[req->tx_qid].fw_cid,
-					 &params,
-					 vf->abs_vf_id + 0x10,
-					 req->pbl_addr,
-					 req->pbl_size, &pq_params);
+	p_queue->p_tx_cid = _qed_eth_queue_to_cid(p_hwfn,
+						  vf->opaque_fid,
+						  p_queue->fw_cid,
+						  req->tx_qid, &params);
+	if (!p_queue->p_tx_cid)
+		goto out;
 
+	pq = qed_get_qm_pq(p_hwfn, PROTOCOLID_ETH, &pq_params);
+	rc = qed_eth_txq_start_ramrod(p_hwfn, p_queue->p_tx_cid,
+				      req->pbl_addr, req->pbl_size, pq);
 	if (rc) {
 		status = PFVF_STATUS_FAILURE;
+		qed_eth_queue_cid_release(p_hwfn, p_queue->p_tx_cid);
+		p_queue->p_tx_cid = NULL;
 	} else {
 		status = PFVF_STATUS_SUCCESS;
-		vf->vf_queues[req->tx_qid].txq_active = true;
 	}
 
 out:
@@ -1924,6 +1983,7 @@ static int qed_iov_vf_stop_rxqs(struct qed_hwfn *p_hwfn,
 				struct qed_vf_info *vf,
 				u16 rxq_id, u8 num_rxqs, bool cqe_completion)
 {
+	struct qed_vf_q_info *p_queue;
 	int rc = 0;
 	int qid;
 
@@ -1931,16 +1991,18 @@ static int qed_iov_vf_stop_rxqs(struct qed_hwfn *p_hwfn,
 		return -EINVAL;
 
 	for (qid = rxq_id; qid < rxq_id + num_rxqs; qid++) {
-		if (vf->vf_queues[qid].rxq_active) {
-			rc = qed_sp_eth_rx_queue_stop(p_hwfn,
-						      vf->vf_queues[qid].
-						      fw_rx_qid, false,
-						      cqe_completion);
+		p_queue = &vf->vf_queues[qid];
 
-			if (rc)
-				return rc;
-		}
-		vf->vf_queues[qid].rxq_active = false;
+		if (!p_queue->p_rx_cid)
+			continue;
+
+		rc = qed_eth_rx_queue_stop(p_hwfn,
+					   p_queue->p_rx_cid,
+					   false, cqe_completion);
+		if (rc)
+			return rc;
+
+		vf->vf_queues[qid].p_rx_cid = NULL;
 		vf->num_active_rxqs--;
 	}
 
@@ -1951,22 +2013,24 @@ static int qed_iov_vf_stop_txqs(struct qed_hwfn *p_hwfn,
 				struct qed_vf_info *vf, u16 txq_id, u8 num_txqs)
 {
 	int rc = 0;
+	struct qed_vf_q_info *p_queue;
 	int qid;
 
 	if (txq_id + num_txqs > ARRAY_SIZE(vf->vf_queues))
 		return -EINVAL;
 
 	for (qid = txq_id; qid < txq_id + num_txqs; qid++) {
-		if (vf->vf_queues[qid].txq_active) {
-			rc = qed_sp_eth_tx_queue_stop(p_hwfn,
-						      vf->vf_queues[qid].
-						      fw_tx_qid);
+		p_queue = &vf->vf_queues[qid];
+		if (!p_queue->p_tx_cid)
+			continue;
 
-			if (rc)
-				return rc;
-		}
-		vf->vf_queues[qid].txq_active = false;
+		rc = qed_eth_tx_queue_stop(p_hwfn, p_queue->p_tx_cid);
+		if (rc)
+			return rc;
+
+		p_queue->p_tx_cid = NULL;
 	}
+
 	return rc;
 }
 
@@ -2021,10 +2085,11 @@ static void qed_iov_vf_mbx_update_rxqs(struct qed_hwfn *p_hwfn,
 				       struct qed_ptt *p_ptt,
 				       struct qed_vf_info *vf)
 {
+	struct qed_queue_cid *handlers[QED_MAX_VF_CHAINS_PER_PF];
 	u16 length = sizeof(struct pfvf_def_resp_tlv);
 	struct qed_iov_vf_mbx *mbx = &vf->vf_mbx;
 	struct vfpf_update_rxq_tlv *req;
-	u8 status = PFVF_STATUS_SUCCESS;
+	u8 status = PFVF_STATUS_FAILURE;
 	u8 complete_event_flg;
 	u8 complete_cqe_flg;
 	u16 qid;
@@ -2035,29 +2100,36 @@ static void qed_iov_vf_mbx_update_rxqs(struct qed_hwfn *p_hwfn,
 	complete_cqe_flg = !!(req->flags & VFPF_RXQ_UPD_COMPLETE_CQE_FLAG);
 	complete_event_flg = !!(req->flags & VFPF_RXQ_UPD_COMPLETE_EVENT_FLAG);
 
+	/* Validate inputs */
+	if (req->num_rxqs + req->rx_qid > QED_MAX_VF_CHAINS_PER_PF ||
+	    !qed_iov_validate_rxq(p_hwfn, vf, req->rx_qid)) {
+		DP_INFO(p_hwfn, "VF[%d]: Incorrect Rxqs [%04x, %02x]\n",
+			vf->relative_vf_id, req->rx_qid, req->num_rxqs);
+		goto out;
+	}
+
 	for (i = 0; i < req->num_rxqs; i++) {
 		qid = req->rx_qid + i;
-
-		if (!vf->vf_queues[qid].rxq_active) {
-			DP_NOTICE(p_hwfn, "VF rx_qid = %d isn`t active!\n",
-				  qid);
-			status = PFVF_STATUS_FAILURE;
-			break;
+		if (!vf->vf_queues[qid].p_rx_cid) {
+			DP_INFO(p_hwfn,
+				"VF[%d] rx_qid = %d isn`t active!\n",
+				vf->relative_vf_id, qid);
+			goto out;
 		}
 
-		rc = qed_sp_eth_rx_queues_update(p_hwfn,
-						 vf->vf_queues[qid].fw_rx_qid,
-						 1,
-						 complete_cqe_flg,
-						 complete_event_flg,
-						 QED_SPQ_MODE_EBLOCK, NULL);
-
-		if (rc) {
-			status = PFVF_STATUS_FAILURE;
-			break;
-		}
+		handlers[i] = vf->vf_queues[qid].p_rx_cid;
 	}
 
+	rc = qed_sp_eth_rx_queues_update(p_hwfn, (void **)&handlers,
+					 req->num_rxqs,
+					 complete_cqe_flg,
+					 complete_event_flg,
+					 QED_SPQ_MODE_EBLOCK, NULL);
+	if (rc)
+		goto out;
+
+	status = PFVF_STATUS_SUCCESS;
+out:
 	qed_iov_prepare_resp(p_hwfn, p_ptt, vf, CHANNEL_TLV_UPDATE_RXQ,
 			     length, status);
 }
@@ -2268,7 +2340,7 @@ void *qed_iov_search_list_tlvs(struct qed_hwfn *p_hwfn,
 			DP_NOTICE(p_hwfn,
 				  "rss_ind_table[%d] = %d, rxq is out of range\n",
 				  i, q_idx);
-		else if (!vf->vf_queues[q_idx].rxq_active)
+		else if (!vf->vf_queues[q_idx].p_rx_cid)
 			DP_NOTICE(p_hwfn,
 				  "rss_ind_table[%d] = %d, rxq is not active\n",
 				  i, q_idx);
@@ -3468,8 +3540,28 @@ int qed_sriov_disable(struct qed_dev *cdev, bool pci_enabled)
 	return 0;
 }
 
+static void qed_sriov_enable_qid_config(struct qed_hwfn *hwfn,
+					u16 vfid,
+					struct qed_iov_vf_init_params *params)
+{
+	u16 base, i;
+
+	/* Since we have an equal resource distribution per-VF, and we assume
+	 * PF has acquired the QED_PF_L2_QUE first queues, we start setting
+	 * sequentially from there.
+	 */
+	base = FEAT_NUM(hwfn, QED_PF_L2_QUE) + vfid * params->num_queues;
+
+	params->rel_vf_id = vfid;
+	for (i = 0; i < params->num_queues; i++) {
+		params->req_rx_queue[i] = base + i;
+		params->req_tx_queue[i] = base + i;
+	}
+}
+
 static int qed_sriov_enable(struct qed_dev *cdev, int num)
 {
+	struct qed_iov_vf_init_params params;
 	int i, j, rc;
 
 	if (num >= RESC_NUM(&cdev->hwfns[0], QED_VPORT)) {
@@ -3478,15 +3570,17 @@ static int qed_sriov_enable(struct qed_dev *cdev, int num)
 		return -EINVAL;
 	}
 
+	memset(&params, 0, sizeof(params));
+
 	/* Initialize HW for VF access */
 	for_each_hwfn(cdev, j) {
 		struct qed_hwfn *hwfn = &cdev->hwfns[j];
 		struct qed_ptt *ptt = qed_ptt_acquire(hwfn);
-		int num_queues;
 
 		/* Make sure not to use more than 16 queues per VF */
-		num_queues = min_t(int,
-				   FEAT_NUM(hwfn, QED_VF_L2_QUE) / num, 16);
+		params.num_queues = min_t(int,
+					  FEAT_NUM(hwfn, QED_VF_L2_QUE) / num,
+					  16);
 
 		if (!ptt) {
 			DP_ERR(hwfn, "Failed to acquire ptt\n");
@@ -3498,7 +3592,8 @@ static int qed_sriov_enable(struct qed_dev *cdev, int num)
 			if (!qed_iov_is_valid_vfid(hwfn, i, false, true))
 				continue;
 
-			rc = qed_iov_init_hw_for_vf(hwfn, ptt, i, num_queues);
+			qed_sriov_enable_qid_config(hwfn, i, &params);
+			rc = qed_iov_init_hw_for_vf(hwfn, ptt, &params);
 			if (rc) {
 				DP_ERR(cdev, "Failed to enable VF[%d]\n", i);
 				qed_ptt_release(hwfn, ptt);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
index 3cf515b..509c02b 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
@@ -58,6 +58,23 @@ struct qed_public_vf_info {
 	int tx_rate;
 };
 
+struct qed_iov_vf_init_params {
+	u16 rel_vf_id;
+
+	/* Number of requested Queues; Currently, don't support different
+	 * number of Rx/Tx queues.
+	 */
+
+	u16 num_queues;
+
+	/* Allow the client to choose which qzones to use for Rx/Tx,
+	 * and which queue_base to use for Tx queues on a per-queue basis.
+	 * Notice values should be relative to the PF resources.
+	 */
+	u16 req_rx_queue[QED_MAX_VF_CHAINS_PER_PF];
+	u16 req_tx_queue[QED_MAX_VF_CHAINS_PER_PF];
+};
+
 /* This struct is part of qed_dev and contains data relevant to all hwfns;
  * Initialized only if SR-IOV cpabability is exposed in PCIe config space.
  */
@@ -99,10 +116,10 @@ struct qed_iov_vf_mbx {
 
 struct qed_vf_q_info {
 	u16 fw_rx_qid;
+	struct qed_queue_cid *p_rx_cid;
 	u16 fw_tx_qid;
+	struct qed_queue_cid *p_tx_cid;
 	u8 fw_cid;
-	u8 rxq_active;
-	u8 txq_active;
 };
 
 enum vf_state {
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c
index 3c06336..60b31a8 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c
@@ -388,18 +388,18 @@ int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn)
 #define MSTORM_QZONE_START(dev)   (TSTORM_QZONE_START +	\
 				   (TSTORM_QZONE_SIZE * NUM_OF_L2_QUEUES(dev)))
 
-int qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn,
-			u8 rx_qid,
-			u16 sb,
-			u8 sb_index,
-			u16 bd_max_bytes,
-			dma_addr_t bd_chain_phys_addr,
-			dma_addr_t cqe_pbl_addr,
-			u16 cqe_pbl_size, void __iomem **pp_prod)
+int
+qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn,
+		    struct qed_queue_cid *p_cid,
+		    u16 bd_max_bytes,
+		    dma_addr_t bd_chain_phys_addr,
+		    dma_addr_t cqe_pbl_addr,
+		    u16 cqe_pbl_size, void __iomem **pp_prod)
 {
 	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
 	struct pfvf_start_queue_resp_tlv *resp;
 	struct vfpf_start_rxq_tlv *req;
+	u8 rx_qid = p_cid->rel.queue_id;
 	int rc;
 
 	/* clear mailbox and prep first tlv */
@@ -409,21 +409,22 @@ int qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn,
 	req->cqe_pbl_addr = cqe_pbl_addr;
 	req->cqe_pbl_size = cqe_pbl_size;
 	req->rxq_addr = bd_chain_phys_addr;
-	req->hw_sb = sb;
-	req->sb_index = sb_index;
+	req->hw_sb = p_cid->rel.sb;
+	req->sb_index = p_cid->rel.sb_idx;
 	req->bd_max_bytes = bd_max_bytes;
 	req->stat_id = -1;
 
 	/* If PF is legacy, we'll need to calculate producers ourselves
 	 * as well as clean them.
 	 */
-	if (pp_prod && p_iov->b_pre_fp_hsi) {
+	if (p_iov->b_pre_fp_hsi) {
 		u8 hw_qid = p_iov->acquire_resp.resc.hw_qid[rx_qid];
 		u32 init_prod_val = 0;
 
-		*pp_prod = (u8 __iomem *)p_hwfn->regview +
-					 MSTORM_QZONE_START(p_hwfn->cdev) +
-					 hw_qid * MSTORM_QZONE_SIZE;
+		*pp_prod = (u8 __iomem *)
+		    p_hwfn->regview +
+		    MSTORM_QZONE_START(p_hwfn->cdev) +
+		    hw_qid * MSTORM_QZONE_SIZE;
 
 		/* Init the rcq, rx bd and rx sge (if valid) producers to 0 */
 		__internal_ram_wr(p_hwfn, *pp_prod, sizeof(u32),
@@ -444,7 +445,7 @@ int qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn,
 	}
 
 	/* Learn the address of the producer from the response */
-	if (pp_prod && !p_iov->b_pre_fp_hsi) {
+	if (!p_iov->b_pre_fp_hsi) {
 		u32 init_prod_val = 0;
 
 		*pp_prod = (u8 __iomem *)p_hwfn->regview + resp->offset;
@@ -462,7 +463,8 @@ int qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn,
 	return rc;
 }
 
-int qed_vf_pf_rxq_stop(struct qed_hwfn *p_hwfn, u16 rx_qid, bool cqe_completion)
+int qed_vf_pf_rxq_stop(struct qed_hwfn *p_hwfn,
+		       struct qed_queue_cid *p_cid, bool cqe_completion)
 {
 	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
 	struct vfpf_stop_rxqs_tlv *req;
@@ -472,7 +474,7 @@ int qed_vf_pf_rxq_stop(struct qed_hwfn *p_hwfn, u16 rx_qid, bool cqe_completion)
 	/* clear mailbox and prep first tlv */
 	req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_STOP_RXQS, sizeof(*req));
 
-	req->rx_qid = rx_qid;
+	req->rx_qid = p_cid->rel.queue_id;
 	req->num_rxqs = 1;
 	req->cqe_completion = cqe_completion;
 
@@ -496,28 +498,28 @@ int qed_vf_pf_rxq_stop(struct qed_hwfn *p_hwfn, u16 rx_qid, bool cqe_completion)
 	return rc;
 }
 
-int qed_vf_pf_txq_start(struct qed_hwfn *p_hwfn,
-			u16 tx_queue_id,
-			u16 sb,
-			u8 sb_index,
-			dma_addr_t pbl_addr,
-			u16 pbl_size, void __iomem **pp_doorbell)
+int
+qed_vf_pf_txq_start(struct qed_hwfn *p_hwfn,
+		    struct qed_queue_cid *p_cid,
+		    dma_addr_t pbl_addr,
+		    u16 pbl_size, void __iomem **pp_doorbell)
 {
 	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
 	struct pfvf_start_queue_resp_tlv *resp;
 	struct vfpf_start_txq_tlv *req;
+	u16 qid = p_cid->rel.queue_id;
 	int rc;
 
 	/* clear mailbox and prep first tlv */
 	req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_START_TXQ, sizeof(*req));
 
-	req->tx_qid = tx_queue_id;
+	req->tx_qid = qid;
 
 	/* Tx */
 	req->pbl_addr = pbl_addr;
 	req->pbl_size = pbl_size;
-	req->hw_sb = sb;
-	req->sb_index = sb_index;
+	req->hw_sb = p_cid->rel.sb;
+	req->sb_index = p_cid->rel.sb_idx;
 
 	/* add list termination tlv */
 	qed_add_tlv(p_hwfn, &p_iov->offset,
@@ -533,33 +535,29 @@ int qed_vf_pf_txq_start(struct qed_hwfn *p_hwfn,
 		goto exit;
 	}
 
-	if (pp_doorbell) {
-		/* Modern PFs provide the actual offsets, while legacy
-		 * provided only the queue id.
-		 */
-		if (!p_iov->b_pre_fp_hsi) {
-			*pp_doorbell = (u8 __iomem *)p_hwfn->doorbells +
-						     resp->offset;
-		} else {
-			u8 cid = p_iov->acquire_resp.resc.cid[tx_queue_id];
-			u32 db_addr;
-
-			db_addr = qed_db_addr_vf(cid, DQ_DEMS_LEGACY);
-			*pp_doorbell = (u8 __iomem *)p_hwfn->doorbells +
-						     db_addr;
-		}
+	/* Modern PFs provide the actual offsets, while legacy
+	 * provided only the queue id.
+	 */
+	if (!p_iov->b_pre_fp_hsi) {
+		*pp_doorbell = (u8 __iomem *)p_hwfn->doorbells + resp->offset;
+	} else {
+		u8 cid = p_iov->acquire_resp.resc.cid[qid];
 
-		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
-			   "Txq[0x%02x]: doorbell at %p [offset 0x%08x]\n",
-			   tx_queue_id, *pp_doorbell, resp->offset);
+		*pp_doorbell = (u8 __iomem *)p_hwfn->doorbells +
+					     qed_db_addr_vf(cid,
+							    DQ_DEMS_LEGACY);
 	}
+
+	DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+		   "Txq[0x%02x]: doorbell at %p [offset 0x%08x]\n",
+		   qid, *pp_doorbell, resp->offset);
 exit:
 	qed_vf_pf_req_end(p_hwfn, rc);
 
 	return rc;
 }
 
-int qed_vf_pf_txq_stop(struct qed_hwfn *p_hwfn, u16 tx_qid)
+int qed_vf_pf_txq_stop(struct qed_hwfn *p_hwfn, struct qed_queue_cid *p_cid)
 {
 	struct qed_vf_iov *p_iov = p_hwfn->vf_iov_info;
 	struct vfpf_stop_txqs_tlv *req;
@@ -569,7 +567,7 @@ int qed_vf_pf_txq_stop(struct qed_hwfn *p_hwfn, u16 tx_qid)
 	/* clear mailbox and prep first tlv */
 	req = qed_vf_pf_prep(p_hwfn, CHANNEL_TLV_STOP_TXQS, sizeof(*req));
 
-	req->tx_qid = tx_qid;
+	req->tx_qid = p_cid->rel.queue_id;
 	req->num_txqs = 1;
 
 	/* add list termination tlv */
diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.h b/drivers/net/ethernet/qlogic/qed/qed_vf.h
index 325c250..11eb385 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_vf.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_vf.h
@@ -666,10 +666,7 @@ void qed_vf_get_fw_version(struct qed_hwfn *p_hwfn,
 /**
  * @brief VF - start the RX Queue by sending a message to the PF
  * @param p_hwfn
- * @param cid                   - zero based within the VF
- * @param rx_queue_id           - zero based within the VF
- * @param sb                    - VF status block for this queue
- * @param sb_index              - Index within the status block
+ * @param p_cid			- Only relative fields are relevant
  * @param bd_max_bytes          - maximum number of bytes per bd
  * @param bd_chain_phys_addr    - physical address of bd chain
  * @param cqe_pbl_addr          - physical address of pbl
@@ -680,9 +677,7 @@ void qed_vf_get_fw_version(struct qed_hwfn *p_hwfn,
  * @return int
  */
 int qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn,
-			u8 rx_queue_id,
-			u16 sb,
-			u8 sb_index,
+			struct qed_queue_cid *p_cid,
 			u16 bd_max_bytes,
 			dma_addr_t bd_chain_phys_addr,
 			dma_addr_t cqe_pbl_addr,
@@ -702,24 +697,23 @@ int qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn,
  *
  * @return int
  */
-int qed_vf_pf_txq_start(struct qed_hwfn *p_hwfn,
-			u16 tx_queue_id,
-			u16 sb,
-			u8 sb_index,
-			dma_addr_t pbl_addr,
-			u16 pbl_size, void __iomem **pp_doorbell);
+int
+qed_vf_pf_txq_start(struct qed_hwfn *p_hwfn,
+		    struct qed_queue_cid *p_cid,
+		    dma_addr_t pbl_addr,
+		    u16 pbl_size, void __iomem **pp_doorbell);
 
 /**
  * @brief VF - stop the RX queue by sending a message to the PF
  *
  * @param p_hwfn
- * @param rx_qid
+ * @param p_cid
  * @param cqe_completion
  *
  * @return int
  */
 int qed_vf_pf_rxq_stop(struct qed_hwfn *p_hwfn,
-		       u16 rx_qid, bool cqe_completion);
+		       struct qed_queue_cid *p_cid, bool cqe_completion);
 
 /**
  * @brief VF - stop the TX queue by sending a message to the PF
@@ -729,7 +723,7 @@ int qed_vf_pf_rxq_stop(struct qed_hwfn *p_hwfn,
  *
  * @return int
  */
-int qed_vf_pf_txq_stop(struct qed_hwfn *p_hwfn, u16 tx_qid);
+int qed_vf_pf_txq_stop(struct qed_hwfn *p_hwfn, struct qed_queue_cid *p_cid);
 
 /**
  * @brief VF - send a vport update command
@@ -902,9 +896,7 @@ static inline int qed_vf_hw_prepare(struct qed_hwfn *p_hwfn)
 }
 
 static inline int qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn,
-				      u8 rx_queue_id,
-				      u16 sb,
-				      u8 sb_index,
+				      struct qed_queue_cid *p_cid,
 				      u16 bd_max_bytes,
 				      dma_addr_t bd_chain_phys_adr,
 				      dma_addr_t cqe_pbl_addr,
@@ -914,9 +906,7 @@ static inline int qed_vf_pf_rxq_start(struct qed_hwfn *p_hwfn,
 }
 
 static inline int qed_vf_pf_txq_start(struct qed_hwfn *p_hwfn,
-				      u16 tx_queue_id,
-				      u16 sb,
-				      u8 sb_index,
+				      struct qed_queue_cid *p_cid,
 				      dma_addr_t pbl_addr,
 				      u16 pbl_size, void __iomem **pp_doorbell)
 {
@@ -924,12 +914,14 @@ static inline int qed_vf_pf_txq_start(struct qed_hwfn *p_hwfn,
 }
 
 static inline int qed_vf_pf_rxq_stop(struct qed_hwfn *p_hwfn,
-				     u16 rx_qid, bool cqe_completion)
+				     struct qed_queue_cid *p_cid,
+				     bool cqe_completion)
 {
 	return -EINVAL;
 }
 
-static inline int qed_vf_pf_txq_stop(struct qed_hwfn *p_hwfn, u16 tx_qid)
+static inline int qed_vf_pf_txq_stop(struct qed_hwfn *p_hwfn,
+				     struct qed_queue_cid *p_cid)
 {
 	return -EINVAL;
 }
diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index e0a94bc..4ebc638 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -264,6 +264,8 @@ struct qede_rx_queue {
 	u64			rx_hw_errors;
 	u64			rx_alloc_errors;
 	u64			rx_ip_frags;
+
+	void *handle;
 };
 
 union db_prod {
@@ -293,6 +295,8 @@ struct qede_tx_queue {
 	u64			stopped_cnt;
 
 	bool			is_legacy;
+	void *handle;
+
 };
 
 #define BD_UNMAP_ADDR(bd)		HILO_U64(le32_to_cpu((bd)->addr.hi), \
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 8774f8e..615202f 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -3334,6 +3334,12 @@ static int qede_drain_txq(struct qede_dev *edev,
 	return 0;
 }
 
+static int qede_stop_txq(struct qede_dev *edev,
+			 struct qede_tx_queue *txq, int rss_id)
+{
+	return edev->ops->q_tx_stop(edev->cdev, rss_id, txq->handle);
+}
+
 static int qede_stop_queues(struct qede_dev *edev)
 {
 	struct qed_update_vport_params vport_update_params;
@@ -3367,28 +3373,18 @@ static int qede_stop_queues(struct qede_dev *edev)
 
 	/* Stop all Queues in reverse order */
 	for (i = QEDE_QUEUE_CNT(edev) - 1; i >= 0; i--) {
-		struct qed_stop_rxq_params rx_params;
-
 		fp = &edev->fp_array[i];
 
 		/* Stop the Tx Queue(s) */
 		if (fp->type & QEDE_FASTPATH_TX) {
-			struct qed_stop_txq_params tx_params;
-
-			tx_params.rss_id = i;
-			tx_params.tx_queue_id = fp->txq->index;
-				rc = edev->ops->q_tx_stop(cdev, &tx_params);
-				if (rc)
-					return rc;
+			rc = qede_stop_txq(edev, fp->txq, i);
+			if (rc)
+				return rc;
 		}
 
 		/* Stop the Rx Queue */
 		if (fp->type & QEDE_FASTPATH_RX) {
-			memset(&rx_params, 0, sizeof(rx_params));
-			rx_params.rss_id = i;
-			rx_params.rx_queue_id = fp->rxq->rxq_id;
-
-			rc = edev->ops->q_rx_stop(cdev, &rx_params);
+			rc = edev->ops->q_rx_stop(cdev, i, fp->rxq->handle);
 			if (rc) {
 				DP_ERR(edev, "Failed to stop RXQ #%d\n", i);
 				return rc;
@@ -3404,6 +3400,46 @@ static int qede_stop_queues(struct qede_dev *edev)
 	return rc;
 }
 
+static int qede_start_txq(struct qede_dev *edev,
+			  struct qede_fastpath *fp,
+			  struct qede_tx_queue *txq, u8 rss_id, u16 sb_idx)
+{
+	dma_addr_t phys_table = qed_chain_get_pbl_phys(&txq->tx_pbl);
+	u32 page_cnt = qed_chain_get_page_cnt(&txq->tx_pbl);
+	struct qed_queue_start_common_params params;
+	struct qed_txq_start_ret_params ret_params;
+	int rc;
+
+	memset(&params, 0, sizeof(params));
+	memset(&ret_params, 0, sizeof(ret_params));
+
+	params.queue_id = txq->index;
+	params.sb = fp->sb_info->igu_sb_id;
+	params.sb_idx = sb_idx;
+
+	rc = edev->ops->q_tx_start(edev->cdev, rss_id, &params, phys_table,
+				   page_cnt, &ret_params);
+	if (rc) {
+		DP_ERR(edev, "Start TXQ #%d failed %d\n", txq->index, rc);
+		return rc;
+	}
+
+	txq->doorbell_addr = ret_params.p_doorbell;
+	txq->handle = ret_params.p_handle;
+
+	/* Determine the FW consumer address associated */
+	txq->hw_cons_ptr = &fp->sb_info->sb_virt->pi_array[sb_idx];
+
+	/* Prepare the doorbell parameters */
+	SET_FIELD(txq->tx_db.data.params, ETH_DB_DATA_DEST, DB_DEST_XCM);
+	SET_FIELD(txq->tx_db.data.params, ETH_DB_DATA_AGG_CMD, DB_AGG_CMD_SET);
+	SET_FIELD(txq->tx_db.data.params, ETH_DB_DATA_AGG_VAL_SEL,
+		  DQ_XCM_ETH_TX_BD_PROD_CMD);
+	txq->tx_db.data.agg_flags = DQ_XCM_ETH_DQ_CF_CMD;
+
+	return rc;
+}
+
 static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 {
 	int vlan_removal_en = 1;
@@ -3445,11 +3481,12 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 		u32 page_cnt;
 
 		if (fp->type & QEDE_FASTPATH_RX) {
+			struct qed_rxq_start_ret_params ret_params;
 			struct qede_rx_queue *rxq = fp->rxq;
 			__le16 *val;
 
+			memset(&ret_params, 0, sizeof(ret_params));
 			memset(&q_params, 0, sizeof(q_params));
-			q_params.rss_id = i;
 			q_params.queue_id = rxq->rxq_id;
 			q_params.vport_id = 0;
 			q_params.sb = fp->sb_info->igu_sb_id;
@@ -3459,18 +3496,21 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 			    qed_chain_get_pbl_phys(&rxq->rx_comp_ring);
 			page_cnt = qed_chain_get_page_cnt(&rxq->rx_comp_ring);
 
-			rc = edev->ops->q_rx_start(cdev, &q_params,
+			rc = edev->ops->q_rx_start(cdev, i, &q_params,
 						   rxq->rx_buf_size,
 						   rxq->rx_bd_ring.p_phys_addr,
 						   p_phys_table,
-						   page_cnt,
-						   &rxq->hw_rxq_prod_addr);
+						   page_cnt, &ret_params);
 			if (rc) {
 				DP_ERR(edev, "Start RXQ #%d failed %d\n", i,
 				       rc);
 				return rc;
 			}
 
+			/* Use the return parameters */
+			rxq->hw_rxq_prod_addr = ret_params.p_prod;
+			rxq->handle = ret_params.p_handle;
+
 			val = &fp->sb_info->sb_virt->pi_array[RX_PI];
 			rxq->hw_cons_ptr = val;
 
@@ -3478,38 +3518,9 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 		}
 
 		if (fp->type & QEDE_FASTPATH_TX) {
-			struct qede_tx_queue *txq = fp->txq;
-
-			p_phys_table = qed_chain_get_pbl_phys(&txq->tx_pbl);
-			page_cnt = qed_chain_get_page_cnt(&txq->tx_pbl);
-
-			memset(&q_params, 0, sizeof(q_params));
-			q_params.rss_id = i;
-			q_params.queue_id = txq->index;
-			q_params.vport_id = 0;
-			q_params.sb = fp->sb_info->igu_sb_id;
-			q_params.sb_idx = TX_PI(0);
-
-			rc = edev->ops->q_tx_start(cdev, &q_params,
-						   p_phys_table, page_cnt,
-						   &txq->doorbell_addr);
-			if (rc) {
-				DP_ERR(edev, "Start TXQ #%d failed %d\n",
-				       txq->index, rc);
+			rc = qede_start_txq(edev, fp, fp->txq, i, TX_PI(0));
+			if (rc)
 				return rc;
-			}
-
-			txq->hw_cons_ptr =
-				&fp->sb_info->sb_virt->pi_array[TX_PI(0)];
-			SET_FIELD(txq->tx_db.data.params,
-				  ETH_DB_DATA_DEST, DB_DEST_XCM);
-			SET_FIELD(txq->tx_db.data.params, ETH_DB_DATA_AGG_CMD,
-				  DB_AGG_CMD_SET);
-			SET_FIELD(txq->tx_db.data.params,
-				  ETH_DB_DATA_AGG_VAL_SEL,
-				  DQ_XCM_ETH_TX_BD_PROD_CMD);
-
-			txq->tx_db.data.agg_flags = DQ_XCM_ETH_DQ_CF_CMD;
 		}
 	}
 
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 9755a3f..7a52f7c 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -15,6 +15,29 @@
 #include <linux/qed/qed_if.h>
 #include <linux/qed/qed_iov_if.h>
 
+struct qed_queue_start_common_params {
+	/* Should always be relative to entity sending this. */
+	u8 vport_id;
+	u16 queue_id;
+
+	/* Relative, but relevant only for PFs */
+	u8 stats_id;
+
+	/* These are always absolute */
+	u16 sb;
+	u8 sb_idx;
+};
+
+struct qed_rxq_start_ret_params {
+	void __iomem *p_prod;
+	void *p_handle;
+};
+
+struct qed_txq_start_ret_params {
+	void __iomem *p_doorbell;
+	void *p_handle;
+};
+
 struct qed_dev_eth_info {
 	struct qed_dev_info common;
 
@@ -56,18 +79,6 @@ struct qed_start_vport_params {
 	bool clear_stats;
 };
 
-struct qed_stop_rxq_params {
-	u8 rss_id;
-	u8 rx_queue_id;
-	u8 vport_id;
-	bool eq_completion_only;
-};
-
-struct qed_stop_txq_params {
-	u8 rss_id;
-	u8 tx_queue_id;
-};
-
 enum qed_filter_rx_mode_type {
 	QED_FILTER_RX_MODE_TYPE_REGULAR,
 	QED_FILTER_RX_MODE_TYPE_MULTI_PROMISC,
@@ -112,15 +123,6 @@ struct qed_filter_params {
 	union qed_filter_type_params filter;
 };
 
-struct qed_queue_start_common_params {
-	u8 rss_id;
-	u8 queue_id;
-	u8 vport_id;
-	u16 sb;
-	u16 sb_idx;
-	u16 vf_qid;
-};
-
 struct qed_tunn_params {
 	u16 vxlan_port;
 	u8 update_vxlan_port;
@@ -220,24 +222,24 @@ struct qed_eth_ops {
 			    struct qed_update_vport_params *params);
 
 	int (*q_rx_start)(struct qed_dev *cdev,
+			  u8 rss_num,
 			  struct qed_queue_start_common_params *params,
 			  u16 bd_max_bytes,
 			  dma_addr_t bd_chain_phys_addr,
 			  dma_addr_t cqe_pbl_addr,
 			  u16 cqe_pbl_size,
-			  void __iomem **pp_prod);
+			  struct qed_rxq_start_ret_params *ret_params);
 
-	int (*q_rx_stop)(struct qed_dev *cdev,
-			 struct qed_stop_rxq_params *params);
+	int (*q_rx_stop)(struct qed_dev *cdev, u8 rss_id, void *handle);
 
 	int (*q_tx_start)(struct qed_dev *cdev,
+			  u8 rss_num,
 			  struct qed_queue_start_common_params *params,
 			  dma_addr_t pbl_addr,
 			  u16 pbl_size,
-			  void __iomem **pp_doorbell);
+			  struct qed_txq_start_ret_params *ret_params);
 
-	int (*q_tx_stop)(struct qed_dev *cdev,
-			 struct qed_stop_txq_params *params);
+	int (*q_tx_stop)(struct qed_dev *cdev, u8 rss_id, void *handle);
 
 	int (*filter_config)(struct qed_dev *cdev,
 			     struct qed_filter_params *params);
-- 
1.9.3

^ permalink raw reply related

* [PATCH v2 net-next 04/11] qede: Refactor statistics gathering
From: Yuval Mintz @ 2016-11-29 14:47 UTC (permalink / raw)
  To: davem, netdev; +Cc: Yuval Mintz
In-Reply-To: <1480430830-17671-1-git-send-email-Yuval.Mintz@cavium.com>

Refactor logic for gathering statistics into a per-queue function.
This improves readability of the driver statistics' flows.

In addition, this would be required by the XDP forwarding queues
[as we'll need the Txq statistics gathering methods for those as well].

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
---
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c | 142 ++++++++++++++----------
 1 file changed, 86 insertions(+), 56 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index a892843..d2c23d1 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -16,13 +16,6 @@
 #include <linux/capability.h>
 #include "qede.h"
 
-#define QEDE_STAT_OFFSET(stat_name) (offsetof(struct qede_stats, stat_name))
-#define QEDE_STAT_STRING(stat_name) (#stat_name)
-#define _QEDE_STAT(stat_name, pf_only) \
-	 {QEDE_STAT_OFFSET(stat_name), QEDE_STAT_STRING(stat_name), pf_only}
-#define QEDE_PF_STAT(stat_name)		_QEDE_STAT(stat_name, true)
-#define QEDE_STAT(stat_name)		_QEDE_STAT(stat_name, false)
-
 #define QEDE_RQSTAT_OFFSET(stat_name) \
 	 (offsetof(struct qede_rx_queue, stat_name))
 #define QEDE_RQSTAT_STRING(stat_name) (#stat_name)
@@ -42,9 +35,6 @@
 };
 
 #define QEDE_NUM_RQSTATS ARRAY_SIZE(qede_rqstats_arr)
-#define QEDE_RQSTATS_DATA(dev, sindex, rqindex) \
-	(*((u64 *)(((char *)(dev->fp_array[(rqindex)].rxq)) +\
-		    qede_rqstats_arr[(sindex)].offset)))
 #define QEDE_TQSTAT_OFFSET(stat_name) \
 	(offsetof(struct qede_tx_queue, stat_name))
 #define QEDE_TQSTAT_STRING(stat_name) (#stat_name)
@@ -59,10 +49,12 @@
 	QEDE_TQSTAT(stopped_cnt),
 };
 
-#define QEDE_TQSTATS_DATA(dev, sindex, tssid) \
-	(*((u64 *)(((void *)((dev)->fp_array[tssid].txq)) + \
-		   qede_tqstats_arr[(sindex)].offset)))
-
+#define QEDE_STAT_OFFSET(stat_name) (offsetof(struct qede_stats, stat_name))
+#define QEDE_STAT_STRING(stat_name) (#stat_name)
+#define _QEDE_STAT(stat_name, pf_only) \
+	 {QEDE_STAT_OFFSET(stat_name), QEDE_STAT_STRING(stat_name), pf_only}
+#define QEDE_PF_STAT(stat_name)	_QEDE_STAT(stat_name, true)
+#define QEDE_STAT(stat_name)	_QEDE_STAT(stat_name, false)
 static const struct {
 	u64 offset;
 	char string[ETH_GSTRING_LEN];
@@ -136,10 +128,6 @@
 	QEDE_STAT(coalesced_bytes),
 };
 
-#define QEDE_STATS_DATA(dev, index) \
-	(*((u64 *)(((char *)(dev)) + offsetof(struct qede_dev, stats) \
-			+ qede_stats_arr[(index)].offset)))
-
 #define QEDE_NUM_STATS	ARRAY_SIZE(qede_stats_arr)
 
 enum {
@@ -170,36 +158,52 @@ enum qede_ethtool_tests {
 	"Nvram (online)\t\t",
 };
 
+static void qede_get_strings_stats_txq(struct qede_dev *edev,
+				       struct qede_tx_queue *txq, u8 **buf)
+{
+	int i;
+
+	for (i = 0; i < QEDE_NUM_TQSTATS; i++) {
+		sprintf(*buf, "%d: %s", txq->index,
+			qede_tqstats_arr[i].string);
+		*buf += ETH_GSTRING_LEN;
+	}
+}
+
+static void qede_get_strings_stats_rxq(struct qede_dev *edev,
+				       struct qede_rx_queue *rxq, u8 **buf)
+{
+	int i;
+
+	for (i = 0; i < QEDE_NUM_RQSTATS; i++) {
+		sprintf(*buf, "%d: %s", rxq->rxq_id,
+			qede_rqstats_arr[i].string);
+		*buf += ETH_GSTRING_LEN;
+	}
+}
+
 static void qede_get_strings_stats(struct qede_dev *edev, u8 *buf)
 {
-	int i, j, k;
+	struct qede_fastpath *fp;
+	int i;
 
-	for (i = 0, k = 0; i < QEDE_QUEUE_CNT(edev); i++) {
+	/* Account for queue statistics */
+	for (i = 0; i < QEDE_QUEUE_CNT(edev); i++) {
+		fp = &edev->fp_array[i];
 
-		if (edev->fp_array[i].type & QEDE_FASTPATH_RX) {
-			for (j = 0; j < QEDE_NUM_RQSTATS; j++)
-				sprintf(buf + (k + j) * ETH_GSTRING_LEN,
-					"%d:   %s", i,
-					qede_rqstats_arr[j].string);
-			k += QEDE_NUM_RQSTATS;
-		}
+		if (fp->type & QEDE_FASTPATH_RX)
+			qede_get_strings_stats_rxq(edev, fp->rxq, &buf);
 
-		if (edev->fp_array[i].type & QEDE_FASTPATH_TX) {
-			for (j = 0; j < QEDE_NUM_TQSTATS; j++)
-				sprintf(buf + (k + j) *
-					ETH_GSTRING_LEN,
-					"%d: %s", i,
-					qede_tqstats_arr[j].string);
-			k += QEDE_NUM_TQSTATS;
-		}
+		if (fp->type & QEDE_FASTPATH_TX)
+			qede_get_strings_stats_txq(edev, fp->txq, &buf);
 	}
 
-	for (i = 0, j = 0; i < QEDE_NUM_STATS; i++) {
+	/* Account for non-queue statistics */
+	for (i = 0; i < QEDE_NUM_STATS; i++) {
 		if (IS_VF(edev) && qede_stats_arr[i].pf_only)
 			continue;
-		strcpy(buf + (k + j) * ETH_GSTRING_LEN,
-		       qede_stats_arr[i].string);
-		j++;
+		strcpy(buf, qede_stats_arr[i].string);
+		buf += ETH_GSTRING_LEN;
 	}
 }
 
@@ -225,34 +229,53 @@ static void qede_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
 	}
 }
 
+static void qede_get_ethtool_stats_txq(struct qede_tx_queue *txq, u64 **buf)
+{
+	int i;
+
+	for (i = 0; i < QEDE_NUM_TQSTATS; i++) {
+		**buf = *((u64 *)(((void *)txq) + qede_tqstats_arr[i].offset));
+		(*buf)++;
+	}
+}
+
+static void qede_get_ethtool_stats_rxq(struct qede_rx_queue *rxq, u64 **buf)
+{
+	int i;
+
+	for (i = 0; i < QEDE_NUM_RQSTATS; i++) {
+		**buf = *((u64 *)(((void *)rxq) + qede_rqstats_arr[i].offset));
+		(*buf)++;
+	}
+}
+
 static void qede_get_ethtool_stats(struct net_device *dev,
 				   struct ethtool_stats *stats, u64 *buf)
 {
 	struct qede_dev *edev = netdev_priv(dev);
-	int sidx, cnt = 0;
-	int qid;
+	struct qede_fastpath *fp;
+	int i;
 
 	qede_fill_by_demand_stats(edev);
 
 	mutex_lock(&edev->qede_lock);
+	for (i = 0; i < QEDE_QUEUE_CNT(edev); i++) {
+		fp = &edev->fp_array[i];
 
-	for (qid = 0; qid < QEDE_QUEUE_CNT(edev); qid++) {
-
-		if (edev->fp_array[qid].type & QEDE_FASTPATH_RX)
-			for (sidx = 0; sidx < QEDE_NUM_RQSTATS; sidx++)
-				buf[cnt++] = QEDE_RQSTATS_DATA(edev, sidx, qid);
+		if (fp->type & QEDE_FASTPATH_RX)
+			qede_get_ethtool_stats_rxq(fp->rxq, &buf);
 
-		if (edev->fp_array[qid].type & QEDE_FASTPATH_TX)
-			for (sidx = 0; sidx < QEDE_NUM_TQSTATS; sidx++)
-				buf[cnt++] = QEDE_TQSTATS_DATA(edev,
-							       sidx,
-							       qid);
+		if (fp->type & QEDE_FASTPATH_TX)
+			qede_get_ethtool_stats_txq(fp->txq, &buf);
 	}
 
-	for (sidx = 0; sidx < QEDE_NUM_STATS; sidx++) {
-		if (IS_VF(edev) && qede_stats_arr[sidx].pf_only)
+	for (i = 0; i < QEDE_NUM_STATS; i++) {
+		if (IS_VF(edev) && qede_stats_arr[i].pf_only)
 			continue;
-		buf[cnt++] = QEDE_STATS_DATA(edev, sidx);
+		*buf = *((u64 *)(((void *)&edev->stats) +
+				 qede_stats_arr[i].offset));
+
+		buf++;
 	}
 
 	mutex_unlock(&edev->qede_lock);
@@ -272,8 +295,15 @@ static int qede_get_sset_count(struct net_device *dev, int stringset)
 				if (qede_stats_arr[i].pf_only)
 					num_stats--;
 		}
-		return num_stats + QEDE_RSS_COUNT(edev) * QEDE_NUM_RQSTATS +
-		       QEDE_TSS_COUNT(edev) * QEDE_NUM_TQSTATS;
+
+		/* Account for the Regular Tx statistics */
+		num_stats += QEDE_TSS_COUNT(edev) * QEDE_NUM_TQSTATS;
+
+		/* Account for the Regular Rx statistics */
+		num_stats += QEDE_RSS_COUNT(edev) * QEDE_NUM_RQSTATS;
+
+		return num_stats;
+
 	case ETH_SS_PRIV_FLAGS:
 		return QEDE_PRI_FLAG_LEN;
 	case ETH_SS_TEST:
-- 
1.9.3

^ permalink raw reply related

* [PATCH v2 net-next 06/11] qede: Revise state locking scheme
From: Yuval Mintz @ 2016-11-29 14:47 UTC (permalink / raw)
  To: davem, netdev; +Cc: Yuval Mintz
In-Reply-To: <1480430830-17671-1-git-send-email-Yuval.Mintz@cavium.com>

As qede utilizes an internal-reload sequence as result of various
configuration changes, the netif state wouldn't always accurately describe
the status of the configuration.
To compensate, we're storing an internal state of the device, which should
only be accessed under the qede_lock.

This patch fixes and improves several state/lock interactions:
  - The internal state should only be checked while locked.
  - While holding lock, it's preferable to check state rather than
    the netdevice's state.
  - The reload sequence is not 'atomic' - unload and subsequent load
    are not in the same critical section.

This also add the 'locked' variant for the reload, which would later be
used by XDP - useful in the case where the correct sequence is 'lock,
check state and re-configure if good', instead of allowing the reload
itself to make the decision regarding the configurability of the device.

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
---
 drivers/net/ethernet/qlogic/qede/qede.h         |  14 ++-
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c |  37 +++---
 drivers/net/ethernet/qlogic/qede/qede_main.c    | 146 +++++++++++++++---------
 3 files changed, 122 insertions(+), 75 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index b9584b2..e0a94bc 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -338,8 +338,12 @@ struct qede_fastpath {
 #define QEDE_SP_VXLAN_PORT_CONFIG	2
 #define QEDE_SP_GENEVE_PORT_CONFIG	3
 
-union qede_reload_args {
-	u16 mtu;
+struct qede_reload_args {
+	void (*func)(struct qede_dev *edev, struct qede_reload_args *args);
+	union {
+		netdev_features_t features;
+		u16 mtu;
+	} u;
 };
 
 #ifdef CONFIG_DCB
@@ -348,11 +352,11 @@ struct qede_fastpath {
 void qede_config_debug(uint debug, u32 *p_dp_module, u8 *p_dp_level);
 void qede_set_ethtool_ops(struct net_device *netdev);
 void qede_reload(struct qede_dev *edev,
-		 void (*func)(struct qede_dev *edev,
-			      union qede_reload_args *args),
-		 union qede_reload_args *args);
+		 struct qede_reload_args *args, bool is_locked);
 int qede_change_mtu(struct net_device *dev, int new_mtu);
 void qede_fill_by_demand_stats(struct qede_dev *edev);
+void __qede_lock(struct qede_dev *edev);
+void __qede_unlock(struct qede_dev *edev);
 bool qede_has_rx_work(struct qede_rx_queue *rxq);
 int qede_txq_has_work(struct qede_tx_queue *txq);
 void qede_recycle_rx_bd_ring(struct qede_rx_queue *rxq, struct qede_dev *edev,
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index d2c23d1..ef8c327 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -258,7 +258,9 @@ static void qede_get_ethtool_stats(struct net_device *dev,
 
 	qede_fill_by_demand_stats(edev);
 
-	mutex_lock(&edev->qede_lock);
+	/* Need to protect the access to the fastpath array */
+	__qede_lock(edev);
+
 	for (i = 0; i < QEDE_QUEUE_CNT(edev); i++) {
 		fp = &edev->fp_array[i];
 
@@ -278,7 +280,7 @@ static void qede_get_ethtool_stats(struct net_device *dev,
 		buf++;
 	}
 
-	mutex_unlock(&edev->qede_lock);
+	__qede_unlock(edev);
 }
 
 static int qede_get_sset_count(struct net_device *dev, int stringset)
@@ -374,6 +376,8 @@ static int qede_get_link_ksettings(struct net_device *dev,
 	struct qede_dev *edev = netdev_priv(dev);
 	struct qed_link_output current_link;
 
+	__qede_lock(edev);
+
 	memset(&current_link, 0, sizeof(current_link));
 	edev->ops->common->get_link(edev->cdev, &current_link);
 
@@ -393,6 +397,9 @@ static int qede_get_link_ksettings(struct net_device *dev,
 		base->speed = SPEED_UNKNOWN;
 		base->duplex = DUPLEX_UNKNOWN;
 	}
+
+	__qede_unlock(edev);
+
 	base->port = current_link.port;
 	base->autoneg = (current_link.autoneg) ? AUTONEG_ENABLE :
 			AUTONEG_DISABLE;
@@ -701,8 +708,7 @@ static int qede_set_ringparam(struct net_device *dev,
 	edev->q_num_rx_buffers = ering->rx_pending;
 	edev->q_num_tx_buffers = ering->tx_pending;
 
-	if (netif_running(edev->ndev))
-		qede_reload(edev, NULL, NULL);
+	qede_reload(edev, NULL, false);
 
 	return 0;
 }
@@ -787,29 +793,27 @@ static int qede_get_regs_len(struct net_device *ndev)
 		return -EINVAL;
 }
 
-static void qede_update_mtu(struct qede_dev *edev, union qede_reload_args *args)
+static void qede_update_mtu(struct qede_dev *edev,
+			    struct qede_reload_args *args)
 {
-	edev->ndev->mtu = args->mtu;
+	edev->ndev->mtu = args->u.mtu;
 }
 
 /* Netdevice NDOs */
 int qede_change_mtu(struct net_device *ndev, int new_mtu)
 {
 	struct qede_dev *edev = netdev_priv(ndev);
-	union qede_reload_args args;
+	struct qede_reload_args args;
 
 	DP_VERBOSE(edev, (NETIF_MSG_IFUP | NETIF_MSG_IFDOWN),
 		   "Configuring MTU size of %d\n", new_mtu);
 
-	/* Set the mtu field and re-start the interface if needed*/
-	args.mtu = new_mtu;
-
-	if (netif_running(edev->ndev))
-		qede_reload(edev, &qede_update_mtu, &args);
-
-	qede_update_mtu(edev, &args);
+	/* Set the mtu field and re-start the interface if needed */
+	args.u.mtu = new_mtu;
+	args.func = &qede_update_mtu;
+	qede_reload(edev, &args, false);
 
-	edev->ops->common->update_mtu(edev->cdev, args.mtu);
+	edev->ops->common->update_mtu(edev->cdev, new_mtu);
 
 	return 0;
 }
@@ -893,8 +897,7 @@ static int qede_set_channels(struct net_device *dev,
 		       sizeof(edev->rss_params.rss_ind_table));
 	}
 
-	if (netif_running(dev))
-		qede_reload(edev, NULL, NULL);
+	qede_reload(edev, NULL, false);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index acc350b..8774f8e 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -100,6 +100,19 @@ static int qede_alloc_rx_buffer(struct qede_dev *edev,
 				struct qede_rx_queue *rxq);
 static void qede_link_update(void *dev, struct qed_link_output *link);
 
+/* The qede lock is used to protect driver state change and driver flows that
+ * are not reentrant.
+ */
+void __qede_lock(struct qede_dev *edev)
+{
+	mutex_lock(&edev->qede_lock);
+}
+
+void __qede_unlock(struct qede_dev *edev)
+{
+	mutex_unlock(&edev->qede_lock);
+}
+
 #ifdef CONFIG_QED_SRIOV
 static int qede_set_vf_vlan(struct net_device *ndev, int vf, u16 vlan, u8 qos,
 			    __be16 vlan_proto)
@@ -1952,7 +1965,7 @@ static int qede_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
 {
 	struct qede_dev *edev = netdev_priv(dev);
 	struct qede_vlan *vlan, *tmp;
-	int rc;
+	int rc = 0;
 
 	DP_VERBOSE(edev, NETIF_MSG_IFUP, "Adding vlan 0x%04x\n", vid);
 
@@ -1976,6 +1989,7 @@ static int qede_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
 	}
 
 	/* If interface is down, cache this VLAN ID and return */
+	__qede_lock(edev);
 	if (edev->state != QEDE_STATE_OPEN) {
 		DP_VERBOSE(edev, NETIF_MSG_IFDOWN,
 			   "Interface is down, VLAN %d will be configured when interface is up\n",
@@ -1983,8 +1997,7 @@ static int qede_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
 		if (vid != 0)
 			edev->non_configured_vlans++;
 		list_add(&vlan->list, &edev->vlan_list);
-
-		return 0;
+		goto out;
 	}
 
 	/* Check for the filter limit.
@@ -2000,7 +2013,7 @@ static int qede_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
 			DP_ERR(edev, "Failed to configure VLAN %d\n",
 			       vlan->vid);
 			kfree(vlan);
-			return -EINVAL;
+			goto out;
 		}
 		vlan->configured = true;
 
@@ -2017,7 +2030,9 @@ static int qede_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
 
 	list_add(&vlan->list, &edev->vlan_list);
 
-	return 0;
+out:
+	__qede_unlock(edev);
+	return rc;
 }
 
 static void qede_del_vlan_from_list(struct qede_dev *edev,
@@ -2094,11 +2109,12 @@ static int qede_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid)
 {
 	struct qede_dev *edev = netdev_priv(dev);
 	struct qede_vlan *vlan = NULL;
-	int rc;
+	int rc = 0;
 
 	DP_VERBOSE(edev, NETIF_MSG_IFDOWN, "Removing vlan 0x%04x\n", vid);
 
 	/* Find whether entry exists */
+	__qede_lock(edev);
 	list_for_each_entry(vlan, &edev->vlan_list, list)
 		if (vlan->vid == vid)
 			break;
@@ -2106,7 +2122,7 @@ static int qede_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid)
 	if (!vlan || (vlan->vid != vid)) {
 		DP_VERBOSE(edev, (NETIF_MSG_IFUP | NETIF_MSG_IFDOWN),
 			   "Vlan isn't configured\n");
-		return 0;
+		goto out;
 	}
 
 	if (edev->state != QEDE_STATE_OPEN) {
@@ -2116,7 +2132,7 @@ static int qede_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid)
 		DP_VERBOSE(edev, NETIF_MSG_IFDOWN,
 			   "Interface is down, removing VLAN from list only\n");
 		qede_del_vlan_from_list(edev, vlan);
-		return 0;
+		goto out;
 	}
 
 	/* Remove vlan */
@@ -2125,7 +2141,7 @@ static int qede_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid)
 					    vid);
 		if (rc) {
 			DP_ERR(edev, "Failed to remove VLAN %d\n", vid);
-			return -EINVAL;
+			goto out;
 		}
 	}
 
@@ -2136,6 +2152,8 @@ static int qede_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid)
 	 */
 	rc = qede_configure_vlan_filters(edev);
 
+out:
+	__qede_unlock(edev);
 	return rc;
 }
 
@@ -2165,7 +2183,13 @@ static void qede_vlan_mark_nonconfigured(struct qede_dev *edev)
 	edev->accept_any_vlan = false;
 }
 
-static int qede_set_features(struct net_device *dev, netdev_features_t features)
+static void qede_set_features_reload(struct qede_dev *edev,
+				     struct qede_reload_args *args)
+{
+	edev->ndev->features = args->u.features;
+}
+
+int qede_set_features(struct net_device *dev, netdev_features_t features)
 {
 	struct qede_dev *edev = netdev_priv(dev);
 	netdev_features_t changes = features ^ dev->features;
@@ -2179,9 +2203,14 @@ static int qede_set_features(struct net_device *dev, netdev_features_t features)
 			need_reload = edev->gro_disable;
 	}
 
-	if (need_reload && netif_running(edev->ndev)) {
-		dev->features = features;
-		qede_reload(edev, NULL, NULL);
+	if (need_reload) {
+		struct qede_reload_args args;
+
+		args.u.features = features;
+		args.func = &qede_set_features_reload;
+
+		qede_reload(edev, &args, false);
+
 		return 1;
 	}
 
@@ -2528,12 +2557,11 @@ static void qede_sp_task(struct work_struct *work)
 					     sp_task.work);
 	struct qed_dev *cdev = edev->cdev;
 
-	mutex_lock(&edev->qede_lock);
+	__qede_lock(edev);
 
-	if (edev->state == QEDE_STATE_OPEN) {
-		if (test_and_clear_bit(QEDE_SP_RX_MODE, &edev->sp_flags))
+	if (test_and_clear_bit(QEDE_SP_RX_MODE, &edev->sp_flags))
+		if (edev->state == QEDE_STATE_OPEN)
 			qede_config_rx_mode(edev->ndev);
-	}
 
 	if (test_and_clear_bit(QEDE_SP_VXLAN_PORT_CONFIG, &edev->sp_flags)) {
 		struct qed_tunn_params tunn_params;
@@ -2553,7 +2581,7 @@ static void qede_sp_task(struct work_struct *work)
 		qed_ops->tunn_config(cdev, &tunn_params);
 	}
 
-	mutex_unlock(&edev->qede_lock);
+	__qede_unlock(edev);
 }
 
 static void qede_update_pf_params(struct qed_dev *cdev)
@@ -3576,15 +3604,18 @@ enum qede_unload_mode {
 	QEDE_UNLOAD_NORMAL,
 };
 
-static void qede_unload(struct qede_dev *edev, enum qede_unload_mode mode)
+static void qede_unload(struct qede_dev *edev, enum qede_unload_mode mode,
+			bool is_locked)
 {
 	struct qed_link_params link_params;
 	int rc;
 
 	DP_INFO(edev, "Starting qede unload\n");
 
+	if (!is_locked)
+		__qede_lock(edev);
+
 	qede_roce_dev_event_close(edev);
-	mutex_lock(&edev->qede_lock);
 	edev->state = QEDE_STATE_CLOSED;
 
 	/* Close OS Tx */
@@ -3616,7 +3647,8 @@ static void qede_unload(struct qede_dev *edev, enum qede_unload_mode mode)
 	qede_free_fp_array(edev);
 
 out:
-	mutex_unlock(&edev->qede_lock);
+	if (!is_locked)
+		__qede_unlock(edev);
 	DP_INFO(edev, "Ending qede unload\n");
 }
 
@@ -3625,7 +3657,8 @@ enum qede_load_mode {
 	QEDE_LOAD_RELOAD,
 };
 
-static int qede_load(struct qede_dev *edev, enum qede_load_mode mode)
+static int qede_load(struct qede_dev *edev, enum qede_load_mode mode,
+		     bool is_locked)
 {
 	struct qed_link_params link_params;
 	struct qed_link_output link_output;
@@ -3633,13 +3666,16 @@ static int qede_load(struct qede_dev *edev, enum qede_load_mode mode)
 
 	DP_INFO(edev, "Starting qede load\n");
 
+	if (!is_locked)
+		__qede_lock(edev);
+
 	rc = qede_set_num_queues(edev);
 	if (rc)
-		goto err0;
+		goto out;
 
 	rc = qede_alloc_fp_array(edev);
 	if (rc)
-		goto err0;
+		goto out;
 
 	qede_init_fp(edev);
 
@@ -3669,10 +3705,6 @@ static int qede_load(struct qede_dev *edev, enum qede_load_mode mode)
 	/* Add primary mac and set Rx filters */
 	ether_addr_copy(edev->primary_mac, edev->ndev->dev_addr);
 
-	mutex_lock(&edev->qede_lock);
-	edev->state = QEDE_STATE_OPEN;
-	mutex_unlock(&edev->qede_lock);
-
 	/* Program un-configured VLANs */
 	qede_configure_vlan_filters(edev);
 
@@ -3687,10 +3719,12 @@ static int qede_load(struct qede_dev *edev, enum qede_load_mode mode)
 	qede_roce_dev_event_open(edev);
 	qede_link_update(edev, &link_output);
 
+	edev->state = QEDE_STATE_OPEN;
+
 	DP_INFO(edev, "Ending successfully qede load\n");
 
-	return 0;
 
+	goto out;
 err4:
 	qede_sync_free_irqs(edev);
 	memset(&edev->int_info.msix_cnt, 0, sizeof(struct qed_int_info));
@@ -3704,26 +3738,40 @@ static int qede_load(struct qede_dev *edev, enum qede_load_mode mode)
 	edev->num_queues = 0;
 	edev->fp_num_tx = 0;
 	edev->fp_num_rx = 0;
-err0:
+out:
+	if (!is_locked)
+		__qede_unlock(edev);
+
 	return rc;
 }
 
+/* 'func' should be able to run between unload and reload assuming interface
+ * is actually running, or afterwards in case it's currently DOWN.
+ */
 void qede_reload(struct qede_dev *edev,
-		 void (*func)(struct qede_dev *, union qede_reload_args *),
-		 union qede_reload_args *args)
+		 struct qede_reload_args *args, bool is_locked)
 {
-	qede_unload(edev, QEDE_UNLOAD_NORMAL);
-	/* Call function handler to update parameters
-	 * needed for function load.
+	if (!is_locked)
+		__qede_lock(edev);
+
+	/* Since qede_lock is held, internal state wouldn't change even
+	 * if netdev state would start transitioning. Check whether current
+	 * internal configuration indicates device is up, then reload.
 	 */
-	if (func)
-		func(edev, args);
+	if (edev->state == QEDE_STATE_OPEN) {
+		qede_unload(edev, QEDE_UNLOAD_NORMAL, true);
+		if (args)
+			args->func(edev, args);
+		qede_load(edev, QEDE_LOAD_RELOAD, true);
 
-	qede_load(edev, QEDE_LOAD_RELOAD);
+		/* Since no one is going to do it for us, re-configure */
+		qede_config_rx_mode(edev->ndev);
+	} else if (args) {
+		args->func(edev, args);
+	}
 
-	mutex_lock(&edev->qede_lock);
-	qede_config_rx_mode(edev->ndev);
-	mutex_unlock(&edev->qede_lock);
+	if (!is_locked)
+		__qede_unlock(edev);
 }
 
 /* called with rtnl_lock */
@@ -3736,8 +3784,7 @@ static int qede_open(struct net_device *ndev)
 
 	edev->ops->common->set_power_state(edev->cdev, PCI_D0);
 
-	rc = qede_load(edev, QEDE_LOAD_NORMAL);
-
+	rc = qede_load(edev, QEDE_LOAD_NORMAL, false);
 	if (rc)
 		return rc;
 
@@ -3752,7 +3799,7 @@ static int qede_close(struct net_device *ndev)
 {
 	struct qede_dev *edev = netdev_priv(ndev);
 
-	qede_unload(edev, QEDE_UNLOAD_NORMAL);
+	qede_unload(edev, QEDE_UNLOAD_NORMAL, false);
 
 	edev->ops->common->update_drv_state(edev->cdev, false);
 
@@ -3884,15 +3931,8 @@ static void qede_set_rx_mode(struct net_device *ndev)
 {
 	struct qede_dev *edev = netdev_priv(ndev);
 
-	DP_INFO(edev, "qede_set_rx_mode called\n");
-
-	if (edev->state != QEDE_STATE_OPEN) {
-		DP_INFO(edev,
-			"qede_set_rx_mode called while interface is down\n");
-	} else {
-		set_bit(QEDE_SP_RX_MODE, &edev->sp_flags);
-		schedule_delayed_work(&edev->sp_task, 0);
-	}
+	set_bit(QEDE_SP_RX_MODE, &edev->sp_flags);
+	schedule_delayed_work(&edev->sp_task, 0);
 }
 
 /* Must be called with qede_lock held */
-- 
1.9.3

^ permalink raw reply related

* [PATCH v2 net-next 05/11] qede: Refactor data-path Rx flow
From: Yuval Mintz @ 2016-11-29 14:47 UTC (permalink / raw)
  To: davem, netdev; +Cc: Yuval Mintz
In-Reply-To: <1480430830-17671-1-git-send-email-Yuval.Mintz@cavium.com>

Driver's NAPI poll is using a long sequence for processing ingress
packets, and it's going to get even longer once we do XDP.
Break down the main loop into a series of sub-functions to allow
better readability of the function.

While we're at it, correct the accounting of the NAPI budget -
currently we're counting only packets passed to the stack against
the budget, even in case those are actually aggregations.
After refactoring every CQE processed would be counted against the budget.

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
---
 drivers/net/ethernet/qlogic/qede/qede_main.c | 488 +++++++++++++++------------
 1 file changed, 264 insertions(+), 224 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 2006dd4..acc350b 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1021,6 +1021,7 @@ static inline void qede_skb_receive(struct qede_dev *edev,
 		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
 
 	napi_gro_receive(&fp->napi, skb);
+	fp->rxq->rcv_pkts++;
 }
 
 static void qede_set_gro_params(struct qede_dev *edev,
@@ -1383,238 +1384,298 @@ static bool qede_pkt_is_ip_fragmented(struct eth_fast_path_rx_reg_cqe *cqe,
 	return false;
 }
 
-static int qede_rx_int(struct qede_fastpath *fp, int budget)
+static struct sk_buff *qede_rx_allocate_skb(struct qede_dev *edev,
+					    struct qede_rx_queue *rxq,
+					    struct sw_rx_data *bd, u16 len,
+					    u16 pad)
 {
-	struct qede_dev *edev = fp->edev;
-	struct qede_rx_queue *rxq = fp->rxq;
-
-	u16 hw_comp_cons, sw_comp_cons, sw_rx_index, parse_flag;
-	int rx_pkt = 0;
-	u8 csum_flag;
+	unsigned int offset = bd->page_offset;
+	struct skb_frag_struct *frag;
+	struct page *page = bd->data;
+	unsigned int pull_len;
+	struct sk_buff *skb;
+	unsigned char *va;
 
-	hw_comp_cons = le16_to_cpu(*rxq->hw_cons_ptr);
-	sw_comp_cons = qed_chain_get_cons_idx(&rxq->rx_comp_ring);
+	/* Allocate a new SKB with a sufficient large header len */
+	skb = netdev_alloc_skb(edev->ndev, QEDE_RX_HDR_SIZE);
+	if (unlikely(!skb))
+		return NULL;
 
-	/* Memory barrier to prevent the CPU from doing speculative reads of CQE
-	 * / BD in the while-loop before reading hw_comp_cons. If the CQE is
-	 * read before it is written by FW, then FW writes CQE and SB, and then
-	 * the CPU reads the hw_comp_cons, it will use an old CQE.
+	/* Copy data into SKB - if it's small, we can simply copy it and
+	 * re-use the already allcoated & mapped memory.
 	 */
-	rmb();
+	if (len + pad <= edev->rx_copybreak) {
+		memcpy(skb_put(skb, len),
+		       page_address(page) + pad + offset, len);
+		qede_reuse_page(edev, rxq, bd);
+		goto out;
+	}
 
-	/* Loop to complete all indicated BDs */
-	while (sw_comp_cons != hw_comp_cons) {
-		struct eth_fast_path_rx_reg_cqe *fp_cqe;
-		enum pkt_hash_types rxhash_type;
-		enum eth_rx_cqe_type cqe_type;
-		struct sw_rx_data *sw_rx_data;
-		union eth_rx_cqe *cqe;
-		struct sk_buff *skb;
-		struct page *data;
-		__le16 flags;
-		u16 len, pad;
-		u32 rx_hash;
-
-		/* Get the CQE from the completion ring */
-		cqe = (union eth_rx_cqe *)
-			qed_chain_consume(&rxq->rx_comp_ring);
-		cqe_type = cqe->fast_path_regular.type;
-
-		if (unlikely(cqe_type == ETH_RX_CQE_TYPE_SLOW_PATH)) {
-			edev->ops->eth_cqe_completion(
-					edev->cdev, fp->id,
-					(struct eth_slow_path_rx_cqe *)cqe);
-			goto next_cqe;
-		}
+	frag = &skb_shinfo(skb)->frags[0];
 
-		if (cqe_type != ETH_RX_CQE_TYPE_REGULAR) {
-			switch (cqe_type) {
-			case ETH_RX_CQE_TYPE_TPA_START:
-				qede_tpa_start(edev, rxq,
-					       &cqe->fast_path_tpa_start);
-				goto next_cqe;
-			case ETH_RX_CQE_TYPE_TPA_CONT:
-				qede_tpa_cont(edev, rxq,
-					      &cqe->fast_path_tpa_cont);
-				goto next_cqe;
-			case ETH_RX_CQE_TYPE_TPA_END:
-				qede_tpa_end(edev, fp,
-					     &cqe->fast_path_tpa_end);
-				goto next_rx_only;
-			default:
-				break;
-			}
-		}
+	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
+			page, pad + offset, len, rxq->rx_buf_seg_size);
 
-		/* Get the data from the SW ring */
-		sw_rx_index = rxq->sw_rx_cons & NUM_RX_BDS_MAX;
-		sw_rx_data = &rxq->sw_rx_ring[sw_rx_index];
-		data = sw_rx_data->data;
-
-		fp_cqe = &cqe->fast_path_regular;
-		len =  le16_to_cpu(fp_cqe->len_on_first_bd);
-		pad = fp_cqe->placement_offset;
-		flags = cqe->fast_path_regular.pars_flags.flags;
-
-		/* If this is an error packet then drop it */
-		parse_flag = le16_to_cpu(flags);
-
-		csum_flag = qede_check_csum(parse_flag);
-		if (unlikely(csum_flag == QEDE_CSUM_ERROR)) {
-			if (qede_pkt_is_ip_fragmented(&cqe->fast_path_regular,
-						      parse_flag)) {
-				rxq->rx_ip_frags++;
-				goto alloc_skb;
-			}
+	va = skb_frag_address(frag);
+	pull_len = eth_get_headlen(va, QEDE_RX_HDR_SIZE);
 
-			DP_NOTICE(edev,
-				  "CQE in CONS = %u has error, flags = %x, dropping incoming packet\n",
-				  sw_comp_cons, parse_flag);
-			rxq->rx_hw_errors++;
-			qede_recycle_rx_bd_ring(rxq, edev, fp_cqe->bd_num);
-			goto next_cqe;
-		}
+	/* Align the pull_len to optimize memcpy */
+	memcpy(skb->data, va, ALIGN(pull_len, sizeof(long)));
 
-alloc_skb:
-		skb = netdev_alloc_skb(edev->ndev, QEDE_RX_HDR_SIZE);
-		if (unlikely(!skb)) {
-			DP_NOTICE(edev,
-				  "skb allocation failed, dropping incoming packet\n");
-			qede_recycle_rx_bd_ring(rxq, edev, fp_cqe->bd_num);
-			rxq->rx_alloc_errors++;
-			goto next_cqe;
+	/* Correct the skb & frag sizes offset after the pull */
+	skb_frag_size_sub(frag, pull_len);
+	frag->page_offset += pull_len;
+	skb->data_len -= pull_len;
+	skb->tail += pull_len;
+
+	if (unlikely(qede_realloc_rx_buffer(edev, rxq, bd))) {
+		/* Incr page ref count to reuse on allocation failure so
+		 * that it doesn't get freed while freeing SKB [as its
+		 * already mapped there].
+		 */
+		page_ref_inc(page);
+		dev_kfree_skb_any(skb);
+		return NULL;
+	}
+
+out:
+	/* We've consumed the first BD and prepared an SKB */
+	qede_rx_bd_ring_consume(rxq);
+	return skb;
+}
+
+static int qede_rx_build_jumbo(struct qede_dev *edev,
+			       struct qede_rx_queue *rxq,
+			       struct sk_buff *skb,
+			       struct eth_fast_path_rx_reg_cqe *cqe,
+			       u16 first_bd_len)
+{
+	u16 pkt_len = le16_to_cpu(cqe->pkt_len);
+	struct sw_rx_data *bd;
+	u16 bd_cons_idx;
+	u8 num_frags;
+
+	pkt_len -= first_bd_len;
+
+	/* We've already used one BD for the SKB. Now take care of the rest */
+	for (num_frags = cqe->bd_num - 1; num_frags > 0; num_frags--) {
+		u16 cur_size = pkt_len > rxq->rx_buf_size ? rxq->rx_buf_size :
+		    pkt_len;
+
+		if (unlikely(!cur_size)) {
+			DP_ERR(edev,
+			       "Still got %d BDs for mapping jumbo, but length became 0\n",
+			       num_frags);
+			goto out;
 		}
 
-		/* Copy data into SKB */
-		if (len + pad <= edev->rx_copybreak) {
-			memcpy(skb_put(skb, len),
-			       page_address(data) + pad +
-				sw_rx_data->page_offset, len);
-			qede_reuse_page(edev, rxq, sw_rx_data);
+		/* We need a replacement buffer for each BD */
+		if (unlikely(qede_alloc_rx_buffer(edev, rxq)))
+			goto out;
+
+		/* Now that we've allocated the replacement buffer,
+		 * we can safely consume the next BD and map it to the SKB.
+		 */
+		bd_cons_idx = rxq->sw_rx_cons & NUM_RX_BDS_MAX;
+		bd = &rxq->sw_rx_ring[bd_cons_idx];
+		qede_rx_bd_ring_consume(rxq);
+
+		dma_unmap_page(&edev->pdev->dev, bd->mapping,
+			       PAGE_SIZE, DMA_FROM_DEVICE);
+
+		skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags++,
+				   bd->data, 0, cur_size);
+
+		skb->truesize += PAGE_SIZE;
+		skb->data_len += cur_size;
+		skb->len += cur_size;
+		pkt_len -= cur_size;
+	}
+
+	if (unlikely(pkt_len))
+		DP_ERR(edev,
+		       "Mapped all BDs of jumbo, but still have %d bytes\n",
+		       pkt_len);
+
+out:
+	return num_frags;
+}
+
+static int qede_rx_process_tpa_cqe(struct qede_dev *edev,
+				   struct qede_fastpath *fp,
+				   struct qede_rx_queue *rxq,
+				   union eth_rx_cqe *cqe,
+				   enum eth_rx_cqe_type type)
+{
+	switch (type) {
+	case ETH_RX_CQE_TYPE_TPA_START:
+		qede_tpa_start(edev, rxq, &cqe->fast_path_tpa_start);
+		return 0;
+	case ETH_RX_CQE_TYPE_TPA_CONT:
+		qede_tpa_cont(edev, rxq, &cqe->fast_path_tpa_cont);
+		return 0;
+	case ETH_RX_CQE_TYPE_TPA_END:
+		qede_tpa_end(edev, fp, &cqe->fast_path_tpa_end);
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+static int qede_rx_process_cqe(struct qede_dev *edev,
+			       struct qede_fastpath *fp,
+			       struct qede_rx_queue *rxq)
+{
+	struct eth_fast_path_rx_reg_cqe *fp_cqe;
+	u16 len, pad, bd_cons_idx, parse_flag;
+	enum pkt_hash_types rxhash_type;
+	enum eth_rx_cqe_type cqe_type;
+	union eth_rx_cqe *cqe;
+	struct sw_rx_data *bd;
+	struct sk_buff *skb;
+	__le16 flags;
+	u8 csum_flag;
+	u32 rx_hash;
+
+	/* Get the CQE from the completion ring */
+	cqe = (union eth_rx_cqe *)qed_chain_consume(&rxq->rx_comp_ring);
+	cqe_type = cqe->fast_path_regular.type;
+
+	/* Process an unlikely slowpath event */
+	if (unlikely(cqe_type == ETH_RX_CQE_TYPE_SLOW_PATH)) {
+		struct eth_slow_path_rx_cqe *sp_cqe;
+
+		sp_cqe = (struct eth_slow_path_rx_cqe *)cqe;
+		edev->ops->eth_cqe_completion(edev->cdev, fp->id, sp_cqe);
+		return 0;
+	}
+
+	/* Handle TPA cqes */
+	if (cqe_type != ETH_RX_CQE_TYPE_REGULAR)
+		return qede_rx_process_tpa_cqe(edev, fp, rxq, cqe, cqe_type);
+
+	/* Get the data from the SW ring; Consume it only after it's evident
+	 * we wouldn't recycle it.
+	 */
+	bd_cons_idx = rxq->sw_rx_cons & NUM_RX_BDS_MAX;
+	bd = &rxq->sw_rx_ring[bd_cons_idx];
+
+	fp_cqe = &cqe->fast_path_regular;
+	len = le16_to_cpu(fp_cqe->len_on_first_bd);
+	pad = fp_cqe->placement_offset;
+
+	/* If this is an error packet then drop it */
+	flags = cqe->fast_path_regular.pars_flags.flags;
+	parse_flag = le16_to_cpu(flags);
+
+	csum_flag = qede_check_csum(parse_flag);
+	if (unlikely(csum_flag == QEDE_CSUM_ERROR)) {
+		if (qede_pkt_is_ip_fragmented(fp_cqe, parse_flag)) {
+			rxq->rx_ip_frags++;
 		} else {
-			struct skb_frag_struct *frag;
-			unsigned int pull_len;
-			unsigned char *va;
-
-			frag = &skb_shinfo(skb)->frags[0];
-
-			skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, data,
-					pad + sw_rx_data->page_offset,
-					len, rxq->rx_buf_seg_size);
-
-			va = skb_frag_address(frag);
-			pull_len = eth_get_headlen(va, QEDE_RX_HDR_SIZE);
-
-			/* Align the pull_len to optimize memcpy */
-			memcpy(skb->data, va, ALIGN(pull_len, sizeof(long)));
-
-			skb_frag_size_sub(frag, pull_len);
-			frag->page_offset += pull_len;
-			skb->data_len -= pull_len;
-			skb->tail += pull_len;
-
-			if (unlikely(qede_realloc_rx_buffer(edev, rxq,
-							    sw_rx_data))) {
-				DP_ERR(edev, "Failed to allocate rx buffer\n");
-				/* Incr page ref count to reuse on allocation
-				 * failure so that it doesn't get freed while
-				 * freeing SKB.
-				 */
-
-				page_ref_inc(sw_rx_data->data);
-				rxq->rx_alloc_errors++;
-				qede_recycle_rx_bd_ring(rxq, edev,
-							fp_cqe->bd_num);
-				dev_kfree_skb_any(skb);
-				goto next_cqe;
-			}
+			DP_NOTICE(edev,
+				  "CQE has error, flags = %x, dropping incoming packet\n",
+				  parse_flag);
+			rxq->rx_hw_errors++;
+			qede_recycle_rx_bd_ring(rxq, edev, fp_cqe->bd_num);
+			return 0;
 		}
+	}
 
-		qede_rx_bd_ring_consume(rxq);
+	/* Basic validation passed; Need to prepare an SKB. This would also
+	 * guarantee to finally consume the first BD upon success.
+	 */
+	skb = qede_rx_allocate_skb(edev, rxq, bd, len, pad);
+	if (!skb) {
+		rxq->rx_alloc_errors++;
+		qede_recycle_rx_bd_ring(rxq, edev, fp_cqe->bd_num);
+		return 0;
+	}
 
-		if (fp_cqe->bd_num != 1) {
-			u16 pkt_len = le16_to_cpu(fp_cqe->pkt_len);
-			u8 num_frags;
-
-			pkt_len -= len;
-
-			for (num_frags = fp_cqe->bd_num - 1; num_frags > 0;
-			     num_frags--) {
-				u16 cur_size = pkt_len > rxq->rx_buf_size ?
-						rxq->rx_buf_size : pkt_len;
-				if (unlikely(!cur_size)) {
-					DP_ERR(edev,
-					       "Still got %d BDs for mapping jumbo, but length became 0\n",
-					       num_frags);
-					qede_recycle_rx_bd_ring(rxq, edev,
-								num_frags);
-					dev_kfree_skb_any(skb);
-					goto next_cqe;
-				}
-
-				if (unlikely(qede_alloc_rx_buffer(edev, rxq))) {
-					qede_recycle_rx_bd_ring(rxq, edev,
-								num_frags);
-					dev_kfree_skb_any(skb);
-					goto next_cqe;
-				}
-
-				sw_rx_index = rxq->sw_rx_cons & NUM_RX_BDS_MAX;
-				sw_rx_data = &rxq->sw_rx_ring[sw_rx_index];
-				qede_rx_bd_ring_consume(rxq);
-
-				dma_unmap_page(&edev->pdev->dev,
-					       sw_rx_data->mapping,
-					       PAGE_SIZE, DMA_FROM_DEVICE);
-
-				skb_fill_page_desc(skb,
-						   skb_shinfo(skb)->nr_frags++,
-						   sw_rx_data->data, 0,
-						   cur_size);
-
-				skb->truesize += PAGE_SIZE;
-				skb->data_len += cur_size;
-				skb->len += cur_size;
-				pkt_len -= cur_size;
-			}
+	/* In case of Jumbo packet, several PAGE_SIZEd buffers will be pointed
+	 * by a single cqe.
+	 */
+	if (fp_cqe->bd_num > 1) {
+		u16 unmapped_frags = qede_rx_build_jumbo(edev, rxq, skb,
+							 fp_cqe, len);
 
-			if (unlikely(pkt_len))
-				DP_ERR(edev,
-				       "Mapped all BDs of jumbo, but still have %d bytes\n",
-				       pkt_len);
+		if (unlikely(unmapped_frags > 0)) {
+			qede_recycle_rx_bd_ring(rxq, edev, unmapped_frags);
+			dev_kfree_skb_any(skb);
+			return 0;
 		}
+	}
 
-		skb->protocol = eth_type_trans(skb, edev->ndev);
+	/* The SKB contains all the data. Now prepare meta-magic */
+	skb->protocol = eth_type_trans(skb, edev->ndev);
+	rx_hash = qede_get_rxhash(edev, fp_cqe->bitfields,
+				  fp_cqe->rss_hash, &rxhash_type);
+	skb_set_hash(skb, rx_hash, rxhash_type);
+	qede_set_skb_csum(skb, csum_flag);
+	skb_record_rx_queue(skb, rxq->rxq_id);
 
-		rx_hash = qede_get_rxhash(edev, fp_cqe->bitfields,
-					  fp_cqe->rss_hash, &rxhash_type);
+	/* SKB is prepared - pass it to stack */
+	qede_skb_receive(edev, fp, skb, le16_to_cpu(fp_cqe->vlan_tag));
 
-		skb_set_hash(skb, rx_hash, rxhash_type);
+	return 1;
+}
 
-		qede_set_skb_csum(skb, csum_flag);
+static int qede_rx_int(struct qede_fastpath *fp, int budget)
+{
+	struct qede_rx_queue *rxq = fp->rxq;
+	struct qede_dev *edev = fp->edev;
+	u16 hw_comp_cons, sw_comp_cons;
+	int work_done = 0;
 
-		skb_record_rx_queue(skb, fp->rxq->rxq_id);
+	hw_comp_cons = le16_to_cpu(*rxq->hw_cons_ptr);
+	sw_comp_cons = qed_chain_get_cons_idx(&rxq->rx_comp_ring);
 
-		qede_skb_receive(edev, fp, skb, le16_to_cpu(fp_cqe->vlan_tag));
-next_rx_only:
-		rx_pkt++;
+	/* Memory barrier to prevent the CPU from doing speculative reads of CQE
+	 * / BD in the while-loop before reading hw_comp_cons. If the CQE is
+	 * read before it is written by FW, then FW writes CQE and SB, and then
+	 * the CPU reads the hw_comp_cons, it will use an old CQE.
+	 */
+	rmb();
 
-next_cqe: /* don't consume bd rx buffer */
+	/* Loop to complete all indicated BDs */
+	while ((sw_comp_cons != hw_comp_cons) && (work_done < budget)) {
+		qede_rx_process_cqe(edev, fp, rxq);
 		qed_chain_recycle_consumed(&rxq->rx_comp_ring);
 		sw_comp_cons = qed_chain_get_cons_idx(&rxq->rx_comp_ring);
-		/* CR TPA - revisit how to handle budget in TPA perhaps
-		 * increase on "end"
-		 */
-		if (rx_pkt == budget)
-			break;
-	} /* repeat while sw_comp_cons != hw_comp_cons... */
+		work_done++;
+	}
 
 	/* Update producers */
 	qede_update_rx_prod(edev, rxq);
 
-	rxq->rcv_pkts += rx_pkt;
+	return work_done;
+}
+
+static bool qede_poll_is_more_work(struct qede_fastpath *fp)
+{
+	qed_sb_update_sb_idx(fp->sb_info);
 
-	return rx_pkt;
+	/* *_has_*_work() reads the status block, thus we need to ensure that
+	 * status block indices have been actually read (qed_sb_update_sb_idx)
+	 * prior to this check (*_has_*_work) so that we won't write the
+	 * "newer" value of the status block to HW (if there was a DMA right
+	 * after qede_has_rx_work and if there is no rmb, the memory reading
+	 * (qed_sb_update_sb_idx) may be postponed to right before *_ack_sb).
+	 * In this case there will never be another interrupt until there is
+	 * another update of the status block, while there is still unhandled
+	 * work.
+	 */
+	rmb();
+
+	if (likely(fp->type & QEDE_FASTPATH_RX))
+		if (qede_has_rx_work(fp->rxq))
+			return true;
+
+	if (likely(fp->type & QEDE_FASTPATH_TX))
+		if (qede_txq_has_work(fp->txq))
+			return true;
+
+	return false;
 }
 
 static int qede_poll(struct napi_struct *napi, int budget)
@@ -1631,32 +1692,11 @@ static int qede_poll(struct napi_struct *napi, int budget)
 			qede_has_rx_work(fp->rxq)) ?
 			qede_rx_int(fp, budget) : 0;
 	if (rx_work_done < budget) {
-		qed_sb_update_sb_idx(fp->sb_info);
-		/* *_has_*_work() reads the status block,
-		 * thus we need to ensure that status block indices
-		 * have been actually read (qed_sb_update_sb_idx)
-		 * prior to this check (*_has_*_work) so that
-		 * we won't write the "newer" value of the status block
-		 * to HW (if there was a DMA right after
-		 * qede_has_rx_work and if there is no rmb, the memory
-		 * reading (qed_sb_update_sb_idx) may be postponed
-		 * to right before *_ack_sb). In this case there
-		 * will never be another interrupt until there is
-		 * another update of the status block, while there
-		 * is still unhandled work.
-		 */
-		rmb();
-
-		/* Fall out from the NAPI loop if needed */
-		if (!((likely(fp->type & QEDE_FASTPATH_RX) &&
-		       qede_has_rx_work(fp->rxq)) ||
-		      (likely(fp->type & QEDE_FASTPATH_TX) &&
-		       qede_txq_has_work(fp->txq)))) {
+		if (!qede_poll_is_more_work(fp)) {
 			napi_complete(napi);
 
 			/* Update and reenable interrupts */
-			qed_sb_ack(fp->sb_info, IGU_INT_ENABLE,
-				   1 /*update*/);
+			qed_sb_ack(fp->sb_info, IGU_INT_ENABLE, 1);
 		} else {
 			rx_work_done = budget;
 		}
-- 
1.9.3

^ permalink raw reply related

* [PATCH v2 net-next 03/11] qede: Remove 'num_tc'.
From: Yuval Mintz @ 2016-11-29 14:47 UTC (permalink / raw)
  To: davem, netdev; +Cc: Yuval Mintz
In-Reply-To: <1480430830-17671-1-git-send-email-Yuval.Mintz@cavium.com>

Driver currently doesn't support multi-CoS, but it contains logic
where multiple transmission queues could be theoretically manipulated.
No point in maintaining the infrastructure at the moment.

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
---
 drivers/net/ethernet/qlogic/qede/qede.h         |  17 +--
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c |  40 +++----
 drivers/net/ethernet/qlogic/qede/qede_main.c    | 132 ++++++++----------------
 3 files changed, 62 insertions(+), 127 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index 1d4c7e0..b9584b2 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -127,10 +127,9 @@ struct qede_dev {
 
 	const struct qed_eth_ops	*ops;
 
-	struct qed_dev_eth_info	dev_info;
+	struct qed_dev_eth_info dev_info;
 #define QEDE_MAX_RSS_CNT(edev)	((edev)->dev_info.num_queues)
-#define QEDE_MAX_TSS_CNT(edev)	((edev)->dev_info.num_queues * \
-				 (edev)->dev_info.num_tc)
+#define QEDE_MAX_TSS_CNT(edev)	((edev)->dev_info.num_queues)
 
 	struct qede_fastpath		*fp_array;
 	u8				req_num_tx;
@@ -139,17 +138,9 @@ struct qede_dev {
 	u8				fp_num_rx;
 	u16				req_queues;
 	u16				num_queues;
-	u8				num_tc;
 #define QEDE_QUEUE_CNT(edev)	((edev)->num_queues)
 #define QEDE_RSS_COUNT(edev)	((edev)->num_queues - (edev)->fp_num_tx)
-#define QEDE_TSS_COUNT(edev)	(((edev)->num_queues - (edev)->fp_num_rx) * \
-				 (edev)->num_tc)
-#define QEDE_TX_IDX(edev, txqidx)	((edev)->fp_num_rx + (txqidx) % \
-					 QEDE_TSS_COUNT(edev))
-#define QEDE_TC_IDX(edev, txqidx)	((txqidx) / QEDE_TSS_COUNT(edev))
-#define QEDE_TX_QUEUE(edev, txqidx)	\
-	(&(edev)->fp_array[QEDE_TX_IDX((edev), (txqidx))].txqs[QEDE_TC_IDX(\
-							(edev), (txqidx))])
+#define QEDE_TSS_COUNT(edev)	((edev)->num_queues - (edev)->fp_num_rx)
 
 	struct qed_int_info		int_info;
 	unsigned char			primary_mac[ETH_ALEN];
@@ -324,7 +315,7 @@ struct qede_fastpath {
 	struct napi_struct	napi;
 	struct qed_sb_info	*sb_info;
 	struct qede_rx_queue	*rxq;
-	struct qede_tx_queue	*txqs;
+	struct qede_tx_queue	*txq;
 
 #define VEC_NAME_SIZE	(sizeof(((struct net_device *)0)->name) + 8)
 	char	name[VEC_NAME_SIZE];
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index 8a3debe..a892843 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -59,8 +59,8 @@
 	QEDE_TQSTAT(stopped_cnt),
 };
 
-#define QEDE_TQSTATS_DATA(dev, sindex, tssid, tcid) \
-	(*((u64 *)(((void *)(&dev->fp_array[tssid].txqs[tcid])) +\
+#define QEDE_TQSTATS_DATA(dev, sindex, tssid) \
+	(*((u64 *)(((void *)((dev)->fp_array[tssid].txq)) + \
 		   qede_tqstats_arr[(sindex)].offset)))
 
 static const struct {
@@ -175,7 +175,6 @@ static void qede_get_strings_stats(struct qede_dev *edev, u8 *buf)
 	int i, j, k;
 
 	for (i = 0, k = 0; i < QEDE_QUEUE_CNT(edev); i++) {
-		int tc;
 
 		if (edev->fp_array[i].type & QEDE_FASTPATH_RX) {
 			for (j = 0; j < QEDE_NUM_RQSTATS; j++)
@@ -186,14 +185,12 @@ static void qede_get_strings_stats(struct qede_dev *edev, u8 *buf)
 		}
 
 		if (edev->fp_array[i].type & QEDE_FASTPATH_TX) {
-			for (tc = 0; tc < edev->num_tc; tc++) {
-				for (j = 0; j < QEDE_NUM_TQSTATS; j++)
-					sprintf(buf + (k + j) *
-						ETH_GSTRING_LEN,
-						"%d.%d: %s", i, tc,
-						qede_tqstats_arr[j].string);
-				k += QEDE_NUM_TQSTATS;
-			}
+			for (j = 0; j < QEDE_NUM_TQSTATS; j++)
+				sprintf(buf + (k + j) *
+					ETH_GSTRING_LEN,
+					"%d: %s", i,
+					qede_tqstats_arr[j].string);
+			k += QEDE_NUM_TQSTATS;
 		}
 	}
 
@@ -240,21 +237,16 @@ static void qede_get_ethtool_stats(struct net_device *dev,
 	mutex_lock(&edev->qede_lock);
 
 	for (qid = 0; qid < QEDE_QUEUE_CNT(edev); qid++) {
-		int tc;
 
-		if (edev->fp_array[qid].type & QEDE_FASTPATH_RX) {
+		if (edev->fp_array[qid].type & QEDE_FASTPATH_RX)
 			for (sidx = 0; sidx < QEDE_NUM_RQSTATS; sidx++)
 				buf[cnt++] = QEDE_RQSTATS_DATA(edev, sidx, qid);
-		}
 
-		if (edev->fp_array[qid].type & QEDE_FASTPATH_TX) {
-			for (tc = 0; tc < edev->num_tc; tc++) {
-				for (sidx = 0; sidx < QEDE_NUM_TQSTATS; sidx++)
-					buf[cnt++] = QEDE_TQSTATS_DATA(edev,
-								       sidx,
-								       qid, tc);
-			}
-		}
+		if (edev->fp_array[qid].type & QEDE_FASTPATH_TX)
+			for (sidx = 0; sidx < QEDE_NUM_TQSTATS; sidx++)
+				buf[cnt++] = QEDE_TQSTATS_DATA(edev,
+							       sidx,
+							       qid);
 	}
 
 	for (sidx = 0; sidx < QEDE_NUM_STATS; sidx++) {
@@ -281,7 +273,7 @@ static int qede_get_sset_count(struct net_device *dev, int stringset)
 					num_stats--;
 		}
 		return num_stats + QEDE_RSS_COUNT(edev) * QEDE_NUM_RQSTATS +
-		       QEDE_TSS_COUNT(edev) * QEDE_NUM_TQSTATS * edev->num_tc;
+		       QEDE_TSS_COUNT(edev) * QEDE_NUM_TQSTATS;
 	case ETH_SS_PRIV_FLAGS:
 		return QEDE_PRI_FLAG_LEN;
 	case ETH_SS_TEST:
@@ -1178,7 +1170,7 @@ static int qede_selftest_transmit_traffic(struct qede_dev *edev,
 
 	for_each_queue(i) {
 		if (edev->fp_array[i].type & QEDE_FASTPATH_TX) {
-			txq = edev->fp_array[i].txqs;
+			txq = edev->fp_array[i].txq;
 			break;
 		}
 	}
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 653be22..2006dd4 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -547,7 +547,7 @@ static netdev_tx_t qede_start_xmit(struct sk_buff *skb,
 	/* Get tx-queue context and netdev index */
 	txq_index = skb_get_queue_mapping(skb);
 	WARN_ON(txq_index >= QEDE_TSS_COUNT(edev));
-	txq = QEDE_TX_QUEUE(edev, txq_index);
+	txq = edev->fp_array[edev->fp_num_rx + txq_index].txq;
 	netdev_txq = netdev_get_tx_queue(ndev, txq_index);
 
 	WARN_ON(qed_chain_get_elem_left(&txq->tx_pbl) < (MAX_SKB_FRAGS + 1));
@@ -881,16 +881,6 @@ bool qede_has_rx_work(struct qede_rx_queue *rxq)
 	return hw_comp_cons != sw_comp_cons;
 }
 
-static bool qede_has_tx_work(struct qede_fastpath *fp)
-{
-	u8 tc;
-
-	for (tc = 0; tc < fp->edev->num_tc; tc++)
-		if (qede_txq_has_work(&fp->txqs[tc]))
-			return true;
-	return false;
-}
-
 static inline void qede_rx_bd_ring_consume(struct qede_rx_queue *rxq)
 {
 	qed_chain_consume(&rxq->rx_bd_ring);
@@ -1633,12 +1623,9 @@ static int qede_poll(struct napi_struct *napi, int budget)
 						napi);
 	struct qede_dev *edev = fp->edev;
 	int rx_work_done = 0;
-	u8 tc;
 
-	for (tc = 0; tc < edev->num_tc; tc++)
-		if (likely(fp->type & QEDE_FASTPATH_TX) &&
-		    qede_txq_has_work(&fp->txqs[tc]))
-			qede_tx_int(edev, &fp->txqs[tc]);
+	if (likely(fp->type & QEDE_FASTPATH_TX) && qede_txq_has_work(fp->txq))
+		qede_tx_int(edev, fp->txq);
 
 	rx_work_done = (likely(fp->type & QEDE_FASTPATH_RX) &&
 			qede_has_rx_work(fp->rxq)) ?
@@ -1664,7 +1651,7 @@ static int qede_poll(struct napi_struct *napi, int budget)
 		if (!((likely(fp->type & QEDE_FASTPATH_RX) &&
 		       qede_has_rx_work(fp->rxq)) ||
 		      (likely(fp->type & QEDE_FASTPATH_TX) &&
-		       qede_has_tx_work(fp)))) {
+		       qede_txq_has_work(fp->txq)))) {
 			napi_complete(napi);
 
 			/* Update and reenable interrupts */
@@ -2330,8 +2317,6 @@ static struct qede_dev *qede_alloc_etherdev(struct qed_dev *cdev,
 	memset(&edev->stats, 0, sizeof(edev->stats));
 	memcpy(&edev->dev_info, info, sizeof(*info));
 
-	edev->num_tc = edev->dev_info.num_tc;
-
 	INIT_LIST_HEAD(&edev->vlan_list);
 
 	return edev;
@@ -2429,7 +2414,7 @@ static void qede_free_fp_array(struct qede_dev *edev)
 
 			kfree(fp->sb_info);
 			kfree(fp->rxq);
-			kfree(fp->txqs);
+			kfree(fp->txq);
 		}
 		kfree(edev->fp_array);
 	}
@@ -2462,7 +2447,7 @@ static int qede_alloc_fp_array(struct qede_dev *edev)
 	for_each_queue(i) {
 		fp = &edev->fp_array[i];
 
-		fp->sb_info = kcalloc(1, sizeof(*fp->sb_info), GFP_KERNEL);
+		fp->sb_info = kzalloc(sizeof(*fp->sb_info), GFP_KERNEL);
 		if (!fp->sb_info) {
 			DP_NOTICE(edev, "sb info struct allocation failed\n");
 			goto err;
@@ -2479,22 +2464,15 @@ static int qede_alloc_fp_array(struct qede_dev *edev)
 		}
 
 		if (fp->type & QEDE_FASTPATH_TX) {
-			fp->txqs = kcalloc(edev->num_tc, sizeof(*fp->txqs),
-					   GFP_KERNEL);
-			if (!fp->txqs) {
-				DP_NOTICE(edev,
-					  "TXQ array allocation failed\n");
+			fp->txq = kzalloc(sizeof(*fp->txq), GFP_KERNEL);
+			if (!fp->txq)
 				goto err;
-			}
 		}
 
 		if (fp->type & QEDE_FASTPATH_RX) {
-			fp->rxq = kcalloc(1, sizeof(*fp->rxq), GFP_KERNEL);
-			if (!fp->rxq) {
-				DP_NOTICE(edev,
-					  "RXQ struct allocation failed\n");
+			fp->rxq = kzalloc(sizeof(*fp->rxq), GFP_KERNEL);
+			if (!fp->rxq)
 				goto err;
-			}
 		}
 	}
 
@@ -3031,16 +3009,13 @@ static int qede_alloc_mem_txq(struct qede_dev *edev, struct qede_tx_queue *txq)
 /* This function frees all memory of a single fp */
 static void qede_free_mem_fp(struct qede_dev *edev, struct qede_fastpath *fp)
 {
-	int tc;
-
 	qede_free_mem_sb(edev, fp->sb_info);
 
 	if (fp->type & QEDE_FASTPATH_RX)
 		qede_free_mem_rxq(edev, fp->rxq);
 
 	if (fp->type & QEDE_FASTPATH_TX)
-		for (tc = 0; tc < edev->num_tc; tc++)
-			qede_free_mem_txq(edev, &fp->txqs[tc]);
+		qede_free_mem_txq(edev, fp->txq);
 }
 
 /* This function allocates all memory needed for a single fp (i.e. an entity
@@ -3048,7 +3023,7 @@ static void qede_free_mem_fp(struct qede_dev *edev, struct qede_fastpath *fp)
  */
 static int qede_alloc_mem_fp(struct qede_dev *edev, struct qede_fastpath *fp)
 {
-	int rc, tc;
+	int rc;
 
 	rc = qede_alloc_mem_sb(edev, fp->sb_info, fp->id);
 	if (rc)
@@ -3061,11 +3036,9 @@ static int qede_alloc_mem_fp(struct qede_dev *edev, struct qede_fastpath *fp)
 	}
 
 	if (fp->type & QEDE_FASTPATH_TX) {
-		for (tc = 0; tc < edev->num_tc; tc++) {
-			rc = qede_alloc_mem_txq(edev, &fp->txqs[tc]);
-			if (rc)
-				goto err;
-		}
+		rc = qede_alloc_mem_txq(edev, fp->txq);
+		if (rc)
+			goto err;
 	}
 
 	return 0;
@@ -3108,7 +3081,7 @@ static int qede_alloc_mem_load(struct qede_dev *edev)
 /* This function inits fp content and resets the SB, RXQ and TXQ structures */
 static void qede_init_fp(struct qede_dev *edev)
 {
-	int queue_id, rxq_index = 0, txq_index = 0, tc;
+	int queue_id, rxq_index = 0, txq_index = 0;
 	struct qede_fastpath *fp;
 
 	for_each_queue(queue_id) {
@@ -3117,25 +3090,15 @@ static void qede_init_fp(struct qede_dev *edev)
 		fp->edev = edev;
 		fp->id = queue_id;
 
-		memset((void *)&fp->napi, 0, sizeof(fp->napi));
-
-		memset((void *)fp->sb_info, 0, sizeof(*fp->sb_info));
 
 		if (fp->type & QEDE_FASTPATH_RX) {
-			memset((void *)fp->rxq, 0, sizeof(*fp->rxq));
 			fp->rxq->rxq_id = rxq_index++;
 		}
 
 		if (fp->type & QEDE_FASTPATH_TX) {
-			memset((void *)fp->txqs, 0,
-			       (edev->num_tc * sizeof(*fp->txqs)));
-			for (tc = 0; tc < edev->num_tc; tc++) {
-				fp->txqs[tc].index = txq_index +
-				    tc * QEDE_TSS_COUNT(edev);
-				if (edev->dev_info.is_legacy)
-					fp->txqs[tc].is_legacy = true;
-			}
-			txq_index++;
+			fp->txq->index = txq_index++;
+			if (edev->dev_info.is_legacy)
+				fp->txq->is_legacy = 1;
 		}
 
 		snprintf(fp->name, sizeof(fp->name), "%s-fp-%d",
@@ -3307,7 +3270,8 @@ static int qede_stop_queues(struct qede_dev *edev)
 {
 	struct qed_update_vport_params vport_update_params;
 	struct qed_dev *cdev = edev->cdev;
-	int rc, tc, i;
+	struct qede_fastpath *fp;
+	int rc, i;
 
 	/* Disable the vport */
 	memset(&vport_update_params, 0, sizeof(vport_update_params));
@@ -3324,16 +3288,12 @@ static int qede_stop_queues(struct qede_dev *edev)
 
 	/* Flush Tx queues. If needed, request drain from MCP */
 	for_each_queue(i) {
-		struct qede_fastpath *fp = &edev->fp_array[i];
+		fp = &edev->fp_array[i];
 
 		if (fp->type & QEDE_FASTPATH_TX) {
-			for (tc = 0; tc < edev->num_tc; tc++) {
-				struct qede_tx_queue *txq = &fp->txqs[tc];
-
-				rc = qede_drain_txq(edev, txq, true);
-				if (rc)
-					return rc;
-			}
+			rc = qede_drain_txq(edev, fp->txq, true);
+			if (rc)
+				return rc;
 		}
 	}
 
@@ -3341,29 +3301,24 @@ static int qede_stop_queues(struct qede_dev *edev)
 	for (i = QEDE_QUEUE_CNT(edev) - 1; i >= 0; i--) {
 		struct qed_stop_rxq_params rx_params;
 
+		fp = &edev->fp_array[i];
+
 		/* Stop the Tx Queue(s) */
-		if (edev->fp_array[i].type & QEDE_FASTPATH_TX) {
-			for (tc = 0; tc < edev->num_tc; tc++) {
-				struct qed_stop_txq_params tx_params;
-				u8 val;
-
-				tx_params.rss_id = i;
-				val = edev->fp_array[i].txqs[tc].index;
-				tx_params.tx_queue_id = val;
+		if (fp->type & QEDE_FASTPATH_TX) {
+			struct qed_stop_txq_params tx_params;
+
+			tx_params.rss_id = i;
+			tx_params.tx_queue_id = fp->txq->index;
 				rc = edev->ops->q_tx_stop(cdev, &tx_params);
-				if (rc) {
-					DP_ERR(edev, "Failed to stop TXQ #%d\n",
-					       tx_params.tx_queue_id);
+				if (rc)
 					return rc;
-				}
-			}
 		}
 
 		/* Stop the Rx Queue */
-		if (edev->fp_array[i].type & QEDE_FASTPATH_RX) {
+		if (fp->type & QEDE_FASTPATH_RX) {
 			memset(&rx_params, 0, sizeof(rx_params));
 			rx_params.rss_id = i;
-			rx_params.rx_queue_id = edev->fp_array[i].rxq->rxq_id;
+			rx_params.rx_queue_id = fp->rxq->rxq_id;
 
 			rc = edev->ops->q_rx_stop(cdev, &rx_params);
 			if (rc) {
@@ -3383,7 +3338,6 @@ static int qede_stop_queues(struct qede_dev *edev)
 
 static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 {
-	int rc, tc, i;
 	int vlan_removal_en = 1;
 	struct qed_dev *cdev = edev->cdev;
 	struct qed_update_vport_params vport_update_params;
@@ -3391,6 +3345,7 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 	struct qed_dev_info *qed_info = &edev->dev_info.common;
 	struct qed_start_vport_params start = {0};
 	bool reset_rss_indir = false;
+	int rc, i;
 
 	if (!edev->num_queues) {
 		DP_ERR(edev,
@@ -3454,11 +3409,8 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 			qede_update_rx_prod(edev, rxq);
 		}
 
-		if (!(fp->type & QEDE_FASTPATH_TX))
-			continue;
-
-		for (tc = 0; tc < edev->num_tc; tc++) {
-			struct qede_tx_queue *txq = &fp->txqs[tc];
+		if (fp->type & QEDE_FASTPATH_TX) {
+			struct qede_tx_queue *txq = fp->txq;
 
 			p_phys_table = qed_chain_get_pbl_phys(&txq->tx_pbl);
 			page_cnt = qed_chain_get_page_cnt(&txq->tx_pbl);
@@ -3468,7 +3420,7 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 			q_params.queue_id = txq->index;
 			q_params.vport_id = 0;
 			q_params.sb = fp->sb_info->igu_sb_id;
-			q_params.sb_idx = TX_PI(tc);
+			q_params.sb_idx = TX_PI(0);
 
 			rc = edev->ops->q_tx_start(cdev, &q_params,
 						   p_phys_table, page_cnt,
@@ -3480,7 +3432,7 @@ static int qede_start_queues(struct qede_dev *edev, bool clear_stats)
 			}
 
 			txq->hw_cons_ptr =
-				&fp->sb_info->sb_virt->pi_array[TX_PI(tc)];
+				&fp->sb_info->sb_virt->pi_array[TX_PI(0)];
 			SET_FIELD(txq->tx_db.data.params,
 				  ETH_DB_DATA_DEST, DB_DEST_XCM);
 			SET_FIELD(txq->tx_db.data.params, ETH_DB_DATA_AGG_CMD,
@@ -3654,8 +3606,8 @@ static int qede_load(struct qede_dev *edev, enum qede_load_mode mode)
 	rc = qede_alloc_mem_load(edev);
 	if (rc)
 		goto err1;
-	DP_INFO(edev, "Allocated %d RSS queues on %d TC/s\n",
-		QEDE_QUEUE_CNT(edev), edev->num_tc);
+	DP_INFO(edev, "Allocated %d Rx, %d Tx queues\n",
+		QEDE_RSS_COUNT(edev), QEDE_TSS_COUNT(edev));
 
 	rc = qede_set_real_num_queues(edev);
 	if (rc)
-- 
1.9.3

^ permalink raw reply related

* [PATCH v2 net-next 02/11] qed: Optimize qed_chain datapath usage
From: Yuval Mintz @ 2016-11-29 14:47 UTC (permalink / raw)
  To: davem, netdev; +Cc: Yuval Mintz
In-Reply-To: <1480430830-17671-1-git-send-email-Yuval.Mintz@cavium.com>

The chain structure and functions are widely used by the qed* modules,
both for configuration and datapath.
E.g., qede's Tx has one such chain and its Rx has two.

Currently, the strucutre's fields which are required for datapath
related functions [produce/consume] are intertwined with fields which
are required only for configuration purposes [init/destroy/etc.].

This patch re-arranges the chain structure so that all the fields which
are required for datapath usage could reside in a single cacheline instead
of the two which are required today.

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
---
 drivers/net/ethernet/qlogic/qed/qed_dev.c         |   7 +-
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c |   4 +-
 include/linux/qed/qed_chain.h                     | 144 ++++++++++++----------
 3 files changed, 86 insertions(+), 69 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 5be7b8a..80162ee 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -2283,12 +2283,12 @@ static void qed_chain_free_pbl(struct qed_dev *cdev, struct qed_chain *p_chain)
 {
 	void **pp_virt_addr_tbl = p_chain->pbl.pp_virt_addr_tbl;
 	u32 page_cnt = p_chain->page_cnt, i, pbl_size;
-	u8 *p_pbl_virt = p_chain->pbl.p_virt_table;
+	u8 *p_pbl_virt = p_chain->pbl_sp.p_virt_table;
 
 	if (!pp_virt_addr_tbl)
 		return;
 
-	if (!p_chain->pbl.p_virt_table)
+	if (!p_pbl_virt)
 		goto out;
 
 	for (i = 0; i < page_cnt; i++) {
@@ -2306,7 +2306,8 @@ static void qed_chain_free_pbl(struct qed_dev *cdev, struct qed_chain *p_chain)
 	pbl_size = page_cnt * QED_CHAIN_PBL_ENTRY_SIZE;
 	dma_free_coherent(&cdev->pdev->dev,
 			  pbl_size,
-			  p_chain->pbl.p_virt_table, p_chain->pbl.p_phys_table);
+			  p_chain->pbl_sp.p_virt_table,
+			  p_chain->pbl_sp.p_phys_table);
 out:
 	vfree(p_chain->pbl.pp_virt_addr_tbl);
 }
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
index 2888eb0..d0a5828 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sp_commands.c
@@ -347,11 +347,11 @@ int qed_sp_pf_start(struct qed_hwfn *p_hwfn,
 
 	/* Place EQ address in RAMROD */
 	DMA_REGPAIR_LE(p_ramrod->event_ring_pbl_addr,
-		       p_hwfn->p_eq->chain.pbl.p_phys_table);
+		       p_hwfn->p_eq->chain.pbl_sp.p_phys_table);
 	page_cnt = (u8)qed_chain_get_page_cnt(&p_hwfn->p_eq->chain);
 	p_ramrod->event_ring_num_pages = page_cnt;
 	DMA_REGPAIR_LE(p_ramrod->consolid_q_pbl_addr,
-		       p_hwfn->p_consq->chain.pbl.p_phys_table);
+		       p_hwfn->p_consq->chain.pbl_sp.p_phys_table);
 
 	qed_tunn_set_pf_start_params(p_hwfn, p_tunn, &p_ramrod->tunnel_config);
 
diff --git a/include/linux/qed/qed_chain.h b/include/linux/qed/qed_chain.h
index 72d88cf..615f34c 100644
--- a/include/linux/qed/qed_chain.h
+++ b/include/linux/qed/qed_chain.h
@@ -56,23 +56,6 @@ struct qed_chain_pbl_u32 {
 	u32 cons_page_idx;
 };
 
-struct qed_chain_pbl {
-	/* Base address of a pre-allocated buffer for pbl */
-	dma_addr_t	p_phys_table;
-	void		*p_virt_table;
-
-	/* Table for keeping the virtual addresses of the chain pages,
-	 * respectively to the physical addresses in the pbl table.
-	 */
-	void **pp_virt_addr_tbl;
-
-	/* Index to current used page by producer/consumer */
-	union {
-		struct qed_chain_pbl_u16 pbl16;
-		struct qed_chain_pbl_u32 pbl32;
-	} u;
-};
-
 struct qed_chain_u16 {
 	/* Cyclic index of next element to produce/consme */
 	u16 prod_idx;
@@ -86,46 +69,78 @@ struct qed_chain_u32 {
 };
 
 struct qed_chain {
-	void			*p_virt_addr;
-	dma_addr_t		p_phys_addr;
-	void			*p_prod_elem;
-	void			*p_cons_elem;
+	/* fastpath portion of the chain - required for commands such
+	 * as produce / consume.
+	 */
+	/* Point to next element to produce/consume */
+	void *p_prod_elem;
+	void *p_cons_elem;
+
+	/* Fastpath portions of the PBL [if exists] */
+	struct {
+		/* Table for keeping the virtual addresses of the chain pages,
+		 * respectively to the physical addresses in the pbl table.
+		 */
+		void **pp_virt_addr_tbl;
 
-	enum qed_chain_mode	mode;
-	enum qed_chain_use_mode intended_use; /* used to produce/consume */
-	enum qed_chain_cnt_type cnt_type;
+		union {
+			struct qed_chain_pbl_u16 u16;
+			struct qed_chain_pbl_u32 u32;
+		} c;
+	} pbl;
 
 	union {
 		struct qed_chain_u16 chain16;
 		struct qed_chain_u32 chain32;
 	} u;
 
+	/* Capacity counts only usable elements */
+	u32 capacity;
 	u32 page_cnt;
 
-	/* Number of elements - capacity is for usable elements only,
-	 * while size will contain total number of elements [for entire chain].
+	enum qed_chain_mode mode;
+
+	/* Elements information for fast calculations */
+	u16 elem_per_page;
+	u16 elem_per_page_mask;
+	u16 elem_size;
+	u16 next_page_mask;
+	u16 usable_per_page;
+	u8 elem_unusable;
+
+	u8 cnt_type;
+
+	/* Slowpath of the chain - required for initialization and destruction,
+	 * but isn't involved in regular functionality.
 	 */
-	u32 capacity;
+
+	/* Base address of a pre-allocated buffer for pbl */
+	struct {
+		dma_addr_t p_phys_table;
+		void *p_virt_table;
+	} pbl_sp;
+
+	/* Address of first page of the chain - the address is required
+	 * for fastpath operation [consume/produce] but only for the the SINGLE
+	 * flavour which isn't considered fastpath [== SPQ].
+	 */
+	void *p_virt_addr;
+	dma_addr_t p_phys_addr;
+
+	/* Total number of elements [for entire chain] */
 	u32 size;
 
-	/* Elements information for fast calculations */
-	u16			elem_per_page;
-	u16			elem_per_page_mask;
-	u16			elem_unusable;
-	u16			usable_per_page;
-	u16			elem_size;
-	u16			next_page_mask;
-	struct qed_chain_pbl	pbl;
+	u8 intended_use;
 };
 
 #define QED_CHAIN_PBL_ENTRY_SIZE        (8)
 #define QED_CHAIN_PAGE_SIZE             (0x1000)
 #define ELEMS_PER_PAGE(elem_size)       (QED_CHAIN_PAGE_SIZE / (elem_size))
 
-#define UNUSABLE_ELEMS_PER_PAGE(elem_size, mode)     \
-	((mode == QED_CHAIN_MODE_NEXT_PTR) ?	     \
-	 (1 + ((sizeof(struct qed_chain_next) - 1) / \
-	       (elem_size))) : 0)
+#define UNUSABLE_ELEMS_PER_PAGE(elem_size, mode)	 \
+	(((mode) == QED_CHAIN_MODE_NEXT_PTR) ?		 \
+	 (u8)(1 + ((sizeof(struct qed_chain_next) - 1) / \
+		   (elem_size))) : 0)
 
 #define USABLE_ELEMS_PER_PAGE(elem_size, mode) \
 	((u32)(ELEMS_PER_PAGE(elem_size) -     \
@@ -186,7 +201,7 @@ static inline u16 qed_chain_get_usable_per_page(struct qed_chain *p_chain)
 	return p_chain->usable_per_page;
 }
 
-static inline u16 qed_chain_get_unusable_per_page(struct qed_chain *p_chain)
+static inline u8 qed_chain_get_unusable_per_page(struct qed_chain *p_chain)
 {
 	return p_chain->elem_unusable;
 }
@@ -198,7 +213,7 @@ static inline u32 qed_chain_get_page_cnt(struct qed_chain *p_chain)
 
 static inline dma_addr_t qed_chain_get_pbl_phys(struct qed_chain *p_chain)
 {
-	return p_chain->pbl.p_phys_table;
+	return p_chain->pbl_sp.p_phys_table;
 }
 
 /**
@@ -214,10 +229,10 @@ static inline dma_addr_t qed_chain_get_pbl_phys(struct qed_chain *p_chain)
 static inline void
 qed_chain_advance_page(struct qed_chain *p_chain,
 		       void **p_next_elem, void *idx_to_inc, void *page_to_inc)
-
 {
 	struct qed_chain_next *p_next = NULL;
 	u32 page_index = 0;
+
 	switch (p_chain->mode) {
 	case QED_CHAIN_MODE_NEXT_PTR:
 		p_next = *p_next_elem;
@@ -305,7 +320,7 @@ static inline void *qed_chain_produce(struct qed_chain *p_chain)
 		if ((p_chain->u.chain16.prod_idx &
 		     p_chain->elem_per_page_mask) == p_chain->next_page_mask) {
 			p_prod_idx = &p_chain->u.chain16.prod_idx;
-			p_prod_page_idx = &p_chain->pbl.u.pbl16.prod_page_idx;
+			p_prod_page_idx = &p_chain->pbl.c.u16.prod_page_idx;
 			qed_chain_advance_page(p_chain, &p_chain->p_prod_elem,
 					       p_prod_idx, p_prod_page_idx);
 		}
@@ -314,7 +329,7 @@ static inline void *qed_chain_produce(struct qed_chain *p_chain)
 		if ((p_chain->u.chain32.prod_idx &
 		     p_chain->elem_per_page_mask) == p_chain->next_page_mask) {
 			p_prod_idx = &p_chain->u.chain32.prod_idx;
-			p_prod_page_idx = &p_chain->pbl.u.pbl32.prod_page_idx;
+			p_prod_page_idx = &p_chain->pbl.c.u32.prod_page_idx;
 			qed_chain_advance_page(p_chain, &p_chain->p_prod_elem,
 					       p_prod_idx, p_prod_page_idx);
 		}
@@ -378,7 +393,7 @@ static inline void *qed_chain_consume(struct qed_chain *p_chain)
 		if ((p_chain->u.chain16.cons_idx &
 		     p_chain->elem_per_page_mask) == p_chain->next_page_mask) {
 			p_cons_idx = &p_chain->u.chain16.cons_idx;
-			p_cons_page_idx = &p_chain->pbl.u.pbl16.cons_page_idx;
+			p_cons_page_idx = &p_chain->pbl.c.u16.cons_page_idx;
 			qed_chain_advance_page(p_chain, &p_chain->p_cons_elem,
 					       p_cons_idx, p_cons_page_idx);
 		}
@@ -387,8 +402,8 @@ static inline void *qed_chain_consume(struct qed_chain *p_chain)
 		if ((p_chain->u.chain32.cons_idx &
 		     p_chain->elem_per_page_mask) == p_chain->next_page_mask) {
 			p_cons_idx = &p_chain->u.chain32.cons_idx;
-			p_cons_page_idx = &p_chain->pbl.u.pbl32.cons_page_idx;
-		qed_chain_advance_page(p_chain, &p_chain->p_cons_elem,
+			p_cons_page_idx = &p_chain->pbl.c.u32.cons_page_idx;
+			qed_chain_advance_page(p_chain, &p_chain->p_cons_elem,
 					       p_cons_idx, p_cons_page_idx);
 		}
 		p_chain->u.chain32.cons_idx++;
@@ -429,25 +444,26 @@ static inline void qed_chain_reset(struct qed_chain *p_chain)
 		u32 reset_val = p_chain->page_cnt - 1;
 
 		if (is_chain_u16(p_chain)) {
-			p_chain->pbl.u.pbl16.prod_page_idx = (u16)reset_val;
-			p_chain->pbl.u.pbl16.cons_page_idx = (u16)reset_val;
+			p_chain->pbl.c.u16.prod_page_idx = (u16)reset_val;
+			p_chain->pbl.c.u16.cons_page_idx = (u16)reset_val;
 		} else {
-			p_chain->pbl.u.pbl32.prod_page_idx = reset_val;
-			p_chain->pbl.u.pbl32.cons_page_idx = reset_val;
+			p_chain->pbl.c.u32.prod_page_idx = reset_val;
+			p_chain->pbl.c.u32.cons_page_idx = reset_val;
 		}
 	}
 
 	switch (p_chain->intended_use) {
-	case QED_CHAIN_USE_TO_CONSUME_PRODUCE:
-	case QED_CHAIN_USE_TO_PRODUCE:
-		/* Do nothing */
-		break;
-
 	case QED_CHAIN_USE_TO_CONSUME:
 		/* produce empty elements */
 		for (i = 0; i < p_chain->capacity; i++)
 			qed_chain_recycle_consumed(p_chain);
 		break;
+
+	case QED_CHAIN_USE_TO_CONSUME_PRODUCE:
+	case QED_CHAIN_USE_TO_PRODUCE:
+	default:
+		/* Do nothing */
+		break;
 	}
 }
 
@@ -473,13 +489,13 @@ static inline void qed_chain_init_params(struct qed_chain *p_chain,
 	p_chain->p_virt_addr = NULL;
 	p_chain->p_phys_addr = 0;
 	p_chain->elem_size	= elem_size;
-	p_chain->intended_use = intended_use;
+	p_chain->intended_use = (u8)intended_use;
 	p_chain->mode		= mode;
-	p_chain->cnt_type = cnt_type;
+	p_chain->cnt_type = (u8)cnt_type;
 
-	p_chain->elem_per_page		= ELEMS_PER_PAGE(elem_size);
+	p_chain->elem_per_page = ELEMS_PER_PAGE(elem_size);
 	p_chain->usable_per_page = USABLE_ELEMS_PER_PAGE(elem_size, mode);
-	p_chain->elem_per_page_mask	= p_chain->elem_per_page - 1;
+	p_chain->elem_per_page_mask = p_chain->elem_per_page - 1;
 	p_chain->elem_unusable = UNUSABLE_ELEMS_PER_PAGE(elem_size, mode);
 	p_chain->next_page_mask = (p_chain->usable_per_page &
 				   p_chain->elem_per_page_mask);
@@ -488,8 +504,8 @@ static inline void qed_chain_init_params(struct qed_chain *p_chain,
 	p_chain->capacity = p_chain->usable_per_page * page_cnt;
 	p_chain->size = p_chain->elem_per_page * page_cnt;
 
-	p_chain->pbl.p_phys_table = 0;
-	p_chain->pbl.p_virt_table = NULL;
+	p_chain->pbl_sp.p_phys_table = 0;
+	p_chain->pbl_sp.p_virt_table = NULL;
 	p_chain->pbl.pp_virt_addr_tbl = NULL;
 }
 
@@ -530,8 +546,8 @@ static inline void qed_chain_init_pbl_mem(struct qed_chain *p_chain,
 					  dma_addr_t p_phys_pbl,
 					  void **pp_virt_addr_tbl)
 {
-	p_chain->pbl.p_phys_table = p_phys_pbl;
-	p_chain->pbl.p_virt_table = p_virt_pbl;
+	p_chain->pbl_sp.p_phys_table = p_phys_pbl;
+	p_chain->pbl_sp.p_virt_table = p_virt_pbl;
 	p_chain->pbl.pp_virt_addr_tbl = pp_virt_addr_tbl;
 }
 
-- 
1.9.3

^ permalink raw reply related

* [PATCH v2 net-next 01/11] qede: Optimize aggregation information size
From: Yuval Mintz @ 2016-11-29 14:47 UTC (permalink / raw)
  To: davem, netdev; +Cc: Yuval Mintz
In-Reply-To: <1480430830-17671-1-git-send-email-Yuval.Mintz@cavium.com>

Driver needs to maintain a structure per-each concurrent possible
open aggregation, but the structure storing that metadata is far from
being optimized - biggest waste in it is that there are 2 buffer metadata,
one for a replacement buffer when the aggregation begins and the other for
holding the first aggregation's buffer after it begins [as firmware might
still update it]. Those 2 can safely be united into a single metadata
structure.

struct qede_agg_info changes the following:

	/* size: 120, cachelines: 2, members: 9 */
	/* sum members: 114, holes: 1, sum holes: 4 */
	/* padding: 2 */
	/* paddings: 2, sum paddings: 8 */
	/* last cacheline: 56 bytes */
 -->
	/* size: 48, cachelines: 1, members: 9 */
	/* paddings: 1, sum paddings: 4 */
	/* last cacheline: 48 bytes */

Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
---
 drivers/net/ethernet/qlogic/qede/qede.h      | 29 +++++++++----
 drivers/net/ethernet/qlogic/qede/qede_main.c | 63 ++++++++++++----------------
 2 files changed, 48 insertions(+), 44 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index 0cba21b..1d4c7e0 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -225,15 +225,30 @@ enum qede_agg_state {
 };
 
 struct qede_agg_info {
-	struct sw_rx_data replace_buf;
-	dma_addr_t replace_buf_mapping;
-	struct sw_rx_data start_buf;
-	dma_addr_t start_buf_mapping;
-	struct eth_fast_path_rx_tpa_start_cqe start_cqe;
-	enum qede_agg_state agg_state;
+	/* rx_buf is a data buffer that can be placed / consumed from rx bd
+	 * chain. It has two purposes: We will preallocate the data buffer
+	 * for each aggregation when we open the interface and will place this
+	 * buffer on the rx-bd-ring when we receive TPA_START. We don't want
+	 * to be in a state where allocation fails, as we can't reuse the
+	 * consumer buffer in the rx-chain since FW may still be writing to it
+	 * (since header needs to be modified for TPA).
+	 * The second purpose is to keep a pointer to the bd buffer during
+	 * aggregation.
+	 */
+	struct sw_rx_data buffer;
+	dma_addr_t buffer_mapping;
+
 	struct sk_buff *skb;
-	int frag_id;
+
+	/* We need some structs from the start cookie until termination */
 	u16 vlan_tag;
+	u16 start_cqe_bd_len;
+	u8 start_cqe_placement_offset;
+
+	u8 state;
+	u8 frag_id;
+
+	u8 tunnel_type;
 };
 
 struct qede_rx_queue {
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index b84a2c4..653be22 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1058,7 +1058,7 @@ static int qede_fill_frag_skb(struct qede_dev *edev,
 	struct qede_agg_info *tpa_info = &rxq->tpa_info[tpa_agg_index];
 	struct sk_buff *skb = tpa_info->skb;
 
-	if (unlikely(tpa_info->agg_state != QEDE_AGG_STATE_START))
+	if (unlikely(tpa_info->state != QEDE_AGG_STATE_START))
 		goto out;
 
 	/* Add one frag and update the appropriate fields in the skb */
@@ -1084,7 +1084,7 @@ static int qede_fill_frag_skb(struct qede_dev *edev,
 	return 0;
 
 out:
-	tpa_info->agg_state = QEDE_AGG_STATE_ERROR;
+	tpa_info->state = QEDE_AGG_STATE_ERROR;
 	qede_recycle_rx_bd_ring(rxq, edev, 1);
 	return -ENOMEM;
 }
@@ -1096,8 +1096,8 @@ static void qede_tpa_start(struct qede_dev *edev,
 	struct qede_agg_info *tpa_info = &rxq->tpa_info[cqe->tpa_agg_index];
 	struct eth_rx_bd *rx_bd_cons = qed_chain_consume(&rxq->rx_bd_ring);
 	struct eth_rx_bd *rx_bd_prod = qed_chain_produce(&rxq->rx_bd_ring);
-	struct sw_rx_data *replace_buf = &tpa_info->replace_buf;
-	dma_addr_t mapping = tpa_info->replace_buf_mapping;
+	struct sw_rx_data *replace_buf = &tpa_info->buffer;
+	dma_addr_t mapping = tpa_info->buffer_mapping;
 	struct sw_rx_data *sw_rx_data_cons;
 	struct sw_rx_data *sw_rx_data_prod;
 	enum pkt_hash_types rxhash_type;
@@ -1122,11 +1122,11 @@ static void qede_tpa_start(struct qede_dev *edev,
 	/* move partial skb from cons to pool (don't unmap yet)
 	 * save mapping, incase we drop the packet later on.
 	 */
-	tpa_info->start_buf = *sw_rx_data_cons;
+	tpa_info->buffer = *sw_rx_data_cons;
 	mapping = HILO_U64(le32_to_cpu(rx_bd_cons->addr.hi),
 			   le32_to_cpu(rx_bd_cons->addr.lo));
 
-	tpa_info->start_buf_mapping = mapping;
+	tpa_info->buffer_mapping = mapping;
 	rxq->sw_rx_cons++;
 
 	/* set tpa state to start only if we are able to allocate skb
@@ -1137,23 +1137,25 @@ static void qede_tpa_start(struct qede_dev *edev,
 					 le16_to_cpu(cqe->len_on_first_bd));
 	if (unlikely(!tpa_info->skb)) {
 		DP_NOTICE(edev, "Failed to allocate SKB for gro\n");
-		tpa_info->agg_state = QEDE_AGG_STATE_ERROR;
+		tpa_info->state = QEDE_AGG_STATE_ERROR;
 		goto cons_buf;
 	}
 
-	skb_put(tpa_info->skb, le16_to_cpu(cqe->len_on_first_bd));
-	memcpy(&tpa_info->start_cqe, cqe, sizeof(tpa_info->start_cqe));
-
 	/* Start filling in the aggregation info */
+	skb_put(tpa_info->skb, le16_to_cpu(cqe->len_on_first_bd));
 	tpa_info->frag_id = 0;
-	tpa_info->agg_state = QEDE_AGG_STATE_START;
+	tpa_info->state = QEDE_AGG_STATE_START;
 
 	rxhash = qede_get_rxhash(edev, cqe->bitfields,
 				 cqe->rss_hash, &rxhash_type);
 	skb_set_hash(tpa_info->skb, rxhash, rxhash_type);
+
+	/* Store some information from first CQE */
+	tpa_info->start_cqe_placement_offset = cqe->placement_offset;
+	tpa_info->start_cqe_bd_len = le16_to_cpu(cqe->len_on_first_bd);
 	if ((le16_to_cpu(cqe->pars_flags.flags) >>
 	     PARSING_AND_ERR_FLAGS_TAG8021QEXIST_SHIFT) &
-		    PARSING_AND_ERR_FLAGS_TAG8021QEXIST_MASK)
+	    PARSING_AND_ERR_FLAGS_TAG8021QEXIST_MASK)
 		tpa_info->vlan_tag = le16_to_cpu(cqe->vlan_tag);
 	else
 		tpa_info->vlan_tag = 0;
@@ -1169,7 +1171,7 @@ static void qede_tpa_start(struct qede_dev *edev,
 	if (unlikely(cqe->ext_bd_len_list[1])) {
 		DP_ERR(edev,
 		       "Unlikely - got a TPA aggregation with more than one ext_bd_len_list entry in the TPA start\n");
-		tpa_info->agg_state = QEDE_AGG_STATE_ERROR;
+		tpa_info->state = QEDE_AGG_STATE_ERROR;
 	}
 }
 
@@ -1276,7 +1278,7 @@ static void qede_tpa_end(struct qede_dev *edev,
 		DP_ERR(edev,
 		       "Strange - TPA emd with more than a single len_list entry\n");
 
-	if (unlikely(tpa_info->agg_state != QEDE_AGG_STATE_START))
+	if (unlikely(tpa_info->state != QEDE_AGG_STATE_START))
 		goto err;
 
 	/* Sanity */
@@ -1290,14 +1292,9 @@ static void qede_tpa_end(struct qede_dev *edev,
 		       le16_to_cpu(cqe->total_packet_len), skb->len);
 
 	memcpy(skb->data,
-	       page_address(tpa_info->start_buf.data) +
-		tpa_info->start_cqe.placement_offset +
-		tpa_info->start_buf.page_offset,
-	       le16_to_cpu(tpa_info->start_cqe.len_on_first_bd));
-
-	/* Recycle [mapped] start buffer for the next replacement */
-	tpa_info->replace_buf = tpa_info->start_buf;
-	tpa_info->replace_buf_mapping = tpa_info->start_buf_mapping;
+	       page_address(tpa_info->buffer.data) +
+	       tpa_info->start_cqe_placement_offset +
+	       tpa_info->buffer.page_offset, tpa_info->start_cqe_bd_len);
 
 	/* Finalize the SKB */
 	skb->protocol = eth_type_trans(skb, edev->ndev);
@@ -1310,18 +1307,11 @@ static void qede_tpa_end(struct qede_dev *edev,
 
 	qede_gro_receive(edev, fp, skb, tpa_info->vlan_tag);
 
-	tpa_info->agg_state = QEDE_AGG_STATE_NONE;
+	tpa_info->state = QEDE_AGG_STATE_NONE;
 
 	return;
 err:
-	/* The BD starting the aggregation is still mapped; Re-use it for
-	 * future aggregations [as replacement buffer]
-	 */
-	memcpy(&tpa_info->replace_buf, &tpa_info->start_buf,
-	       sizeof(struct sw_rx_data));
-	tpa_info->replace_buf_mapping = tpa_info->start_buf_mapping;
-	tpa_info->start_buf.data = NULL;
-	tpa_info->agg_state = QEDE_AGG_STATE_NONE;
+	tpa_info->state = QEDE_AGG_STATE_NONE;
 	dev_kfree_skb_any(tpa_info->skb);
 	tpa_info->skb = NULL;
 }
@@ -2823,7 +2813,7 @@ static void qede_free_sge_mem(struct qede_dev *edev, struct qede_rx_queue *rxq)
 
 	for (i = 0; i < ETH_TPA_MAX_AGGS_NUM; i++) {
 		struct qede_agg_info *tpa_info = &rxq->tpa_info[i];
-		struct sw_rx_data *replace_buf = &tpa_info->replace_buf;
+		struct sw_rx_data *replace_buf = &tpa_info->buffer;
 
 		if (replace_buf->data) {
 			dma_unmap_page(&edev->pdev->dev,
@@ -2905,7 +2895,7 @@ static int qede_alloc_sge_mem(struct qede_dev *edev, struct qede_rx_queue *rxq)
 
 	for (i = 0; i < ETH_TPA_MAX_AGGS_NUM; i++) {
 		struct qede_agg_info *tpa_info = &rxq->tpa_info[i];
-		struct sw_rx_data *replace_buf = &tpa_info->replace_buf;
+		struct sw_rx_data *replace_buf = &tpa_info->buffer;
 
 		replace_buf->data = alloc_pages(GFP_ATOMIC, 0);
 		if (unlikely(!replace_buf->data)) {
@@ -2923,10 +2913,9 @@ static int qede_alloc_sge_mem(struct qede_dev *edev, struct qede_rx_queue *rxq)
 		}
 
 		replace_buf->mapping = mapping;
-		tpa_info->replace_buf.page_offset = 0;
-
-		tpa_info->replace_buf_mapping = mapping;
-		tpa_info->agg_state = QEDE_AGG_STATE_NONE;
+		tpa_info->buffer.page_offset = 0;
+		tpa_info->buffer_mapping = mapping;
+		tpa_info->state = QEDE_AGG_STATE_NONE;
 	}
 
 	return 0;
-- 
1.9.3

^ permalink raw reply related

* [PATCH v2 net-next 00/11] qed*: Add XDP support
From: Yuval Mintz @ 2016-11-29 14:46 UTC (permalink / raw)
  To: davem, netdev; +Cc: Yuval Mintz

This patch series is intended to add XDP to the qede driver, although
it contains quite a bit of cleanups, refactorings and infrastructure
changes as well.

The content of this series can be roughly divided into:

 - Datapath improvements - mostly focused on having the datapath utilize
parameters which can be more tightly contained in cachelines.
Patches #1, #2, #8, #9 belong to this group.

 - Refactoring - done mostly in favour of XDP. Patches #3, #4, #5, #9.

 - Infrastructure changes - done in favour of XDP. Paches #6 and #7 belong
to this category [#7 being by far the biggest patch in the series].

 - Actual XDP support - last two patches [#10, #11].

Hi Dave,

Please consider applying this to `net-next'.

Thanks,
Yuval

Changes from previous versions
------------------------------

 v2:
  - Allow setting of the eBPF even when interface is DOWN.
  - Fix warnings when compiling for an arch with 64Kb pages.

Yuval Mintz (11):
  qede: Optimize aggregation information size
  qed: Optimize qed_chain datapath usage
  qede: Remove 'num_tc'.
  qede: Refactor statistics gathering
  qede: Refactor data-path Rx flow
  qede: Revise state locking scheme
  qed*: Handle-based L2-queues.
  qede: Don't check netdevice for rx-hash
  qede: Better utilize the qede_[rt]x_queue
  qede: Add basic XDP support
  qede: Add support for XDP_TX

 drivers/net/ethernet/qlogic/qed/qed.h             |   12 -
 drivers/net/ethernet/qlogic/qed/qed_dev.c         |   33 +-
 drivers/net/ethernet/qlogic/qed/qed_l2.c          |  595 +++++----
 drivers/net/ethernet/qlogic/qed/qed_l2.h          |  133 +-
 drivers/net/ethernet/qlogic/qed/qed_sp_commands.c |    4 +-
 drivers/net/ethernet/qlogic/qed/qed_sriov.c       |  275 ++--
 drivers/net/ethernet/qlogic/qed/qed_sriov.h       |   21 +-
 drivers/net/ethernet/qlogic/qed/qed_vf.c          |   90 +-
 drivers/net/ethernet/qlogic/qed/qed_vf.h          |   40 +-
 drivers/net/ethernet/qlogic/qede/qede.h           |  162 ++-
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c   |  210 ++--
 drivers/net/ethernet/qlogic/qede/qede_main.c      | 1395 ++++++++++++---------
 include/linux/qed/qed_chain.h                     |  144 ++-
 include/linux/qed/qed_eth_if.h                    |   56 +-
 14 files changed, 1879 insertions(+), 1291 deletions(-)

-- 
1.9.3

^ permalink raw reply

* Re: [PATCH net] bpf: fix states equal logic for varlen access
From: Josef Bacik @ 2016-11-29 14:45 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: davem, netdev, daniel, ast, jannh
In-Reply-To: <20161129030446.GA13735@ast-mbp.thefacebook.com>

On 11/28/2016 10:04 PM, Alexei Starovoitov wrote:
> On Mon, Nov 28, 2016 at 02:44:10PM -0500, Josef Bacik wrote:
>> If we have a branch that looks something like this
>>
>> int foo = map->value;
>> if (condition) {
>>   foo += blah;
>> } else {
>>   foo = bar;
>> }
>> map->array[foo] = baz;
>>
>> We will incorrectly assume that the !condition branch is equal to the condition
>> branch as the register for foo will be UNKNOWN_VALUE in both cases.  We need to
>> adjust this logic to only do this if we didn't do a varlen access after we
>> processed the !condition branch, otherwise we have different ranges and need to
>> check the other branch as well.
>>
>> Signed-off-by: Josef Bacik <jbacik@fb.com>
>> ---
>>  kernel/bpf/verifier.c | 10 ++++++++--
>>  1 file changed, 8 insertions(+), 2 deletions(-)
>>
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 89f787c..2c8a688 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -2478,6 +2478,7 @@ static bool states_equal(struct bpf_verifier_env *env,
>>  {
>>  	struct bpf_reg_state *rold, *rcur;
>>  	int i;
>> +	bool map_access = env->varlen_map_value_access;
>
> that's a bit misleading name for the variable.
> Pls call it varlen_map_access.
>
>>  	for (i = 0; i < MAX_BPF_REG; i++) {
>>  		rold = &old->regs[i];
>> @@ -2489,12 +2490,17 @@ static bool states_equal(struct bpf_verifier_env *env,
>>  		/* If the ranges were not the same, but everything else was and
>>  		 * we didn't do a variable access into a map then we are a-ok.
>>  		 */
>> -		if (!env->varlen_map_value_access &&
>> +		if (!map_access &&
>>  		    rold->type == rcur->type && rold->imm == rcur->imm)
>
> just noticed that this one is missing comparing rold->id == rcur->id
>

Do you want me to fix that here?  I'll fix up the rest of the stuff, and Daniels 
things as well.  Thanks,

Josef

^ permalink raw reply

* Re: [PATCH v3 net-next 0/4] bpf: BPF for lightweight tunnel encapsulation
From: Hannes Frederic Sowa @ 2016-11-29 14:15 UTC (permalink / raw)
  To: Thomas Graf, davem; +Cc: netdev, alexei.starovoitov, daniel, tom, roopa
In-Reply-To: <cover.1480424542.git.tgraf@suug.ch>

Hi Thomas,

On 29.11.2016 14:21, Thomas Graf wrote:
> This series implements BPF program invocation from dst entries via the
> lightweight tunnels infrastructure. The BPF program can be attached to
> lwtunnel_input(), lwtunnel_output() or lwtunnel_xmit() and see an L3
> skb as context. Programs attached to input and output are read-only.
> Programs attached to lwtunnel_xmit() can modify and redirect, push headers
> and redirect packets.
> 
> The facility can be used to:
>  - Collect statistics and generate sampling data for a subset of traffic
>    based on the dst utilized by the packet thus allowing to extend the
>    existing realms.
>  - Apply additional per route/dst filters to prohibit certain outgoing
>    or incoming packets based on BPF filters. In particular, this allows
>    to maintain per dst custom state across multiple packets in BPF maps
>    and apply filters based on statistics and behaviour observed over time.
>  - Attachment of L2 headers at transmit where resolving the L2 address
>    is not required.
>  - Possibly many more.

Did you look at the cgroup based hooks which were added recently in
ip_finish_output for cgroup ebpf support and in general the cgroup bpf
subsystem. Does some of this solve the problem for you already? Would be
interesting to hear your opinion on that.

Thanks,
Hannes

^ permalink raw reply

* Re: [PATCH net-next] cgroup, bpf: remove unnecessary #include
From: Daniel Mack @ 2016-11-29 14:05 UTC (permalink / raw)
  To: Daniel Borkmann, Alexei Starovoitov, David S . Miller; +Cc: Tejun Heo, netdev
In-Reply-To: <583D5D00.4070101@iogearbox.net>

On 11/29/2016 11:48 AM, Daniel Borkmann wrote:
> On 11/26/2016 08:23 AM, Alexei Starovoitov wrote:
>> this #include is unnecessary and brings whole set of
>> other headers into cgroup-defs.h. Remove it.
>>
>> Fixes: 3007098494be ("cgroup: add support for eBPF programs")
>> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> 
> This fixes many build errors in samples/bpf/ due to wrong helper
> redefinitions (originating from kernel includes conflicting with
> samples' helper declarations).
> 
> I don't see it pushed out to net-next yet, so:
> 
> Acked-by: Daniel Borkmann <daniel@iogearbox.net>
> 

FWIW:

Acked-by: Daniel Mack <daniel@zonque.org>

^ permalink raw reply

* Re: [PATCH v2 net 0/5] l2tp: fixes for l2tp_ip and l2tp_ip6 socket handling
From: James Chapman @ 2016-11-29 13:47 UTC (permalink / raw)
  To: Guillaume Nault, netdev; +Cc: Chris Elston
In-Reply-To: <cover.1480419658.git.g.nault@alphalink.fr>

On 29/11/16 12:09, Guillaume Nault wrote:
> This series addresses problems found while working on commit 32c231164b76
> ("l2tp: fix racy SOCK_ZAPPED flag check in l2tp_ip{,6}_bind()").
>
> The first three patches fix races in socket's connect, recv and bind
> operations. The last two ones fix scenarios where l2tp fails to
> correctly lookup its userspace sockets.
>
> Apart from the last patch, which is l2tp_ip6 specific, every patch
> fixes the same problem in the L2TP IPv4 and IPv6 code.
>
> All problems fixed by this series exist since the creation of the
> l2tp_ip and l2tp_ip6 modules.
>
> Changes since v1:
>   * Patch #3: fix possible uninitialised use of 'ret' in l2tp_ip_bind().
>
>
> Guillaume Nault (5):
>   l2tp: lock socket before checking flags in connect()
>   l2tp: hold socket before dropping lock in l2tp_ip{,6}_recv()
>   l2tp: fix racy socket lookup in l2tp_ip and l2tp_ip6 bind()
>   l2tp: fix lookup for sockets not bound to a device in l2tp_ip
>   l2tp: fix address test in __l2tp_ip6_bind_lookup()
>
>  include/net/ipv6.h  |  2 ++
>  net/ipv6/datagram.c |  4 ++-
>  net/l2tp/l2tp_ip.c  | 63 ++++++++++++++++++++++--------------------
>  net/l2tp/l2tp_ip6.c | 79 ++++++++++++++++++++++++++++-------------------------
>  4 files changed, 81 insertions(+), 67 deletions(-)
>

Looks good.

Acked-by: James Chapman <jchapman@katalix.com>

^ permalink raw reply

* RE: [PATCH net-next v3 3/4] Documentation: net: phy: Add blurb about RGMII
From: David Laight @ 2016-11-29 13:47 UTC (permalink / raw)
  To: 'Florian Fainelli', netdev@vger.kernel.org
  Cc: davem@davemloft.net, andrew@lunn.ch, sf84@laposte.net,
	martin.blumenstingl@googlemail.com, mans@mansr.com,
	alexandre.torgue@st.com, peppe.cavallaro@st.com,
	timur@codeaurora.org, jbrunet@baylibre.com
In-Reply-To: <20161128024515.13070-4-f.fainelli@gmail.com>

From: Florian Fainelli
> Sent: 28 November 2016 02:45
> RGMII is a recurring source of pain for people with Gigabit Ethernet
> hardware since it may require PHY driver and MAC driver level
> configuration hints. Document what are the expectations from PHYLIB and
> what options exist.
...
> + * PHY_INTERFACE_MODE_RGMII_TXID: the PHY should insert an internal delay
> +   for the transmit data lines (TXD[3:0]) processed by the PHY device
> +
> + * PHY_INTERFACE_MODE_RGMII_RXID: the PHY should insert an internal delay
> +   for the receive data lines (RXD[3:0]) processed by the PHY device

These delays are subtly different, so the texts shouldn't really be the same.
I think the clock is delayed before latching the TX data and the RX data gets
delayed from the clock.

...
> + * Switching to lower speeds such as 10/100Mbits/sec makes the problem go away
> +   (since there is enough setup/hold time in that case)

The setup/hold times aren't directly affected by the lower speed.
So something else is going on.
Possibly the data is changed on one edge and sampled on the other at lower
speeds, but changes every clock edge at 1Gb.

	David

^ permalink raw reply

* [PATCH] ehea: Remove unnecessary memset of stats in netdev private data
From: Tobias Klauser @ 2016-11-29 13:35 UTC (permalink / raw)
  To: Douglas Miller; +Cc: netdev

The memory for netdev private data is allocated using kzalloc/vzalloc in
alloc_netdev_mqs, thus there is no need to zero the stats portion of it
again in the driver's probe function.

In any case, the size for the memset is wrong as the stats member is of
type rtnl_link_stats64, not net_device_stats.

Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
---
 drivers/net/ethernet/ibm/ehea/ehea_main.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c
index fa66fa6f8bee..702446a93697 100644
--- a/drivers/net/ethernet/ibm/ehea/ehea_main.c
+++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c
@@ -3044,7 +3044,6 @@ static struct ehea_port *ehea_setup_single_port(struct ehea_adapter *adapter,
 	init_waitqueue_head(&port->swqe_avail_wq);
 	init_waitqueue_head(&port->restart_wq);
 
-	memset(&port->stats, 0, sizeof(struct net_device_stats));
 	ret = register_netdev(dev);
 	if (ret) {
 		pr_err("register_netdev failed. ret=%d\n", ret);
-- 
2.11.0.rc3.5.g7cdf2ab.dirty

^ permalink raw reply related

* 转发: Re: [PATCH] phy: fix the bug when remove the phy driver
From: maowenan @ 2016-11-29 13:28 UTC (permalink / raw)
  To: netdev@vger.kernel.org, Florian Fainelli; +Cc: Dingtianhong, weiyongjun (A)

Hi
When remove marvell, the call trace as below:
rmmod marvell -> module_exit(phy_module_exit) -> phy_drivers_unregister -> phy_driver_unregister -> driver_unregister -> bus_remove_driver -> driver_detach -> __device_release_driver -> drv->remove(dev), phy_remove -> phydev->drv = NULL; 

static int phy_remove(struct device *dev)
{
	struct phy_device *phydev = to_phy_device(dev);

	mutex_lock(&phydev->lock);
	phydev->state = PHY_DOWN;
	mutex_unlock(&phydev->lock);

	if (phydev->drv->remove)
		phydev->drv->remove(phydev);
	phydev->drv = NULL;

	return 0;
}

I found there is no reference count check when call drv->remove in __device_release_driver()
		if (dev->bus && dev->bus->remove)
			dev->bus->remove(dev);
		else if (drv->remove) {
			pr_info("refcount:%d\n",dev->kobj.kref.refcount.counter);
			drv->remove(dev);
		}

so I add log to trace this, found the ref count of this dev is 5, unfortunately then call phy_remove and set phydev->drv = NULL and kernel panic happen.

@fainelli, What’s your comments about checking reference counting in driver_detach(), If there is someone remain using this driver, we could not remove the phy driver kernel module.
As bellow:
		if (dev->driver == drv && (dev->kobj.kref.refcount.counter == 1))
			__device_release_driver(dev);




-----邮件原件-----
发件人: Dingtianhong 
发送时间: 2016年11月29日 10:03
收件人: maowenan
主题: Fwd: Re: [PATCH] phy: fix the bug when remove the phy driver




-------- Forwarded Message --------
Subject: Re: [PATCH] phy: fix the bug when remove the phy driver
Date: Wed, 3 Aug 2016 09:21:52 +0800
From: Ding Tianhong <dingtianhong@huawei.com>
To: Florian Fainelli <f.fainelli@gmail.com>, Netdev <netdev@vger.kernel.org>, David S. Miller <davem@davemloft.net>, linux-kernel@vger.kernel.org <linux-kernel@vger.kernel.org>, Andrew Lunn <andrew@lunn.ch>

On 2016/8/3 0:42, Florian Fainelli wrote:
> Le 02/08/2016 à 06:00, Ding Tianhong a écrit :
>> The nic in my board use the phy dev from marvell, and the system will 
>> load the marvell phy driver automatically, but when I remove the phy 
>> drivers, the system immediately panic:
>>
>> localhost login: [ 2582.052465] Unable to handle kernel NULL pointer 
>> dereference at virtual address 000000c0 [ 2582.061429] pgd = 
>> ffff800001226000 [ 2582.065277] [000000c0] *pgd=0000003f7f893003, 
>> *pud=0000003f7f894003, *pmd=0000003f7f895003, *pte=006000006d000707 [ 
>> 2582.076681] Internal error: Oops: 96000006 [#1] SMP [ 2582.082049] Modules linked in: sr_mod(E) cdrom(E) ses(E) enclosure(E) shpchp(E) crc32_arm64(E) aes_ce_blk(E) ablk_helper(E) cryptd(E) aes_ce_cipher(E) ghash_ce(E) sha2_ce(E) sha1_ce(E) usb_storage(E) dm_mirror(E) dm_region_hash(E) dm_log(E) dm_mod(E) [last unloaded: marvell]
>> [ 2582.109226] CPU: 21 PID: 1514 Comm: kworker/21:1 Tainted: G            E   4.1.27-vhulk3.6.5.aarch64 #1
>> [ 2582.119415] Hardware name: Huawei Taishan 2180 /BC11SPCC, BIOS 
>> 1.31 06/23/2016 [ 2582.127346] Workqueue: events_power_efficient 
>> phy_state_machine [ 2582.133796] task: ffff803f6f41bac0 ti: 
>> ffff803f6eca4000 task.ti: ffff803f6eca4000 [ 2582.141910] PC is at 
>> phy_state_machine+0x3c/0x438 [ 2582.147081] LR is at 
>> phy_state_machine+0x34/0x438 [ 2582.152173] pc : [<ffff800000715384>] 
>> lr : [<ffff80000071537c>] pstate: 60000145 [ 2582.160189] sp : 
>> ffff803f6eca7d30 [ 2582.163825] x29: ffff803f6eca7d30 x28: 
>> 0000000000000000 [ 2582.169689] x27: ffff805fdfd0aa00 x26: 
>> 0000000000000008 [ 2582.175482] x25: 0000000000000000 x24: 
>> ffff80000112c57c [ 2582.181391] x23: ffff805f4fc8a800 x22: 
>> ffff805fdfd0f700 [ 2582.187241] x21: ffff805f4fc8abf8 x20: 
>> ffff805f4fc8a770 [ 2582.193035] x19: ffff805f4fc8ab70 x18: 
>> 0000000000000007 [ 2582.198914] x17: 000000000000000e x16: 
>> 0000000000000001 [ 2582.204710] x15: 0000000000000007 x14: 
>> 000000000000000e [ 2582.210584] x13: 0000000000000200 x12: 
>> 0000000055555556 [ 2582.216373] x11: 0000000000001c00 x10: 
>> 0000000000000000 [ 2582.222166] x9 : ffff800000a36880 x8 : 
>> ffff803f6f41c060 [ 2582.227994] x7 : 000000010008b39b x6 : 
>> ffff80000101e690 [ 2582.233788] x5 : 0000000000000000 x4 : 
>> 0000000000800000 [ 2582.239612] x3 : 0000000000000000 x2 : 
>> 0000000000000000 [ 2582.245404] x1 : 0000000000000000 x0 : 
>> 0000000000000000 [ 2582.265813] [ 2582.282971] Process kworker/21:1 
>> (pid: 1514, stack limit = 0xffff803f6eca4020) [ 2582.307284] Stack: 
>> (0xffff803f6eca7d30 to 0xffff803f6eca8000)
>> [ 2582.331183] 7d20:                                   ffff803f6eca7d70 ffff8000000db3b8
>> [ 2582.354788] 7d40: ffff803f6e696000 ffff805f4fc8ab70 
>> ffff805fdfd0aa00 ffff805fdfd0f700 [ 2582.378341] 7d60: 
>> 0000000000000000 ffff80000112c57c ffff803f6eca7dc0 ffff8000000db7d4 [ 
>> 2582.403700] 7d80: ffff803f6e696000 ffff803f6e696030 ffff805fdfd0aa18 
>> ffff805fdfd0aa00 [ 2582.428385] 7da0: ffff803f6eca4000 
>> ffff80000112c57c 0000000000000000 0000000000000008 [ 2582.451222] 
>> 7dc0: ffff803f6eca7e20 ffff8000000e1d0c ffff805f4fc15000 
>> ffff80000115f188 [ 2582.474577] 7de0: ffff800000d1cf88 
>> ffff803f6e696000 ffff8000000db690 0000000000000000 [ 2582.497281] 
>> 7e00: 0000000000000000 0000000000000000 0000000000000000 
>> 0000000000000000 [ 2582.522992] 7e20: 0000000000000000 
>> ffff800000083dd0 ffff8000000e1c10 ffff805f4fc15000 [ 2582.546945] 
>> 7e40: 0000000000000000 0000000000000000 0000000000000000 
>> 0000000000000000 [ 2582.568550] 7e60: 0000000000000000 
>> ffff8000000ee33c ffff803f6e696000 ffff805f00000000 [ 2582.589157] 
>> 7e80: 0000000000000000 ffff803f6eca7e88 ffff803f6eca7e88 
>> 0000000000000000 [ 2582.609767] 7ea0: 0000000000000000 
>> ffff803f6eca7ea8 ffff803f6eca7ea8 cb88537fdc8ba31b [ 2582.633228] 
>> 7ec0: 0000000000000000 0000000000000000 0000000000000000 
>> 0000000000000000 [ 2582.655386] 7ee0: 0000000000000000 
>> 0000000000000000 0000000000000000 0000000000000000 [ 2582.674223] 
>> 7f00: 0000000000000000 0000000000000000 0000000000000000 
>> 0000000000000000 [ 2582.692994] 7f20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 2582.711765] 7f40: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 2582.730809] 7f60: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 2582.748601] 7f80: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 2582.769388] 7fa0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 2582.786756] 7fc0: 0000000000000000 0000000000000005 0000000000000000 0000000000000000 [ 2582.804134] 7fe0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 2582.821488] Call trace:
>> [ 2582.834493] [<ffff800000715384>] phy_state_machine+0x3c/0x438 [ 
>> 2582.851754] [<ffff8000000db3b8>] process_one_work+0x150/0x428 [ 
>> 2582.868188] [<ffff8000000db7d4>] worker_thread+0x144/0x4b0 [ 
>> 2582.883882] [<ffff8000000e1d0c>] kthread+0xfc/0x110
>>
>> ============================cut here===============================
>>
>> The phy_state_machine was still running and didn't know whether the 
>> phydev's driver was removed or not, then occur this problem, so we 
>> need to stop the phy_state_machine when removing the phy driver.
> 
> Your explanation of the problem is unclear to me, unless a network 
> driver attached to the PHY and started it, and then never stopped the 
> PHY state machine, this should not happen, also there should be proper 
> reference counting in place to avoid that. Your trace is based on 4.1 
> is this still reproducible with latest vanilla? Is this with a 
> mainline Ethernet driver?
> 

Yes，the network driver already attached to the PHY and started it, and all of them could work well if I didn't do anything, but if I suddenly remove the marvall.ko, the phy_state_machine was still running, but the phydev->drv is NULL at this time, than oops, I found this problem at 4.1 lts, and didn't see any effective improvement in the kernel 4.8, so report this bug and fix it.

Thanks.
Ding



^ permalink raw reply

* RE: The ubufs->refcount maybe be subtracted twice when tun_get_user failed
From: wangyunjian @ 2016-11-29 13:27 UTC (permalink / raw)
  To: Jason Wang, mst@redhat.com, netdev@vger.kernel.org; +Cc: caihe
In-Reply-To: <30e4c704-70b4-d318-68bd-742eb5f9bfc7@redhat.com>

Sorry, I didn't describe it clearly. In fact, the second subtraction happens in the function handle_tx,
when tun_get_user fails and zcopy_used is ture. Fllowing the steps:

static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
                                  void *msg_control, struct iov_iter *from,
                                  int noblock) {
           ...

           if (zerocopy)
                     err = zerocopy_sg_from_iter(skb, from);
           else {
                     err = skb_copy_datagram_from_iter(skb, 0, from, len);
                     if (!err && msg_control) {
                              struct ubuf_info *uarg = msg_control;
                              uarg->callback(uarg, false);                       --> step 1, the ubufs->refcount is subtracted frist.
                     }
           }

           if (err) {
                     this_cpu_inc(tun->pcpu_stats->rx_dropped);
                     kfree_skb(skb);
                     return -EFAULT;
           }

           err = virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun));
           if (err) {
                     this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
                     kfree_skb(skb);
                     return -EINVAL;                                         -->step 2, return err.
           }
}

static void handle_tx(struct vhost_net *net)
{
	...
	/* TODO: Check specific error and bomb out unless ENOBUFS? */
	err = sock->ops->sendmsg(sock, &msg, len);
	if (unlikely(err < 0)) {
		if (zcopy_used) {
			vhost_net_ubuf_put(ubufs);                                        --> step 3, the ubufs->refcount will be subtracted twice, when sendmsg execution err.
			nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
				% UIO_MAXIOV;
		}
		vhost_discard_vq_desc(vq, 1);
		break;
	}
	...
}

-----Original Message-----
From: Jason Wang [mailto:jasowang@redhat.com] 
Sent: Tuesday, November 29, 2016 6:07 PM
To: wangyunjian; mst@redhat.com; netdev@vger.kernel.org
Cc: caihe
Subject: Re: The ubufs->refcount maybe be subtracted twice when tun_get_user failed



On 2016年11月29日 17:30, wangyunjian wrote:
> In function tun_get_user , the ubufs->refcount may be subtracted twice, when msg_control is true and zerocopy is false.
>
> About the below code:
>
> static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
>                                  void *msg_control, struct iov_iter *from,
>                                  int noblock) {
>           ...
>
>           if (zerocopy)
>                     err = zerocopy_sg_from_iter(skb, from);
>           else {
>                     err = skb_copy_datagram_from_iter(skb, 0, from, len);
>                     if (!err && msg_control) {
>                              struct ubuf_info *uarg = msg_control;
>                              uarg->callback(uarg, false);                       --> the ubufs->refcount is subtracted frist.
>                     }
>           }
>
>           if (err) {
>                     this_cpu_inc(tun->pcpu_stats->rx_dropped);
>                     kfree_skb(skb);
>                     return -EFAULT;
>           }
>
>           err = virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun));
>           if (err) {
>                     this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
>                     kfree_skb(skb);
>                     return -EINVAL;                                   -->here, the ubufs->refcount will be subtracted twice, when virtio_net_hdr_to_skb execution err.
>           }

Just make sure I understand the problem. Since we don't set SKBTX_DEV_ZEROCOPY here if zerocopy is false, callback won't be even trigged in skb_release_data(). So we are in fact safe here?

Thanks

> switch (tun->flags & TUN_TYPE_MASK) {
>           case IFF_TUN:
>                     if (tun->flags & IFF_NO_PI) {
>                              switch (skb->data[0] & 0xf0) {
>                              case 0x40:
>                                       pi.proto = htons(ETH_P_IP);
>                                       break;
>                              case 0x60:
>                                       pi.proto = htons(ETH_P_IPV6);
>                                       break;
>                              default:
>                                       this_cpu_inc(tun->pcpu_stats->rx_dropped);
>                                       kfree_skb(skb);
>                                       return -EINVAL;                          --> this will also be subtracted twice.
>                              }
>                     }
>
>                     skb_reset_mac_header(skb);
>                     skb->protocol = pi.proto;
>                     skb->dev = tun->dev;
>                     break;
>           case IFF_TAP:
>                     skb->protocol = eth_type_trans(skb, tun->dev);
>                     break;
>           }
> 		...
> }


^ permalink raw reply

* [PATCH net-next v3 4/4] bpf: Add tests and samples for LWT-BPF
From: Thomas Graf @ 2016-11-29 13:21 UTC (permalink / raw)
  To: davem; +Cc: netdev, alexei.starovoitov, daniel, tom, roopa, hannes
In-Reply-To: <cover.1480424542.git.tgraf@suug.ch>

Adds a series of test to verify the functionality of attaching
BPF programs at LWT hooks.

Also adds a sample which collects a histogram of packet sizes which
pass through an LWT hook.

$ ./lwt_len_hist.sh
Starting netserver with host 'IN(6)ADDR_ANY' port '12865' and family AF_UNSPEC
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.253.2 () port 0 AF_INET : demo
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

 87380  16384  16384    10.00    39857.69
       1 -> 1        : 0        |                                      |
       2 -> 3        : 0        |                                      |
       4 -> 7        : 0        |                                      |
       8 -> 15       : 0        |                                      |
      16 -> 31       : 0        |                                      |
      32 -> 63       : 22       |                                      |
      64 -> 127      : 98       |                                      |
     128 -> 255      : 213      |                                      |
     256 -> 511      : 1444251  |********                              |
     512 -> 1023     : 660610   |***                                   |
    1024 -> 2047     : 535241   |**                                    |
    2048 -> 4095     : 19       |                                      |
    4096 -> 8191     : 180      |                                      |
    8192 -> 16383    : 5578023  |************************************* |
   16384 -> 32767    : 632099   |***                                   |
   32768 -> 65535    : 6575     |                                      |

Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
 samples/bpf/Makefile            |   4 +
 samples/bpf/bpf_helpers.h       |   4 +
 samples/bpf/lwt_len_hist.sh     |  37 ++++
 samples/bpf/lwt_len_hist_kern.c |  82 +++++++++
 samples/bpf/lwt_len_hist_user.c |  76 ++++++++
 samples/bpf/test_lwt_bpf.c      | 247 ++++++++++++++++++++++++++
 samples/bpf/test_lwt_bpf.sh     | 385 ++++++++++++++++++++++++++++++++++++++++
 7 files changed, 835 insertions(+)
 create mode 100755 samples/bpf/lwt_len_hist.sh
 create mode 100644 samples/bpf/lwt_len_hist_kern.c
 create mode 100644 samples/bpf/lwt_len_hist_user.c
 create mode 100644 samples/bpf/test_lwt_bpf.c
 create mode 100755 samples/bpf/test_lwt_bpf.sh

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 22b6407e..3161f82 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -29,6 +29,7 @@ hostprogs-y += test_current_task_under_cgroup
 hostprogs-y += trace_event
 hostprogs-y += sampleip
 hostprogs-y += tc_l2_redirect
+hostprogs-y += lwt_len_hist
 
 test_lru_dist-objs := test_lru_dist.o libbpf.o
 sock_example-objs := sock_example.o libbpf.o
@@ -59,6 +60,7 @@ test_current_task_under_cgroup-objs := bpf_load.o libbpf.o \
 trace_event-objs := bpf_load.o libbpf.o trace_event_user.o
 sampleip-objs := bpf_load.o libbpf.o sampleip_user.o
 tc_l2_redirect-objs := bpf_load.o libbpf.o tc_l2_redirect_user.o
+lwt_len_hist-objs := bpf_load.o libbpf.o lwt_len_hist_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -89,6 +91,7 @@ always += xdp2_kern.o
 always += test_current_task_under_cgroup_kern.o
 always += trace_event_kern.o
 always += sampleip_kern.o
+always += lwt_len_hist_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(objtree)/tools/testing/selftests/bpf/
@@ -117,6 +120,7 @@ HOSTLOADLIBES_test_current_task_under_cgroup += -lelf
 HOSTLOADLIBES_trace_event += -lelf
 HOSTLOADLIBES_sampleip += -lelf
 HOSTLOADLIBES_tc_l2_redirect += -l elf
+HOSTLOADLIBES_lwt_len_hist += -l elf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index 90f44bd..f34e417 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -80,6 +80,8 @@ struct bpf_map_def {
 	unsigned int map_flags;
 };
 
+static int (*bpf_skb_load_bytes)(void *ctx, int off, void *to, int len) =
+	(void *) BPF_FUNC_skb_load_bytes;
 static int (*bpf_skb_store_bytes)(void *ctx, int off, void *from, int len, int flags) =
 	(void *) BPF_FUNC_skb_store_bytes;
 static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flags) =
@@ -88,6 +90,8 @@ static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int flag
 	(void *) BPF_FUNC_l4_csum_replace;
 static int (*bpf_skb_under_cgroup)(void *ctx, void *map, int index) =
 	(void *) BPF_FUNC_skb_under_cgroup;
+static int (*bpf_skb_push)(void *, int len, int flags) =
+	(void *) BPF_FUNC_skb_push;
 
 #if defined(__x86_64__)
 
diff --git a/samples/bpf/lwt_len_hist.sh b/samples/bpf/lwt_len_hist.sh
new file mode 100755
index 0000000..3a8ee52
--- /dev/null
+++ b/samples/bpf/lwt_len_hist.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+NS1=lwt_ns1
+VETH0=tst_lwt1a
+VETH1=tst_lwt1b
+
+TRACE_ROOT=/sys/kernel/debug/tracing
+
+function cleanup {
+        ip route del 192.168.253.2/32 dev $VETH0 2> /dev/null
+        ip link del $VETH0 2> /dev/null
+        ip link del $VETH1 2> /dev/null
+	ip netns exec $NS1 killall netserver
+        ip netns delete $NS1 2> /dev/null
+}
+
+cleanup
+
+ip netns add $NS1
+ip link add $VETH0 type veth peer name $VETH1
+ip link set dev $VETH0 up
+ip addr add 192.168.253.1/24 dev $VETH0
+ip link set $VETH1 netns $NS1
+ip netns exec $NS1 ip link set dev $VETH1 up
+ip netns exec $NS1 ip addr add 192.168.253.2/24 dev $VETH1
+ip netns exec $NS1 netserver
+
+echo 1 > ${TRACE_ROOT}/tracing_on
+cp /dev/null ${TRACE_ROOT}/trace
+ip route add 192.168.253.2/32 encap bpf out obj lwt_len_hist_kern.o section len_hist dev $VETH0
+netperf -H 192.168.253.2 -t TCP_STREAM
+cat ${TRACE_ROOT}/trace | grep -v '^#'
+./lwt_len_hist
+cleanup
+echo 0 > ${TRACE_ROOT}/tracing_on
+
+exit 0
diff --git a/samples/bpf/lwt_len_hist_kern.c b/samples/bpf/lwt_len_hist_kern.c
new file mode 100644
index 0000000..df75383
--- /dev/null
+++ b/samples/bpf/lwt_len_hist_kern.c
@@ -0,0 +1,82 @@
+/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/in.h>
+#include "bpf_helpers.h"
+
+# define printk(fmt, ...)						\
+		({							\
+			char ____fmt[] = fmt;				\
+			bpf_trace_printk(____fmt, sizeof(____fmt),	\
+				     ##__VA_ARGS__);			\
+		})
+
+struct bpf_elf_map {
+	__u32 type;
+	__u32 size_key;
+	__u32 size_value;
+	__u32 max_elem;
+	__u32 flags;
+	__u32 id;
+	__u32 pinning;
+};
+
+struct bpf_elf_map SEC("maps") lwt_len_hist_map = {
+	.type = BPF_MAP_TYPE_PERCPU_HASH,
+	.size_key = sizeof(__u64),
+	.size_value = sizeof(__u64),
+	.pinning = 2,
+	.max_elem = 1024,
+};
+
+static unsigned int log2(unsigned int v)
+{
+	unsigned int r;
+	unsigned int shift;
+
+	r = (v > 0xFFFF) << 4; v >>= r;
+	shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+	shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+	shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+	r |= (v >> 1);
+	return r;
+}
+
+static unsigned int log2l(unsigned long v)
+{
+	unsigned int hi = v >> 32;
+	if (hi)
+		return log2(hi) + 32;
+	else
+		return log2(v);
+}
+
+SEC("len_hist")
+int do_len_hist(struct __sk_buff *skb)
+{
+	__u64 *value, key, init_val = 1;
+
+	key = log2l(skb->len);
+
+	value = bpf_map_lookup_elem(&lwt_len_hist_map, &key);
+	if (value)
+		__sync_fetch_and_add(value, 1);
+	else
+		bpf_map_update_elem(&lwt_len_hist_map, &key, &init_val, BPF_ANY);
+
+	return BPF_OK;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/lwt_len_hist_user.c b/samples/bpf/lwt_len_hist_user.c
new file mode 100644
index 0000000..05d783f
--- /dev/null
+++ b/samples/bpf/lwt_len_hist_user.c
@@ -0,0 +1,76 @@
+#include <linux/unistd.h>
+#include <linux/bpf.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <arpa/inet.h>
+
+#include "libbpf.h"
+#include "bpf_util.h"
+
+#define MAX_INDEX 64
+#define MAX_STARS 38
+
+static void stars(char *str, long val, long max, int width)
+{
+	int i;
+
+	for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+		str[i] = '*';
+	if (val > max)
+		str[i - 1] = '+';
+	str[i] = '\0';
+}
+
+int main(int argc, char **argv)
+{
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	const char *map_filename = "/sys/fs/bpf/tc/globals/lwt_len_hist_map";
+	uint64_t values[nr_cpus], sum, max_value = 0, data[MAX_INDEX] = {};
+	uint64_t key = 0, next_key, max_key = 0;
+	char starstr[MAX_STARS];
+	int i, map_fd;
+
+	map_fd = bpf_obj_get(map_filename);
+	if (map_fd < 0) {
+		fprintf(stderr, "bpf_obj_get(%s): %s(%d)\n",
+			map_filename, strerror(errno), errno);
+		return -1;
+	}
+
+	while (bpf_get_next_key(map_fd, &key, &next_key) == 0) {
+		if (next_key >= MAX_INDEX) {
+			fprintf(stderr, "Key %lu out of bounds\n", next_key);
+			continue;
+		}
+
+		bpf_lookup_elem(map_fd, &next_key, values);
+
+		sum = 0;
+		for (i = 0; i < nr_cpus; i++)
+			sum += values[i];
+
+		data[next_key] = sum;
+		if (sum && next_key > max_key)
+			max_key = next_key;
+
+		if (sum > max_value)
+			max_value = sum;
+
+		key = next_key;
+	}
+
+	for (i = 1; i <= max_key + 1; i++) {
+		stars(starstr, data[i - 1], max_value, MAX_STARS);
+		printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+		       (1l << i) >> 1, (1l << i) - 1, data[i - 1],
+		       MAX_STARS, starstr);
+	}
+
+	close(map_fd);
+
+	return 0;
+}
diff --git a/samples/bpf/test_lwt_bpf.c b/samples/bpf/test_lwt_bpf.c
new file mode 100644
index 0000000..9c1773e
--- /dev/null
+++ b/samples/bpf/test_lwt_bpf.c
@@ -0,0 +1,247 @@
+/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+#include <linux/bpf.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmpv6.h>
+#include <linux/if_ether.h>
+#include "bpf_helpers.h"
+#include <string.h>
+
+# define printk(fmt, ...)						\
+		({							\
+			char ____fmt[] = fmt;				\
+			bpf_trace_printk(____fmt, sizeof(____fmt),	\
+				     ##__VA_ARGS__);			\
+		})
+
+#define CB_MAGIC 1234
+
+/* Test: Pass all packets through */
+SEC("nop")
+int do_nop(struct __sk_buff *skb)
+{
+	return BPF_OK;
+}
+
+/* Test: Verify context information can be accessed */
+SEC("test_ctx")
+int do_test_ctx(struct __sk_buff *skb)
+{
+	skb->cb[0] = CB_MAGIC;
+	printk("len %d hash %d protocol %d\n", skb->len, skb->hash,
+	       skb->protocol);
+	printk("cb %d ingress_ifindex %d ifindex %d\n", skb->cb[0],
+	       skb->ingress_ifindex, skb->ifindex);
+
+	return BPF_OK;
+}
+
+/* Test: Ensure skb->cb[] buffer is cleared */
+SEC("test_cb")
+int do_test_cb(struct __sk_buff *skb)
+{
+	printk("cb0: %x cb1: %x cb2: %x\n", skb->cb[0], skb->cb[1],
+	       skb->cb[2]);
+	printk("cb3: %x cb4: %x\n", skb->cb[3], skb->cb[4]);
+
+	return BPF_OK;
+}
+
+/* Test: Verify skb data can be read */
+SEC("test_data")
+int do_test_data(struct __sk_buff *skb)
+{
+	void *data = (void *)(long)skb->data;
+	void *data_end = (void *)(long)skb->data_end;
+	struct iphdr *iph = data;
+
+	if (data + sizeof(*iph) > data_end) {
+		printk("packet truncated\n");
+		return BPF_DROP;
+	}
+
+	printk("src: %x dst: %x\n", iph->saddr, iph->daddr);
+
+	return BPF_OK;
+}
+
+#define IP_CSUM_OFF offsetof(struct iphdr, check)
+#define IP_DST_OFF offsetof(struct iphdr, daddr)
+#define IP_SRC_OFF offsetof(struct iphdr, saddr)
+#define IP_PROTO_OFF offsetof(struct iphdr, protocol)
+#define TCP_CSUM_OFF offsetof(struct tcphdr, check)
+#define UDP_CSUM_OFF offsetof(struct udphdr, check)
+#define IS_PSEUDO 0x10
+
+static inline int rewrite(struct __sk_buff *skb, uint32_t old_ip,
+			  uint32_t new_ip, int rw_daddr)
+{
+	int ret, off = 0, flags = IS_PSEUDO;
+	uint8_t proto;
+
+	ret = bpf_skb_load_bytes(skb, IP_PROTO_OFF, &proto, 1);
+	if (ret < 0) {
+		printk("bpf_l4_csum_replace failed: %d\n", ret);
+		return BPF_DROP;
+	}
+
+	switch (proto) {
+	case IPPROTO_TCP:
+		off = TCP_CSUM_OFF;
+		break;
+
+	case IPPROTO_UDP:
+		off = UDP_CSUM_OFF;
+		flags |= BPF_F_MARK_MANGLED_0;
+		break;
+
+	case IPPROTO_ICMPV6:
+		off = offsetof(struct icmp6hdr, icmp6_cksum);
+		break;
+	}
+
+	if (off) {
+		ret = bpf_l4_csum_replace(skb, off, old_ip, new_ip,
+					  flags | sizeof(new_ip));
+		if (ret < 0) {
+			printk("bpf_l4_csum_replace failed: %d\n");
+			return BPF_DROP;
+		}
+	}
+
+	ret = bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip));
+	if (ret < 0) {
+		printk("bpf_l3_csum_replace failed: %d\n", ret);
+		return BPF_DROP;
+	}
+
+	if (rw_daddr)
+		ret = bpf_skb_store_bytes(skb, IP_DST_OFF, &new_ip, sizeof(new_ip), 0);
+	else
+		ret = bpf_skb_store_bytes(skb, IP_SRC_OFF, &new_ip, sizeof(new_ip), 0);
+
+	if (ret < 0) {
+		printk("bpf_skb_store_bytes() failed: %d\n", ret);
+		return BPF_DROP;
+	}
+
+	return BPF_OK;
+}
+
+/* Test: Verify skb data can be modified */
+SEC("test_rewrite")
+int do_test_rewrite(struct __sk_buff *skb)
+{
+	uint32_t old_ip, new_ip = 0x3fea8c0;
+	int ret;
+
+	ret = bpf_skb_load_bytes(skb, IP_DST_OFF, &old_ip, 4);
+	if (ret < 0) {
+		printk("bpf_skb_load_bytes failed: %d\n", ret);
+		return BPF_DROP;
+	}
+
+	if (old_ip == 0x2fea8c0) {
+		printk("out: rewriting from %x to %x\n", old_ip, new_ip);
+		return rewrite(skb, old_ip, new_ip, 1);
+	}
+
+	return BPF_OK;
+}
+
+static inline int __do_push_ll_and_redirect(struct __sk_buff *skb)
+{
+	uint64_t smac = SRC_MAC, dmac = DST_MAC;
+	int ret, ifindex = DST_IFINDEX;
+	struct ethhdr ehdr;
+
+	ret = bpf_skb_push(skb, 14, 0);
+	if (ret < 0) {
+		printk("skb_push() failed: %d\n", ret);
+	}
+
+	ehdr.h_proto = __constant_htons(ETH_P_IP);
+	memcpy(&ehdr.h_source, &smac, 6);
+	memcpy(&ehdr.h_dest, &dmac, 6);
+
+	ret = bpf_skb_store_bytes(skb, 0, &ehdr, sizeof(ehdr), 0);
+	if (ret < 0) {
+		printk("skb_store_bytes() failed: %d\n", ret);
+		return BPF_DROP;
+	}
+
+	ret = bpf_redirect(ifindex, 0);
+	if (ret < 0) {
+		printk("bpf_redirect() failed: %d\n", ret);
+		return BPF_DROP;
+	}
+
+	return BPF_REDIRECT;
+}
+
+SEC("push_ll_and_redirect_silent")
+int do_push_ll_and_redirect_silent(struct __sk_buff *skb)
+{
+	return __do_push_ll_and_redirect(skb);
+}
+
+SEC("push_ll_and_redirect")
+int do_push_ll_and_redirect(struct __sk_buff *skb)
+{
+	int ret, ifindex = DST_IFINDEX;
+
+	ret = __do_push_ll_and_redirect(skb);
+	if (ret >= 0)
+		printk("redirected to %d\n", ifindex);
+
+	return ret;
+}
+
+SEC("fill_garbage")
+int do_fill_garbage(struct __sk_buff *skb)
+{
+	uint64_t f = 0xFFFFFFFFFFFFFFFF;
+
+	bpf_skb_store_bytes(skb, 0, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 8, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 16, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 24, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 32, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 40, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 48, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 56, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 64, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 72, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 80, &f, sizeof(f), 0);
+	bpf_skb_store_bytes(skb, 88, &f, sizeof(f), 0);
+
+	printk("Set initial 96 bytes of header to FF\n");
+
+	return BPF_OK;
+}
+
+/* Drop all packets */
+SEC("drop_all")
+int do_drop_all(struct __sk_buff *skb)
+{
+	printk("dropping with: %d\n", BPF_DROP);
+	return BPF_DROP;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/test_lwt_bpf.sh b/samples/bpf/test_lwt_bpf.sh
new file mode 100755
index 0000000..d016961
--- /dev/null
+++ b/samples/bpf/test_lwt_bpf.sh
@@ -0,0 +1,385 @@
+#!/bin/bash
+
+# Uncomment to see generated bytecode
+#VERBOSE=verbose
+
+NS1=lwt_ns1
+NS2=lwt_ns2
+VETH0=tst_lwt1a
+VETH1=tst_lwt1b
+VETH2=tst_lwt2a
+VETH3=tst_lwt2b
+IPVETH0="192.168.254.1"
+IPVETH1="192.168.254.2"
+IPVETH1b="192.168.254.3"
+
+IPVETH2="192.168.111.1"
+IPVETH3="192.168.111.2"
+
+IP_LOCAL="192.168.99.1"
+
+TRACE_ROOT=/sys/kernel/debug/tracing
+
+function lookup_mac()
+{
+	set +x
+	if [ ! -z "$2" ]; then
+		MAC=$(ip netns exec $2 ip link show $1 | grep ether | awk '{print $2}')
+	else
+		MAC=$(ip link show $1 | grep ether | awk '{print $2}')
+	fi
+	MAC="${MAC//:/}"
+	echo "0x${MAC:10:2}${MAC:8:2}${MAC:6:2}${MAC:4:2}${MAC:2:2}${MAC:0:2}"
+	set -x
+}
+
+function cleanup {
+        set +ex
+        rm test_lwt_bpf.o 2> /dev/null
+        ip link del $VETH0 2> /dev/null
+        ip link del $VETH1 2> /dev/null
+        ip link del $VETH2 2> /dev/null
+        ip link del $VETH3 2> /dev/null
+	ip netns exec $NS1 killall netserver
+        ip netns delete $NS1 2> /dev/null
+        ip netns delete $NS2 2> /dev/null
+        set -ex
+}
+
+function setup_one_veth {
+        ip netns add $1
+        ip link add $2 type veth peer name $3
+        ip link set dev $2 up
+        ip addr add $4/24 dev $2
+        ip link set $3 netns $1
+        ip netns exec $1 ip link set dev $3 up
+        ip netns exec $1 ip addr add $5/24 dev $3
+
+	if [ "$6" ]; then
+		ip netns exec $1 ip addr add $6/32 dev $3
+	fi
+}
+
+function get_trace {
+	set +x
+        cat ${TRACE_ROOT}/trace | grep -v '^#'
+	set -x
+}
+
+function cleanup_routes {
+	ip route del ${IPVETH1}/32 dev $VETH0 2> /dev/null || true
+	ip route del table local local ${IP_LOCAL}/32 dev lo 2> /dev/null || true
+}
+
+function install_test {
+	cleanup_routes
+	cp /dev/null ${TRACE_ROOT}/trace
+
+	OPTS="encap bpf headroom 14 $1 obj test_lwt_bpf.o section $2 $VERBOSE"
+
+	if [ "$1" == "in" ];  then
+		ip route add table local local ${IP_LOCAL}/32 $OPTS dev lo
+	else
+		ip route add ${IPVETH1}/32 $OPTS dev $VETH0
+	fi
+}
+
+function remove_prog {
+	if [ "$1" == "in" ];  then
+		ip route del table local local ${IP_LOCAL}/32 dev lo
+	else
+		ip route del ${IPVETH1}/32 dev $VETH0
+	fi
+}
+
+function filter_trace {
+	# Add newline to allow starting EXPECT= variables on newline
+	NL=$'\n'
+	echo "${NL}$*" | sed -e 's/^.*: : //g'
+}
+
+function expect_fail {
+	set +x
+	echo "FAIL:"
+	echo "Expected: $1"
+	echo "Got: $2"
+	set -x
+	exit 1
+}
+
+function match_trace {
+	set +x
+	RET=0
+	TRACE=$1
+	EXPECT=$2
+	GOT="$(filter_trace "$TRACE")"
+
+	[ "$GOT" != "$EXPECT" ] && {
+		expect_fail "$EXPECT" "$GOT"
+		RET=1
+	}
+	set -x
+	return $RET
+}
+
+function test_start {
+	set +x
+	echo "----------------------------------------------------------------"
+	echo "Starting test: $*"
+	echo "----------------------------------------------------------------"
+	set -x
+}
+
+function failure {
+	get_trace
+	echo "FAIL: $*"
+	exit 1
+}
+
+function test_ctx_xmit {
+	test_start "test_ctx on lwt xmit"
+	install_test xmit test_ctx
+	ping -c 3 $IPVETH1 || {
+		failure "test_ctx xmit: packets are dropped"
+	}
+	match_trace "$(get_trace)" "
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 0 ifindex $DST_IFINDEX
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 0 ifindex $DST_IFINDEX
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 0 ifindex $DST_IFINDEX" || exit 1
+	remove_prog xmit
+}
+
+function test_ctx_out {
+	test_start "test_ctx on lwt out"
+	install_test out test_ctx
+	ping -c 3 $IPVETH1 || {
+		failure "test_ctx out: packets are dropped"
+	}
+	match_trace "$(get_trace)" "
+len 84 hash 0 protocol 0
+cb 1234 ingress_ifindex 0 ifindex 0
+len 84 hash 0 protocol 0
+cb 1234 ingress_ifindex 0 ifindex 0
+len 84 hash 0 protocol 0
+cb 1234 ingress_ifindex 0 ifindex 0" || exit 1
+	remove_prog out
+}
+
+function test_ctx_in {
+	test_start "test_ctx on lwt in"
+	install_test in test_ctx
+	ping -c 3 $IP_LOCAL || {
+		failure "test_ctx out: packets are dropped"
+	}
+	# We will both request & reply packets as the packets will
+	# be from $IP_LOCAL => $IP_LOCAL
+	match_trace "$(get_trace)" "
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1
+len 84 hash 0 protocol 8
+cb 1234 ingress_ifindex 1 ifindex 1" || exit 1
+	remove_prog in
+}
+
+function test_data {
+	test_start "test_data on lwt $1"
+	install_test $1 test_data
+	ping -c 3 $IPVETH1 || {
+		failure "test_data ${1}: packets are dropped"
+	}
+	match_trace "$(get_trace)" "
+src: 1fea8c0 dst: 2fea8c0
+src: 1fea8c0 dst: 2fea8c0
+src: 1fea8c0 dst: 2fea8c0" || exit 1
+	remove_prog $1
+}
+
+function test_data_in {
+	test_start "test_data on lwt in"
+	install_test in test_data
+	ping -c 3 $IP_LOCAL || {
+		failure "test_data in: packets are dropped"
+	}
+	# We will both request & reply packets as the packets will
+	# be from $IP_LOCAL => $IP_LOCAL
+	match_trace "$(get_trace)" "
+src: 163a8c0 dst: 163a8c0
+src: 163a8c0 dst: 163a8c0
+src: 163a8c0 dst: 163a8c0
+src: 163a8c0 dst: 163a8c0
+src: 163a8c0 dst: 163a8c0
+src: 163a8c0 dst: 163a8c0" || exit 1
+	remove_prog in
+}
+
+function test_cb {
+	test_start "test_cb on lwt $1"
+	install_test $1 test_cb
+	ping -c 3 $IPVETH1 || {
+		failure "test_cb ${1}: packets are dropped"
+	}
+	match_trace "$(get_trace)" "
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0" || exit 1
+	remove_prog $1
+}
+
+function test_cb_in {
+	test_start "test_cb on lwt in"
+	install_test in test_cb
+	ping -c 3 $IP_LOCAL || {
+		failure "test_cb in: packets are dropped"
+	}
+	# We will both request & reply packets as the packets will
+	# be from $IP_LOCAL => $IP_LOCAL
+	match_trace "$(get_trace)" "
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0
+cb0: 0 cb1: 0 cb2: 0
+cb3: 0 cb4: 0" || exit 1
+	remove_prog in
+}
+
+function test_drop_all {
+	test_start "test_drop_all on lwt $1"
+	install_test $1 drop_all
+	ping -c 3 $IPVETH1 && {
+		failure "test_drop_all ${1}: Unexpected success of ping"
+	}
+	match_trace "$(get_trace)" "
+dropping with: 2
+dropping with: 2
+dropping with: 2" || exit 1
+	remove_prog $1
+}
+
+function test_drop_all_in {
+	test_start "test_drop_all on lwt in"
+	install_test in drop_all
+	ping -c 3 $IP_LOCAL && {
+		failure "test_drop_all in: Unexpected success of ping"
+	}
+	match_trace "$(get_trace)" "
+dropping with: 2
+dropping with: 2
+dropping with: 2" || exit 1
+	remove_prog in
+}
+
+function test_push_ll_and_redirect {
+	test_start "test_push_ll_and_redirect on lwt xmit"
+	install_test xmit push_ll_and_redirect
+	ping -c 3 $IPVETH1 || {
+		failure "Redirected packets appear to be dropped"
+	}
+	match_trace "$(get_trace)" "
+redirected to $DST_IFINDEX
+redirected to $DST_IFINDEX
+redirected to $DST_IFINDEX" || exit 1
+	remove_prog xmit
+}
+
+function test_rewrite {
+	test_start "test_rewrite on lwt xmit"
+	install_test xmit test_rewrite
+	ping -c 3 $IPVETH1 || {
+		failure "Rewritten packets appear to be dropped"
+	}
+	match_trace "$(get_trace)" "
+out: rewriting from 2fea8c0 to 3fea8c0
+out: rewriting from 2fea8c0 to 3fea8c0
+out: rewriting from 2fea8c0 to 3fea8c0" || exit 1
+	remove_prog out
+}
+
+function test_fill_garbage {
+	test_start "test_fill_garbage on lwt xmit"
+	install_test xmit fill_garbage
+	ping -c 3 $IPVETH1 && {
+		failure "test_drop_all ${1}: Unexpected success of ping"
+	}
+	match_trace "$(get_trace)" "
+Set initial 96 bytes of header to FF
+Set initial 96 bytes of header to FF
+Set initial 96 bytes of header to FF" || exit 1
+	remove_prog xmit
+}
+
+function test_netperf_nop {
+	test_start "test_netperf_nop on lwt xmit"
+	install_test xmit nop
+	netperf -H $IPVETH1 -t TCP_STREAM || {
+		failure "packets appear to be dropped"
+	}
+	match_trace "$(get_trace)" ""|| exit 1
+	remove_prog xmit
+}
+
+function test_netperf_redirect {
+	test_start "test_netperf_redirect on lwt xmit"
+	install_test xmit push_ll_and_redirect_silent
+	netperf -H $IPVETH1 -t TCP_STREAM || {
+		failure "Rewritten packets appear to be dropped"
+	}
+	match_trace "$(get_trace)" ""|| exit 1
+	remove_prog xmit
+}
+
+cleanup
+setup_one_veth $NS1 $VETH0 $VETH1 $IPVETH0 $IPVETH1 $IPVETH1b
+setup_one_veth $NS2 $VETH2 $VETH3 $IPVETH2 $IPVETH3
+ip netns exec $NS1 netserver
+echo 1 > ${TRACE_ROOT}/tracing_on
+
+DST_MAC=$(lookup_mac $VETH1 $NS1)
+SRC_MAC=$(lookup_mac $VETH0)
+DST_IFINDEX=$(cat /sys/class/net/$VETH0/ifindex)
+
+CLANG_OPTS="-O2 -target bpf -I ../include/"
+CLANG_OPTS+=" -DSRC_MAC=$SRC_MAC -DDST_MAC=$DST_MAC -DDST_IFINDEX=$DST_IFINDEX"
+clang $CLANG_OPTS -c test_lwt_bpf.c -o test_lwt_bpf.o
+
+test_ctx_xmit
+test_ctx_out
+test_ctx_in
+test_data "xmit"
+test_data "out"
+test_data_in
+test_cb "xmit"
+test_cb "out"
+test_cb_in
+test_drop_all "xmit"
+test_drop_all "out"
+test_drop_all_in
+test_rewrite
+test_push_ll_and_redirect
+test_fill_garbage
+test_netperf_nop
+test_netperf_redirect
+
+cleanup
+echo 0 > ${TRACE_ROOT}/tracing_on
+exit 0
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next v3 3/4] bpf: BPF for lightweight tunnel infrastructure
From: Thomas Graf @ 2016-11-29 13:21 UTC (permalink / raw)
  To: davem; +Cc: netdev, alexei.starovoitov, daniel, tom, roopa, hannes
In-Reply-To: <cover.1480424542.git.tgraf@suug.ch>

Registers new BPF program types which correspond to the LWT hooks:
  - BPF_PROG_TYPE_LWT_IN   => dst_input()
  - BPF_PROG_TYPE_LWT_OUT  => dst_output()
  - BPF_PROG_TYPE_LWT_XMIT => lwtunnel_xmit()

The separate program types are required to differentiate between the
capabilities each LWT hook allows:

 * Programs attached to dst_input() or dst_output() are restricted and
   may only read the data of an skb. This prevent modification and
   possible invalidation of already validated packet headers on receive
   and the construction of illegal headers while the IP headers are
   still being assembled.

 * Programs attached to lwtunnel_xmit() are allowed to modify packet
   content as well as prepending an L2 header via a newly introduced
   helper bpf_skb_push(). This is safe as lwtunnel_xmit() is invoked
   after the IP header has been assembled completely.

All BPF programs receive an skb with L3 headers attached and may return
one of the following error codes:

 BPF_OK - Continue routing as per nexthop
 BPF_DROP - Drop skb and return EPERM
 BPF_REDIRECT - Redirect skb to device as per redirect() helper.
                (Only valid in lwtunnel_xmit() context)

The return codes are binary compatible with their TC_ACT_
relatives to ease compatibility.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
 include/linux/filter.h        |   2 +-
 include/uapi/linux/bpf.h      |  32 +++-
 include/uapi/linux/lwtunnel.h |  23 +++
 kernel/bpf/verifier.c         |  14 +-
 net/Kconfig                   |   8 +
 net/core/Makefile             |   1 +
 net/core/filter.c             | 148 +++++++++++++++-
 net/core/lwt_bpf.c            | 397 ++++++++++++++++++++++++++++++++++++++++++
 net/core/lwtunnel.c           |   2 +
 9 files changed, 621 insertions(+), 6 deletions(-)
 create mode 100644 net/core/lwt_bpf.c

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7f246a2..7ba6446 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -438,7 +438,7 @@ struct xdp_buff {
 };
 
 /* compute the linear packet data range [data, data_end) which
- * will be accessed by cls_bpf and act_bpf programs
+ * will be accessed by cls_bpf, act_bpf and lwt programs
  */
 static inline void bpf_compute_data_end(struct sk_buff *skb)
 {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1370a9d..81c02c5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -101,6 +101,9 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_XDP,
 	BPF_PROG_TYPE_PERF_EVENT,
 	BPF_PROG_TYPE_CGROUP_SKB,
+	BPF_PROG_TYPE_LWT_IN,
+	BPF_PROG_TYPE_LWT_OUT,
+	BPF_PROG_TYPE_LWT_XMIT,
 };
 
 enum bpf_attach_type {
@@ -409,6 +412,16 @@ union bpf_attr {
  *
  * int bpf_get_numa_node_id()
  *     Return: Id of current NUMA node.
+ *
+ * int bpf_skb_push()
+ *     Add room to beginning of skb and adjusts MAC header offset accordingly.
+ *     Extends/reallocaes for needed skb headeroom automatically.
+ *     May change skb data pointer and will thus invalidate any check done
+ *     for direct packet access.
+ *     @skb: pointer to skb
+ *     @len: length of header to be pushed in front
+ *     @flags: Flags (unused for now)
+ *     Return: 0 on success or negative error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -453,7 +466,8 @@ union bpf_attr {
 	FN(skb_pull_data),		\
 	FN(csum_update),		\
 	FN(set_hash_invalid),		\
-	FN(get_numa_node_id),
+	FN(get_numa_node_id),		\
+	FN(skb_push),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -537,6 +551,22 @@ struct bpf_tunnel_key {
 	__u32 tunnel_label;
 };
 
+/* Generic BPF return codes which all BPF program types may support.
+ * The values are binary compatible with their TC_ACT_* counter-part to
+ * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT
+ * programs.
+ *
+ * XDP is handled seprately, see XDP_*.
+ */
+enum bpf_ret_code {
+	BPF_OK = 0,
+	/* 1 reserved */
+	BPF_DROP = 2,
+	/* 3-6 reserved */
+	BPF_REDIRECT = 7,
+	/* >127 are reserved for prog type specific return codes */
+};
+
 /* User return codes for XDP prog type.
  * A valid XDP program must return one of these defined values. All other
  * return codes are reserved for future use. Unknown return codes will result
diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index 453cc62..937f660 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -10,6 +10,7 @@ enum lwtunnel_encap_types {
 	LWTUNNEL_ENCAP_ILA,
 	LWTUNNEL_ENCAP_IP6,
 	LWTUNNEL_ENCAP_SEG6,
+	LWTUNNEL_ENCAP_BPF,
 	__LWTUNNEL_ENCAP_MAX,
 };
 
@@ -43,4 +44,26 @@ enum lwtunnel_ip6_t {
 
 #define LWTUNNEL_IP6_MAX (__LWTUNNEL_IP6_MAX - 1)
 
+enum {
+	LWT_BPF_PROG_UNSPEC,
+	LWT_BPF_PROG_FD,
+	LWT_BPF_PROG_NAME,
+	__LWT_BPF_PROG_MAX,
+};
+
+#define LWT_BPF_PROG_MAX (__LWT_BPF_PROG_MAX - 1)
+
+enum {
+	LWT_BPF_UNSPEC,
+	LWT_BPF_IN,
+	LWT_BPF_OUT,
+	LWT_BPF_XMIT,
+	LWT_BPF_XMIT_HEADROOM,
+	__LWT_BPF_MAX,
+};
+
+#define LWT_BPF_MAX (__LWT_BPF_MAX - 1)
+
+#define LWT_BPF_MAX_HEADROOM 128
+
 #endif /* _UAPI_LWTUNNEL_H_ */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8740c5f..8135cb1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -633,12 +633,19 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
 #define MAX_PACKET_OFF 0xffff
 
 static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
-				       const struct bpf_call_arg_meta *meta)
+				       const struct bpf_call_arg_meta *meta,
+				       enum bpf_access_type t)
 {
 	switch (env->prog->type) {
+	case BPF_PROG_TYPE_LWT_IN:
+	case BPF_PROG_TYPE_LWT_OUT:
+		/* dst_input() and dst_output() can't write for now */
+		if (t == BPF_WRITE)
+			return false;
 	case BPF_PROG_TYPE_SCHED_CLS:
 	case BPF_PROG_TYPE_SCHED_ACT:
 	case BPF_PROG_TYPE_XDP:
+	case BPF_PROG_TYPE_LWT_XMIT:
 		if (meta)
 			return meta->pkt_access;
 
@@ -837,7 +844,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 			err = check_stack_read(state, off, size, value_regno);
 		}
 	} else if (state->regs[regno].type == PTR_TO_PACKET) {
-		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) {
+		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
 			verbose("cannot write into packet\n");
 			return -EACCES;
 		}
@@ -970,7 +977,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		return 0;
 	}
 
-	if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) {
+	if (type == PTR_TO_PACKET &&
+	    !may_access_direct_pkt_data(env, meta, BPF_READ)) {
 		verbose("helper access to the packet is not allowed\n");
 		return -EACCES;
 	}
diff --git a/net/Kconfig b/net/Kconfig
index 7b6cd34..a100500 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -402,6 +402,14 @@ config LWTUNNEL
 	  weight tunnel endpoint. Tunnel encapsulation parameters are stored
 	  with light weight tunnel state associated with fib routes.
 
+config LWTUNNEL_BPF
+	bool "Execute BPF program as route nexthop action"
+	depends on LWTUNNEL
+	default y if LWTUNNEL=y
+	---help---
+	  Allows to run BPF programs as a nexthop action following a route
+	  lookup for incoming and outgoing packets.
+
 config DST_CACHE
 	bool
 	default n
diff --git a/net/core/Makefile b/net/core/Makefile
index d6508c2..f6761b6 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
 obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
 obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
 obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
+obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
 obj-$(CONFIG_DST_CACHE) += dst_cache.o
 obj-$(CONFIG_HWBM) += hwbm.o
 obj-$(CONFIG_NET_DEVLINK) += devlink.o
diff --git a/net/core/filter.c b/net/core/filter.c
index 698a262..1dafe37 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2188,6 +2188,50 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+static bool len_fits_mtu(struct sk_buff *skb, u32 len)
+{
+	u32 new_len = skb->len + len;
+
+	if (skb_is_gso(skb))
+		return skb_gso_validate_mtu(skb, skb->dev->mtu - len);
+
+	return new_len <= skb->dev->mtu && new_len >= skb->len;
+}
+
+BPF_CALL_3(bpf_skb_push, struct sk_buff *, skb, __u32, len, u64, flags)
+{
+	if (!len_fits_mtu(skb, len))
+		return -ERANGE;
+
+	if (flags)
+		return -EINVAL;
+
+	if (len > 0) {
+		int ret;
+
+		ret = skb_cow(skb, len);
+		if (unlikely(ret < 0))
+			return ret;
+
+		__skb_push(skb, len);
+		memset(skb->data, 0, len);
+	}
+
+	skb_reset_mac_header(skb);
+	bpf_compute_data_end(skb);
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_push_proto = {
+	.func		= bpf_skb_push,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+};
+
 bool bpf_helper_changes_skb_data(void *func)
 {
 	if (func == bpf_skb_vlan_push ||
@@ -2197,7 +2241,8 @@ bool bpf_helper_changes_skb_data(void *func)
 	    func == bpf_skb_change_tail ||
 	    func == bpf_skb_pull_data ||
 	    func == bpf_l3_csum_replace ||
-	    func == bpf_l4_csum_replace)
+	    func == bpf_l4_csum_replace ||
+	    func == bpf_skb_push)
 		return true;
 
 	return false;
@@ -2639,6 +2684,68 @@ cg_skb_func_proto(enum bpf_func_id func_id)
 	}
 }
 
+static const struct bpf_func_proto *
+lwt_inout_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_skb_load_bytes:
+		return &bpf_skb_load_bytes_proto;
+	case BPF_FUNC_skb_pull_data:
+		return &bpf_skb_pull_data_proto;
+	case BPF_FUNC_csum_diff:
+		return &bpf_csum_diff_proto;
+	case BPF_FUNC_get_cgroup_classid:
+		return &bpf_get_cgroup_classid_proto;
+	case BPF_FUNC_get_route_realm:
+		return &bpf_get_route_realm_proto;
+	case BPF_FUNC_get_hash_recalc:
+		return &bpf_get_hash_recalc_proto;
+	case BPF_FUNC_perf_event_output:
+		return &bpf_skb_event_output_proto;
+	case BPF_FUNC_get_smp_processor_id:
+		return &bpf_get_smp_processor_id_proto;
+	case BPF_FUNC_skb_under_cgroup:
+		return &bpf_skb_under_cgroup_proto;
+	default:
+		return sk_filter_func_proto(func_id);
+	}
+}
+
+static const struct bpf_func_proto *
+lwt_xmit_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_skb_get_tunnel_key:
+		return &bpf_skb_get_tunnel_key_proto;
+	case BPF_FUNC_skb_set_tunnel_key:
+		return bpf_get_skb_set_tunnel_proto(func_id);
+	case BPF_FUNC_skb_get_tunnel_opt:
+		return &bpf_skb_get_tunnel_opt_proto;
+	case BPF_FUNC_skb_set_tunnel_opt:
+		return bpf_get_skb_set_tunnel_proto(func_id);
+	case BPF_FUNC_redirect:
+		return &bpf_redirect_proto;
+	case BPF_FUNC_clone_redirect:
+		return &bpf_clone_redirect_proto;
+	case BPF_FUNC_skb_change_tail:
+		return &bpf_skb_change_tail_proto;
+	case BPF_FUNC_skb_push:
+		return &bpf_skb_push_proto;
+	case BPF_FUNC_skb_store_bytes:
+		return &bpf_skb_store_bytes_proto;
+	case BPF_FUNC_csum_update:
+		return &bpf_csum_update_proto;
+	case BPF_FUNC_l3_csum_replace:
+		return &bpf_l3_csum_replace_proto;
+	case BPF_FUNC_l4_csum_replace:
+		return &bpf_l4_csum_replace_proto;
+	case BPF_FUNC_set_hash_invalid:
+		return &bpf_set_hash_invalid_proto;
+	default:
+		return lwt_inout_func_proto(func_id);
+	}
+}
+
 static bool __is_valid_access(int off, int size, enum bpf_access_type type)
 {
 	if (off < 0 || off >= sizeof(struct __sk_buff))
@@ -3007,6 +3114,27 @@ static const struct bpf_verifier_ops cg_skb_ops = {
 	.convert_ctx_access	= sk_filter_convert_ctx_access,
 };
 
+static const struct bpf_verifier_ops lwt_in_ops = {
+	.get_func_proto		= lwt_inout_func_proto,
+	.is_valid_access	= tc_cls_act_is_valid_access,
+	.convert_ctx_access	= sk_filter_convert_ctx_access,
+	.gen_prologue		= tc_cls_act_prologue,
+};
+
+static const struct bpf_verifier_ops lwt_out_ops = {
+	.get_func_proto		= lwt_inout_func_proto,
+	.is_valid_access	= tc_cls_act_is_valid_access,
+	.convert_ctx_access	= sk_filter_convert_ctx_access,
+	.gen_prologue		= tc_cls_act_prologue,
+};
+
+static const struct bpf_verifier_ops lwt_xmit_ops = {
+	.get_func_proto		= lwt_xmit_func_proto,
+	.is_valid_access	= tc_cls_act_is_valid_access,
+	.convert_ctx_access	= sk_filter_convert_ctx_access,
+	.gen_prologue		= tc_cls_act_prologue,
+};
+
 static struct bpf_prog_type_list sk_filter_type __read_mostly = {
 	.ops	= &sk_filter_ops,
 	.type	= BPF_PROG_TYPE_SOCKET_FILTER,
@@ -3032,6 +3160,21 @@ static struct bpf_prog_type_list cg_skb_type __read_mostly = {
 	.type	= BPF_PROG_TYPE_CGROUP_SKB,
 };
 
+static struct bpf_prog_type_list lwt_in_type __read_mostly = {
+	.ops	= &lwt_in_ops,
+	.type	= BPF_PROG_TYPE_LWT_IN,
+};
+
+static struct bpf_prog_type_list lwt_out_type __read_mostly = {
+	.ops	= &lwt_out_ops,
+	.type	= BPF_PROG_TYPE_LWT_OUT,
+};
+
+static struct bpf_prog_type_list lwt_xmit_type __read_mostly = {
+	.ops	= &lwt_xmit_ops,
+	.type	= BPF_PROG_TYPE_LWT_XMIT,
+};
+
 static int __init register_sk_filter_ops(void)
 {
 	bpf_register_prog_type(&sk_filter_type);
@@ -3039,6 +3182,9 @@ static int __init register_sk_filter_ops(void)
 	bpf_register_prog_type(&sched_act_type);
 	bpf_register_prog_type(&xdp_type);
 	bpf_register_prog_type(&cg_skb_type);
+	bpf_register_prog_type(&lwt_in_type);
+	bpf_register_prog_type(&lwt_out_type);
+	bpf_register_prog_type(&lwt_xmit_type);
 
 	return 0;
 }
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
new file mode 100644
index 0000000..fdbf6f4
--- /dev/null
+++ b/net/core/lwt_bpf.c
@@ -0,0 +1,397 @@
+/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <net/lwtunnel.h>
+
+struct bpf_lwt_prog {
+	struct bpf_prog *prog;
+	char *name;
+};
+
+struct bpf_lwt {
+	struct bpf_lwt_prog in;
+	struct bpf_lwt_prog out;
+	struct bpf_lwt_prog xmit;
+	int family;
+};
+
+#define MAX_PROG_NAME 256
+
+static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
+{
+	return (struct bpf_lwt *)lwt->data;
+}
+
+#define NO_REDIRECT false
+#define CAN_REDIRECT true
+
+static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
+		       struct dst_entry *dst, bool can_redirect)
+{
+	int ret;
+
+	/* Preempt disable is needed to protect per-cpu redirect_info between
+	 * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
+	 * access to maps strictly require a rcu_read_lock() for protection,
+	 * mixing with BH RCU lock doesn't work.
+	 */
+	preempt_disable();
+	rcu_read_lock();
+	bpf_compute_data_end(skb);
+	ret = BPF_PROG_RUN(lwt->prog, skb);
+	rcu_read_unlock();
+
+	switch (ret) {
+	case BPF_OK:
+		break;
+
+	case BPF_REDIRECT:
+		if (!can_redirect) {
+			WARN_ONCE(1, "Illegal redirect return code in prog %s\n",
+				  lwt->name ? : "<unknown>");
+			ret = BPF_OK;
+		} else {
+			ret = skb_do_redirect(skb);
+			if (ret == 0)
+				ret = BPF_REDIRECT;
+		}
+		break;
+
+	case BPF_DROP:
+		kfree_skb(skb);
+		ret = -EPERM;
+		break;
+
+	default:
+		WARN_ONCE(1, "Illegal LWT BPF return value %u, expect packet loss\n",
+			  ret);
+		kfree_skb(skb);
+		ret = -EINVAL;
+		break;
+	}
+
+	preempt_enable();
+
+	return ret;
+}
+
+static int bpf_input(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct bpf_lwt *bpf;
+	int ret;
+
+	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+	if (bpf->in.prog) {
+		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (unlikely(!dst->lwtstate->orig_input)) {
+		WARN_ONCE(1, "orig_input not set on dst for prog %s\n",
+			  bpf->out.name);
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	return dst->lwtstate->orig_input(skb);
+}
+
+static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct bpf_lwt *bpf;
+	int ret;
+
+	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+	if (bpf->out.prog) {
+		ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (unlikely(!dst->lwtstate->orig_output)) {
+		WARN_ONCE(1, "orig_output not set on dst for prog %s\n",
+			  bpf->out.name);
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	return dst->lwtstate->orig_output(net, sk, skb);
+}
+
+static int xmit_check_hhlen(struct sk_buff *skb)
+{
+	int hh_len = skb_dst(skb)->dev->hard_header_len;
+
+	if (skb_headroom(skb) < hh_len) {
+		int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
+
+		if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC))
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int bpf_xmit(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct bpf_lwt *bpf;
+
+	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
+	if (bpf->xmit.prog) {
+		int ret;
+
+		ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
+		switch (ret) {
+		case BPF_OK:
+			/* If the L3 header was expanded, headroom might be too
+			 * small for L2 header now, expand as needed.
+			 */
+			ret = xmit_check_hhlen(skb);
+			if (unlikely(ret))
+				return ret;
+
+			return LWTUNNEL_XMIT_CONTINUE;
+		case BPF_REDIRECT:
+			return LWTUNNEL_XMIT_DONE;
+		default:
+			return ret;
+		}
+	}
+
+	return LWTUNNEL_XMIT_CONTINUE;
+}
+
+static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
+{
+	if (prog->prog)
+		bpf_prog_put(prog->prog);
+
+	kfree(prog->name);
+}
+
+static void bpf_destroy_state(struct lwtunnel_state *lwt)
+{
+	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+	bpf_lwt_prog_destroy(&bpf->in);
+	bpf_lwt_prog_destroy(&bpf->out);
+	bpf_lwt_prog_destroy(&bpf->xmit);
+}
+
+static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
+	[LWT_BPF_PROG_FD] = { .type = NLA_U32, },
+	[LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
+				.len = MAX_PROG_NAME },
+};
+
+static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
+			  enum bpf_prog_type type)
+{
+	struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
+	struct bpf_prog *p;
+	int ret;
+	u32 fd;
+
+	ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
+		return -EINVAL;
+
+	prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL);
+	if (!prog->name)
+		return -ENOMEM;
+
+	fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
+	p = bpf_prog_get_type(fd, type);
+	if (IS_ERR(p))
+		return PTR_ERR(p);
+
+	prog->prog = p;
+
+	return 0;
+}
+
+static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
+	[LWT_BPF_IN]		= { .type = NLA_NESTED, },
+	[LWT_BPF_OUT]		= { .type = NLA_NESTED, },
+	[LWT_BPF_XMIT]		= { .type = NLA_NESTED, },
+	[LWT_BPF_XMIT_HEADROOM]	= { .type = NLA_U32 },
+};
+
+static int bpf_build_state(struct net_device *dev, struct nlattr *nla,
+			   unsigned int family, const void *cfg,
+			   struct lwtunnel_state **ts)
+{
+	struct nlattr *tb[LWT_BPF_MAX + 1];
+	struct lwtunnel_state *newts;
+	struct bpf_lwt *bpf;
+	int ret;
+
+	if (family != AF_INET && family != AF_INET6)
+		return -EAFNOSUPPORT;
+
+	ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
+		return -EINVAL;
+
+	newts = lwtunnel_state_alloc(sizeof(*bpf));
+	if (!newts)
+		return -ENOMEM;
+
+	newts->type = LWTUNNEL_ENCAP_BPF;
+	bpf = bpf_lwt_lwtunnel(newts);
+
+	if (tb[LWT_BPF_IN]) {
+		newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
+		ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
+				     BPF_PROG_TYPE_LWT_IN);
+		if (ret  < 0)
+			goto errout;
+	}
+
+	if (tb[LWT_BPF_OUT]) {
+		newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+		ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
+				     BPF_PROG_TYPE_LWT_OUT);
+		if (ret < 0)
+			goto errout;
+	}
+
+	if (tb[LWT_BPF_XMIT]) {
+		newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
+		ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
+				     BPF_PROG_TYPE_LWT_XMIT);
+		if (ret < 0)
+			goto errout;
+	}
+
+	if (tb[LWT_BPF_XMIT_HEADROOM]) {
+		u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]);
+
+		if (headroom > LWT_BPF_MAX_HEADROOM) {
+			ret = -ERANGE;
+			goto errout;
+		}
+
+		newts->headroom = headroom;
+	}
+
+	bpf->family = family;
+	*ts = newts;
+
+	return 0;
+
+errout:
+	bpf_destroy_state(newts);
+	kfree(newts);
+	return ret;
+}
+
+static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
+			     struct bpf_lwt_prog *prog)
+{
+	struct nlattr *nest;
+
+	if (!prog->prog)
+		return 0;
+
+	nest = nla_nest_start(skb, attr);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (prog->name &&
+	    nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
+		return -EMSGSIZE;
+
+	return nla_nest_end(skb, nest);
+}
+
+static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
+{
+	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);
+
+	if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
+	    bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
+	    bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+	int nest_len = nla_total_size(sizeof(struct nlattr)) +
+		       nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
+		       0;
+
+	return nest_len + /* LWT_BPF_IN */
+	       nest_len + /* LWT_BPF_OUT */
+	       nest_len + /* LWT_BPF_XMIT */
+	       0;
+}
+
+int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
+{
+	/* FIXME:
+	 * The LWT state is currently rebuilt for delete requests which
+	 * results in a new bpf_prog instance. Comparing names for now.
+	 */
+	if (!a->name && !b->name)
+		return 0;
+
+	if (!a->name || !b->name)
+		return 1;
+
+	return strcmp(a->name, b->name);
+}
+
+static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+	struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
+	struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);
+
+	return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
+	       bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
+	       bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
+}
+
+static const struct lwtunnel_encap_ops bpf_encap_ops = {
+	.build_state	= bpf_build_state,
+	.destroy_state	= bpf_destroy_state,
+	.input		= bpf_input,
+	.output		= bpf_output,
+	.xmit		= bpf_xmit,
+	.fill_encap	= bpf_fill_encap_info,
+	.get_encap_size = bpf_encap_nlsize,
+	.cmp_encap	= bpf_encap_cmp,
+};
+
+static int __init bpf_lwt_init(void)
+{
+	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
+}
+
+subsys_initcall(bpf_lwt_init)
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 03976e9..a5d4e86 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -41,6 +41,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
 		return "ILA";
 	case LWTUNNEL_ENCAP_SEG6:
 		return "SEG6";
+	case LWTUNNEL_ENCAP_BPF:
+		return "BPF";
 	case LWTUNNEL_ENCAP_IP6:
 	case LWTUNNEL_ENCAP_IP:
 	case LWTUNNEL_ENCAP_NONE:
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next v3 2/4] route: Set lwtstate for local traffic and cached input dsts
From: Thomas Graf @ 2016-11-29 13:21 UTC (permalink / raw)
  To: davem; +Cc: netdev, alexei.starovoitov, daniel, tom, roopa, hannes
In-Reply-To: <cover.1480424542.git.tgraf@suug.ch>

A route on the output path hitting a RTN_LOCAL route will keep the dst
associated on its way through the loopback device. On the receive path,
the dst_input() call will thus invoke the input handler of the route
created in the output path. Thus, lwt redirection for input must be done
for dsts allocated in the otuput path as well.

Also, if a route is cached in the input path, the allocated dst should
respect lwtunnel configuration on the nexthop as well.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
 net/ipv4/route.c | 39 ++++++++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 0c39b62..f4c4d7a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1602,6 +1602,19 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
 	spin_unlock_bh(&fnhe_lock);
 }
 
+static void set_lwt_redirect(struct rtable *rth)
+{
+	if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
+		rth->dst.lwtstate->orig_output = rth->dst.output;
+		rth->dst.output = lwtunnel_output;
+	}
+
+	if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
+		rth->dst.lwtstate->orig_input = rth->dst.input;
+		rth->dst.input = lwtunnel_input;
+	}
+}
+
 /* called in rcu_read_lock() section */
 static int __mkroute_input(struct sk_buff *skb,
 			   const struct fib_result *res,
@@ -1691,14 +1704,7 @@ static int __mkroute_input(struct sk_buff *skb,
 	rth->dst.input = ip_forward;
 
 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
-	if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
-		rth->dst.lwtstate->orig_output = rth->dst.output;
-		rth->dst.output = lwtunnel_output;
-	}
-	if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
-		rth->dst.lwtstate->orig_input = rth->dst.input;
-		rth->dst.input = lwtunnel_input;
-	}
+	set_lwt_redirect(rth);
 	skb_dst_set(skb, &rth->dst);
 out:
 	err = 0;
@@ -1925,8 +1931,18 @@ out:	return err;
 		rth->dst.error= -err;
 		rth->rt_flags 	&= ~RTCF_LOCAL;
 	}
+
 	if (do_cache) {
-		if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
+		struct fib_nh *nh = &FIB_RES_NH(res);
+
+		rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
+		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
+			WARN_ON(rth->dst.input == lwtunnel_input);
+			rth->dst.lwtstate->orig_input = rth->dst.input;
+			rth->dst.input = lwtunnel_input;
+		}
+
+		if (unlikely(!rt_cache_route(nh, rth))) {
 			rth->dst.flags |= DST_NOCACHE;
 			rt_add_uncached_list(rth);
 		}
@@ -2154,10 +2170,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	}
 
 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
-	if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
-		rth->dst.lwtstate->orig_output = rth->dst.output;
-		rth->dst.output = lwtunnel_output;
-	}
+	set_lwt_redirect(rth);
 
 	return rth;
 }
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next v3 1/4] route: Set orig_output when redirecting to lwt on locally generated traffic
From: Thomas Graf @ 2016-11-29 13:21 UTC (permalink / raw)
  To: davem; +Cc: netdev, alexei.starovoitov, daniel, tom, roopa, hannes
In-Reply-To: <cover.1480424542.git.tgraf@suug.ch>

orig_output for IPv4 was only set for dsts which hit an input route.
Set it consistently for locally generated traffic as well to allow
lwt to continue the dst_output() path as configured by the nexthop.

Fixes: 2536862311d ("lwt: Add support to redirect dst.input")
Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
 net/ipv4/route.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d37fc6f..0c39b62 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2154,8 +2154,10 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 	}
 
 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
-	if (lwtunnel_output_redirect(rth->dst.lwtstate))
+	if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
+		rth->dst.lwtstate->orig_output = rth->dst.output;
 		rth->dst.output = lwtunnel_output;
+	}
 
 	return rth;
 }
-- 
2.7.4

^ permalink raw reply related

* [PATCH v3 net-next 0/4] bpf: BPF for lightweight tunnel encapsulation
From: Thomas Graf @ 2016-11-29 13:21 UTC (permalink / raw)
  To: davem; +Cc: netdev, alexei.starovoitov, daniel, tom, roopa, hannes

This series implements BPF program invocation from dst entries via the
lightweight tunnels infrastructure. The BPF program can be attached to
lwtunnel_input(), lwtunnel_output() or lwtunnel_xmit() and see an L3
skb as context. Programs attached to input and output are read-only.
Programs attached to lwtunnel_xmit() can modify and redirect, push headers
and redirect packets.

The facility can be used to:
 - Collect statistics and generate sampling data for a subset of traffic
   based on the dst utilized by the packet thus allowing to extend the
   existing realms.
 - Apply additional per route/dst filters to prohibit certain outgoing
   or incoming packets based on BPF filters. In particular, this allows
   to maintain per dst custom state across multiple packets in BPF maps
   and apply filters based on statistics and behaviour observed over time.
 - Attachment of L2 headers at transmit where resolving the L2 address
   is not required.
 - Possibly many more.

v2 -> v3:
 - Added real world sample lwt_len_hist_kern.c which demonstrates how to
   collect a histogram on packet sizes for all packets flowing through
   a number of routes.
 - Restricted output to be read-only. Since the header can no longer
   be modified, the rerouting functionality has been removed again.
 - Added test case which cover destructive modification of packet data.

v1 -> v2:
 - Added new BPF_LWT_REROUTE return code for program to indicate
   that new route lookup should be performed. Suggested by Tom.
 - New sample to illustrate rerouting
 - New patch 05: Recursion limit for lwtunnel_output for the case
   when user creates circular dst redirection. Also resolves the
   issue for ILA.
 - Fix to ensure headroom for potential future L2 header is still
   guaranteed


Thomas Graf (4):
  route: Set orig_output when redirecting to lwt on locally generated
    traffic
  route: Set lwtstate for local traffic and cached input dsts
  bpf: BPF for lightweight tunnel infrastructure
  bpf: Add tests and samples for LWT-BPF

 include/linux/filter.h          |   2 +-
 include/uapi/linux/bpf.h        |  32 +++-
 include/uapi/linux/lwtunnel.h   |  23 +++
 kernel/bpf/verifier.c           |  14 +-
 net/Kconfig                     |   8 +
 net/core/Makefile               |   1 +
 net/core/filter.c               | 148 ++++++++++++++-
 net/core/lwt_bpf.c              | 397 ++++++++++++++++++++++++++++++++++++++++
 net/core/lwtunnel.c             |   2 +
 net/ipv4/route.c                |  37 ++--
 samples/bpf/Makefile            |   4 +
 samples/bpf/bpf_helpers.h       |   4 +
 samples/bpf/lwt_len_hist.sh     |  37 ++++
 samples/bpf/lwt_len_hist_kern.c |  82 +++++++++
 samples/bpf/lwt_len_hist_user.c |  76 ++++++++
 samples/bpf/test_lwt_bpf.c      | 247 +++++++++++++++++++++++++
 samples/bpf/test_lwt_bpf.sh     | 385 ++++++++++++++++++++++++++++++++++++++
 17 files changed, 1482 insertions(+), 17 deletions(-)
 create mode 100644 net/core/lwt_bpf.c
 create mode 100755 samples/bpf/lwt_len_hist.sh
 create mode 100644 samples/bpf/lwt_len_hist_kern.c
 create mode 100644 samples/bpf/lwt_len_hist_user.c
 create mode 100644 samples/bpf/test_lwt_bpf.c
 create mode 100755 samples/bpf/test_lwt_bpf.sh

-- 
2.7.4

^ permalink raw reply

* [PATCH V2 net-next] net: hns: Fix to conditionally convey RX checksum flag to stack
From: Salil Mehta @ 2016-11-29 13:09 UTC (permalink / raw)
  To: davem
  Cc: salil.mehta, yisen.zhuang, mehta.salil.lnk, netdev, linux-kernel,
	linuxarm

This patch introduces the RX checksum function to check the
status of the hardware calculated checksum and its error and
appropriately convey status to the upper stack in skb->ip_summed
field.

We only support checksum for IPv4, UDP(over IPv4 or IPv6),
TCP(over IPv4 or IPv6) and SCTP but we support many L3(IPv4, IPv6,
MPLS, PPPoE etc) and L4(TCP, UDP, GRE, SCTP, IGMP, ICMP etc.)
protocols. We want to filter out L3 and L4 protocols early on for
which checksum is not supported.

Our present hardware RX Descriptor lacks L3/L4 "Checksum
Status & Error" bit (indicating whether checksum was calculated
and if there was an error encountered) for the supported protocol
received in the packet. Therefore, we do the following:
1. Filter the protocols for which checksum is not supported.
2. Check if there were any errors encountered in L3 or L4
   protocols. These errors might not just be Checksum errors but
   could be related to version, length of IPv4, UDP, TCP etc.
   2a. If L3 Errors amd L4 Errors exists, then return as our RX
       descriptor lacks Status-and-Error bits for checksum so
       cannot identify specifically if error was because of
       checksum error or other error for this packet.
   2b. If above errors do not exists, then we set checksum
       un-necessary for upper layers.

Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
---
Change Log:
Patch V2: Addressed the comment by David Miller
          Link: https://www.spinics.net/lists/netdev/msg406697.html
Patch V1: This patch is a result of the comments given by
          David Miller <davem@davemloft.net>
          Link: https://lkml.org/lkml/2016/6/15/27
---
 drivers/net/ethernet/hisilicon/hns/hnae.h     |    2 +
 drivers/net/ethernet/hisilicon/hns/hns_enet.c |   71 ++++++++++++++++++++++---
 2 files changed, 66 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns/hnae.h b/drivers/net/ethernet/hisilicon/hns/hnae.h
index 09602f1..8016854 100644
--- a/drivers/net/ethernet/hisilicon/hns/hnae.h
+++ b/drivers/net/ethernet/hisilicon/hns/hnae.h
@@ -99,6 +99,8 @@ enum hnae_led_state {
 #define HNS_RX_FLAG_L3ID_IPV6 0x1
 #define HNS_RX_FLAG_L4ID_UDP 0x0
 #define HNS_RX_FLAG_L4ID_TCP 0x1
+#define HNS_RX_FLAG_L4ID_SCTP 0x3
+
 
 #define HNS_TXD_ASID_S 0
 #define HNS_TXD_ASID_M (0xff << HNS_TXD_ASID_S)
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
index 255fede..6450d0e 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
@@ -565,6 +565,66 @@ static void get_rx_desc_bnum(u32 bnum_flag, int *out_bnum)
 				   HNS_RXD_BUFNUM_M, HNS_RXD_BUFNUM_S);
 }
 
+static void hns_nic_rx_checksum(struct hns_nic_ring_data *ring_data,
+				struct sk_buff *skb, u32 flag)
+{
+	struct net_device *netdev = ring_data->napi.dev;
+	u32 l3id;
+	u32 l4id;
+
+	/* check if RX checksum offload is enabled */
+	if (unlikely(!(netdev->features & NETIF_F_RXCSUM)))
+		return;
+
+	/* We only support checksum for IPv4,UDP(over IPv4 or IPv6), TCP(over
+	 * IPv4 or IPv6) and SCTP but we support many L3(IPv4, IPv6, MPLS,
+	 * PPPoE etc) and L4(TCP, UDP, GRE, SCTP, IGMP, ICMP etc.) protocols.
+	 * We want to filter out L3 and L4 protocols early on for which checksum
+	 * is not supported.
+	 *
+	 * Our present hardware RX Descriptor lacks L3/L4 "Checksum Status &
+	 * Error" bit (indicating whether checksum was calculated and if there
+	 * was an error encountered) for the supported protocol received in the
+	 * packet. Therefore, we do the following:
+	 * 1. Filter the protocols for which checksum is not supported.
+	 * 2. Check if there were any errors encountered in L3 or L4 protocols.
+	 *    These errors might not just be Checksum errors but could be
+	 *    related to version, length of IPv4, UDP, TCP etc.
+	 *    2a. If L3 Errors amd L4 Errors exists, then return as our RX
+	 *        descriptor lacks Status-and-Error bits for checksum so cannot
+	 *        identify specifically if error was because of checksum error
+	 *        or other error for this packet.
+	 *    2b. If above errors do not exists, then we set checksum
+	 *        un-necessary for upper layers.
+	 */
+	l3id = hnae_get_field(flag, HNS_RXD_L3ID_M, HNS_RXD_L3ID_S);
+	l4id = hnae_get_field(flag, HNS_RXD_L4ID_M, HNS_RXD_L4ID_S);
+	if ((l3id != HNS_RX_FLAG_L3ID_IPV4) &&
+	    ((l3id != HNS_RX_FLAG_L3ID_IPV6) ||
+	     (l4id != HNS_RX_FLAG_L4ID_UDP)) &&
+	    ((l3id != HNS_RX_FLAG_L3ID_IPV6) ||
+	     (l4id != HNS_RX_FLAG_L4ID_TCP)) &&
+	    (l4id != HNS_RX_FLAG_L4ID_SCTP))
+		return;
+
+	/* We do not support checksum of fragmented packets */
+	if (unlikely(hnae_get_bit(flag, HNS_RXD_FRAG_B)))
+		return;
+
+	/* Check if there are any L3/L4 errors encountered during RX checksum */
+	if (unlikely(hnae_get_bit(flag, HNS_RXD_L3E_B) ||
+		     hnae_get_bit(flag, HNS_RXD_L4E_B))) {
+		/* we dont have any way to detect if this is checksum error or
+		 * any specific protocol error. Therefore, return no checksum
+		 * all protocol errors.
+		 */
+		return;
+	}
+
+	/* Now, this is a packet with valid RX checksum */
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+}
+
 static int hns_nic_poll_rx_skb(struct hns_nic_ring_data *ring_data,
 			       struct sk_buff **out_skb, int *out_bnum)
 {
@@ -683,13 +743,10 @@ static int hns_nic_poll_rx_skb(struct hns_nic_ring_data *ring_data,
 	ring->stats.rx_pkts++;
 	ring->stats.rx_bytes += skb->len;
 
-	if (unlikely(hnae_get_bit(bnum_flag, HNS_RXD_L3E_B) ||
-		     hnae_get_bit(bnum_flag, HNS_RXD_L4E_B))) {
-		ring->stats.l3l4_csum_err++;
-		return 0;
-	}
-
-	skb->ip_summed = CHECKSUM_UNNECESSARY;
+	/* indicate to upper stack if our hardware has already calculated
+	 * the RX checksum
+	 */
+	hns_nic_rx_checksum(ring_data, skb, bnum_flag);
 
 	return 0;
 }
-- 
1.7.9.5

^ permalink raw reply related

* Re: [PATCH v3 net-next 1/2] net: ethernet: slicoss: add slicoss gigabit ethernet driver
From: Markus Böhme @ 2016-11-29 12:15 UTC (permalink / raw)
  To: Lino Sanfilippo, davem, charrer, liodot, gregkh, andrew
  Cc: devel, netdev, linux-kernel
In-Reply-To: <9a789bda-7d98-fa2c-8759-e046c6dd5145@gmx.de>

On 11/28/2016 09:46 PM, Lino Sanfilippo wrote:
> Hi Markus,
> 
> On 27.11.2016 18:59, Markus Böhme wrote:
>> Hello Lino,
>>
>> just some things barely worth mentioning:
>>
> 
>>
>> I found a bunch of unused #defines in slic.h. I cannot judge if they are
>> worth keeping:
>>
>> 	SLIC_VRHSTATB_LONGE
>> 	SLIC_VRHSTATB_PREA
>> 	SLIC_ISR_IO
>> 	SLIC_ISR_PING_MASK
>> 	SLIC_GIG_SPEED_MASK
>> 	SLIC_GMCR_RESET
>> 	SLIC_XCR_RESET
>> 	SLIC_XCR_XMTEN
>> 	SLIC_XCR_PAUSEEN
>> 	SLIC_XCR_LOADRNG
>> 	SLIC_REG_DBAR
>> 	SLIC_REG_PING
>> 	SLIC_REG_DUMP_CMD
>> 	SLIC_REG_DUMP_DATA
>> 	SLIC_REG_WRHOSTID
>> 	SLIC_REG_LOW_POWER
>> 	SLIC_REG_RESET_IFACE
>> 	SLIC_REG_ADDR_UPPER
>> 	SLIC_REG_HBAR64
>> 	SLIC_REG_DBAR64
>> 	SLIC_REG_CBAR64
>> 	SLIC_REG_RBAR64
>> 	SLIC_REG_WRVLANID
>> 	SLIC_REG_READ_XF_INFO
>> 	SLIC_REG_WRITE_XF_INFO
>> 	SLIC_REG_TICKS_PER_SEC
>>
>> These device IDs are not used, either, but maybe it's good to keep them
>> for documentation purposes:
>>
>> 	PCI_SUBDEVICE_ID_ALACRITECH_1000X1_2
>> 	PCI_SUBDEVICE_ID_ALACRITECH_SES1001T
>> 	PCI_SUBDEVICE_ID_ALACRITECH_SEN2002XT
>> 	PCI_SUBDEVICE_ID_ALACRITECH_SEN2001XT
>> 	PCI_SUBDEVICE_ID_ALACRITECH_SEN2104ET
>> 	PCI_SUBDEVICE_ID_ALACRITECH_SEN2102ET
>>
> 
> I left these defines in for both documentation and to avoid gaps in
> register ranges. I would like to keep this as it is.

Seems reasonable.

[...]

>>> +static int slic_init_tx_queue(struct slic_device *sdev)
>>> +{
>>> +	struct slic_tx_queue *txq = &sdev->txq;
>>> +	struct slic_tx_buffer *buff;
>>> +	struct slic_tx_desc *desc;
>>> +	int err;
>>> +	int i;
>>
>> You could make i unsigned...
>>
> 
>>> +
>>> +	txq->len = SLIC_NUM_TX_DESCS;
>>> +	txq->put_idx = 0;
>>> +	txq->done_idx = 0;
>>> +
>>> +	txq->txbuffs = kcalloc(txq->len, sizeof(*buff), GFP_KERNEL);
>>> +	if (!txq->txbuffs)
>>> +		return -ENOMEM;
>>> +
>>> +	txq->dma_pool = dma_pool_create("slic_pool", &sdev->pdev->dev,
>>> +					sizeof(*desc), SLIC_TX_DESC_ALIGN,
>>> +					4096);
>>> +	if (!txq->dma_pool) {
>>> +		err = -ENOMEM;
>>> +		netdev_err(sdev->netdev, "failed to create dma pool\n");
>>> +		goto free_buffs;
>>> +	}
>>> +
>>> +	for (i = 0; i < txq->len; i++) {
>>
>> ...to fix a signed/unsigned comparison warning here, but...
>>
>>> +		buff = &txq->txbuffs[i];
>>> +		desc = dma_pool_zalloc(txq->dma_pool, GFP_KERNEL,
>>> +				       &buff->desc_paddr);
>>> +		if (!desc) {
>>> +			netdev_err(sdev->netdev,
>>> +				   "failed to alloc pool chunk (%i)\n", i);
>>> +			err = -ENOMEM;
>>> +			goto free_descs;
>>> +		}
>>> +
>>> +		desc->hnd = cpu_to_le32((u32)(i + 1));
>>> +		desc->cmd = SLIC_CMD_XMT_REQ;
>>> +		desc->flags = 0;
>>> +		desc->type = cpu_to_le32(SLIC_CMD_TYPE_DUMB);
>>> +		buff->desc = desc;
>>> +	}
>>> +
>>> +	return 0;
>>> +
>>> +free_descs:
>>> +	while (i--) {
>>
>> ...this would require reworking this logic to prevent an endless loop,
>> so probably not worth bothering, considering that txq->len is well
>> within the positive signed range.
> 
> AFAICS the logic does not have to be changed. The while loop will also work
> fine if "i" is unsigned.
> 

My bad! Of course you are right.


Regards,
Markus

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox