Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 2/6] nfp: drop rx_ring param from buffer allocation
From: Jakub Kicinski @ 2017-04-28  4:06 UTC (permalink / raw)
  To: netdev; +Cc: oss-drivers, kubakici, Jakub Kicinski
In-Reply-To: <20170428040620.8256-1-jakub.kicinski@netronome.com>

We will soon allocate RX buffers for caching on XDP TX rings.
The rx_ring parameter passed to nfp_net_rx_alloc_one() is not
actually used, remove it.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 0e87b446e98f..fc0652d175fc 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1153,16 +1153,13 @@ nfp_net_free_frag(void *frag, bool xdp)
 /**
  * nfp_net_rx_alloc_one() - Allocate and map page frag for RX
  * @dp:		NFP Net data path struct
- * @rx_ring:	RX ring structure of the skb
  * @dma_addr:	Pointer to storage for DMA address (output param)
  *
  * This function will allcate a new page frag, map it for DMA.
  *
  * Return: allocated page frag or NULL on failure.
  */
-static void *
-nfp_net_rx_alloc_one(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring,
-		     dma_addr_t *dma_addr)
+static void *nfp_net_rx_alloc_one(struct nfp_net_dp *dp, dma_addr_t *dma_addr)
 {
 	void *frag;
 
@@ -1317,8 +1314,7 @@ nfp_net_rx_ring_bufs_alloc(struct nfp_net_dp *dp,
 	rxbufs = rx_ring->rxbufs;
 
 	for (i = 0; i < rx_ring->cnt - 1; i++) {
-		rxbufs[i].frag =
-			nfp_net_rx_alloc_one(dp, rx_ring, &rxbufs[i].dma_addr);
+		rxbufs[i].frag = nfp_net_rx_alloc_one(dp, &rxbufs[i].dma_addr);
 		if (!rxbufs[i].frag) {
 			nfp_net_rx_ring_bufs_free(dp, rx_ring);
 			return -ENOMEM;
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 3/6] nfp: do simple XDP TX buffer recycling
From: Jakub Kicinski @ 2017-04-28  4:06 UTC (permalink / raw)
  To: netdev; +Cc: oss-drivers, kubakici, Jakub Kicinski
In-Reply-To: <20170428040620.8256-1-jakub.kicinski@netronome.com>

On the RX path we follow the "drop if allocation of replacement
buffer fails" rule.  With XDP we extended that to the TX action,
so if XDP prog returned TX but allocation of replacement RX buffer
failed, we will drop the packet.

To improve our XDP TX performance extend the idea of rings being
always full to XDP TX rings.  Pre-fill the XDP TX rings with RX
buffers, and when XDP prog returns TX action swap the RX buffer
with the next buffer from the TX ring.

XDP TX complete will no longer free the buffers but let them
sit on the TX ring and wait for swap with RX buffer, instead.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/nfp_net.h       |   2 +
 .../net/ethernet/netronome/nfp/nfp_net_common.c    | 140 ++++++++++++---------
 2 files changed, 85 insertions(+), 57 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index 8f20fdef0754..e7e9a9848746 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -201,6 +201,7 @@ struct nfp_net_tx_buf {
  * @txds:       Virtual address of TX ring in host memory
  * @dma:        DMA address of the TX ring
  * @size:       Size, in bytes, of the TX ring (needed to free)
+ * @is_xdp:	Is this a XDP TX ring?
  */
 struct nfp_net_tx_ring {
 	struct nfp_net_r_vector *r_vec;
@@ -221,6 +222,7 @@ struct nfp_net_tx_ring {
 
 	dma_addr_t dma;
 	unsigned int size;
+	bool is_xdp;
 } ____cacheline_aligned;
 
 /* RX and freelist descriptor format */
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index fc0652d175fc..4fbda0eb4776 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -478,15 +478,18 @@ static irqreturn_t nfp_net_irq_exn(int irq, void *data)
  * @tx_ring:  TX ring structure
  * @r_vec:    IRQ vector servicing this ring
  * @idx:      Ring index
+ * @is_xdp:   Is this an XDP TX ring?
  */
 static void
 nfp_net_tx_ring_init(struct nfp_net_tx_ring *tx_ring,
-		     struct nfp_net_r_vector *r_vec, unsigned int idx)
+		     struct nfp_net_r_vector *r_vec, unsigned int idx,
+		     bool is_xdp)
 {
 	struct nfp_net *nn = r_vec->nfp_net;
 
 	tx_ring->idx = idx;
 	tx_ring->r_vec = r_vec;
+	tx_ring->is_xdp = is_xdp;
 
 	tx_ring->qcidx = tx_ring->idx * nn->stride_tx;
 	tx_ring->qcp_q = nn->tx_bar + NFP_QCP_QUEUE_OFF(tx_ring->qcidx);
@@ -994,7 +997,6 @@ static void nfp_net_tx_complete(struct nfp_net_tx_ring *tx_ring)
 static void nfp_net_xdp_complete(struct nfp_net_tx_ring *tx_ring)
 {
 	struct nfp_net_r_vector *r_vec = tx_ring->r_vec;
-	struct nfp_net_dp *dp = &r_vec->nfp_net->dp;
 	u32 done_pkts = 0, done_bytes = 0;
 	int idx, todo;
 	u32 qcp_rd_p;
@@ -1010,22 +1012,12 @@ static void nfp_net_xdp_complete(struct nfp_net_tx_ring *tx_ring)
 	else
 		todo = qcp_rd_p + tx_ring->cnt - tx_ring->qcp_rd_p;
 
+	done_pkts = todo;
 	while (todo--) {
 		idx = tx_ring->rd_p & (tx_ring->cnt - 1);
 		tx_ring->rd_p++;
 
-		if (!tx_ring->txbufs[idx].frag)
-			continue;
-
-		nfp_net_dma_unmap_rx(dp, tx_ring->txbufs[idx].dma_addr);
-		__free_page(virt_to_page(tx_ring->txbufs[idx].frag));
-
-		done_pkts++;
 		done_bytes += tx_ring->txbufs[idx].real_len;
-
-		tx_ring->txbufs[idx].dma_addr = 0;
-		tx_ring->txbufs[idx].frag = NULL;
-		tx_ring->txbufs[idx].fidx = -2;
 	}
 
 	tx_ring->qcp_rd_p = qcp_rd_p;
@@ -1050,42 +1042,35 @@ static void nfp_net_xdp_complete(struct nfp_net_tx_ring *tx_ring)
 static void
 nfp_net_tx_ring_reset(struct nfp_net_dp *dp, struct nfp_net_tx_ring *tx_ring)
 {
-	struct nfp_net_r_vector *r_vec = tx_ring->r_vec;
 	const struct skb_frag_struct *frag;
 	struct netdev_queue *nd_q;
 
-	while (tx_ring->rd_p != tx_ring->wr_p) {
+	while (!tx_ring->is_xdp && tx_ring->rd_p != tx_ring->wr_p) {
 		struct nfp_net_tx_buf *tx_buf;
-		int idx;
+		struct sk_buff *skb;
+		int idx, nr_frags;
 
 		idx = tx_ring->rd_p & (tx_ring->cnt - 1);
 		tx_buf = &tx_ring->txbufs[idx];
 
-		if (tx_ring == r_vec->xdp_ring) {
-			nfp_net_dma_unmap_rx(dp, tx_buf->dma_addr);
-			__free_page(virt_to_page(tx_ring->txbufs[idx].frag));
-		} else {
-			struct sk_buff *skb = tx_ring->txbufs[idx].skb;
-			int nr_frags = skb_shinfo(skb)->nr_frags;
-
-			if (tx_buf->fidx == -1) {
-				/* unmap head */
-				dma_unmap_single(dp->dev, tx_buf->dma_addr,
-						 skb_headlen(skb),
-						 DMA_TO_DEVICE);
-			} else {
-				/* unmap fragment */
-				frag = &skb_shinfo(skb)->frags[tx_buf->fidx];
-				dma_unmap_page(dp->dev, tx_buf->dma_addr,
-					       skb_frag_size(frag),
-					       DMA_TO_DEVICE);
-			}
+		skb = tx_ring->txbufs[idx].skb;
+		nr_frags = skb_shinfo(skb)->nr_frags;
 
-			/* check for last gather fragment */
-			if (tx_buf->fidx == nr_frags - 1)
-				dev_kfree_skb_any(skb);
+		if (tx_buf->fidx == -1) {
+			/* unmap head */
+			dma_unmap_single(dp->dev, tx_buf->dma_addr,
+					 skb_headlen(skb), DMA_TO_DEVICE);
+		} else {
+			/* unmap fragment */
+			frag = &skb_shinfo(skb)->frags[tx_buf->fidx];
+			dma_unmap_page(dp->dev, tx_buf->dma_addr,
+				       skb_frag_size(frag), DMA_TO_DEVICE);
 		}
 
+		/* check for last gather fragment */
+		if (tx_buf->fidx == nr_frags - 1)
+			dev_kfree_skb_any(skb);
+
 		tx_buf->dma_addr = 0;
 		tx_buf->skb = NULL;
 		tx_buf->fidx = -2;
@@ -1100,7 +1085,7 @@ nfp_net_tx_ring_reset(struct nfp_net_dp *dp, struct nfp_net_tx_ring *tx_ring)
 	tx_ring->qcp_rd_p = 0;
 	tx_ring->wr_ptr_add = 0;
 
-	if (tx_ring == r_vec->xdp_ring)
+	if (tx_ring->is_xdp)
 		return;
 
 	nd_q = netdev_get_tx_queue(dp->netdev, tx_ring->idx);
@@ -1492,8 +1477,6 @@ nfp_net_tx_xdp_buf(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring,
 {
 	struct nfp_net_tx_buf *txbuf;
 	struct nfp_net_tx_desc *txd;
-	dma_addr_t new_dma_addr;
-	void *new_frag;
 	int wr_idx;
 
 	if (unlikely(nfp_net_tx_full(tx_ring, 1))) {
@@ -1501,17 +1484,13 @@ nfp_net_tx_xdp_buf(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring,
 		return false;
 	}
 
-	new_frag = nfp_net_napi_alloc_one(dp, &new_dma_addr);
-	if (unlikely(!new_frag)) {
-		nfp_net_rx_drop(dp, rx_ring->r_vec, rx_ring, rxbuf, NULL);
-		return false;
-	}
-	nfp_net_rx_give_one(dp, rx_ring, new_frag, new_dma_addr);
-
 	wr_idx = tx_ring->wr_p & (tx_ring->cnt - 1);
 
 	/* Stash the soft descriptor of the head then initialize it */
 	txbuf = &tx_ring->txbufs[wr_idx];
+
+	nfp_net_rx_give_one(dp, rx_ring, txbuf->frag, txbuf->dma_addr);
+
 	txbuf->frag = rxbuf->frag;
 	txbuf->dma_addr = rxbuf->dma_addr;
 	txbuf->fidx = -1;
@@ -1796,13 +1775,11 @@ static void nfp_net_tx_ring_free(struct nfp_net_tx_ring *tx_ring)
  * nfp_net_tx_ring_alloc() - Allocate resource for a TX ring
  * @dp:        NFP Net data path struct
  * @tx_ring:   TX Ring structure to allocate
- * @is_xdp:    True if ring will be used for XDP
  *
  * Return: 0 on success, negative errno otherwise.
  */
 static int
-nfp_net_tx_ring_alloc(struct nfp_net_dp *dp, struct nfp_net_tx_ring *tx_ring,
-		      bool is_xdp)
+nfp_net_tx_ring_alloc(struct nfp_net_dp *dp, struct nfp_net_tx_ring *tx_ring)
 {
 	struct nfp_net_r_vector *r_vec = tx_ring->r_vec;
 	int sz;
@@ -1820,7 +1797,7 @@ nfp_net_tx_ring_alloc(struct nfp_net_dp *dp, struct nfp_net_tx_ring *tx_ring,
 	if (!tx_ring->txbufs)
 		goto err_alloc;
 
-	if (!is_xdp)
+	if (!tx_ring->is_xdp)
 		netif_set_xps_queue(dp->netdev, &r_vec->affinity_mask,
 				    tx_ring->idx);
 
@@ -1831,6 +1808,45 @@ nfp_net_tx_ring_alloc(struct nfp_net_dp *dp, struct nfp_net_tx_ring *tx_ring,
 	return -ENOMEM;
 }
 
+static void
+nfp_net_tx_ring_bufs_free(struct nfp_net_dp *dp,
+			  struct nfp_net_tx_ring *tx_ring)
+{
+	unsigned int i;
+
+	if (!tx_ring->is_xdp)
+		return;
+
+	for (i = 0; i < tx_ring->cnt; i++) {
+		if (!tx_ring->txbufs[i].frag)
+			return;
+
+		nfp_net_dma_unmap_rx(dp, tx_ring->txbufs[i].dma_addr);
+		__free_page(virt_to_page(tx_ring->txbufs[i].frag));
+	}
+}
+
+static int
+nfp_net_tx_ring_bufs_alloc(struct nfp_net_dp *dp,
+			   struct nfp_net_tx_ring *tx_ring)
+{
+	struct nfp_net_tx_buf *txbufs = tx_ring->txbufs;
+	unsigned int i;
+
+	if (!tx_ring->is_xdp)
+		return 0;
+
+	for (i = 0; i < tx_ring->cnt; i++) {
+		txbufs[i].frag = nfp_net_rx_alloc_one(dp, &txbufs[i].dma_addr);
+		if (!txbufs[i].frag) {
+			nfp_net_tx_ring_bufs_free(dp, tx_ring);
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
 static int nfp_net_tx_rings_prepare(struct nfp_net *nn, struct nfp_net_dp *dp)
 {
 	unsigned int r;
@@ -1847,17 +1863,23 @@ static int nfp_net_tx_rings_prepare(struct nfp_net *nn, struct nfp_net_dp *dp)
 			bias = dp->num_stack_tx_rings;
 
 		nfp_net_tx_ring_init(&dp->tx_rings[r], &nn->r_vecs[r - bias],
-				     r);
+				     r, bias);
 
-		if (nfp_net_tx_ring_alloc(dp, &dp->tx_rings[r], bias))
+		if (nfp_net_tx_ring_alloc(dp, &dp->tx_rings[r]))
 			goto err_free_prev;
+
+		if (nfp_net_tx_ring_bufs_alloc(dp, &dp->tx_rings[r]))
+			goto err_free_ring;
 	}
 
 	return 0;
 
 err_free_prev:
-	while (r--)
+	while (r--) {
+		nfp_net_tx_ring_bufs_free(dp, &dp->tx_rings[r]);
+err_free_ring:
 		nfp_net_tx_ring_free(&dp->tx_rings[r]);
+	}
 	kfree(dp->tx_rings);
 	return -ENOMEM;
 }
@@ -1866,8 +1888,10 @@ static void nfp_net_tx_rings_free(struct nfp_net_dp *dp)
 {
 	unsigned int r;
 
-	for (r = 0; r < dp->num_tx_rings; r++)
+	for (r = 0; r < dp->num_tx_rings; r++) {
+		nfp_net_tx_ring_bufs_free(dp, &dp->tx_rings[r]);
 		nfp_net_tx_ring_free(&dp->tx_rings[r]);
+	}
 
 	kfree(dp->tx_rings);
 }
@@ -2365,8 +2389,10 @@ static void nfp_net_close_free_all(struct nfp_net *nn)
 		nfp_net_rx_ring_bufs_free(&nn->dp, &nn->dp.rx_rings[r]);
 		nfp_net_rx_ring_free(&nn->dp.rx_rings[r]);
 	}
-	for (r = 0; r < nn->dp.num_tx_rings; r++)
+	for (r = 0; r < nn->dp.num_tx_rings; r++) {
+		nfp_net_tx_ring_bufs_free(&nn->dp, &nn->dp.tx_rings[r]);
 		nfp_net_tx_ring_free(&nn->dp.tx_rings[r]);
+	}
 	for (r = 0; r < nn->dp.num_r_vecs; r++)
 		nfp_net_cleanup_vector(nn, &nn->r_vecs[r]);
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 5/6] nfp: don't completely refuse to work with old flashes
From: Jakub Kicinski @ 2017-04-28  4:06 UTC (permalink / raw)
  To: netdev; +Cc: oss-drivers, kubakici, Jakub Kicinski
In-Reply-To: <20170428040620.8256-1-jakub.kicinski@netronome.com>

Right now the required Service Process ABI version is still tied
to max ID of known commands.  For new NSP commands we are adding
we are checking if NSP version is recent enough on command-by-command
basis.  The driver doesn't have to force the device to have the
very latest flash, anything newer than 0.8 should do.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c
index 61797c98f5fe..2fa9247bb23d 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.c
@@ -78,7 +78,7 @@
 
 #define NSP_MAGIC		0xab10
 #define NSP_MAJOR		0
-#define NSP_MINOR		(__MAX_SPCODE - 1)
+#define NSP_MINOR		8
 
 #define NSP_CODE_MAJOR		GENMASK(15, 12)
 #define NSP_CODE_MINOR		GENMASK(11, 0)
@@ -94,8 +94,6 @@ enum nfp_nsp_cmd {
 	SPCODE_ETH_RESCAN	= 7, /* Rescan ETHs, write ETH_TABLE to buf */
 	SPCODE_ETH_CONTROL	= 8, /* Update media config from buffer */
 	SPCODE_NSP_IDENTIFY	= 13, /* Read NSP version */
-
-	__MAX_SPCODE,
 };
 
 static const struct {
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 4/6] nfp: avoid reading TX queue indexes from the device
From: Jakub Kicinski @ 2017-04-28  4:06 UTC (permalink / raw)
  To: netdev; +Cc: oss-drivers, kubakici, Jakub Kicinski
In-Reply-To: <20170428040620.8256-1-jakub.kicinski@netronome.com>

Reading TX queue indexes from the device memory on each interrupt
is expensive.  It's doubly expensive with XDP running since we have
two TX rings to check there.  If the software indexes indicate that
the TX queue is completely empty, however, we don't need to look at
the device completion index at all.

The queuing CPU is doing a wmb() before kicking the device TX so
we should be safe to assume on the CPU handling the completions will
never see old value of the software copy of the index.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 4fbda0eb4776..c763a5b3bd6b 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -927,6 +927,9 @@ static void nfp_net_tx_complete(struct nfp_net_tx_ring *tx_ring)
 	int fidx;
 	int idx;
 
+	if (tx_ring->wr_p == tx_ring->rd_p)
+		return;
+
 	/* Work out how many descriptors have been transmitted */
 	qcp_rd_p = nfp_qcp_rd_ptr_read(tx_ring->qcp_q);
 
@@ -1001,6 +1004,9 @@ static void nfp_net_xdp_complete(struct nfp_net_tx_ring *tx_ring)
 	int idx, todo;
 	u32 qcp_rd_p;
 
+	if (tx_ring->wr_p == tx_ring->rd_p)
+		return;
+
 	/* Work out how many descriptors have been transmitted */
 	qcp_rd_p = nfp_qcp_rd_ptr_read(tx_ring->qcp_q);
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 6/6] nfp: provide 256 bytes of XDP headroom in all configurations
From: Jakub Kicinski @ 2017-04-28  4:06 UTC (permalink / raw)
  To: netdev; +Cc: oss-drivers, kubakici, Jakub Kicinski
In-Reply-To: <20170428040620.8256-1-jakub.kicinski@netronome.com>

For legacy reasons NFP FW may be compiled to DMA packets to a constant
offset into the buffer and use the space before it for metadata.  This
ensures that packets data always start at a certain offset regardless of
the amount of preceding metadata.

If rx offset is set to 0 there may still be up to 64 bytes of metadata
but metadata will start at the beginning of the buffer, instead of:

    data_start_offset = rx_offset - meta_len

Even though we make the buffers larger to accommodate up to 64 bytes of
metadata, if there is only N bytes of metadata, we will end up with
N bytes of headroom and 64 - N bytes of tailroom.  Therefore we can't
rely on that space for XDP headroom.  Make sure we always allocate
full 256 bytes.  This, unfortunately, means we can't fit the headroom
on an u8 any more.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/nfp_net.h        |  4 ++--
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 13 +------------
 2 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index e7e9a9848746..38b41fdeaa8f 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -470,10 +470,10 @@ struct nfp_net_dp {
 	u8 chained_metadata_format:1;
 
 	u8 rx_dma_dir;
-	u8 rx_dma_off;
-
 	u8 rx_offset;
 
+	u32 rx_dma_off;
+
 	u32 ctrl;
 	u32 fl_bufsz;
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index c763a5b3bd6b..b9f3548bb65f 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -2966,11 +2966,7 @@ static int nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog)
 	dp->xdp_prog = prog;
 	dp->num_tx_rings += prog ? nn->dp.num_rx_rings : -nn->dp.num_rx_rings;
 	dp->rx_dma_dir = prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
-	if (prog)
-		dp->rx_dma_off = XDP_PACKET_HEADROOM -
-			(nn->dp.rx_offset ?: NFP_NET_MAX_PREPEND);
-	else
-		dp->rx_dma_off = 0;
+	dp->rx_dma_off = prog ? XDP_PACKET_HEADROOM - nn->dp.rx_offset : 0;
 
 	/* We need RX reconfig to remap the buffers (BIDIR vs FROM_DEV) */
 	err = nfp_net_ring_reconfig(nn, dp);
@@ -3198,13 +3194,6 @@ int nfp_net_netdev_init(struct net_device *netdev)
 	struct nfp_net *nn = netdev_priv(netdev);
 	int err;
 
-	/* XDP calls for 256 byte packet headroom which wouldn't fit in a u8.
-	 * We, however, reuse the metadata prepend space for XDP buffers which
-	 * is at least 1 byte long and as long as XDP headroom doesn't increase
-	 * above 256 the *extra* XDP headroom will fit on 8 bits.
-	 */
-	BUILD_BUG_ON(XDP_PACKET_HEADROOM > 256);
-
 	nn->dp.chained_metadata_format = nn->fw_ver.major > 3;
 
 	nn->dp.rx_dma_dir = DMA_FROM_DEVICE;
-- 
2.11.0

^ permalink raw reply related

* Re: [REGRESSION next-20170426] Commit 09515ef5ddad ("of/acpi: Configure dma operations at probe time for platform/amba/pci bus devices") causes oops in mvneta
From: Sricharan R @ 2017-04-28  4:29 UTC (permalink / raw)
  To: Ralph Sennhauser
  Cc: Rafael J. Wysocki, Joerg Roedel, Bjorn Helgaas, linux-acpi,
	linux-kernel, linux-pci, Thomas Petazzoni, netdev
In-Reply-To: <2fb57b9b-3944-d9cc-1fac-8dcccaa0c37a@codeaurora.org>

[-- Attachment #1: Type: text/plain, Size: 944 bytes --]

Hi Ralph,

On 4/27/2017 7:05 PM, Sricharan R wrote:
> Hi,
> 
> On 4/26/2017 9:45 PM, Ralph Sennhauser wrote:
>> Hi Sricharan R,
>>
>> Commit 09515ef5ddad ("of/acpi: Configure dma operations at probe time
>> for platform/amba/pci bus devices") causes a kernel panic as in the log
>> below on an armada-385. Reverting the commit fixes the issue.
>>
>> Regards
>> Ralph
> 
> Somehow not getting a obvious clue on whats going wrong with the logs
> below. From the log and looking in to dts, the drivers seems to the one for
> "marvell,armada-370-neta". Issue looks the data from the dma has gone bad
> and subsequently referring the wrong data has resulted in the crash.
> Looks like the dma_masks is the one going wrong.
> Can i get some logs from mvneta_probe, about dev->dma_mask,
> dev->coherent_dma_mask and dev->dma_ops with and without the patch
> to see whats the difference ?
> 

Attached the patch with debug prints.

Regards,
 Sricharan

[-- Attachment #2: 0001-Debug-prints.patch --]
[-- Type: text/plain, Size: 1113 bytes --]

From fe77d3968d6bc35b46e5f30d5c67c7603aa10597 Mon Sep 17 00:00:00 2001
From: Sricharan R <sricharan@codeaurora.org>
Date: Fri, 28 Apr 2017 09:43:26 +0530
Subject: [PATCH] Debug prints

Signed-off-by: Sricharan R <sricharan@codeaurora.org>
---
 drivers/net/ethernet/marvell/mvneta.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index d297011..250f0b2 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -4146,6 +4146,12 @@ static int mvneta_probe(struct platform_device *pdev)
 	int phy_mode;
 	int err;
 	int cpu;
+	struct device *ddev = &pdev->dev;
+
+
+	dev_err(ddev, "dev->dma_mask 0x%llx\n", *(ddev->dma_mask));
+	dev_err(ddev, "dev->coherent_dma_mask 0x%llx\n", ddev->coherent_dma_mask);
+	dev_err(ddev, "dev->dma_ops 0x%llx\n", ddev->dma_ops);
 
 	dev = alloc_etherdev_mqs(sizeof(struct mvneta_port), txq_number, rxq_number);
 	if (!dev)
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation


^ permalink raw reply related

* Re: EINVAL when using connect() for udp sockets
From: Eric Dumazet @ 2017-04-28  4:48 UTC (permalink / raw)
  To: Daurnimator
  Cc: Cong Wang, Linux Kernel Network Developers, William Ahern,
	Santi T., Michael Kerrisk-manpages
In-Reply-To: <CAEnbY+cu_b1NddDcKMumK2npmxh4yh4kEoeeY1db2Ct3JntG9g@mail.gmail.com>

On Fri, 2017-04-28 at 12:55 +1000, Daurnimator wrote:
> On 1 April 2017 at 03:52, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> > Please submit your patch formally and with a man page patch too.
> 
> Did a patch get submitted? I had a look but couldn't see it.

Not yet.

I will probably wait for next cycle (linux-4.13), I have been busy on
other work lately.

Thanks.

^ permalink raw reply

* Re: xdp_redirect ifindex vs port. Was: best API for returning/setting egress port?
From: John Fastabend @ 2017-04-28  5:06 UTC (permalink / raw)
  To: Alexei Starovoitov, Jesper Dangaard Brouer, Andy Gospodarek
  Cc: Alexei Starovoitov, Daniel Borkmann, Daniel Borkmann,
	netdev@vger.kernel.org, xdp-newbies@vger.kernel.org
In-Reply-To: <53e9dd2f-f40a-b43b-99c9-62f5ce3a665c@fb.com>

On 17-04-27 04:31 PM, Alexei Starovoitov wrote:
> On 4/27/17 1:41 AM, Jesper Dangaard Brouer wrote:
>> When registering/attaching a XDP/bpf program, we would just send the
>> file-descriptor for this port-map along (like we do with the bpf_prog
>> FD). Plus, it own ingress-port number this program is in the port-map.
>>
>> It is not clear to me, in-which-data-structure on the kernel-side we
>> store this reference to the port-map and ingress-port. As today we only
>> have the "raw" struct bpf_prog pointer. I see several options:
>>
>> 1. Create a new xdp_prog struct that contains existing bpf_prog,
>> a port-map pointer and ingress-port. (IMHO easiest solution)
>>
>> 2. Just create a new pointer to port-map and store it in driver rx-ring
>> struct (like existing bpf_prog), but this create a race-challenge
>> replacing (cmpxchg) the program (or perhaps it's not a problem as it
>> runs under rcu and RTNL-lock).
>>
>> 3. Extend bpf_prog to store this port-map and ingress-port, and have a
>> fast-way to access it.  I assume it will be accessible via
>> bpf_prog->bpf_prog_aux->used_maps[X] but it will be too slow for XDP.
> 
> I'm not sure I completely follow the 3 proposals.
> Are you suggesting to have only one netdev_array per program?
> Why not to allow any number like we do for tailcall+prog_array, etc?
> We can teach verifier to allow new helper
> bpf_tx_port(netdev_array, port_num);
> to only be used with netdev_array map type.
> It will fetch netdevice pointer from netdev_array[port_num]
> and will tx the packet into it.
> We can make it similar to bpf_tail_call(), so that program will
> finish on successful bpf_tx_port() or
> make it into 'delayed' tx which will be executed when program finishes.
> Not sure which approach is better.

My reaction would be to make it finish on success but would like to write
a few programs first and see. I can't think of any use _not_ to terminate
but maybe there is something I'm missing.

> 
> We can also extend this netdev_array into broadcast/multicast. Like
> bpf_tx_allports(&netdev_array);
> call from the program will xmit the packet to all netdevices
> in that 'netdev_array' map type.

Yep nice solution to the multicast problem.

> 
> The map-in-map support can be trivially extended to allow netdev_array,
> then the program can create N multicast groups of netdevices.
> Each multicast group == one netdev_array map.
> The user space will populate a hashmap with these netdev_arrays and
> bpf kernel side can select dynamically which multicast group to use
> to send the packets to.
> bpf kernel side may look like:
> struct bpf_netdev_array *netdev_array = bpf_map_lookup_elem(&hash, key);
> if (!netdev_array)
>   ...
> if (my_condition)
>    bpf_tx_allports(netdev_array);  /* broadcast to all netdevices */
> else
>    bpf_tx_port(netdev_array, port_num); /* tx into one netdevice */
> 
> that's an artificial example. Just trying to point out
> that we shouldn't restrict the feature too soon.
> 

That is more or less what I was thinking as well. The other question
I have though is should we have a bpf_redirect() call for the simple
case where I use the ifindex directly. This will be helpful for taking
existing programs from tc_cls into xdp. I think it makes sense to have
both bpf_tx_allports(), bpf_tx_port(), and bpf_redirect().

.John

^ permalink raw reply

* Re: xdp_redirect ifindex vs port. Was: best API for returning/setting egress port?
From: Alexei Starovoitov @ 2017-04-28  5:30 UTC (permalink / raw)
  To: John Fastabend, Jesper Dangaard Brouer, Andy Gospodarek
  Cc: Alexei Starovoitov, Daniel Borkmann, Daniel Borkmann,
	netdev@vger.kernel.org, xdp-newbies@vger.kernel.org
In-Reply-To: <5902CDC5.5010209@gmail.com>

On 4/27/17 10:06 PM, John Fastabend wrote:
> That is more or less what I was thinking as well. The other question
> I have though is should we have a bpf_redirect() call for the simple
> case where I use the ifindex directly. This will be helpful for taking
> existing programs from tc_cls into xdp. I think it makes sense to have
> both bpf_tx_allports(), bpf_tx_port(), and bpf_redirect().

I think so too.
Once netdevice is stored into netdev_array map the netdevice is pinned
and we need to figure out what to do if somebody tries to delete it.
Should we add a new netlink notifier that this netdev's refcnt is
almost zero and it's only in netdev_array(s) ?
or should it be deleted from the array(s) automatically and
then user space will be notified post-deletion?
Both approaches have their pros and cons.

Whereas raw ifindex approach (via bpf_redirect) doesn't have these
caveats. It's clear to both bpf prog and user space that ifindex
can be stale and user space needs to monitor netdevs and update
programs/maps.

^ permalink raw reply

* Re: [REGRESSION next-20170426] Commit 09515ef5ddad ("of/acpi: Configure dma operations at probe time for platform/amba/pci bus devices") causes oops in mvneta
From: Sricharan R @ 2017-04-28  5:43 UTC (permalink / raw)
  To: Ralph Sennhauser
  Cc: Rafael J. Wysocki, Joerg Roedel, Bjorn Helgaas, linux-acpi,
	linux-kernel, linux-pci, Thomas Petazzoni, netdev
In-Reply-To: <20170427164014.422a124c@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 1360 bytes --]

Hi Ralph,

On 4/27/2017 8:10 PM, Ralph Sennhauser wrote:
> On Thu, 27 Apr 2017 19:05:09 +0530
> Sricharan R <sricharan@codeaurora.org> wrote:
> 
>> Hi,
>>
>> On 4/26/2017 9:45 PM, Ralph Sennhauser wrote:
>>> Hi Sricharan R,
>>>
>>> Commit 09515ef5ddad ("of/acpi: Configure dma operations at probe
>>> time for platform/amba/pci bus devices") causes a kernel panic as
>>> in the log below on an armada-385. Reverting the commit fixes the
>>> issue.
>>>
>>> Regards
>>> Ralph  
>>
>> Somehow not getting a obvious clue on whats going wrong with the logs
>> below. From the log and looking in to dts, the drivers seems to the
>> one for "marvell,armada-370-neta".
> 
> Correct.
> 
>> Issue looks the data from the dma
>> has gone bad and subsequently referring the wrong data has resulted
>> in the crash. Looks like the dma_masks is the one going wrong.
>> Can i get some logs from mvneta_probe, about dev->dma_mask,
>> dev->coherent_dma_mask and dev->dma_ops with and without the patch
>> to see whats the difference ?
> 
> Not sure I understood what exactly you are after. Might be faster to
> just send me a patch with all debug print statements you like to see.
> 

Attached the patch with debug prints.

Regards,
 Sricharan

-- 
"QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation

[-- Attachment #2: 0001-Debug-prints.patch --]
[-- Type: text/plain, Size: 1113 bytes --]

From fe77d3968d6bc35b46e5f30d5c67c7603aa10597 Mon Sep 17 00:00:00 2001
From: Sricharan R <sricharan@codeaurora.org>
Date: Fri, 28 Apr 2017 09:43:26 +0530
Subject: [PATCH] Debug prints

Signed-off-by: Sricharan R <sricharan@codeaurora.org>
---
 drivers/net/ethernet/marvell/mvneta.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index d297011..250f0b2 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -4146,6 +4146,12 @@ static int mvneta_probe(struct platform_device *pdev)
 	int phy_mode;
 	int err;
 	int cpu;
+	struct device *ddev = &pdev->dev;
+
+
+	dev_err(ddev, "dev->dma_mask 0x%llx\n", *(ddev->dma_mask));
+	dev_err(ddev, "dev->coherent_dma_mask 0x%llx\n", ddev->coherent_dma_mask);
+	dev_err(ddev, "dev->dma_ops 0x%llx\n", ddev->dma_ops);
 
 	dev = alloc_etherdev_mqs(sizeof(struct mvneta_port), txq_number, rxq_number);
 	if (!dev)
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation


^ permalink raw reply related

* Re: Strange samples/bpf loading error for maps on net-next?
From: Alexei Starovoitov @ 2017-04-28  5:49 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Daniel Borkmann, Martin KaFai Lau, netdev@vger.kernel.org, eric
In-Reply-To: <20170427131542.3c8dfa24@redhat.com>

On Thu, Apr 27, 2017 at 01:15:42PM +0200, Jesper Dangaard Brouer wrote:
> 
> To provoke this bug, remember that you MUST call:
> 
>  make headers_install
> 
> In the kernels root directory, else you will be compiling samples/bpf/
> against the older headers previously installed.
> 
> The error looks like:
> 
>  $ sudo ./sockex1
>  bpf_load_program() err=22
>  fd 0 is not pointing to valid bpf_map
>  sockex1: [...]/samples/bpf/sockex1_user.c:26: main: Assertion `setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd, sizeof(prog_fd[0])) == 0' failed.
>  Aborted
> 
> I've found that the bug were introduced in
>  commit: fb30d4b71214 ("bpf: Add tests for map-in-map")

Great debugging!
Indeed that change made samples/bpf/bpf_load.c to be incompatible with .o
generated earlier. We should really get rid of that loader and
switch to tools/lib/bpf/. I believe Eric Leblond already made it
resilient to 'struct bpf_map_def' changes.

^ permalink raw reply

* [PATCH net-next] rhashtable: Do not lower max_elems when max_size is zero
From: Herbert Xu @ 2017-04-28  6:10 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: netdev, davem, fw, tgraf
In-Reply-To: <20170427223024.32657-1-f.fainelli@gmail.com>

On Thu, Apr 27, 2017 at 03:30:24PM -0700, Florian Fainelli wrote:
> After commit 6d684e54690c ("rhashtable: Cap total number of
> entries to 2^31"), we would be hitting a panic() in net/core/rtnetlink.c
> during initialization. The call stack would look like this:

Thanks for the patch.  I think we could just fold it into the
previous if clause, like this:

---8<---
The commit 6d684e54690c ("rhashtable: Cap total number of entries
to 2^31") breaks rhashtable users that do not set max_size.  This
is because when max_size is zero max_elems is also incorrectly set
to zero instead of 2^31.

This patch fixes it by only lowering max_elems when max_size is not
zero.

Fixes: 6d684e54690c ("rhashtable: Cap total number of entries to 2^31")
Reported-by: Florian Fainelli <f.fainelli@gmail.com>
Reported-by: kernel test robot <fengguang.wu@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 751630b..3895486 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -958,13 +958,14 @@ int rhashtable_init(struct rhashtable *ht,
 	if (params->min_size)
 		ht->p.min_size = roundup_pow_of_two(params->min_size);
 
-	if (params->max_size)
-		ht->p.max_size = rounddown_pow_of_two(params->max_size);
-
 	/* Cap total entries at 2^31 to avoid nelems overflow. */
 	ht->max_elems = 1u << 31;
-	if (ht->p.max_size < ht->max_elems / 2)
-		ht->max_elems = ht->p.max_size * 2;
+
+	if (params->max_size) {
+		ht->p.max_size = rounddown_pow_of_two(params->max_size);
+		if (ht->p.max_size < ht->max_elems / 2)
+			ht->max_elems = ht->p.max_size * 2;
+	}
 
 	ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE);
 
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related

* Re: [REGRESSION next-20170426] Commit 09515ef5ddad ("of/acpi: Configure dma operations at probe time for platform/amba/pci bus devices") causes oops in mvneta
From: Ralph Sennhauser @ 2017-04-28  6:19 UTC (permalink / raw)
  To: Sricharan R
  Cc: Rafael J. Wysocki, Joerg Roedel, Bjorn Helgaas, linux-acpi,
	linux-kernel, linux-pci, Thomas Petazzoni, netdev
In-Reply-To: <524b7fc8-1eca-a7d8-7bc7-6743be17c208@codeaurora.org>

On Fri, 28 Apr 2017 11:13:33 +0530
Sricharan R <sricharan@codeaurora.org> wrote:

> Hi Ralph,
> 
> On 4/27/2017 8:10 PM, Ralph Sennhauser wrote:
> > On Thu, 27 Apr 2017 19:05:09 +0530
> > Sricharan R <sricharan@codeaurora.org> wrote:
> >   
> >> Hi,
> >>
> >> On 4/26/2017 9:45 PM, Ralph Sennhauser wrote:  
> >>> Hi Sricharan R,
> >>>
> >>> Commit 09515ef5ddad ("of/acpi: Configure dma operations at probe
> >>> time for platform/amba/pci bus devices") causes a kernel panic as
> >>> in the log below on an armada-385. Reverting the commit fixes the
> >>> issue.
> >>>
> >>> Regards
> >>> Ralph    
> >>
> >> Somehow not getting a obvious clue on whats going wrong with the
> >> logs below. From the log and looking in to dts, the drivers seems
> >> to the one for "marvell,armada-370-neta".  
> > 
> > Correct.
> >   
> >> Issue looks the data from the dma
> >> has gone bad and subsequently referring the wrong data has resulted
> >> in the crash. Looks like the dma_masks is the one going wrong.
> >> Can i get some logs from mvneta_probe, about dev->dma_mask,
> >> dev->coherent_dma_mask and dev->dma_ops with and without the patch
> >> to see whats the difference ?  
> > 
> > Not sure I understood what exactly you are after. Might be faster to
> > just send me a patch with all debug print statements you like to
> > see. 
> 
> Attached the patch with debug prints.
> 
> Regards,
>  Sricharan
> 

Hi Sricharan

With commit 09515ef5ddad

[    1.288962] mvneta f1070000.ethernet: dev->dma_mask 0xffffffff
[    1.294827] mvneta f1070000.ethernet: dev->coherent_dma_mask 0xffffffff
[    1.301472] mvneta f1070000.ethernet: dev->dma_ops 0x40b00c0601460

[    1.322047] mvneta f1034000.ethernet: dev->dma_mask 0xffffffff
[    1.327904] mvneta f1034000.ethernet: dev->coherent_dma_mask 0xffffffff
[    1.334549] mvneta f1034000.ethernet: dev->dma_ops 0x40b00c0601460


With the patch reverted, the build that works

[    1.289001] mvneta f1070000.ethernet: dev->dma_mask 0xffffffff
[    1.294866] mvneta f1070000.ethernet: dev->coherent_dma_mask 0xffffffff
[    1.301511] mvneta f1070000.ethernet: dev->dma_ops 0x40b00c06014a8

[    1.317005] mvneta f1034000.ethernet: dev->dma_mask 0xffffffff
[    1.322867] mvneta f1034000.ethernet: dev->coherent_dma_mask 0xffffffff
[    1.329508] mvneta f1034000.ethernet: dev->dma_ops 0x40b00c06014a8


Regards
Ralph

^ permalink raw reply

* Re: Strange samples/bpf loading error for maps on net-next?
From: Jesper Dangaard Brouer @ 2017-04-28  6:28 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Daniel Borkmann, Martin KaFai Lau, netdev@vger.kernel.org, eric,
	brouer
In-Reply-To: <20170428054950.teh53ssg6ei4xkvx@ast-mbp>

On Thu, 27 Apr 2017 22:49:51 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Thu, Apr 27, 2017 at 01:15:42PM +0200, Jesper Dangaard Brouer wrote:
> > 
> > To provoke this bug, remember that you MUST call:
> > 
> >  make headers_install
> > 
> > In the kernels root directory, else you will be compiling samples/bpf/
> > against the older headers previously installed.
> > 
> > The error looks like:
> > 
> >  $ sudo ./sockex1
> >  bpf_load_program() err=22
> >  fd 0 is not pointing to valid bpf_map
> >  sockex1: [...]/samples/bpf/sockex1_user.c:26: main: Assertion `setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd, sizeof(prog_fd[0])) == 0' failed.
> >  Aborted
> > 
> > I've found that the bug were introduced in
> >  commit: fb30d4b71214 ("bpf: Add tests for map-in-map")  
> 
> Great debugging!
> Indeed that change made samples/bpf/bpf_load.c to be incompatible with .o
> generated earlier. We should really get rid of that loader and
> switch to tools/lib/bpf/. I believe Eric Leblond already made it
> resilient to 'struct bpf_map_def' changes.

Yes, exactly it is problem in samples/bpf/bpf_load.c.  As it assumes
the contents of the ELF file maps section will always chunks in
sizeof(struct bpf_map_def) and just uses that directly as a pointer to
an array of type struct bpf_map_def, which of-cause silently blows up
when changing struct bpf_map_def.  That cost me many hours to discover
that yesterday.

I started implementing more correct parsing of the ELF maps section, it
is doable, but as you say, maybe we should just get rid of this loader?
I will at least fixup bpf_load.c and perhaps just abort the program the
program if I detect a difference between the ELF size and struct size.
And send this as a patch later today...

I've also looked at the loaded Daniel implemented[1] in iproute2, and
it is much cleaner.

[1] https://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git/tree/lib/bpf.c
-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* [PATCH net-next v5 0/2] net: hns: bug fix for HNS driver
From: Yankejian @ 2017-04-28  6:49 UTC (permalink / raw)
  To: davem, salil.mehta, yisen.zhuang, matthias.bgg, yankejian,
	lipeng321, zhouhuiru, huangdaode
  Cc: netdev, linuxarm

From: lipeng <lipeng321@huawei.com>

This patchset add support defered dsaf probe when mdio and
mbigen module is not insmod.

For more details, please refer to individual patch.

change log:
V4 - > V5:
1. Float on net-next;
2. Delete patch "net: hns: fixed bug that skb used after kfree"
   from this patchset;

V3 -> V4:
1. Delete redundant commit message;
2. Add Reviewed-by: Matthias Brugger <mbrugger@suse.com>;

V2 -> V3:
1. Check return value when  platform_get_irq in hns_rcb_get_cfg;

V1 -> V2:
1. Return appropriate errno in hns_mac_register_phy;

lipeng (2):
  net: hns: support deferred probe when can not obtain irq
  net: hns: support deferred probe when no mdio

 drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c | 39 +++++++++++++++++++----
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c |  4 ++-
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c |  8 ++++-
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.h |  2 +-
 4 files changed, 44 insertions(+), 9 deletions(-)

-- 
1.9.1

^ permalink raw reply

* [PATCH net-next v5 2/2] net: hns: support deferred probe when no mdio
From: Yankejian @ 2017-04-28  6:49 UTC (permalink / raw)
  To: davem, salil.mehta, yisen.zhuang, matthias.bgg, yankejian,
	lipeng321, zhouhuiru, huangdaode
  Cc: netdev, linuxarm
In-Reply-To: <1493362187-51671-1-git-send-email-yankejian@huawei.com>

From: lipeng <lipeng321@huawei.com>

In the hip06 and hip07 SoCs, phy connect to mdio bus.The mdio
module is probed with module_init, and, as such,
is not guaranteed to probe before the HNS driver. So we need
to support deferred probe.

We check for probe deferral in the mac init, so we not init DSAF
when there is no mdio, and free all resource, to later learn that
we need to defer the probe.

Signed-off-by: lipeng <lipeng321@huawei.com>
Reviewed-by: Yisen Zhuang <yisen.zhuang@huawei.com>
Reviewed-by: Matthias Brugger <mbrugger@suse.com>
---
change log:
V4 -> V5:
1. Float on net-next;

V1 -> V2:
1. Return appropriate errno in hns_mac_register_phy;
---
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c | 39 +++++++++++++++++++----
 1 file changed, 33 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c
index 0c1f56e..8b5cdf4 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c
@@ -696,6 +696,8 @@ static int hns_mac_init_ex(struct hns_mac_cb *mac_cb)
 	rc = phy_device_register(phy);
 	if (rc) {
 		phy_device_free(phy);
+		dev_err(&mdio->dev, "registered phy fail at address %i\n",
+			addr);
 		return -ENODEV;
 	}
 
@@ -706,7 +708,7 @@ static int hns_mac_init_ex(struct hns_mac_cb *mac_cb)
 	return 0;
 }
 
-static void hns_mac_register_phy(struct hns_mac_cb *mac_cb)
+static int hns_mac_register_phy(struct hns_mac_cb *mac_cb)
 {
 	struct acpi_reference_args args;
 	struct platform_device *pdev;
@@ -716,24 +718,39 @@ static void hns_mac_register_phy(struct hns_mac_cb *mac_cb)
 
 	/* Loop over the child nodes and register a phy_device for each one */
 	if (!to_acpi_device_node(mac_cb->fw_port))
-		return;
+		return -ENODEV;
 
 	rc = acpi_node_get_property_reference(
 			mac_cb->fw_port, "mdio-node", 0, &args);
 	if (rc)
-		return;
+		return rc;
 
 	addr = hns_mac_phy_parse_addr(mac_cb->dev, mac_cb->fw_port);
 	if (addr < 0)
-		return;
+		return addr;
 
 	/* dev address in adev */
 	pdev = hns_dsaf_find_platform_device(acpi_fwnode_handle(args.adev));
+	if (!pdev) {
+		dev_err(mac_cb->dev, "mac%d mdio pdev is NULL\n",
+			mac_cb->mac_id);
+		return  -EINVAL;
+	}
+
 	mii_bus = platform_get_drvdata(pdev);
+	if (!mii_bus) {
+		dev_err(mac_cb->dev,
+			"mac%d mdio is NULL, dsaf will probe again later\n",
+			mac_cb->mac_id);
+		return -EPROBE_DEFER;
+	}
+
 	rc = hns_mac_register_phydev(mii_bus, mac_cb, addr);
 	if (!rc)
 		dev_dbg(mac_cb->dev, "mac%d register phy addr:%d\n",
 			mac_cb->mac_id, addr);
+
+	return rc;
 }
 
 #define MAC_MEDIA_TYPE_MAX_LEN		16
@@ -754,7 +771,7 @@ static void hns_mac_register_phy(struct hns_mac_cb *mac_cb)
  *@np:device node
  * return: 0 --success, negative --fail
  */
-static int  hns_mac_get_info(struct hns_mac_cb *mac_cb)
+static int hns_mac_get_info(struct hns_mac_cb *mac_cb)
 {
 	struct device_node *np;
 	struct regmap *syscon;
@@ -864,7 +881,15 @@ static int  hns_mac_get_info(struct hns_mac_cb *mac_cb)
 			}
 		}
 	} else if (is_acpi_node(mac_cb->fw_port)) {
-		hns_mac_register_phy(mac_cb);
+		ret = hns_mac_register_phy(mac_cb);
+		/*
+		 * Mac can work well if there is phy or not.If the port don't
+		 * connect with phy, the return value will be ignored. Only
+		 * when there is phy but can't find mdio bus, the return value
+		 * will be handled.
+		 */
+		if (ret == -EPROBE_DEFER)
+			return ret;
 	} else {
 		dev_err(mac_cb->dev, "mac%d cannot find phy node\n",
 			mac_cb->mac_id);
@@ -1026,6 +1051,7 @@ int hns_mac_init(struct dsaf_device *dsaf_dev)
 			dsaf_dev->mac_cb[port_id] = mac_cb;
 		}
 	}
+
 	/* init mac_cb for all port */
 	for (port_id = 0; port_id < max_port_num; port_id++) {
 		mac_cb = dsaf_dev->mac_cb[port_id];
@@ -1035,6 +1061,7 @@ int hns_mac_init(struct dsaf_device *dsaf_dev)
 		ret = hns_mac_get_cfg(dsaf_dev, mac_cb);
 		if (ret)
 			return ret;
+
 		ret = hns_mac_init_ex(mac_cb);
 		if (ret)
 			return ret;
-- 
1.9.1

^ permalink raw reply related

* [PATCH net-next v5 1/2] net: hns: support deferred probe when can not obtain irq
From: Yankejian @ 2017-04-28  6:49 UTC (permalink / raw)
  To: davem, salil.mehta, yisen.zhuang, matthias.bgg, yankejian,
	lipeng321, zhouhuiru, huangdaode
  Cc: netdev, linuxarm
In-Reply-To: <1493362187-51671-1-git-send-email-yankejian@huawei.com>

From: lipeng <lipeng321@huawei.com>

In the hip06 and hip07 SoCs, the interrupt lines from the
DSAF controllers are connected to mbigen hw module.
The mbigen module is probed with module_init, and, as such,
is not guaranteed to probe before the HNS driver. So we need
to support deferred probe.

Signed-off-by: lipeng <lipeng321@huawei.com>
Reviewed-by: Yisen Zhuang <yisen.zhuang@huawei.com>
Reviewed-by: Matthias Brugger <mbrugger@suse.com>
---
change log:
V4 -> V5:
1. Float on net-next;

V3 -> V4:
1. Delete redundant commit message;
2. add Reviewed-by: Matthias Brugger <mbrugger@suse.com>;

V2 -> V3:
1. Check return value when  platform_get_irq in hns_rcb_get_cfg;
---
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c | 4 +++-
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c | 8 +++++++-
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.h | 2 +-
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c
index eba406b..93e71e2 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_ppe.c
@@ -510,7 +510,9 @@ int hns_ppe_init(struct dsaf_device *dsaf_dev)
 
 		hns_ppe_get_cfg(dsaf_dev->ppe_common[i]);
 
-		hns_rcb_get_cfg(dsaf_dev->rcb_common[i]);
+		ret = hns_rcb_get_cfg(dsaf_dev->rcb_common[i]);
+		if (ret)
+			goto get_cfg_fail;
 	}
 
 	for (i = 0; i < HNS_PPE_COM_NUM; i++)
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c
index c20a0f4..e2e2853 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.c
@@ -492,7 +492,7 @@ static int hns_rcb_get_base_irq_idx(struct rcb_common_cb *rcb_common)
  *hns_rcb_get_cfg - get rcb config
  *@rcb_common: rcb common device
  */
-void hns_rcb_get_cfg(struct rcb_common_cb *rcb_common)
+int hns_rcb_get_cfg(struct rcb_common_cb *rcb_common)
 {
 	struct ring_pair_cb *ring_pair_cb;
 	u32 i;
@@ -517,10 +517,16 @@ void hns_rcb_get_cfg(struct rcb_common_cb *rcb_common)
 		ring_pair_cb->virq[HNS_RCB_IRQ_IDX_RX] =
 		is_ver1 ? platform_get_irq(pdev, base_irq_idx + i * 2 + 1) :
 			  platform_get_irq(pdev, base_irq_idx + i * 3);
+		if ((ring_pair_cb->virq[HNS_RCB_IRQ_IDX_TX] == -EPROBE_DEFER) ||
+		    (ring_pair_cb->virq[HNS_RCB_IRQ_IDX_RX] == -EPROBE_DEFER))
+			return -EPROBE_DEFER;
+
 		ring_pair_cb->q.phy_base =
 			RCB_COMM_BASE_TO_RING_BASE(rcb_common->phy_base, i);
 		hns_rcb_ring_pair_get_cfg(ring_pair_cb);
 	}
+
+	return 0;
 }
 
 /**
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.h b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.h
index a664ee8..6028164 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.h
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_rcb.h
@@ -121,7 +121,7 @@ struct rcb_common_cb {
 void hns_rcb_common_free_cfg(struct dsaf_device *dsaf_dev, u32 comm_index);
 int hns_rcb_common_init_hw(struct rcb_common_cb *rcb_common);
 void hns_rcb_start(struct hnae_queue *q, u32 val);
-void hns_rcb_get_cfg(struct rcb_common_cb *rcb_common);
+int hns_rcb_get_cfg(struct rcb_common_cb *rcb_common);
 void hns_rcb_get_queue_mode(enum dsaf_mode dsaf_mode,
 			    u16 *max_vfn, u16 *max_q_per_vf);
 
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH v2 07/21] crypto: shash, caam: Make use of the new sg_map helper function
From: Herbert Xu @ 2017-04-28  6:30 UTC (permalink / raw)
  To: Logan Gunthorpe
  Cc: dri-devel-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	target-devel-u79uwXL29TY76Z2rM5mHXA, Christoph Hellwig,
	devel-gWbeCf7V1WCQmaza687I9mD2FQJk+8+b, James E.J. Bottomley,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	linux-nvdimm-hn68Rpc1hR1g9hUCZPvPmw,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, Sumit Semwal,
	open-iscsi-/JYPxA39Uh5TLH3MbocFFw,
	linux-media-u79uwXL29TY76Z2rM5mHXA,
	intel-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
	sparmaintainer-GLv8BlqOqDDQT0dZR+AlfA,
	linux-raid-u79uwXL29TY76Z2rM5mHXA,
	megaraidlinux.pdl-dY08KVG/lbpWk0Htik3J/w, Jens Axboe,
	Martin K. Petersen, netdev-u79uwXL29TY76Z2rM5mHXA, Matthew Wilcox,
	linux-mmc-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-crypto-u79uwXL29TY76Z2rM5mHXA, Greg Kroah-Hartman,
	David S. Miller
In-Reply-To: <94123cbf-3287-f05e-7267-0bcf08ab0a8b-OTvnGxWRz7hWk0Htik3J/w@public.gmane.org>

On Thu, Apr 27, 2017 at 09:45:57AM -0600, Logan Gunthorpe wrote:
> 
> 
> On 26/04/17 09:56 PM, Herbert Xu wrote:
> > On Tue, Apr 25, 2017 at 12:20:54PM -0600, Logan Gunthorpe wrote:
> >> Very straightforward conversion to the new function in the caam driver
> >> and shash library.
> >>
> >> Signed-off-by: Logan Gunthorpe <logang-OTvnGxWRz7hWk0Htik3J/w@public.gmane.org>
> >> Cc: Herbert Xu <herbert-lOAM2aK0SrRLBo1qDEOMRrpzq4S04n8Q@public.gmane.org>
> >> Cc: "David S. Miller" <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
> >> ---
> >>  crypto/shash.c                | 9 ++++++---
> >>  drivers/crypto/caam/caamalg.c | 8 +++-----
> >>  2 files changed, 9 insertions(+), 8 deletions(-)
> >>
> >> diff --git a/crypto/shash.c b/crypto/shash.c
> >> index 5e31c8d..5914881 100644
> >> --- a/crypto/shash.c
> >> +++ b/crypto/shash.c
> >> @@ -283,10 +283,13 @@ int shash_ahash_digest(struct ahash_request *req, struct shash_desc *desc)
> >>  	if (nbytes < min(sg->length, ((unsigned int)(PAGE_SIZE)) - offset)) {
> >>  		void *data;
> >>  
> >> -		data = kmap_atomic(sg_page(sg));
> >> -		err = crypto_shash_digest(desc, data + offset, nbytes,
> >> +		data = sg_map(sg, 0, SG_KMAP_ATOMIC);
> >> +		if (IS_ERR(data))
> >> +			return PTR_ERR(data);
> >> +
> >> +		err = crypto_shash_digest(desc, data, nbytes,
> >>  					  req->result);
> >> -		kunmap_atomic(data);
> >> +		sg_unmap(sg, data, 0, SG_KMAP_ATOMIC);
> >>  		crypto_yield(desc->flags);
> >>  	} else
> >>  		err = crypto_shash_init(desc) ?:
> > 
> > Nack.  This is an optimisation for the special case of a single
> > SG list entry.  In fact in the common case the kmap_atomic should
> > disappear altogether in the no-highmem case.  So replacing it
> > with sg_map is not acceptable.
> 
> What you seem to have missed is that sg_map is just a thin wrapper
> around kmap_atomic. Perhaps with a future check for a mappable page.
> This change should have zero impact on performance.

You are right.  Indeed the existing code looks buggy as they
don't take sg->offset into account when doing the kmap.  Could
you send me some patches that fix these problems first so that
they can be easily backported?

Thanks,
-- 
Email: Herbert Xu <herbert-lOAM2aK0SrRLBo1qDEOMRrpzq4S04n8Q@public.gmane.org>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [patch net-next 10/10] net: sched: extend gact to allow jumping to another filter chain
From: Jiri Pirko @ 2017-04-28  6:52 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: netdev, davem, xiyou.wangcong, dsa, edumazet, stephen, daniel,
	alexander.h.duyck, mlxsw, simon.horman
In-Reply-To: <bedf1246-4900-ebba-cf78-936989029db9@mojatatu.com>

Fri, Apr 28, 2017 at 03:41:08AM CEST, jhs@mojatatu.com wrote:
>
>Jiri,
>
>Good stuff!
>Thanks for the effort.
>
>I didnt review the details - will do. I wanted to raise one issue.
>This should work for all actions, not just gact (refer to the
>recent commit i made on the action jumping).
>
>Example policy for policer:
>
>#if packets destined for mac address 52:54:00:3d:c7:6d
>#exceed 90kbps with burst of 90K then jump to chain 11
>#for further classification, otherwise set their skb mark to 11
># and proceed.
>
>tc filter add dev eth0 parent ffff: protocol ip pref 33 \
>flower dst_mac 52:54:00:3d:c7:6d \
>action police rate 1kbit burst 90k conform-exceed pipe/goto chain 11 \
>action skbedit mark 11
>
>But i should also be able to do this for any other action, etc.
>
>For this to work, you have to be able to encode the action in the
>opcode. Something like (for 2^16 chains):
>
>#define TC_ACT_GOTO_CHAIN	0x20000000
>#define TCA_ACT_MAX_CHAIN_MASK 0xFFFF
>
>So 0x20000001 is encoding of chain 1 etc.
>
>I will post the iproute2 code i used for jumping of actions.

You can have multiple actions in list and gact goto as the last one. Why
to do this ugliness?


>
>cheers,
>jamal
>
>On 17-04-27 07:12 AM, Jiri Pirko wrote:
>> From: Jiri Pirko <jiri@mellanox.com>
>> 
>> Introduce new type of gact action called "goto_chain". This allows
>> user to specify a chain to be processed. This action type is
>> then processed as a return value in tcf_classify loop in similar
>> way as "reclassify" is, only it does not reset to the first filter
>> in chain but rather reset to the first filter of the desired chain.
>> 
>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>> ---
>>  include/net/sch_generic.h           |  9 +++++--
>>  include/net/tc_act/tc_gact.h        |  2 ++
>>  include/uapi/linux/pkt_cls.h        |  1 +
>>  include/uapi/linux/tc_act/tc_gact.h |  1 +
>>  net/sched/act_gact.c                | 48 ++++++++++++++++++++++++++++++++++++-
>>  net/sched/cls_api.c                 |  8 +++++--
>>  6 files changed, 64 insertions(+), 5 deletions(-)
>> 
>> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
>> index 569b565..3688501 100644
>> --- a/include/net/sch_generic.h
>> +++ b/include/net/sch_generic.h
>> @@ -193,8 +193,13 @@ struct Qdisc_ops {
>> 
>> 
>>  struct tcf_result {
>> -	unsigned long	class;
>> -	u32		classid;
>> +	union {
>> +		struct {
>> +			unsigned long	class;
>> +			u32		classid;
>> +		};
>> +		const struct tcf_proto *goto_tp;
>> +	};
>>  };
>> 
>>  struct tcf_proto_ops {
>> diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h
>> index b6f1739..58bee54 100644
>> --- a/include/net/tc_act/tc_gact.h
>> +++ b/include/net/tc_act/tc_gact.h
>> @@ -12,6 +12,8 @@ struct tcf_gact {
>>  	int			tcfg_paction;
>>  	atomic_t		packets;
>>  #endif
>> +	struct tcf_chain	*goto_chain;
>> +	struct rcu_head		rcu;
>>  };
>>  #define to_gact(a) ((struct tcf_gact *)a)
>> 
>> diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
>> index f1129e3..e03ba27 100644
>> --- a/include/uapi/linux/pkt_cls.h
>> +++ b/include/uapi/linux/pkt_cls.h
>> @@ -37,6 +37,7 @@ enum {
>>  #define TC_ACT_QUEUED		5
>>  #define TC_ACT_REPEAT		6
>>  #define TC_ACT_REDIRECT		7
>> +#define TC_ACT_GOTO_CHAIN	8
>>  #define TC_ACT_JUMP		0x10000000
>> 
>>  /* Action type identifiers*/
>> diff --git a/include/uapi/linux/tc_act/tc_gact.h b/include/uapi/linux/tc_act/tc_gact.h
>> index 70b536a..388733d 100644
>> --- a/include/uapi/linux/tc_act/tc_gact.h
>> +++ b/include/uapi/linux/tc_act/tc_gact.h
>> @@ -26,6 +26,7 @@ enum {
>>  	TCA_GACT_PARMS,
>>  	TCA_GACT_PROB,
>>  	TCA_GACT_PAD,
>> +	TCA_GACT_CHAIN,
>>  	__TCA_GACT_MAX
>>  };
>>  #define TCA_GACT_MAX (__TCA_GACT_MAX - 1)
>> diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
>> index c527c11..d63aebd 100644
>> --- a/net/sched/act_gact.c
>> +++ b/net/sched/act_gact.c
>> @@ -20,6 +20,7 @@
>>  #include <linux/init.h>
>>  #include <net/netlink.h>
>>  #include <net/pkt_sched.h>
>> +#include <net/pkt_cls.h>
>>  #include <linux/tc_act/tc_gact.h>
>>  #include <net/tc_act/tc_gact.h>
>> 
>> @@ -54,6 +55,7 @@ static g_rand gact_rand[MAX_RAND] = { NULL, gact_net_rand, gact_determ };
>>  static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
>>  	[TCA_GACT_PARMS]	= { .len = sizeof(struct tc_gact) },
>>  	[TCA_GACT_PROB]		= { .len = sizeof(struct tc_gact_p) },
>> +	[TCA_GACT_CHAIN]	= { .type = NLA_U32 },
>>  };
>> 
>>  static int tcf_gact_init(struct net *net, struct tcf_proto *tp,
>> @@ -92,6 +94,9 @@ static int tcf_gact_init(struct net *net, struct tcf_proto *tp,
>>  	}
>>  #endif
>> 
>> +	if (parm->action == TC_ACT_GOTO_CHAIN && !tb[TCA_GACT_CHAIN])
>> +		return -EINVAL;
>> +
>>  	if (!tcf_hash_check(tn, parm->index, a, bind)) {
>>  		ret = tcf_hash_create(tn, parm->index, est, a,
>>  				      &act_gact_ops, bind, true);
>> @@ -121,11 +126,43 @@ static int tcf_gact_init(struct net *net, struct tcf_proto *tp,
>>  		gact->tcfg_ptype   = p_parm->ptype;
>>  	}
>>  #endif
>> +
>> +	if (gact->tcf_action == TC_ACT_GOTO_CHAIN) {
>> +		u32 chain_index = nla_get_u32(tb[TCA_GACT_CHAIN]);
>> +
>> +		if (!tp) {
>> +			if (ret == ACT_P_CREATED)
>> +				tcf_hash_release(*a, bind);
>> +			return -EINVAL;
>> +		}
>> +		gact->goto_chain = tcf_chain_get(tp->chain->block, chain_index);
>> +		if (!gact->goto_chain) {
>> +			if (ret == ACT_P_CREATED)
>> +				tcf_hash_release(*a, bind);
>> +			return -ENOMEM;
>> +		}
>> +	}
>> +
>>  	if (ret == ACT_P_CREATED)
>>  		tcf_hash_insert(tn, *a);
>>  	return ret;
>>  }
>> 
>> +static void tcf_gact_cleanup_rcu(struct rcu_head *rcu)
>> +{
>> +	struct tcf_gact *gact = container_of(rcu, struct tcf_gact, rcu);
>> +
>> +	if (gact->tcf_action == TC_ACT_GOTO_CHAIN)
>> +		tcf_chain_put(gact->goto_chain);
>> +}
>> +
>> +static void tcf_gact_cleanup(struct tc_action *a, int bind)
>> +{
>> +	struct tcf_gact *gact = to_gact(a);
>> +
>> +	call_rcu(&gact->rcu, tcf_gact_cleanup_rcu);
>> +}
>> +
>>  static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,
>>  		    struct tcf_result *res)
>>  {
>> @@ -141,8 +178,13 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,
>>  	}
>>  #endif
>>  	bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), skb);
>> -	if (action == TC_ACT_SHOT)
>> +	if (action == TC_ACT_SHOT) {
>>  		qstats_drop_inc(this_cpu_ptr(gact->common.cpu_qstats));
>> +	} else if (action == TC_ACT_GOTO_CHAIN) {
>> +		struct tcf_chain *chain = gact->goto_chain;
>> +
>> +		res->goto_tp = rcu_dereference_bh(chain->filter_chain);
>> +	}
>> 
>>  	tcf_lastuse_update(&gact->tcf_tm);
>> 
>> @@ -194,6 +236,9 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a,
>>  	tcf_tm_dump(&t, &gact->tcf_tm);
>>  	if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD))
>>  		goto nla_put_failure;
>> +	if (gact->tcf_action == TC_ACT_GOTO_CHAIN &&
>> +	    nla_put_u32(skb, TCA_GACT_CHAIN, gact->goto_chain->index))
>> +		goto nla_put_failure;
>>  	return skb->len;
>> 
>>  nla_put_failure:
>> @@ -225,6 +270,7 @@ static struct tc_action_ops act_gact_ops = {
>>  	.stats_update	=	tcf_gact_stats_update,
>>  	.dump		=	tcf_gact_dump,
>>  	.init		=	tcf_gact_init,
>> +	.cleanup	=	tcf_gact_cleanup,
>>  	.walk		=	tcf_gact_walker,
>>  	.lookup		=	tcf_gact_search,
>>  	.size		=	sizeof(struct tcf_gact),
>> diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
>> index dbc1348..a2d6bc7 100644
>> --- a/net/sched/cls_api.c
>> +++ b/net/sched/cls_api.c
>> @@ -304,10 +304,14 @@ int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
>>  			continue;
>> 
>>  		err = tp->classify(skb, tp, res);
>> -		if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode))
>> +		if (err == TC_ACT_RECLASSIFY && !compat_mode) {
>>  			goto reset;
>> -		if (err >= 0)
>> +		} else if (err == TC_ACT_GOTO_CHAIN) {
>> +			old_tp = res->goto_tp;
>> +			goto reset;
>> +		} else if (err >= 0) {
>>  			return err;
>> +		}
>>  	}
>> 
>>  	return TC_ACT_UNSPEC; /* signal: continue lookup */
>> 
>

^ permalink raw reply

* Re: [patch net-next 00/10] net: sched: introduce multichain support for filters
From: Jiri Pirko @ 2017-04-28  6:53 UTC (permalink / raw)
  To: Cong Wang
  Cc: Linux Kernel Network Developers, David Miller, Jamal Hadi Salim,
	David Ahern, Eric Dumazet, Stephen Hemminger, Daniel Borkmann,
	Alexander Duyck, mlxsw, Simon Horman
In-Reply-To: <CAM_iQpXUs1301-OR1_bsO-0bU8sSBVo2BWRHy4qNhzdHxvJWaQ@mail.gmail.com>

Thu, Apr 27, 2017 at 07:46:03PM CEST, xiyou.wangcong@gmail.com wrote:
>On Thu, Apr 27, 2017 at 4:12 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>> Simple example:
>> $ tc qdisc add dev eth0 ingress
>> $ tc filter add dev eth0 parent ffff: protocol ip pref 33 flower dst_mac 52:54:00:3d:c7:6d action goto chain 11
>> $ tc filter add dev eth0 parent ffff: protocol ip pref 22 chain 11 flower dst_ip 192.168.40.1 action drop
>> $ tc filter show dev eth0 root
>
>Interesting.
>
>I don't look into the code yet. If I understand the concepts correctly,
>so with your patchset we can mark either filter with a chain No. to
>choose which chain it belongs to _logically_ even though
>_physically_ it is still in the old-fashion chain (prio, proto)?

You have to see the code :)

There are physically multiple chains


>
>If so, you have to ensure proto is same since the protocol of
>the packet does not change dynamically? And the original
>priority becomes pointless with chains since we can just to
>any other chain in any order?
>
>By default, without any chain No., you use 0 for all the chains,
>so the old-fashion chain still works.

Yes.

>
>Thanks.

^ permalink raw reply

* Re: [PATCH net-next v8 2/3] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch
From: Jiri Pirko @ 2017-04-28  7:02 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: davem, xiyou.wangcong, eric.dumazet, netdev, Simon Horman,
	Benjamin LaHaise
In-Reply-To: <c1680135-8c9f-14d0-c65e-7a5bb7d8d661@mojatatu.com>

Fri, Apr 28, 2017 at 03:22:53AM CEST, jhs@mojatatu.com wrote:
>On 17-04-27 02:30 AM, Jiri Pirko wrote:
>> Wed, Apr 26, 2017 at 10:07:08PM CEST, jhs@mojatatu.com wrote:
>> > On 17-04-26 09:56 AM, Jiri Pirko wrote:
>> > > Wed, Apr 26, 2017 at 03:14:38PM CEST, jhs@mojatatu.com wrote:
>
>> > I think to have flags at that level is useful but it
>> > is a different hierarchy level. I am not sure the
>> > "actions dump large messages" is a fit for that level.
>> 
>> Jamal, the idea is to have exactly what you want to have. Only does not
>> use NLA_U32 attr for that but a special attr NLA_FLAGS which would have
>> well defined semantics and set of helpers to work with and enforce it.
>> 
>> Then, this could be easily reused in other subsystem that uses netlink
>> 
>
>Maybe I am misunderstanding:
>Recall, this is what it looks like with this patchset:
><nlh><subsytem-header>[TCA_ROOT_XXXX]
>
>TCA_ROOT_XXX is very subsystem specific. classifiers, qdiscs and many
>subsystems defined their own semantics for that TLV level. This specific
>"dump max" is very very specific to actions. They were crippled by the
>fact you could only send 32 at a time - this allows more to be sent.
>
>I thought initially you meant:
><nlh>[NLA_XXX]<subsytem-header>[TCA_ROOT_XXXX]
>
>I think at the NLA_XXX you could fit netlink wide TLVs - but if i said
>"do a large dump" it is of no use to any other subsystem.

Okay, I'm sorry, I had couple of beers yesterday so that might be
the cause why your msg makes me totally confused :O

All I suggest is to replace NLA_U32 flags you want that does not
have any semantics with NLA_FLAGS flags, which eventually will carry
the exact same u32, but with predefined semantics, helpers, everything.

^ permalink raw reply

* Re: ipsec doesn't route TCP with 4.11 kernel
From: Steffen Klassert @ 2017-04-28  7:13 UTC (permalink / raw)
  To: Don Bowman
  Cc: Cong Wang, linux-kernel@vger.kernel.org, Herbert Xu,
	Linux Kernel Network Developers
In-Reply-To: <CADJev7_=YEHmijGweqZvdATMQVuzwywEbBKweYvPurJfTEQRjQ@mail.gmail.com>

On Thu, Apr 27, 2017 at 06:13:38PM -0400, Don Bowman wrote:
> On 27 April 2017 at 04:42, Steffen Klassert <steffen.klassert@secunet.com>
> wrote:
> > On Wed, Apr 26, 2017 at 10:01:34PM -0700, Cong Wang wrote:
> >> (Cc'ing netdev and IPSec maintainers)
> >>
> >> On Tue, Apr 25, 2017 at 6:08 PM, Don Bowman <db@donbowman.ca> wrote:
> 
> for 'esp' question, i have ' esp = aes256-sha256-modp1536!' is that what
> you mean?
> its nat-aware tunnel [from my desktop pc to my office]
> 
> root@office:~# ip -s x s
> src 172.16.0.8 dst 64.7.137.180
>         proto esp spi 0x0d588366(223904614) reqid 1(0x00000001) mode tunnel
>         replay-window 0 seq 0x00000000 flag af-unspec (0x00100000)
>         auth-trunc hmac(sha256)
> 0x046cafdf19c5d78d1c29165d96a0b9fce1c500029d77be0fe956dce1bf80a86a (256
> bits) 128
>         enc cbc(aes)
> 0x79ff2fbc2178eb468de6ff16612f0603b514a1d1d5f375c67222294463ec7c62 (256
> bits)
>         encap type espinudp sport 4500 dport 4500 addr 0.0.0.0

Ok, this is espinudp. This information was important.

> 
> I'm not sure what you mean the receiving interface, you mean the outer, the
> native interface?
> listening on eno1, link-type EN10MB (Ethernet), capture size 262144 bytes
> 18:11:32.061501 IP 172.16.0.8.3416 > 64.7.137.180.33638:
> truncated-udplength 0
> 18:11:32.788091 IP 64.7.137.180.4500 > 172.16.0.8.4500: NONESP-encap:
> isakmp: child_sa  inf2
> 18:11:32.788354 IP 172.16.0.8.4500 > 64.7.137.180.4500: NONESP-encap:
> isakmp: child_sa  inf2[IR]
> 18:11:33.066830 IP 172.16.0.8.3416 > 64.7.137.180.33638:
> truncated-udplength 0
> 18:11:35.082839 IP 172.16.0.8.3416 > 64.7.137.180.33638:
> truncated-udplength 0

This is not a GRO issue as I thought, the TX side is already broken.

Could you please try the patch below?

Subject: [PATCH] esp4: Fix udpencap for local TCP packets.

Locally generated TCP packets are usually cloned, so we
do skb_cow_data() on this packets. After that we need to
reload the pointer to the esp header. On udpencap this
header has an offset to skb_transport_header, so take this
offset into account.

Fixes: commit cac2661c53f ("esp4: Avoid skb_cow_data whenever possible")
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/esp4.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index b1e2444..ab71fbb 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -223,6 +223,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 	int extralen;
 	int tailen;
 	__be64 seqno;
+	int esp_offset = 0;
 	__u8 proto = *skb_mac_header(skb);
 
 	/* skb is pure payload to encrypt */
@@ -288,6 +289,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 			break;
 		}
 
+		esp_offset = (unsigned char *)esph - (unsigned char *)uh;
+
 		*skb_mac_header(skb) = IPPROTO_UDP;
 	}
 
@@ -397,7 +400,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 		goto error;
 	nfrags = err;
 	tail = skb_tail_pointer(trailer);
-	esph = ip_esp_hdr(skb);
+	esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esp_offset);
 
 skip_cow:
 	esp_output_fill_trailer(tail, tfclen, plen, proto);
-- 
2.7.4

^ permalink raw reply related

* Re: Network cooling device and how to control NIC speed on thermal condition
From: Waldemar Rymarkiewicz @ 2017-04-28  8:04 UTC (permalink / raw)
  To: Alan Cox, Andrew Lunn, Florian Fainelli; +Cc: netdev, linux-kernel
In-Reply-To: <20170425144501.0cfe27a5@lxorguk.ukuu.org.uk>

On 25 April 2017 at 15:45, Alan Cox <gnomes@lxorguk.ukuu.org.uk> wrote:
>> I am looking on Linux thermal framework and on how to cool down the
>> system effectively when it hits thermal condition. Already existing
>> cooling methods cpu_cooling and clock_cooling are good. However, I
>> wanted to go further and dynamically control also a switch ports'
>> speed based on thermal condition. Lowering speed means less power,
>> less power means lower temp.
>>
>> Is there any in-kernel interface to configure switch port/NIC from other driver?
>
> No but you can always hook that kind of functionality to the thermal
> daemon. However I'd be careful with your assumptions. Lower speed also
> means more time active.
>
> https://github.com/01org/thermal_daemon

This is one of the option indeed. Will consider this option as well. I
would see, however,  a generic solution in the kernel  (configurable
of course) as every network device can generate higher heat with
higher link speed.

> For example if you run a big encoding job on an atom instead of an Intel
> i7, the atom will often not only take way longer but actually use more
> total power than the i7 did.
>
> Thus it would often be far more efficient to time synchronize your
> systems, batch up data on the collecting end, have the processing node
> wake up on an alarm, collect data from the other node and then actually
> go back into suspend.

Yes, that's true in a normal thermal conditions. However, if the
platform reaches max temp trip we don't really care about performance
and time efficiency  we just try to avoid critical trip and system
shutdown by cooling the system eg. lowering cpu freq, limiting usb phy
speed, or net  link speed etc.

I did a quick test to show you what I am about.

I collect SoC temp every a few secs. Meantime, I use ethtool -s ethX
speed <speed> to manipulate link speed and to see how it impacts SoC
temp. My 4 PHYs and switch are integrated into SoC and I always
change link speed for all PHYs , no traffic on the link for this test.
Starting with 1Gb/s and then scaling down to 100 Mb/s and then to
10Mb/s, I see significant  ~10 *C drop in temp while link is set to
10Mb/s.

So, throttling link speed can really help to dissipate heat
significantly when the platform is under threat.

Renegotiating link speed costs something I agree, it also impacts user
experience, but such a thermal condition will not occur often I
believe.


/Waldek

^ permalink raw reply

* Re: [PATCH net-next 1/4] ixgbe: sparc: rename the ARCH_WANT_RELAX_ORDER to IXGBE_ALLOW_RELAXED_ORDER
From: Lucas Stach @ 2017-04-28  8:51 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Alexander Duyck, Ding Tianhong, Mark Rutland, Amir Ancel,
	Gabriele Paoloni, linux-pci@vger.kernel.org, Catalin Marinas,
	Will Deacon, LinuxArm, David Laight, jeffrey.t.kirsher@intel.com,
	netdev@vger.kernel.org, Robin Murphy, davem@davemloft.net,
	linux-arm-kernel@lists.infradead.org, Casey Leedom
In-Reply-To: <20170427171938.GA10705@bhelgaas-glaptop.roam.corp.google.com>

Am Donnerstag, den 27.04.2017, 12:19 -0500 schrieb Bjorn Helgaas:
> [+cc Casey]
> 
> On Wed, Apr 26, 2017 at 09:18:33AM -0700, Alexander Duyck wrote:
> > On Wed, Apr 26, 2017 at 2:26 AM, Ding Tianhong <dingtianhong@huawei.com> wrote:
> > > Hi Amir:
> > >
> > > It is really glad to hear that the mlx5 will support RO mode this year, if so, do you agree that enable it dynamic by ethtool -s xxx,
> > > we have try it several month ago but there was only one drivers would use it at that time so the maintainer against it, it mlx5 would support RO,
> > > we could try to restart this solution, what do you think about it. :)
> > >
> > > Thanks
> > > Ding
> > 
> > Hi Ding,
> > 
> > Enabing relaxed ordering really doesn't have any place in ethtool. It
> > is a PCIe attribute that you are essentially wanting to enable.
> > 
> > It might be worth while to take a look at updating the PCIe code path
> > to handle this. Really what we should probably do is guarantee that
> > the architectures that need relaxed ordering are setting it in the
> > PCIe Device Control register and that the ones that don't are clearing
> > the bit. It's possible that this is already occurring, but I don't
> > know the state of handling those bits is in the kernel. Once we can
> > guarantee that we could use that to have the drivers determine their
> > behavior in regards to relaxed ordering. For example in the case of
> > igb/ixgbe we could probably change the behavior so that it will bey
> > default try to use relaxed ordering but if it is not enabled in PCIe
> > Device Control register the hardware should not request to use it. It
> > would simplify things in the drivers and allow for each architecture
> > to control things as needed in their PCIe code.
> 
> I thought Relaxed Ordering was an optimization.  Are there cases where
> it is actually required for correct behavior?

Yes, at least the Tegra 2 TRM claims that RO needs to be enabled on the
device side for correct operation with the following language:

"Tegra 2 requires relaxed ordering for responses to downstream requests
(responses can pass writes). It is possible in some circumstances for
PCIe transfers from an external bus masters (i.e. upstream transfers) to
become blocked by a downstream read or non-posted write. The responses
to these downstream requests are blocked by upstream posted writes only
when PCIe strict ordering is imposed. It is therefore necessary to never
impose strict ordering that would block a response to a downstream
NPW/read request and always set the relaxed ordering bit to 1. Only
devices that are capable of relaxed ordering may be used with Tegra 2
devices."

Regards,
Lucas

^ permalink raw reply

* Re: [PATCH v1 net-next 5/6] net: allow simultaneous SW and HW transmit timestamping
From: Miroslav Lichvar @ 2017-04-28  8:54 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Network Development, Richard Cochran, Willem de Bruijn,
	Soheil Hassas Yeganeh, Keller, Jacob E, Denny Page, Jiri Benc
In-Reply-To: <CAF=yD-+GSK491AWQx8=6yd3=-HHwxdWq677ubwdjbV5AXzRbog@mail.gmail.com>

On Wed, Apr 26, 2017 at 08:00:02PM -0400, Willem de Bruijn wrote:
> > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> > index 81ef53f..42bff22 100644
> > --- a/include/linux/skbuff.h
> > +++ b/include/linux/skbuff.h
> > @@ -3300,8 +3300,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
> >
> >  static inline void sw_tx_timestamp(struct sk_buff *skb)
> >  {
> > -       if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP &&
> > -           !(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS))
> > +       if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP)
> >                 skb_tstamp_tx(skb, NULL);
> >  }

> > +++ b/net/core/skbuff.c
> > @@ -3874,6 +3874,10 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
> >         if (!sk)
> >                 return;
> >
> > +       if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
> > +           skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
> > +               return;
> > +
> 
> This check should only happen for software transmit timestamps, so simpler to
> revise the check in sw_tx_timestamp above to
> 
>   if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP &&
> -        !(skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS))
> +      (!(skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)) ||
> +      (skb->sk && skb->sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW)

I'm not sure if this can work. sk_buff.h would need to include sock.h
in order to get the definition of struct sock. Any suggestions?

-- 
Miroslav Lichvar

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox