Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [net-next, PATCH 2/2, v2] net: socionext: add XDP support
From: Jesper Dangaard Brouer @ 2018-09-12  9:25 UTC (permalink / raw)
  To: Ilias Apalodimas
  Cc: netdev, jaswinder.singh, ard.biesheuvel, masami.hiramatsu, arnd,
	mykyta.iziumtsev, bjorn.topel, magnus.karlsson, daniel, ast,
	brouer
In-Reply-To: <1536742958-29887-3-git-send-email-ilias.apalodimas@linaro.org>

On Wed, 12 Sep 2018 12:02:38 +0300
Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:

>  static const struct net_device_ops netsec_netdev_ops = {
>  	.ndo_init		= netsec_netdev_init,
>  	.ndo_uninit		= netsec_netdev_uninit,
> @@ -1430,6 +1627,7 @@ static const struct net_device_ops netsec_netdev_ops = {
>  	.ndo_set_mac_address    = eth_mac_addr,
>  	.ndo_validate_addr	= eth_validate_addr,
>  	.ndo_do_ioctl		= netsec_netdev_ioctl,
> +	.ndo_bpf		= netsec_xdp,
>  };
>  

You have not implemented ndo_xdp_xmit.

Thus, you have "only" implemented the RX side of XDP_REDIRECT, which
allows you to do cpumap and AF_XDP redirects, but does not allow other
drivers to XDP-send out of this device.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [net-next, PATCH 2/2, v2] net: socionext: add XDP support
From: Ilias Apalodimas @ 2018-09-12  9:20 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: netdev, jaswinder.singh, ard.biesheuvel, masami.hiramatsu, arnd,
	mykyta.iziumtsev, bjorn.topel, magnus.karlsson, daniel, ast
In-Reply-To: <20180912111457.0121d9f3@redhat.com>

On Wed, Sep 12, 2018 at 11:14:57AM +0200, Jesper Dangaard Brouer wrote:
> On Wed, 12 Sep 2018 12:02:38 +0300
> Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:
> 
> > @@ -1003,20 +1076,29 @@ static int netsec_setup_rx_dring(struct netsec_priv *priv)
> >  		u16 len;
> >  
> >  		buf = netsec_alloc_rx_data(priv, &dma_handle, &len);
> > -		if (!buf) {
> > -			netsec_uninit_pkt_dring(priv, NETSEC_RING_RX);
> > +		if (!buf)
> >  			goto err_out;
> > -		}
> >  		desc->dma_addr = dma_handle;
> >  		desc->addr = buf;
> >  		desc->len = len;
> >  	}
> >  
> >  	netsec_rx_fill(priv, 0, DESC_NUM);
> > +	err = xdp_rxq_info_reg(&dring->xdp_rxq, priv->ndev, 0);
> 
> Do you only have 1 RX queue? (last arg to xdp_rxq_info_reg is 0),
> 
> 
Yes, the current driver only supports a single queue (same for Tx).
> > +	if (err)
> > +		goto err_out;
> > +
> > +	err = xdp_rxq_info_reg_mem_model(&dring->xdp_rxq, MEM_TYPE_PAGE_SHARED,
> > +					 NULL);
> > +	if (err) {
> > +		xdp_rxq_info_unreg(&dring->xdp_rxq);
> > +		goto err_out;
> > +	}
> >  
> >  	return 0;
> >  
> 
> 
> -- 
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   LinkedIn: http://www.linkedin.com/in/brouer


Thanks for looking at this

/Ilias

^ permalink raw reply

* Re: [net-next, PATCH 2/2, v2] net: socionext: add XDP support
From: Jesper Dangaard Brouer @ 2018-09-12  9:14 UTC (permalink / raw)
  To: Ilias Apalodimas
  Cc: netdev, jaswinder.singh, ard.biesheuvel, masami.hiramatsu, arnd,
	mykyta.iziumtsev, bjorn.topel, magnus.karlsson, daniel, ast,
	brouer
In-Reply-To: <1536742958-29887-3-git-send-email-ilias.apalodimas@linaro.org>

On Wed, 12 Sep 2018 12:02:38 +0300
Ilias Apalodimas <ilias.apalodimas@linaro.org> wrote:

> @@ -1003,20 +1076,29 @@ static int netsec_setup_rx_dring(struct netsec_priv *priv)
>  		u16 len;
>  
>  		buf = netsec_alloc_rx_data(priv, &dma_handle, &len);
> -		if (!buf) {
> -			netsec_uninit_pkt_dring(priv, NETSEC_RING_RX);
> +		if (!buf)
>  			goto err_out;
> -		}
>  		desc->dma_addr = dma_handle;
>  		desc->addr = buf;
>  		desc->len = len;
>  	}
>  
>  	netsec_rx_fill(priv, 0, DESC_NUM);
> +	err = xdp_rxq_info_reg(&dring->xdp_rxq, priv->ndev, 0);

Do you only have 1 RX queue? (last arg to xdp_rxq_info_reg is 0),


> +	if (err)
> +		goto err_out;
> +
> +	err = xdp_rxq_info_reg_mem_model(&dring->xdp_rxq, MEM_TYPE_PAGE_SHARED,
> +					 NULL);
> +	if (err) {
> +		xdp_rxq_info_unreg(&dring->xdp_rxq);
> +		goto err_out;
> +	}
>  
>  	return 0;
>  


-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* [PATCH] brcm80211: remove redundant condition check before debugfs_remove_recursive
From: zhong jiang @ 2018-09-12 14:10 UTC (permalink / raw)
  To: kvalo-sgV2jX0FEOL9JmXXK+q4OQ, davem-fT/PcQaiUtIeIZ0/mPfg9Q
  Cc: hante.meuleman-dY08KVG/lbpWk0Htik3J/w,
	franky.lin-dY08KVG/lbpWk0Htik3J/w,
	arend.vanspriel-dY08KVG/lbpWk0Htik3J/w,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	brcm80211-dev-list.pdl-dY08KVG/lbpWk0Htik3J/w,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

debugfs_remove_recursive has taken IS_ERR_OR_NULL into account. So just
remove the condition check before debugfs_remove_recursive.

Signed-off-by: zhong jiang <zhongjiang-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
---
 drivers/net/wireless/broadcom/brcm80211/brcmsmac/debug.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/debug.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/debug.c
index 2fe1f68..3bd54f1 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/debug.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/debug.c
@@ -62,8 +62,7 @@ int brcms_debugfs_attach(struct brcms_pub *drvr)
 
 void brcms_debugfs_detach(struct brcms_pub *drvr)
 {
-	if (!IS_ERR_OR_NULL(drvr->dbgfs_dir))
-		debugfs_remove_recursive(drvr->dbgfs_dir);
+	debugfs_remove_recursive(drvr->dbgfs_dir);
 }
 
 struct dentry *brcms_debugfs_get_devdir(struct brcms_pub *drvr)
-- 
1.7.12.4

^ permalink raw reply related

* Re: [PATCH v2] neighbour: confirm neigh entries when ARP packet is received
From: Sergei Shtylyov @ 2018-09-12  9:05 UTC (permalink / raw)
  To: Vasily Khoruzhick, David S. Miller, Roopa Prabhu, Alexey Dobriyan,
	Eric Dumazet, Stephen Hemminger, Jim Westfall, Wolfgang Bumiller,
	Vasily Khoruzhick, Kees Cook, Ihar Hrachyshka, netdev,
	linux-kernel
In-Reply-To: <20180911180406.31283-1-vasilykh@arista.com>

Hello!

On 9/11/2018 9:04 PM, Vasily Khoruzhick wrote:

> Update 'confirmed' timestamp when ARP packet is received. It shouldn't
> affect locktime logic and anyway entry can be confirmed by any higher-layer
> protocol. Thus it makes to sense not to confirm it when ARP packet is

    "Makes sense" or "makes no sense"?

> received.
> 
> Fixes: 77d7123342 ("neighbour: update neigh timestamps iff update is
> effective")
> 
> Signed-off-by: Vasily Khoruzhick <vasilykh@arista.com>
[...]

MBR, Sergei

^ permalink raw reply

* [net-next, PATCH 2/2, v2] net: socionext: add XDP support
From: Ilias Apalodimas @ 2018-09-12  9:02 UTC (permalink / raw)
  To: netdev, jaswinder.singh
  Cc: ard.biesheuvel, masami.hiramatsu, arnd, mykyta.iziumtsev,
	bjorn.topel, magnus.karlsson, brouer, daniel, ast,
	Ilias Apalodimas
In-Reply-To: <1536742958-29887-1-git-send-email-ilias.apalodimas@linaro.org>

Add basic XDP support

Signed-off-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
---
 drivers/net/ethernet/socionext/netsec.c | 234 +++++++++++++++++++++++++++++---
 1 file changed, 216 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c
index 666fee2..1f4594f 100644
--- a/drivers/net/ethernet/socionext/netsec.c
+++ b/drivers/net/ethernet/socionext/netsec.c
@@ -9,6 +9,9 @@
 #include <linux/etherdevice.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
+#include <linux/netlink.h>
+#include <linux/bpf.h>
+#include <linux/bpf_trace.h>
 
 #include <net/tcp.h>
 #include <net/ip6_checksum.h>
@@ -238,6 +241,11 @@
 
 #define NETSEC_F_NETSEC_VER_MAJOR_NUM(x)	((x) & 0xffff0000)
 
+#define NETSEC_XDP_PASS          0
+#define NETSEC_XDP_CONSUMED      BIT(0)
+#define NETSEC_XDP_TX            BIT(1)
+#define NETSEC_XDP_REDIR         BIT(2)
+
 enum ring_id {
 	NETSEC_RING_TX = 0,
 	NETSEC_RING_RX
@@ -256,11 +264,14 @@ struct netsec_desc_ring {
 	void *vaddr;
 	u16 pkt_cnt;
 	u16 head, tail;
+	bool is_xdp;
+	struct xdp_rxq_info xdp_rxq;
 };
 
 struct netsec_priv {
 	struct netsec_desc_ring desc_ring[NETSEC_RING_MAX];
 	struct ethtool_coalesce et_coalesce;
+	struct bpf_prog *xdp_prog;
 	spinlock_t reglock; /* protect reg access */
 	struct napi_struct napi;
 	phy_interface_t phy_interface;
@@ -297,6 +308,8 @@ struct netsec_rx_pkt_info {
 };
 
 static void netsec_rx_fill(struct netsec_priv *priv, u16 from, u16 num);
+static u32 netsec_run_xdp(struct netsec_desc *desc, struct netsec_priv *priv,
+			  struct bpf_prog *prog, struct xdp_buff *xdp);
 
 static void *netsec_alloc_rx_data(struct netsec_priv *priv,
 				  dma_addr_t *dma_addr, u16 *len);
@@ -613,13 +626,23 @@ static int netsec_clean_tx_dring(struct netsec_priv *priv, int budget)
 
 		eop = (entry->attr >> NETSEC_TX_LAST) & 1;
 
-		dma_unmap_single(priv->dev, desc->dma_addr, desc->len,
-				 DMA_TO_DEVICE);
-		if (eop) {
-			pkts++;
+		if (desc->skb)
+			dma_unmap_single(priv->dev,
+					 desc->dma_addr - XDP_PACKET_HEADROOM,
+					 desc->len, DMA_TO_DEVICE);
+
+		if (!eop) {
+			*desc = (struct netsec_desc){};
+			continue;
+		}
+
+		if (!desc->skb) {
+			skb_free_frag(desc->addr);
+		} else {
 			bytes += desc->skb->len;
 			dev_kfree_skb(desc->skb);
 		}
+		pkts++;
 		*desc = (struct netsec_desc){};
 	}
 	dring->pkt_cnt -= budget;
@@ -659,19 +682,22 @@ static void nsetsec_adv_desc(u16 *idx)
 static int netsec_process_rx(struct netsec_priv *priv, int budget)
 {
 	struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX];
+	struct bpf_prog *xdp_prog = READ_ONCE(priv->xdp_prog);
 	struct net_device *ndev = priv->ndev;
-	struct sk_buff *skb;
+	struct sk_buff *skb = NULL;
+	u32 xdp_flush = 0;
+	u32 xdp_result;
 	int done = 0;
 
 	while (done < budget) {
 		u16 idx = dring->tail;
 		struct netsec_de *de = dring->vaddr + (DESC_SZ * idx);
 		struct netsec_desc *desc = &dring->desc[idx];
+		dma_addr_t dma_handle, dma_unmap;
 		struct netsec_rx_pkt_info rpi;
-		dma_addr_t dma_handle;
+		u16 pkt_len, desc_len;
+		struct xdp_buff xdp;
 		void *buf_addr;
-		u16 pkt_len;
-		u16 desc_len;
 
 		if (de->attr & (1U << NETSEC_RX_PKT_OWN_FIELD))
 			break;
@@ -704,10 +730,40 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget)
 
 		prefetch(desc->addr);
 		buf_addr = netsec_alloc_rx_data(priv, &dma_handle, &desc_len);
+
 		if (unlikely(!buf_addr))
 			break;
 
-		skb = build_skb(desc->addr, desc->len);
+		dma_unmap = dring->is_xdp ?
+			desc->dma_addr - XDP_PACKET_HEADROOM : desc->dma_addr;
+
+		xdp.data_hard_start = desc->addr;
+		xdp.data = desc->addr;
+		xdp_set_data_meta_invalid(&xdp);
+		xdp.data_end = xdp.data + pkt_len;
+		xdp.rxq = &dring->xdp_rxq;
+
+		if (xdp_prog) {
+			xdp.data = desc->addr + XDP_PACKET_HEADROOM;
+			xdp.data_end = xdp.data + pkt_len;
+			xdp_result = netsec_run_xdp(desc, priv, xdp_prog, &xdp);
+			if (xdp_result != NETSEC_XDP_PASS) {
+				xdp_flush |= xdp_result & NETSEC_XDP_REDIR;
+
+				dma_unmap_single_attrs(priv->dev, dma_unmap,
+						       desc->len, DMA_TO_DEVICE,
+						       DMA_ATTR_SKIP_CPU_SYNC);
+
+				desc->len = desc_len;
+				desc->dma_addr = dma_handle;
+				desc->addr = buf_addr;
+				netsec_rx_fill(priv, idx, 1);
+				nsetsec_adv_desc(&dring->tail);
+				continue;
+			}
+		}
+
+		skb = build_skb(xdp.data_hard_start, desc->len);
 		if (unlikely(!skb)) {
 			dma_unmap_single(priv->dev, dma_handle, desc_len,
 					 DMA_TO_DEVICE);
@@ -716,7 +772,7 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget)
 				  "rx failed to alloc skb\n");
 			break;
 		}
-		dma_unmap_single_attrs(priv->dev, desc->dma_addr, desc->len,
+		dma_unmap_single_attrs(priv->dev, dma_unmap, desc->len,
 				       DMA_TO_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
 
 		/* Update the descriptor with fresh buffers */
@@ -724,7 +780,8 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget)
 		desc->dma_addr = dma_handle;
 		desc->addr = buf_addr;
 
-		skb_put(skb, pkt_len);
+		skb_reserve(skb, xdp.data - xdp.data_hard_start);
+		skb_put(skb, xdp.data_end - xdp.data);
 		skb->protocol = eth_type_trans(skb, priv->ndev);
 
 		if (priv->rx_cksum_offload_flag &&
@@ -733,13 +790,16 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget)
 
 		if (napi_gro_receive(&priv->napi, skb) != GRO_DROP) {
 			ndev->stats.rx_packets++;
-			ndev->stats.rx_bytes += pkt_len;
+			ndev->stats.rx_bytes += xdp.data_end - xdp.data;
 		}
 
 		netsec_rx_fill(priv, idx, 1);
 		nsetsec_adv_desc(&dring->tail);
 	}
 
+	if (xdp_flush & NETSEC_XDP_REDIR)
+		xdp_do_flush_map();
+
 	return done;
 }
 
@@ -892,6 +952,9 @@ static void netsec_uninit_pkt_dring(struct netsec_priv *priv, int id)
 	if (!dring->vaddr || !dring->desc)
 		return;
 
+	if (xdp_rxq_info_is_reg(&dring->xdp_rxq))
+		xdp_rxq_info_unreg(&dring->xdp_rxq);
+
 	for (idx = 0; idx < DESC_NUM; idx++) {
 		desc = &dring->desc[idx];
 		if (!desc->addr)
@@ -931,11 +994,14 @@ static void netsec_free_dring(struct netsec_priv *priv, int id)
 static void *netsec_alloc_rx_data(struct netsec_priv *priv,
 				  dma_addr_t *dma_handle, u16 *desc_len)
 {
+	struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX];
 	size_t len = priv->ndev->mtu + ETH_HLEN + VLAN_HLEN * 2 + NET_SKB_PAD +
 		NET_IP_ALIGN;
 	dma_addr_t mapping;
 	void *buf;
 
+	if (dring->is_xdp)
+		len += XDP_PACKET_HEADROOM;
 	len = SKB_DATA_ALIGN(len);
 	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
@@ -943,11 +1009,12 @@ static void *netsec_alloc_rx_data(struct netsec_priv *priv,
 	if (!buf)
 		return NULL;
 
-	mapping = dma_map_single(priv->dev, buf, len, DMA_FROM_DEVICE);
+	mapping = dma_map_single(priv->dev, buf, len,
+				 DMA_FROM_DEVICE);
 	if (unlikely(dma_mapping_error(priv->dev, mapping)))
 		goto err_out;
 
-	*dma_handle = mapping;
+	*dma_handle = mapping + (dring->is_xdp ? XDP_PACKET_HEADROOM : 0);
 	*desc_len = len;
 
 	return buf;
@@ -994,7 +1061,13 @@ static int netsec_alloc_dring(struct netsec_priv *priv, enum ring_id id)
 static int netsec_setup_rx_dring(struct netsec_priv *priv)
 {
 	struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX];
-	int i;
+	struct bpf_prog *xdp_prog = READ_ONCE(priv->xdp_prog);
+	int i, err;
+
+	if (xdp_prog)
+		dring->is_xdp = true;
+	else
+		dring->is_xdp = false;
 
 	for (i = 0; i < DESC_NUM; i++) {
 		struct netsec_desc *desc = &dring->desc[i];
@@ -1003,20 +1076,29 @@ static int netsec_setup_rx_dring(struct netsec_priv *priv)
 		u16 len;
 
 		buf = netsec_alloc_rx_data(priv, &dma_handle, &len);
-		if (!buf) {
-			netsec_uninit_pkt_dring(priv, NETSEC_RING_RX);
+		if (!buf)
 			goto err_out;
-		}
 		desc->dma_addr = dma_handle;
 		desc->addr = buf;
 		desc->len = len;
 	}
 
 	netsec_rx_fill(priv, 0, DESC_NUM);
+	err = xdp_rxq_info_reg(&dring->xdp_rxq, priv->ndev, 0);
+	if (err)
+		goto err_out;
+
+	err = xdp_rxq_info_reg_mem_model(&dring->xdp_rxq, MEM_TYPE_PAGE_SHARED,
+					 NULL);
+	if (err) {
+		xdp_rxq_info_unreg(&dring->xdp_rxq);
+		goto err_out;
+	}
 
 	return 0;
 
 err_out:
+	netsec_uninit_pkt_dring(priv, NETSEC_RING_RX);
 	return -ENOMEM;
 }
 
@@ -1420,6 +1502,121 @@ static int netsec_netdev_ioctl(struct net_device *ndev, struct ifreq *ifr,
 	return phy_mii_ioctl(ndev->phydev, ifr, cmd);
 }
 
+static u32 netsec_xmit_xdp(struct netsec_priv *priv, struct xdp_buff *xdp,
+			   struct netsec_desc *rx_desc)
+{
+	struct netsec_desc_ring *tx_ring = &priv->desc_ring[NETSEC_RING_TX];
+	struct netsec_tx_pkt_ctrl tx_ctrl = {};
+	struct netsec_desc tx_desc;
+	int filled;
+	u32 len;
+
+	len = xdp->data_end - xdp->data;
+
+	if (tx_ring->head >= tx_ring->tail)
+		filled = tx_ring->head - tx_ring->tail;
+	else
+		filled = tx_ring->head + DESC_NUM - tx_ring->tail;
+
+	if (DESC_NUM - filled <= 1)
+		return NETSEC_XDP_CONSUMED;
+
+	dma_sync_single_for_device(priv->dev, rx_desc->dma_addr, len,
+				   DMA_TO_DEVICE);
+
+	tx_desc.dma_addr = rx_desc->dma_addr;
+	tx_desc.addr = xdp->data;
+	tx_desc.len = len;
+
+	netsec_set_tx_de(priv, tx_ring, &tx_ctrl, &tx_desc, NULL);
+	netsec_write(priv, NETSEC_REG_NRM_TX_PKTCNT, 1);
+
+	return NETSEC_XDP_TX;
+}
+
+static u32 netsec_run_xdp(struct netsec_desc *desc, struct netsec_priv *priv,
+			  struct bpf_prog *prog, struct xdp_buff *xdp)
+{
+	u32 ret = NETSEC_XDP_PASS;
+	int err;
+	u32 act;
+
+	rcu_read_lock();
+	act = bpf_prog_run_xdp(prog, xdp);
+
+	switch (act) {
+	case XDP_PASS:
+		ret = NETSEC_XDP_PASS;
+		break;
+	case XDP_TX:
+		ret = netsec_xmit_xdp(priv, xdp, desc);
+		break;
+	case XDP_REDIRECT:
+		err = xdp_do_redirect(priv->ndev, xdp, prog);
+		if (!err) {
+			ret = NETSEC_XDP_REDIR;
+		} else {
+			ret = NETSEC_XDP_CONSUMED;
+			xdp_return_buff(xdp);
+		}
+		break;
+	default:
+		bpf_warn_invalid_xdp_action(act);
+		/* fall through */
+	case XDP_ABORTED:
+		trace_xdp_exception(priv->ndev, prog, act);
+		/* fall through -- handle aborts by dropping packet */
+	case XDP_DROP:
+		ret = NETSEC_XDP_CONSUMED;
+		break;
+	}
+
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static int netsec_xdp_setup(struct netsec_priv *priv, struct bpf_prog *prog,
+			    struct netlink_ext_ack *extack)
+{
+	struct net_device *dev = priv->ndev;
+	struct bpf_prog *old_prog;
+
+	/* For now just support only the usual MTU sized frames */
+	if (prog && dev->mtu > 1500) {
+		NL_SET_ERR_MSG_MOD(extack, "Jumbo frames not supported on XDP");
+		return -EOPNOTSUPP;
+	}
+
+	if (netif_running(dev))
+		netsec_netdev_stop(dev);
+
+	/* Detach old prog, if any */
+	old_prog = xchg(&priv->xdp_prog, prog);
+	if (old_prog)
+		bpf_prog_put(old_prog);
+
+	if (netif_running(dev))
+		netsec_netdev_open(dev);
+
+	return 0;
+}
+
+static int netsec_xdp(struct net_device *ndev, struct netdev_bpf *xdp)
+{
+	struct netsec_priv *priv = netdev_priv(ndev);
+
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return netsec_xdp_setup(priv, xdp->prog, xdp->extack);
+	case XDP_QUERY_PROG:
+		xdp->prog_id = priv->xdp_prog ? priv->xdp_prog->aux->id : 0;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
 static const struct net_device_ops netsec_netdev_ops = {
 	.ndo_init		= netsec_netdev_init,
 	.ndo_uninit		= netsec_netdev_uninit,
@@ -1430,6 +1627,7 @@ static const struct net_device_ops netsec_netdev_ops = {
 	.ndo_set_mac_address    = eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_do_ioctl		= netsec_netdev_ioctl,
+	.ndo_bpf		= netsec_xdp,
 };
 
 static int netsec_of_probe(struct platform_device *pdev,
-- 
2.7.4

^ permalink raw reply related

* [net-next, PATCH 1/2, v2] net: socionext: different approach on DMA
From: Ilias Apalodimas @ 2018-09-12  9:02 UTC (permalink / raw)
  To: netdev, jaswinder.singh
  Cc: ard.biesheuvel, masami.hiramatsu, arnd, mykyta.iziumtsev,
	bjorn.topel, magnus.karlsson, brouer, daniel, ast,
	Ilias Apalodimas
In-Reply-To: <1536742958-29887-1-git-send-email-ilias.apalodimas@linaro.org>

The current driver dynamically allocates an skb and maps it as a DMA Rx
buffer. A following patch introduces AF_XDP functionality, so we need a
different allocation scheme. Buffers are allocated dynamically and
mapped into hardware. During the Rx operation the driver uses
build_skb() to produce the necessary buffers for the network stack.

Signed-off-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
---
 drivers/net/ethernet/socionext/netsec.c | 239 +++++++++++++++++---------------
 1 file changed, 130 insertions(+), 109 deletions(-)

diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c
index 7aa5ebb..666fee2 100644
--- a/drivers/net/ethernet/socionext/netsec.c
+++ b/drivers/net/ethernet/socionext/netsec.c
@@ -296,6 +296,11 @@ struct netsec_rx_pkt_info {
 	bool err_flag;
 };
 
+static void netsec_rx_fill(struct netsec_priv *priv, u16 from, u16 num);
+
+static void *netsec_alloc_rx_data(struct netsec_priv *priv,
+				  dma_addr_t *dma_addr, u16 *len);
+
 static void netsec_write(struct netsec_priv *priv, u32 reg_addr, u32 val)
 {
 	writel(val, priv->ioaddr + reg_addr);
@@ -556,34 +561,10 @@ static const struct ethtool_ops netsec_ethtool_ops = {
 
 /************* NETDEV_OPS FOLLOW *************/
 
-static struct sk_buff *netsec_alloc_skb(struct netsec_priv *priv,
-					struct netsec_desc *desc)
-{
-	struct sk_buff *skb;
-
-	if (device_get_dma_attr(priv->dev) == DEV_DMA_COHERENT) {
-		skb = netdev_alloc_skb_ip_align(priv->ndev, desc->len);
-	} else {
-		desc->len = L1_CACHE_ALIGN(desc->len);
-		skb = netdev_alloc_skb(priv->ndev, desc->len);
-	}
-	if (!skb)
-		return NULL;
-
-	desc->addr = skb->data;
-	desc->dma_addr = dma_map_single(priv->dev, desc->addr, desc->len,
-					DMA_FROM_DEVICE);
-	if (dma_mapping_error(priv->dev, desc->dma_addr)) {
-		dev_kfree_skb_any(skb);
-		return NULL;
-	}
-	return skb;
-}
 
 static void netsec_set_rx_de(struct netsec_priv *priv,
 			     struct netsec_desc_ring *dring, u16 idx,
-			     const struct netsec_desc *desc,
-			     struct sk_buff *skb)
+			     const struct netsec_desc *desc)
 {
 	struct netsec_de *de = dring->vaddr + DESC_SZ * idx;
 	u32 attr = (1 << NETSEC_RX_PKT_OWN_FIELD) |
@@ -602,59 +583,6 @@ static void netsec_set_rx_de(struct netsec_priv *priv,
 	dring->desc[idx].dma_addr = desc->dma_addr;
 	dring->desc[idx].addr = desc->addr;
 	dring->desc[idx].len = desc->len;
-	dring->desc[idx].skb = skb;
-}
-
-static struct sk_buff *netsec_get_rx_de(struct netsec_priv *priv,
-					struct netsec_desc_ring *dring,
-					u16 idx,
-					struct netsec_rx_pkt_info *rxpi,
-					struct netsec_desc *desc, u16 *len)
-{
-	struct netsec_de de = {};
-
-	memcpy(&de, dring->vaddr + DESC_SZ * idx, DESC_SZ);
-
-	*len = de.buf_len_info >> 16;
-
-	rxpi->err_flag = (de.attr >> NETSEC_RX_PKT_ER_FIELD) & 1;
-	rxpi->rx_cksum_result = (de.attr >> NETSEC_RX_PKT_CO_FIELD) & 3;
-	rxpi->err_code = (de.attr >> NETSEC_RX_PKT_ERR_FIELD) &
-							NETSEC_RX_PKT_ERR_MASK;
-	*desc = dring->desc[idx];
-	return desc->skb;
-}
-
-static struct sk_buff *netsec_get_rx_pkt_data(struct netsec_priv *priv,
-					      struct netsec_rx_pkt_info *rxpi,
-					      struct netsec_desc *desc,
-					      u16 *len)
-{
-	struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX];
-	struct sk_buff *tmp_skb, *skb = NULL;
-	struct netsec_desc td;
-	int tail;
-
-	*rxpi = (struct netsec_rx_pkt_info){};
-
-	td.len = priv->ndev->mtu + 22;
-
-	tmp_skb = netsec_alloc_skb(priv, &td);
-
-	tail = dring->tail;
-
-	if (!tmp_skb) {
-		netsec_set_rx_de(priv, dring, tail, &dring->desc[tail],
-				 dring->desc[tail].skb);
-	} else {
-		skb = netsec_get_rx_de(priv, dring, tail, rxpi, desc, len);
-		netsec_set_rx_de(priv, dring, tail, &td, tmp_skb);
-	}
-
-	/* move tail ahead */
-	dring->tail = (dring->tail + 1) % DESC_NUM;
-
-	return skb;
 }
 
 static int netsec_clean_tx_dring(struct netsec_priv *priv, int budget)
@@ -721,19 +649,29 @@ static int netsec_process_tx(struct netsec_priv *priv, int budget)
 	return done;
 }
 
+static void nsetsec_adv_desc(u16 *idx)
+{
+	*idx = *idx + 1;
+	if (unlikely(*idx >= DESC_NUM))
+		*idx = 0;
+}
+
 static int netsec_process_rx(struct netsec_priv *priv, int budget)
 {
 	struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX];
 	struct net_device *ndev = priv->ndev;
-	struct netsec_rx_pkt_info rx_info;
-	int done = 0;
-	struct netsec_desc desc;
 	struct sk_buff *skb;
-	u16 len;
+	int done = 0;
 
 	while (done < budget) {
 		u16 idx = dring->tail;
 		struct netsec_de *de = dring->vaddr + (DESC_SZ * idx);
+		struct netsec_desc *desc = &dring->desc[idx];
+		struct netsec_rx_pkt_info rpi;
+		dma_addr_t dma_handle;
+		void *buf_addr;
+		u16 pkt_len;
+		u16 desc_len;
 
 		if (de->attr & (1U << NETSEC_RX_PKT_OWN_FIELD))
 			break;
@@ -744,28 +682,62 @@ static int netsec_process_rx(struct netsec_priv *priv, int budget)
 		 */
 		dma_rmb();
 		done++;
-		skb = netsec_get_rx_pkt_data(priv, &rx_info, &desc, &len);
-		if (unlikely(!skb) || rx_info.err_flag) {
+
+		pkt_len = de->buf_len_info >> 16;
+		rpi.err_code = (de->attr >> NETSEC_RX_PKT_ERR_FIELD) &
+			NETSEC_RX_PKT_ERR_MASK;
+		rpi.err_flag = (de->attr >> NETSEC_RX_PKT_ER_FIELD) & 1;
+		if (rpi.err_flag) {
 			netif_err(priv, drv, priv->ndev,
-				  "%s: rx fail err(%d)\n",
-				  __func__, rx_info.err_code);
+				  "%s: rx fail err(%d)\n", __func__,
+				  rpi.err_code);
 			ndev->stats.rx_dropped++;
+			nsetsec_adv_desc(&dring->tail);
+			/* reuse buffer page frag */
+			netsec_rx_fill(priv, idx, 1);
 			continue;
 		}
+		rpi.rx_cksum_result = (de->attr >> NETSEC_RX_PKT_CO_FIELD) & 3;
 
-		dma_unmap_single(priv->dev, desc.dma_addr, desc.len,
-				 DMA_FROM_DEVICE);
-		skb_put(skb, len);
+		dma_sync_single_for_cpu(priv->dev, desc->dma_addr, pkt_len,
+					DMA_FROM_DEVICE);
+
+		prefetch(desc->addr);
+		buf_addr = netsec_alloc_rx_data(priv, &dma_handle, &desc_len);
+		if (unlikely(!buf_addr))
+			break;
+
+		skb = build_skb(desc->addr, desc->len);
+		if (unlikely(!skb)) {
+			dma_unmap_single(priv->dev, dma_handle, desc_len,
+					 DMA_TO_DEVICE);
+			skb_free_frag(buf_addr);
+			netif_err(priv, drv, priv->ndev,
+				  "rx failed to alloc skb\n");
+			break;
+		}
+		dma_unmap_single_attrs(priv->dev, desc->dma_addr, desc->len,
+				       DMA_TO_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
+
+		/* Update the descriptor with fresh buffers */
+		desc->len = desc_len;
+		desc->dma_addr = dma_handle;
+		desc->addr = buf_addr;
+
+		skb_put(skb, pkt_len);
 		skb->protocol = eth_type_trans(skb, priv->ndev);
 
 		if (priv->rx_cksum_offload_flag &&
-		    rx_info.rx_cksum_result == NETSEC_RX_CKSUM_OK)
+		    rpi.rx_cksum_result == NETSEC_RX_CKSUM_OK)
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 
 		if (napi_gro_receive(&priv->napi, skb) != GRO_DROP) {
 			ndev->stats.rx_packets++;
-			ndev->stats.rx_bytes += len;
+			ndev->stats.rx_bytes += pkt_len;
 		}
+
+		netsec_rx_fill(priv, idx, 1);
+		nsetsec_adv_desc(&dring->tail);
 	}
 
 	return done;
@@ -928,7 +900,10 @@ static void netsec_uninit_pkt_dring(struct netsec_priv *priv, int id)
 		dma_unmap_single(priv->dev, desc->dma_addr, desc->len,
 				 id == NETSEC_RING_RX ? DMA_FROM_DEVICE :
 							      DMA_TO_DEVICE);
-		dev_kfree_skb(desc->skb);
+		if (id == NETSEC_RING_RX)
+			skb_free_frag(desc->addr);
+		else if (id == NETSEC_RING_TX)
+			dev_kfree_skb(desc->skb);
 	}
 
 	memset(dring->desc, 0, sizeof(struct netsec_desc) * DESC_NUM);
@@ -953,50 +928,96 @@ static void netsec_free_dring(struct netsec_priv *priv, int id)
 	dring->desc = NULL;
 }
 
+static void *netsec_alloc_rx_data(struct netsec_priv *priv,
+				  dma_addr_t *dma_handle, u16 *desc_len)
+{
+	size_t len = priv->ndev->mtu + ETH_HLEN + VLAN_HLEN * 2 + NET_SKB_PAD +
+		NET_IP_ALIGN;
+	dma_addr_t mapping;
+	void *buf;
+
+	len = SKB_DATA_ALIGN(len);
+	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+	buf = napi_alloc_frag(len);
+	if (!buf)
+		return NULL;
+
+	mapping = dma_map_single(priv->dev, buf, len, DMA_FROM_DEVICE);
+	if (unlikely(dma_mapping_error(priv->dev, mapping)))
+		goto err_out;
+
+	*dma_handle = mapping;
+	*desc_len = len;
+
+	return buf;
+
+err_out:
+	skb_free_frag(buf);
+	return NULL;
+}
+
+static void netsec_rx_fill(struct netsec_priv *priv, u16 from, u16 num)
+{
+	struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX];
+	u16 idx = from;
+
+	while (num) {
+		netsec_set_rx_de(priv, dring, idx, &dring->desc[idx]);
+		idx++;
+		if (idx >= DESC_NUM)
+			idx = 0;
+		num--;
+	}
+}
+
 static int netsec_alloc_dring(struct netsec_priv *priv, enum ring_id id)
 {
 	struct netsec_desc_ring *dring = &priv->desc_ring[id];
-	int ret = 0;
 
 	dring->vaddr = dma_zalloc_coherent(priv->dev, DESC_SZ * DESC_NUM,
 					   &dring->desc_dma, GFP_KERNEL);
-	if (!dring->vaddr) {
-		ret = -ENOMEM;
+	if (!dring->vaddr)
 		goto err;
-	}
 
 	dring->desc = kcalloc(DESC_NUM, sizeof(*dring->desc), GFP_KERNEL);
-	if (!dring->desc) {
-		ret = -ENOMEM;
+	if (!dring->desc)
 		goto err;
-	}
 
 	return 0;
 err:
 	netsec_free_dring(priv, id);
 
-	return ret;
+	return -ENOMEM;
 }
 
 static int netsec_setup_rx_dring(struct netsec_priv *priv)
 {
 	struct netsec_desc_ring *dring = &priv->desc_ring[NETSEC_RING_RX];
-	struct netsec_desc desc;
-	struct sk_buff *skb;
-	int n;
+	int i;
 
-	desc.len = priv->ndev->mtu + 22;
+	for (i = 0; i < DESC_NUM; i++) {
+		struct netsec_desc *desc = &dring->desc[i];
+		dma_addr_t dma_handle;
+		void *buf;
+		u16 len;
 
-	for (n = 0; n < DESC_NUM; n++) {
-		skb = netsec_alloc_skb(priv, &desc);
-		if (!skb) {
+		buf = netsec_alloc_rx_data(priv, &dma_handle, &len);
+		if (!buf) {
 			netsec_uninit_pkt_dring(priv, NETSEC_RING_RX);
-			return -ENOMEM;
+			goto err_out;
 		}
-		netsec_set_rx_de(priv, dring, n, &desc, skb);
+		desc->dma_addr = dma_handle;
+		desc->addr = buf;
+		desc->len = len;
 	}
 
+	netsec_rx_fill(priv, 0, DESC_NUM);
+
 	return 0;
+
+err_out:
+	return -ENOMEM;
 }
 
 static int netsec_netdev_load_ucode_region(struct netsec_priv *priv, u32 reg,
-- 
2.7.4

^ permalink raw reply related

* [net-next, PATCH 0/2, v2]  net: socionext: add XDP support
From: Ilias Apalodimas @ 2018-09-12  9:02 UTC (permalink / raw)
  To: netdev, jaswinder.singh
  Cc: ard.biesheuvel, masami.hiramatsu, arnd, mykyta.iziumtsev,
	bjorn.topel, magnus.karlsson, brouer, daniel, ast,
	Ilias Apalodimas

This patch series adds AF_XDP support to the socionext netsec driver.
In addition, the new DMA allocation scheme offers a 10% boost in Rx
pps rate using 64b packets.

- patch [1/2]: Use a different allocation scheme for Rx DMA buffers to prepare
the driver for AF_XDP support
- patch [2/2]: Add XDP support without zero-copy

Changes since v1:
- patch [2/2]: 
	- Rephrased commit message
	- As pointed out by Toshiaki Makita: 
	 - Added XDP_PACKET_HEADROOM
	 - Fixed a bug on XDP_PASS case as pointed out by Toshiaki Makita
	 - Using extack for error messaging instead of netdev_warn when
	trying to set up XDP, as suggested by Toshiaki Makita

Ilias Apalodimas (2):
  net: socionext: different approach on DMA
  net: socionext: add XDP support

 drivers/net/ethernet/socionext/netsec.c | 449 ++++++++++++++++++++++++--------
 1 file changed, 334 insertions(+), 115 deletions(-)

-- 
2.7.4

^ permalink raw reply

* Re: [PATCH net-next v2] net: sched: change tcf_del_walker() to take idrinfo->lock
From: Vlad Buslov @ 2018-09-12  8:50 UTC (permalink / raw)
  To: Cong Wang
  Cc: Linux Kernel Network Developers, Jamal Hadi Salim, Jiri Pirko,
	David Miller
In-Reply-To: <CAM_iQpW1z+dNLiazr87HupqzYPeBnFR5fyVfsXkhwZcRW0LrSA@mail.gmail.com>


On Fri 07 Sep 2018 at 19:12, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> On Fri, Sep 7, 2018 at 6:52 AM Vlad Buslov <vladbu@mellanox.com> wrote:
>>
>> Action API was changed to work with actions and action_idr in concurrency
>> safe manner, however tcf_del_walker() still uses actions without taking a
>> reference or idrinfo->lock first, and deletes them directly, disregarding
>> possible concurrent delete.
>>
>> Add tc_action_wq workqueue to action API. Implement
>> tcf_idr_release_unsafe() that assumes external synchronization by caller
>> and delays blocking action cleanup part to tc_action_wq workqueue. Extend
>> tcf_action_cleanup() with 'async' argument to indicate that function should
>> free action asynchronously.
>
> Where exactly is blocking in tcf_action_cleanup()?
>
> From your code, it looks like free_tcf(), but from my observation,
> the only blocking function inside is tcf_action_goto_chain_fini()
> which calls __tcf_chain_put(). But, __tcf_chain_put() is blocking
> _ONLY_ when tc_chain_notify() is called, for tc action it is never
> called.
>
> So, what else is blocking?

__tcf_chain_put() calls tc_chain_tmplt_del(), which calls
ops->tmplt_destroy(). This last function uses hw offload API, which is
blocking.

^ permalink raw reply

* Re: kernels > v4.12 oops/crash with ipsec-traffic: bisected to b838d5e1c5b6e57b10ec8af2268824041e3ea911: ipv4: mark DST_NOGC and remove the operation of dst_free()
From: Steffen Klassert @ 2018-09-12  8:50 UTC (permalink / raw)
  To: Tobias Hommel
  Cc: Wolfgang Walter, Kristian Evensen, Network Development, weiwan,
	edumazet
In-Reply-To: <20180911190248.hj55ultypwnnkcnx@delI>

On Tue, Sep 11, 2018 at 09:02:48PM +0200, Tobias Hommel wrote:
> > > Subject: [PATCH RFC] xfrm: Fix NULL pointer dereference when skb_dst_force
> > > clears the dst_entry.
> > > 
> > > Since commit 222d7dbd258d ("net: prevent dst uses after free")
> > > skb_dst_force() might clear the dst_entry attached to the skb.
> > > The xfrm code doesn't expect this to happen, so we crash with
> > > a NULL pointer dereference in this case. Fix it by checking
> > > skb_dst(skb) for NULL after skb_dst_force() and drop the packet
> > > in case the dst_entry was cleared.
> > > 
> > > Fixes: 222d7dbd258d ("net: prevent dst uses after free")
> > > Reported-by: Tobias Hommel <netdev-list@genoetigt.de>
> > > Reported-by: Kristian Evensen <kristian.evensen@gmail.com>
> > > Reported-by: Wolfgang Walter <linux@stwm.de>
> > > Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
> > > ---
> > 
> > This patch fixes the problem here.
> > 
> > XfrmFwdHdrError gets around 80 at the very beginning and remains so. Probably 
> > this happens when some routes are changed/set then.
> > 
> > Regards and thanks,
> 
> Same here, we're now running stable for ~6 hours, XfrmFwdHdrError is at 220.
> This is less than 1 lost packet per minute, which seems to be okay for now.

Thanks a lot for testing! This is now applied to the ipsec tree.

^ permalink raw reply

* Re: [RFC v2 2/2] netlink: add ethernet address policy types
From: Johannes Berg @ 2018-09-12  8:50 UTC (permalink / raw)
  To: Arend van Spriel, linux-wireless, netdev; +Cc: Michal Kubecek
In-Reply-To: <5B98D336.4090107@broadcom.com>

On Wed, 2018-09-12 at 10:49 +0200, Arend van Spriel wrote:
> On 9/12/2018 10:36 AM, Johannes Berg wrote:
> > From: Johannes Berg <johannes.berg@intel.com>
> > 
> > Commonly, ethernet addresses are just using a policy of
> > 	{ .len = ETH_ALEN }
> > which leaves userspace free to send more data than it should,
> > which may hide bugs.
> > 
> > Introduce NLA_ETH_ADDR which checks for exact size, and rejects
> > the attribute if the length isn't ETH_ALEN.
> > 
> > Also add NLA_ETH_ADDR_COMPAT which can be used in place of the
> > policy above, but will, in addition, warn on an address that's
> > too long.
> 
> Not sure if this is correctly described here. It seems longer addresses 
> are not rejected, but only result in a warning message. I guess the 
> problem is in the reference to the "policy above" ;-)

Yeah, good point. I meant ".len = ETH_ALEN" but should clarify that.

johannes

^ permalink raw reply

* Re: [RFC v2 2/2] netlink: add ethernet address policy types
From: Arend van Spriel @ 2018-09-12  8:49 UTC (permalink / raw)
  To: Johannes Berg, linux-wireless, netdev; +Cc: Michal Kubecek, Johannes Berg
In-Reply-To: <20180912083610.20857-2-johannes@sipsolutions.net>

On 9/12/2018 10:36 AM, Johannes Berg wrote:
> From: Johannes Berg <johannes.berg@intel.com>
>
> Commonly, ethernet addresses are just using a policy of
> 	{ .len = ETH_ALEN }
> which leaves userspace free to send more data than it should,
> which may hide bugs.
>
> Introduce NLA_ETH_ADDR which checks for exact size, and rejects
> the attribute if the length isn't ETH_ALEN.
>
> Also add NLA_ETH_ADDR_COMPAT which can be used in place of the
> policy above, but will, in addition, warn on an address that's
> too long.

Not sure if this is correctly described here. It seems longer addresses 
are not rejected, but only result in a warning message. I guess the 
problem is in the reference to the "policy above" ;-)

Regards,
Arend

^ permalink raw reply

* Re: [PATCH] net: dsa: mv88e6xxx: Add MV88E6352 DT compatible
From: Marek Vasut @ 2018-09-12  8:38 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: netdev
In-Reply-To: <20180912004602.GA14588@lunn.ch>

On 09/12/2018 02:46 AM, Andrew Lunn wrote:
> On Wed, Sep 12, 2018 at 02:04:54AM +0200, Marek Vasut wrote:
>> On 09/12/2018 01:32 AM, Andrew Lunn wrote:
>>>> compatible = "marvell,mv88e6352", "marvell,mv88e6085";
>>>
>>> Just "marvell,mv88e6085";
>>>
>>> Please take a look at all the other DT files using the Marvell
>>> chips. You will only ever find "marvell,mv88e6085" or
>>> "marvell,mv88e6190", because everything is compatible to one of these
>>> two.
>>
>> Well, until you find a difference between those chips which you cannot
>> discern based solely on the ID register content. Then it's better to
>> have a compatible to match on which matches the actual chip.
> 
> Hi Marek
> 
> We have been around this loop before. The problem with putting in a
> more specific compatible is that nothing is validating it. So it is
> going to be wrong, simply because people cut/paste, and don't
> necessarily change it. So when we do need to look at it, we cannot look
> at it, because it is wrong.
> 
> I would only add a more specific compatible if and when we need it, it
> is actually used, and we can verify it is correct.

You may need it in the future and it may not be possible to adjust the
DT then. So having a specific compatible and fallback compatible is I
think the way to go. You can always match on the fallback and if the
time comes, you can match on the specific one because it's there. In
addition to that, such a DT would fully correctly describe the hardware,
unlike one with only a fallback compatible.

Regarding validation, I don't see other systems using this approach
(specific + fallback compatible) having a problem with validation.
What is the trick there ?

-- 
Best regards,
Marek Vasut

^ permalink raw reply

* Re: [RFC v2 1/2] netlink: add NLA_REJECT policy type
From: Johannes Berg @ 2018-09-12  8:38 UTC (permalink / raw)
  To: linux-wireless, netdev; +Cc: Michal Kubecek
In-Reply-To: <20180912083610.20857-1-johannes@sipsolutions.net>

On Wed, 2018-09-12 at 10:36 +0200, Johannes Berg wrote:
> @@ -251,7 +257,7 @@ int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
>  
>  		if (type > 0 && type <= maxtype) {
>  			if (policy) {
> -				err = validate_nla(nla, maxtype, policy);
> +				err = validate_nla(nla, maxtype, policy, extack);
>  				if (err < 0) {
>  					NL_SET_ERR_MSG_ATTR(extack, nla,
>  							    "Attribute failed policy validation");

Err... I should read my patches before sending them :-)

Clearly, this NL_SET_ERR_MSG_ATTR() overwrites the error message, so
would have to be made conditional.

I can fix that (and I should use/test it) if we decide it's worthwhile.

johannes

^ permalink raw reply

* Re: [PATCH net-next 13/13] net: sched: add flags to Qdisc class ops struct
From: Vlad Buslov @ 2018-09-12  8:25 UTC (permalink / raw)
  To: Cong Wang
  Cc: Linux Kernel Network Developers, Jamal Hadi Salim, Jiri Pirko,
	David Miller, Stephen Hemminger, Kirill Tkhai, Paul E. McKenney,
	Nicolas Dichtel, Leon Romanovsky, Greg KH, mark.rutland,
	Florian Westphal, David Ahern, lucien xin, Jakub Kicinski,
	Christian Brauner, Jiri Benc
In-Reply-To: <CAM_iQpU2FxihLKBpQkUP2RPfXe=eygd8QmLkE+VYhFEnuyrf3g@mail.gmail.com>


On Fri 07 Sep 2018 at 19:50, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> On Thu, Sep 6, 2018 at 12:59 AM Vlad Buslov <vladbu@mellanox.com> wrote:
>>
>> Extend Qdisc_class_ops with flags. Create enum to hold possible class ops
>> flag values. Add first class ops flags value QDISC_CLASS_OPS_DOIT_UNLOCKED
>> to indicate that class ops functions can be called without taking rtnl
>> lock.
>
> We don't add anything that is not used.
>
> This is the last patch in this series, so I am pretty sure you split
> it in a wrong way, it certainly belongs to next series, not this series.

Will do.

Thank you for reviewing my code!

^ permalink raw reply

* Re: [PATCH net-next 09/13] net: sched: extend tcf_block with rcu
From: Vlad Buslov @ 2018-09-12  8:25 UTC (permalink / raw)
  To: Cong Wang
  Cc: Linux Kernel Network Developers, Jamal Hadi Salim, Jiri Pirko,
	David Miller, Stephen Hemminger, Kirill Tkhai, Paul E. McKenney,
	Nicolas Dichtel, Leon Romanovsky, Greg KH, mark.rutland,
	Florian Westphal, David Ahern, lucien xin, Jakub Kicinski,
	Christian Brauner, Jiri Benc
In-Reply-To: <CAM_iQpXf=WgBGb2+2bNp-w=hssmbjPX6RMvjCKv2EW53yeBsVw@mail.gmail.com>


On Fri 07 Sep 2018 at 19:52, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> On Thu, Sep 6, 2018 at 12:59 AM Vlad Buslov <vladbu@mellanox.com> wrote:
>>
>> Extend tcf_block with rcu to allow safe deallocation when it is accessed
>> concurrently.
>
> This sucks, please fold this patch into where you call rcu_read_lock()
> on tcf block.
>
> This patch _alone_ is apparently not complete. This is not how
> we split patches.

Got it.

^ permalink raw reply

* Re: [PATCH net-next 08/13] net: sched: rename tcf_block_get{_ext}() and tcf_block_put{_ext}()
From: Vlad Buslov @ 2018-09-12  8:24 UTC (permalink / raw)
  To: Cong Wang
  Cc: Linux Kernel Network Developers, Jamal Hadi Salim, Jiri Pirko,
	David Miller, Stephen Hemminger, Kirill Tkhai, Paul E. McKenney,
	Nicolas Dichtel, Leon Romanovsky, Greg KH, mark.rutland,
	Florian Westphal, David Ahern, lucien xin, Jakub Kicinski,
	Christian Brauner, Jiri Benc
In-Reply-To: <CAM_iQpVMU7nPgkw+N_V6ukrH5rx-37+nd+++wj47JRA33RqMUQ@mail.gmail.com>


On Fri 07 Sep 2018 at 20:09, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> On Thu, Sep 6, 2018 at 12:59 AM Vlad Buslov <vladbu@mellanox.com> wrote:
>>
>> Functions tcf_block_get{_ext}() and tcf_block_put{_ext}() actually
>> attach/detach block to specific Qdisc besides just taking/putting
>> reference. Rename them according to their purpose.
>
> Where exactly does it attach to?
>
> Each qdisc provides a pointer to a pointer of a block, like
> &cl->block. It is where the result is saved to. It takes a parameter
> of Qdisc* merely for read-only purpose.

tcf_block_attach_ext() passes qdisc parameter to tcf_block_owner_add()
which saves qdisc to new tcf_block_owner_item and adds the item to
block's owner list. I proposed several naming options for these
functions to Jiri on internal review and he suggested "attach" as better
option.

>
> So, renaming it to *attach() is even confusing, at least not
> any better. Please find other names or leave them as they are.

What would you recommend?

^ permalink raw reply

* Re: [PATCH] net: ipv4: Use BUG_ON directly instead of a if condition followed by BUG
From: zhong jiang @ 2018-09-12 13:25 UTC (permalink / raw)
  To: David Miller; +Cc: edumazet, kuznet, yoshfuji, netdev, linux-kernel
In-Reply-To: <20180911.235417.1481257495387934935.davem@davemloft.net>

On 2018/9/12 14:54, David Miller wrote:
> From: zhong jiang <zhongjiang@huawei.com>
> Date: Mon, 10 Sep 2018 22:38:02 +0800
>
>> The if condition can be removed if we use BUG_ON directly.
>> The issue is detected with the help of Coccinelle.
>>
>> Signed-off-by: zhong jiang <zhongjiang@huawei.com>
> You keep asking if this is worthy to do.
>
> I think the most worthy thing to do is actually build test your
> patches.
 I am sorry for that.  :-[

Thanks,
zhong jiang

^ permalink raw reply

* Re: [RFC] netlink: add NLA_REJECT policy type
From: Johannes Berg @ 2018-09-12  8:21 UTC (permalink / raw)
  To: Michal Kubecek; +Cc: linux-wireless, netdev
In-Reply-To: <20180912081621.GC29691@unicorn.suse.cz>

On Wed, 2018-09-12 at 10:16 +0200, Michal Kubecek wrote:
> On Wed, Sep 12, 2018 at 09:32:45AM +0200, Johannes Berg wrote:
> > From: Johannes Berg <johannes.berg@intel.com>
> > 
> > In some situations some netlink attributes may be used for output
> > only (kernel->userspace) or may be reserved for future use. It's
> > then helpful to be able to prevent userspace from using them in
> > messages sent to the kernel, since they'd otherwise be ignored and
> > any future use will become impossible if this happens.
> > 
> > Add NLA_REJECT to the policy which does nothing but reject (with
> > EINVAL) validation of any messages containing this attribute.
> > 
> > The specific case I have in mind now is a shared nested attribute
> > containing request/response data, and it would be pointless and
> > potentially confusing to have userspace include response data in
> > the messages that actually contain a request.
> 
> I find this feature very useful. Actually, I was a bit surprised when
> I found I can't mark an attribute "forbidden" using policy.

:-)

> IMHO it would be even nicer if one could also specify an error message
> to use in extack if NLA_REJECT is applied; the easiest way would be
> using .validation_data and passing extack to validate_nla() but I'm not
> sure if it wouldn't qualify as an abuse.

I think it's fine - validation data is by nature validation type
dependent, so we can document it here. I'll send a v2.

johannes

^ permalink raw reply

* Re: [RFC] netlink: add NLA_REJECT policy type
From: Michal Kubecek @ 2018-09-12  8:16 UTC (permalink / raw)
  To: Johannes Berg; +Cc: linux-wireless, netdev, Johannes Berg
In-Reply-To: <20180912073245.8047-1-johannes@sipsolutions.net>

On Wed, Sep 12, 2018 at 09:32:45AM +0200, Johannes Berg wrote:
> From: Johannes Berg <johannes.berg@intel.com>
> 
> In some situations some netlink attributes may be used for output
> only (kernel->userspace) or may be reserved for future use. It's
> then helpful to be able to prevent userspace from using them in
> messages sent to the kernel, since they'd otherwise be ignored and
> any future use will become impossible if this happens.
> 
> Add NLA_REJECT to the policy which does nothing but reject (with
> EINVAL) validation of any messages containing this attribute.
> 
> The specific case I have in mind now is a shared nested attribute
> containing request/response data, and it would be pointless and
> potentially confusing to have userspace include response data in
> the messages that actually contain a request.

I find this feature very useful. Actually, I was a bit surprised when
I found I can't mark an attribute "forbidden" using policy.

IMHO it would be even nicer if one could also specify an error message
to use in extack if NLA_REJECT is applied; the easiest way would be
using .validation_data and passing extack to validate_nla() but I'm not
sure if it wouldn't qualify as an abuse.

Michal Kubecek

^ permalink raw reply

* Re: bpf: btf: Change how section is supported in btf_header
From: Dmitry Vyukov @ 2018-09-12  8:10 UTC (permalink / raw)
  To: Martin KaFai Lau; +Cc: Daniel Borkmann, Alexei Starovoitov, netdev
In-Reply-To: <20180911181010.vqbp2pxba2k5mshc@kafai-mbp.dhcp.thefacebook.com>

On Tue, Sep 11, 2018 at 8:10 PM, Martin KaFai Lau <kafai@fb.com> wrote:
> On Tue, Sep 11, 2018 at 06:40:05PM +0200, Dmitry Vyukov wrote:
>> Hi Martin,
>>
>> I am looking at the subj commit:
>>
>>  static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
>> @@ -1754,9 +1756,9 @@ static int btf_check_all_metas(struct
>> btf_verifier_env *env)
>>         struct btf_header *hdr;
>>         void *cur, *end;
>>
>> -       hdr = btf->hdr;
>> +       hdr = &btf->hdr;
>>         cur = btf->nohdr_data + hdr->type_off;
>> -       end = btf->nohdr_data + hdr->str_off;
>> +       end = btf->nohdr_data + hdr->type_len;
>>
>> Shouldn't this be:
>>
>> +       end = cur + hdr->type_len;
>>
>> ? Or otherwise I am having trouble understanding meaning of fields.
> You are correct.  Thanks for pointing this out.
> Do you want to post an official patch for the bpf branch?

no :)

Only if you want to send a syzkaller patch with btf descriptions ;)

>> On a related note, what's between header and type_off? Is type_off
>> supposed to be 0 always?
> type section is always the first section for now (i.e. immediately after
> the header).  Some other sections could be introduced later and it could
> be located before the type section such that the type_off will not be 0.

I see. Thanks.

^ permalink raw reply

* Re: Re: [PATCH net-next v2 0/5] virtio: support packed ring
From: Michael S. Tsirkin @ 2018-09-12 13:06 UTC (permalink / raw)
  To: Tiwei Bie
  Cc: jasowang, virtualization, linux-kernel, netdev, virtio-dev, wexu,
	jfreimann
In-Reply-To: <20180910030053.GA15645@debian>

On Mon, Sep 10, 2018 at 11:00:53AM +0800, Tiwei Bie wrote:
> On Fri, Sep 07, 2018 at 09:00:49AM -0400, Michael S. Tsirkin wrote:
> > On Fri, Sep 07, 2018 at 09:22:25AM +0800, Tiwei Bie wrote:
> > > On Mon, Aug 27, 2018 at 05:00:40PM +0300, Michael S. Tsirkin wrote:
> > > > Are there still plans to test the performance with vhost pmd?
> > > > vhost doesn't seem to show a performance gain ...
> > > > 
> > > 
> > > I tried some performance tests with vhost PMD. In guest, the
> > > XDP program will return XDP_DROP directly. And in host, testpmd
> > > will do txonly fwd.
> > > 
> > > When burst size is 1 and packet size is 64 in testpmd and
> > > testpmd needs to iterate 5 Tx queues (but only the first two
> > > queues are enabled) to prepare and inject packets, I got ~12%
> > > performance boost (5.7Mpps -> 6.4Mpps). And if the vhost PMD
> > > is faster (e.g. just need to iterate the first two queues to
> > > prepare and inject packets), then I got similar performance
> > > for both rings (~9.9Mpps) (packed ring's performance can be
> > > lower, because it's more complicated in driver.)
> > > 
> > > I think packed ring makes vhost PMD faster, but it doesn't make
> > > the driver faster. In packed ring, the ring is simplified, and
> > > the handling of the ring in vhost (device) is also simplified,
> > > but things are not simplified in driver, e.g. although there is
> > > no desc table in the virtqueue anymore, driver still needs to
> > > maintain a private desc state table (which is still managed as
> > > a list in this patch set) to support the out-of-order desc
> > > processing in vhost (device).
> > > 
> > > I think this patch set is mainly to make the driver have a full
> > > functional support for the packed ring, which makes it possible
> > > to leverage the packed ring feature in vhost (device). But I'm
> > > not sure whether there is any other better idea, I'd like to
> > > hear your thoughts. Thanks!
> > 
> > Just this: Jens seems to report a nice gain with virtio and
> > vhost pmd across the board. Try to compare virtio and
virtio pmd to see what pmd does better?
> 
> The virtio PMD (drivers/net/virtio) in DPDK doesn't need to share
> the virtio ring operation code with other drivers and is highly
> optimized for network. E.g. in Rx, the Rx burst function won't
> chain descs.
> So the ID management for the Rx ring can be quite
> simple and straightforward, we just need to initialize these IDs
> when initializing the ring and don't need to change these IDs
> in data path anymore (the mergable Rx code in that patch set
> assumes the descs will be written back in order, which should be
> fixed. I.e., the ID in the desc should be used to index vq->descx[]).
> The Tx code in that patch set also assumes the descs will be
> written back by device in order, which should be fixed.
> 
> But in kernel virtio driver, the virtio_ring.c is very generic.
> The enqueue (virtqueue_add()) and dequeue (virtqueue_get_buf_ctx())
> functions need to support all the virtio devices and should be
> able to handle all the possible cases that may happen. So although
> the packed ring can be very efficient in some cases, currently
> the room to optimize the performance in kernel's virtio_ring.c
> isn't that much. If we want to take full advantage of the
> packed ring's efficiency, we need some further e.g. API changes
> in virtio_ring.c, which shouldn't be part of this patch set. So
> I still think this patch set is mainly to make the kernel virtio
> driver to have a full functional support of the packed ring, and
> we can't expect impressive performance boost with it.

So what are the cases that make things complex?
Maybe we should drop support for them completely.


> > 
> > 
> > > 
> > > > 
> > > > On Wed, Jul 11, 2018 at 10:27:06AM +0800, Tiwei Bie wrote:
> > > > > Hello everyone,
> > > > > 
> > > > > This patch set implements packed ring support in virtio driver.
> > > > > 
> > > > > Some functional tests have been done with Jason's
> > > > > packed ring implementation in vhost:
> > > > > 
> > > > > https://lkml.org/lkml/2018/7/3/33
> > > > > 
> > > > > Both of ping and netperf worked as expected.
> > > > > 
> > > > > v1 -> v2:
> > > > > - Use READ_ONCE() to read event off_wrap and flags together (Jason);
> > > > > - Add comments related to ccw (Jason);
> > > > > 
> > > > > RFC (v6) -> v1:
> > > > > - Avoid extra virtio_wmb() in virtqueue_enable_cb_delayed_packed()
> > > > >   when event idx is off (Jason);
> > > > > - Fix bufs calculation in virtqueue_enable_cb_delayed_packed() (Jason);
> > > > > - Test the state of the desc at used_idx instead of last_used_idx
> > > > >   in virtqueue_enable_cb_delayed_packed() (Jason);
> > > > > - Save wrap counter (as part of queue state) in the return value
> > > > >   of virtqueue_enable_cb_prepare_packed();
> > > > > - Refine the packed ring definitions in uapi;
> > > > > - Rebase on the net-next tree;
> > > > > 
> > > > > RFC v5 -> RFC v6:
> > > > > - Avoid tracking addr/len/flags when DMA API isn't used (MST/Jason);
> > > > > - Define wrap counter as bool (Jason);
> > > > > - Use ALIGN() in vring_init_packed() (Jason);
> > > > > - Avoid using pointer to track `next` in detach_buf_packed() (Jason);
> > > > > - Add comments for barriers (Jason);
> > > > > - Don't enable RING_PACKED on ccw for now (noticed by Jason);
> > > > > - Refine the memory barrier in virtqueue_poll();
> > > > > - Add a missing memory barrier in virtqueue_enable_cb_delayed_packed();
> > > > > - Remove the hacks in virtqueue_enable_cb_prepare_packed();
> > > > > 
> > > > > RFC v4 -> RFC v5:
> > > > > - Save DMA addr, etc in desc state (Jason);
> > > > > - Track used wrap counter;
> > > > > 
> > > > > RFC v3 -> RFC v4:
> > > > > - Make ID allocation support out-of-order (Jason);
> > > > > - Various fixes for EVENT_IDX support;
> > > > > 
> > > > > RFC v2 -> RFC v3:
> > > > > - Split into small patches (Jason);
> > > > > - Add helper virtqueue_use_indirect() (Jason);
> > > > > - Just set id for the last descriptor of a list (Jason);
> > > > > - Calculate the prev in virtqueue_add_packed() (Jason);
> > > > > - Fix/improve desc suppression code (Jason/MST);
> > > > > - Refine the code layout for XXX_split/packed and wrappers (MST);
> > > > > - Fix the comments and API in uapi (MST);
> > > > > - Remove the BUG_ON() for indirect (Jason);
> > > > > - Some other refinements and bug fixes;
> > > > > 
> > > > > RFC v1 -> RFC v2:
> > > > > - Add indirect descriptor support - compile test only;
> > > > > - Add event suppression support - compile test only;
> > > > > - Move vring_packed_init() out of uapi (Jason, MST);
> > > > > - Merge two loops into one in virtqueue_add_packed() (Jason);
> > > > > - Split vring_unmap_one() for packed ring and split ring (Jason);
> > > > > - Avoid using '%' operator (Jason);
> > > > > - Rename free_head -> next_avail_idx (Jason);
> > > > > - Add comments for virtio_wmb() in virtqueue_add_packed() (Jason);
> > > > > - Some other refinements and bug fixes;
> > > > > 
> > > > > Thanks!
> > > > > 
> > > > > Tiwei Bie (5):
> > > > >   virtio: add packed ring definitions
> > > > >   virtio_ring: support creating packed ring
> > > > >   virtio_ring: add packed ring support
> > > > >   virtio_ring: add event idx support in packed ring
> > > > >   virtio_ring: enable packed ring
> > > > > 
> > > > >  drivers/s390/virtio/virtio_ccw.c   |   14 +
> > > > >  drivers/virtio/virtio_ring.c       | 1365 ++++++++++++++++++++++------
> > > > >  include/linux/virtio_ring.h        |    8 +-
> > > > >  include/uapi/linux/virtio_config.h |    3 +
> > > > >  include/uapi/linux/virtio_ring.h   |   43 +
> > > > >  5 files changed, 1157 insertions(+), 276 deletions(-)
> > > > > 
> > > > > -- 
> > > > > 2.18.0
> > > > 
> > > > ---------------------------------------------------------------------
> > > > To unsubscribe, e-mail: virtio-dev-unsubscribe@lists.oasis-open.org
> > > > For additional commands, e-mail: virtio-dev-help@lists.oasis-open.org
> > > > 

^ permalink raw reply

* Re: [PATCH net] Revert "rtnetlink: add rtnl_link_state check in rtnl_configure_link"
From: mcbirnie.l @ 2018-09-12  7:56 UTC (permalink / raw)
  To: netdev; +Cc: Liam McBirnie
In-Reply-To: <20180911231110.3506-1-mcbirnie.l@gmail.com>

From: Liam McBirnie <liam.mcbirnie@boeing.com>

Please reject this patch.
The author of the original commit is working on a better patch.
https://bugzilla.kernel.org/show_bug.cgi?id=201071

^ permalink raw reply

* Re: [PATCH net-next v2 1/5] virtio: add packed ring definitions
From: Michael S. Tsirkin @ 2018-09-12 12:53 UTC (permalink / raw)
  To: Tiwei Bie
  Cc: jasowang, virtualization, linux-kernel, netdev, virtio-dev, wexu,
	jfreimann
In-Reply-To: <20180910021322.GA10912@debian>

On Mon, Sep 10, 2018 at 10:13:22AM +0800, Tiwei Bie wrote:
> On Fri, Sep 07, 2018 at 09:51:23AM -0400, Michael S. Tsirkin wrote:
> > On Wed, Jul 11, 2018 at 10:27:07AM +0800, Tiwei Bie wrote:
> > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > > ---
> > >  include/uapi/linux/virtio_config.h |  3 +++
> > >  include/uapi/linux/virtio_ring.h   | 43 ++++++++++++++++++++++++++++++
> > >  2 files changed, 46 insertions(+)
> > > 
> > > diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
> > > index 449132c76b1c..1196e1c1d4f6 100644
> > > --- a/include/uapi/linux/virtio_config.h
> > > +++ b/include/uapi/linux/virtio_config.h
> > > @@ -75,6 +75,9 @@
> > >   */
> > >  #define VIRTIO_F_IOMMU_PLATFORM		33
> > >  
> > > +/* This feature indicates support for the packed virtqueue layout. */
> > > +#define VIRTIO_F_RING_PACKED		34
> > > +
> > >  /*
> > >   * Does the device support Single Root I/O Virtualization?
> > >   */
> > > diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
> > > index 6d5d5faa989b..0254a2ba29cf 100644
> > > --- a/include/uapi/linux/virtio_ring.h
> > > +++ b/include/uapi/linux/virtio_ring.h
> > > @@ -44,6 +44,10 @@
> > >  /* This means the buffer contains a list of buffer descriptors. */
> > >  #define VRING_DESC_F_INDIRECT	4
> > >  
> > > +/* Mark a descriptor as available or used. */
> > > +#define VRING_DESC_F_AVAIL	(1ul << 7)
> > > +#define VRING_DESC_F_USED	(1ul << 15)
> > > +
> > >  /* The Host uses this in used->flags to advise the Guest: don't kick me when
> > >   * you add a buffer.  It's unreliable, so it's simply an optimization.  Guest
> > >   * will still kick if it's out of buffers. */
> > > @@ -53,6 +57,17 @@
> > >   * optimization.  */
> > >  #define VRING_AVAIL_F_NO_INTERRUPT	1
> > >  
> > > +/* Enable events. */
> > > +#define VRING_EVENT_F_ENABLE	0x0
> > > +/* Disable events. */
> > > +#define VRING_EVENT_F_DISABLE	0x1
> > > +/*
> > > + * Enable events for a specific descriptor
> > > + * (as specified by Descriptor Ring Change Event Offset/Wrap Counter).
> > > + * Only valid if VIRTIO_RING_F_EVENT_IDX has been negotiated.
> > > + */
> > > +#define VRING_EVENT_F_DESC	0x2
> > > +
> > >  /* We support indirect buffer descriptors */
> > >  #define VIRTIO_RING_F_INDIRECT_DESC	28
> > >  
> > 
> > These are for the packed ring, right? Pls prefix accordingly.
> 
> How about something like this:
> 
> #define VRING_PACKED_DESC_F_AVAIL	(1u << 7)
> #define VRING_PACKED_DESC_F_USED	(1u << 15)

_F_ should be a bit number, not a shifted value.
Yes I know current virtio_ring.h is inconsistent in
that respect. changes welcome though we need to
maintain the old names for the sake of old apps.

> 
> #define VRING_PACKED_EVENT_F_ENABLE	0x0
> #define VRING_PACKED_EVENT_F_DISABLE	0x1
> #define VRING_PACKED_EVENT_F_DESC	0x2

These are values, I think they should not have _F_.
Yes I know current virtio_ring.h is inconsistent in
that respect.


> 
> > Also, you likely need macros for the wrap counters.
> 
> How about something like this:
> 
> #define VRING_PACKED_EVENT_WRAP_COUNTER_SHIFT	15

Given it's a single bit, maybe even
VRING_PACKED_EVENT_F_WRAP_CTR

> #define VRING_PACKED_EVENT_WRAP_COUNTER_MASK	\
> 			(1u << VRING_PACKED_EVENT_WRAP_COUNTER_SHIFT)
> #define VRING_PACKED_EVENT_OFFSET_MASK	\
> 			((1u << VRING_PACKED_EVENT_WRAP_COUNTER_SHIFT) - 1)

I'm not sure the mask is justified.

> > 
> > > @@ -171,4 +186,32 @@ static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
> > >  	return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
> > >  }
> > >  
> > > +struct vring_packed_desc_event {
> > > +	/* Descriptor Ring Change Event Offset/Wrap Counter. */
> > > +	__virtio16 off_wrap;
> > > +	/* Descriptor Ring Change Event Flags. */
> > > +	__virtio16 flags;
> > > +};
> > > +
> > > +struct vring_packed_desc {
> > > +	/* Buffer Address. */
> > > +	__virtio64 addr;
> > > +	/* Buffer Length. */
> > > +	__virtio32 len;
> > > +	/* Buffer ID. */
> > > +	__virtio16 id;
> > > +	/* The flags depending on descriptor type. */
> > > +	__virtio16 flags;
> > > +};
> > 
> > Don't use __virtioXX types, just __leXX ones.
> 
> Got it, will do that.
> 
> > 
> > > +
> > > +struct vring_packed {
> > > +	unsigned int num;
> > > +
> > > +	struct vring_packed_desc *desc;
> > > +
> > > +	struct vring_packed_desc_event *driver;
> > > +
> > > +	struct vring_packed_desc_event *device;
> > > +};
> > > +
> > >  #endif /* _UAPI_LINUX_VIRTIO_RING_H */
> > > -- 
> > > 2.18.0

^ permalink raw reply

* Re: [PATCH net-next v2 2/5] virtio_ring: support creating packed ring
From: Michael S. Tsirkin @ 2018-09-12 12:45 UTC (permalink / raw)
  To: Tiwei Bie
  Cc: jasowang, virtualization, linux-kernel, netdev, virtio-dev, wexu,
	jfreimann
In-Reply-To: <20180910022837.GA11822@debian>

On Mon, Sep 10, 2018 at 10:28:37AM +0800, Tiwei Bie wrote:
> On Fri, Sep 07, 2018 at 10:03:24AM -0400, Michael S. Tsirkin wrote:
> > On Wed, Jul 11, 2018 at 10:27:08AM +0800, Tiwei Bie wrote:
> > > This commit introduces the support for creating packed ring.
> > > All split ring specific functions are added _split suffix.
> > > Some necessary stubs for packed ring are also added.
> > > 
> > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > 
> > I'd rather have a patch just renaming split functions, then
> > add all packed stuff in as a separate patch on top.
> 
> Sure, I will do that.
> 
> > 
> > 
> > > ---
> > >  drivers/virtio/virtio_ring.c | 801 +++++++++++++++++++++++------------
> > >  include/linux/virtio_ring.h  |   8 +-
> > >  2 files changed, 546 insertions(+), 263 deletions(-)
> > > 
> > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > > index 814b395007b2..c4f8abc7445a 100644
> > > --- a/drivers/virtio/virtio_ring.c
> > > +++ b/drivers/virtio/virtio_ring.c
> > > @@ -60,11 +60,15 @@ struct vring_desc_state {
> > >  	struct vring_desc *indir_desc;	/* Indirect descriptor, if any. */
> > >  };
> > >  
> > > +struct vring_desc_state_packed {
> > > +	int next;			/* The next desc state. */
> > 
> > So this can go away with IN_ORDER?
> 
> Yes. If IN_ORDER is negotiated, next won't be needed anymore.
> Currently, IN_ORDER isn't included in this patch set, because
> some changes for split ring are needed to make sure that it
> will use the descs in the expected order. After that,
> optimizations can be done for both of split ring and packed
> ring respectively.
> 
> > 
> > > +};
> > > +
> > >  struct vring_virtqueue {
> > >  	struct virtqueue vq;
> > >  
> > > -	/* Actual memory layout for this queue */
> > > -	struct vring vring;
> > > +	/* Is this a packed ring? */
> > > +	bool packed;
> > >  
> > >  	/* Can we use weak barriers? */
> > >  	bool weak_barriers;
> > > @@ -86,11 +90,39 @@ struct vring_virtqueue {
> > >  	/* Last used index we've seen. */
> > >  	u16 last_used_idx;
> > >  
> > > -	/* Last written value to avail->flags */
> > > -	u16 avail_flags_shadow;
> > > +	union {
> > > +		/* Available for split ring */
> > > +		struct {
> > > +			/* Actual memory layout for this queue. */
> > > +			struct vring vring;
> > >  
> > > -	/* Last written value to avail->idx in guest byte order */
> > > -	u16 avail_idx_shadow;
> > > +			/* Last written value to avail->flags */
> > > +			u16 avail_flags_shadow;
> > > +
> > > +			/* Last written value to avail->idx in
> > > +			 * guest byte order. */
> > > +			u16 avail_idx_shadow;
> > > +		};
> > 
> > Name this field split so it's easier to detect misuse of e.g.
> > packed fields in split code?
> 
> Good point, I'll do that.
> 
> > 
> > > +
> > > +		/* Available for packed ring */
> > > +		struct {
> > > +			/* Actual memory layout for this queue. */
> > > +			struct vring_packed vring_packed;
> > > +
> > > +			/* Driver ring wrap counter. */
> > > +			bool avail_wrap_counter;
> > > +
> > > +			/* Device ring wrap counter. */
> > > +			bool used_wrap_counter;
> > > +
> > > +			/* Index of the next avail descriptor. */
> > > +			u16 next_avail_idx;
> > > +
> > > +			/* Last written value to driver->flags in
> > > +			 * guest byte order. */
> > > +			u16 event_flags_shadow;
> > > +		};
> > > +	};
> [...]
> > > +
> > > +/*
> > > + * The layout for the packed ring is a continuous chunk of memory
> > > + * which looks like this.
> > > + *
> > > + * struct vring_packed {
> > > + *	// The actual descriptors (16 bytes each)
> > > + *	struct vring_packed_desc desc[num];
> > > + *
> > > + *	// Padding to the next align boundary.
> > > + *	char pad[];
> > > + *
> > > + *	// Driver Event Suppression
> > > + *	struct vring_packed_desc_event driver;
> > > + *
> > > + *	// Device Event Suppression
> > > + *	struct vring_packed_desc_event device;
> > > + * };
> > > + */
> > 
> > Why not just allocate event structures separately?
> > Is it a win to have them share a cache line for some reason?
> 
> Will do that.
> 
> > 
> > > +static inline void vring_init_packed(struct vring_packed *vr, unsigned int num,
> > > +				     void *p, unsigned long align)
> > > +{
> > > +	vr->num = num;
> > > +	vr->desc = p;
> > > +	vr->driver = (void *)ALIGN(((uintptr_t)p +
> > > +		sizeof(struct vring_packed_desc) * num), align);
> > > +	vr->device = vr->driver + 1;
> > > +}
> > 
> > What's all this about alignment? Where does it come from?
> 
> It comes from the `vring_align` parameter of vring_create_virtqueue()
> and vring_new_virtqueue():
> 
> https://github.com/torvalds/linux/blob/a49a9dcce802/drivers/virtio/virtio_ring.c#L1061
> https://github.com/torvalds/linux/blob/a49a9dcce802/drivers/virtio/virtio_ring.c#L1123

Note the TODO - we just never got to fixing it. It would be great to fix
this for virtio 1 rings, but if not - at least let's not add this
stuff for packed rings.

> Should I just ignore it in packed ring?
> 
> CCW defined this:
> 
> #define KVM_VIRTIO_CCW_RING_ALIGN 4096
> 
> I'm not familiar with CCW. Currently, in this patch set, the packed ring
> isn't enabled on CCW because some legacy accessors are not yet implemented
> for the packed ring.


Then you need to take steps not to negotiate this feature bit for
ccw drivers.

> > 
> > > +
> > > +static inline unsigned vring_size_packed(unsigned int num, unsigned long align)
> > > +{
> > > +	return ((sizeof(struct vring_packed_desc) * num + align - 1)
> > > +		& ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
> > > +}
> [...]
> > > @@ -1129,10 +1388,17 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
> > >  				      void (*callback)(struct virtqueue *vq),
> > >  				      const char *name)
> > >  {
> > > -	struct vring vring;
> > > -	vring_init(&vring, num, pages, vring_align);
> > > -	return __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
> > > -				     notify, callback, name);
> > > +	union vring_union vring;
> > > +	bool packed;
> > > +
> > > +	packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
> > > +	if (packed)
> > > +		vring_init_packed(&vring.vring_packed, num, pages, vring_align);
> > > +	else
> > > +		vring_init(&vring.vring_split, num, pages, vring_align);
> > 
> > 
> > vring_init in the UAPI header is more or less a bug.
> > I'd just stop using it, keep it around for legacy userspace.
> 
> Got it. I'd like to do that. Thanks.
> 
> > 
> > > +
> > > +	return __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
> > > +				     context, notify, callback, name);
> > >  }
> > >  EXPORT_SYMBOL_GPL(vring_new_virtqueue);
> > >  
> > > @@ -1142,7 +1408,9 @@ void vring_del_virtqueue(struct virtqueue *_vq)
> > >  
> > >  	if (vq->we_own_ring) {
> > >  		vring_free_queue(vq->vq.vdev, vq->queue_size_in_bytes,
> > > -				 vq->vring.desc, vq->queue_dma_addr);
> > > +				 vq->packed ? (void *)vq->vring_packed.desc :
> > > +					      (void *)vq->vring.desc,
> > > +				 vq->queue_dma_addr);
> > >  	}
> > >  	list_del(&_vq->list);
> > >  	kfree(vq);
> > > @@ -1184,7 +1452,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
> > >  
> > >  	struct vring_virtqueue *vq = to_vvq(_vq);
> > >  
> > > -	return vq->vring.num;
> > > +	return vq->packed ? vq->vring_packed.num : vq->vring.num;
> > >  }
> > >  EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
> > >  
> > > @@ -1227,6 +1495,10 @@ dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
> > >  
> > >  	BUG_ON(!vq->we_own_ring);
> > >  
> > > +	if (vq->packed)
> > > +		return vq->queue_dma_addr + ((char *)vq->vring_packed.driver -
> > > +				(char *)vq->vring_packed.desc);
> > > +
> > >  	return vq->queue_dma_addr +
> > >  		((char *)vq->vring.avail - (char *)vq->vring.desc);
> > >  }
> > > @@ -1238,11 +1510,16 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
> > >  
> > >  	BUG_ON(!vq->we_own_ring);
> > >  
> > > +	if (vq->packed)
> > > +		return vq->queue_dma_addr + ((char *)vq->vring_packed.device -
> > > +				(char *)vq->vring_packed.desc);
> > > +
> > >  	return vq->queue_dma_addr +
> > >  		((char *)vq->vring.used - (char *)vq->vring.desc);
> > >  }
> > >  EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);
> > >  
> > > +/* Only available for split ring */
> > >  const struct vring *virtqueue_get_vring(struct virtqueue *vq)
> > >  {
> > >  	return &to_vvq(vq)->vring;
> > > diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
> > > index fab02133a919..992142b35f55 100644
> > > --- a/include/linux/virtio_ring.h
> > > +++ b/include/linux/virtio_ring.h
> > > @@ -60,6 +60,11 @@ static inline void virtio_store_mb(bool weak_barriers,
> > >  struct virtio_device;
> > >  struct virtqueue;
> > >  
> > > +union vring_union {
> > > +	struct vring vring_split;
> > > +	struct vring_packed vring_packed;
> > > +};
> > > +
> > >  /*
> > >   * Creates a virtqueue and allocates the descriptor ring.  If
> > >   * may_reduce_num is set, then this may allocate a smaller ring than
> > > @@ -79,7 +84,8 @@ struct virtqueue *vring_create_virtqueue(unsigned int index,
> > >  
> > >  /* Creates a virtqueue with a custom layout. */
> > >  struct virtqueue *__vring_new_virtqueue(unsigned int index,
> > > -					struct vring vring,
> > > +					union vring_union vring,
> > > +					bool packed,
> > >  					struct virtio_device *vdev,
> > >  					bool weak_barriers,
> > >  					bool ctx,
> > > -- 
> > > 2.18.0

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox