Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH,v2,net-next 2/2] tun: enable napi_gro_frags() for TUN/TAP driver
From: Petar Penkov @ 2017-09-22  2:17 UTC (permalink / raw)
  To: netdev
  Cc: Petar Penkov, Eric Dumazet, Mahesh Bandewar, Willem de Bruijn,
	davem, ppenkov
In-Reply-To: <20170922021715.2618-1-peterpenkov96@gmail.com>

Add a TUN/TAP receive mode that exercises the napi_gro_frags()
interface. This mode is available only in TAP mode, as the interface
expects packets with Ethernet headers.

Furthermore, packets follow the layout of the iov_iter that was
received. The first iovec is the linear data, and every one after the
first is a fragment. If there are more fragments than the max number,
drop the packet. Additionally, invoke eth_get_headlen() to exercise flow
dissector code and to verify that the header resides in the linear data.

The napi_gro_frags() mode requires setting the IFF_NAPI_FRAGS option.
This is imposed because this mode is intended for testing via tools like
syzkaller and packetdrill, and the increased flexibility it provides can
introduce security vulnerabilities. This flag is accepted only if the
device is in TAP mode and has the IFF_NAPI flag set as well. This is
done because both of these are explicit requirements for correct
operation in this mode.

Signed-off-by: Petar Penkov <peterpenkov96@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Mahesh Bandewar <maheshb@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: davem@davemloft.net
Cc: ppenkov@stanford.edu
---
 drivers/net/tun.c           | 131 ++++++++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/if_tun.h |   1 +
 2 files changed, 126 insertions(+), 6 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index f16407242b18..e398f019c590 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -75,6 +75,7 @@
 #include <linux/skb_array.h>
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
+#include <linux/mutex.h>
 
 #include <linux/uaccess.h>
 
@@ -121,7 +122,8 @@ do {								\
 #define TUN_VNET_BE     0x40000000
 
 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
-		      IFF_MULTI_QUEUE | IFF_NAPI)
+		      IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
+
 #define GOODCOPY_LEN 128
 
 #define FLT_EXACT_COUNT 8
@@ -173,6 +175,7 @@ struct tun_file {
 		unsigned int ifindex;
 	};
 	struct napi_struct napi;
+	struct mutex napi_mutex;	/* Protects access to the above napi */
 	struct list_head next;
 	struct tun_struct *detached;
 	struct skb_array tx_array;
@@ -277,6 +280,7 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
 		netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
 			       NAPI_POLL_WEIGHT);
 		napi_enable(&tfile->napi);
+		mutex_init(&tfile->napi_mutex);
 	}
 }
 
@@ -292,6 +296,11 @@ static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile)
 		netif_napi_del(&tfile->napi);
 }
 
+static bool tun_napi_frags_enabled(const struct tun_struct *tun)
+{
+	return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS;
+}
+
 #ifdef CONFIG_TUN_VNET_CROSS_LE
 static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
 {
@@ -1036,7 +1045,8 @@ static void tun_poll_controller(struct net_device *dev)
 	 * supports polling, which enables bridge devices in virt setups to
 	 * still use netconsole
 	 * If NAPI is enabled, however, we need to schedule polling for all
-	 * queues.
+	 * queues unless we are using napi_gro_frags(), which we call in
+	 * process context and not in NAPI context.
 	 */
 	struct tun_struct *tun = netdev_priv(dev);
 
@@ -1044,6 +1054,9 @@ static void tun_poll_controller(struct net_device *dev)
 		struct tun_file *tfile;
 		int i;
 
+		if (tun_napi_frags_enabled(tun))
+			return;
+
 		rcu_read_lock();
 		for (i = 0; i < tun->numqueues; i++) {
 			tfile = rcu_dereference(tun->tfiles[i]);
@@ -1266,6 +1279,64 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
 	return mask;
 }
 
+static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
+					    size_t len,
+					    const struct iov_iter *it)
+{
+	struct sk_buff *skb;
+	size_t linear;
+	int err;
+	int i;
+
+	if (it->nr_segs > MAX_SKB_FRAGS + 1)
+		return ERR_PTR(-ENOMEM);
+
+	local_bh_disable();
+	skb = napi_get_frags(&tfile->napi);
+	local_bh_enable();
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	linear = iov_iter_single_seg_count(it);
+	err = __skb_grow(skb, linear);
+	if (err)
+		goto free;
+
+	skb->len = len;
+	skb->data_len = len - linear;
+	skb->truesize += skb->data_len;
+
+	for (i = 1; i < it->nr_segs; i++) {
+		size_t fragsz = it->iov[i].iov_len;
+		unsigned long offset;
+		struct page *page;
+		void *data;
+
+		if (fragsz == 0 || fragsz > PAGE_SIZE) {
+			err = -EINVAL;
+			goto free;
+		}
+
+		local_bh_disable();
+		data = napi_alloc_frag(fragsz);
+		local_bh_enable();
+		if (!data) {
+			err = -ENOMEM;
+			goto free;
+		}
+
+		page = virt_to_head_page(data);
+		offset = data - page_address(page);
+		skb_fill_page_desc(skb, i - 1, page, offset, fragsz);
+	}
+
+	return skb;
+free:
+	/* frees skb and all frags allocated with napi_alloc_frag() */
+	napi_free_frags(&tfile->napi);
+	return ERR_PTR(err);
+}
+
 /* prepad is the amount to reserve at front.  len is length after that.
  * linear is a hint as to how much to copy (usually headers). */
 static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
@@ -1478,6 +1549,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	int err;
 	u32 rxhash;
 	int skb_xdp = 1;
+	bool frags = tun_napi_frags_enabled(tun);
 
 	if (!(tun->dev->flags & IFF_UP))
 		return -EIO;
@@ -1535,7 +1607,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 			zerocopy = true;
 	}
 
-	if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
+	if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
 		/* For the packet that is not easy to be processed
 		 * (e.g gso or jumbo packet), we will do it at after
 		 * skb was created with generic XDP routine.
@@ -1556,10 +1628,24 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 				linear = tun16_to_cpu(tun, gso.hdr_len);
 		}
 
-		skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
+		if (frags) {
+			mutex_lock(&tfile->napi_mutex);
+			skb = tun_napi_alloc_frags(tfile, copylen, from);
+			/* tun_napi_alloc_frags() enforces a layout for the skb.
+			 * If zerocopy is enabled, then this layout will be
+			 * overwritten by zerocopy_sg_from_iter().
+			 */
+			zerocopy = false;
+		} else {
+			skb = tun_alloc_skb(tfile, align, copylen, linear,
+					    noblock);
+		}
+
 		if (IS_ERR(skb)) {
 			if (PTR_ERR(skb) != -EAGAIN)
 				this_cpu_inc(tun->pcpu_stats->rx_dropped);
+			if (frags)
+				mutex_unlock(&tfile->napi_mutex);
 			return PTR_ERR(skb);
 		}
 
@@ -1571,6 +1657,11 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		if (err) {
 			this_cpu_inc(tun->pcpu_stats->rx_dropped);
 			kfree_skb(skb);
+			if (frags) {
+				tfile->napi.skb = NULL;
+				mutex_unlock(&tfile->napi_mutex);
+			}
+
 			return -EFAULT;
 		}
 	}
@@ -1578,6 +1669,11 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
 		this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
 		kfree_skb(skb);
+		if (frags) {
+			tfile->napi.skb = NULL;
+			mutex_unlock(&tfile->napi_mutex);
+		}
+
 		return -EINVAL;
 	}
 
@@ -1603,7 +1699,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		skb->dev = tun->dev;
 		break;
 	case IFF_TAP:
-		skb->protocol = eth_type_trans(skb, tun->dev);
+		if (!frags)
+			skb->protocol = eth_type_trans(skb, tun->dev);
 		break;
 	}
 
@@ -1638,7 +1735,23 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 
 	rxhash = __skb_get_hash_symmetric(skb);
 
-	if (tun->flags & IFF_NAPI) {
+	if (frags) {
+		/* Exercise flow dissector code path. */
+		u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb));
+
+		if (headlen > skb_headlen(skb) || headlen < ETH_HLEN) {
+			this_cpu_inc(tun->pcpu_stats->rx_dropped);
+			napi_free_frags(&tfile->napi);
+			mutex_unlock(&tfile->napi_mutex);
+			WARN_ON(1);
+			return -ENOMEM;
+		}
+
+		local_bh_disable();
+		napi_gro_frags(&tfile->napi);
+		local_bh_enable();
+		mutex_unlock(&tfile->napi_mutex);
+	} else if (tun->flags & IFF_NAPI) {
 		struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
 		int queue_len;
 
@@ -2061,6 +2174,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 	if (tfile->detached)
 		return -EINVAL;
 
+	if ((ifr->ifr_flags & IFF_NAPI_FRAGS) && !capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	dev = __dev_get_by_name(net, ifr->ifr_name);
 	if (dev) {
 		if (ifr->ifr_flags & IFF_TUN_EXCL)
@@ -2185,6 +2301,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 	tun->flags = (tun->flags & ~TUN_FEATURES) |
 		(ifr->ifr_flags & TUN_FEATURES);
 
+	if (!(tun->flags & IFF_NAPI) || (tun->flags & TUN_TYPE_MASK) != IFF_TAP)
+		tun->flags = tun->flags & ~IFF_NAPI_FRAGS;
+
 	/* Make sure persistent devices do not get stuck in
 	 * xoff state.
 	 */
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 30b6184884eb..365ade5685c9 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -61,6 +61,7 @@
 #define IFF_TUN		0x0001
 #define IFF_TAP		0x0002
 #define IFF_NAPI	0x0010
+#define IFF_NAPI_FRAGS	0x0020
 #define IFF_NO_PI	0x1000
 /* This flag has no real effect */
 #define IFF_ONE_QUEUE	0x2000
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next v2] net: Remove useless function skb_header_release
From: gfree.wind @ 2017-09-22  2:25 UTC (permalink / raw)
  To: davem, netdev; +Cc: Gao Feng

From: Gao Feng <gfree.wind@vip.163.com>

Nothing invokes the function skb_header_release any more,
so just remove it.

Signed-off-by: Gao Feng <gfree.wind@vip.163.com>
---
 v2: Correct some comments, per Joe
 v1: initial version

 drivers/net/usb/asix_common.c   |  2 +-
 include/linux/skbuff.h          | 19 -------------------
 net/batman-adv/soft-interface.c |  2 +-
 3 files changed, 2 insertions(+), 21 deletions(-)

diff --git a/drivers/net/usb/asix_common.c b/drivers/net/usb/asix_common.c
index 522d290..f4d7362 100644
--- a/drivers/net/usb/asix_common.c
+++ b/drivers/net/usb/asix_common.c
@@ -245,7 +245,7 @@ struct sk_buff *asix_tx_fixup(struct usbnet *dev, struct sk_buff *skb,
 	 * - We are allowed to put 4 bytes at tail if skb_cloned()
 	 *   is false (and if we have 4 bytes of tailroom)
 	 *
-	 * TCP packets for example are cloned, but skb_header_release()
+	 * TCP packets for example are cloned, but __skb_header_release()
 	 * was called in tcp stack, allowing us to use headroom for our needs.
 	 */
 	if (!skb_header_cloned(skb) &&
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4928288..f9db553 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1457,27 +1457,8 @@ static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri)
 }
 
 /**
- *	skb_header_release - release reference to header
- *	@skb: buffer to operate on
- *
- *	Drop a reference to the header part of the buffer.  This is done
- *	by acquiring a payload reference.  You must not read from the header
- *	part of skb->data after this.
- *	Note : Check if you can use __skb_header_release() instead.
- */
-static inline void skb_header_release(struct sk_buff *skb)
-{
-	BUG_ON(skb->nohdr);
-	skb->nohdr = 1;
-	atomic_add(1 << SKB_DATAREF_SHIFT, &skb_shinfo(skb)->dataref);
-}
-
-/**
  *	__skb_header_release - release reference to header
  *	@skb: buffer to operate on
- *
- *	Variant of skb_header_release() assuming skb is private to caller.
- *	We can avoid one atomic operation.
  */
 static inline void __skb_header_release(struct sk_buff *skb)
 {
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 10f7edf..c2c9867 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -69,7 +69,7 @@ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len)
 	int result;
 
 	/* TODO: We must check if we can release all references to non-payload
-	 * data using skb_header_release in our skbs to allow skb_cow_header to
+	 * data using __skb_header_release in our skbs to allow skb_cow_header to
 	 * work optimally. This means that those skbs are not allowed to read
 	 * or write any data which is before the current position of skb->data
 	 * after that call and thus allow other skbs with the same data buffer
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH v3 09/31] jfs: Define usercopy region in jfs_ip slab cache
From: Dave Kleikamp @ 2017-09-22  2:54 UTC (permalink / raw)
  To: Kees Cook, linux-kernel
  Cc: David Windsor, jfs-discussion, linux-fsdevel, netdev, linux-mm,
	kernel-hardening
In-Reply-To: <1505940337-79069-10-git-send-email-keescook@chromium.org>

Acked-by: Dave Kleikamp <dave.kleikamp@oracle.com>

On 09/20/2017 03:45 PM, Kees Cook wrote:
> From: David Windsor <dave@nullcore.net>
> 
> The jfs symlink pathnames, stored in struct jfs_inode_info.i_inline and
> therefore contained in the jfs_ip slab cache, need to be copied to/from
> userspace.
> 
> cache object allocation:
>     fs/jfs/super.c:
>         jfs_alloc_inode(...):
>             ...
>             jfs_inode = kmem_cache_alloc(jfs_inode_cachep, GFP_NOFS);
>             ...
>             return &jfs_inode->vfs_inode;
> 
>     fs/jfs/jfs_incore.h:
>         JFS_IP(struct inode *inode):
>             return container_of(inode, struct jfs_inode_info, vfs_inode);
> 
>     fs/jfs/inode.c:
>         jfs_iget(...):
>             ...
>             inode->i_link = JFS_IP(inode)->i_inline;
> 
> example usage trace:
>     readlink_copy+0x43/0x70
>     vfs_readlink+0x62/0x110
>     SyS_readlinkat+0x100/0x130
> 
>     fs/namei.c:
>         readlink_copy(..., link):
>             ...
>             copy_to_user(..., link, len);
> 
>         (inlined in vfs_readlink)
>         generic_readlink(dentry, ...):
>             struct inode *inode = d_inode(dentry);
>             const char *link = inode->i_link;
>             ...
>             readlink_copy(..., link);
> 
> In support of usercopy hardening, this patch defines a region in the
> jfs_ip slab cache in which userspace copy operations are allowed.
> 
> This region is known as the slab cache's usercopy region. Slab caches can
> now check that each copy operation involving cache-managed memory falls
> entirely within the slab's usercopy region.
> 
> This patch is modified from Brad Spengler/PaX Team's PAX_USERCOPY
> whitelisting code in the last public patch of grsecurity/PaX based on my
> understanding of the code. Changes or omissions from the original code are
> mine and don't reflect the original grsecurity/PaX code.
> 
> Signed-off-by: David Windsor <dave@nullcore.net>
> [kees: adjust commit log, provide usage trace]
> Cc: Dave Kleikamp <shaggy@kernel.org>
> Cc: jfs-discussion@lists.sourceforge.net
> Signed-off-by: Kees Cook <keescook@chromium.org>
> ---
>  fs/jfs/super.c | 8 +++++---
>  1 file changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/jfs/super.c b/fs/jfs/super.c
> index 2f14677169c3..e018412608d4 100644
> --- a/fs/jfs/super.c
> +++ b/fs/jfs/super.c
> @@ -966,9 +966,11 @@ static int __init init_jfs_fs(void)
>  	int rc;
>  
>  	jfs_inode_cachep =
> -	    kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0,
> -			    SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
> -			    init_once);
> +	    kmem_cache_create_usercopy("jfs_ip", sizeof(struct jfs_inode_info),
> +			0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
> +			offsetof(struct jfs_inode_info, i_inline),
> +			sizeof_field(struct jfs_inode_info, i_inline),
> +			init_once);
>  	if (jfs_inode_cachep == NULL)
>  		return -ENOMEM;
>  
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* RE: [net-next 1/2] dummy: add device MTU validation check
From: 张胜举 @ 2017-09-22  3:28 UTC (permalink / raw)
  To: 'Eric Dumazet'; +Cc: davem, willemb, stephen, netdev
In-Reply-To: <1506006138.29839.132.camel@edumazet-glaptop3.roam.corp.google.com>

> -----Original Message-----
> From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
> Sent: 2017年9月21日 23:02
> To: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
> Cc: davem@davemloft.net; willemb@google.com;
> stephen@networkplumber.org; netdev@vger.kernel.org
> Subject: Re: [net-next 1/2] dummy: add device MTU validation check
> 
> On Thu, 2017-09-21 at 21:32 +0800, Zhang Shengju wrote:
> > Currently, any mtu value can be assigned when adding a new dummy device:
> > [~]# ip link add name dummy1 mtu 100000 type dummy [~]# ip link show
> > dummy1
> > 15: dummy1: <BROADCAST,NOARP> mtu 100000 qdisc noop state DOWN
> mode DEFAULT group default qlen 1000
> >     link/ether 0a:61:6b:16:14:ce brd ff:ff:ff:ff:ff:ff
> >
> > This patch adds device MTU validation check.
> 
> What is wrong with big MTU on dummy ?
> 
> If this is a generic rule, this check should belong in core network stack.
> 

The dummy_setup() function sets the valid MTU range to [0, ETH_MAX_MTU].
This range is checked by the dev_set_mtu() function in the core network stack.

So if you add a new dummy device without specifying an MTU value, you can't set a value outside the range [0, ETH_MAX_MTU] afterward.
BUT you can set any MTU when adding a new device. This causes an inconsistency.

> >
> > Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
> > ---
> >  drivers/net/dummy.c | 8 ++++++++
> >  1 file changed, 8 insertions(+)
> >
> > diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c index
> > e31ab3b..0276b2b 100644
> > --- a/drivers/net/dummy.c
> > +++ b/drivers/net/dummy.c
> > @@ -365,6 +365,14 @@ static int dummy_validate(struct nlattr *tb[], struct
> nlattr *data[],
> >  		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
> >  			return -EADDRNOTAVAIL;
> >  	}
> > +
> > +	if (tb[IFLA_MTU]) {
> > +		u32 mtu = nla_get_u32(tb[IFLA_MTU]);
> 
> You do not verify/validate nla_len(tb[IFLA_MTU]).
> 
> Do not ever trust user space.
MTU attribute is just u32, do you think it's necessary to check the length? 
Actually I don't see any place to check the length of mtu attribute in network stack code. 

> 
> > +
> > +		if (mtu > ETH_MAX_MTU)
> > +			return -EINVAL;
> > +	}
> > +
> >  	return 0;
> >  }
> >
> 

^ permalink raw reply

* RE: [net-next 2/2] ifb: add device MTU validation check
From: 张胜举 @ 2017-09-22  3:35 UTC (permalink / raw)
  To: 'Stephen Hemminger'; +Cc: davem, willemb, netdev
In-Reply-To: <20170921081010.4d6f5731@xeon-e3>

> -----Original Message-----
> From: Stephen Hemminger [mailto:stephen@networkplumber.org]
> Sent: 2017年9月21日 23:10
> To: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
> Cc: davem@davemloft.net; willemb@google.com; netdev@vger.kernel.org
> Subject: Re: [net-next 2/2] ifb: add device MTU validation check
> 
> On Thu, 21 Sep 2017 21:32:02 +0800
> Zhang Shengju <zhangshengju@cmss.chinamobile.com> wrote:
> 
> > Currently, any mtu value can be assigned when adding a new ifb device:
> > [~]# ip link add name ifb2 mtu 100000 type ifb [~]# ip link show ifb2
> > 18: ifb2: <BROADCAST,NOARP> mtu 100000 qdisc noop state DOWN mode
> DEFAULT group default qlen 32
> >     link/ether 7a:bf:f4:63:da:d1 brd ff:ff:ff:ff:ff:ff
> >
> > This patch adds device MTU validation check.
> >
> > Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
> > ---
> >  drivers/net/ifb.c | 8 ++++++++
> >  1 file changed, 8 insertions(+)
> >
> > diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index
> > 8870bd2..ce84ad2 100644
> > --- a/drivers/net/ifb.c
> > +++ b/drivers/net/ifb.c
> > @@ -282,6 +282,14 @@ static int ifb_validate(struct nlattr *tb[], struct
> nlattr *data[],
> >  		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
> >  			return -EADDRNOTAVAIL;
> >  	}
> > +
> > +	if (tb[IFLA_MTU]) {
> > +		u32 mtu = nla_get_u32(tb[IFLA_MTU]);
> > +
> > +		if (mtu < ETH_MIN_MTU || mtu > ETH_DATA_LEN)
> > +			return -EINVAL;
> > +	}
> > +
> >  	return 0;
> >  }
> >
> 
> What about running ifb with packets coming from devices with jumbo frames?
> Also since ifb is an input only device, MTU doesn't matter.

Actually the ifb_setup() function sets the valid ifb MTU range to [68, 1500], and
this is checked by the dev_set_mtu() function.
You can't set an MTU outside this range after creation, but you can set any
value when adding a new ifb device.
This is inconsistent, and I think we should add this check at creation time.

BRs,
ZSJ

^ permalink raw reply

* Re: [PATCH] net: phy: Fix truncation of large IRQ numbers in phy_attached_print()
From: David Miller @ 2017-09-22  3:36 UTC (permalink / raw)
  To: geert+renesas; +Cc: andrew, f.fainelli, romain.perier, netdev, linux-kernel
In-Reply-To: <1505993222-16721-1-git-send-email-geert+renesas@glider.be>

From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Thu, 21 Sep 2017 13:27:02 +0200

> Given NR_IRQS is 2048 on sparc64, and even 32784 on alpha, 3 digits is
> not enough to represent interrupt numbers on all architectures.  Hence
> PHY interrupt numbers may be truncated during printing.
> 
> Increase the buffer size from 4 to 8 bytes to fix this.
> 
> Fixes: 5e369aefdce4818c ("net: stmmac: Delete dead code for MDIO registration")
> Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>

Applied.

^ permalink raw reply

* Re: [PATCH net-next] net: vrf: remove skb_dst_force() after skb_dst_set()
From: David Miller @ 2017-09-22  3:36 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, dsa, shm
In-Reply-To: <1506005428.29839.129.camel@edumazet-glaptop3.roam.corp.google.com>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 21 Sep 2017 07:50:28 -0700

> From: Eric Dumazet <edumazet@google.com>
> 
> skb_dst_set(skb, dst) installs a normal (refcounted) dst, there is no
> point using skb_dst_force(skb)
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Applied, thanks Eric.

^ permalink raw reply

* Re: [PATCH net-next] test_rhashtable: remove initdata annotation
From: David Miller @ 2017-09-22  3:42 UTC (permalink / raw)
  To: fw; +Cc: netdev
In-Reply-To: <20170921153608.8789-1-fw@strlen.de>

From: Florian Westphal <fw@strlen.de>
Date: Thu, 21 Sep 2017 17:36:08 +0200

> kbuild test robot reported a section mismatch warning w. gcc 4.x:
> WARNING: lib/test_rhashtable.o(.text+0x139e):
> Section mismatch in reference from the function rhltable_insert.clone.3() to the variable .init.data:rhlt
> 
> so remove this annotation.
> 
> Fixes: cdd4de372ea06 ("test_rhashtable: add test case for rhl_table interface")
> Reported-by: kbuild test robot <lkp@intel.com>
> Signed-off-by: Florian Westphal <fw@strlen.de>

Applied.

^ permalink raw reply

* Re: [PATCH net] net: prevent dst uses after free
From: David Miller @ 2017-09-22  3:42 UTC (permalink / raw)
  To: eric.dumazet; +Cc: pstaszewski, weiwan, xiyou.wangcong, netdev, edumazet
In-Reply-To: <1506010546.29839.148.camel@edumazet-glaptop3.roam.corp.google.com>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 21 Sep 2017 09:15:46 -0700

> From: Eric Dumazet <edumazet@google.com>
> 
> In linux-4.13, Wei worked hard to convert dst to a traditional
> refcounted model, removing GC.
> 
> We now want to make sure a dst refcount can not transition from 0 back
> to 1.
> 
> The problem here is that input path attached a not refcounted dst to an
> skb. Then later, because packet is forwarded and hits skb_dst_force()
> before exiting RCU section, we might try to take a refcount on one dst
> that is about to be freed, if another cpu saw 1 -> 0 transition in
> dst_release() and queued the dst for freeing after one RCU grace period.
> 
> Lets unify skb_dst_force() and skb_dst_force_safe(), since we should
> always perform the complete check against dst refcount, and not assume
> it is not zero.
> 
> Bugzilla : https://bugzilla.kernel.org/show_bug.cgi?id=197005
 ...
> Similarly dst_clone() can use dst_hold() helper to have additional
> debugging, as a follow up to commit 44ebe79149ff ("net: add debug
> atomic_inc_not_zero() in dst_hold()")
> 
> In net-next we will convert dst atomic_t to refcount_t for peace of
> mind.
> 
> Fixes: a4c2fd7f7891 ("net: remove DST_NOCACHE flag")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Wei Wang <weiwan@google.com>
> Reported-by: Paweł Staszewski <pstaszewski@itcare.pl>
> Bisected-by: Paweł Staszewski <pstaszewski@itcare.pl>

Applied and queued up for -stable, thanks Eric.

^ permalink raw reply

* Re: [PATCH net-next 01/10] net: hns3: Support for dynamically assigning tx buffer to TC
From: David Miller @ 2017-09-22  3:43 UTC (permalink / raw)
  To: linyunsheng
  Cc: huangdaode, xuwei5, liguozhu, Yisen.Zhuang, gabriele.paoloni,
	john.garry, linuxarm, salil.mehta, lipeng321, netdev,
	linux-kernel
In-Reply-To: <8babe541-9409-7f66-e52b-922144c46fb0@huawei.com>

From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Fri, 22 Sep 2017 09:57:31 +0800

> Hi, David
> 
> On 2017/9/22 9:41, David Miller wrote:
>> From: Yunsheng Lin <linyunsheng@huawei.com>
>> Date: Thu, 21 Sep 2017 19:21:44 +0800
>> 
>>> @@ -1324,23 +1324,28 @@ static int hclge_alloc_vport(struct hclge_dev *hdev)
>>>  	return 0;
>>>  }
>>>  
>>> -static int  hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev, u16 buf_size)
>>> +static int  hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev)
>>>  {
>>>  /* TX buffer size is unit by 128 byte */
>>>  #define HCLGE_BUF_SIZE_UNIT_SHIFT	7
>>>  #define HCLGE_BUF_SIZE_UPDATE_EN_MSK	BIT(15)
>>>  	struct hclge_tx_buff_alloc *req;
>>> +	struct hclge_priv_buf *priv;
>>>  	struct hclge_desc desc;
>>> +	u32 buf_size;
>>>  	int ret;
>>>  	u8 i;
>>>  
>>>  	req = (struct hclge_tx_buff_alloc *)desc.data;
>>>  
>>>  	hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_TX_BUFF_ALLOC, 0);
>>> -	for (i = 0; i < HCLGE_TC_NUM; i++)
>>> +	for (i = 0; i < HCLGE_TC_NUM; i++) {
>>> +		priv = &hdev->priv_buf[i];
>>> +		buf_size = priv->tx_buf_size;
>>>  		req->tx_pkt_buff[i] =
>>>  			cpu_to_le16((buf_size >> HCLGE_BUF_SIZE_UNIT_SHIFT) |
> 
> buf_size is used here

My bad, I misread the code.

Thanks.

^ permalink raw reply

* Re: Regression in throughput between kvm guests over virtual bridge
From: Jason Wang @ 2017-09-22  4:03 UTC (permalink / raw)
  To: Matthew Rosato, netdev; +Cc: davem, mst
In-Reply-To: <129a01d9-de9b-f3f1-935c-128e73153df6@linux.vnet.ibm.com>



On 2017年09月21日 03:38, Matthew Rosato wrote:
>> Seems to make some progress on wakeup mitigation. Previous patch tries
>> to reduce the unnecessary traversal of waitqueue during rx. Attached
>> patch goes even further which disables rx polling during processing tx.
>> Please try it to see if it has any difference.
> Unfortunately, this patch doesn't seem to have made a difference.  I
> tried runs with both this patch and the previous patch applied, as well
> as only this patch applied for comparison (numbers from vhost thread of
> sending VM):
>
> 4.12    4.13     patch1   patch2   patch1+2
> 2.00%   +3.69%   +2.55%   +2.81%   +2.69%   [...] __wake_up_sync_key
>
> In each case, the regression in throughput was still present.

This probably means some other cases of the wakeups were missed. Could 
you please record the callers of __wake_up_sync_key()?

>
>> And two questions:
>> - Is the issue existed if you do uperf between 2VMs (instead of 4VMs)
> Verified that the second set of guests are not actually required, I can
> see the regression with only 2 VMs.
>
>> - Can enable batching in the tap of sending VM improve the performance
>> (ethtool -C $tap rx-frames 64)
> I tried this, but it did not help (actually seemed to make things a
> little worse)
>

I still can't see a reason that would lead to more wakeups. I will take more
time to look at this issue and keep you posted.

Thanks

^ permalink raw reply

* Re: [PATCH net-next 3/3] virtio-net: support XDP_REDIRECT
From: Jason Wang @ 2017-09-22  4:06 UTC (permalink / raw)
  To: John Fastabend, mst, virtualization, netdev, linux-kernel
In-Reply-To: <59C2E588.6060203@gmail.com>



On 2017年09月21日 06:02, John Fastabend wrote:
> On 09/19/2017 02:42 AM, Jason Wang wrote:
>> This patch tries to add XDP_REDIRECT for virtio-net. The changes are
>> not complex as we could use exist XDP_TX helpers for most of the
>> work. The rest is passing the XDP_TX to NAPI handler for implementing
>> batching.
>>
>> Cc: John Fastabend <john.fastabend@gmail.com>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
> [...]
>
>> @@ -678,12 +711,20 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>>   			}
>>   			break;
>>   		case XDP_TX:
>> -			if (unlikely(!virtnet_xdp_xmit(vi, &xdp)))
>> +			if (unlikely(!__virtnet_xdp_xmit(vi, &xdp)))
>>   				trace_xdp_exception(vi->dev, xdp_prog, act);
>> +			else
>> +				*xdp_xmit = true;
>>   			if (unlikely(xdp_page != page))
>>   				goto err_xdp;
>>   			rcu_read_unlock();
>>   			goto xdp_xmit;
>> +		case XDP_REDIRECT:
>> +			err = xdp_do_redirect(dev, &xdp, xdp_prog);
>> +			if (err)
>> +				*xdp_xmit = true;
> Is this a typo? Should above be
>
> 			if (!err)
> 				*xdp_xmit = true;
>
> this would match the pattern in receive_small and also make more sense.

Right, will post a fix.

>
>> +			rcu_read_unlock();
>> +			goto xdp_xmit;
>>   		default:
>>   			bpf_warn_invalid_xdp_action(act);
>>   		case XDP_ABORTED:
>> @@ -788,7 +829,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>>   }
>>   
> [...]
>
> Otherwise looks good to me thanks for doing this.
>
> .John
>

Thanks for the review.

^ permalink raw reply

* Re: [PATCH] wireless: iwlegacy: make const array static to shrink object code size
From: Johannes Berg @ 2017-09-22  6:00 UTC (permalink / raw)
  To: Colin King, Stanislaw Gruszka, Kalle Valo, linux-wireless, netdev
  Cc: kernel-janitors, linux-kernel
In-Reply-To: <20170921225630.21916-1-colin.king@canonical.com>

On Thu, 2017-09-21 at 23:56 +0100, Colin King wrote:
> From: Colin Ian King <colin.king@canonical.com>
> 
> Don't populate const array ac_to_fifo on the stack in an inlined
> function, instead make it static.  Makes the object code smaller
> by over 800 bytes:
> 
>    text	   data	    bss	    dec	    hex	filename
>  159029	  33154	   1216	 193399	  2f377	4965-mac.o
> 
>    text	   data	    bss	    dec	    hex	filename
>  158122	  33250	   1216	 192588	  2f04c	4965-mac.o
> 
> (gcc version 7.2.0 x86_64)

I'm curious - how did you find this?

johannes

^ permalink raw reply

* [PATCH net-next v2 0/6] rtnetlink: preparation patches for further rtnl lock pushdown/removal
From: Florian Westphal @ 2017-09-22  6:10 UTC (permalink / raw)
  To: netdev

First patch adds a rudimentary vrf test case.

Remaining patches split large rtnl_fill_ifinfo into smaller chunks
to better see which parts

1. require rtnl
2. do not require it at all
3. rely on rtnl locking now but could be converted

changes since v1:
- fix subject lines of patches 2 and 5
- drop pure ASSERT_RTNL annotation patch, it's not all that useful
- remove two extra ASSERT_RTNL in last patch

^ permalink raw reply

* [PATCH net-next v2 1/6] selftests: rtnetlink.sh: add rudimentary vrf test
From: Florian Westphal @ 2017-09-22  6:10 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal, David Ahern
In-Reply-To: <20170922061008.14723-1-fw@strlen.de>

Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 tools/testing/selftests/net/rtnetlink.sh | 42 ++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh
index 4b48de565cae..6ee2bbaebcd4 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -291,6 +291,47 @@ kci_test_ifalias()
 	echo "PASS: set ifalias $namewant for $devdummy"
 }
 
+kci_test_vrf()
+{
+	vrfname="test-vrf"
+	ret=0
+
+	ip link show type vrf 2>/dev/null
+	if [ $? -ne 0 ]; then
+		echo "SKIP: vrf: iproute2 too old"
+		return 0
+	fi
+
+	ip link add "$vrfname" type vrf table 10
+	check_err $?
+	if [ $ret -ne 0 ];then
+		echo "FAIL: can't add vrf interface, skipping test"
+		return 0
+	fi
+
+	ip -br link show type vrf | grep -q "$vrfname"
+	check_err $?
+	if [ $ret -ne 0 ];then
+		echo "FAIL: created vrf device not found"
+		return 1
+	fi
+
+        ip link set dev "$vrfname" up
+	check_err $?
+
+	ip link set dev "$devdummy" master "$vrfname"
+	check_err $?
+	ip link del dev "$vrfname"
+	check_err $?
+
+	if [ $ret -ne 0 ];then
+		echo "FAIL: vrf"
+		return 1
+	fi
+
+	echo "PASS: vrf"
+}
+
 kci_test_rtnl()
 {
 	kci_add_dummy
@@ -306,6 +347,7 @@ kci_test_rtnl()
 	kci_test_bridge
 	kci_test_addrlabel
 	kci_test_ifalias
+	kci_test_vrf
 
 	kci_del_dummy
 }
-- 
2.13.5

^ permalink raw reply related

* [PATCH net-next v2 2/6] rtnetlink: add helper to put master ifindex
From: Florian Westphal @ 2017-09-22  6:10 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal
In-Reply-To: <20170922061008.14723-1-fw@strlen.de>

rtnl_fill_ifinfo currently requires caller to hold the rtnl mutex.
Unfortunately the function is quite large which makes it harder to see
which spots need the lock, which spots assume it and which ones could do
without.

Add helpers to factor out the ifindex dumping.

One helper can use rcu to remove rtnl dependency.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/core/rtnetlink.c | 32 +++++++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a78fd61da0ec..c801212ee40e 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1307,6 +1307,31 @@ static u32 rtnl_get_event(unsigned long event)
 	return rtnl_event_type;
 }
 
+static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev)
+{
+	const struct net_device *upper_dev;
+	int ret = 0;
+
+	rcu_read_lock();
+
+	upper_dev = netdev_master_upper_dev_get_rcu(dev);
+	if (upper_dev)
+		ret = nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex);
+
+	rcu_read_unlock();
+	return ret;
+}
+
+static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev)
+{
+	int ifindex = dev_get_iflink(dev);
+
+	if (dev->ifindex == ifindex)
+		return 0;
+
+	return nla_put_u32(skb, IFLA_LINK, ifindex);
+}
+
 static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			    int type, u32 pid, u32 seq, u32 change,
 			    unsigned int flags, u32 ext_filter_mask,
@@ -1316,7 +1341,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	struct nlmsghdr *nlh;
 	struct nlattr *af_spec;
 	struct rtnl_af_ops *af_ops;
-	struct net_device *upper_dev = netdev_master_upper_dev_get(dev);
 
 	ASSERT_RTNL();
 	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags);
@@ -1345,10 +1369,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 #ifdef CONFIG_RPS
 	    nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) ||
 #endif
-	    (dev->ifindex != dev_get_iflink(dev) &&
-	     nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))) ||
-	    (upper_dev &&
-	     nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex)) ||
+	    nla_put_iflink(skb, dev) ||
+	    put_master_ifindex(skb, dev) ||
 	    nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) ||
 	    (dev->qdisc &&
 	     nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) ||
-- 
2.13.5

^ permalink raw reply related

* [PATCH net-next v2 3/6] rtnetlink: add helper to dump qdisc name
From: Florian Westphal @ 2017-09-22  6:10 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal
In-Reply-To: <20170922061008.14723-1-fw@strlen.de>

We can use rcu here to make this safe even if we would not hold rtnl:
qdisc_destroy uses call_rcu to free the Qdisc struct.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/core/rtnetlink.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index c801212ee40e..ad3f27da37a8 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1332,6 +1332,19 @@ static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev)
 	return nla_put_u32(skb, IFLA_LINK, ifindex);
 }
 
+static int nla_put_qdisc(struct sk_buff *skb, struct net_device *dev)
+{
+	struct Qdisc *q;
+	int ret = 0;
+
+	rcu_read_lock();
+	q = READ_ONCE(dev->qdisc);
+	if (q)
+		ret = nla_put_string(skb, IFLA_QDISC, q->ops->id);
+	rcu_read_unlock();
+	return ret;
+}
+
 static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			    int type, u32 pid, u32 seq, u32 change,
 			    unsigned int flags, u32 ext_filter_mask,
@@ -1372,8 +1385,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	    nla_put_iflink(skb, dev) ||
 	    put_master_ifindex(skb, dev) ||
 	    nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) ||
-	    (dev->qdisc &&
-	     nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) ||
+	    nla_put_qdisc(skb, dev) ||
 	    (dev->ifalias &&
 	     nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) ||
 	    nla_put_u32(skb, IFLA_CARRIER_CHANGES,
-- 
2.13.5

^ permalink raw reply related

* [PATCH net-next v2 4/6] rtnetlink: add helper to dump ifalias
From: Florian Westphal @ 2017-09-22  6:10 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal
In-Reply-To: <20170922061008.14723-1-fw@strlen.de>

ifalias is currently protected by rtnl mutex, add assertion
as a reminder.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/core/rtnetlink.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index ad3f27da37a8..42ff582a010e 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1345,6 +1345,16 @@ static int nla_put_qdisc(struct sk_buff *skb, struct net_device *dev)
 	return ret;
 }
 
+static noinline int nla_put_ifalias(struct sk_buff *skb, struct net_device *dev)
+{
+	ASSERT_RTNL();
+
+	if (dev->ifalias)
+		return nla_put_string(skb, IFLA_IFALIAS, dev->ifalias);
+
+	return 0;
+}
+
 static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			    int type, u32 pid, u32 seq, u32 change,
 			    unsigned int flags, u32 ext_filter_mask,
@@ -1386,8 +1396,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	    put_master_ifindex(skb, dev) ||
 	    nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) ||
 	    nla_put_qdisc(skb, dev) ||
-	    (dev->ifalias &&
-	     nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) ||
+	    nla_put_ifalias(skb, dev) ||
 	    nla_put_u32(skb, IFLA_CARRIER_CHANGES,
 			atomic_read(&dev->carrier_changes)) ||
 	    nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down))
-- 
2.13.5

^ permalink raw reply related

* [PATCH net-next v2 5/6] rtnetlink: add helpers to dump vf and netnsid information
From: Florian Westphal @ 2017-09-22  6:10 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal
In-Reply-To: <20170922061008.14723-1-fw@strlen.de>

Similar to earlier patches, split out more parts of this function to
better see what is happening and where we assume rtnl is locked.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/core/rtnetlink.c | 80 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 50 insertions(+), 30 deletions(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 42ff582a010e..590823f70cc3 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1211,6 +1211,36 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
 	return -EMSGSIZE;
 }
 
+static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb,
+					   struct net_device *dev,
+					   u32 ext_filter_mask)
+{
+	struct nlattr *vfinfo;
+	int i, num_vfs;
+
+	if (!dev->dev.parent || ((ext_filter_mask & RTEXT_FILTER_VF) == 0))
+		return 0;
+
+	num_vfs = dev_num_vf(dev->dev.parent);
+	if (nla_put_u32(skb, IFLA_NUM_VF, num_vfs))
+		return -EMSGSIZE;
+
+	if (!dev->netdev_ops->ndo_get_vf_config)
+		return 0;
+
+	vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST);
+	if (!vfinfo)
+		return -EMSGSIZE;
+
+	for (i = 0; i < num_vfs; i++) {
+		if (rtnl_fill_vfinfo(skb, dev, i, vfinfo))
+			return -EMSGSIZE;
+	}
+
+	nla_nest_end(skb, vfinfo);
+	return 0;
+}
+
 static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
 {
 	struct rtnl_link_ifmap map;
@@ -1355,6 +1385,23 @@ static noinline int nla_put_ifalias(struct sk_buff *skb, struct net_device *dev)
 	return 0;
 }
 
+static noinline int rtnl_fill_link_netnsid(struct sk_buff *skb,
+					   const struct net_device *dev)
+{
+	if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) {
+		struct net *link_net = dev->rtnl_link_ops->get_link_net(dev);
+
+		if (!net_eq(dev_net(dev), link_net)) {
+			int id = peernet2id_alloc(dev_net(dev), link_net);
+
+			if (nla_put_s32(skb, IFLA_LINK_NETNSID, id))
+				return -EMSGSIZE;
+		}
+	}
+
+	return 0;
+}
+
 static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			    int type, u32 pid, u32 seq, u32 change,
 			    unsigned int flags, u32 ext_filter_mask,
@@ -1428,27 +1475,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	if (rtnl_fill_stats(skb, dev))
 		goto nla_put_failure;
 
-	if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF) &&
-	    nla_put_u32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent)))
+	if (rtnl_fill_vf(skb, dev, ext_filter_mask))
 		goto nla_put_failure;
 
-	if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent &&
-	    ext_filter_mask & RTEXT_FILTER_VF) {
-		int i;
-		struct nlattr *vfinfo;
-		int num_vfs = dev_num_vf(dev->dev.parent);
-
-		vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST);
-		if (!vfinfo)
-			goto nla_put_failure;
-		for (i = 0; i < num_vfs; i++) {
-			if (rtnl_fill_vfinfo(skb, dev, i, vfinfo))
-				goto nla_put_failure;
-		}
-
-		nla_nest_end(skb, vfinfo);
-	}
-
 	if (rtnl_port_fill(skb, dev, ext_filter_mask))
 		goto nla_put_failure;
 
@@ -1460,17 +1489,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			goto nla_put_failure;
 	}
 
-	if (dev->rtnl_link_ops &&
-	    dev->rtnl_link_ops->get_link_net) {
-		struct net *link_net = dev->rtnl_link_ops->get_link_net(dev);
-
-		if (!net_eq(dev_net(dev), link_net)) {
-			int id = peernet2id_alloc(dev_net(dev), link_net);
-
-			if (nla_put_s32(skb, IFLA_LINK_NETNSID, id))
-				goto nla_put_failure;
-		}
-	}
+	if (rtnl_fill_link_netnsid(skb, dev))
+		goto nla_put_failure;
 
 	if (!(af_spec = nla_nest_start(skb, IFLA_AF_SPEC)))
 		goto nla_put_failure;
-- 
2.13.5

^ permalink raw reply related

* [PATCH net-next v2 6/6] rtnetlink: rtnl_have_link_slave_info doesn't need rtnl
From: Florian Westphal @ 2017-09-22  6:10 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal
In-Reply-To: <20170922061008.14723-1-fw@strlen.de>

Switch it to rcu.

rtnl_link_slave_info_fill on the other hand does need rtnl mutex for now so
add an annotation to its caller as a reminder.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 Change since v1:
  - don't add ASSERT_RTNL to rtnl_link_slave_info_fill and
  rtnl_link_info_fill; they are only called via rtnl_link_fill.

 net/core/rtnetlink.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 590823f70cc3..e6f9e36d9d5a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -522,11 +522,15 @@ static size_t rtnl_link_get_af_size(const struct net_device *dev,
 static bool rtnl_have_link_slave_info(const struct net_device *dev)
 {
 	struct net_device *master_dev;
+	bool ret = false;
 
-	master_dev = netdev_master_upper_dev_get((struct net_device *) dev);
+	rcu_read_lock();
+
+	master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev);
 	if (master_dev && master_dev->rtnl_link_ops)
-		return true;
-	return false;
+		ret = true;
+	rcu_read_unlock();
+	return ret;
 }
 
 static int rtnl_link_slave_info_fill(struct sk_buff *skb,
@@ -598,6 +602,8 @@ static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev)
 	struct nlattr *linkinfo;
 	int err = -EMSGSIZE;
 
+	ASSERT_RTNL();
+
 	linkinfo = nla_nest_start(skb, IFLA_LINKINFO);
 	if (linkinfo == NULL)
 		goto out;
-- 
2.13.5

^ permalink raw reply related

* Re: tg3 pxe weirdness
From: Siva Reddy Kallam @ 2017-09-22  6:21 UTC (permalink / raw)
  To: Berend De Schouwer; +Cc: Linux Netdev List

On Thu, Sep 21, 2017 at 7:53 PM, Berend De Schouwer
<berend.de.schouwer@gmail.com> wrote:
> Hi,
>
> I've got a machine with a Broadcom bcm5762c, using the tg3 driver, that
> fails to receive network packets under some very specific conditions.
>
> It works perfectly using a 1Gbps switch.  If, however, it first uses
> PXE and then loads the Linux tg3 driver, and the switch is 100Mbps, it
> no longer receives packets larger than ICMP.
>
> It does do ARP and ping.
>
> If it boots using PXE on a 1Gbps switch, boots into Linux, and then
> it's plugged into 100 Mbps it also stops receiving packets.
>
> mii-diag and dmesg confirm auto-negotiated speed and flow control, and
> confirm temporary disconnect as the cables are moved.
>
> PXE boots using UNDI, which then transfers a kernel using TFTP, which
> transfers correctly.  The kernel boots, loads the tg3 driver, connects
> the network.  Up to this point everything works.  Ping will work too.
> Any other network traffic fails.
>
> Booting from a harddrive works fine.  I assume the UNDI driver
> somewhere breaks auto-negotiation.  I've tried using mii-tool and
> ethtool, but I haven't managed to make it work yet.
>
> Is it possible to get negotiation working after PXE boot?  Are there
> any tg3 driver flags that might make a difference?
>
>
> Berend

Can you please share below details?
1) Model and Manufacturer of the system
2) Linux distro/kernel used?

^ permalink raw reply

* Re: [PATCH 00/64] use setup_timer() helper function.
From: Allen @ 2017-09-22  6:22 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, linux-kernel, m.grzeschik, dmitry.tarnyagin, wg, mkl,
	Mark Einon, linux, pcnet32, Florian Fainelli,
	bcm-kernel-feedback-list, venza, ajk, jes, romieu, khc,
	Simon Kelley, linux-can, adi-buildroot-devel
In-Reply-To: <20170921.113927.1684139706973891956.davem@davemloft.net>

>
>>  This series uses setup_timer() helper function. The series
>> addresses the files under drivers/net/*.
>
> I've reviewed this series and will apply it to net-next.
>
> But please send out smaller chunks next time, maybe 10-15
> in a bunch?  64 patches at once makes it really hard for
> reviewers.

 My apologies. I'll make sure I break it into smaller chunks
in the future.


-- 
       - Allen

^ permalink raw reply

* Re: [patch net-next 03/12] ipmr: Add FIB notification access functions
From: Yotam Gigi @ 2017-09-22  6:35 UTC (permalink / raw)
  To: Nikolay Aleksandrov, Jiri Pirko, netdev; +Cc: davem, idosch, mlxsw
In-Reply-To: <24470459-704b-89a2-648a-bbd4d3a1aebb@cumulusnetworks.com>

On 09/21/2017 02:19 PM, Nikolay Aleksandrov wrote:
> On 21/09/17 09:43, Jiri Pirko wrote:
>> From: Yotam Gigi <yotamg@mellanox.com>
>>
>> Make the ipmr module register as a FIB notifier. To do that, implement both
>> the ipmr_seq_read and ipmr_dump ops.
>>
>> The ipmr_seq_read op returns a sequence counter that is incremented on
>> every notification related operation done by the ipmr. To implement that,
>> add a sequence counter in the netns_ipv4 struct and increment it whenever a
>> new MFC route or VIF are added or deleted. The sequence operations are
>> protected by the RTNL lock.
>>
>> The ipmr_dump iterates the list of MFC routes and the list of VIF entries
>> and sends notifications about them. The entries dump is done under RCU.
>>
>> Signed-off-by: Yotam Gigi <yotamg@mellanox.com>
>> Reviewed-by: Ido Schimmel <idosch@mellanox.com>
>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>> ---
>>  include/linux/mroute.h   |  15 ++++++
>>  include/net/netns/ipv4.h |   3 ++
>>  net/ipv4/ipmr.c          | 135 ++++++++++++++++++++++++++++++++++++++++++++++-
>>  3 files changed, 151 insertions(+), 2 deletions(-)
>>
> [snip]
>> +
>> +static int ipmr_dump(struct net *net, struct notifier_block *nb)
>> +{
>> +	struct mr_table *mrt;
>> +	int err;
>> +
>> +	err = ipmr_rules_dump(net, nb);
>> +	if (err)
>> +		return err;
>> +
>> +	ipmr_for_each_table(mrt, net) {
>> +		struct vif_device *v = &mrt->vif_table[0];
>> +		struct mfc_cache *mfc;
>> +		int vifi;
>> +
>> +		/* Notify on table VIF entries */
>> +		for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) {
>> +			if (!v->dev)
>> +				continue;
>> +
>> +			call_ipmr_vif_entry_notifier(nb, net, FIB_EVENT_VIF_ADD,
>> +						     v, vifi, mrt->id);
>> +		}
> The VIF table is protected by mrt_lock (rwlock), here with RCU only
> you're not guaranteed to keep v->dev. It can become NULL after the check above.
> For details you can see vif_delete() in net/ipv4/ipmr.c. You need at least
> mrt_lock for reading.

Hmm, that's interesting. That situation would lead the dump to be inconsistent,
thus eventually discarded, right? So I can also just make sure that the driver
ignores dev=NULL notifications and that would be enough to solve it.

I have to think about it a bit more, but anyway, maybe taking the mrt_lock for
the VIF dump is the right solution here.

Anyway, thanks for the review!


>> +
>> +		/* Notify on table MFC entries */
>> +		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
>> +			call_ipmr_mfc_entry_notifier(nb, net,
>> +						     FIB_EVENT_ENTRY_ADD, mfc,
>> +						     mrt->id);
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static const struct fib_notifier_ops ipmr_notifier_ops_template = {
>> +	.family		= RTNL_FAMILY_IPMR,
>> +	.fib_seq_read	= ipmr_seq_read,
>> +	.fib_dump	= ipmr_dump,
>> +	.owner		= THIS_MODULE,
>> +};
>> +
>> +int __net_init ipmr_notifier_init(struct net *net)
>> +{
>> +	struct fib_notifier_ops *ops;
>> +
>> +	net->ipv4.ipmr_seq = 0;
>> +
>> +	ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net);
>> +	if (IS_ERR(ops))
>> +		return PTR_ERR(ops);
>> +	net->ipv4.ipmr_notifier_ops = ops;
>> +
>> +	return 0;
>> +}
>> +
>> +static void __net_exit ipmr_notifier_exit(struct net *net)
>> +{
>> +	fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops);
>> +	net->ipv4.ipmr_notifier_ops = NULL;
>> +}
>> +
>>  /* Setup for IP multicast routing */
>>  static int __net_init ipmr_net_init(struct net *net)
>>  {
>>  	int err;
>>  
>> +	err = ipmr_notifier_init(net);
>> +	if (err)
>> +		goto ipmr_notifier_fail;
>> +
>>  	err = ipmr_rules_init(net);
>>  	if (err < 0)
>> -		goto fail;
>> +		goto ipmr_rules_fail;
>>  
>>  #ifdef CONFIG_PROC_FS
>>  	err = -ENOMEM;
>> @@ -3074,7 +3202,9 @@ static int __net_init ipmr_net_init(struct net *net)
>>  proc_vif_fail:
>>  	ipmr_rules_exit(net);
>>  #endif
>> -fail:
>> +ipmr_rules_fail:
>> +	ipmr_notifier_exit(net);
>> +ipmr_notifier_fail:
>>  	return err;
>>  }
>>  
>> @@ -3084,6 +3214,7 @@ static void __net_exit ipmr_net_exit(struct net *net)
>>  	remove_proc_entry("ip_mr_cache", net->proc_net);
>>  	remove_proc_entry("ip_mr_vif", net->proc_net);
>>  #endif
>> +	ipmr_notifier_exit(net);
>>  	ipmr_rules_exit(net);
>>  }
>>  
>>

^ permalink raw reply

* [PATCH net-next] virtio-net: correctly set xdp_xmit for mergeable buffer
From: Jason Wang @ 2017-09-22  6:38 UTC (permalink / raw)
  To: mst, jasowang, virtualization, netdev, linux-kernel; +Cc: John Fastabend

We should set xdp_xmit only when xdp_do_redirect() succeeds.

Cc: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index f6c1f13..dd14a45 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -721,7 +721,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			goto xdp_xmit;
 		case XDP_REDIRECT:
 			err = xdp_do_redirect(dev, &xdp, xdp_prog);
-			if (err)
+			if (!err)
 				*xdp_xmit = true;
 			rcu_read_unlock();
 			goto xdp_xmit;
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH net-next 2/2] net: dsa: lan9303: Add basic offloading of unicast traffic
From: Egil Hjelmeland @ 2017-09-22  7:06 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: vivien.didelot, f.fainelli, netdev, linux-kernel
In-Reply-To: <20170921142127.GB27589@lunn.ch>

Den 21. sep. 2017 16:21, skrev Andrew Lunn:
> Hi Egil
> 
>> +static void lan9303_bridge_ports(struct lan9303 *chip)
>> +{
>> +	/* ports bridged: remove mirroring */
>> +	lan9303_write_switch_reg(chip, LAN9303_SWE_PORT_MIRROR, 0);
>> +}
> 
> Could you replace the 0 with something symbolic which makes this
> easier to understand.
> 
> #define LAN9303_SWE_PORT_MIRROR_DISABLED 0
> 

OK

>> +
>>   static int lan9303_handle_reset(struct lan9303 *chip)
>>   {
>>   	if (!chip->reset_gpio)
>> @@ -844,6 +866,69 @@ static void lan9303_port_disable(struct dsa_switch *ds, int port,
>>   	}
>>   }
>>   
>> +static int lan9303_port_bridge_join(struct dsa_switch *ds, int port,
>> +				    struct net_device *br)
>> +{
>> +	struct lan9303 *chip = ds->priv;
>> +
>> +	dev_dbg(chip->dev, "%s(port %d)\n", __func__, port);
>> +	if (ds->ports[1].bridge_dev ==  ds->ports[2].bridge_dev) {
>> +		lan9303_bridge_ports(chip);
>> +		chip->is_bridged = true;  /* unleash stp_state_set() */
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static void lan9303_port_bridge_leave(struct dsa_switch *ds, int port,
>> +				      struct net_device *br)
>> +{
>> +	struct lan9303 *chip = ds->priv;
>> +
>> +	dev_dbg(chip->dev, "%s(port %d)\n", __func__, port);
>> +	if (chip->is_bridged) {
>> +		lan9303_separate_ports(chip);
>> +		chip->is_bridged = false;
>> +	}
>> +}
>> +
>> +static void lan9303_port_stp_state_set(struct dsa_switch *ds, int port,
>> +				       u8 state)
>> +{
>> +	int portmask, portstate;
>> +	struct lan9303 *chip = ds->priv;
>> +
>> +	dev_dbg(chip->dev, "%s(port %d, state %d)\n",
>> +		__func__, port, state);
>> +	if (!chip->is_bridged)
>> +		return; /* touching SWE_PORT_STATE will break port separation */
> 
> I'm wondering how this is supposed to work. Please add a good comment
> here, since the hardware is forcing you to do something odd.
> 
> Maybe it would be a good idea to save the STP state in chip.  And then
> when chip->is_bridged is set true, change the state in the hardware to
> the saved value?
> 
> What happens when port 0 is added to the bridge, there is then a
> minute pause and then port 1 is added? I would expect that as soon as
> port 0 is added, the STP state machine for port 0 will start and move
> into listening and then forwarding. Due to hardware limitations it
> looks like you cannot do this. So what state is the hardware
> effectively in? Blocking? Forwarding?
> 
> Then port 1 is added. You can then respect the states. Port 1 will
> do blocking->listening->forwarding, but what about port 0? The calls
> won't get repeated? How does it transition to forwarding?
> 
>    Andrew
> 

I see your point with the "minute pause" argument. Although a bit
contrived use case, it is easy to fix by caching the STP state, as
you suggest. So I can do that.

The port separation method is from the original version of the driver,
not by me.

I have read through the datasheet to see if there are other ways
to make port separation, but I could not find anything.
If anybody cares to read through the 300+ page lan9303.pdf and prove
me wrong, I am happy to do it differently.

How do other DSA HW chips handle port separation? Knowing that
could perhaps help me know what to look for.

Egil
'

>> +
>> +	switch (state) {
>> +	case BR_STATE_DISABLED:
>> +		portstate = LAN9303_SWE_PORT_STATE_DISABLED_PORT0;
>> +		break;
>> +	case BR_STATE_BLOCKING:
>> +	case BR_STATE_LISTENING:
>> +		portstate = LAN9303_SWE_PORT_STATE_BLOCKING_PORT0;
>> +		break;
>> +	case BR_STATE_LEARNING:
>> +		portstate = LAN9303_SWE_PORT_STATE_LEARNING_PORT0;
>> +		break;
>> +	case BR_STATE_FORWARDING:
>> +		portstate = LAN9303_SWE_PORT_STATE_FORWARDING_PORT0;
>> +		break;
>> +	default:
>> +		portstate = LAN9303_SWE_PORT_STATE_DISABLED_PORT0;
>> +		dev_err(chip->dev, "unknown stp state: port %d, state %d\n",
>> +			port, state);
>> +	}
>> +
>> +	portmask = 0x3 << (port * 2);
>> +	portstate     <<= (port * 2);
>> +	lan9303_write_switch_reg_mask(chip, LAN9303_SWE_PORT_STATE,
>> +				      portstate, portmask);
>> +}
> 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox