Netdev List
 help / color / mirror / Atom feed
* [PATCH net 3/3] gre: receive also TEB packets for lwtunnels
From: Jiri Benc @ 2016-04-22 17:44 UTC (permalink / raw)
  To: netdev; +Cc: Pravin B Shelar, Thomas Graf, Simon Horman
In-Reply-To: <cover.1461346798.git.jbenc@redhat.com>

For ipgre interfaces in collect metadata mode, receive also traffic with
encapsulated Ethernet headers. The lwtunnel users are supposed to sort this
out correctly. This allows to have mixed Ethernet + L3-only traffic on the
same lwtunnel interface.

To keep backwards compatibility and prevent any surprises, gretap interfaces
have priority in receiving packets with Ethernet headers.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
---
 include/net/ip_tunnels.h |  1 +
 net/ipv4/ip_gre.c        | 34 +++++++++++++++++++++++++---------
 2 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 56050f913339..ac8d6822072e 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -161,6 +161,7 @@ struct tnl_ptk_info {
 
 #define PACKET_RCVD	0
 #define PACKET_REJECT	1
+#define PACKET_NEXT	2
 
 #define IP_TNL_HASH_BITS   7
 #define IP_TNL_HASH_SIZE   (1 << IP_TNL_HASH_BITS)
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f973e0a58993..ccd6b098928d 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -379,19 +379,13 @@ static __be32 tunnel_id_to_key(__be64 x)
 #endif
 }
 
-static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
+static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
+		       struct ip_tunnel_net *itn)
 {
-	struct net *net = dev_net(skb->dev);
 	struct metadata_dst *tun_dst = NULL;
-	struct ip_tunnel_net *itn;
 	const struct iphdr *iph;
 	struct ip_tunnel *tunnel;
 
-	if (tpi->proto == htons(ETH_P_TEB))
-		itn = net_generic(net, gre_tap_net_id);
-	else
-		itn = net_generic(net, ipgre_net_id);
-
 	iph = ip_hdr(skb);
 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
 				  iph->saddr, iph->daddr, tpi->key);
@@ -412,7 +406,29 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
 		ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
 		return PACKET_RCVD;
 	}
-	return PACKET_REJECT;
+	return PACKET_NEXT;
+}
+
+static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
+{
+	struct net *net = dev_net(skb->dev);
+	struct ip_tunnel_net *itn;
+	int res;
+
+	if (tpi->proto == htons(ETH_P_TEB))
+		itn = net_generic(net, gre_tap_net_id);
+	else
+		itn = net_generic(net, ipgre_net_id);
+
+	res = __ipgre_rcv(skb, tpi, itn);
+	if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) {
+		/* ipgre tunnels in collect metadata mode should receive
+		 * also ETH_P_TEB traffic
+		 */
+		itn = net_generic(net, ipgre_net_id);
+		res = __ipgre_rcv(skb, tpi, itn);
+	}
+	return res;
 }
 
 static int gre_rcv(struct sk_buff *skb)
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net 2/3] gre: build header correctly for collect metadata tunnels
From: Jiri Benc @ 2016-04-22 17:44 UTC (permalink / raw)
  To: netdev; +Cc: Pravin B Shelar, Thomas Graf, Simon Horman
In-Reply-To: <cover.1461346798.git.jbenc@redhat.com>

In ipgre (i.e. not gretap) + collect metadata mode, the skb was assumed to
contain Ethernet header and was encapsulated as ETH_P_TEB. This is not the
case, the interface is ARPHRD_IPGRE and the protocol to be used for
encapsulation is skb->protocol.

Fixes: 2e15ea390e6f4 ("ip_gre: Add support to collect tunnel metadata.")
Signed-off-by: Jiri Benc <jbenc@redhat.com>
---
 net/ipv4/ip_gre.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index d0abde4236af..f973e0a58993 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -523,7 +523,8 @@ static struct rtable *gre_get_rt(struct sk_buff *skb,
 	return ip_route_output_key(net, fl);
 }
 
-static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
+static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
+			__be16 proto)
 {
 	struct ip_tunnel_info *tun_info;
 	const struct ip_tunnel_key *key;
@@ -575,7 +576,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
-	build_header(skb, tunnel_hlen, flags, htons(ETH_P_TEB),
+	build_header(skb, tunnel_hlen, flags, proto,
 		     tunnel_id_to_key(tun_info->key.tun_id), 0);
 
 	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ?  htons(IP_DF) : 0;
@@ -616,7 +617,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
 	const struct iphdr *tnl_params;
 
 	if (tunnel->collect_md) {
-		gre_fb_xmit(skb, dev);
+		gre_fb_xmit(skb, dev, skb->protocol);
 		return NETDEV_TX_OK;
 	}
 
@@ -660,7 +661,7 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 
 	if (tunnel->collect_md) {
-		gre_fb_xmit(skb, dev);
+		gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
 		return NETDEV_TX_OK;
 	}
 
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net 1/3] gre: do not assign header_ops in collect metadata mode
From: Jiri Benc @ 2016-04-22 17:44 UTC (permalink / raw)
  To: netdev; +Cc: Pravin B Shelar, Thomas Graf, Simon Horman
In-Reply-To: <cover.1461346798.git.jbenc@redhat.com>

In ipgre mode (i.e. not gretap) with collect metadata flag set, the tunnel
is incorrectly assumed to be mGRE in NBMA mode (see commit 6a5f44d7a048c).
This is not the case, we're controlling the encapsulation addresses by
lwtunnel metadata. And anyway, assigning dev->header_ops in collect metadata
mode does not make sense.

Fixes: 2e15ea390e6f4 ("ip_gre: Add support to collect tunnel metadata.")
Signed-off-by: Jiri Benc <jbenc@redhat.com>
---
 net/ipv4/ip_gre.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index af5d1f38217f..d0abde4236af 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -893,7 +893,7 @@ static int ipgre_tunnel_init(struct net_device *dev)
 	netif_keep_dst(dev);
 	dev->addr_len		= 4;
 
-	if (iph->daddr) {
+	if (iph->daddr && !tunnel->collect_md) {
 #ifdef CONFIG_NET_IPGRE_BROADCAST
 		if (ipv4_is_multicast(iph->daddr)) {
 			if (!iph->saddr)
@@ -902,8 +902,9 @@ static int ipgre_tunnel_init(struct net_device *dev)
 			dev->header_ops = &ipgre_header_ops;
 		}
 #endif
-	} else
+	} else if (!tunnel->collect_md) {
 		dev->header_ops = &ipgre_header_ops;
+	}
 
 	return ip_tunnel_init(dev);
 }
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net 0/3] ipgre: fix lwtunnel support
From: Jiri Benc @ 2016-04-22 17:44 UTC (permalink / raw)
  To: netdev; +Cc: Pravin B Shelar, Thomas Graf, Simon Horman

lwtunnels currently work only with gretap. This patchset fixes the bugs in
ipgre metadata mode implementation.

As an example, in this setup:

ip a a 192.168.1.1/24 dev eth0
ip l a gre1 type gre external
ip l s gre1 up
ip a a 192.168.99.1/24 dev gre1
ip r a 192.168.99.2/32 encap ip dst 192.168.1.2 ttl 10 dev gre1
ping 192.168.99.2

the traffic does not go through before this patchset and does as expected
with it applied.

Jiri Benc (3):
  gre: do not assign header_ops in collect metadata mode
  gre: build header correctly for collect metadata tunnels
  gre: receive also TEB packets for lwtunnels

 include/net/ip_tunnels.h |  1 +
 net/ipv4/ip_gre.c        | 48 +++++++++++++++++++++++++++++++++---------------
 2 files changed, 34 insertions(+), 15 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* Re: [PATCH iproute2] ss: add SK_MEMINFO_DROPS display
From: Stephen Hemminger @ 2016-04-22 17:21 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <1461241144.7627.6.camel@edumazet-glaptop3.roam.corp.google.com>

On Thu, 21 Apr 2016 05:19:04 -0700
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> From: Eric Dumazet <edumazet@google.com>
> 
> SK_MEMINFO_DROPS is added in linux-4.7 for TCP, UDP and SCTP
> 
> skmem will display the socket drop count using d prefix as in :
> 
> $ ss -tm src :22 | more
> State      Recv-Q Send-Q Local Address:Port    Peer Address:Port                
> ESTAB      0      52     10.246.7.151:ssh      172.20.10.101:50759                
> 	 skmem:(r0,rb8388608,t0,tb8388608,f1792,w2304,o0,bl0,d0)


Applied to net-next, thanks

^ permalink raw reply

* Re: [RFC PATCH] gro: Partly revert "net: gro: allow to build full sized skb"
From: Alexander Duyck @ 2016-04-22 17:14 UTC (permalink / raw)
  To: Steffen Klassert; +Cc: Eric Dumazet, Sowmini Varadhan, Netdev
In-Reply-To: <20160422085107.GJ3347@gauss.secunet.com>

On Fri, Apr 22, 2016 at 1:51 AM, Steffen Klassert
<steffen.klassert@secunet.com> wrote:
> On Thu, Apr 21, 2016 at 09:02:48AM -0700, Alexander Duyck wrote:
>> On Thu, Apr 21, 2016 at 12:40 AM, Steffen Klassert
>> <steffen.klassert@secunet.com> wrote:
>> > This partly reverts the below mentioned patch because on
>> > forwarding, such skbs can't be offloaded to a NIC.
>> >
>> > We need this to get IPsec GRO for forwarding to work properly,
>> > otherwise the GRO aggregated packets get segmented again by
>> > the GSO layer. Although discovered when implementing IPsec GRO,
>> > this is a general problem in the forwarding path.
>>
>> I'm confused as to why you would need this to get IPsec GRO forwarding
>> to work.
>
> It works without this, but the performance numbers are not that good
> if we have to do GSO in software.

Well really GSO is only meant to preform better than if we didn't do
any GRO/GSO at all.  If that isn't the case I wouldn't consider it a
regression since as Eric points out there are other scenerios where
you end up with a chain of buffers stuck on the fraglist.  Mostly what
GRO/GSO gets you is fewer runs through the stack.

>> Are you having to go through a device that doesn't have
>> NETIF_F_FRAGLIST defined?
>
> I don't know of any NIC that can do TSO on a skbuff with fraglist,
> that's why I try to avoid to have a buffer with fraglist.
>

Most of them don't.  There are only one or two NICs out there that
support transmitting a frame that has a fraglist.

>> Also what is the issue with having to go
>> through the GSO layer on segmentation?  It seems like we might be able
>> to do something like what we did with GSO partial to split frames so
>> that they are in chunks that wouldn't require NETIF_F_FRAGLIST.  Then
>> you could get the best of both worlds in that the stack would only
>> process one super-frame, and the transmitter could TSO a series of
>> frames that are some fixed MSS in size.
>
> This could be interesting. Then we could have a buffer with
> fraglist, GSO layer splits in skbuffs without fraglist that
> can be TSO offloaded. Something like this might solve my
> performance problems.

Right.  It is something to think about.  I was considering what might
be involved to make a fraglist based skb a GSO type.  Then we might be
able to handle it kind of like what we do for the whole
SKB_GSO_DODGY/NETIF_F_GSO_ROBUST path.  Basically if we just need to
break the frame at the fraglist level it probably wouldn't be that
hard to do assuming each skb is MSS aligned in terms of size.

- Alex

^ permalink raw reply

* [PATCH net-next] tc_act: export all user headers
From: Stephen Hemminger @ 2016-04-22 17:06 UTC (permalink / raw)
  To: Jamal Hadi Salim, David Miller; +Cc: netdev

The file tc_ife.h was missing from the export list.
Rather than continue to cherry-pick, just export all headers in the directory.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>

--- a/include/uapi/linux/tc_act/Kbuild	2015-05-21 15:13:05.121132983 -0700
+++ b/include/uapi/linux/tc_act/Kbuild	2016-04-22 10:03:24.733449078 -0700
@@ -1,12 +1,2 @@
 # UAPI Header export list
-header-y += tc_csum.h
-header-y += tc_defact.h
-header-y += tc_gact.h
-header-y += tc_ipt.h
-header-y += tc_mirred.h
-header-y += tc_nat.h
-header-y += tc_pedit.h
-header-y += tc_skbedit.h
-header-y += tc_vlan.h
-header-y += tc_bpf.h
-header-y += tc_connmark.h
+header-y += *.h

^ permalink raw reply

* Re: [iproute2 PATCH v3 1/1] tc: introduce IFE action
From: Stephen Hemminger @ 2016-04-22 17:00 UTC (permalink / raw)
  To: Jamal Hadi Salim; +Cc: netdev, phil
In-Reply-To: <1461274814-16235-1-git-send-email-jhs@emojatatu.com>

On Thu, 21 Apr 2016 17:40:14 -0400
Jamal Hadi Salim <jhs@mojatatu.com> wrote:

> new file mode 100644
> index 0000000..d648ff6
> --- /dev/null
> +++ b/include/linux/tc_ife.h
> @@ -0,0 +1,38 @@
> +#ifndef __UAPI_TC_IFE_H
> +#define __UAPI_TC_IFE_H
> +
> +#include <linux/types.h>
> +#include <linux/pkt_cls.h>
> +
> +#define TCA_ACT_IFE 25
> +/* Flag bits for now just encoding/decoding; mutually exclusive */
> +#define IFE_ENCODE 1
> +#define IFE_DECODE 0
> +
> +struct tc_ife {
> +	tc_gen;
> +	__u16 flags;
> +};
> +
> +/*XXX: We need to encode the total number of bytes consumed */
> +enum {
> +	TCA_IFE_UNSPEC,
> +	TCA_IFE_PARMS,
> +	TCA_IFE_TM,
> +	TCA_IFE_DMAC,
> +	TCA_IFE_SMAC,
> +	TCA_IFE_TYPE,
> +	TCA_IFE_METALST,
> +	__TCA_IFE_MAX
> +};
> +#define TCA_IFE_MAX (__TCA_IFE_MAX - 1)
> +
> +#define IFE_META_SKBMARK 1
> +#define IFE_META_HASHID 2
> +#define	IFE_META_PRIO 3
> +#define	IFE_META_QMAP 4
> +/*Can be overridden at runtime by module option*/
> +#define	__IFE_META_MAX 5
> +#define IFE_META_MAX (__IFE_META_MAX - 1)
> +
> +#endif

All header files for iproute2 (in include/linux) should come from santized kernel
headers. I do not see the file tc_ife.h in include/uapi/linux in current net-next
kernel source tree.

^ permalink raw reply

* [PATCH] devlink: export header
From: Stephen Hemminger @ 2016-04-22 16:55 UTC (permalink / raw)
  To: David Miller, Jiri Pirko; +Cc: netdev

Export devlink.h when doing make headers install.
I am going to investigate just doing all headers in the directory,
but lets add missing piece for now.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>

--- a/include/uapi/linux/Kbuild	2016-03-29 17:25:37.727993065 -0700
+++ b/include/uapi/linux/Kbuild	2016-04-22 09:49:10.317523852 -0700
@@ -96,6 +96,7 @@ header-y += cyclades.h
 header-y += cycx_cfm.h
 header-y += dcbnl.h
 header-y += dccp.h
+header-y += devlink.h
 header-y += dlmconstants.h
 header-y += dlm_device.h
 header-y += dlm.h

^ permalink raw reply

* Re: [PATCH net-next 2/9] libnl: nla_put_le64(): align on a 64-bit area
From: Eric Dumazet @ 2016-04-22 16:51 UTC (permalink / raw)
  To: Nicolas Dichtel
  Cc: dev-yBygre7rU0TnMu66kgdUjQ,
	steffen.klassert-opNxpl+3fjRBDgjK7y7TUQ,
	herbert-lOAM2aK0SrRLBo1qDEOMRrpzq4S04n8Q,
	aar-bIcnvbaLZ9MEGnE8C9+IrQ, netdev-u79uwXL29TY76Z2rM5mHXA,
	bsingharora-Re5JQEeQqe8AvxtiuMwx3w,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, kaber-dcUjhNyLwpNeoWH0uzbU5w,
	yoshfuji-VfPWfsRibaP+Ru+s062T9g,
	netfilter-devel-u79uwXL29TY76Z2rM5mHXA,
	kadlec-K40Dz/62t/MgiyqX0sVFJYdd74u8MsAO,
	kuznet-v/Mj1YrvjDBInbfyfbPRSQ, jmorris-gx6/JNMH7DfYtjvyW6yDsg,
	linux-wpan-u79uwXL29TY76Z2rM5mHXA, davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	pablo-Cap9r6Oaw4JrovVCs/uTlw
In-Reply-To: <1461339084-3849-3-git-send-email-nicolas.dichtel-pdR9zngts4EAvxtiuMwx3w@public.gmane.org>

On Fri, 2016-04-22 at 17:31 +0200, Nicolas Dichtel wrote:
> nla_data() is now aligned on a 64-bit area.
> 
> Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
> ---
>  include/net/netlink.h     |  8 +++++---
>  include/net/nl802154.h    |  6 ++++++
>  net/ieee802154/nl802154.c | 13 ++++++++-----
>  3 files changed, 19 insertions(+), 8 deletions(-)
> 
> diff --git a/include/net/netlink.h b/include/net/netlink.h
> index 6f51a8a06498..7f6b99483ab7 100644
> --- a/include/net/netlink.h
> +++ b/include/net/netlink.h
> @@ -878,14 +878,16 @@ static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value)
>  }
>  
>  /**
> - * nla_put_le64 - Add a __le64 netlink attribute to a socket buffer
> + * nla_put_le64 - Add a __le64 netlink attribute to a socket buffer and align it
>   * @skb: socket buffer to add attribute to
>   * @attrtype: attribute type
>   * @value: numeric value
> + * @padattr: attribute type for the padding
>   */
> -static inline int nla_put_le64(struct sk_buff *skb, int attrtype, __le64 value)
> +static inline int nla_put_le64(struct sk_buff *skb, int attrtype, __le64 value,
> +			       int padattr)
>  {
> -	return nla_put(skb, attrtype, sizeof(__le64), &value);
> +	return nla_put_64bit(skb, attrtype, sizeof(__le64), &value, padattr);
>  }
>  

But _why_ is it needed ?

nla_put() has no alignment assumptions, it simply copies 8 bytes.

Seems this is going too far.



_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

^ permalink raw reply

* [ethtool PATCH v7 2/2] ethtool: use netlink socket when AF_INET not available
From: David Decotigny @ 2016-04-22 16:48 UTC (permalink / raw)
  To: netdev
  Cc: Jeff Garzik, Ben Hutchings, David Miller, Vidya Sagar Ravipati,
	Joe Perches, David Decotigny
In-Reply-To: <1461343700-44087-1-git-send-email-ddecotig@gmail.com>

From: David Decotigny <decot@googlers.com>

To benefit from this, kernel commit 025c68186e07 ("netlink: add support
for NIC driver ioctls") is needed.


Signed-off-by: David Decotigny <decot@googlers.com>
---
 configure.ac | 2 +-
 ethtool.c    | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index 3105415..47d2a0f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -15,7 +15,7 @@ AM_PROG_CC_C_O
 dnl Checks for libraries.
 
 dnl Checks for header files.
-AC_CHECK_HEADERS(sys/ioctl.h)
+AC_CHECK_HEADERS(sys/ioctl.h linux/netlink.h)
 
 dnl Checks for typedefs, structures, and compiler characteristics.
 AC_MSG_CHECKING([whether <linux/types.h> defines big-endian types])
diff --git a/ethtool.c b/ethtool.c
index cb3d971..314b1b8 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -42,6 +42,9 @@
 #include <arpa/inet.h>
 
 #include <linux/sockios.h>
+#ifdef HAVE_LINUX_NETLINK_H
+# include <linux/netlink.h>
+#endif
 
 #ifndef MAX_ADDR_LEN
 #define MAX_ADDR_LEN	32
@@ -4645,6 +4648,10 @@ opt_found:
 
 		/* Open control socket. */
 		ctx.fd = socket(AF_INET, SOCK_DGRAM, 0);
+#ifdef HAVE_LINUX_NETLINK_H
+		if (ctx.fd < 0)
+			ctx.fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
+#endif
 		if (ctx.fd < 0) {
 			perror("Cannot get control socket");
 			return 70;
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* [ethtool PATCH v7 1/2] ethtool.c: add support for ETHTOOL_xLINKSETTINGS ioctls
From: David Decotigny @ 2016-04-22 16:48 UTC (permalink / raw)
  To: netdev
  Cc: Jeff Garzik, Ben Hutchings, David Miller, Vidya Sagar Ravipati,
	Joe Perches, David Decotigny
In-Reply-To: <1461343700-44087-1-git-send-email-ddecotig@gmail.com>

From: David Decotigny <decot@googlers.com>

More info with kernel commit 8d3f2806f8fb ("Merge branch
'ethtool-ksettings'").

Note: The new features implemented in this patch depend on kernel
commit 793cf87de9d1 ("Set cmd field in ETHTOOL_GLINKSETTINGS response to
wrong nwords").


Signed-off-by: David Decotigny <decot@googlers.com>
---
 ethtool.c      | 681 ++++++++++++++++++++++++++++++++++++++++++++-------------
 internal.h     |  67 ++++++
 test-cmdline.c |  13 ++
 3 files changed, 603 insertions(+), 158 deletions(-)

diff --git a/ethtool.c b/ethtool.c
index 0cd0d4f..cb3d971 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -47,42 +47,6 @@
 #define MAX_ADDR_LEN	32
 #endif
 
-#define ALL_ADVERTISED_MODES			\
-	(ADVERTISED_10baseT_Half |		\
-	 ADVERTISED_10baseT_Full |		\
-	 ADVERTISED_100baseT_Half |		\
-	 ADVERTISED_100baseT_Full |		\
-	 ADVERTISED_1000baseT_Half |		\
-	 ADVERTISED_1000baseT_Full |		\
-	 ADVERTISED_1000baseKX_Full|		\
-	 ADVERTISED_2500baseX_Full |		\
-	 ADVERTISED_10000baseT_Full |		\
-	 ADVERTISED_10000baseKX4_Full |		\
-	 ADVERTISED_10000baseKR_Full |		\
-	 ADVERTISED_10000baseR_FEC |		\
-	 ADVERTISED_20000baseMLD2_Full |	\
-	 ADVERTISED_20000baseKR2_Full |		\
-	 ADVERTISED_40000baseKR4_Full |		\
-	 ADVERTISED_40000baseCR4_Full |		\
-	 ADVERTISED_40000baseSR4_Full |		\
-	 ADVERTISED_40000baseLR4_Full |		\
-	 ADVERTISED_56000baseKR4_Full |		\
-	 ADVERTISED_56000baseCR4_Full |		\
-	 ADVERTISED_56000baseSR4_Full |		\
-	 ADVERTISED_56000baseLR4_Full)
-
-#define ALL_ADVERTISED_FLAGS			\
-	(ADVERTISED_Autoneg |			\
-	 ADVERTISED_TP |			\
-	 ADVERTISED_AUI |			\
-	 ADVERTISED_MII |			\
-	 ADVERTISED_FIBRE |			\
-	 ADVERTISED_BNC |			\
-	 ADVERTISED_Pause |			\
-	 ADVERTISED_Asym_Pause |		\
-	 ADVERTISED_Backplane |			\
-	 ALL_ADVERTISED_MODES)
-
 #ifndef HAVE_NETIF_MSG
 enum {
 	NETIF_MSG_DRV		= 0x0001,
@@ -293,6 +257,43 @@ static void get_mac_addr(char *src, unsigned char *dest)
 	}
 }
 
+static int parse_hex_u32_bitmap(const char *s,
+				unsigned int nbits, u32 *result)
+{
+	const unsigned int nwords = __KERNEL_DIV_ROUND_UP(nbits, 32);
+	size_t slen = strlen(s);
+	size_t i;
+
+	/* ignore optional '0x' prefix */
+	if ((slen > 2) && (strncasecmp(s, "0x", 2) == 0)) {
+		slen -= 2;
+		s += 2;
+	}
+
+	if (slen > 8 * nwords)  /* up to 2 digits per byte */
+		return -1;
+
+	memset(result, 0, 4 * nwords);
+	for (i = 0 ; i < slen ; ++i) {
+		const unsigned int shift = (slen - 1 - i) * 4;
+		u32 *dest = &result[shift / 32];
+		u32 nibble;
+
+		if ('a' <= s[i] && s[i] <= 'f')
+			nibble = 0xa + (s[i] - 'a');
+		else if ('A' <= s[i] && s[i] <= 'F')
+			nibble = 0xa + (s[i] - 'A');
+		else if ('0' <= s[i] && s[i] <= '9')
+			nibble = (s[i] - '0');
+		else
+			return -1;
+
+		*dest |= (nibble << (shift % 32));
+	}
+
+	return 0;
+}
+
 static void parse_generic_cmdline(struct cmd_context *ctx,
 				  int *changed,
 				  struct cmdline_info *info,
@@ -472,64 +473,157 @@ static int do_version(struct cmd_context *ctx)
 	return 0;
 }
 
-static void dump_link_caps(const char *prefix, const char *an_prefix, u32 mask,
-			   int link_mode_only);
+/* link mode routines */
 
-static void dump_supported(struct ethtool_cmd *ep)
+static __ETHTOOL_DECLARE_LINK_MODE_MASK(all_advertised_modes);
+static __ETHTOOL_DECLARE_LINK_MODE_MASK(all_advertised_flags);
+
+static void init_global_link_mode_masks(void)
 {
-	u32 mask = ep->supported;
+	static const enum ethtool_link_mode_bit_indices
+		all_advertised_modes_bits[] = {
+		ETHTOOL_LINK_MODE_10baseT_Half_BIT,
+		ETHTOOL_LINK_MODE_10baseT_Full_BIT,
+		ETHTOOL_LINK_MODE_100baseT_Half_BIT,
+		ETHTOOL_LINK_MODE_100baseT_Full_BIT,
+		ETHTOOL_LINK_MODE_1000baseT_Half_BIT,
+		ETHTOOL_LINK_MODE_1000baseT_Full_BIT,
+		ETHTOOL_LINK_MODE_1000baseKX_Full_BIT,
+		ETHTOOL_LINK_MODE_2500baseX_Full_BIT,
+		ETHTOOL_LINK_MODE_10000baseT_Full_BIT,
+		ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT,
+		ETHTOOL_LINK_MODE_10000baseKR_Full_BIT,
+		ETHTOOL_LINK_MODE_10000baseR_FEC_BIT,
+		ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT,
+		ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT,
+		ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT,
+		ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT,
+		ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT,
+		ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT,
+		ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT,
+		ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT,
+		ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT,
+		ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT,
+	};
+	static const enum ethtool_link_mode_bit_indices
+		additional_advertised_flags_bits[] = {
+		ETHTOOL_LINK_MODE_Autoneg_BIT,
+		ETHTOOL_LINK_MODE_TP_BIT,
+		ETHTOOL_LINK_MODE_AUI_BIT,
+		ETHTOOL_LINK_MODE_MII_BIT,
+		ETHTOOL_LINK_MODE_FIBRE_BIT,
+		ETHTOOL_LINK_MODE_BNC_BIT,
+		ETHTOOL_LINK_MODE_Pause_BIT,
+		ETHTOOL_LINK_MODE_Asym_Pause_BIT,
+		ETHTOOL_LINK_MODE_Backplane_BIT,
+	};
+	unsigned int i;
 
+	ethtool_link_mode_zero(all_advertised_modes);
+	ethtool_link_mode_zero(all_advertised_flags);
+	for (i = 0 ; i < ARRAY_SIZE(all_advertised_modes_bits) ; ++i) {
+		ethtool_link_mode_set_bit(all_advertised_modes_bits[i],
+					  all_advertised_modes);
+		ethtool_link_mode_set_bit(all_advertised_modes_bits[i],
+					  all_advertised_flags);
+	}
+
+	for (i = 0 ; i < ARRAY_SIZE(additional_advertised_flags_bits) ; ++i) {
+		ethtool_link_mode_set_bit(
+			additional_advertised_flags_bits[i],
+			all_advertised_flags);
+	}
+}
+
+static void dump_link_caps(const char *prefix, const char *an_prefix,
+			   const u32 *mask, int link_mode_only);
+
+static void dump_supported(const struct ethtool_link_usettings *link_usettings)
+{
 	fprintf(stdout, "	Supported ports: [ ");
-	if (mask & SUPPORTED_TP)
+	if (ethtool_link_mode_test_bit(
+		    ETHTOOL_LINK_MODE_TP_BIT,
+		    link_usettings->link_modes.supported))
 		fprintf(stdout, "TP ");
-	if (mask & SUPPORTED_AUI)
+	if (ethtool_link_mode_test_bit(
+		    ETHTOOL_LINK_MODE_AUI_BIT,
+		    link_usettings->link_modes.supported))
 		fprintf(stdout, "AUI ");
-	if (mask & SUPPORTED_BNC)
+	if (ethtool_link_mode_test_bit(
+		    ETHTOOL_LINK_MODE_BNC_BIT,
+		    link_usettings->link_modes.supported))
 		fprintf(stdout, "BNC ");
-	if (mask & SUPPORTED_MII)
+	if (ethtool_link_mode_test_bit(
+		    ETHTOOL_LINK_MODE_MII_BIT,
+		    link_usettings->link_modes.supported))
 		fprintf(stdout, "MII ");
-	if (mask & SUPPORTED_FIBRE)
+	if (ethtool_link_mode_test_bit(
+		    ETHTOOL_LINK_MODE_FIBRE_BIT,
+		    link_usettings->link_modes.supported))
 		fprintf(stdout, "FIBRE ");
-	if (mask & SUPPORTED_Backplane)
+	if (ethtool_link_mode_test_bit(
+		    ETHTOOL_LINK_MODE_Backplane_BIT,
+		    link_usettings->link_modes.supported))
 		fprintf(stdout, "Backplane ");
 	fprintf(stdout, "]\n");
 
-	dump_link_caps("Supported", "Supports", mask, 0);
+	dump_link_caps("Supported", "Supports",
+		       link_usettings->link_modes.supported, 0);
 }
 
 /* Print link capability flags (supported, advertised or lp_advertised).
  * Assumes that the corresponding SUPPORTED and ADVERTISED flags are equal.
  */
-static void
-dump_link_caps(const char *prefix, const char *an_prefix, u32 mask,
-	       int link_mode_only)
+static void dump_link_caps(const char *prefix, const char *an_prefix,
+			   const u32 *mask, int link_mode_only)
 {
 	static const struct {
 		int same_line; /* print on same line as previous */
-		u32 value;
+		unsigned int bit_index;
 		const char *name;
 	} mode_defs[] = {
-		{ 0, ADVERTISED_10baseT_Half,       "10baseT/Half" },
-		{ 1, ADVERTISED_10baseT_Full,       "10baseT/Full" },
-		{ 0, ADVERTISED_100baseT_Half,      "100baseT/Half" },
-		{ 1, ADVERTISED_100baseT_Full,      "100baseT/Full" },
-		{ 0, ADVERTISED_1000baseT_Half,     "1000baseT/Half" },
-		{ 1, ADVERTISED_1000baseT_Full,     "1000baseT/Full" },
-		{ 0, ADVERTISED_1000baseKX_Full,    "1000baseKX/Full" },
-		{ 0, ADVERTISED_2500baseX_Full,     "2500baseX/Full" },
-		{ 0, ADVERTISED_10000baseT_Full,    "10000baseT/Full" },
-		{ 0, ADVERTISED_10000baseKX4_Full,  "10000baseKX4/Full" },
-		{ 0, ADVERTISED_10000baseKR_Full,   "10000baseKR/Full" },
-		{ 0, ADVERTISED_20000baseMLD2_Full, "20000baseMLD2/Full" },
-		{ 0, ADVERTISED_20000baseKR2_Full,  "20000baseKR2/Full" },
-		{ 0, ADVERTISED_40000baseKR4_Full,  "40000baseKR4/Full" },
-		{ 0, ADVERTISED_40000baseCR4_Full,  "40000baseCR4/Full" },
-		{ 0, ADVERTISED_40000baseSR4_Full,  "40000baseSR4/Full" },
-		{ 0, ADVERTISED_40000baseLR4_Full,  "40000baseLR4/Full" },
-		{ 0, ADVERTISED_56000baseKR4_Full,  "56000baseKR4/Full" },
-		{ 0, ADVERTISED_56000baseCR4_Full,  "56000baseCR4/Full" },
-		{ 0, ADVERTISED_56000baseSR4_Full,  "56000baseSR4/Full" },
-		{ 0, ADVERTISED_56000baseLR4_Full,  "56000baseLR4/Full" },
+		{ 0, ETHTOOL_LINK_MODE_10baseT_Half_BIT,
+		  "10baseT/Half" },
+		{ 1, ETHTOOL_LINK_MODE_10baseT_Full_BIT,
+		  "10baseT/Full" },
+		{ 0, ETHTOOL_LINK_MODE_100baseT_Half_BIT,
+		  "100baseT/Half" },
+		{ 1, ETHTOOL_LINK_MODE_100baseT_Full_BIT,
+		  "100baseT/Full" },
+		{ 0, ETHTOOL_LINK_MODE_1000baseT_Half_BIT,
+		  "1000baseT/Half" },
+		{ 1, ETHTOOL_LINK_MODE_1000baseT_Full_BIT,
+		  "1000baseT/Full" },
+		{ 0, ETHTOOL_LINK_MODE_1000baseKX_Full_BIT,
+		  "1000baseKX/Full" },
+		{ 0, ETHTOOL_LINK_MODE_2500baseX_Full_BIT,
+		  "2500baseX/Full" },
+		{ 0, ETHTOOL_LINK_MODE_10000baseT_Full_BIT,
+		  "10000baseT/Full" },
+		{ 0, ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT,
+		  "10000baseKX4/Full" },
+		{ 0, ETHTOOL_LINK_MODE_10000baseKR_Full_BIT,
+		  "10000baseKR/Full" },
+		{ 0, ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT,
+		  "20000baseMLD2/Full" },
+		{ 0, ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT,
+		  "20000baseKR2/Full" },
+		{ 0, ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT,
+		  "40000baseKR4/Full" },
+		{ 0, ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT,
+		  "40000baseCR4/Full" },
+		{ 0, ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT,
+		  "40000baseSR4/Full" },
+		{ 0, ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT,
+		  "40000baseLR4/Full" },
+		{ 0, ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT,
+		  "56000baseKR4/Full" },
+		{ 0, ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT,
+		  "56000baseCR4/Full" },
+		{ 0, ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT,
+		  "56000baseSR4/Full" },
+		{ 0, ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT,
+		  "56000baseLR4/Full" },
 	};
 	int indent;
 	int did1, new_line_pend, i;
@@ -546,7 +640,8 @@ dump_link_caps(const char *prefix, const char *an_prefix, u32 mask,
 	for (i = 0; i < ARRAY_SIZE(mode_defs); i++) {
 		if (did1 && !mode_defs[i].same_line)
 			new_line_pend = 1;
-		if (mask & mode_defs[i].value) {
+		if (ethtool_link_mode_test_bit(mode_defs[i].bit_index,
+					       mask)) {
 			if (new_line_pend) {
 				fprintf(stdout, "\n");
 				fprintf(stdout, "	%*s", indent, "");
@@ -562,46 +657,52 @@ dump_link_caps(const char *prefix, const char *an_prefix, u32 mask,
 
 	if (!link_mode_only) {
 		fprintf(stdout, "	%s pause frame use: ", prefix);
-		if (mask & ADVERTISED_Pause) {
+		if (ethtool_link_mode_test_bit(
+			    ETHTOOL_LINK_MODE_Pause_BIT, mask)) {
 			fprintf(stdout, "Symmetric");
-			if (mask & ADVERTISED_Asym_Pause)
+			if (ethtool_link_mode_test_bit(
+				    ETHTOOL_LINK_MODE_Asym_Pause_BIT, mask))
 				fprintf(stdout, " Receive-only");
 			fprintf(stdout, "\n");
 		} else {
-			if (mask & ADVERTISED_Asym_Pause)
+			if (ethtool_link_mode_test_bit(
+				    ETHTOOL_LINK_MODE_Asym_Pause_BIT, mask))
 				fprintf(stdout, "Transmit-only\n");
 			else
 				fprintf(stdout, "No\n");
 		}
 
 		fprintf(stdout, "	%s auto-negotiation: ", an_prefix);
-		if (mask & ADVERTISED_Autoneg)
+		if (ethtool_link_mode_test_bit(
+			    ETHTOOL_LINK_MODE_Autoneg_BIT, mask))
 			fprintf(stdout, "Yes\n");
 		else
 			fprintf(stdout, "No\n");
 	}
 }
 
-static int dump_ecmd(struct ethtool_cmd *ep)
+static int
+dump_link_usettings(const struct ethtool_link_usettings *link_usettings)
 {
-	u32 speed;
-
-	dump_supported(ep);
-	dump_link_caps("Advertised", "Advertised", ep->advertising, 0);
-	if (ep->lp_advertising)
+	dump_supported(link_usettings);
+	dump_link_caps("Advertised", "Advertised",
+		       link_usettings->link_modes.advertising, 0);
+	if (!ethtool_link_mode_is_empty(
+		    link_usettings->link_modes.lp_advertising))
 		dump_link_caps("Link partner advertised",
-			       "Link partner advertised", ep->lp_advertising,
-			       0);
+			       "Link partner advertised",
+			       link_usettings->link_modes.lp_advertising, 0);
 
 	fprintf(stdout, "	Speed: ");
-	speed = ethtool_cmd_speed(ep);
-	if (speed == 0 || speed == (u16)(-1) || speed == (u32)(-1))
+	if (link_usettings->base.speed == 0
+	    || link_usettings->base.speed == (u16)(-1)
+	    || link_usettings->base.speed == (u32)(-1))
 		fprintf(stdout, "Unknown!\n");
 	else
-		fprintf(stdout, "%uMb/s\n", speed);
+		fprintf(stdout, "%uMb/s\n", link_usettings->base.speed);
 
 	fprintf(stdout, "	Duplex: ");
-	switch (ep->duplex) {
+	switch (link_usettings->base.duplex) {
 	case DUPLEX_HALF:
 		fprintf(stdout, "Half\n");
 		break;
@@ -609,12 +710,12 @@ static int dump_ecmd(struct ethtool_cmd *ep)
 		fprintf(stdout, "Full\n");
 		break;
 	default:
-		fprintf(stdout, "Unknown! (%i)\n", ep->duplex);
+		fprintf(stdout, "Unknown! (%i)\n", link_usettings->base.duplex);
 		break;
 	};
 
 	fprintf(stdout, "	Port: ");
-	switch (ep->port) {
+	switch (link_usettings->base.port) {
 	case PORT_TP:
 		fprintf(stdout, "Twisted Pair\n");
 		break;
@@ -640,13 +741,13 @@ static int dump_ecmd(struct ethtool_cmd *ep)
 		fprintf(stdout, "Other\n");
 		break;
 	default:
-		fprintf(stdout, "Unknown! (%i)\n", ep->port);
+		fprintf(stdout, "Unknown! (%i)\n", link_usettings->base.port);
 		break;
 	};
 
-	fprintf(stdout, "	PHYAD: %d\n", ep->phy_address);
+	fprintf(stdout, "	PHYAD: %d\n", link_usettings->base.phy_address);
 	fprintf(stdout, "	Transceiver: ");
-	switch (ep->transceiver) {
+	switch (link_usettings->deprecated.transceiver) {
 	case XCVR_INTERNAL:
 		fprintf(stdout, "internal\n");
 		break;
@@ -659,17 +760,18 @@ static int dump_ecmd(struct ethtool_cmd *ep)
 	};
 
 	fprintf(stdout, "	Auto-negotiation: %s\n",
-		(ep->autoneg == AUTONEG_DISABLE) ?
+		(link_usettings->base.autoneg == AUTONEG_DISABLE) ?
 		"off" : "on");
 
-	if (ep->port == PORT_TP) {
+	if (link_usettings->base.port == PORT_TP) {
 		fprintf(stdout, "	MDI-X: ");
-		if (ep->eth_tp_mdix_ctrl == ETH_TP_MDI) {
+		if (link_usettings->base.eth_tp_mdix_ctrl == ETH_TP_MDI) {
 			fprintf(stdout, "off (forced)\n");
-		} else if (ep->eth_tp_mdix_ctrl == ETH_TP_MDI_X) {
+		} else if (link_usettings->base.eth_tp_mdix_ctrl
+			   == ETH_TP_MDI_X) {
 			fprintf(stdout, "on (forced)\n");
 		} else {
-			switch (ep->eth_tp_mdix) {
+			switch (link_usettings->base.eth_tp_mdix) {
 			case ETH_TP_MDI:
 				fprintf(stdout, "off");
 				break;
@@ -680,7 +782,8 @@ static int dump_ecmd(struct ethtool_cmd *ep)
 				fprintf(stdout, "Unknown");
 				break;
 			}
-			if (ep->eth_tp_mdix_ctrl == ETH_TP_MDI_AUTO)
+			if (link_usettings->base.eth_tp_mdix_ctrl
+			    == ETH_TP_MDI_AUTO)
 				fprintf(stdout, " (auto)");
 			fprintf(stdout, "\n");
 		}
@@ -1368,6 +1471,7 @@ static int dump_rxfhash(int fhash, u64 val)
 
 static void dump_eeecmd(struct ethtool_eee *ep)
 {
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(link_mode);
 
 	fprintf(stdout, "	EEE status: ");
 	if (!ep->supported) {
@@ -1389,9 +1493,16 @@ static void dump_eeecmd(struct ethtool_eee *ep)
 	else
 		fprintf(stdout, " disabled\n");
 
-	dump_link_caps("Supported EEE", "", ep->supported, 1);
-	dump_link_caps("Advertised EEE", "", ep->advertised, 1);
-	dump_link_caps("Link partner advertised EEE", "", ep->lp_advertised, 1);
+	ethtool_link_mode_zero(link_mode);
+
+	link_mode[0] = ep->supported;
+	dump_link_caps("Supported EEE", "", link_mode, 1);
+
+	link_mode[0] = ep->advertised;
+	dump_link_caps("Advertised EEE", "", link_mode, 1);
+
+	link_mode[0] = ep->lp_advertised;
+	dump_link_caps("Link partner advertised EEE", "", link_mode, 1);
 }
 
 #define N_SOTS 7
@@ -2247,10 +2358,220 @@ static int do_sfeatures(struct cmd_context *ctx)
 	return 0;
 }
 
-static int do_gset(struct cmd_context *ctx)
+static struct ethtool_link_usettings *
+__do_ioctl_glinksettings(struct cmd_context *ctx)
+{
+	int err;
+	struct {
+		struct ethtool_link_settings req;
+		__u32 link_mode_data[3*__ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32];
+	} ecmd;
+	struct ethtool_link_usettings *link_usettings;
+	unsigned int u32_offs;
+
+	/* Handshake with kernel to determine number of words for link
+	 * mode bitmaps. When requested number of bitmap words is not
+	 * the one expected by kernel, the latter returns the integer
+	 * opposite of what it is expecting. We request length 0 below
+	 * (aka. invalid bitmap length) to get this info.
+	 */
+	memset(&ecmd, 0, sizeof(ecmd));
+	ecmd.req.cmd = ETHTOOL_GLINKSETTINGS;
+	err = send_ioctl(ctx, &ecmd);
+	if (err < 0)
+		return NULL;
+
+	/* see above: we expect a strictly negative value from kernel.
+	 */
+	if (ecmd.req.link_mode_masks_nwords >= 0
+	    || ecmd.req.cmd != ETHTOOL_GLINKSETTINGS)
+		return NULL;
+
+	/* got the real ecmd.req.link_mode_masks_nwords,
+	 * now send the real request
+	 */
+	ecmd.req.cmd = ETHTOOL_GLINKSETTINGS;
+	ecmd.req.link_mode_masks_nwords = -ecmd.req.link_mode_masks_nwords;
+	err = send_ioctl(ctx, &ecmd);
+	if (err < 0)
+		return NULL;
+
+	if (ecmd.req.link_mode_masks_nwords <= 0
+	    || ecmd.req.cmd != ETHTOOL_GLINKSETTINGS)
+		return NULL;
+
+	/* Convert to usettings struct */
+	link_usettings = calloc(1, sizeof(*link_usettings));
+	if (link_usettings == NULL)
+		return NULL;
+
+	/* keep transceiver 0 */
+	memcpy(&link_usettings->base, &ecmd.req, sizeof(link_usettings->base));
+
+	/* copy link mode bitmaps */
+	u32_offs = 0;
+	memcpy(link_usettings->link_modes.supported,
+	       &ecmd.link_mode_data[u32_offs],
+	       4*ecmd.req.link_mode_masks_nwords);
+
+	u32_offs += ecmd.req.link_mode_masks_nwords;
+	memcpy(link_usettings->link_modes.advertising,
+	       &ecmd.link_mode_data[u32_offs],
+	       4*ecmd.req.link_mode_masks_nwords);
+
+	u32_offs += ecmd.req.link_mode_masks_nwords;
+	memcpy(link_usettings->link_modes.lp_advertising,
+	       &ecmd.link_mode_data[u32_offs],
+	       4*ecmd.req.link_mode_masks_nwords);
+
+	return link_usettings;
+}
+
+static int
+__do_ioctl_slinksettings(struct cmd_context *ctx,
+			 const struct ethtool_link_usettings *link_usettings)
+{
+	struct {
+		struct ethtool_link_settings req;
+		__u32 link_mode_data[3*__ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32];
+	} ecmd;
+	unsigned int u32_offs;
+
+	/* refuse to send ETHTOOL_SLINKSETTINGS ioctl if
+	 * link_usettings was retrieved with ETHTOOL_GSET
+	 */
+	if (link_usettings->base.cmd != ETHTOOL_GLINKSETTINGS)
+		return -1;
+
+	/* refuse to send ETHTOOL_SLINKSETTINGS ioctl if deprecated fields
+	 * were set
+	 */
+	if (link_usettings->deprecated.transceiver)
+		return -1;
+
+	if (link_usettings->base.link_mode_masks_nwords <= 0)
+		return -1;
+
+	memcpy(&ecmd.req, &link_usettings->base, sizeof(ecmd.req));
+	ecmd.req.cmd = ETHTOOL_SLINKSETTINGS;
+
+	/* copy link mode bitmaps */
+	u32_offs = 0;
+	memcpy(&ecmd.link_mode_data[u32_offs],
+	       link_usettings->link_modes.supported,
+	       4*ecmd.req.link_mode_masks_nwords);
+
+	u32_offs += ecmd.req.link_mode_masks_nwords;
+	memcpy(&ecmd.link_mode_data[u32_offs],
+	       link_usettings->link_modes.advertising,
+	       4*ecmd.req.link_mode_masks_nwords);
+
+	u32_offs += ecmd.req.link_mode_masks_nwords;
+	memcpy(&ecmd.link_mode_data[u32_offs],
+	       link_usettings->link_modes.lp_advertising,
+	       4*ecmd.req.link_mode_masks_nwords);
+
+	return send_ioctl(ctx, &ecmd);
+}
+
+static struct ethtool_link_usettings *
+__do_ioctl_gset(struct cmd_context *ctx)
 {
 	int err;
 	struct ethtool_cmd ecmd;
+	struct ethtool_link_usettings *link_usettings;
+
+	memset(&ecmd, 0, sizeof(ecmd));
+	ecmd.cmd = ETHTOOL_GSET;
+	err = send_ioctl(ctx, &ecmd);
+	if (err < 0)
+		return NULL;
+
+	link_usettings = calloc(1, sizeof(*link_usettings));
+	if (link_usettings == NULL)
+		return NULL;
+
+	/* remember that ETHTOOL_GSET was used */
+	link_usettings->base.cmd = ETHTOOL_GSET;
+
+	link_usettings->base.link_mode_masks_nwords = 1;
+	link_usettings->link_modes.supported[0] = ecmd.supported;
+	link_usettings->link_modes.advertising[0] = ecmd.advertising;
+	link_usettings->link_modes.lp_advertising[0] = ecmd.lp_advertising;
+	link_usettings->base.speed = ethtool_cmd_speed(&ecmd);
+	link_usettings->base.duplex = ecmd.duplex;
+	link_usettings->base.port = ecmd.port;
+	link_usettings->base.phy_address = ecmd.phy_address;
+	link_usettings->deprecated.transceiver = ecmd.transceiver;
+	link_usettings->base.autoneg = ecmd.autoneg;
+	link_usettings->base.mdio_support = ecmd.mdio_support;
+	/* ignored (fully deprecated): maxrxpkt, maxtxpkt */
+	link_usettings->base.eth_tp_mdix = ecmd.eth_tp_mdix;
+	link_usettings->base.eth_tp_mdix_ctrl = ecmd.eth_tp_mdix_ctrl;
+
+	return link_usettings;
+}
+
+static bool ethtool_link_mode_is_backward_compatible(const u32 *mask)
+{
+	unsigned int i;
+
+	for (i = 1 ; i < __ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32 ; ++i)
+		if (mask[i])
+			return false;
+
+	return true;
+}
+
+static int
+__do_ioctl_sset(struct cmd_context *ctx,
+		const struct ethtool_link_usettings *link_usettings)
+{
+	struct ethtool_cmd ecmd;
+
+	/* refuse to send ETHTOOL_SSET ioctl if link_usettings was
+	 * retrieved with ETHTOOL_GLINKSETTINGS
+	 */
+	if (link_usettings->base.cmd != ETHTOOL_GSET)
+		return -1;
+
+	if (link_usettings->base.link_mode_masks_nwords <= 0)
+		return -1;
+
+	/* refuse to sset if any bit > 31 is set */
+	if (!ethtool_link_mode_is_backward_compatible(
+		    link_usettings->link_modes.supported))
+		return -1;
+	if (!ethtool_link_mode_is_backward_compatible(
+		    link_usettings->link_modes.advertising))
+		return -1;
+	if (!ethtool_link_mode_is_backward_compatible(
+		    link_usettings->link_modes.lp_advertising))
+		return -1;
+
+	memset(&ecmd, 0, sizeof(ecmd));
+	ecmd.cmd = ETHTOOL_SSET;
+
+	ecmd.supported = link_usettings->link_modes.supported[0];
+	ecmd.advertising = link_usettings->link_modes.advertising[0];
+	ecmd.lp_advertising = link_usettings->link_modes.lp_advertising[0];
+	ethtool_cmd_speed_set(&ecmd, link_usettings->base.speed);
+	ecmd.duplex = link_usettings->base.duplex;
+	ecmd.port = link_usettings->base.port;
+	ecmd.phy_address = link_usettings->base.phy_address;
+	ecmd.transceiver = link_usettings->deprecated.transceiver;
+	ecmd.autoneg = link_usettings->base.autoneg;
+	ecmd.mdio_support = link_usettings->base.mdio_support;
+	/* ignored (fully deprecated): maxrxpkt, maxtxpkt */
+	ecmd.eth_tp_mdix = link_usettings->base.eth_tp_mdix;
+	ecmd.eth_tp_mdix_ctrl = link_usettings->base.eth_tp_mdix_ctrl;
+	return send_ioctl(ctx, &ecmd);
+}
+
+static int do_gset(struct cmd_context *ctx)
+{
+	int err;
+	struct ethtool_link_usettings *link_usettings;
 	struct ethtool_wolinfo wolinfo;
 	struct ethtool_value edata;
 	int allfail = 1;
@@ -2260,10 +2581,12 @@ static int do_gset(struct cmd_context *ctx)
 
 	fprintf(stdout, "Settings for %s:\n", ctx->devname);
 
-	ecmd.cmd = ETHTOOL_GSET;
-	err = send_ioctl(ctx, &ecmd);
-	if (err == 0) {
-		err = dump_ecmd(&ecmd);
+	link_usettings = __do_ioctl_glinksettings(ctx);
+	if (link_usettings == NULL)
+		link_usettings = __do_ioctl_gset(ctx);
+	if (link_usettings != NULL) {
+		err = dump_link_usettings(link_usettings);
+		free(link_usettings);
 		if (err)
 			return err;
 		allfail = 0;
@@ -2322,8 +2645,10 @@ static int do_sset(struct cmd_context *ctx)
 	int autoneg_wanted = -1;
 	int phyad_wanted = -1;
 	int xcvr_wanted = -1;
-	int full_advertising_wanted = -1;
-	int advertising_wanted = -1;
+	u32 *full_advertising_wanted = NULL;
+	u32 *advertising_wanted = NULL;
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(_mask_full_advertising_wanted);
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(_mask_advertising_wanted);
 	int gset_changed = 0; /* did anything in GSET change? */
 	u32 wol_wanted = 0;
 	int wol_change = 0;
@@ -2337,7 +2662,7 @@ static int do_sset(struct cmd_context *ctx)
 	int argc = ctx->argc;
 	char **argp = ctx->argp;
 	int i;
-	int err;
+	int err = 0;
 
 	for (i = 0; i < ARRAY_SIZE(flags_msglvl); i++)
 		flag_to_cmdline_info(flags_msglvl[i].name,
@@ -2411,7 +2736,12 @@ static int do_sset(struct cmd_context *ctx)
 			i += 1;
 			if (i >= argc)
 				exit_bad_args();
-			full_advertising_wanted = get_int(argp[i], 16);
+			if (parse_hex_u32_bitmap(
+				    argp[i],
+				    __ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NBITS,
+				    _mask_full_advertising_wanted))
+				exit_bad_args();
+			full_advertising_wanted = _mask_full_advertising_wanted;
 		} else if (!strcmp(argp[i], "phyad")) {
 			gset_changed = 1;
 			i += 1;
@@ -2468,75 +2798,89 @@ static int do_sset(struct cmd_context *ctx)
 		}
 	}
 
-	if (full_advertising_wanted < 0) {
+	if (full_advertising_wanted == NULL) {
 		/* User didn't supply a full advertisement bitfield:
 		 * construct one from the specified speed and duplex.
 		 */
+		int adv_bit = -1;
+
 		if (speed_wanted == SPEED_10 && duplex_wanted == DUPLEX_HALF)
-			advertising_wanted = ADVERTISED_10baseT_Half;
+			adv_bit = ETHTOOL_LINK_MODE_10baseT_Half_BIT;
 		else if (speed_wanted == SPEED_10 &&
 			 duplex_wanted == DUPLEX_FULL)
-			advertising_wanted = ADVERTISED_10baseT_Full;
+			adv_bit = ETHTOOL_LINK_MODE_10baseT_Full_BIT;
 		else if (speed_wanted == SPEED_100 &&
 			 duplex_wanted == DUPLEX_HALF)
-			advertising_wanted = ADVERTISED_100baseT_Half;
+			adv_bit = ETHTOOL_LINK_MODE_100baseT_Half_BIT;
 		else if (speed_wanted == SPEED_100 &&
 			 duplex_wanted == DUPLEX_FULL)
-			advertising_wanted = ADVERTISED_100baseT_Full;
+			adv_bit = ETHTOOL_LINK_MODE_100baseT_Full_BIT;
 		else if (speed_wanted == SPEED_1000 &&
 			 duplex_wanted == DUPLEX_HALF)
-			advertising_wanted = ADVERTISED_1000baseT_Half;
+			adv_bit = ETHTOOL_LINK_MODE_1000baseT_Half_BIT;
 		else if (speed_wanted == SPEED_1000 &&
 			 duplex_wanted == DUPLEX_FULL)
-			advertising_wanted = ADVERTISED_1000baseT_Full;
+			adv_bit = ETHTOOL_LINK_MODE_1000baseT_Full_BIT;
 		else if (speed_wanted == SPEED_2500 &&
 			 duplex_wanted == DUPLEX_FULL)
-			advertising_wanted = ADVERTISED_2500baseX_Full;
+			adv_bit = ETHTOOL_LINK_MODE_2500baseX_Full_BIT;
 		else if (speed_wanted == SPEED_10000 &&
 			 duplex_wanted == DUPLEX_FULL)
-			advertising_wanted = ADVERTISED_10000baseT_Full;
-		else
-			/* auto negotiate without forcing,
-			 * all supported speed will be assigned below
-			 */
-			advertising_wanted = 0;
+			adv_bit = ETHTOOL_LINK_MODE_10000baseT_Full_BIT;
+
+		if (adv_bit >= 0) {
+			advertising_wanted = _mask_advertising_wanted;
+			ethtool_link_mode_zero(advertising_wanted);
+			ethtool_link_mode_set_bit(
+				adv_bit, advertising_wanted);
+		}
+		/* otherwise: auto negotiate without forcing,
+		 * all supported speed will be assigned below
+		 */
 	}
 
 	if (gset_changed) {
-		struct ethtool_cmd ecmd;
+		struct ethtool_link_usettings *link_usettings;
 
-		ecmd.cmd = ETHTOOL_GSET;
-		err = send_ioctl(ctx, &ecmd);
-		if (err < 0) {
+		link_usettings = __do_ioctl_glinksettings(ctx);
+		if (link_usettings == NULL)
+			link_usettings = __do_ioctl_gset(ctx);
+		if (link_usettings == NULL) {
 			perror("Cannot get current device settings");
+			err = -1;
 		} else {
 			/* Change everything the user specified. */
 			if (speed_wanted != -1)
-				ethtool_cmd_speed_set(&ecmd, speed_wanted);
+				link_usettings->base.speed = speed_wanted;
 			if (duplex_wanted != -1)
-				ecmd.duplex = duplex_wanted;
+				link_usettings->base.duplex = duplex_wanted;
 			if (port_wanted != -1)
-				ecmd.port = port_wanted;
+				link_usettings->base.port = port_wanted;
 			if (mdix_wanted != -1) {
 				/* check driver supports MDI-X */
-				if (ecmd.eth_tp_mdix_ctrl != ETH_TP_MDI_INVALID)
-					ecmd.eth_tp_mdix_ctrl = mdix_wanted;
+				if (link_usettings->base.eth_tp_mdix_ctrl
+				    != ETH_TP_MDI_INVALID)
+					link_usettings->base.eth_tp_mdix_ctrl
+						= mdix_wanted;
 				else
-					fprintf(stderr, "setting MDI not supported\n");
+					fprintf(stderr,
+						"setting MDI not supported\n");
 			}
 			if (autoneg_wanted != -1)
-				ecmd.autoneg = autoneg_wanted;
+				link_usettings->base.autoneg = autoneg_wanted;
 			if (phyad_wanted != -1)
-				ecmd.phy_address = phyad_wanted;
+				link_usettings->base.phy_address = phyad_wanted;
 			if (xcvr_wanted != -1)
-				ecmd.transceiver = xcvr_wanted;
+				link_usettings->deprecated.transceiver
+					= xcvr_wanted;
 			/* XXX If the user specified speed or duplex
 			 * then we should mask the advertised modes
 			 * accordingly.  For now, warn that we aren't
 			 * doing that.
 			 */
 			if ((speed_wanted != -1 || duplex_wanted != -1) &&
-			    ecmd.autoneg && advertising_wanted == 0) {
+			    link_usettings->base.autoneg &&
+			    advertising_wanted == NULL) {
 				fprintf(stderr, "Cannot advertise");
 				if (speed_wanted >= 0)
 					fprintf(stderr, " speed %d",
@@ -2548,37 +2892,56 @@ static int do_sset(struct cmd_context *ctx)
 				fprintf(stderr,	"\n");
 			}
 			if (autoneg_wanted == AUTONEG_ENABLE &&
-			    advertising_wanted == 0) {
+			    advertising_wanted == NULL) {
+				unsigned int i;
+
 				/* Auto negotiation enabled, but with
 				 * unspecified speed and duplex: enable all
 				 * supported speeds and duplexes.
 				 */
-				ecmd.advertising =
-					(ecmd.advertising &
-					 ~ALL_ADVERTISED_MODES) |
-					(ALL_ADVERTISED_MODES & ecmd.supported);
+				ethtool_link_mode_for_each_u32(i) {
+					u32 sup = link_usettings->link_modes.supported[i];
+					u32 *adv = link_usettings->link_modes.advertising + i;
+
+					*adv = ((*adv & ~all_advertised_modes[i])
+						| (sup & all_advertised_modes[i]));
+				}
 
 				/* If driver supports unknown flags, we cannot
 				 * be sure that we enable all link modes.
 				 */
-				if ((ecmd.supported & ALL_ADVERTISED_FLAGS) !=
-				    ecmd.supported) {
-					fprintf(stderr, "Driver supports one "
-					        "or more unknown flags\n");
+				ethtool_link_mode_for_each_u32(i) {
+					u32 sup = link_usettings->link_modes.supported[i];
+
+					if ((sup & all_advertised_flags[i]) != sup) {
+						fprintf(stderr, "Driver supports one or more unknown flags\n");
+						break;
+					}
 				}
-			} else if (advertising_wanted > 0) {
+			} else if (advertising_wanted != NULL) {
+				unsigned int i;
+
 				/* Enable all requested modes */
-				ecmd.advertising =
-					(ecmd.advertising &
-					 ~ALL_ADVERTISED_MODES) |
-					advertising_wanted;
-			} else if (full_advertising_wanted > 0) {
-				ecmd.advertising = full_advertising_wanted;
+				ethtool_link_mode_for_each_u32(i) {
+					u32 *adv = link_usettings->link_modes.advertising + i;
+
+					*adv = ((*adv & ~all_advertised_modes[i])
+						| advertising_wanted[i]);
+				}
+			} else if (full_advertising_wanted != NULL) {
+				ethtool_link_mode_copy(
+					link_usettings->link_modes.advertising,
+					full_advertising_wanted);
 			}
 
 			/* Try to perform the update. */
-			ecmd.cmd = ETHTOOL_SSET;
-			err = send_ioctl(ctx, &ecmd);
+			if (link_usettings->base.cmd == ETHTOOL_GLINKSETTINGS)
+				err = __do_ioctl_slinksettings(ctx,
+							       link_usettings);
+			else
+				err = __do_ioctl_sset(ctx,
+						      link_usettings);
+			free(link_usettings);
 			if (err < 0)
 				perror("Cannot set new settings");
 		}
@@ -4230,6 +4593,8 @@ int main(int argc, char **argp)
 	struct cmd_context ctx;
 	int k;
 
+	init_global_link_mode_masks();
+
 	/* Skip command name */
 	argp++;
 	argc--;
diff --git a/internal.h b/internal.h
index e98f532..5e78b5c 100644
--- a/internal.h
+++ b/internal.h
@@ -12,6 +12,7 @@
 #ifdef HAVE_CONFIG_H
 #include "ethtool-config.h"
 #endif
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdint.h>
 #include <stdlib.h>
@@ -122,6 +123,72 @@ static inline int test_bit(unsigned int nr, const unsigned long *addr)
 				 ETH_FLAG_TXVLAN | ETH_FLAG_NTUPLE |	\
 				 ETH_FLAG_RXHASH)
 
+/* internal API for link mode bitmap interaction with kernel. */
+
+#define __ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32	\
+	(SCHAR_MAX)
+#define __ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NBITS	\
+	(32*__ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32)
+#define __ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NBYTES	\
+	(4*__ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32)
+#define __ETHTOOL_DECLARE_LINK_MODE_MASK(name)			\
+	u32 name[__ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32]
+
+struct ethtool_link_usettings {
+	struct {
+		__u8 transceiver;
+	} deprecated;
+	struct ethtool_link_settings base;
+	struct {
+		__ETHTOOL_DECLARE_LINK_MODE_MASK(supported);
+		__ETHTOOL_DECLARE_LINK_MODE_MASK(advertising);
+		__ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertising);
+	} link_modes;
+};
+
+#define ethtool_link_mode_for_each_u32(index)				\
+	for ((index) = 0 ;						\
+	     (index) < __ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32 ;	\
+	     ++(index))
+
+static inline void ethtool_link_mode_zero(u32 *dst)
+{
+	memset(dst, 0, __ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NBYTES);
+}
+
+static inline bool ethtool_link_mode_is_empty(const u32 *mask)
+{
+	unsigned int i;
+
+	ethtool_link_mode_for_each_u32(i)
+	{
+		if (mask[i] != 0)
+			return false;
+	}
+
+	return true;
+}
+
+static inline void ethtool_link_mode_copy(u32 *dst, const u32 *src)
+{
+	memcpy(dst, src, __ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NBYTES);
+}
+
+static inline int ethtool_link_mode_test_bit(unsigned int nr, const u32 *mask)
+{
+	if (nr >= __ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NBITS)
+		return !!0;
+	return !!(mask[nr / 32] & (1 << (nr % 32)));
+}
+
+static inline int ethtool_link_mode_set_bit(unsigned int nr, u32 *mask)
+{
+	if (nr >= __ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NBITS)
+		return -1;
+	mask[nr / 32] |= (1 << (nr % 32));
+	return 0;
+}
+
 /* Context for sub-commands */
 struct cmd_context {
 	const char *devname;	/* net device name */
diff --git a/test-cmdline.c b/test-cmdline.c
index 2fd7cbb..a94edea 100644
--- a/test-cmdline.c
+++ b/test-cmdline.c
@@ -37,7 +37,20 @@ static struct test_case {
 	{ 1, "--change devname autoneg foo" },
 	{ 1, "-s devname autoneg" },
 	{ 0, "--change devname advertise 0x1" },
+	{ 0, "--change devname advertise 0xf" },
+	{ 0, "--change devname advertise 0Xf" },
+	{ 0, "--change devname advertise 1" },
+	{ 0, "--change devname advertise f" },
+	{ 0, "--change devname advertise 01" },
+	{ 0, "--change devname advertise 0f" },
+	{ 0, "--change devname advertise 0xfffffffffffffffffffffffffffffffff" },
+	{ 0, "--change devname advertise fffffffffffffffffffffffffffffffff" },
+	{ 0, "--change devname advertise 0x0000fffffffffffffffffffffffffffff" },
+	{ 0, "--change devname advertise 0000fffffffffffffffffffffffffffff" },
+	{ 1, "-s devname advertise" },
+	{ 1, "-s devname advertise 0x" },
 	{ 1, "-s devname advertise foo" },
+	{ 1, "-s devname advertise 0xfoo" },
 	{ 1, "--change devname advertise" },
 	{ 0, "-s devname phyad 1" },
 	{ 1, "--change devname phyad foo" },
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* [ethtool PATCH v7 0/2] add support for new ETHTOOL_xLINKSETTINGS ioctls
From: David Decotigny @ 2016-04-22 16:48 UTC (permalink / raw)
  To: netdev
  Cc: Jeff Garzik, Ben Hutchings, David Miller, Vidya Sagar Ravipati,
	Joe Perches, David Decotigny

From: David Decotigny <decot@googlers.com>

[ re-sending this series, same v7 as the one previously sent ]

History:
  v7
    added ref to related kernel commit in netlink ioctl patch description
  v6
    re-added last patch, to use AF_NETLINK when AF_INET not available
  v5
    rebased main patch, removed last patch "use AF_LOCAL when AF_INET
    not available"
  v4
    review Ben Hutchings:
      using AF_UNIX instead of INET6 in the absence of v4 sockets
      use stdbool.h
      do_seeprom always fails when offset/length out of bounds
      sync to latest ethtool.h + kernel.h from net-next
      __SANE_USERSPACE_TYPES__ always defined
      cosmetic updates for var == const tests
      cosmetic updates for associativity in tests
  v3
    TRUE/FALSE obvious-ification
  v2
    added do_seeprom patch
    added netdev <at>  as recipient
  v1
    initial submission

############################################
# Patch Set Summary:

David Decotigny (2):
  ethtool.c: add support for ETHTOOL_xLINKSETTINGS ioctls
  ethtool: use netlink socket when AF_INET not available

 configure.ac   |   2 +-
 ethtool.c      | 688 ++++++++++++++++++++++++++++++++++++++++++++-------------
 internal.h     |  67 ++++++
 test-cmdline.c |  13 ++
 4 files changed, 611 insertions(+), 159 deletions(-)

-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply

* Re: [PATCH iproute2 0/4] add MACsec support
From: Stephen Hemminger @ 2016-04-22 16:30 UTC (permalink / raw)
  To: Sabrina Dubroca; +Cc: netdev
In-Reply-To: <20160422161031.GA20140@bistromath.localdomain>

On Fri, 22 Apr 2016 18:10:31 +0200
Sabrina Dubroca <sd@queasysnail.net> wrote:

> 2016-04-14, 15:01:06 +0200, Sabrina Dubroca wrote:
> > This series introduces support for MACsec devices, with a new device
> > type for `ip link`, and a new `ip macsec` subcommand.
> > 
> > The first three patches introduce some necessary helper functions.
> 
> Hi Stephen,
> 
> Please don't apply this yet, I'm updating uapi:
> http://marc.info/?l=linux-netdev&m=146131733302850&w=2
> 
> Thanks,

Sure, was holding for a couple weeks of review anyway.

^ permalink raw reply

* [PATCH] net: ipv6: Do not fix up linklocal and loopback addresses
From: Mike Manning @ 2016-04-22 16:14 UTC (permalink / raw)
  To: netdev
In-Reply-To: <571A4D11.8070307@brocade.com>

commit f1705ec197e7 ("net: ipv6: Make address flushing on ifdown 
optional") added the option to retain user configured addresses on an
admin down. A comment to one of the later revisions suggested using
the IFA_F_PERMANENT flag rather than adding a user_managed boolean to
the ifaddr struct. A side effect of this change is that link local and
loopback addresses were also retained which was not part of the
objective of the original changes. The commit 70af921db6f8 ("net: ipv6: 
Do not keep linklocal and loopback addresses") ensures that these are
no longer kept. Similarly, the present fix ensures that these addresses
are not incorrectly fixed up. 

Testing also with the recent patch in place to delete host routes on
ifdown still shows that fixup of the LL & loopback addrs is incorrectly
being attempted (and without that patch was causing a crash in fib6).
Another approach to this is through code inspection by checking where
the user_managed flag in the original dahern patch (which we have been
using for nearly a year) has been replaced by checking IFA_F_PERMANENT.

Fixes: f1705ec197e7 ("net: ipv6: Make address flushing on ifdown optional")
Signed-off-by: Mike Manning <mmanning@brocade.com>
---
 net/ipv6/addrconf.c |   13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 23cec53..cba4e10 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3200,6 +3200,12 @@ static void l3mdev_check_host_rt(struct inet6_dev *idev,
 }
 #endif
 
+static bool addr_is_local(const struct in6_addr *addr)
+{
+	return ipv6_addr_type(addr) &
+		(IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
+}
+
 static int fixup_permanent_addr(struct inet6_dev *idev,
 				struct inet6_ifaddr *ifp)
 {
@@ -3238,6 +3244,7 @@ static void addrconf_permanent_addr(struct net_device *dev)
 
 	list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) {
 		if ((ifp->flags & IFA_F_PERMANENT) &&
+		    !addr_is_local(&ifp->addr) &&
 		    fixup_permanent_addr(idev, ifp) < 0) {
 			write_unlock_bh(&idev->lock);
 			ipv6_del_addr(ifp);
@@ -3448,12 +3455,6 @@ static void addrconf_type_change(struct net_device *dev, unsigned long event)
 		ipv6_mc_unmap(idev);
 }
 
-static bool addr_is_local(const struct in6_addr *addr)
-{
-	return ipv6_addr_type(addr) &
-		(IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
-}
-
 static int addrconf_ifdown(struct net_device *dev, int how)
 {
 	struct net *net = dev_net(dev);
-- 
1.7.10.4

^ permalink raw reply related

* Re: [PATCH net 1/1] net: fec: update dirty_tx even if no skb
From: Troy Kisky @ 2016-04-22 16:11 UTC (permalink / raw)
  To: Fugang Duan, netdev@vger.kernel.org, davem@davemloft.net,
	lznuaa@gmail.com
  Cc: Fabio Estevam, l.stach@pengutronix.de, andrew@lunn.ch,
	tremyfr@gmail.com, gerg@uclinux.org,
	linux-arm-kernel@lists.infradead.org, johannes@sipsolutions.net,
	stillcompiling@gmail.com, sergei.shtylyov@cogentembedded.com,
	arnd@arndb.de, holgerschurig@gmail.com
In-Reply-To: <VI1PR0401MB18558325A80D252D2EC64A7BFF6F0@VI1PR0401MB1855.eurprd04.prod.outlook.com>

On 4/21/2016 10:59 PM, Fugang Duan wrote:
> From: Troy Kisky <troy.kisky@boundarydevices.com> Sent: Friday, April 22, 2016 10:01 AM
>> To: netdev@vger.kernel.org; davem@davemloft.net; Fugang Duan
>> <fugang.duan@nxp.com>; lznuaa@gmail.com
>> Cc: Fabio Estevam <fabio.estevam@nxp.com>; l.stach@pengutronix.de;
>> andrew@lunn.ch; tremyfr@gmail.com; gerg@uclinux.org; linux-arm-
>> kernel@lists.infradead.org; johannes@sipsolutions.net;
>> stillcompiling@gmail.com; sergei.shtylyov@cogentembedded.com;
>> arnd@arndb.de; holgerschurig@gmail.com; Troy Kisky
>> <troy.kisky@boundarydevices.com>
>> Subject: [PATCH net 1/1] net: fec: update dirty_tx even if no skb
>>
>> If dirty_tx isn't updated, then dma_unmap_single will be called twice.
>>
>> This fixes a
>> [   58.420980] ------------[ cut here ]------------
>> [   58.425667] WARNING: CPU: 0 PID: 377 at /home/schurig/d/mkarm/linux-
>> 4.5/lib/dma-debug.c:1096 check_unmap+0x9d0/0xab8()
>> [   58.436405] fec 2188000.ethernet: DMA-API: device driver tries to free DMA
>> memory it has not allocated [device address=0x0000000000000000] [size=66
>> bytes]
>>
>> encountered by Holger
>>
>> Signed-off-by: Troy Kisky <troy.kisky@boundarydevices.com>
>> Tested-by: <holgerschurig@gmail.com>
>> ---
>>  drivers/net/ethernet/freescale/fec_main.c | 8 +++-----
>>  1 file changed, 3 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/freescale/fec_main.c
>> b/drivers/net/ethernet/freescale/fec_main.c
>> index 08243c2..b71654c 100644
>> --- a/drivers/net/ethernet/freescale/fec_main.c
>> +++ b/drivers/net/ethernet/freescale/fec_main.c
>> @@ -1197,10 +1197,8 @@ fec_enet_tx_queue(struct net_device *ndev, u16
>> queue_id)
>>  					 fec16_to_cpu(bdp->cbd_datlen),
>>  					 DMA_TO_DEVICE);
>>  		bdp->cbd_bufaddr = cpu_to_fec32(0);
>> -		if (!skb) {
>> -			bdp = fec_enet_get_nextdesc(bdp, &txq->bd);
>> -			continue;
>> -		}
>> +		if (!skb)
>> +			goto skb_done;
>>
>>  		/* Check for errors. */
>>  		if (status & (BD_ENET_TX_HB | BD_ENET_TX_LC | @@ -1239,7
>> +1237,7 @@ fec_enet_tx_queue(struct net_device *ndev, u16 queue_id)
>>
>>  		/* Free the sk buffer associated with this last transmit */
>>  		dev_kfree_skb_any(skb);
>> -
>> +skb_done:
>>  		/* Make sure the update to bdp and tx_skbuff are performed
>>  		 * before dirty_tx
>>  		 */
>> --
>> 2.5.0
> 
> The patch is fine for me.
> Can you review below patch that also fix the issue. It can take much effort due to less rmb() and READ_ONCE() operation that is very sensitive for duplex Gbps test for i.MX6SX/i.MX7d SOC. (i.MX6SX can reach at 1.4Gbps, i.MX7D can reach at 1.8Gbps.)



If  "READ_ONCE(bdp->cbd_sc)" is really that expensive, then you should skip the 1st read as well and
only do it after skb presence is verified. But some numbers would really help sell the patch.
Also, I comment as to why the code is reorganized would be warranted


> 
> --- a/drivers/net/ethernet/freescale/fec_main.c
> +++ b/drivers/net/ethernet/freescale/fec_main.c
> @@ -1160,12 +1160,13 @@ static void
>  fec_enet_tx_queue(struct net_device *ndev, u16 queue_id)
>  {
>         struct  fec_enet_private *fep;
> -       struct bufdesc *bdp;
> +       struct bufdesc *bdp, *bdp_t;
>         unsigned short status;
>         struct  sk_buff *skb;
>         struct fec_enet_priv_tx_q *txq;
>         struct netdev_queue *nq;
>         int     index = 0;
> +       int     i, bdnum;
>         int     entries_free;
> 
>         fep = netdev_priv(ndev);
> @@ -1187,20 +1188,28 @@ fec_enet_tx_queue(struct net_device *ndev, u16 queue_id)
>                 if (status & BD_ENET_TX_READY)
>                         break;
> 
> -               index = fec_enet_get_bd_index(bdp, &txq->bd);
> -
> +               bdp_t = bdp;
> +               bdnum = 1;
> +               index = fec_enet_get_bd_index(txq->tx_bd_base, bdp_t, fep);
>                 skb = txq->tx_skbuff[index];
> -               txq->tx_skbuff[index] = NULL;
> -               if (!IS_TSO_HEADER(txq, fec32_to_cpu(bdp->cbd_bufaddr)))
> -                       dma_unmap_single(&fep->pdev->dev,
> -                                        fec32_to_cpu(bdp->cbd_bufaddr),
> -                                        fec16_to_cpu(bdp->cbd_datlen),
> -                                        DMA_TO_DEVICE);
> -               bdp->cbd_bufaddr = cpu_to_fec32(0);
> -               if (!skb) {
> -                       bdp = fec_enet_get_nextdesc(bdp, &txq->bd);
> -                       continue;
> +               while (!skb) {
> +                       bdp_t = fec_enet_get_nextdesc(bdp_t, &txq->bd);
> +                       index = fec_enet_get_bd_index(txq->tx_bd_base, bdp_t, fep);
> +                       skb = txq->tx_skbuff[index];
> +                       bdnum++;
> +               }
> +               status = fec16_to_cpu(READ_ONCE(bdp->cbd_sc));
> +               if (status & BD_ENET_TX_READY)
> +                       break;
> +
> +               for (i = 0; i < bdnum; i++) {
> +                       if (!IS_TSO_HEADER(txq, bdp->cbd_bufaddr))
> +                               dma_unmap_single(&fep->pdev->dev, bdp->cbd_bufaddr,
> +                                                bdp->cbd_datlen, DMA_TO_DEVICE);
> +                       bdp->cbd_bufaddr = 0;
> +                       if (i < bdnum - 1)
> +                               bdp = fec_enet_get_nextdesc(bdp, &txq->bd);
>                 }
> +               txq->tx_skbuff[index] = NULL;
> 
> 
> Regards,
> Andy
> 

^ permalink raw reply

* Re: [PATCH iproute2 0/4] add MACsec support
From: Sabrina Dubroca @ 2016-04-22 16:10 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger
In-Reply-To: <cover.1460622809.git.sd@queasysnail.net>

2016-04-14, 15:01:06 +0200, Sabrina Dubroca wrote:
> This series introduces support for MACsec devices, with a new device
> type for `ip link`, and a new `ip macsec` subcommand.
> 
> The first three patches introduce some necessary helper functions.

Hi Stephen,

Please don't apply this yet, I'm updating uapi:
http://marc.info/?l=linux-netdev&m=146131733302850&w=2

Thanks,

-- 
Sabrina

^ permalink raw reply

* IGMPv3 and IGMPv2 interoperability - LEAVE packets are not sent on v3->v2 fallback
From: Richard @ 2016-04-22 16:03 UTC (permalink / raw)
  To: netdev

Hi there,

I have noticed some weird behaviour on IGMPv2 and IGMPv3.   The short
if it is, I can JOIN a multicast group normally (defaulting to v3) -
this works nicely until a IGMPv2 device is discovered on my network.
After that point, the LEAVE command for the group does not go out on
the wire and my router does not stop sending the data.

Digging down, I can see in IGMP.C that v2 has a variable 'reporter'
that does not seem to be used on IGMPv3 - and since the JOIN was on
v3, this value is Zero. To get the messages going out, this variable
needs to be non-zero.

To reproduce,

Boot Linux
Join a Multicast group (Wrireshark will show the packet if you are sniffing)
Force to use IGMPv2 (using echo "2" /proc/ .. etc)
Leave Multicast Group

At the leave point, nothing is sent over the wire, and the
subscription is removed from the list. For an experiment, I set
im->reporter=1 in the _group_add() function in the case for IGMPv3;
and this did allow the message to be sent after a v3->v2 fallback on
LEAVE; but there seems to be quite a lot of differences and I am not
so sure about the timers still working.

Any hints would be appreciated.
Regards,
Richard

^ permalink raw reply

* [RFC PATCH net] ipv6/ila: fix nlsize calculation for lwtunnel
From: Nicolas Dichtel @ 2016-04-22 15:58 UTC (permalink / raw)
  To: davem; +Cc: netdev, Nicolas Dichtel, Tom Herbert

The handler 'ila_fill_encap_info' adds one attribute: ILA_ATTR_LOCATOR.

Fixes: 65d7ab8de582 ("net: Identifier Locator Addressing module")
CC: Tom Herbert <tom@herbertland.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---

Tom, when I read the comment, I feel I'm misssing something, but what?

 net/ipv6/ila/ila_lwt.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index 2ae3c4fd8aab..41f18de5dcc2 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -120,8 +120,7 @@ nla_put_failure:
 
 static int ila_encap_nlsize(struct lwtunnel_state *lwtstate)
 {
-	/* No encapsulation overhead */
-	return 0;
+	return nla_total_size(sizeof(u64)); /* ILA_ATTR_LOCATOR */
 }
 
 static int ila_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
-- 
2.8.1

^ permalink raw reply related

* Re: [PATCH net v2 2/3] drivers: net: cpsw: fix error messages when using phy-handle DT property
From: David Rivshin (Allworx) @ 2016-04-22 15:45 UTC (permalink / raw)
  To: Grygorii Strashko, Rob Herring
  Cc: netdev, linux-omap, linux-arm-kernel, devicetree, linux-kernel,
	David Miller, Mugunthan V N, Andrew Goodbody, Markus Brunner,
	Nicolas Chauvet
In-Reply-To: <571A2126.2060708@ti.com>

On Fri, 22 Apr 2016 16:03:34 +0300
Grygorii Strashko <grygorii.strashko@ti.com> wrote:

> On 04/21/2016 09:26 PM, David Rivshin (Allworx) wrote:
> > From: David Rivshin <drivshin@allworx.com>
> > 
> > The phy-handle, phy_id, and fixed-link properties are mutually exclusive,
> > and only one need be specified. However if phy-handle was specified, an
> > error message would complain about the lack of phy_id or fixed-link.
> > 
> > Also, if phy-handle was specified and the subsequent of_phy_connect()
> > failed, the error message still referenced slaved->data->phy_id, which
> > would be empty. Instead, use the name of the device_node as a useful
> > identifier.
> > 
> > Fixes: 9e42f715264f ("drivers: net: cpsw: add phy-handle parsing")
> > Signed-off-by: David Rivshin <drivshin@allworx.com>
> > Acked-by: Rob Herring <robh@kernel.org>
> > Tested-by: Nicolas Chauvet <kwizart@gmail.com>
> > ---
> > If would like this for -stable it should apply cleanly as far back
> > as 4.5. It failes on 4.4 due to some context differences, but can be
> > applied with 'git am -C2'. Or, I can produce a separate patch against
> > linux-4.4.y if preferred.
> > 
> > Changes since v1 [1]:
> > - Rebased (no conflicts)
> > - Added Tested-by from Nicolas Chauvet
> > - Added Acked-by from Rob Herring for the binding change
> > 
> > [1] https://patchwork.ozlabs.org/patch/560324/
> > 
> >   Documentation/devicetree/bindings/net/cpsw.txt |  4 ++--
> >   drivers/net/ethernet/ti/cpsw.c                 | 17 +++++++++++++----
> >   2 files changed, 15 insertions(+), 6 deletions(-)
> > 
> > diff --git a/Documentation/devicetree/bindings/net/cpsw.txt b/Documentation/devicetree/bindings/net/cpsw.txt
> > index 28a4781..3033c0f 100644
> > --- a/Documentation/devicetree/bindings/net/cpsw.txt
> > +++ b/Documentation/devicetree/bindings/net/cpsw.txt
> > @@ -46,16 +46,16 @@ Optional properties:
> >   - dual_emac_res_vlan	: Specifies VID to be used to segregate the ports
> >   - mac-address		: See ethernet.txt file in the same directory
> >   - phy_id		: Specifies slave phy id  
> 
> May be the "phy_id" can be marked as deprecated? (while here)
> The recommended property now is "phy-handle".

I can certainly do that. Perhaps something like this?
 - phy_id		: Specifies slave phy id (deprecated, use phy-handle)

Rob, would you have any issues with bundling that?

> 
> >   - phy-handle		: See ethernet.txt file in the same directory
> >   
> >   Slave sub-nodes:
> >   - fixed-link		: See fixed-link.txt file in the same directory
> > -			  Either the property phy_id, or the sub-node
> > -			  fixed-link can be specified
> > +
> > +Note: Exactly one of phy_id, phy-handle, or fixed-link must be specified.
> >   
> >   Note: "ti,hwmods" field is used to fetch the base address and irq
> >   resources from TI, omap hwmod data base during device registration.
> >   Future plan is to migrate hwmod data base contents into device tree
> >   blob so that, all the required data will be used from device tree dts
> >   file.
> >   
> > diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
> > index d69cb3f..3c81413 100644
> > --- a/drivers/net/ethernet/ti/cpsw.c
> > +++ b/drivers/net/ethernet/ti/cpsw.c
> > @@ -1150,16 +1150,19 @@ static void cpsw_slave_open(struct cpsw_slave *slave, struct cpsw_priv *priv)
> >   	if (slave->data->phy_node)
> >   		slave->phy = of_phy_connect(priv->ndev, slave->data->phy_node,
> >   				 &cpsw_adjust_link, 0, slave->data->phy_if);
> >   	else
> >   		slave->phy = phy_connect(priv->ndev, slave->data->phy_id,
> >   				 &cpsw_adjust_link, slave->data->phy_if);
> >   	if (IS_ERR(slave->phy)) {
> > -		dev_err(priv->dev, "phy %s not found on slave %d\n",
> > -			slave->data->phy_id, slave->slave_num);
> > +		dev_err(priv->dev, "phy \"%s\" not found on slave %d\n",
> > +			slave->data->phy_node ?
> > +				slave->data->phy_node->full_name :
> > +				slave->data->phy_id,
> > +			slave->slave_num);  
> 
> Unfortunately,  there are some inconsistency between legacy and FDT API :(
> of_phy_connect() will return valid phy_device or NULL, but phy_connect()
> can return valid phy_device or ERR_PTR().

Good catch, I hadn't noticed that. It looks like that's actually a more 
serious (pre-existing) bug: if of_phy_connect() returns NULL, we'd end 
up dereferencing it and pagefaulting.

How about moving the IS_ERR() check into the phy_connect() case like this:
	if (slave->data->phy_node) {
		slave->phy = of_phy_connect(priv->ndev, slave->data->phy_node,
				 &cpsw_adjust_link, 0, slave->data->phy_if);
	} else {
		slave->phy = phy_connect(priv->ndev, slave->data->phy_id,
				 &cpsw_adjust_link, slave->data->phy_if);
		if (IS_ERR(slave->phy))
			slave->phy = NULL;
	}
	if (!slave->phy) {
		dev_err(priv->dev, "phy \"%s\" not found on slave %d\n",
			slave->data->phy_node ?
				slave->data->phy_node->full_name :
				slave->data->phy_id,
			slave->slave_num);  
	} else {

Since you say the phy_id case is deprecated anyways, I'm not too concerned
about not printing the error code returned by phy_connect() in that case
(especially since it never did so in the past). That lets us still avoid
duplicating the dev_err() itself.


> 
> 
> 
> >   		slave->phy = NULL;
> >   	} else {
> >   		phy_attached_info(slave->phy);
> >   
> >   		phy_start(slave->phy);
> >   
> >   		/* Configure GMII_SEL register */
> > @@ -2030,15 +2033,19 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data,
> >   		/* This is no slave child node, continue */
> >   		if (strcmp(slave_node->name, "slave"))
> >   			continue;
> >   
> >   		slave_data->phy_node = of_parse_phandle(slave_node,
> >   							"phy-handle", 0);
> >   		parp = of_get_property(slave_node, "phy_id", &lenp);
> > -		if (of_phy_is_fixed_link(slave_node)) {
> > +		if (slave_data->phy_node) {
> > +			dev_dbg(&pdev->dev,
> > +				"slave[%d] using phy-handle=\"%s\"\n",
> > +				i, slave_data->phy_node->full_name);
> > +		} else if (of_phy_is_fixed_link(slave_node)) {
> >   			struct device_node *phy_node;
> >   			struct phy_device *phy_dev;
> >   
> >   			/* In the case of a fixed PHY, the DT node associated
> >   			 * to the PHY is the Ethernet MAC DT node.
> >   			 */
> >   			ret = of_phy_register_fixed_link(slave_node);
> > @@ -2067,15 +2074,17 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data,
> >   			if (!mdio) {
> >   				dev_err(&pdev->dev, "Missing mdio platform device\n");
> >   				return -EINVAL;
> >   			}
> >   			snprintf(slave_data->phy_id, sizeof(slave_data->phy_id),
> >   				 PHY_ID_FMT, mdio->name, phyid);
> >   		} else {
> > -			dev_err(&pdev->dev, "No slave[%d] phy_id or fixed-link property\n", i);
> > +			dev_err(&pdev->dev,
> > +				"No slave[%d] phy_id, phy-handle, or fixed-link property\n",
> > +				i);
> >   			goto no_phy_slave;
> >   		}
> >   		slave_data->phy_if = of_get_phy_mode(slave_node);
> >   		if (slave_data->phy_if < 0) {
> >   			dev_err(&pdev->dev, "Missing or malformed slave[%d] phy-mode property\n",
> >   				i);
> >   			return slave_data->phy_if;
> >   
> 
> 

^ permalink raw reply

* [PATCH net-next 9/9] taskstats: use the libnl API to align nlattr on 64-bit
From: Nicolas Dichtel @ 2016-04-22 15:31 UTC (permalink / raw)
  To: netdev
  Cc: davem, linux-kernel, linux-wpan, aar, pablo, kaber, kadlec,
	pshelar, kuznet, jmorris, yoshfuji, netfilter-devel, dev,
	steffen.klassert, herbert, bsingharora, Nicolas Dichtel
In-Reply-To: <1461339084-3849-1-git-send-email-nicolas.dichtel@6wind.com>

Goal of this patch is to use the new libnl API to align netlink attribute
when needed.
The layout of the netlink message will be a bit different after the patch,
because the padattr (TASKSTATS_TYPE_STATS) will be inside the nested
attribute instead of before it.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 kernel/taskstats.c | 37 +++++--------------------------------
 1 file changed, 5 insertions(+), 32 deletions(-)

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 21f82c29c914..b3f05ee20d18 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -357,10 +357,6 @@ static int parse(struct nlattr *na, struct cpumask *mask)
 	return ret;
 }
 
-#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
-#define TASKSTATS_NEEDS_PADDING 1
-#endif
-
 static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
 {
 	struct nlattr *na, *ret;
@@ -370,29 +366,6 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
 			? TASKSTATS_TYPE_AGGR_PID
 			: TASKSTATS_TYPE_AGGR_TGID;
 
-	/*
-	 * The taskstats structure is internally aligned on 8 byte
-	 * boundaries but the layout of the aggregrate reply, with
-	 * two NLA headers and the pid (each 4 bytes), actually
-	 * force the entire structure to be unaligned. This causes
-	 * the kernel to issue unaligned access warnings on some
-	 * architectures like ia64. Unfortunately, some software out there
-	 * doesn't properly unroll the NLA packet and assumes that the start
-	 * of the taskstats structure will always be 20 bytes from the start
-	 * of the netlink payload. Aligning the start of the taskstats
-	 * structure breaks this software, which we don't want. So, for now
-	 * the alignment only happens on architectures that require it
-	 * and those users will have to update to fixed versions of those
-	 * packages. Space is reserved in the packet only when needed.
-	 * This ifdef should be removed in several years e.g. 2012 once
-	 * we can be confident that fixed versions are installed on most
-	 * systems. We add the padding before the aggregate since the
-	 * aggregate is already a defined type.
-	 */
-#ifdef TASKSTATS_NEEDS_PADDING
-	if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0)
-		goto err;
-#endif
 	na = nla_nest_start(skb, aggr);
 	if (!na)
 		goto err;
@@ -401,7 +374,8 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
 		nla_nest_cancel(skb, na);
 		goto err;
 	}
-	ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
+	ret = nla_reserve_64bit(skb, TASKSTATS_TYPE_STATS,
+				sizeof(struct taskstats), TASKSTATS_TYPE_NULL);
 	if (!ret) {
 		nla_nest_cancel(skb, na);
 		goto err;
@@ -500,10 +474,9 @@ static size_t taskstats_packet_size(void)
 	size_t size;
 
 	size = nla_total_size(sizeof(u32)) +
-		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
-#ifdef TASKSTATS_NEEDS_PADDING
-	size += nla_total_size(0); /* Padding for alignment */
-#endif
+		nla_total_size_64bit(sizeof(struct taskstats)) +
+		nla_total_size(0);
+
 	return size;
 }
 
-- 
2.8.1

^ permalink raw reply related

* [PATCH net-next 8/9] xfrm: align nlattr properly when needed
From: Nicolas Dichtel @ 2016-04-22 15:31 UTC (permalink / raw)
  To: netdev
  Cc: davem, linux-kernel, linux-wpan, aar, pablo, kaber, kadlec,
	pshelar, kuznet, jmorris, yoshfuji, netfilter-devel, dev,
	steffen.klassert, herbert, bsingharora, Nicolas Dichtel
In-Reply-To: <1461339084-3849-1-git-send-email-nicolas.dichtel@6wind.com>

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 include/uapi/linux/xfrm.h |  1 +
 net/xfrm/xfrm_user.c      | 10 ++++++----
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h
index 2cd9e608d0d1..143338978b48 100644
--- a/include/uapi/linux/xfrm.h
+++ b/include/uapi/linux/xfrm.h
@@ -302,6 +302,7 @@ enum xfrm_attr_type_t {
 	XFRMA_SA_EXTRA_FLAGS,	/* __u32 */
 	XFRMA_PROTO,		/* __u8 */
 	XFRMA_ADDRESS_FILTER,	/* struct xfrm_address_filter */
+	XFRMA_PAD,
 	__XFRMA_MAX
 
 #define XFRMA_MAX (__XFRMA_MAX - 1)
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 2cc7af858c6f..d516845e16e3 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -809,7 +809,8 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
 			goto out;
 	}
 	if (x->lastused) {
-		ret = nla_put_u64(skb, XFRMA_LASTUSED, x->lastused);
+		ret = nla_put_u64_64bit(skb, XFRMA_LASTUSED, x->lastused,
+					XFRMA_PAD);
 		if (ret)
 			goto out;
 	}
@@ -1813,7 +1814,7 @@ static inline size_t xfrm_aevent_msgsize(struct xfrm_state *x)
 
 	return NLMSG_ALIGN(sizeof(struct xfrm_aevent_id))
 	       + nla_total_size(replay_size)
-	       + nla_total_size(sizeof(struct xfrm_lifetime_cur))
+	       + nla_total_size_64bit(sizeof(struct xfrm_lifetime_cur))
 	       + nla_total_size(sizeof(struct xfrm_mark))
 	       + nla_total_size(4) /* XFRM_AE_RTHR */
 	       + nla_total_size(4); /* XFRM_AE_ETHR */
@@ -1848,7 +1849,8 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct
 	}
 	if (err)
 		goto out_cancel;
-	err = nla_put(skb, XFRMA_LTIME_VAL, sizeof(x->curlft), &x->curlft);
+	err = nla_put_64bit(skb, XFRMA_LTIME_VAL, sizeof(x->curlft), &x->curlft,
+			    XFRMA_PAD);
 	if (err)
 		goto out_cancel;
 
@@ -2617,7 +2619,7 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x)
 		l += nla_total_size(sizeof(x->props.extra_flags));
 
 	/* Must count x->lastused as it may become non-zero behind our back. */
-	l += nla_total_size(sizeof(u64));
+	l += nla_total_size_64bit(sizeof(u64));
 
 	return l;
 }
-- 
2.8.1

^ permalink raw reply related

* [PATCH net-next 1/9] libnl: fix help of _64bit functions
From: Nicolas Dichtel @ 2016-04-22 15:31 UTC (permalink / raw)
  To: netdev
  Cc: davem, linux-kernel, linux-wpan, aar, pablo, kaber, kadlec,
	pshelar, kuznet, jmorris, yoshfuji, netfilter-devel, dev,
	steffen.klassert, herbert, bsingharora, Nicolas Dichtel
In-Reply-To: <1461339084-3849-1-git-send-email-nicolas.dichtel@6wind.com>

Fix typo and describe 'padattr'.

Fixes: 089bf1a6a924 ("libnl: add more helpers to align attributes on 64-bit")
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 lib/nlattr.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/lib/nlattr.c b/lib/nlattr.c
index 2b82f1e2ebc2..fce1e9afc6d9 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -359,10 +359,11 @@ EXPORT_SYMBOL(__nla_reserve);
  * @skb: socket buffer to reserve room on
  * @attrtype: attribute type
  * @attrlen: length of attribute payload
+ * @padattr: attribute type for the padding
  *
  * Adds a netlink attribute header to a socket buffer and reserves
  * room for the payload but does not copy it. It also ensure that this
- * attribute will be 64-bit aign.
+ * attribute will have a 64-bit aligned nla_data() area.
  *
  * The caller is responsible to ensure that the skb provides enough
  * tailroom for the attribute header and payload.
@@ -424,10 +425,11 @@ EXPORT_SYMBOL(nla_reserve);
  * @skb: socket buffer to reserve room on
  * @attrtype: attribute type
  * @attrlen: length of attribute payload
+ * @padattr: attribute type for the padding
  *
  * Adds a netlink attribute header to a socket buffer and reserves
  * room for the payload but does not copy it. It also ensure that this
- * attribute will be 64-bit aign.
+ * attribute will have a 64-bit aligned nla_data() area.
  *
  * Returns NULL if the tailroom of the skb is insufficient to store
  * the attribute header and payload.
@@ -493,6 +495,7 @@ EXPORT_SYMBOL(__nla_put);
  * @attrtype: attribute type
  * @attrlen: length of attribute payload
  * @data: head of attribute payload
+ * @padattr: attribute type for the padding
  *
  * The caller is responsible to ensure that the skb provides enough
  * tailroom for the attribute header and payload.
@@ -551,6 +554,7 @@ EXPORT_SYMBOL(nla_put);
  * @attrtype: attribute type
  * @attrlen: length of attribute payload
  * @data: head of attribute payload
+ * @padattr: attribute type for the padding
  *
  * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store
  * the attribute header and payload.
-- 
2.8.1

^ permalink raw reply related

* [PATCH net-next 7/9] libnl: add nla_put_u64_64bit() helper
From: Nicolas Dichtel @ 2016-04-22 15:31 UTC (permalink / raw)
  To: netdev
  Cc: davem, linux-kernel, linux-wpan, aar, pablo, kaber, kadlec,
	pshelar, kuznet, jmorris, yoshfuji, netfilter-devel, dev,
	steffen.klassert, herbert, bsingharora, Nicolas Dichtel
In-Reply-To: <1461339084-3849-1-git-send-email-nicolas.dichtel@6wind.com>

With this function, nla_data() is aligned on a 64-bit area.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 include/net/netlink.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 113b483b6ee8..e589cb3dccee 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -858,6 +858,19 @@ static inline int nla_put_u64(struct sk_buff *skb, int attrtype, u64 value)
 }
 
 /**
+ * nla_put_u64_64bit - Add a u64 netlink attribute to a skb and align it
+ * @skb: socket buffer to add attribute to
+ * @attrtype: attribute type
+ * @value: numeric value
+ * @padattr: attribute type for the padding
+ */
+static inline int nla_put_u64_64bit(struct sk_buff *skb, int attrtype,
+				    u64 value, int padattr)
+{
+	return nla_put_64bit(skb, attrtype, sizeof(u64), &value, padattr);
+}
+
+/**
  * nla_put_be64 - Add a __be64 netlink attribute to a socket buffer and align it
  * @skb: socket buffer to add attribute to
  * @attrtype: attribute type
-- 
2.8.1

^ permalink raw reply related

* [PATCH net-next 4/9] libnl: nla_put_net64(): align on a 64-bit area
From: Nicolas Dichtel @ 2016-04-22 15:31 UTC (permalink / raw)
  To: netdev
  Cc: davem, linux-kernel, linux-wpan, aar, pablo, kaber, kadlec,
	pshelar, kuznet, jmorris, yoshfuji, netfilter-devel, dev,
	steffen.klassert, herbert, bsingharora, Nicolas Dichtel
In-Reply-To: <1461339084-3849-1-git-send-email-nicolas.dichtel@6wind.com>

nla_data() is now aligned on a 64-bit area.

The temporary function nla_put_be64_32bit() is removed in this patch.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 include/linux/netfilter/ipset/ip_set.h      |  9 ++++++---
 include/net/netlink.h                       | 14 ++++++--------
 include/uapi/linux/netfilter/ipset/ip_set.h |  1 +
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index f48b8a664b0f..83b9a2e0d8d4 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -351,7 +351,8 @@ ip_set_put_skbinfo(struct sk_buff *skb, struct ip_set_skbinfo *skbinfo)
 	return ((skbinfo->skbmark || skbinfo->skbmarkmask) &&
 		nla_put_net64(skb, IPSET_ATTR_SKBMARK,
 			      cpu_to_be64((u64)skbinfo->skbmark << 32 |
-					  skbinfo->skbmarkmask))) ||
+					  skbinfo->skbmarkmask),
+			      IPSET_ATTR_PAD)) ||
 	       (skbinfo->skbprio &&
 		nla_put_net32(skb, IPSET_ATTR_SKBPRIO,
 			      cpu_to_be32(skbinfo->skbprio))) ||
@@ -374,9 +375,11 @@ static inline bool
 ip_set_put_counter(struct sk_buff *skb, struct ip_set_counter *counter)
 {
 	return nla_put_net64(skb, IPSET_ATTR_BYTES,
-			     cpu_to_be64(ip_set_get_bytes(counter))) ||
+			     cpu_to_be64(ip_set_get_bytes(counter)),
+			     IPSET_ATTR_PAD) ||
 	       nla_put_net64(skb, IPSET_ATTR_PACKETS,
-			     cpu_to_be64(ip_set_get_packets(counter)));
+			     cpu_to_be64(ip_set_get_packets(counter)),
+			     IPSET_ATTR_PAD);
 }
 
 static inline void
diff --git a/include/net/netlink.h b/include/net/netlink.h
index 47d7d1356fa3..066a921e7cbe 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -868,20 +868,18 @@ static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value,
 	return nla_put_64bit(skb, attrtype, sizeof(__be64), &value, padattr);
 }
 
-static inline int nla_put_be64_32bit(struct sk_buff *skb, int attrtype,
-				     __be64 value)
-{
-	return nla_put(skb, attrtype, sizeof(__be64), &value);
-}
 /**
- * nla_put_net64 - Add 64-bit network byte order netlink attribute to a socket buffer
+ * nla_put_net64 - Add 64-bit network byte order nlattr to a skb and align it
  * @skb: socket buffer to add attribute to
  * @attrtype: attribute type
  * @value: numeric value
+ * @padattr: attribute type for the padding
  */
-static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value)
+static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value,
+				int padattr)
 {
-	return nla_put_be64_32bit(skb, attrtype | NLA_F_NET_BYTEORDER, value);
+	return nla_put_be64(skb, attrtype | NLA_F_NET_BYTEORDER, value,
+			    padattr);
 }
 
 /**
diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h
index 63b2e34f1b60..ebb5154976de 100644
--- a/include/uapi/linux/netfilter/ipset/ip_set.h
+++ b/include/uapi/linux/netfilter/ipset/ip_set.h
@@ -118,6 +118,7 @@ enum {
 	IPSET_ATTR_SKBMARK,
 	IPSET_ATTR_SKBPRIO,
 	IPSET_ATTR_SKBQUEUE,
+	IPSET_ATTR_PAD,
 	__IPSET_ATTR_ADT_MAX,
 };
 #define IPSET_ATTR_ADT_MAX	(__IPSET_ATTR_ADT_MAX - 1)
-- 
2.8.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox