Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net 3/3] ARM64: dts: meson: odroidc2: disable 1000t-eee advertisement
From: Jerome Brunet @ 2016-11-15 14:29 UTC (permalink / raw)
  To: netdev, devicetree, Florian Fainelli
  Cc: Jerome Brunet, Carlo Caione, Kevin Hilman, Giuseppe Cavallaro,
	Alexandre TORGUE, Martin Blumenstingl, Andre Roth, Neil Armstrong,
	linux-amlogic, linux-arm-kernel, linux-kernel
In-Reply-To: <1479220154-25851-1-git-send-email-jbrunet@baylibre.com>

Reported-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Cc: Giuseppe Cavallaro <peppe.cavallaro@st.com>
Cc: Alexandre TORGUE <alexandre.torgue@st.com>
Signed-off-by: Jerome Brunet <jbrunet@baylibre.com>
Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
Tested-by: Andre Roth <neolynx@gmail.com>
---
 arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts b/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts
index e6e3491d48a5..1f4416ecb183 100644
--- a/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts
+++ b/arch/arm64/boot/dts/amlogic/meson-gxbb-odroidc2.dts
@@ -98,3 +98,18 @@
 	pinctrl-0 = <&i2c_a_pins>;
 	pinctrl-names = "default";
 };
+
+&ethmac {
+	phy-handle = <&eth_phy0>;
+
+	mdio {
+		compatible = "snps,dwmac-mdio";
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		eth_phy0: ethernet-phy@0 {
+			reg = <0>;
+			realtek,disable-eee-1000t;
+		};
+	};
+};
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH net-next v3 3/7] vxlan: simplify exception handling
From: Jiri Benc @ 2016-11-15 14:30 UTC (permalink / raw)
  To: Pravin B Shelar; +Cc: netdev
In-Reply-To: <1479098638-4921-4-git-send-email-pshelar@ovn.org>

On Sun, 13 Nov 2016 20:43:54 -0800, Pravin B Shelar wrote:
> @@ -1927,13 +1923,13 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>  	struct ip_tunnel_info *info;
>  	struct vxlan_dev *vxlan = netdev_priv(dev);
>  	struct sock *sk;
> -	struct rtable *rt = NULL;
>  	const struct iphdr *old_iph;
>  	union vxlan_addr *dst;
>  	union vxlan_addr remote_ip, local_ip;
>  	union vxlan_addr *src;
>  	struct vxlan_metadata _md;
>  	struct vxlan_metadata *md = &_md;
> +	struct dst_entry *ndst = NULL;
>  	__be16 src_port = 0, dst_port;
>  	__be32 vni, label;
>  	__be16 df = 0;
> @@ -2009,6 +2005,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>  
>  	if (dst->sa.sa_family == AF_INET) {
>  		struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
> +		struct rtable *rt;
>  
>  		if (!sock4)
>  			goto drop;
> @@ -2030,7 +2027,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>  			netdev_dbg(dev, "circular route to %pI4\n",
>  				   &dst->sin.sin_addr.s_addr);
>  			dev->stats.collisions++;
> -			goto rt_tx_error;
> +			ip_rt_put(rt);
> +			goto tx_error;
>  		}
>  
>  		/* Bypass encapsulation if the destination is local */
> @@ -2053,12 +2051,13 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>  		else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)
>  			df = htons(IP_DF);
>  
> +		ndst = &rt->dst;

It would be a bit cleaner to do this assignment just after rt is
assigned (but after the IS_ERR(rt) condition), get rid of the added
ip_rt_put call above and move the existing ip_rt_put call in the bypass
case just before the vxlan_encap_bypass call...

>  		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
>  		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
> -		err = vxlan_build_skb(skb, &rt->dst, sizeof(struct iphdr),
> +		err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr),
>  				      vni, md, flags, udp_sum);
>  		if (err < 0)
> -			goto xmit_tx_error;
> +			goto tx_error;
>  
>  		udp_tunnel_xmit_skb(rt, sk, skb, src->sin.sin_addr.s_addr,
>  				    dst->sin.sin_addr.s_addr, tos, ttl, df,
> @@ -2066,7 +2065,6 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>  #if IS_ENABLED(CONFIG_IPV6)
>  	} else {
>  		struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
> -		struct dst_entry *ndst;
>  		u32 rt6i_flags;
>  
>  		ndst = vxlan6_get_route(vxlan, sock6, skb,
> @@ -2078,13 +2076,13 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>  			netdev_dbg(dev, "no route to %pI6\n",
>  				   &dst->sin6.sin6_addr);
>  			dev->stats.tx_carrier_errors++;
> +			ndst = NULL;
>  			goto tx_error;
>  		}
>  
>  		if (ndst->dev == dev) {
>  			netdev_dbg(dev, "circular route to %pI6\n",
>  				   &dst->sin6.sin6_addr);
> -			dst_release(ndst);
>  			dev->stats.collisions++;
>  			goto tx_error;
>  		}
> @@ -2096,12 +2094,12 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>  		    !(rt6i_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
>  			struct vxlan_dev *dst_vxlan;
>  
> -			dst_release(ndst);
>  			dst_vxlan = vxlan_find_vni(vxlan->net, vni,
>  						   dst->sa.sa_family, dst_port,
>  						   vxlan->flags);
>  			if (!dst_vxlan)
>  				goto tx_error;
> +			dst_release(ndst);
>  			vxlan_encap_bypass(skb, vxlan, dst_vxlan);
>  			return;
>  		}

...the same way you have it here, in the IPv6 part. Could you change
the IPv4 part to match it?

Looks good otherwise. Seeing it, I like this version much more than v2.

Thanks!

 Jiri

^ permalink raw reply

* Re: [RFC PATCH 1/2] net: use cmpxchg instead of spinlock in ptr rings
From: Michael S. Tsirkin @ 2016-11-15 14:30 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: netdev@vger.kernel.org, John Fastabend
In-Reply-To: <20161115143258.2c46fc9a@redhat.com>

On Tue, Nov 15, 2016 at 02:32:58PM +0100, Jesper Dangaard Brouer wrote:
> What I would really like to see is a lock-free (locked cmpxchg) queue
> implementation that, like ptr_ring, uses the array as the empty/full
> check, and still (somehow) supports bulking.

I think lock-free is overrated for this use-case - we hold the lock
for such a short amount of time.

I think what we want is just a simpler spinlock - one that's faster than
qlock for use-cases that are unfair anyway, like this one where even if
you get the lock in a fair way, FIFO might be full and you won't be able
to queue.

Or find an API to add to FIFO in a fair way.

-- 
MST

^ permalink raw reply

* Re: [PATCH net-next v3 4/7] vxlan: improve vxlan route lookup checks.
From: Jiri Benc @ 2016-11-15 14:39 UTC (permalink / raw)
  To: Pravin B Shelar; +Cc: netdev
In-Reply-To: <1479098638-4921-5-git-send-email-pshelar@ovn.org>

On Sun, 13 Nov 2016 20:43:55 -0800, Pravin B Shelar wrote:
> @@ -1929,8 +1951,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>  	union vxlan_addr *src;
>  	struct vxlan_metadata _md;
>  	struct vxlan_metadata *md = &_md;
> -	struct dst_entry *ndst = NULL;
>  	__be16 src_port = 0, dst_port;
> +	struct dst_entry *ndst = NULL;
>  	__be32 vni, label;
>  	__be16 df = 0;
>  	__u8 tos, ttl;

This looks kind of arbitrary. You might want to remove this hunk or
merge it to patch 3.

Other than that,
Acked-by: Jiri Benc <jbenc@redhat.com>

^ permalink raw reply

* Re: [PATCH] vhost/scsi: Remove unused but set variable
From: Stefan Hajnoczi @ 2016-11-15 14:40 UTC (permalink / raw)
  To: Tobias Klauser
  Cc: Michael S. Tsirkin, Jason Wang, kvm, virtualization, netdev
In-Reply-To: <20161111132710.25804-1-tklauser@distanz.ch>

[-- Attachment #1: Type: text/plain, Size: 471 bytes --]

On Fri, Nov 11, 2016 at 02:27:10PM +0100, Tobias Klauser wrote:
> Remove the unused but set variable se_tpg in vhost_scsi_nexus_cb() to
> fix the following GCC warning when building with 'W=1':
> 
>   drivers/vhost/scsi.c:1752:26: warning: variable ‘se_tpg’ set but not used
> 
> Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
> ---
>  drivers/vhost/scsi.c | 2 --
>  1 file changed, 2 deletions(-)

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 455 bytes --]

^ permalink raw reply

* Re: [PATCH net-next v3 5/7] vxlan: simplify RTF_LOCAL handling.
From: Jiri Benc @ 2016-11-15 14:44 UTC (permalink / raw)
  To: Pravin B Shelar; +Cc: netdev
In-Reply-To: <1479098638-4921-6-git-send-email-pshelar@ovn.org>

On Sun, 13 Nov 2016 20:43:56 -0800, Pravin B Shelar wrote:
> Avoid duplicate code for handling RTF_LOCAL routes.
> 
> Signed-off-by: Pravin B Shelar <pshelar@ovn.org>

Acked-by: Jiri Benc <jbenc@redhat.com>

^ permalink raw reply

* Re: [PATCH v3] ip6_output: ensure flow saddr actually belongs to device
From: Hannes Frederic Sowa @ 2016-11-15 14:45 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: David Ahern, Netdev, WireGuard mailing list, LKML,
	YOSHIFUJI Hideaki
In-Reply-To: <CAHmME9ppx01YR9Db1oPpm6FJ+BmpqSxvjQ2S+GT0DXO09_M4oQ@mail.gmail.com>

Hey Jason,

On 15.11.2016 01:45, Jason A. Donenfeld wrote:
> I'll have a better look at this. Perhaps this should be modeled on
> what we currently do for userspace, which might amount to something
> more or less like:

Cool, thanks!

> diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
> index 6001e78..0721915 100644
> --- a/net/ipv6/ip6_output.c
> +++ b/net/ipv6/ip6_output.c
> @@ -925,6 +925,7 @@ static int ip6_dst_lookup_tail(struct net *net,
> const struct sock *sk,
>  #endif
>          int err;
>          int flags = 0;
> +        int addr_type, bind_to_dev;
> 
>          /* The correct way to handle this would be to do
>           * ip6_route_get_saddr, and then ip6_route_output; however,
> @@ -1012,6 +1013,16 @@ static int ip6_dst_lookup_tail(struct net *net,
> const struct sock *sk,
>          }
>  #endif
> 
> +        addr_type = ipv6_addr_type(&fl6->saddr);
> +        if (addr_type == IPv6_ADDR_ANY)
> +                return 0;
> +
> +        err = -EINVAL;
> +        bind_to_dev = __ipv6_addr_src_scope(addr_type) <=
> IPV6_ADDR_SCOPE_LINKLOCAL;
> +        if (!ipv6_chk_addr(net, &fl6->saddr, bind_to_dev ?
> (*dst)->dev : NULL, 0) &&
> +            !ipv6_chk_acast_addr_src(net, (*dst)->dev, &fl6->saddr))
> +                goto out_err_release;
> +
>          return 0;
> 
>  out_err_release:
> 

We should not use (*dst)->dev, as this is the resulting device after the
lookup and does not necessarily correspond to the device the user asked
for. Thus you need to pass in fl6.flowi6_oif. To avoid the otherwise
necessary ifindex->net_device lookup, I would suggest changing
ipv6_chk_addr_and_flags to use ifindex instead of net_device (0
corresponds to the net_device == NULL case). It seems to me this would
make the code easier. ipv6_chk_addr can simply pass down dev->ifindex to
ipv6_chk_addr_and_flags.

For checking the anycast address you probably need to look up the
net_device, thus use dev_get_by_index_rcu. But the unicast filter will
probably already hit, so the whole traversal of anycast addresses
won't happen in normal cases. This could be separated into its own function.

In the non-strict case we don't necessarily need bind_to_dev?

Bye,
Hannes

^ permalink raw reply

* Re: [PATCH 2/3] vhost: better detection of available buffers
From: Michael S. Tsirkin @ 2016-11-15 14:46 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel
In-Reply-To: <8bf86752-4dca-3ced-4641-efa7a4a1fc6e@redhat.com>

On Tue, Nov 15, 2016 at 04:00:21PM +0800, Jason Wang wrote:
> 
> 
> On 2016年11月15日 11:28, Michael S. Tsirkin wrote:
> > On Tue, Nov 15, 2016 at 11:16:59AM +0800, Jason Wang wrote:
> > > 
> > > On 2016年11月12日 00:20, Michael S. Tsirkin wrote:
> > > > On Fri, Nov 11, 2016 at 12:18:50PM +0800, Jason Wang wrote:
> > > > > On 2016年11月11日 11:41, Michael S. Tsirkin wrote:
> > > > > > On Fri, Nov 11, 2016 at 10:18:37AM +0800, Jason Wang wrote:
> > > > > > > > On 2016年11月10日 03:57, Michael S. Tsirkin wrote:
> > > > > > > > > > On Wed, Nov 09, 2016 at 03:38:32PM +0800, Jason Wang wrote:
> > > > > > > > > > > > We should use vq->last_avail_idx instead of vq->avail_idx in the
> > > > > > > > > > > > checking of vhost_vq_avail_empty() since latter is the cached avail
> > > > > > > > > > > > index from guest but we want to know if there's pending available
> > > > > > > > > > > > buffers in the virtqueue.
> > > > > > > > > > > > 
> > > > > > > > > > > > Signed-off-by: Jason Wang<jasowang@redhat.com>
> > > > > > > > > > I'm not sure why is this patch here. Is it related to
> > > > > > > > > > batching somehow?
> > > > > > > > Yes, we need to know whether or not there's still buffers left in the
> > > > > > > > virtqueue, so need to check last_avail_idx. Otherwise, we're checking if
> > > > > > > > guest has submitted new buffers.
> > > > > > > > 
> > > > > > > > > > > > ---
> > > > > > > > > > > >     drivers/vhost/vhost.c | 2 +-
> > > > > > > > > > > >     1 file changed, 1 insertion(+), 1 deletion(-)
> > > > > > > > > > > > 
> > > > > > > > > > > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > > > > > > > > > > index c6f2d89..fdf4cdf 100644
> > > > > > > > > > > > --- a/drivers/vhost/vhost.c
> > > > > > > > > > > > +++ b/drivers/vhost/vhost.c
> > > > > > > > > > > > @@ -2230,7 +2230,7 @@ bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq)
> > > > > > > > > > > >     	if (r)
> > > > > > > > > > > >     		return false;
> > > > > > > > > > > > -	return vhost16_to_cpu(vq, avail_idx) == vq->avail_idx;
> > > > > > > > > > > > +	return vhost16_to_cpu(vq, avail_idx) == vq->last_avail_idx;
> > > > > > > > > > > >     }
> > > > > > > > > > > >     EXPORT_SYMBOL_GPL(vhost_vq_avail_empty);
> > > > > > > > > > That might be OK for TX but it's probably wrong for RX
> > > > > > > > > > where the fact that used != avail does not mean
> > > > > > > > > > we have enough space to store the packet.
> > > > > > > > Right, but it's no harm since it was just a hint, handle_rx() can handle
> > > > > > > > this situation.
> > > > > > Means busy polling will cause useless load on the CPU though.
> > > > > > 
> > > > > Right, but,it's not easy to have 100% correct hint here. Needs more thought.
> > > > What's wrong with what we have? It polls until value changes.
> > > > 
> > > But as you said, this does not mean (in mergeable cases) we have enough
> > > space to store the packet.
> > Absolutely but it checks once and then only re-checks after value
> > changes again.
> > 
> 
> Since get_rx_bufs() does not get enough buffers, we will wait for the kick
> in this case. For busy polling, we probably want to stay in the busy loop
> here.

That's what I'm saying. You don't want to re-poll the queue
if available idx was unchanged.

-- 
MST

^ permalink raw reply

* Re: [PATCH net-next v3 7/7] vxlan: remove unused vxlan_dev_dst_port()
From: Jiri Benc @ 2016-11-15 14:56 UTC (permalink / raw)
  To: Pravin B Shelar; +Cc: netdev
In-Reply-To: <1479098638-4921-8-git-send-email-pshelar@ovn.org>

On Sun, 13 Nov 2016 20:43:58 -0800, Pravin B Shelar wrote:
> Signed-off-by: Pravin B Shelar <pshelar@ovn.org>

Acked-by: Jiri Benc <jbenc@redhat.com>

^ permalink raw reply

* Re: ath9k_htc: fix minor mistakes in dev_err messages
From: Kalle Valo @ 2016-11-15 14:58 UTC (permalink / raw)
  To: Colin Ian King
  Cc: QCA ath9k Development, Kalle Valo, linux-wireless, ath9k-devel,
	netdev, linux-kernel
In-Reply-To: <20161031151247.18127-1-colin.king@canonical.com>

Colin Ian King <colin.king@canonical.com> wrote:
> From: Colin Ian King <colin.king@canonical.com>
> 
> Add missing space in a dev_err message and join wrapped text so
> it does not span multiple lines.  Fix spelling mistake on "unknown".
> 
> Signed-off-by: Colin Ian King <colin.king@canonical.com>

Patch applied to ath-next branch of ath.git, thanks.

14acebc33e6d ath9k_htc: fix minor mistakes in dev_err messages

-- 
https://patchwork.kernel.org/patch/9405663/

Documentation about submitting wireless patches and checking status
from patchwork:

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

^ permalink raw reply

* Re: [1/1] ath10k: use the right length of "background"
From: Kalle Valo @ 2016-11-15 15:07 UTC (permalink / raw)
  To: Nicolas Iooss; +Cc: ath10k, netdev, linux-wireless, linux-kernel, Nicolas Iooss
In-Reply-To: <20161029111737.19034-1-nicolas.iooss_linux@m4x.org>

Nicolas Iooss <nicolas.iooss_linux@m4x.org> wrote:
> The word "background" contains 10 characters so the third argument of
> strncmp() needs to be 10 in order to match this prefix correctly.
> 
> Signed-off-by: Nicolas Iooss <nicolas.iooss_linux@m4x.org>
> Fixes: 855aed1220d2 ("ath10k: add spectral scan feature")

Patch applied to ath-next branch of ath.git, thanks.

31b239824ece ath10k: use the right length of "background"

-- 
https://patchwork.kernel.org/patch/9403561/

Documentation about submitting wireless patches and checking status
from patchwork:

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

^ permalink raw reply

* [PATCH net-next 1/1] ipv6: sr: add option to control lwtunnel support
From: David Lebrun @ 2016-11-15 15:14 UTC (permalink / raw)
  To: netdev; +Cc: lorenzo, davem, roopa, David Lebrun

This patch adds a new option CONFIG_IPV6_SEG6_LWTUNNEL to enable/disable
support of encapsulation with the lightweight tunnels. When this option
is enabled, CONFIG_LWTUNNEL is automatically selected.

Fix commit 6c8702c60b88 ("ipv6: sr: add support for SRH encapsulation and injection with lwtunnels")

Without a proper option to control lwtunnel support for SR-IPv6, if
CONFIG_LWTUNNEL=n then the IPv6 initialization fails as a consequence
of seg6_iptunnel_init() failure with EOPNOTSUPP:

NET: Registered protocol family 10
IPv6: Attempt to unregister permanent protocol 6
IPv6: Attempt to unregister permanent protocol 136
IPv6: Attempt to unregister permanent protocol 17
NET: Unregistered protocol family 10

Tested (compiling, booting, and loading ipv6 module when relevant)
with possible combinations of CONFIG_IPV6={y,m,n},
CONFIG_IPV6_SEG6_LWTUNNEL={y,n} and CONFIG_LWTUNNEL={y,n}.

Reported-by: Lorenzo Colitti <lorenzo@google.com>
Suggested-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David Lebrun <david.lebrun@uclouvain.be>
---
 net/ipv6/Kconfig  | 13 ++++++++++++-
 net/ipv6/Makefile |  5 +++--
 net/ipv6/seg6.c   |  8 ++++++++
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 0f00811..ec1267e 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -289,9 +289,20 @@ config IPV6_PIMSM_V2
 	  Support for IPv6 PIM multicast routing protocol PIM-SMv2.
 	  If unsure, say N.
 
+config IPV6_SEG6_LWTUNNEL
+	bool "IPv6: Segment Routing Header encapsulation support"
+	depends on IPV6
+	select LWTUNNEL
+	---help---
+	  Support for encapsulation of packets within an outer IPv6
+	  header and a Segment Routing Header using the lightweight
+	  tunnels mechanism.
+
+	  If unsure, say N.
+
 config IPV6_SEG6_INLINE
 	bool "IPv6: direct Segment Routing Header insertion "
-	depends on IPV6
+	depends on IPV6_SEG6_LWTUNNEL
 	---help---
 	  Support for direct insertion of the Segment Routing Header,
 	  also known as inline mode. Be aware that direct insertion of
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 129cad2..a9e9fec 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -9,7 +9,7 @@ ipv6-objs :=	af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
 		route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
 		raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \
 		exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \
-		udp_offload.o seg6.o seg6_iptunnel.o
+		udp_offload.o seg6.o
 
 ipv6-offload :=	ip6_offload.o tcpv6_offload.o exthdrs_offload.o
 
@@ -23,6 +23,8 @@ ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o
 ipv6-$(CONFIG_PROC_FS) += proc.o
 ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o
 ipv6-$(CONFIG_NETLABEL) += calipso.o
+ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o
+ipv6-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o
 
 ipv6-objs += $(ipv6-y)
 
@@ -44,7 +46,6 @@ obj-$(CONFIG_IPV6_SIT) += sit.o
 obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
 obj-$(CONFIG_IPV6_GRE) += ip6_gre.o
 obj-$(CONFIG_IPV6_FOU) += fou6.o
-obj-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o
 
 obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o
 obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload)
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index 50f6e06..b172d85 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -451,9 +451,11 @@ int __init seg6_init(void)
 	if (err)
 		goto out_unregister_genl;
 
+#ifdef CONFIG_IPV6_SEG6_LWTUNNEL
 	err = seg6_iptunnel_init();
 	if (err)
 		goto out_unregister_pernet;
+#endif
 
 #ifdef CONFIG_IPV6_SEG6_HMAC
 	err = seg6_hmac_init();
@@ -467,10 +469,14 @@ int __init seg6_init(void)
 	return err;
 #ifdef CONFIG_IPV6_SEG6_HMAC
 out_unregister_iptun:
+#ifdef CONFIG_IPV6_SEG6_LWTUNNEL
 	seg6_iptunnel_exit();
 #endif
+#endif
+#ifdef CONFIG_IPV6_SEG6_LWTUNNEL
 out_unregister_pernet:
 	unregister_pernet_subsys(&ip6_segments_ops);
+#endif
 out_unregister_genl:
 	genl_unregister_family(&seg6_genl_family);
 	goto out;
@@ -481,7 +487,9 @@ void seg6_exit(void)
 #ifdef CONFIG_IPV6_SEG6_HMAC
 	seg6_hmac_exit();
 #endif
+#ifdef CONFIG_IPV6_SEG6_LWTUNNEL
 	seg6_iptunnel_exit();
+#endif
 	unregister_pernet_subsys(&ip6_segments_ops);
 	genl_unregister_family(&seg6_genl_family);
 }
-- 
2.7.3

^ permalink raw reply related

* Re: [PATCH net-next v5] cadence: Add LSO support.
From: David Miller @ 2016-11-15 15:15 UTC (permalink / raw)
  To: rafalo; +Cc: nicolas.ferre, netdev, linux-kernel
In-Reply-To: <BN3PR07MB251641C606D02892E2196960C9BF0@BN3PR07MB2516.namprd07.prod.outlook.com>

From: Rafal Ozieblo <rafalo@cadence.com>
Date: Tue, 15 Nov 2016 07:07:14 +0000

> Would it be good to enable UFO conditionally with some internal
> define? Ex.:

Absolutely not.

^ permalink raw reply

* Re: [patch net-next v2 0/8] Add support for offloading packet-sampling
From: David Miller @ 2016-11-15 15:17 UTC (permalink / raw)
  To: jiri
  Cc: netdev, yotamg, idosch, eladr, nogahf, ogerlitz, jhs,
	geert+renesas, stephen, xiyou.wangcong, linux, roopa,
	john.fastabend, simon.horman
In-Reply-To: <20161115092743.GA1783@nanopsycho.orion>

From: Jiri Pirko <jiri@resnulli.us>
Date: Tue, 15 Nov 2016 10:27:43 +0100

> Dave, please drop this patchset for now. We just discovered that a
> couple more changes are needed.

Ok.

^ permalink raw reply

* Re: [PATCH net-next v2] ipv6: sr: fix IPv6 initialization failure without lwtunnels
From: David Miller @ 2016-11-15 15:18 UTC (permalink / raw)
  To: david.lebrun; +Cc: roopa, netdev, lorenzo
In-Reply-To: <582AE0B0.60006@uclouvain.be>

From: David Lebrun <david.lebrun@uclouvain.be>
Date: Tue, 15 Nov 2016 11:17:20 +0100

> On 11/14/2016 03:22 PM, Roopa Prabhu wrote:
>> I prefer option b). most LWTUNNEL encaps are done this way.
>> 
>> seg6 and seg6_iptunnel is new segment routing code and can be under
>> CONFIG_IPV6_SEG6 which depends on CONFIG_LWTUNNEL and CONFIG_IPV6.
>> CONFIG_IPV6_SEG6_HMAC could then depend on CONFIG_IPV6_SEG6
> 
> Will do that, thanks

This is good for the time being.

Although I'd like to entertain the idea of making LWTUNNEL
unconditionally built and considered a fundamental piece of
networking infrastructure just like net/core/dst.c

^ permalink raw reply

* [PATCH net] sctp: use new rhlist interface on sctp transport rhashtable
From: Xin Long @ 2016-11-15 15:23 UTC (permalink / raw)
  To: network dev, linux-sctp
  Cc: davem, Marcelo Ricardo Leitner, Neil Horman, Vlad Yasevich,
	Herbert Xu, phil

Now sctp transport rhashtable uses hash(lport, dport, daddr) as the key
to hash a node to one chain. If in one host thousands of assocs connect
to one server with the same lport and different laddrs (although it's
not a normal case), all the transports would be hashed into the same
chain.

This may cause the insertion of a new node to keep returning -EBUSY, as
the chain is too long. Since sctp retries the insertion in a loop, this
could even lead to a system hang.

The new rhlist interface handles the case where many nodes with the same
key end up in one chain: it puts them into a list and then makes this
list a single node of the chain.

This patch is to replace rhashtable_ interface with rhltable_ interface.
Since a chain would not be too long and it would not return -EBUSY with
this fix when inserting a node, the reinsert loop is also removed here.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 include/net/sctp/sctp.h    |  2 +-
 include/net/sctp/structs.h |  4 +-
 net/sctp/associola.c       |  8 +++-
 net/sctp/input.c           | 93 ++++++++++++++++++++++++++--------------------
 net/sctp/socket.c          |  7 +---
 5 files changed, 64 insertions(+), 50 deletions(-)

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 31acc3f..f0dcaeb 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -164,7 +164,7 @@ void sctp_backlog_migrate(struct sctp_association *assoc,
 			  struct sock *oldsk, struct sock *newsk);
 int sctp_transport_hashtable_init(void);
 void sctp_transport_hashtable_destroy(void);
-void sctp_hash_transport(struct sctp_transport *t);
+int sctp_hash_transport(struct sctp_transport *t);
 void sctp_unhash_transport(struct sctp_transport *t);
 struct sctp_transport *sctp_addrs_lookup_transport(
 				struct net *net,
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 11c3bf2..c5a2d83 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -124,7 +124,7 @@ extern struct sctp_globals {
 	/* This is the sctp port control hash.	*/
 	struct sctp_bind_hashbucket *port_hashtable;
 	/* This is the hash of all transports. */
-	struct rhashtable transport_hashtable;
+	struct rhltable transport_hashtable;
 
 	/* Sizes of above hashtables. */
 	int ep_hashsize;
@@ -762,7 +762,7 @@ static inline int sctp_packet_empty(struct sctp_packet *packet)
 struct sctp_transport {
 	/* A list of transports. */
 	struct list_head transports;
-	struct rhash_head node;
+	struct rhlist_head node;
 
 	/* Reference counting. */
 	atomic_t refcnt;
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index f10d339..68428e1 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -700,11 +700,15 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 	/* Set the peer's active state. */
 	peer->state = peer_state;
 
+	/* Add this peer into the transport hashtable */
+	if (sctp_hash_transport(peer)) {
+		sctp_transport_free(peer);
+		return NULL;
+	}
+
 	/* Attach the remote transport to our asoc.  */
 	list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list);
 	asoc->peer.transport_count++;
-	/* Add this peer into the transport hashtable */
-	sctp_hash_transport(peer);
 
 	/* If we do not yet have a primary path, set one.  */
 	if (!asoc->peer.primary_path) {
diff --git a/net/sctp/input.c b/net/sctp/input.c
index a01a56e..458e506 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -790,10 +790,9 @@ static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
 
 /* rhashtable for transport */
 struct sctp_hash_cmp_arg {
-	const struct sctp_endpoint	*ep;
-	const union sctp_addr		*laddr;
-	const union sctp_addr		*paddr;
-	const struct net		*net;
+	const union sctp_addr	*paddr;
+	const struct net	*net;
+	u16			lport;
 };
 
 static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
@@ -801,7 +800,6 @@ static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
 {
 	struct sctp_transport *t = (struct sctp_transport *)ptr;
 	const struct sctp_hash_cmp_arg *x = arg->key;
-	struct sctp_association *asoc;
 	int err = 1;
 
 	if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr))
@@ -809,19 +807,10 @@ static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
 	if (!sctp_transport_hold(t))
 		return err;
 
-	asoc = t->asoc;
-	if (!net_eq(sock_net(asoc->base.sk), x->net))
+	if (!net_eq(sock_net(t->asoc->base.sk), x->net))
+		goto out;
+	if (x->lport != htons(t->asoc->base.bind_addr.port))
 		goto out;
-	if (x->ep) {
-		if (x->ep != asoc->ep)
-			goto out;
-	} else {
-		if (x->laddr->v4.sin_port != htons(asoc->base.bind_addr.port))
-			goto out;
-		if (!sctp_bind_addr_match(&asoc->base.bind_addr,
-					  x->laddr, sctp_sk(asoc->base.sk)))
-			goto out;
-	}
 
 	err = 0;
 out:
@@ -851,11 +840,9 @@ static inline u32 sctp_hash_key(const void *data, u32 len, u32 seed)
 	const struct sctp_hash_cmp_arg *x = data;
 	const union sctp_addr *paddr = x->paddr;
 	const struct net *net = x->net;
-	u16 lport;
+	u16 lport = x->lport;
 	u32 addr;
 
-	lport = x->ep ? htons(x->ep->base.bind_addr.port) :
-			x->laddr->v4.sin_port;
 	if (paddr->sa.sa_family == AF_INET6)
 		addr = jhash(&paddr->v6.sin6_addr, 16, seed);
 	else
@@ -875,29 +862,32 @@ static const struct rhashtable_params sctp_hash_params = {
 
 int sctp_transport_hashtable_init(void)
 {
-	return rhashtable_init(&sctp_transport_hashtable, &sctp_hash_params);
+	return rhltable_init(&sctp_transport_hashtable, &sctp_hash_params);
 }
 
 void sctp_transport_hashtable_destroy(void)
 {
-	rhashtable_destroy(&sctp_transport_hashtable);
+	rhltable_destroy(&sctp_transport_hashtable);
 }
 
-void sctp_hash_transport(struct sctp_transport *t)
+int sctp_hash_transport(struct sctp_transport *t)
 {
 	struct sctp_hash_cmp_arg arg;
+	int err;
 
 	if (t->asoc->temp)
-		return;
+		return 0;
 
-	arg.ep = t->asoc->ep;
-	arg.paddr = &t->ipaddr;
 	arg.net   = sock_net(t->asoc->base.sk);
+	arg.paddr = &t->ipaddr;
+	arg.lport = htons(t->asoc->base.bind_addr.port);
 
-reinsert:
-	if (rhashtable_lookup_insert_key(&sctp_transport_hashtable, &arg,
-					 &t->node, sctp_hash_params) == -EBUSY)
-		goto reinsert;
+	err = rhltable_insert_key(&sctp_transport_hashtable, &arg,
+				  &t->node, sctp_hash_params);
+	if (err)
+		pr_err_once("insert transport fail, errno %d\n", err);
+
+	return err;
 }
 
 void sctp_unhash_transport(struct sctp_transport *t)
@@ -905,39 +895,62 @@ void sctp_unhash_transport(struct sctp_transport *t)
 	if (t->asoc->temp)
 		return;
 
-	rhashtable_remove_fast(&sctp_transport_hashtable, &t->node,
-			       sctp_hash_params);
+	rhltable_remove(&sctp_transport_hashtable, &t->node,
+			sctp_hash_params);
 }
 
+/* return a transport with holding it */
 struct sctp_transport *sctp_addrs_lookup_transport(
 				struct net *net,
 				const union sctp_addr *laddr,
 				const union sctp_addr *paddr)
 {
+	struct rhlist_head *tmp, *list;
+	struct sctp_transport *t;
 	struct sctp_hash_cmp_arg arg = {
-		.ep    = NULL,
-		.laddr = laddr,
 		.paddr = paddr,
 		.net   = net,
+		.lport = laddr->v4.sin_port,
 	};
 
-	return rhashtable_lookup_fast(&sctp_transport_hashtable, &arg,
-				      sctp_hash_params);
+	list = rhltable_lookup(&sctp_transport_hashtable, &arg,
+			       sctp_hash_params);
+
+	rhl_for_each_entry_rcu(t, tmp, list, node) {
+		if (!sctp_transport_hold(t))
+			continue;
+
+		if (sctp_bind_addr_match(&t->asoc->base.bind_addr,
+					 laddr, sctp_sk(t->asoc->base.sk)))
+			return t;
+		sctp_transport_put(t);
+	}
+
+	return NULL;
 }
 
+/* return a transport without holding it, as it's only used under sock lock */
 struct sctp_transport *sctp_epaddr_lookup_transport(
 				const struct sctp_endpoint *ep,
 				const union sctp_addr *paddr)
 {
 	struct net *net = sock_net(ep->base.sk);
+	struct rhlist_head *tmp, *list;
+	struct sctp_transport *t;
 	struct sctp_hash_cmp_arg arg = {
-		.ep    = ep,
 		.paddr = paddr,
 		.net   = net,
+		.lport = htons(ep->base.bind_addr.port),
 	};
 
-	return rhashtable_lookup_fast(&sctp_transport_hashtable, &arg,
-				      sctp_hash_params);
+	list = rhltable_lookup(&sctp_transport_hashtable, &arg,
+			       sctp_hash_params);
+
+	rhl_for_each_entry_rcu(t, tmp, list, node)
+		if (ep == t->asoc->ep)
+			return t;
+
+	return NULL;
 }
 
 /* Look up an association. */
@@ -951,7 +964,7 @@ static struct sctp_association *__sctp_lookup_association(
 	struct sctp_association *asoc = NULL;
 
 	t = sctp_addrs_lookup_transport(net, local, peer);
-	if (!t || !sctp_transport_hold(t))
+	if (!t)
 		goto out;
 
 	asoc = t->asoc;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f23ad91..d5f4b4a 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4392,10 +4392,7 @@ int sctp_transport_walk_start(struct rhashtable_iter *iter)
 {
 	int err;
 
-	err = rhashtable_walk_init(&sctp_transport_hashtable, iter,
-				   GFP_KERNEL);
-	if (err)
-		return err;
+	rhltable_walk_enter(&sctp_transport_hashtable, iter);
 
 	err = rhashtable_walk_start(iter);
 	if (err && err != -EAGAIN) {
@@ -4479,7 +4476,7 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *),
 
 	rcu_read_lock();
 	transport = sctp_addrs_lookup_transport(net, laddr, paddr);
-	if (!transport || !sctp_transport_hold(transport))
+	if (!transport)
 		goto out;
 
 	rcu_read_unlock();
-- 
2.1.0

^ permalink raw reply related

* Re: [PATCH v3] ip6_output: ensure flow saddr actually belongs to device
From: David Ahern @ 2016-11-15 15:26 UTC (permalink / raw)
  To: Hannes Frederic Sowa, Jason A. Donenfeld
  Cc: Netdev, LKML, WireGuard mailing list, YOSHIFUJI Hideaki
In-Reply-To: <7543fa0f-a053-8387-1862-d78ffadba688@stressinduktion.org>

On 11/15/16 7:45 AM, Hannes Frederic Sowa wrote:

>> @@ -1012,6 +1013,16 @@ static int ip6_dst_lookup_tail(struct net *net,
>> const struct sock *sk,
>>          }
>>  #endif
>>
>> +        addr_type = ipv6_addr_type(&fl6->saddr);
>> +        if (addr_type == IPv6_ADDR_ANY)
>> +                return 0;
>> +
>> +        err = -EINVAL;
>> +        bind_to_dev = __ipv6_addr_src_scope(addr_type) <=
>> IPV6_ADDR_SCOPE_LINKLOCAL;
>> +        if (!ipv6_chk_addr(net, &fl6->saddr, bind_to_dev ?
>> (*dst)->dev : NULL, 0) &&
>> +            !ipv6_chk_acast_addr_src(net, (*dst)->dev, &fl6->saddr))
>> +                goto out_err_release;
>> +
>>          return 0;
>>
>>  out_err_release:
>>
> 
> We should not use (*dst)->dev, as this is the resulting device after the
> lookup and not necessarily corresponds to the device the user asked for.

To be consistent with IPv4, the saddr check is done before the lookup, and dst and flow oif should not be used. Handling LL addresses is trickier, and perhaps this is not the right place to enforce that check since it requires a specific device which is only really known after lookup. Why not add the "is saddr LL" verification as part of the route selection? E.g., add something like rt6_device_match to ip6_pol_route (the device match call is only used for ip6_pol_route_lookup and not ip6_pol_route - why is that?).

^ permalink raw reply

* [PATCH net-next] udplite: fix NULL pointer dereference
From: Paolo Abeni @ 2016-11-15 15:37 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, Andrei Vagin, David S. Miller, Hannes Frederic Sowa

The commit 850cbaddb52d ("udp: use it's own memory accounting schema")
assumes that the socket proto has memory accounting enabled,
but this is not the case for UDPLITE.
Fix it by enabling memory accounting for UDPLITE and performing
fwd allocated memory reclaiming on socket shutdown.
UDP and UDPLITE now share the same memory accounting limits.
Also drop the backlog receive operation, since it is no longer needed.

Fixes: 850cbaddb52d ("udp: use it's own memory accounting schema")
Reported-by: Andrei Vagin <avagin@gmail.com>
Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/udp.h     | 1 +
 include/net/udplite.h | 1 +
 net/ipv4/udp.c        | 3 ++-
 net/ipv4/udplite.c    | 3 ++-
 net/ipv6/udplite.c    | 3 ++-
 5 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/include/net/udp.h b/include/net/udp.h
index e6e4e19..1661791 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -246,6 +246,7 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb,
 }
 
 /* net/ipv4/udp.c */
+void udp_destruct_sock(struct sock *sk);
 void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len);
 int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb);
 void udp_skb_destructor(struct sock *sk, struct sk_buff *skb);
diff --git a/include/net/udplite.h b/include/net/udplite.h
index 8076193..36097d3 100644
--- a/include/net/udplite.h
+++ b/include/net/udplite.h
@@ -27,6 +27,7 @@ static __inline__ int udplite_getfrag(void *from, char *to, int  offset,
 static inline int udplite_sk_init(struct sock *sk)
 {
 	udp_sk(sk)->pcflag = UDPLITE_BIT;
+	sk->sk_destruct = udp_destruct_sock;
 	return 0;
 }
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c827e4e..9ae7c63 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1253,7 +1253,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);
 
-static void udp_destruct_sock(struct sock *sk)
+void udp_destruct_sock(struct sock *sk)
 {
 	/* reclaim completely the forward allocated memory */
 	unsigned int total = 0;
@@ -1267,6 +1267,7 @@ static void udp_destruct_sock(struct sock *sk)
 
 	inet_sock_destruct(sk);
 }
+EXPORT_SYMBOL_GPL(udp_destruct_sock);
 
 int udp_init_sock(struct sock *sk)
 {
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index af81715..59f10fe 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -50,10 +50,11 @@ struct proto 	udplite_prot = {
 	.sendmsg	   = udp_sendmsg,
 	.recvmsg	   = udp_recvmsg,
 	.sendpage	   = udp_sendpage,
-	.backlog_rcv	   = udp_queue_rcv_skb,
 	.hash		   = udp_lib_hash,
 	.unhash		   = udp_lib_unhash,
 	.get_port	   = udp_v4_get_port,
+	.memory_allocated  = &udp_memory_allocated,
+	.sysctl_mem	   = sysctl_udp_mem,
 	.obj_size	   = sizeof(struct udp_sock),
 	.h.udp_table	   = &udplite_table,
 #ifdef CONFIG_COMPAT
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 47d0d2b..2784cc3 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -45,10 +45,11 @@ struct proto udplitev6_prot = {
 	.getsockopt	   = udpv6_getsockopt,
 	.sendmsg	   = udpv6_sendmsg,
 	.recvmsg	   = udpv6_recvmsg,
-	.backlog_rcv	   = udpv6_queue_rcv_skb,
 	.hash		   = udp_lib_hash,
 	.unhash		   = udp_lib_unhash,
 	.get_port	   = udp_v6_get_port,
+	.memory_allocated  = &udp_memory_allocated,
+	.sysctl_mem	   = sysctl_udp_mem,
 	.obj_size	   = sizeof(struct udp6_sock),
 	.h.udp_table	   = &udplite_table,
 #ifdef CONFIG_COMPAT
-- 
1.8.3.1

^ permalink raw reply related

* RE: [PATCH for-next 03/11] IB/hns: Optimize the logic of allocating memory using APIs
From: Salil Mehta @ 2016-11-15 15:52 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: dledford@redhat.com, Huwei (Xavier), oulijun,
	mehta.salil.lnk@gmail.com, linux-rdma@vger.kernel.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org, Linuxarm,
	Zhangping (ZP)
In-Reply-To: <20161109072130.GH27883@leon.nu>

> -----Original Message-----
> From: Leon Romanovsky [mailto:leon@kernel.org]
> Sent: Wednesday, November 09, 2016 7:22 AM
> To: Salil Mehta
> Cc: dledford@redhat.com; Huwei (Xavier); oulijun;
> mehta.salil.lnk@gmail.com; linux-rdma@vger.kernel.org;
> netdev@vger.kernel.org; linux-kernel@vger.kernel.org; Linuxarm;
> Zhangping (ZP)
> Subject: Re: [PATCH for-next 03/11] IB/hns: Optimize the logic of
> allocating memory using APIs
> 
> On Fri, Nov 04, 2016 at 04:36:25PM +0000, Salil Mehta wrote:
> > From: "Wei Hu (Xavier)" <xavier.huwei@huawei.com>
> >
> > This patch modified the logic of allocating memory using APIs in
> > hns RoCE driver. We used kcalloc instead of kmalloc_array and
> > bitmap_zero. And When kcalloc failed, call vzalloc to alloc
> > memory.
> >
> > Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
> > Signed-off-by: Ping Zhang <zhangping5@huawei.com>
> > Signed-off-by: Salil Mehta  <salil.mehta@huawei.com>
> > ---
> >  drivers/infiniband/hw/hns/hns_roce_mr.c |   15 ++++++++-------
> >  1 file changed, 8 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c
> b/drivers/infiniband/hw/hns/hns_roce_mr.c
> > index fb87883..d3dfb5f 100644
> > --- a/drivers/infiniband/hw/hns/hns_roce_mr.c
> > +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
> > @@ -137,11 +137,12 @@ static int hns_roce_buddy_init(struct
> hns_roce_buddy *buddy, int max_order)
> >
> >  	for (i = 0; i <= buddy->max_order; ++i) {
> >  		s = BITS_TO_LONGS(1 << (buddy->max_order - i));
> > -		buddy->bits[i] = kmalloc_array(s, sizeof(long),
> GFP_KERNEL);
> > -		if (!buddy->bits[i])
> > -			goto err_out_free;
> > -
> > -		bitmap_zero(buddy->bits[i], 1 << (buddy->max_order - i));
> > +		buddy->bits[i] = kcalloc(s, sizeof(long), GFP_KERNEL);
> > +		if (!buddy->bits[i]) {
> > +			buddy->bits[i] = vzalloc(s * sizeof(long));
> 
> I wonder, why don't you use directly vzalloc instead of kcalloc
> fallback?
As we know, we will have physically contiguous pages if the kcalloc
call succeeds. This gives us a chance of better performance
than allocations which are only virtually contiguous through the
function vzalloc(). Therefore, the latter is only used as a fallback
when our memory request cannot be satisfied through kcalloc.

Are you suggesting that there will not be much of a performance penalty
if we just use vzalloc?

> 
> > +			if (!buddy->bits[i])
> > +				goto err_out_free;
> > +		}
> >  	}

^ permalink raw reply

* Re: [PATCH net] ipv6 addrconf: Implemented enhanced DAD (RFC7527)
From: Hannes Frederic Sowa @ 2016-11-15 16:00 UTC (permalink / raw)
  To: Erik Nordmark, netdev
In-Reply-To: <5863cb78-0050-7b7e-f886-d4d6b2186f16@sonic.net>

On 15.11.2016 08:57, Erik Nordmark wrote:
> Implemented RFC7527 Enhanced DAD.
> IPv6 duplicate address detection can fail if there is some temporary
> loopback of Ethernet frames. RFC7527 solves this by including a random
> nonce in the NS messages used for DAD, and if an NS is received with the
> same nonce it is assumed to be a looped back DAD probe and is ignored.
> RFC7527 is disabled by default. Can be enabled by setting either one of
> conf/{all,interface}/ipv6_rfc7527 to non-zero.
> 
> Signed-off-by: Erik Nordmark <nordmark@arista.com>
> 
> Index: linux-stable/Documentation/networking/ip-sysctl.txt
> ===================================================================
> --- linux-stable.orig/Documentation/networking/ip-sysctl.txt
> +++ linux-stable/Documentation/networking/ip-sysctl.txt
> @@ -1713,6 +1713,15 @@ drop_unsolicited_na - BOOLEAN
> 
>      By default this is turned off.
> 
> +ipv6_rfc7527 - BOOLEAN

Could you rename the sysctl to enhanced_dad? As it will show up in the
ipv6/ directory hierarchy anyway, you don't need to prefix it with 'ipv6_'.

> +    Include a nonce option in the IPv6 neighbor solicitation messages
> used for
> +    duplicate address detection per RFC7527. A received DAD NS will
> only signal
> +    a duplicate address if the nonce is different. This avoids any false
> +    detection of duplicates due to loopback of the NS messages that we
> send.
> +    The nonce option will be sent on an interface if either one of
> +    conf/{all,interface}/ipv6_rfc7527 are TRUE.
> +    Default: FALSE
> +
>  icmp/*:
>  ratelimit - INTEGER
>      Limit the maximal rates for sending ICMPv6 packets.
> Index: linux-stable/include/linux/ipv6.h
> ===================================================================
> --- linux-stable.orig/include/linux/ipv6.h
> +++ linux-stable/include/linux/ipv6.h
> @@ -63,6 +63,7 @@ struct ipv6_devconf {
>      } stable_secret;
>      __s32        use_oif_addrs_only;
>      __s32        keep_addr_on_down;
> +    __u32        ipv6_rfc7527;

Same here, as above.

> 
>      struct ctl_table_header *sysctl_header;
>  };
> Index: linux-stable/include/net/if_inet6.h
> ===================================================================
> --- linux-stable.orig/include/net/if_inet6.h
> +++ linux-stable/include/net/if_inet6.h
> @@ -55,6 +55,7 @@ struct inet6_ifaddr {
>      __u8            stable_privacy_retry;
> 
>      __u16            scope;
> +    __u64            dad_nonce;
> 
>      unsigned long        cstamp;    /* created timestamp */
>      unsigned long        tstamp; /* updated timestamp */
> Index: linux-stable/include/net/ndisc.h
> ===================================================================
> --- linux-stable.orig/include/net/ndisc.h
> +++ linux-stable/include/net/ndisc.h
> @@ -31,6 +31,7 @@ enum {
>      ND_OPT_PREFIX_INFO = 3,        /* RFC2461 */
>      ND_OPT_REDIRECT_HDR = 4,    /* RFC2461 */
>      ND_OPT_MTU = 5,            /* RFC2461 */
> +    ND_OPT_NONCE = 14,              /* RFC7527 */
>      __ND_OPT_ARRAY_MAX,
>      ND_OPT_ROUTE_INFO = 24,        /* RFC4191 */
>      ND_OPT_RDNSS = 25,        /* RFC5006 */
> @@ -121,6 +122,7 @@ struct ndisc_options {
>  #define nd_opts_pi_end nd_opt_array[__ND_OPT_PREFIX_INFO_END]
>  #define nd_opts_rh            nd_opt_array[ND_OPT_REDIRECT_HDR]
>  #define nd_opts_mtu            nd_opt_array[ND_OPT_MTU]
> +#define nd_opts_nonce            nd_opt_array[ND_OPT_NONCE]
>  #define nd_802154_opts_src_lladdr
> nd_802154_opt_array[ND_OPT_SOURCE_LL_ADDR]
>  #define nd_802154_opts_tgt_lladdr
> nd_802154_opt_array[ND_OPT_TARGET_LL_ADDR]
> 
> @@ -398,7 +400,8 @@ void ndisc_cleanup(void);
>  int ndisc_rcv(struct sk_buff *skb);
> 
>  void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
> -           const struct in6_addr *daddr, const struct in6_addr *saddr);
> +           const struct in6_addr *daddr, const struct in6_addr *saddr,
> +           u64 nonce);
> 
>  void ndisc_send_rs(struct net_device *dev,
>             const struct in6_addr *saddr, const struct in6_addr *daddr);
> Index: linux-stable/include/uapi/linux/ipv6.h
> ===================================================================
> --- linux-stable.orig/include/uapi/linux/ipv6.h
> +++ linux-stable/include/uapi/linux/ipv6.h
> @@ -177,6 +177,7 @@ enum {
>      DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
>      DEVCONF_DROP_UNSOLICITED_NA,
>      DEVCONF_KEEP_ADDR_ON_DOWN,
> +    DEVCONF_IPV6_RFC7527,

Ditto.

>      DEVCONF_MAX
>  };
> 
> Index: linux-stable/net/ipv6/addrconf.c
> ===================================================================
> --- linux-stable.orig/net/ipv6/addrconf.c
> +++ linux-stable/net/ipv6/addrconf.c
> @@ -217,6 +217,7 @@ static struct ipv6_devconf ipv6_devconf
>      .use_oif_addrs_only    = 0,
>      .ignore_routes_with_linkdown = 0,
>      .keep_addr_on_down    = 0,
> +    .ipv6_rfc7527           = 0,

What is your reason to not enable this by default? I haven't fully
studied the RFC, but it seems to be backwards compatible.

>  };
> 
>  static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
> @@ -262,6 +263,7 @@ static struct ipv6_devconf ipv6_devconf_
>      .use_oif_addrs_only    = 0,
>      .ignore_routes_with_linkdown = 0,
>      .keep_addr_on_down    = 0,
> +    .ipv6_rfc7527           = 0,
>  };
> 
>  /* Check if a valid qdisc is available */
> @@ -3722,12 +3724,18 @@ static void addrconf_dad_kick(struct ine
>  {
>      unsigned long rand_num;
>      struct inet6_dev *idev = ifp->idev;
> +    u64 nonce;
> 
>      if (ifp->flags & IFA_F_OPTIMISTIC)
>          rand_num = 0;
>      else
>          rand_num = prandom_u32() % (idev->cnf.rtr_solicit_delay ? : 1);
> 
> +    nonce = 0;
> +    if (ifp->idev->cnf.ipv6_rfc7527 ||
> + dev_net((ifp->idev)->dev)->ipv6.devconf_all->ipv6_rfc7527)

idev should already be in scope, so you can simplify this conditional.


> +        get_random_bytes(&nonce, 6);

Maybe:

do
	get_random_bytes(&nonce, 6);
while (!nonce);

> +    ifp->dad_nonce = nonce;
>      ifp->dad_probes = idev->cnf.dad_transmits;
>      addrconf_mod_dad_work(ifp, rand_num);
>  }
> @@ -3903,7 +3911,8 @@ static void addrconf_dad_work(struct wor
> 
>      /* send a neighbour solicitation for our addr */
>      addrconf_addr_solict_mult(&ifp->addr, &mcaddr);
> -    ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any);
> +    ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any,
> +              ifp->dad_nonce);
>  out:
>      in6_ifa_put(ifp);
>      rtnl_unlock();
> @@ -4937,6 +4946,7 @@ static inline void ipv6_store_devconf(st
>      array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] =
> cnf->drop_unicast_in_l2_multicast;
>      array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na;
>      array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down;
> +    array[DEVCONF_IPV6_RFC7527] = cnf->ipv6_rfc7527;
>  }
> 
>  static inline size_t inet6_ifla6_size(void)
> @@ -6027,6 +6037,13 @@ static const struct ctl_table addrconf_s
> 
>      },
>      {
> +        .procname       = "ipv6_rfc7527",
> +        .data           = &ipv6_devconf.ipv6_rfc7527,
> +        .maxlen         = sizeof(int),
> +        .mode           = 0644,
> +        .proc_handler   = proc_dointvec,
> +    },
> +    {
>          /* sentinel */
>      }
>  };
> Index: linux-stable/net/ipv6/ndisc.c
> ===================================================================
> --- linux-stable.orig/net/ipv6/ndisc.c
> +++ linux-stable/net/ipv6/ndisc.c
> @@ -234,6 +234,7 @@ struct ndisc_options *ndisc_parse_option
>          case ND_OPT_SOURCE_LL_ADDR:
>          case ND_OPT_TARGET_LL_ADDR:
>          case ND_OPT_MTU:
> +        case ND_OPT_NONCE:
>          case ND_OPT_REDIRECT_HDR:
>              if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
>                  ND_PRINTK(2, warn,
> @@ -571,7 +572,8 @@ static void ndisc_send_unsol_na(struct n
>  }
> 
>  void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
> -           const struct in6_addr *daddr, const struct in6_addr *saddr)
> +           const struct in6_addr *daddr, const struct in6_addr *saddr,
> +           u64 nonce)
>  {
>      struct sk_buff *skb;
>      struct in6_addr addr_buf;
> @@ -591,6 +593,8 @@ void ndisc_send_ns(struct net_device *de
>      if (inc_opt)
>          optlen += ndisc_opt_addr_space(dev,
>                             NDISC_NEIGHBOUR_SOLICITATION);
> +    if (nonce != 0)
> +        optlen += 8;
> 
>      skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
>      if (!skb)
> @@ -608,6 +612,13 @@ void ndisc_send_ns(struct net_device *de
>          ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
>                         dev->dev_addr,
>                         NDISC_NEIGHBOUR_SOLICITATION);
> +    if (nonce != 0) {
> +        u8 *opt = skb_put(skb, 8);
> +
> +        opt[0] = ND_OPT_NONCE;
> +        opt[1] = 8 >> 3;
> +        memcpy(opt + 2, &nonce, 6);
> +    }
> 
>      ndisc_send_skb(skb, daddr, saddr);
>  }
> @@ -696,12 +707,12 @@ static void ndisc_solicit(struct neighbo
>                    "%s: trying to ucast probe in NUD_INVALID: %pI6\n",
>                    __func__, target);
>          }
> -        ndisc_send_ns(dev, target, target, saddr);
> +        ndisc_send_ns(dev, target, target, saddr, 0);
>      } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) {
>          neigh_app_ns(neigh);
>      } else {
>          addrconf_addr_solict_mult(target, &mcaddr);
> -        ndisc_send_ns(dev, target, &mcaddr, saddr);
> +        ndisc_send_ns(dev, target, &mcaddr, saddr, 0);
>      }
>  }
> 
> @@ -745,6 +756,7 @@ static void ndisc_recv_ns(struct sk_buff
>      int dad = ipv6_addr_any(saddr);
>      bool inc;
>      int is_router = -1;
> +    u64 nonce;
> 
>      if (skb->len < sizeof(struct nd_msg)) {
>          ND_PRINTK(2, warn, "NS: packet too short\n");
> @@ -789,6 +801,8 @@ static void ndisc_recv_ns(struct sk_buff
>              return;
>          }
>      }
> +    if (ndopts.nd_opts_nonce)
> +        memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6);

You only initialize 6 bytes of the nonce, leaving the other 2
uninitialized.

> 
>      inc = ipv6_addr_is_multicast(daddr);
> 
> @@ -797,6 +811,16 @@ static void ndisc_recv_ns(struct sk_buff
>  have_ifp:
>          if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) {
>              if (dad) {
> +                if (nonce != 0 && ifp->dad_nonce == nonce) {
> +                    /* Matching nonce if looped back */
> +                    if (net_ratelimit())
> +                        ND_PRINTK(2, notice,
> +                              "%s: IPv6 DAD loopback for address %pI6c
> nonce %llu ignored\n",
> +                               ifp->idev->dev->name,
> +                               &ifp->addr,
> +                               nonce);

If we print the nonce for debugging purposes, the debug output should
show it in the same endianness as it has on the wire.

> +                    goto out;
> +                }
>                  /*
>                   * We are colliding with another node
>                   * who is doing DAD
> 

^ permalink raw reply

* Re: [PATCH net 1/3] net: phy: realtek: add eee advertisement disable options
From: Andrew Lunn @ 2016-11-15 16:30 UTC (permalink / raw)
  To: Jerome Brunet
  Cc: netdev, devicetree, Florian Fainelli, Carlo Caione, Kevin Hilman,
	Giuseppe Cavallaro, Alexandre TORGUE, Martin Blumenstingl,
	Andre Roth, Neil Armstrong, linux-amlogic, linux-arm-kernel,
	linux-kernel
In-Reply-To: <1479220154-25851-2-git-send-email-jbrunet@baylibre.com>

On Tue, Nov 15, 2016 at 03:29:12PM +0100, Jerome Brunet wrote:
> On some platforms, energy efficient ethernet with rtl8211 devices is
> causing issue, like throughput drop or broken link.
> 
> This was reported on the OdroidC2 (DWMAC + RTL8211F). While the issue root
> cause is not fully understood yet, disabling EEE advertisement prevent auto
> negotiation from enabling EEE.
> 
> This patch provides options to disable 1000T and 100TX EEE advertisement
> individually for the realtek phys supporting this feature.

Looking at the code, I don't see anything specific to RealTek
here. This all seems generic. So should it be in phy.c and made a
generic OF property which can be applied to any PHY which supports
EEE?

      Andrew

^ permalink raw reply

* Re: [PATCH net-next v3 3/7] vxlan: simplify exception handling
From: Pravin Shelar @ 2016-11-15 16:40 UTC (permalink / raw)
  To: Jiri Benc; +Cc: Linux Kernel Network Developers
In-Reply-To: <20161115153014.38fa2480@griffin>

On Tue, Nov 15, 2016 at 6:30 AM, Jiri Benc <jbenc@redhat.com> wrote:
> On Sun, 13 Nov 2016 20:43:54 -0800, Pravin B Shelar wrote:
>> @@ -1927,13 +1923,13 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>>       struct ip_tunnel_info *info;
>>       struct vxlan_dev *vxlan = netdev_priv(dev);
>>       struct sock *sk;
>> -     struct rtable *rt = NULL;
>>       const struct iphdr *old_iph;
>>       union vxlan_addr *dst;
>>       union vxlan_addr remote_ip, local_ip;
>>       union vxlan_addr *src;
>>       struct vxlan_metadata _md;
>>       struct vxlan_metadata *md = &_md;
>> +     struct dst_entry *ndst = NULL;
>>       __be16 src_port = 0, dst_port;
>>       __be32 vni, label;
>>       __be16 df = 0;
>> @@ -2009,6 +2005,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>>
>>       if (dst->sa.sa_family == AF_INET) {
>>               struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
>> +             struct rtable *rt;
>>
>>               if (!sock4)
>>                       goto drop;
>> @@ -2030,7 +2027,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>>                       netdev_dbg(dev, "circular route to %pI4\n",
>>                                  &dst->sin.sin_addr.s_addr);
>>                       dev->stats.collisions++;
>> -                     goto rt_tx_error;
>> +                     ip_rt_put(rt);
>> +                     goto tx_error;
>>               }
>>
>>               /* Bypass encapsulation if the destination is local */
>> @@ -2053,12 +2051,13 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>>               else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)
>>                       df = htons(IP_DF);
>>
>> +             ndst = &rt->dst;
>
> It would be a bit cleaner to do this assignment just after rt is
> assigned (but after the IS_ERR(rt) condition), get rid of the added
> ip_rt_put call above and move the existing ip_rt_put call in the bypass
> case just before the vxlan_encap_bypass call...
>
Does it really matter, given that the next patches in this series move
this duplicate code and do pretty much what you are describing?

>>               tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
>>               ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
>> -             err = vxlan_build_skb(skb, &rt->dst, sizeof(struct iphdr),
>> +             err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr),
>>                                     vni, md, flags, udp_sum);
>>               if (err < 0)
>> -                     goto xmit_tx_error;
>> +                     goto tx_error;
>>
>>               udp_tunnel_xmit_skb(rt, sk, skb, src->sin.sin_addr.s_addr,
>>                                   dst->sin.sin_addr.s_addr, tos, ttl, df,
>> @@ -2066,7 +2065,6 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>>  #if IS_ENABLED(CONFIG_IPV6)
>>       } else {
>>               struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
>> -             struct dst_entry *ndst;
>>               u32 rt6i_flags;
>>
>>               ndst = vxlan6_get_route(vxlan, sock6, skb,
>> @@ -2078,13 +2076,13 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>>                       netdev_dbg(dev, "no route to %pI6\n",
>>                                  &dst->sin6.sin6_addr);
>>                       dev->stats.tx_carrier_errors++;
>> +                     ndst = NULL;
>>                       goto tx_error;
>>               }
>>
>>               if (ndst->dev == dev) {
>>                       netdev_dbg(dev, "circular route to %pI6\n",
>>                                  &dst->sin6.sin6_addr);
>> -                     dst_release(ndst);
>>                       dev->stats.collisions++;
>>                       goto tx_error;
>>               }
>> @@ -2096,12 +2094,12 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
>>                   !(rt6i_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
>>                       struct vxlan_dev *dst_vxlan;
>>
>> -                     dst_release(ndst);
>>                       dst_vxlan = vxlan_find_vni(vxlan->net, vni,
>>                                                  dst->sa.sa_family, dst_port,
>>                                                  vxlan->flags);
>>                       if (!dst_vxlan)
>>                               goto tx_error;
>> +                     dst_release(ndst);
>>                       vxlan_encap_bypass(skb, vxlan, dst_vxlan);
>>                       return;
>>               }
>
> ...the same way you have it here, in the IPv6 part. Could you change
> the IPv4 part to match it?
>
Patch 5 does this by defining encap_bypass_if_local() for IPv4 and
IPv6 vxlan tunnels.

> Looks good otherwise. Seeing it, I like this version much more than v2.
>
> Thanks!
>
>  Jiri

^ permalink raw reply

* [net PATCH 0/2] ipv4: Fix memory leaks and reference issues in fib
From: Alexander Duyck @ 2016-11-15 10:46 UTC (permalink / raw)
  To: netdev; +Cc: davem

This series fixes one major issue and one minor issue in the fib tables.

The major issue is that we had lost the functionality that was flushing the
local table entries from main after we had unmerged the two tries.  In
order to regain the functionality I have performed a partial revert and
then moved the functionality for flushing the external entries from main
into fib_unmerge.

The minor issue was a memory leak that could occur in the event that we
weren't able to add an alias to the local trie resulting in the fib alias
being leaked.

---

Alexander Duyck (2):
      ipv4: Restore fib_trie_flush_external function and fix call ordering
      ipv4: Fix memory leak in exception case for splitting tries

 include/net/ip_fib.h    |    1 +
 net/ipv4/fib_frontend.c |   20 ++++++++++----
 net/ipv4/fib_trie.c     |   69 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 84 insertions(+), 6 deletions(-)

^ permalink raw reply

* [net PATCH 1/2] ipv4: Restore fib_trie_flush_external function and fix call ordering
From: Alexander Duyck @ 2016-11-15 10:46 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, Jiri Pirko, davem
In-Reply-To: <20161115104306.13711.67911.stgit@ahduyck-blue-test.jf.intel.com>

The patch that removed the FIB offload infrastructure was a bit too
aggressive and also removed code needed to clean up after splitting the
table when additional rules were added.  Specifically, the function
fib_trie_flush_external was called at the end of a new rule being added to
flush the foreign trie entries from the main trie.

I updated the code so that we only call fib_trie_flush_external on the main
table so that we flush the entries for local from main.  This way we don't
call it for every rule change which is what was happening previously.

Fixes: 347e3b28c1ba2 ("switchdev: remove FIB offload infrastructure")
Reported-by: Eric Dumazet <edumazet@google.com>
Cc: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 include/net/ip_fib.h    |    1 +
 net/ipv4/fib_frontend.c |   20 +++++++++++---
 net/ipv4/fib_trie.c     |   65 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+), 5 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index b9314b4..f390c3b 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -243,6 +243,7 @@ int fib_table_dump(struct fib_table *table, struct sk_buff *skb,
 		   struct netlink_callback *cb);
 int fib_table_flush(struct net *net, struct fib_table *table);
 struct fib_table *fib_trie_unmerge(struct fib_table *main_tb);
+void fib_table_flush_external(struct fib_table *table);
 void fib_free_table(struct fib_table *tb);
 
 #ifndef CONFIG_IP_MULTIPLE_TABLES
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index c3b8047..161fc0f 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -151,7 +151,7 @@ static void fib_replace_table(struct net *net, struct fib_table *old,
 
 int fib_unmerge(struct net *net)
 {
-	struct fib_table *old, *new;
+	struct fib_table *old, *new, *main_table;
 
 	/* attempt to fetch local table if it has been allocated */
 	old = fib_get_table(net, RT_TABLE_LOCAL);
@@ -162,11 +162,21 @@ int fib_unmerge(struct net *net)
 	if (!new)
 		return -ENOMEM;
 
+	/* table is already unmerged */
+	if (new == old)
+		return 0;
+
 	/* replace merged table with clean table */
-	if (new != old) {
-		fib_replace_table(net, old, new);
-		fib_free_table(old);
-	}
+	fib_replace_table(net, old, new);
+	fib_free_table(old);
+
+	/* attempt to fetch main table if it has been allocated */
+	main_table = fib_get_table(net, RT_TABLE_MAIN);
+	if (!main_table)
+		return 0;
+
+	/* flush local entries from main table */
+	fib_table_flush_external(main_table);
 
 	return 0;
 }
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 4cff74d..735edc9 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1760,6 +1760,71 @@ struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
 	return NULL;
 }
 
+/* Caller must hold RTNL */
+void fib_table_flush_external(struct fib_table *tb)
+{
+	struct trie *t = (struct trie *)tb->tb_data;
+	struct key_vector *pn = t->kv;
+	unsigned long cindex = 1;
+	struct hlist_node *tmp;
+	struct fib_alias *fa;
+
+	/* walk trie in reverse order */
+	for (;;) {
+		unsigned char slen = 0;
+		struct key_vector *n;
+
+		if (!(cindex--)) {
+			t_key pkey = pn->key;
+
+			/* cannot resize the trie vector */
+			if (IS_TRIE(pn))
+				break;
+
+			/* resize completed node */
+			pn = resize(t, pn);
+			cindex = get_index(pkey, pn);
+
+			continue;
+		}
+
+		/* grab the next available node */
+		n = get_child(pn, cindex);
+		if (!n)
+			continue;
+
+		if (IS_TNODE(n)) {
+			/* record pn and cindex for leaf walking */
+			pn = n;
+			cindex = 1ul << n->bits;
+
+			continue;
+		}
+
+		hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
+			/* if alias was cloned to local then we just
+			 * need to remove the local copy from main
+			 */
+			if (tb->tb_id != fa->tb_id) {
+				hlist_del_rcu(&fa->fa_list);
+				alias_free_mem_rcu(fa);
+				continue;
+			}
+
+			/* record local slen */
+			slen = fa->fa_slen;
+		}
+
+		/* update leaf slen */
+		n->slen = slen;
+
+		if (hlist_empty(&n->leaf)) {
+			put_child_root(pn, n->key, NULL);
+			node_free(n);
+		}
+	}
+}
+
 /* Caller must hold RTNL. */
 int fib_table_flush(struct net *net, struct fib_table *tb)
 {

^ permalink raw reply related

* [net PATCH 2/2] ipv4: Fix memory leak in exception case for splitting tries
From: Alexander Duyck @ 2016-11-15 10:46 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, davem
In-Reply-To: <20161115104306.13711.67911.stgit@ahduyck-blue-test.jf.intel.com>

Fix a small memory leak in fib_trie_unmerge(): when fib_insert_alias()
fails to insert the new fib_alias into the local table, free the alias
before bailing out instead of leaking it.

Fixes: 0ddcf43d5d4a0 ("ipv4: FIB Local/MAIN table collapse")
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
---
 net/ipv4/fib_trie.c |    4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 735edc9..026f309 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1743,8 +1743,10 @@ struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
 				local_l = fib_find_node(lt, &local_tp, l->key);
 
 			if (fib_insert_alias(lt, local_tp, local_l, new_fa,
-					     NULL, l->key))
+					     NULL, l->key)) {
+				kmem_cache_free(fn_alias_kmem, new_fa);
 				goto out;
+			}
 		}
 
 		/* stop loop if key wrapped back to 0 */

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox