Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net] ipv4/igmp: fix v1/v2 switchback timeout based on rfc3376, 8.12
From: Hangbin Liu @ 2018-10-26  3:30 UTC (permalink / raw)
  To: netdev
  Cc: Daniel Borkmann, David S. Miller, Hannes Frederic Sowa, Cong Wang,
	Hangbin Liu

Similar to ipv6 mcast commit 89225d1ce6af3 ("net: ipv6: mld: fix v1/v2
switchback timeout to rfc3810, 9.12.")

i) RFC3376 8.12. Older Version Querier Present Timeout says:

   The Older Version Querier Interval is the time-out for transitioning
   a host back to IGMPv3 mode once an older version query is heard.
   When an older version query is received, hosts set their Older
   Version Querier Present Timer to Older Version Querier Interval.

   This value MUST be ((the Robustness Variable) times (the Query
   Interval in the last Query received)) plus (one Query Response
   Interval).

Currently we only use a hardcoded value IGMP_V1/v2_ROUTER_PRESENT_TIMEOUT.
Fix it by adding two new items mr_qi(Query Interval) and mr_qri(Query Response
Interval) in struct in_device.

Now we can calculate the switchback time via (mr_qrv * mr_qi) + mr_qri.
We need to update these values when receiving IGMPv3 queries.

Reported-by: Ying Xu <yinxu@redhat.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
---
 include/linux/inetdevice.h |  4 +++-
 net/ipv4/igmp.c            | 53 +++++++++++++++++++++++++++++++---------------
 2 files changed, 39 insertions(+), 18 deletions(-)

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index c759d1c..a64f21a 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -37,7 +37,9 @@ struct in_device {
 	unsigned long		mr_v1_seen;
 	unsigned long		mr_v2_seen;
 	unsigned long		mr_maxdelay;
-	unsigned char		mr_qrv;
+	unsigned long		mr_qi;		/* Query Interval */
+	unsigned long		mr_qri;		/* Query Response Interval */
+	unsigned char		mr_qrv;		/* Query Robustness Variable */
 	unsigned char		mr_gq_running;
 	unsigned char		mr_ifc_count;
 	struct timer_list	mr_gq_timer;	/* general query timer */
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 4da3944..765b2b3 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -111,13 +111,10 @@
 #ifdef CONFIG_IP_MULTICAST
 /* Parameter names and values are taken from igmp-v2-06 draft */
 
-#define IGMP_V1_ROUTER_PRESENT_TIMEOUT		(400*HZ)
-#define IGMP_V2_ROUTER_PRESENT_TIMEOUT		(400*HZ)
 #define IGMP_V2_UNSOLICITED_REPORT_INTERVAL	(10*HZ)
 #define IGMP_V3_UNSOLICITED_REPORT_INTERVAL	(1*HZ)
+#define IGMP_QUERY_INTERVAL			(125*HZ)
 #define IGMP_QUERY_RESPONSE_INTERVAL		(10*HZ)
-#define IGMP_QUERY_ROBUSTNESS_VARIABLE		2
-
 
 #define IGMP_INITIAL_REPORT_DELAY		(1)
 
@@ -935,13 +932,15 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 
 			max_delay = IGMP_QUERY_RESPONSE_INTERVAL;
 			in_dev->mr_v1_seen = jiffies +
-				IGMP_V1_ROUTER_PRESENT_TIMEOUT;
+				(in_dev->mr_qrv * in_dev->mr_qi) +
+				in_dev->mr_qri;
 			group = 0;
 		} else {
 			/* v2 router present */
 			max_delay = ih->code*(HZ/IGMP_TIMER_SCALE);
 			in_dev->mr_v2_seen = jiffies +
-				IGMP_V2_ROUTER_PRESENT_TIMEOUT;
+				(in_dev->mr_qrv * in_dev->mr_qi) +
+				in_dev->mr_qri;
 		}
 		/* cancel the interface change timer */
 		in_dev->mr_ifc_count = 0;
@@ -981,8 +980,21 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 		if (!max_delay)
 			max_delay = 1;	/* can't mod w/ 0 */
 		in_dev->mr_maxdelay = max_delay;
-		if (ih3->qrv)
-			in_dev->mr_qrv = ih3->qrv;
+
+		/* RFC3376, 4.1.6. QRV and 4.1.7. QQIC, when the most recently
+		 * received value was zero, use the default or statically
+		 * configured value.
+		 */
+		in_dev->mr_qrv = ih3->qrv ?: net->ipv4.sysctl_igmp_qrv;
+		in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL;
+
+		/* RFC3376, 8.3. Query Response Interval:
+		 * The number of seconds represented by the [Query Response
+		 * Interval] must be less than the [Query Interval].
+		 */
+		if (in_dev->mr_qri >= in_dev->mr_qi)
+			in_dev->mr_qri = (in_dev->mr_qi/HZ - 1)*HZ;
+
 		if (!group) { /* general query */
 			if (ih3->nsrcs)
 				return true;	/* no sources allowed */
@@ -1723,18 +1735,30 @@ void ip_mc_down(struct in_device *in_dev)
 	ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);
 }
 
-void ip_mc_init_dev(struct in_device *in_dev)
-{
 #ifdef CONFIG_IP_MULTICAST
+static void ip_mc_reset(struct in_device *in_dev)
+{
 	struct net *net = dev_net(in_dev->dev);
+
+	in_dev->mr_qi = IGMP_QUERY_INTERVAL;
+	in_dev->mr_qri = IGMP_QUERY_RESPONSE_INTERVAL;
+	in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv;
+}
+#else
+static void ip_mc_reset(struct in_device *in_dev)
+{
+}
 #endif
+
+void ip_mc_init_dev(struct in_device *in_dev)
+{
 	ASSERT_RTNL();
 
 #ifdef CONFIG_IP_MULTICAST
 	timer_setup(&in_dev->mr_gq_timer, igmp_gq_timer_expire, 0);
 	timer_setup(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, 0);
-	in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv;
 #endif
+	ip_mc_reset(in_dev);
 
 	spin_lock_init(&in_dev->mc_tomb_lock);
 }
@@ -1744,15 +1768,10 @@ void ip_mc_init_dev(struct in_device *in_dev)
 void ip_mc_up(struct in_device *in_dev)
 {
 	struct ip_mc_list *pmc;
-#ifdef CONFIG_IP_MULTICAST
-	struct net *net = dev_net(in_dev->dev);
-#endif
 
 	ASSERT_RTNL();
 
-#ifdef CONFIG_IP_MULTICAST
-	in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv;
-#endif
+	ip_mc_reset(in_dev);
 	ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
 
 	for_each_pmc_rtnl(in_dev, pmc) {
-- 
2.5.5

^ permalink raw reply related

* Re: Regression: kernel 4.14 an later very slow with many ipsec tunnels
From: Wolfgang Walter @ 2018-10-26 12:18 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, fw, steffen.klassert, linux-kernel, torvalds,
	christophe.gouault, gregkh
In-Reply-To: <20181025.103450.1966639999117342457.davem@davemloft.net>

Am Donnerstag, 25. Oktober 2018, 10:34:50 schrieb David Miller:
> From: Wolfgang Walter <linux@stwm.de>
> Date: Thu, 25 Oct 2018 11:38:19 +0200
> 
> > there is now a new 4.19 which still has the big performance regression
> > when
> > many ipsec tunnels are configured (throughput and latency get worse by 10
> > to 50 times) which makes any kernel > 4.9 unusable for our routers.
> > 
> > I still don't understand why a revert of the flow cache removal at least
> > for the longterm kernels is that a bad option (maybe as a compile time
> > option), especially as there is no workaround available.
> 
> You do know that the flow cache is DDoS targettable, right?
> 
> That's why we removed it, we did not make the change lightly.

Though this is true, we now have simply a permanent DDoS situation. The 
removal of the flow cache leads to the situation so that with enough ipsec-
tunnels you are now always as bad as you would have been prior under a DDoS 
attack.

This is not comparable to the routing cache situation where a fast, well 
tested solution already existed (for routes in a table; if you use a lot of 
rules for policy routing this may be a different story).

Further I don't think that the DoS is that strong an argument for the removal of 
the routing cache if the routing performance would have dropped 10 times and 
more.

Also, the routing cache was even a problem with legitimate traffic, so I never 
had a problem with the moderate performance regression it caused here.

> 
> Adding a DDoS vector back into the kernel is not an option sorry.

All kernels >= 4.14 are in our use case as bad as if they were under attack. 
They are completely unusable and I even can't 

> 
> Please work diligently with Florian and others to try and find ways to
> soften the performance hit.
> 

I proposed to revert this for the longterm kernels and only depending on a 
compile time option which explicitly had to be switched on. Then we could 
start using 4.19. People not using ipsec or who use it only with < 100 rules 
would still live without flow cache.

Regards,
-- 
Wolfgang Walter
Studentenwerk München
Anstalt des öffentlichen Rechts

^ permalink raw reply

* Re: ethernet "bus" number in DTS ?
From: Joakim Tjernlund @ 2018-10-26 13:00 UTC (permalink / raw)
  To: f.fainelli@gmail.com, msuchanek@suse.de
  Cc: netdev@vger.kernel.org, linuxppc-dev@lists.ozlabs.org,
	andrew@lunn.ch
In-Reply-To: <20181024082239.5ee41017@naga.suse.cz>

On Wed, 2018-10-24 at 08:22 +0200, Michal Suchánek wrote:
> CAUTION: This email originated from outside of the organization. Do not click links or open attachments unless you recognize the sender and know the content is safe.
> 
> 
> On Tue, 23 Oct 2018 11:20:36 -0700
> Florian Fainelli <f.fainelli@gmail.com> wrote:
> 
> > On 10/23/18 11:02 AM, Joakim Tjernlund wrote:
> > > On Tue, 2018-10-23 at 10:03 -0700, Florian Fainelli wrote:
> > > I also noted that using status = "disabled" didn't work either to
> > > create a fix name scheme. Even worse, all the eth I/F after gets
> > > renumbered. It seems to me there is value in having stability in
> > > eth I/F naming at boot. Then userspace(udev) can rename if need be.
> > > 
> > > Sure would like to know more about why this feature is not wanted ?
> > > 
> > > I found
> > >   https://patchwork.kernel.org/patch/4122441/
> > > You quote policy as reason but surely it must be better to
> > > have something stable, connected to the hardware name, than
> > > semirandom naming?
> > 
> > If the Device Tree nodes are ordered by ascending base register
> > address, my understanding is that you get the same order as far as
> > platform_device creation goes, this may not be true in the future if
> > Rob decides to randomize that, but AFAICT this is still true. This
> > may not work well with status = disabled properties being inserted
> > here and there, but we have used that here and it has worked for as
> > far as I can remember doing it.
> 
> So this is unstable in several respects. First is changing the
> enabled/disabled status in the devicetrees provided by the kernel.
> 
> Second is if you have hardware hotplug mechanism either by firmware or
> by loading device overlays.
> 
> Third is the case when the devicetree is not built as part of the
> kernel but is instead provided by firmware that initializes the
> low-level hardware details. Then the ordering by address is not
> guaranteed nor is that the same address will be used to access the same
> interface every time. There might be multiple ways to configure the
> hardware depending on firmware configuration and/or version.
> 
> 
> > Second, you might want to name network devices ethX, but what if I
> > want to name them ethernetX or fooX or barX? Should we be accepting a
> > mechanism in the kernel that would allow someone to name the
> > interfaces the way they want straight from a name being provided in
> > Device Tree?

Just to be clear, I am saying that we don't need to control the full
name of the Ethernet device, just the numerical id so one can tie eth0
to a fixed physical device.

> 
> Clearly if there is text Ethernet1 printed above the Ethernet port we
> should provide a mechanism to name the port Ethernet1 by default.
> 
> > Aliases are fine for providing relative stability within the Device
> > Tree itself and boot programs that might need to modify the Device
> > Tree (e.g: inserting MAC addresses) such that you don't have to
> > encode logic to search for nodes by compatible strings etc. but
> > outside of that use case, it seems to me that you can resolve every
> > naming decision in user-space.
> 
> However, this is pushing platform-specific knowledge to userspace. The
> way the Ethernet interface is attached and hence the device properties
> usable for identifying the device uniquely are platform-specific.
> 
> On the other hand, aliases are universal when provided. If they are
> good enough to assign a MAC address they are good enough to provide a
> stable default name.
> 
> I think this is indeed forcing the userspace to reinvent several wheels
> for no good reason.
> 
> What is the problem with adding the aliases?

Well put above, thanks.

   Jocke

^ permalink raw reply

* I NEED YOUR URGENT REPLY.
From: MONICA BROWN @ 2018-10-26  4:26 UTC (permalink / raw)


Dear friend
I am contacting you for a business deal of $7.5 Million US Dollars,
ready for transfer into your account
if we make this claim, we will share it 60%/40%.
100% risk free and it will be legally backed up with government
approved If you are interested reply for more details.

Best regards,
Please kindly reply to my alternative email address below
mb147844@gmail.com
MONICA BROWN

^ permalink raw reply

* Re: [PATCH net-next v8 28/28] net: WireGuard secure network tunnel
From: Theodore Y. Ts'o @ 2018-10-26 13:09 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Andrew Lunn, LKML, Netdev, Linux Crypto Mailing List,
	David Miller, Greg Kroah-Hartman
In-Reply-To: <CAHmME9pf=tCd=RKPTCpYuHLPkYqQwwr3LXEHP+_WXJ7DNjueZQ@mail.gmail.com>

On Fri, Oct 26, 2018 at 01:47:21AM +0200, Jason A. Donenfeld wrote:
> when it goes to sleep (screen blanking, wakelocks, etc). The Android
> model of Linux revolves around this, and hence the suspend semantics
> for WireGuard respect this model and adjust accordingly, using the
> appropriate CONFIG_ANDROID to determine which model we're operating
> under. This is not a bandaid, and it doesn't have to do with forks of
> the Linux kernel.

If that's what you are trying to conditionalize, why don't use
CONFIG_PM_AUTOSLEEP?  That way if there are other systems that want to
use the Android wakelocks style of suspend management, your code will
DTRT, as opposed to depending on CONFIG_ANDROID.

						- Ted

^ permalink raw reply

* Re: [PATCH 4.9 50/71] inet: frags: use rhashtables for reassembly units
From: Stefan Schmidt @ 2018-10-26 13:39 UTC (permalink / raw)
  To: Greg Kroah-Hartman, linux-kernel, netdev
  Cc: stable, Eric Dumazet, Kirill Tkhai, Herbert Xu, Florian Westphal,
	Jesper Dangaard Brouer, Alexander Aring, Stefan Schmidt,
	David S. Miller
In-Reply-To: <20181016170541.874459615@linuxfoundation.org>

Hello Greg.

[Hope I am not too late for this]

On 16/10/2018 19:09, Greg Kroah-Hartman wrote:
> 4.9-stable review patch.  If anyone has any objections, please let me know.
> 
> ------------------
> 
> From: Eric Dumazet <edumazet@google.com>
> 
> Some applications still rely on IP fragmentation, and to be fair linux
> reassembly unit is not working under any serious load.
> 
> It uses static hash tables of 1024 buckets, and up to 128 items per bucket (!!!)
> 
> A work queue is supposed to garbage collect items when host is under memory
> pressure, and doing a hash rebuild, changing seed used in hash computations.
> 
> This work queue blocks softirqs for up to 25 ms when doing a hash rebuild,
> occurring every 5 seconds if host is under fire.
> 
> Then there is the problem of sharing this hash table for all netns.
> 
> It is time to switch to rhashtables, and allocate one of them per netns
> to speedup netns dismantle, since this is a critical metric these days.
> 
> Lookup is now using RCU. A followup patch will even remove
> the refcount hold/release left from prior implementation and save
> a couple of atomic operations.
> 
> Before this patch, 16 cpus (16 RX queue NIC) could not handle more
> than 1 Mpps frags DDOS.
> 
> After the patch, I reach 9 Mpps without any tuning, and can use up to 2GB
> of storage for the fragments (exact number depends on frags being evicted
> after timeout)
> 
> $ grep FRAG /proc/net/sockstat
> FRAG: inuse 1966916 memory 2140004608
> 
> A followup patch will change the limits for 64bit arches.
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
> Cc: Herbert Xu <herbert@gondor.apana.org.au>
> Cc: Florian Westphal <fw@strlen.de>
> Cc: Jesper Dangaard Brouer <brouer@redhat.com>
> Cc: Alexander Aring <alex.aring@gmail.com>
> Cc: Stefan Schmidt <stefan@osg.samsung.com>
> Signed-off-by: David S. Miller <davem@davemloft.net>
> (cherry picked from commit 648700f76b03b7e8149d13cc2bdb3355035258a9)
> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> ---
>  Documentation/networking/ip-sysctl.txt  |    7 
>  include/net/inet_frag.h                 |   81 +++----
>  include/net/ipv6.h                      |   16 -
>  net/ieee802154/6lowpan/6lowpan_i.h      |   26 --
>  net/ieee802154/6lowpan/reassembly.c     |   91 +++-----
>  net/ipv4/inet_fragment.c                |  349 ++++++--------------------------
>  net/ipv4/ip_fragment.c                  |  112 ++++------
>  net/ipv6/netfilter/nf_conntrack_reasm.c |   51 +---
>  net/ipv6/reassembly.c                   |  110 ++++------
>  9 files changed, 267 insertions(+), 576 deletions(-)
> 

When this patch hit master a while back we had to address a regression
in the ieee802154 6lowpan layer. It seems this fix is missing in the
backport series (only looking at your patchset here, not the full tree).

https://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git/commit/?id=f18fa5de5ba7f1d6650951502bb96a6e4715a948

I would appreciate if you could pull this into this series as well.

regards
Stefan Schmidt

^ permalink raw reply

* Re: [PATCH net-next v8 28/28] net: WireGuard secure network tunnel
From: Jason A. Donenfeld @ 2018-10-26 14:38 UTC (permalink / raw)
  To: Theodore Ts'o, Andrew Lunn, LKML, Netdev,
	Linux Crypto Mailing List, David Miller, Greg Kroah-Hartman
In-Reply-To: <20181026130914.GA8279@thunk.org>

Hey Ted,

On Fri, Oct 26, 2018 at 3:09 PM Theodore Y. Ts'o <tytso@mit.edu> wrote:
> If that's what you are trying to conditionalize, why don't use
> CONFIG_PM_AUTOSLEEP?  That way if there are other systems that want to
> use the Android wakelocks style of suspend management, your code will
> DTRT, as opposed to depending on CONFIG_ANDROID.

That's a terrific idea; thanks for the suggestion. I'll make that
change for the next patchset version.

Jason

^ permalink raw reply

* Re: [PATCH net-next] net/ipv6: Block IPv6 addrconf on team ports
From: Jiri Pirko @ 2018-10-26  6:01 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: Chas Williams, davem, netdev, vfalico, andy, kuznet, yoshfuji
In-Reply-To: <26415.1540507231@famine>

Fri, Oct 26, 2018 at 12:40:31AM CEST, jay.vosburgh@canonical.com wrote:
>Chas Williams <3chas3@gmail.com> wrote:
>
>>On 10/25/2018 05:59 PM, Jay Vosburgh wrote:
>>> Chas Williams <3chas3@gmail.com> wrote:
>>>
>>>> netif_is_lag_port should be used to identify link aggregation ports.
>>>> For this to work, we need to reorganize the bonding and team drivers
>>>> so that the necessary flags are set before dev_open is called.
>>>>
>>>> commit 31e77c93e432 ("sched/fair: Update blocked load when newly idle")
>>>> made this decision originally based on the IFF_SLAVE flag which isn't
>>>> used by the team driver.  Note, we do need to retain the IFF_SLAVE
>>>> check for the eql driver.
>>>
>>> 	Is 31e77c93e432 the correct commit reference?  I don't see
>>> anything in there about IFF_SLAVE or bonding; it's a patch to the
>>> process scheduler.
>>
>>No, that's wrong.  It should be c2edacf80e155.
>>
>>> 	And, as Jiri said, the subject doesn't mention bonding.
>>
>>The behavior of bonding wasn't changed.  The intent of the patch
>>is to add team slaves to the interfaces that don't get automatic
>>IPv6 addresses.  The body discusses why bonding had to change as
>>well.
>
>	Sure, but the bonding code has changed, and the current
>presentation makes it harder for reviewers to follow (or perhaps even
>notice).
>
>>I was under the impression that the subject needs to kept short.
>>If there a better way to phrase what I want to do?
>
>	I'd suggest splitting this into three patches: A first patch
>that adds the new IPv6 functionality, then one patch each for team and
>bonding to take advantage of that new functionality.  Each of the three
>would then be very straightforward, change just one thing, and should be
>clearer all around.

+1

^ permalink raw reply

* Re: [PATCH RFC] net: dsa: Make switches VLAN aware when enslaved into a bridge
From: Ido Schimmel @ 2018-10-26 15:10 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: netdev@vger.kernel.org, Jiri Pirko, Petr Machata,
	privat@egil-hjelmeland.no, Woojung.Huh@microchip.com,
	tristram.ha@microchip.com, Andrew Lunn, Vivien Didelot,
	David S. Miller, open list
In-Reply-To: <20181024193657.24012-1-f.fainelli@gmail.com>

On Wed, Oct 24, 2018 at 12:36:57PM -0700, Florian Fainelli wrote:
> Commit 2ea7a679ca2a ("net: dsa: Don't add vlans when vlan filtering is
> disabled") changed the behavior of DSA switches when the switch ports
> are enslaved into the bridge and only pushed the VLAN configuration down
> to the switch if the bridge is configured with VLAN filtering enabled.

This is what mlxsw is doing.

> This is unfortunately wrong, because what vlan_filtering configures is a
> policy on the acceptance of VLAN tagged frames with an unknown VID.
> 
> vlan_filtering=0 means a frame with a VLAN tag that is not part of the
> VLAN table should be allowed to ingress the switch, and vlan_filtering=1
> would reject that frame.

While you correctly describe the logic, this is not how VLAN-unaware
bridges are actually used. The expectation is that packets will be
untagged when entering the bridge. Either because they are truly
untagged or because they were untagged by a VLAN netdev.

For a long time we rejected the enslavement of physical ports to
VLAN-unaware bridges and only allowed VLAN netdevs to be enslaved. In
order to support the logic you described, we would need to map all 4K
VLANs on each port to 4K different FIDs. In addition, each FDB entry
would need to be programmed 4K times, each time with a different FID.
This is because FDB lookup is performed using {MAC, FID} and not only
MAC. I can go into more details about why we cannot map different VLANs
on a port to the same FID, but I do not think it is pertinent to our
discussion.

Eventually, users started complaining about this constraint and we
relaxed it in commit 65b53bfd497b ("mlxsw: spectrum_switchdev: Allow
port enslavement to a VLAN-unaware bridge").

P.S. Corrected Petr's mail address.

^ permalink raw reply

* Re: [PATCH net] bridge: do not add port to router list when receives query with source 0.0.0.0
From: Nikolay Aleksandrov @ 2018-10-26  7:27 UTC (permalink / raw)
  To: Hangbin Liu, netdev
  Cc: Jiri Pirko, Linus Lüssing, David S. Miller, bridge,
	Roopa Prabhu
In-Reply-To: <1540520923-17589-1-git-send-email-liuhangbin@gmail.com>

On 26/10/2018 05:28, Hangbin Liu wrote:
> Based on RFC 4541, 2.1.1.  IGMP Forwarding Rules
> 
>   The switch supporting IGMP snooping must maintain a list of
>   multicast routers and the ports on which they are attached.  This
>   list can be constructed in any combination of the following ways:
> 
>   a) This list should be built by the snooping switch sending
>      Multicast Router Solicitation messages as described in IGMP
>      Multicast Router Discovery [MRDISC].  It may also snoop
>      Multicast Router Advertisement messages sent by and to other
>      nodes.
> 
>   b) The arrival port for IGMP Queries (sent by multicast routers)
>      where the source address is not 0.0.0.0.
> 
> We should not add the port to router list when receives query with source
> 0.0.0.0.
> 
> Reported-by: Ying Xu <yinxu@redhat.com>
> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
> ---
>  net/bridge/br_multicast.c | 10 +++++++++-
>  1 file changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
> index 024139b..41cdafb 100644
> --- a/net/bridge/br_multicast.c
> +++ b/net/bridge/br_multicast.c
> @@ -1422,7 +1422,15 @@ static void br_multicast_query_received(struct net_bridge *br,
>  		return;
>  
>  	br_multicast_update_query_timer(br, query, max_delay);
> -	br_multicast_mark_router(br, port);
> +
> +	/* Based on RFC4541, section 2.1.1 IGMP Forwarding Rules,
> +	 * the arrival port for IGMP Queries where the source address
> +	 * is 0.0.0.0 should not be added to router port list.
> +	 */
> +	if ((saddr->proto == htons(ETH_P_IP) && saddr->u.ip4) ||
> +	    (saddr->proto == htons(ETH_P_IPV6) &&
> +	     !ipv6_addr_any(&saddr->u.ip6)))
> +		br_multicast_mark_router(br, port);
>  }
>  
>  static void br_ip4_multicast_query(struct net_bridge *br,
> 

+CC Roopa & bridge@lists.linux-foundation.org

^ permalink raw reply

* KASAN: slab-out-of-bounds Read in sctp_getsockopt
From: syzbot @ 2018-10-26 16:38 UTC (permalink / raw)
  To: davem, linux-kernel, linux-sctp, marcelo.leitner, netdev, nhorman,
	syzkaller-bugs, vyasevich

Hello,

syzbot found the following crash on:

HEAD commit:    bd6bf7c10484 Merge tag 'pci-v4.20-changes' of git://git.ke..
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=16fd6bcb400000
kernel config:  https://syzkaller.appspot.com/x/.config?x=2dd8629d56664133
dashboard link: https://syzkaller.appspot.com/bug?extid=5da0d0a72a9e7d791748
compiler:       gcc (GCC) 8.0.1 20180413 (experimental)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=16b3ea33400000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=17f9f1bd400000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+5da0d0a72a9e7d791748@syzkaller.appspotmail.com

==================================================================
BUG: KASAN: slab-out-of-bounds in sctp_getsockopt_pr_streamstatus  
net/sctp/socket.c:7174 [inline]
BUG: KASAN: slab-out-of-bounds in sctp_getsockopt+0x7516/0x7cc2  
net/sctp/socket.c:7582
Read of size 8 at addr ffff8801d89f0968 by task syz-executor278/5330

CPU: 1 PID: 5330 Comm: syz-executor278 Not tainted 4.19.0+ #303
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Call Trace:
  __dump_stack lib/dump_stack.c:77 [inline]
  dump_stack+0x244/0x39d lib/dump_stack.c:113
  print_address_description.cold.7+0x9/0x1ff mm/kasan/report.c:256
  kasan_report_error mm/kasan/report.c:354 [inline]
  kasan_report.cold.8+0x242/0x309 mm/kasan/report.c:412
  __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433
  sctp_getsockopt_pr_streamstatus net/sctp/socket.c:7174 [inline]
  sctp_getsockopt+0x7516/0x7cc2 net/sctp/socket.c:7582
  sock_common_getsockopt+0x9a/0xe0 net/core/sock.c:2937
  __sys_getsockopt+0x1ad/0x390 net/socket.c:1939
  __do_sys_getsockopt net/socket.c:1950 [inline]
  __se_sys_getsockopt net/socket.c:1947 [inline]
  __x64_sys_getsockopt+0xbe/0x150 net/socket.c:1947
  do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x445789
Code: e8 6c b6 02 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7  
48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff  
ff 0f 83 2b 12 fc ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007effdb293db8 EFLAGS: 00000246 ORIG_RAX: 0000000000000037
RAX: ffffffffffffffda RBX: 00000000006dac48 RCX: 0000000000445789
RDX: 0000000000000074 RSI: 0000000000000084 RDI: 0000000000000003
RBP: 00000000006dac40 R08: 0000000020000040 R09: 0000000000000000
R10: 0000000020000080 R11: 0000000000000246 R12: 00000000006dac4c
R13: 00007ffcfc408c6f R14: 00007effdb2949c0 R15: 00000000006dad2c

Allocated by task 5329:
  save_stack+0x43/0xd0 mm/kasan/kasan.c:448
  set_track mm/kasan/kasan.c:460 [inline]
  kasan_kmalloc+0xc7/0xe0 mm/kasan/kasan.c:553
  kmem_cache_alloc_trace+0x152/0x750 mm/slab.c:3620
  kmalloc include/linux/slab.h:513 [inline]
  kzalloc include/linux/slab.h:707 [inline]
  sctp_stream_init_ext+0x4f/0xf0 net/sctp/stream.c:237
  sctp_sendmsg_to_asoc+0x1308/0x1a20 net/sctp/socket.c:1896
  sctp_sendmsg+0x13c2/0x1da0 net/sctp/socket.c:2113
  inet_sendmsg+0x1a1/0x690 net/ipv4/af_inet.c:798
  sock_sendmsg_nosec net/socket.c:621 [inline]
  sock_sendmsg+0xd5/0x120 net/socket.c:631
  __sys_sendto+0x3d7/0x670 net/socket.c:1788
  __do_sys_sendto net/socket.c:1800 [inline]
  __se_sys_sendto net/socket.c:1796 [inline]
  __x64_sys_sendto+0xe1/0x1a0 net/socket.c:1796
  do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
  entry_SYSCALL_64_after_hwframe+0x49/0xbe

Freed by task 3223:
  save_stack+0x43/0xd0 mm/kasan/kasan.c:448
  set_track mm/kasan/kasan.c:460 [inline]
  __kasan_slab_free+0x102/0x150 mm/kasan/kasan.c:521
  kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
  __cache_free mm/slab.c:3498 [inline]
  kfree+0xcf/0x230 mm/slab.c:3813
  kzfree+0x28/0x30 mm/slab_common.c:1543
  aa_free_file_ctx security/apparmor/include/file.h:76 [inline]
  apparmor_file_free_security+0x133/0x1a0 security/apparmor/lsm.c:448
  security_file_free+0x4a/0x80 security/security.c:900
  file_free fs/file_table.c:54 [inline]
  __fput+0x4e8/0xa30 fs/file_table.c:294
  ____fput+0x15/0x20 fs/file_table.c:309
  task_work_run+0x1e8/0x2a0 kernel/task_work.c:113
  tracehook_notify_resume include/linux/tracehook.h:188 [inline]
  exit_to_usermode_loop+0x318/0x380 arch/x86/entry/common.c:166
  prepare_exit_to_usermode arch/x86/entry/common.c:197 [inline]
  syscall_return_slowpath arch/x86/entry/common.c:268 [inline]
  do_syscall_64+0x6be/0x820 arch/x86/entry/common.c:293
  entry_SYSCALL_64_after_hwframe+0x49/0xbe

The buggy address belongs to the object at ffff8801d89f0900
  which belongs to the cache kmalloc-96 of size 96
The buggy address is located 8 bytes to the right of
  96-byte region [ffff8801d89f0900, ffff8801d89f0960)
The buggy address belongs to the page:
page:ffffea0007627c00 count:1 mapcount:0 mapping:ffff8801da8004c0 index:0x0
flags: 0x2fffc0000000100(slab)
raw: 02fffc0000000100 ffffea0007646748 ffffea0007613488 ffff8801da8004c0
raw: 0000000000000000 ffff8801d89f0000 0000000100000020 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
  ffff8801d89f0800: fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc
  ffff8801d89f0880: fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc
> ffff8801d89f0900: 00 00 00 00 00 00 00 00 00 00 00 fc fc fc fc fc
                                                           ^
  ffff8801d89f0980: fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc
  ffff8801d89f0a00: fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc
==================================================================


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with  
syzbot.
syzbot can test patches for this bug, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* Re: [RFC] net: stmmac: RX Jumbo packet size > 8191 problem
From: Jose Abreu @ 2018-10-26  8:27 UTC (permalink / raw)
  To: thor.thayer, Giuseppe CAVALLARO, alexandre.torgue, jose.abreu,
	netdev
In-Reply-To: <25eeec13-8aed-b715-2f06-54dbf04825d4@linux.intel.com>



On 25-10-2018 21:41, Thor Thayer wrote:
> Hi,
>
> I'm running into a weird issue at the DMA boundary for large
> packets (>8192) that I can't explain.  I'm hoping someone here
> has an idea on why I'm seeing this issue.
>
> This is the Synopsys DesignWare Ethernet GMAC core (3.74) using
> the stmmac driver found at drivers/net/ethernet/stmicro/stmmac.
>
> If I ping with data sizes that exceed the first DMA buffer size
> (size set to 8191), ping reports a data mismatch as follows at
> byte #8144:
>
> $ ping -c 1 -M do -s 8150 192.168.1.99
> PING 192.168.1.99 (192.168.1.99) 8150(8178) bytes of data.
> 8158 bytes from 192.168.1.99: icmp_seq=1 ttl=64 time=0.669 ms
> wrong data byte #8144 should be 0xd0 but was 0x0
> #16    10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 20 21 22
> 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f
> %< ---------------snip--------------------------------------
> #8112    b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf c0 c1
> c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf
> #8144    0 0 0 0 d0 d1
>         ^^^^^^^
> Notice the 4 bytes of 0 there before the expected byte of d0. I
> confirmed the on-wire result with wireshark - same data packet
> as shown above.
>
> Looking at the queue, I'm seeing these values in the RX
> descriptors (I'm using ring mode, enhanced descriptors).
> 0xa0040320 0x9fff1fff 0x7a358042 0x7a35a042
>  ^des0      ^des1      ^des2      ^desc3
>
> desc0 => 8196 bytes, OWN, First & Last Descriptor, Frame type =
> Eth
> desc1 => Disable IRQ on done, Rx Buffer2 sz = 8191, Rx Buffer1
> sz = 8191
> desc2 => Buffer 1 Addr Pointer
> desc3 => Buffer 2 Addr Pointer
>
> If I adjust init_desc3() and refill_desc3() to initialize desc3
> to desc2+BUF_SIZE_8KiB-4, I get a descriptor as show below and
> ping completes successfully.
> 0xa0040320 0x9fff1fff 0x77df8042 0x77dfa03e
>                                   ^ this is now different
>
> But I'm not sure why the -4 works because desc3 overlaps into
> the end of the first DMA buffer area (des2) which is
> counterintuitive.

By databook you have to set buffer size as multiple of bus width
but you are setting 8191 so this is not correct.

Can you try changing ehn_desc_rx_set_on_ring() and remove the
subtraction, as well as in enh_desc_init_rx_desc() ?

Thanks and Best Regards,
Jose Miguel Abreu

^ permalink raw reply

* Re: [PATCH net-next] net/ncsi: Add NCSI Mellanox OEM command
From: Vijay Khemka @ 2018-10-26 17:19 UTC (permalink / raw)
  To: David Miller
  Cc: sam@mendozajonas.com, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, openbmc@lists.ozlabs.org,
	Justin.Lee1@Dell.com, joel@jms.id.au,
	linux-aspeed@lists.ozlabs.org
In-Reply-To: <20181025.155335.2132843393553452471.davem@davemloft.net>

Thanks David,
Do you have a timeline for when it is going to open next, or how can I find out?

Regards
-Vijay

On 10/25/18, 3:54 PM, "David Miller" <davem@davemloft.net> wrote:

    From: Vijay Khemka <vijaykhemka@fb.com>
    Date: Thu, 25 Oct 2018 15:04:13 -0700
    
    > This patch adds OEM Mellanox commands and response handling. It also
    > defines OEM Get MAC Address handler to get and configure the device.
    > 
    > ncsi_oem_gma_handler_mlx: This handler send NCSI mellanox command for
    > getting mac address.
    > ncsi_rsp_handler_oem_mlx: This handles response received for all
    > mellanox OEM commands.
    > ncsi_rsp_handler_oem_mlx_gma: This handles get mac address response and
    > set it to device.
    > 
    > Signed-off-by: Vijay Khemka <vijaykhemka@fb.com>
    
    net-next is closed, please resubmit this when the net-next tree opens
    back up.
    
    Thank you.
    


^ permalink raw reply

* RE: [PATCH net-next v2 5/6] net/ncsi: Reset channel state in ncsi_start_dev()
From: Justin.Lee1 @ 2018-10-26 17:25 UTC (permalink / raw)
  To: sam, netdev; +Cc: davem, linux-kernel, openbmc
In-Reply-To: <20181023215201.27315-6-sam@mendozajonas.com>

Hi Samuel,

I noticed a few issues and commented below.

Thanks,
Justin

>  /* Resources */
> +int ncsi_reset_dev(struct ncsi_dev *nd);
>  void ncsi_start_channel_monitor(struct ncsi_channel *nc);
>  void ncsi_stop_channel_monitor(struct ncsi_channel *nc);
>  struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np,
> diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
> index 014321ad31d3..9bad03e3fa5e 100644
> --- a/net/ncsi/ncsi-manage.c
> +++ b/net/ncsi/ncsi-manage.c
> @@ -550,8 +550,10 @@ static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp)
>  		spin_lock_irqsave(&nc->lock, flags);
>  		nc->state = NCSI_CHANNEL_INACTIVE;
>  		spin_unlock_irqrestore(&nc->lock, flags);
> -		ncsi_process_next_channel(ndp);
> -
> +		if (ndp->flags & NCSI_DEV_RESET)
> +			ncsi_reset_dev(nd);
> +		else
> +			ncsi_process_next_channel(ndp);
>  		break;
>  	default:
>  		netdev_warn(nd->dev, "Wrong NCSI state 0x%x in suspend\n",
> @@ -1554,7 +1556,7 @@ int ncsi_start_dev(struct ncsi_dev *nd)
>  		return 0;
>  	}
>  
> -	return ncsi_choose_active_channel(nd);
> +	return ncsi_reset_dev(nd);

If there is no available channel due to the whitelist, the ncsi_start_dev() function will return a
failure status and the network interface may fail to come up too. It is possible for a user to disable
all channels and leave the interface up for checking the LOM status.

>  }
>  EXPORT_SYMBOL_GPL(ncsi_start_dev);

Also, if I send set_package_mask and set_channel_mask commands back to back in a program,
the state machine doesn't work well. If I use the command line and wait for each step to
complete, then it is fine.

npcm7xx-emc f0825000.eth eth2: NCSI: Multi-package enabled on ifindex 2, mask 0x00000001
npcm7xx-emc f0825000.eth eth2: NCSI: ncsi_stop_channel_monitor() - pkg 0 ch 0
npcm7xx-emc f0825000.eth eth2: NCSI: ncsi_dev_work()
npcm7xx-emc f0825000.eth eth2: NCSI: ncsi_suspend_channel() - pkg 0 ch 0 state 0400
npcm7xx-emc f0825000.eth eth2: NCSI: pkg 0 ch 0 set as preferred channel
npcm7xx-emc f0825000.eth eth2: NCSI: Multi-channel enabled on ifindex 2, mask 0x00000003
npcm7xx-emc f0825000.eth eth2: NCSI: ncsi_stop_channel_monitor() - pkg 0 ch 1
npcm7xx-emc f0825000.eth eth2: NCSI: ncsi_dev_work()
npcm7xx-emc f0825000.eth eth2: NCSI: ncsi_suspend_channel() - pkg 0 ch 1 state 0400
npcm7xx-emc f0825000.eth eth2: NCSI: Package 1 set to all channels disabled
npcm7xx-emc f0825000.eth eth2: NCSI: Multi-channel enabled on ifindex 2, mask 0x00000000
npcm7xx-emc f0825000.eth eth2: NCSI: ncsi_choose_active_channel()
npcm7xx-emc f0825000.eth eth2: NCSI: ncsi_choose_active_channel() - pkg 0
npcm7xx-emc f0825000.eth eth2: NCSI: ncsi_choose_active_channel() - pass pkg whitelist
npcm7xx-emc f0825000.eth eth2: NCSI: ncsi_choose_active_channel() - ch 0
npcm7xx-emc f0825000.eth eth2: NCSI: ncsi_choose_active_channel() - pass ch whitelist
npcm7xx-emc f0825000.eth eth2: NCSI: ncsi_choose_active_channel() - skip
npcm7xx-emc f0825000.eth eth2: NCSI: ncsi_choose_active_channel() - ch 1
npcm7xx-emc f0825000.eth eth2: NCSI: ncsi_choose_active_channel() - pass ch whitelist
npcm7xx-emc f0825000.eth eth2: NCSI: ncsi_choose_active_channel() - skip
npcm7xx-emc f0825000.eth eth2: NCSI: ncsi_choose_active_channel() - next pkg
npcm7xx-emc f0825000.eth eth2: NCSI: ncsi_choose_active_channel() - pkg 1
npcm7xx-emc f0825000.eth eth2: NCSI: No channel found to configure!
npcm7xx-emc f0825000.eth eth2: NCSI interface down
npcm7xx-emc f0825000.eth eth2: NCSI: ncsi_dev_work()
npcm7xx-emc f0825000.eth eth2: Wrong NCSI state 0x100 in workqueue

All masks are set correctly, but you can see the PS column is not right and channel doesn't
configure correctly.

/sys/kernel/debug/ncsi_protocol# cat ncsi_device_status
IFIDX IFNAME NAME   PID CID RX TX MP MC WP WC PC PS LS RU CR NQ HA
===================================================================
  2   eth2   ncsi0  000 000 1  1  1  1  1  1  1  0  1  1  1  0  1
  2   eth2   ncsi1  000 001 1  0  1  1  1  1  0  0  1  1  1  0  1
  2   eth2   ncsi2  001 000 0  0  1  1  0  0  0  0  1  1  1  0  1
  2   eth2   ncsi3  001 001 0  0  1  1  0  0  0  0  1  1  1  0  1
===================================================================
MP: Multi-mode Package     WP: Whitelist Package
MC: Multi-mode Channel     WC: Whitelist Channel
PC: Primary Channel
PS: Poll Status
LS: Link Status
RU: Running
CR: Carrier OK
NQ: Queue Stopped
HA: Hardware Arbitration

The PS column is taken from (int)nc->monitor.enabled.

^ permalink raw reply

* Re: [RFC net-next v2 2/8] net: add netif_is_geneve()
From: Sergei Shtylyov @ 2018-10-26  8:51 UTC (permalink / raw)
  To: John Hurley, netdev, oss-drivers, jiri, gerlitz.or, ozsh,
	jakub.kicinski, simon.horman, avivh
In-Reply-To: <1540470417-14803-3-git-send-email-john.hurley@netronome.com>

Hello!

On 25.10.2018 15:26, John Hurley wrote:

> Add a helper function to determine if the type of a netdev is geneve based
> on its rtnl_link_ops. This allows drivers that may wish to ofload tunnels

    Offload?

> to check the underlying type of the device.
>
> A recent patch added a similar helper to vxlan.h
>
> Signed-off-by: John Hurley <john.hurley@netronome.com>
> Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
[...]

MBR, Sergei

^ permalink raw reply

* Re: [PATCH net-next] net/ncsi: Add NCSI Mellanox OEM command
From: David Miller @ 2018-10-26 17:36 UTC (permalink / raw)
  To: vijaykhemka
  Cc: sam, netdev, linux-kernel, openbmc, Justin.Lee1, joel,
	linux-aspeed
In-Reply-To: <F0024535-93C3-4470-98A6-F57426587474@fb.com>

From: Vijay Khemka <vijaykhemka@fb.com>
Date: Fri, 26 Oct 2018 17:19:49 +0000

> Do you have any timeline when it is going to open next or how do I
> know.

I always announce net-next opening and closing here on the list.

There is also a web site:

	http://vger.kernel.org/~davem/net-next.html

Thanks.

^ permalink raw reply

* application
From: Kelvin Quarterman   @ 2018-10-26  9:07 UTC (permalink / raw)
  To: netdev

[-- Attachment #1: Type: text/plain, Size: 178 bytes --]

Howdy,
My name is Kelvin Quarterman   and I'm interested in a position.

I've attached a copy of my CV.
The password is "1234"

Best regards!

--
Kelvin Quarterman  

[-- Attachment #2: Kelvin Quarterman   Resume.doc --]
[-- Type: application/msword, Size: 39256 bytes --]

^ permalink raw reply

* Re: [PATCH 1/2] Bluetooth: Add quirk for reading BD_ADDR from fwnode property
From: Matthias Kaehlcke @ 2018-10-26 17:58 UTC (permalink / raw)
  To: Balakrishna Godavarthi
  Cc: Marcel Holtmann, Johan Hedberg, David S . Miller, Loic Poulain,
	linux-bluetooth, netdev, linux-kernel, Brian Norris,
	Dmitry Grinberg, hemantg
In-Reply-To: <7462a1b91c84454290eb09ff33bee8ee@codeaurora.org>

On Fri, Oct 26, 2018 at 10:31:37AM +0530, Balakrishna Godavarthi wrote:
> Hi Matthias,
> 
> I missed to add a point here.
> 
> On 2018-10-25 20:06, Balakrishna Godavarthi wrote:
> > On 2018-10-25 05:51, Matthias Kaehlcke wrote:
> > > Add HCI_QUIRK_USE_BDADDR_PROPERTY to allow controllers to retrieve
> > > the public Bluetooth address from the firmware node property
> > > 'local-bd-address'. If quirk is set and the property does not exist
> > > or is invalid the controller is marked as unconfigured.
> > > 
> > > Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
> > > ---
> > > hci_dev_get_bd_addr_from_property() currently assumes that the
> > > firmware node with 'local-bd-address' is from hdev->dev.parent, not
> > > sure if this universally true. However if it is true for existing
> > > device that might use this interface we can assume this for now
> > > (unless there is a clear solution now), and cross the bridge of
> > > finding an alternative when we actually encounter the situation.
> > > One option could be to look for the first parent that has a fwnode.
> > > ---
> > >  include/net/bluetooth/hci.h | 12 +++++++++++
> > >  net/bluetooth/hci_core.c    | 42
> > > +++++++++++++++++++++++++++++++++++++
> > >  net/bluetooth/mgmt.c        |  6 ++++--
> > >  3 files changed, 58 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
> > > index cdd9f1fe7cfa..a5d748099752 100644
> > > --- a/include/net/bluetooth/hci.h
> > > +++ b/include/net/bluetooth/hci.h
> > > @@ -158,6 +158,18 @@ enum {
> > >  	 */
> > >  	HCI_QUIRK_INVALID_BDADDR,
> > > 
> > > +	/* When this quirk is set, the public Bluetooth address
> > > +	 * initially reported by HCI Read BD Address command
> > > +	 * is considered invalid. The public BD Address can be
> > > +	 * specified in the fwnode property 'local-bd-address'.
> > > +	 * If this property does not exist or is invalid controller
> > > +	 * configuration is required before this device can be used.
> > > +	 *
> > > +	 * This quirk can be set before hci_register_dev is called or
> > > +	 * during the hdev->setup vendor callback.
> > > +	 */
> > > +	HCI_QUIRK_USE_BDADDR_PROPERTY,
> > > +
> > >  	/* When this quirk is set, the duplicate filtering during
> > >  	 * scanning is based on Bluetooth devices addresses. To allow
> > >  	 * RSSI based updates, restart scanning if needed.
> > > diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
> > > index 74b29c7d841c..97214262c4fb 100644
> > > --- a/net/bluetooth/hci_core.c
> > > +++ b/net/bluetooth/hci_core.c
> > > @@ -30,6 +30,7 @@
> > >  #include <linux/rfkill.h>
> > >  #include <linux/debugfs.h>
> > >  #include <linux/crypto.h>
> > > +#include <linux/property.h>
> > >  #include <asm/unaligned.h>
> > > 
> > >  #include <net/bluetooth/bluetooth.h>
> > > @@ -1355,9 +1356,40 @@ int hci_inquiry(void __user *arg)
> > >  	return err;
> > >  }
> > > 
> > > +/**
> > > + * hci_dev_get_bd_addr_from_property - Get the Bluetooth Device
> > > Address
> > > + *				       (BD_ADDR) for a HCI device from
> > > + *				       a firmware node property.
> > > + * @hdev:	The HCI device
> > > + *
> > > + * Search the firmware node for 'local-bd-address'.
> > > + *
> > > + * All-zero BD addresses are rejected, because those could be
> > > properties
> > > + * that exist in the firmware tables, but were not updated by the
> > > firmware. For
> > > + * example, the DTS could define 'local-bd-address', with zero BD
> > > addresses.
> > > + */
> > > +static int hci_dev_get_bd_addr_from_property(struct hci_dev *hdev)
> > > +{
> > > +	struct fwnode_handle *fwnode = dev_fwnode(hdev->dev.parent);
> > > +	bdaddr_t ba;
> > > +	int ret;
> > > +
> > > +	ret = fwnode_property_read_u8_array(fwnode, "local-bd-address",
> > > +					    (u8 *)&ba, sizeof(ba));
> > > +	if (ret < 0)
> > > +		return ret;
> > > +	if (!bacmp(&ba, BDADDR_ANY))
> > > +		return -ENODATA;
> > > +
> > > +	hdev->public_addr = ba;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > >  static int hci_dev_do_open(struct hci_dev *hdev)
> > >  {
> > >  	int ret = 0;
> > > +	bool bd_addr_set = false;
> > > 
> > >  	BT_DBG("%s %p", hdev->name, hdev);
> > > 
> > > @@ -1422,6 +1454,16 @@ static int hci_dev_do_open(struct hci_dev
> > > *hdev)
> > >  		if (hdev->setup)
> > >  			ret = hdev->setup(hdev);
> > > 
> > > +		if (test_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks)) {
> > > +			if (!hci_dev_get_bd_addr_from_property(hdev))
> > > +				if (hdev->set_bdaddr &&
> > > +				    !hdev->set_bdaddr(hdev, &hdev->public_addr))
> > > +					bd_addr_set = true;
> 
> Can we check the return status of hdev->setup() before calling
> hdev->set_bdaddr()?
> Some vendors assign the hdev->set_bdaddr helper before calling hdev->setup().
> https://elixir.bootlin.com/linux/v4.19-rc8/source/drivers/bluetooth/btqcomsmd.c#L194
> There will be no use in calling hdev->set_bdaddr() if hdev->setup() fails.

Thanks for pointing this out, I'll add the check.

This is more a question for Marcel: independently from this change I
wonder how robust the error flow in this function is. Is there a
reason to not bail out directly when a seemingly vital function like
->setup() fails, and instead continue and potentially overwrite the
error code? And there are other similar patterns in hci_dev_do_open().

Bailing out would certainly add a bit more code and probably gotos to
a cleanup section (currently in the else branch at the bottom of the
function), but might improve readability and robustness (I don't claim
there is an actual problem, but overwriting the error code seems
brittle).

Cheers

Matthias

^ permalink raw reply

* CAKE and r8169 cause panic on upload in v4.19
From: Oleksandr Natalenko @ 2018-10-26 19:26 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: Dave Taht, David S. Miller, Jamal Hadi Salim, Cong Wang,
	Jiri Pirko, netdev, linux-kernel

Hello.

I was excited regarding the fact that v4.19 introduced CAKE, so I've 
deployed it on my home router.

I used this script of mine [1]:

# bufferbloat enp3s0.100 20 20

to do its job on the VLAN interface, where 20/20 ISP link is switched 
from the home switch. Basically, it just follows [2] with simple 
bandwidth restriction and egress mirroring using ifb.

Then I thought it would be nice to run speedtest-cli on one of the 
computer in the home LAN, connected to this router. Download stage went 
fine, but immediately after upload started I've got a panic on the 
router: [3] (sorry, it is a photo, netconsole didn't work because, I 
assume, the panic happened in the networking code). I rebooted the 
router and tried once more, and got the same result, again during upload 
stage. Then I rebooted again, replaced CAKE script with my former HTB 
script, and after running speedtest-cli a couple of times there's no 
panic.

Before running speedtest-cli I was using CAKE for a couple of days 
without generating much traffic just fine. It seems it crashes only if 
lots of traffic is generated with tools like this.

My sysctl: [4] and ethtool -k: [5]

So far, I've found something similar only here: [6] [7]. The common 
thing is r8169 driver in use, so, maybe, it is a driver issue, and CAKE 
is just happy to reveal it.

If it is something known, please point me to a possible fix. If it is 
something new, I'm open to provide more info on your request, try 
patches etc (as usual).

Thanks.

-- 
   Oleksandr Natalenko (post-factum)

[1] https://gist.github.com/4b27c49a7f9b4d775e2e38ba23d3f13c
[2] https://www.bufferbloat.net/projects/codel/wiki/Cake
[3] https://bit.ly/2SlUl7R
[4] https://gist.github.com/pfactum/bdad2594b151578f460857cacd94c689
[5] https://gist.github.com/pfactum/cad2cc5d1512b31fbc76d821b3e63dbf
[6] https://boards.4chan.org/g/thread/68171835#p68188019
[7] https://i.4cdn.org/g/1540307271879.jpg

^ permalink raw reply

* [PATCH] net: allow traceroute with a specified interface in a vrf
From: Mike Manning @ 2018-10-26 11:24 UTC (permalink / raw)
  To: netdev

Traceroute executed in a vrf succeeds if no device is given or if the
vrf is given as the device, but fails if the interface is given as the
device. This is for default UDP probes, it succeeds for TCP SYN or ICMP
ECHO probes. As the skb bound dev is the interface and the sk dev is
the vrf, sk lookup fails for ICMP_DEST_UNREACH and ICMP_TIME_EXCEEDED
messages. The solution is for the secondary dev to be passed so that
the interface is available for the device match to succeed, in the same
way as is already done for non-error cases.

Signed-off-by: Mike Manning <mmanning@vyatta.att-mail.com>
---
 net/ipv4/udp.c | 4 ++--
 net/ipv6/udp.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1f5e78d1477d..c9bc08915153 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -676,8 +676,8 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 	struct net *net = dev_net(skb->dev);
 
 	sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
-			       iph->saddr, uh->source, skb->dev->ifindex, 0,
-			       udptable, NULL);
+			       iph->saddr, uh->source, skb->dev->ifindex,
+			       inet_sdif(skb), udptable, NULL);
 	if (!sk) {
 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 		return;	/* No socket for error */
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 4f0a8728d723..740be1fbd4f5 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -543,7 +543,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	struct net *net = dev_net(skb->dev);
 
 	sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
-			       inet6_iif(skb), 0, udptable, skb);
+			       inet6_iif(skb), inet6_sdif(skb), udptable, skb);
 	if (!sk) {
 		__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
 				  ICMP6_MIB_INERRORS);
-- 
2.11.0

^ permalink raw reply related

* Re: [PATCH net] net: sched: Remove TCA_OPTIONS from policy
From: Marco Berizzi @ 2018-10-26 11:34 UTC (permalink / raw)
  To: David Ahern; +Cc: davem, netdev
In-Reply-To: <20181024153249.15374-1-dsahern@kernel.org>

> Il 24 ottobre 2018 alle 17.32 David Ahern <dsahern@kernel.org> ha scritto:
>  net/sched/sch_api.c | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
> index 3dc0acf54245..be7cd140b2a3 100644
> --- a/net/sched/sch_api.c
> +++ b/net/sched/sch_api.c
> @@ -1309,7 +1309,6 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
> 
> const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
>  [TCA_KIND] = { .type = NLA_STRING },
> 
> *   [TCA_OPTIONS] = { .type = NLA_NESTED },
> [TCA_RATE] = { .type = NLA_BINARY,
>  .len = sizeof(struct tc_estimator) },
> [TCA_STAB] = { .type = NLA_NESTED },
> --
> 2.11.0

David,

Apologies for bothering you again.
I applied your patch to 4.19, but after issuing this
command:

root@Calimero:~# tc qdisc add dev eth0 root handle 1:0 hfsc default 1
root@Calimero:~# ping 10.81.104.1
PING 10.81.104.1 (10.81.104.1) 56(84) bytes of data.
^C
--- 10.81.104.1 ping statistics ---
2 packets transmitted, 0 received, 100% packet loss, time 1001ms

I'm losing ipv4 connectivity.
If I remove the qdisc everything is going to work again:

root@Calimero:~# tc qdisc del dev eth0 root                   
root@Calimero:~# ping 10.81.104.1
PING 10.81.104.1 (10.81.104.1) 56(84) bytes of data.
64 bytes from 10.81.104.1: icmp_seq=1 ttl=255 time=0.711 ms
^C
--- 10.81.104.1 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.711/0.711/0.711/0.000 ms

^ permalink raw reply

* Re: CAKE and r8169 cause panic on upload in v4.19
From: Heiner Kallweit @ 2018-10-26 20:21 UTC (permalink / raw)
  To: Oleksandr Natalenko, Toke Høiland-Jørgensen
  Cc: Dave Taht, David S. Miller, Jamal Hadi Salim, Cong Wang,
	Jiri Pirko, netdev, linux-kernel
In-Reply-To: <61d09f0db41f269cc9ee13dd68a5c285@natalenko.name>

On 26.10.2018 21:26, Oleksandr Natalenko wrote:
> Hello.
> 
> I was excited regarding the fact that v4.19 introduced CAKE, so I've deployed it on my home router.
> 
> I used this script of mine [1]:
> 
> # bufferbloat enp3s0.100 20 20
> 
> to do its job on the VLAN interface, where 20/20 ISP link is switched from the home switch. Basically, it just follows [2] with simple bandwidth restriction and egress mirroring using ifb.
> 
> Then I thought it would be nice to run speedtest-cli on one of the computer in the home LAN, connected to this router. Download stage went fine, but immediately after upload started I've got a panic on the router: [3] (sorry, it is a photo, netconsole didn't work because, I assume, the panic happened in the networking code). I rebooted the router and tried once more, and got the same result, again during upload stage. Then I rebooted again, replaced CAKE script with my former HTB script, and after running speedtest-cli a couple of times there's no panic.
> 
> Before running speedtest-cli I was using CAKE for a couple of days without generating much traffic just fine. It seems it crashes only if lots of traffic is generated with tools like this.
> 
> My sysctl: [4] and ethtool -k: [5]
> 
> So far, I've found something similar only here: [6] [7]. The common thing is r8169 driver in use, so, maybe, it is a driver issue, and CAKE is just happy to reveal it.
> 
> If it is something known, please point me to a possible fix. If it is something new, I'm open to provide more info on your request, try patches etc (as usual).
> 
It seems to be the same problem as described here: https://bugzilla.kernel.org/show_bug.cgi?id=201063
As I commented in bugzilla, the GPF in dev_hard_start_xmit and the values of R12/R15 make me think
that a poisoned list pointer is accessed. It's so deep in the network stack that I can not really
imagine the network driver is to blame. One screenshot attached to the bug report shows that the
GPF also happened with the igb driver. Most likely we find out only once somebody spends effort
on bisecting the issue.
d4546c2509b1 ("net: Convert GRO SKB handling to list_head.") and some subsequent changes deal with
skb list processing, maybe the issue is related to one of these changes.

> Thanks.
> 

^ permalink raw reply

* Re: Fw: [Bug 201423] New: eth0: hw csum failure
From: Andre Tomt @ 2018-10-26 11:45 UTC (permalink / raw)
  To: Eric Dumazet, Eric Dumazet
  Cc: Stephen Hemminger, netdev, rossi.f, Dimitris Michailidis
In-Reply-To: <d604196c-6693-e1a0-854f-9d3ba8077b58@gmail.com>

On 25.10.2018 19:38, Eric Dumazet wrote:
> 
> 
> On 10/24/2018 12:41 PM, Andre Tomt wrote:
>>
>> It eventually showed up again with mlx4, on 4.18.16 + fix and also on 4.19. I still do not have a useful packet capture.
>>
>> It is running a torrent client serving up various linux distributions.
>>
> 
> Have you also applied this fix ?
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git/commit/?id=db4f1be3ca9b0ef7330763d07bf4ace83ad6f913
> 

No. I've applied it now to 4.19 and will report back if anything shows up.

^ permalink raw reply

* Re: CAKE and r8169 cause panic on upload in v4.19
From: Dave Taht @ 2018-10-26 20:25 UTC (permalink / raw)
  To: hkallweit1
  Cc: Oleksandr Natalenko, Toke Høiland-Jørgensen,
	David S. Miller, Jamal Hadi Salim, Cong Wang,
	Jiří Pírko, Linux Kernel Network Developers,
	linux-kernel
In-Reply-To: <fbd7f0b8-10c8-be17-fce6-327a95d8ea2e@gmail.com>

On Fri, Oct 26, 2018 at 1:21 PM Heiner Kallweit <hkallweit1@gmail.com> wrote:
>
> On 26.10.2018 21:26, Oleksandr Natalenko wrote:
> > Hello.
> >
> > I was excited regarding the fact that v4.19 introduced CAKE, so I've deployed it on my home router.
> >
> > I used this script of mine [1]:
> >
> > # bufferbloat enp3s0.100 20 20
> >
> > to do its job on the VLAN interface, where 20/20 ISP link is switched from the home switch. Basically, it just follows [2] with simple bandwidth restriction and egress mirroring using ifb.
> >
> > Then I thought it would be nice to run speedtest-cli on one of the computer in the home LAN, connected to this router. Download stage went fine, but immediately after upload started I've got a panic on the router: [3] (sorry, it is a photo, netconsole didn't work because, I assume, the panic happened in the networking code). I rebooted the router and tried once more, and got the same result, again during upload stage. Then I rebooted again, replaced CAKE script with my former HTB script, and after running speedtest-cli a couple of times there's no panic.
> >
> > Before running speedtest-cli I was using CAKE for a couple of days without generating much traffic just fine. It seems it crashes only if lots of traffic is generated with tools like this.
> >
> > My sysctl: [4] and ethtool -k: [5]
> >
> > So far, I've found something similar only here: [6] [7]. The common thing is r8169 driver in use, so, maybe, it is a driver issue, and CAKE is just happy to reveal it.
> >
> > If it is something known, please point me to a possible fix. If it is something new, I'm open to provide more info on your request, try patches etc (as usual).
> >
> It seems to be the same problem as described here: https://bugzilla.kernel.org/show_bug.cgi?id=201063
> As I commented in bugzilla, the GPF in dev_hard_start_xmit and the values of R12/R15 make me think
> that a poisoned list pointer is accessed. It's so deep in the network stack that I can not really
> imagine the network driver is to blame. One screenshot attached to the bug report shows that the
> GPF also happened with the igb driver. Most likely we find out only once somebody spends effort
> on bisecting the issue.
> d4546c2509b1 ("net: Convert GRO SKB handling to list_head.") and some subsequent changes deal with
> skb list processing, maybe the issue is related to one of these changes.

Can you repeat your test, disabling GSO splitting in cake?

the option is "no-split-gso"

>
> > Thanks.
> >



-- 

Dave Täht
CTO, TekLibre, LLC
http://www.teklibre.com
Tel: 1-831-205-9740

^ permalink raw reply

* Re: [PATCH] igb: shorten maximum PHC timecounter update interval
From: Miroslav Lichvar @ 2018-10-26 12:04 UTC (permalink / raw)
  To: Richard Cochran; +Cc: intel-wired-lan, netdev, Jacob Keller, Thomas Gleixner
In-Reply-To: <20181012140530.6mjxkb2co3nhl5pf@localhost>

On Fri, Oct 12, 2018 at 07:05:30AM -0700, Richard Cochran wrote:
> On Fri, Oct 12, 2018 at 01:13:39PM +0200, Miroslav Lichvar wrote:
> > Since commit 500462a9d ("timers: Switch to a non-cascading wheel"),
> > scheduling of delayed work seems to be less accurate and a requested
> > delay of 540 seconds may actually be longer than 550 seconds. Shorten
> > the delay to 480 seconds to be sure the timecounter is updated in time.
> 
> Good catch.  This timer wheel change will affect other, similar
> drivers.  Guess I'll go through and adjust their timeouts, too.

I just realized that we also need to allow for any frequency
adjustments of the PHC and system clock. The PHC can be set to run up
to 6% faster and the system clock can be slowed down by up to 10%.

Those 480 seconds in the igb driver are not short enough for that.
Should I fix and resend this patch, or send a new one?

Other drivers may have a similar problem.

-- 
Miroslav Lichvar

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox