Netdev List
 help / color / mirror / Atom feed
* [PATCH 1/3] bonding: replace the return value type
From: Tonghao Zhang @ 2018-05-11  9:53 UTC (permalink / raw)
  To: netdev; +Cc: Tonghao Zhang

The method ndo_start_xmit is defined as returning a
netdev_tx_t, which is a typedef for an enum type,
but the implementation in this driver returns an int.

Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
---
 drivers/net/bonding/bond_alb.c  |  8 ++++----
 drivers/net/bonding/bond_main.c | 12 ++++++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 1ed9529..601f678 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1316,8 +1316,8 @@ void bond_alb_deinitialize(struct bonding *bond)
 		rlb_deinitialize(bond);
 }
 
-static int bond_do_alb_xmit(struct sk_buff *skb, struct bonding *bond,
-			    struct slave *tx_slave)
+static netdev_tx_t bond_do_alb_xmit(struct sk_buff *skb, struct bonding *bond,
+				    struct slave *tx_slave)
 {
 	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
 	struct ethhdr *eth_data = eth_hdr(skb);
@@ -1351,7 +1351,7 @@ static int bond_do_alb_xmit(struct sk_buff *skb, struct bonding *bond,
 	return NETDEV_TX_OK;
 }
 
-int bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
+netdev_tx_t bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct ethhdr *eth_data;
@@ -1389,7 +1389,7 @@ int bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 	return bond_do_alb_xmit(skb, bond, tx_slave);
 }
 
-int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
+netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct ethhdr *eth_data;
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 718e491..01bca86 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3805,7 +3805,8 @@ static u32 bond_rr_gen_slave_id(struct bonding *bond)
 	return slave_id;
 }
 
-static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
+static netdev_tx_t bond_xmit_roundrobin(struct sk_buff *skb,
+					struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct iphdr *iph = ip_hdr(skb);
@@ -3841,7 +3842,8 @@ static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev
 /* In active-backup mode, we know that bond->curr_active_slave is always valid if
  * the bond has a usable interface.
  */
-static int bond_xmit_activebackup(struct sk_buff *skb, struct net_device *bond_dev)
+static netdev_tx_t bond_xmit_activebackup(struct sk_buff *skb,
+					  struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct slave *slave;
@@ -3979,7 +3981,8 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave)
  * usable slave array is formed in the control path. The xmit function
  * just calculates hash and sends the packet out.
  */
-static int bond_3ad_xor_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t bond_3ad_xor_xmit(struct sk_buff *skb,
+				     struct net_device *dev)
 {
 	struct bonding *bond = netdev_priv(dev);
 	struct slave *slave;
@@ -3999,7 +4002,8 @@ static int bond_3ad_xor_xmit(struct sk_buff *skb, struct net_device *dev)
 }
 
 /* in broadcast mode, we send everything to all usable interfaces. */
-static int bond_xmit_broadcast(struct sk_buff *skb, struct net_device *bond_dev)
+static netdev_tx_t bond_xmit_broadcast(struct sk_buff *skb,
+				       struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct slave *slave = NULL;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH 1/3] bonding: replace the return value type
From: Tonghao Zhang @ 2018-05-11  9:52 UTC (permalink / raw)
  To: netdev; +Cc: Tonghao Zhang

The method ndo_start_xmit is defined as returning a
netdev_tx_t, which is a typedef for an enum type,
but the implementation in this driver returns an int.

Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
---
 drivers/net/bonding/bond_alb.c  |  8 ++++----
 drivers/net/bonding/bond_main.c | 12 ++++++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 1ed9529..601f678 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1316,8 +1316,8 @@ void bond_alb_deinitialize(struct bonding *bond)
 		rlb_deinitialize(bond);
 }
 
-static int bond_do_alb_xmit(struct sk_buff *skb, struct bonding *bond,
-			    struct slave *tx_slave)
+static netdev_tx_t bond_do_alb_xmit(struct sk_buff *skb, struct bonding *bond,
+				    struct slave *tx_slave)
 {
 	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
 	struct ethhdr *eth_data = eth_hdr(skb);
@@ -1351,7 +1351,7 @@ static int bond_do_alb_xmit(struct sk_buff *skb, struct bonding *bond,
 	return NETDEV_TX_OK;
 }
 
-int bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
+netdev_tx_t bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct ethhdr *eth_data;
@@ -1389,7 +1389,7 @@ int bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 	return bond_do_alb_xmit(skb, bond, tx_slave);
 }
 
-int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
+netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct ethhdr *eth_data;
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 718e491..01bca86 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3805,7 +3805,8 @@ static u32 bond_rr_gen_slave_id(struct bonding *bond)
 	return slave_id;
 }
 
-static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
+static netdev_tx_t bond_xmit_roundrobin(struct sk_buff *skb,
+					struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct iphdr *iph = ip_hdr(skb);
@@ -3841,7 +3842,8 @@ static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev
 /* In active-backup mode, we know that bond->curr_active_slave is always valid if
  * the bond has a usable interface.
  */
-static int bond_xmit_activebackup(struct sk_buff *skb, struct net_device *bond_dev)
+static netdev_tx_t bond_xmit_activebackup(struct sk_buff *skb,
+					  struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct slave *slave;
@@ -3979,7 +3981,8 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave)
  * usable slave array is formed in the control path. The xmit function
  * just calculates hash and sends the packet out.
  */
-static int bond_3ad_xor_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t bond_3ad_xor_xmit(struct sk_buff *skb,
+				     struct net_device *dev)
 {
 	struct bonding *bond = netdev_priv(dev);
 	struct slave *slave;
@@ -3999,7 +4002,8 @@ static int bond_3ad_xor_xmit(struct sk_buff *skb, struct net_device *dev)
 }
 
 /* in broadcast mode, we send everything to all usable interfaces. */
-static int bond_xmit_broadcast(struct sk_buff *skb, struct net_device *bond_dev)
+static netdev_tx_t bond_xmit_broadcast(struct sk_buff *skb,
+				       struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct slave *slave = NULL;
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH net] macmace: Set platform device coherent_dma_mask
From: Michael Schmitz @ 2018-05-11  9:30 UTC (permalink / raw)
  To: Finn Thain, Geert Uytterhoeven
  Cc: David S. Miller, linux-m68k, netdev, Linux Kernel Mailing List,
	Christoph Hellwig
In-Reply-To: <alpine.LNX.2.21.1805111451220.8@nippy.intranet>

Hi Finn,

Am 11.05.2018 um 17:28 schrieb Finn Thain:
> On Fri, 11 May 2018, Michael Schmitz wrote:
> 
>>
>> I'm afraid using platform_device_register() (which you already use for 
>> the SCC devices) is the only option handling this on a per-device basis 
>> without touching platform core code, while at the same time keeping the 
>> DMA mask setup out of device drivers
> 
> I don't think that will fly. If you call platform_device_register() and 
> follow that with a dma mask assignment, you could race with the bus 
> matching and driver probe, and we are back to the same WARNING message.

I wasn't proposing to follow platform_device_register() with the mask
assignment, but rather to use the same strategy from the Coldfire FEC
patch (f61e64310b75): set pdev.dev.coherent_dma_mask and
pdev.dev.dma_mask _before_ calling platform_device_register().

> If you want to use platform_device_register(), you'd have to implement 
> arch_setup_pdev_archdata() and use that to set up the dma mask.

If you want to avoid the overhead of defining the macmace platform
device and using platform_device_register() ... seeing as you would not
be defining just the DMA mask and nothing else, that's probably the
least effort for the MACE and SONIC drivers.

>> (I can see Geert's point there - device driver code might be shared 
>> across implementations of the device on platforms with different DMA 
>> mask requirements,, something the driver can't be expected to know 
>> about).
> 
> As I said, these drivers might be expected to be portable between Macs and 
> early PowerMacs, but the same dma mask would apply AFAIK.
> 
> If a platform driver isn't expected to be portable, I think either method 
> is reasonable: arch_setup_pdev_archdata() or the method in the patch.
> 
> Anyway, there is this in arch/powerpc/kernel/setup-common.c:
> 
> void arch_setup_pdev_archdata(struct platform_device *pdev)
> {
>         pdev->archdata.dma_mask = DMA_BIT_MASK(32);
>         pdev->dev.dma_mask = &pdev->archdata.dma_mask;
> 	...

You would have to be careful not to overwrite a pdev->dev.dma_mask and
pdev->dev.dma_coherent_mask that might have been set in a platform
device passed via platform_device_register here. Coldfire is the only
m68k platform currently using that, but there might be others in future.

> I'm inclined to propose something similar for m68k. That should fix the 
> problem, since arch_setup_pdev_archdata() is already in the call chain:
> 
> 	platform_device_register_simple()
> 		platform_device_register_resndata()
> 			platform_device_register_full()
> 				platform_device_alloc()
> 					arch_setup_pdev_archdata()
> 
> Thoughts? Will this have nasty side effects for m68k platforms that use 
> smaller dma masks?

These can still set a smaller mask in pdev->dev directly and use
platform_device_register() instead. But I don't think there are smaller
DMA masks used by m68k drivers that use the platform device mechanism at
present. I've only looked at arch/m68k though.

Cheers,

	Michael

^ permalink raw reply

* Re: net: hang in unregister_netdevice: waiting for lo to become free
From: Dmitry Vyukov @ 2018-05-11  9:19 UTC (permalink / raw)
  To: Dan Streetman
  Cc: Tommi Rantala, Neil Horman, Xin Long, David Ahern,
	Daniel Borkmann, Cong Wang, David Miller, Eric Dumazet,
	Willem de Bruijn, Jakub Kicinski, Rasmus Villemoes, netdev, LKML,
	Alexey Kuznetsov, Hideaki YOSHIFUJI, syzkaller, Dan Streetman,
	Eric W. Biederman, Alexey Kodanev
In-Reply-To: <CALZtONBohVvUKcEa1K78KpcebP4JYEHNS7JFNpdtYuC+1ZJKiw@mail.gmail.com>

On Thu, May 10, 2018 at 12:23 PM, Dan Streetman <ddstreet@ieee.org> wrote:
>>>>>>> <tommi.t.rantala@nokia.com> wrote:
>>>>>>>> On 20.02.2018 18:26, Neil Horman wrote:
>>>>>>>>>
>>>>>>>>> On Tue, Feb 20, 2018 at 09:14:41AM +0100, Dmitry Vyukov wrote:
>>>>>>>>>>
>>>>>>>>>> On Tue, Feb 20, 2018 at 8:56 AM, Tommi Rantala
>>>>>>>>>> <tommi.t.rantala@nokia.com> wrote:
>>>>>>>>>>>
>>>>>>>>>>> On 19.02.2018 20:59, Dmitry Vyukov wrote:
>>>>>>>>>>>>
>>>>>>>>>>>> Is this meant to be fixed already? I am still seeing this on the
>>>>>>>>>>>> latest upstream tree.
>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> These two commits are in v4.16-rc1:
>>>>>>>>>>>
>>>>>>>>>>> commit 4a31a6b19f9ddf498c81f5c9b089742b7472a6f8
>>>>>>>>>>> Author: Tommi Rantala <tommi.t.rantala@nokia.com>
>>>>>>>>>>> Date:   Mon Feb 5 21:48:14 2018 +0200
>>>>>>>>>>>
>>>>>>>>>>>      sctp: fix dst refcnt leak in sctp_v4_get_dst
>>>>>>>>>>> ...
>>>>>>>>>>>      Fixes: 410f03831 ("sctp: add routing output fallback")
>>>>>>>>>>>      Fixes: 0ca50d12f ("sctp: fix src address selection if using
>>>>>>>>>>> secondary
>>>>>>>>>>> addresses")
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> commit 957d761cf91cdbb175ad7d8f5472336a4d54dbf2
>>>>>>>>>>> Author: Alexey Kodanev <alexey.kodanev@oracle.com>
>>>>>>>>>>> Date:   Mon Feb 5 15:10:35 2018 +0300
>>>>>>>>>>>
>>>>>>>>>>>      sctp: fix dst refcnt leak in sctp_v6_get_dst()
>>>>>>>>>>> ...
>>>>>>>>>>>      Fixes: dbc2b5e9a09e ("sctp: fix src address selection if using
>>>>>>>>>>> secondary
>>>>>>>>>>> addresses for ipv6")
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> I guess we missed something if it's still reproducible.
>>>>>>>>>>>
>>>>>>>>>>> I can check it later this week, unless someone else beat me to it.
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Hi Tommi,
>>>>>>>>>>
>>>>>>>>>> Hmmm, I can't claim that it's exactly the same bug. Perhaps it's
>>>>>>>>>> another one then. But I am still seeing these:
>>>>>>>>>>
>>>>>>>>>> [   58.799130] unregister_netdevice: waiting for lo to become free.
>>>>>>>>>> Usage count = 4
>>>>>>>>>> [   60.847138] unregister_netdevice: waiting for lo to become free.
>>>>>>>>>> Usage count = 4
>>>>>>>>>> [   62.895093] unregister_netdevice: waiting for lo to become free.
>>>>>>>>>> Usage count = 4
>>>>>>>>>> [   64.943103] unregister_netdevice: waiting for lo to become free.
>>>>>>>>>> Usage count = 4
>>>>>>>>>>
>>>>>>>>>> on upstream tree pulled ~12 hours ago.
>>>>>>>>>>
>>>>>>>>> Can you write a systemtap script to probe dev_hold, and dev_put, printing
>>>>>>>>> out a
>>>>>>>>> backtrace if the device name matches "lo".  That should tell us
>>>>>>>>> definitively if
>>>>>>>>> the problem is in the same location or not
>>>>>>>>
>>>>>>>>
>>>>>>>> Hi Dmitry, I tested with the reproducer and the kernel .config file that you
>>>>>>>> sent in the first email in this thread:
>>>>>>>>
>>>>>>>> With 4.16-rc2 unable to reproduce.
>>>>>>>>
>>>>>>>> With 4.15-rc9 bug reproducible, and I get "unregister_netdevice: waiting for
>>>>>>>> lo to become free. Usage count = 3"
>>>>>>>>
>>>>>>>> With 4.15-rc9 and Alexey's "sctp: fix dst refcnt leak in sctp_v6_get_dst()"
>>>>>>>> cherry-picked on top, unable to reproduce.
>>>>>>>>
>>>>>>>>
>>>>>>>> Is syzkaller doing something else now to trigger the bug...?
>>>>>>>> Can you still trigger the bug with the same reproducer?
>>>>>>>
>>>>>>> Hi Neil, Tommi,
>>>>>>>
>>>>>>> Reviving this old thread about "unregister_netdevice: waiting for lo
>>>>>>> to become free. Usage count = 3" hangs.
>>>>>>> I still did not have time to deep dive into what happens there (too
>>>>>>> many bugs coming from syzbot). But this still actively happens and I
>>>>>>> suspect accounts to a significant portion of various hang reports,
>>>>>>> which are quite unpleasant.
>>>>>>>
>>>>>>> One idea that could make it all simpler:
>>>>>>>
>>>>>>> Is this wait loop in netdev_wait_allrefs() supposed to wait for any
>>>>>>> prolonged periods of time under any non-buggy conditions? E.g. more
>>>>>>> than 1-2 minutes?
>>>>>>> If it only supposed to wait briefly for things that already supposed
>>>>>>> to be shutting down, and we add a WARNING there after some timeout,
>>>>>>> then syzbot will report all info how/when it happens, hopefully
>>>>>>> extracting reproducers, and all the nice things.
>>>>>>> But this WARNING should not have any false positives under any
>>>>>>> realistic conditions (e.g. waiting for arrival of remote packets with
>>>>>>> large timeouts).
>>>>>>>
>>>>>>> Looking at some task hung reports, it seems that this code holds some
>>>>>>> mutexes, takes workqueue thread and prevents any progress with
>>>>>>> destruction of other devices (and net namespace creation/destruction),
>>>>>>> so I guess it should not wait for any indefinite periods of time?
>>>>>>
>>>>>> I'm working on this currently:
>>>>>> https://bugs.launchpad.net/ubuntu/zesty/+source/linux/+bug/1711407
>>>>>>
>>>>>> I added a summary of what I've found to be the cause (or at least, one
>>>>>> possible cause) of this:
>>>>>> https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407/comments/72
>>>>>>
>>>>>> I'm working on a patch to work around the main side-effect of this,
>>>>>> which is hanging while holding the global net mutex.  Hangs will still
>>>>>> happen (e.g. if a dst leaks) but should not affect anything else,
>>>>>> other than a leak of the dst and its net namespace.
>>>>>>
>>>>>> Fixing the dst leaks is important too, of course, but a dst leak (or
>>>>>> other cause) shouldn't break the entire system.
>>>>>
>>>>> Leaking some memory is definitely better than hanging the system.
>>>>>
>>>>> So I've made syzkaller to recognize "unregister_netdevice: waiting for
>>>>> (.*) to become free" as a kernel bug:
>>>>> https://github.com/google/syzkaller/commit/7a67784ca8bdc3b26cce2f0ec9a40d2dd9ec9396
>>>>> Unfortunately it does not make it catch these bugs because creating a
>>>>> net namespace per test is too damn slow, so namespaces are reused for
>>>>> lots of tests and when/if it's eventually destroyed it's already too
>>>>> late to find root cause.
>>>>>
>>>>> But I've run a one-off experiment with prompt net namespace
>>>>> destruction and syzkaller was able to easily extract a C reproducer:
>>>>> https://gist.githubusercontent.com/dvyukov/d571e8fff24e127ca48a8c4790d42bfa/raw/52050e93ba9afbb5126b9d7bb39b7e71a82af016/gistfile1.txt
>>>>>
>>>>> On upstream 16e205cf42da1f497b10a4a24f563e6c0d574eec with this config:
>>>>> https://gist.githubusercontent.com/dvyukov/9663c57443adb21f2795b92ef0829d62/raw/bbea0652e23746096dd56855a28f6c681aebcdee/gistfile1.txt
>>>>>
>>>>> this gives me:
>>>>>
>>>>> [   83.183198] unregister_netdevice: waiting for lo to become free.
>>>>> Usage count = 9
>>>>> [   85.231202] unregister_netdevice: waiting for lo to become free.
>>>>> Usage count = 9
>>>>> ...
>>>>> [  523.511205] unregister_netdevice: waiting for lo to become free.
>>>>> Usage count = 9
>>>>> ...
>>>>>
>>>>> This is generated from this syzkaller program:
>>>>>
>>>>> r0 = socket$inet6(0xa, 0x1, 0x84)
>>>>> setsockopt$inet6_IPV6_XFRM_POLICY(r0, 0x29, 0x23,
>>>>> &(0x7f0000000380)={{{@in6=@remote={0xfe, 0x80, [], 0xbb},
>>>>> @in=@dev={0xac, 0x14, 0x14}, 0x0, 0x0, 0x0, 0x0, 0xa}, {}, {}, 0x0,
>>>>> 0x0, 0x1}, {{@in=@local={0xac, 0x14, 0x14, 0xaa}, 0x0, 0x32}, 0x0,
>>>>> @in=@local={0xac, 0x14, 0x14, 0xaa}, 0x3504}}, 0xe8)
>>>>> bind$inet6(r0, &(0x7f0000000000)={0xa, 0x4e20}, 0x1c)
>>>>> connect$inet(r0, &(0x7f0000000040)={0x2, 0x4e20, @dev={0xac, 0x14,
>>>>> 0x14, 0xd}}, 0x10)
>>>>> syz_emit_ethernet(0x3e, &(0x7f00000001c0)={@local={[0xaa, 0xaa, 0xaa,
>>>>> 0xaa, 0xaa], 0xaa}, @dev={[0xaa, 0xaa, 0xaa, 0xaa, 0xaa]}, [],
>>>>> {@ipv6={0x86dd, {0x0, 0x6, "50a09c", 0x8, 0xffffff11, 0x0,
>>>>> @remote={0xfe, 0x80, [], 0xbb}, @local={0xfe, 0x80, [], 0xaa}, {[],
>>>>> @udp={0x0, 0x4e20, 0x8}}}}}}, &(0x7f0000000040))
>>>>>
>>>>> So this seems to be related to IPv6 and/or xfrm and is potentially
>>>>> caused by external packets (that syz_emit_ethernet call).
>>>>
>>>>
>>>>
>>>> Here is another repro which seems to be a different bug (note that it
>>>> requires fault injection):
>>>>
>>>> https://gist.githubusercontent.com/dvyukov/1c56623016cc4c24a69d433c5114ad5b/raw/530478f571b195193101b912aa646948528baa8e/gistfile1.txt
>>>>
>>>> Dan, do you mind taking a look at them? Fixing these should eliminate
>>>> root causes of these hangs/leaks.
>>>
>>> Yep I will look at them, thanks for the reproducers.
>>
>> Hi Dan,
>>
>> Any updates on this? syzbot is hitting this all the time.
>
> Sorry, the recent changes from net_mutex -> net_rwsem/pernet_ops_rwsem
> have complicated what I had done to workaround this, but I'm still
> working on it.  Apologies for the delay.

Are you looking at the mitigation? Or the bugs that trigger it? Or both?

^ permalink raw reply

* Re: [PATCH net-next 01/10] net: stmmac: Let descriptor code set skbuff address
From: Jose Abreu @ 2018-05-11  9:09 UTC (permalink / raw)
  To: David Miller, Jose.Abreu
  Cc: netdev, Joao.Pinto, Vitor.Soares, peppe.cavallaro,
	alexandre.torgue
In-Reply-To: <20180510.150613.1334467102643804220.davem@davemloft.net>

On 10-05-2018 20:06, David Miller wrote:
> From: Jose Abreu <Jose.Abreu@synopsys.com>
> Date: Tue,  8 May 2018 15:45:24 +0100
>
>> Stop using if conditions depending on the GMAC version for setting the
>> the descriptor skbuff address and use instead a helper implemented in
>> the descriptor files.
>>
>> Signed-off-by: Jose Abreu <joabreu@synopsys.com>
> With Spectre mitigations, indirect calls are extremely expensive.  Much
> more expensive than conditional checks.
>
> And since this is the descriptor setup in the fast paths of the driver,

Hmm, no. This is only done in HW setup. The only patch in this
series that can impact performance because of what you mentioned
is 09.

> I advise that you keep these conditionals.

I see your point but I'm trying to make the code more flexible
because we will add support for XGMAC soon. This XGMAC will need
a similar structure to what we have in GMAC.

By removing this conditionals we make stmmac_main totally
agnostic of HW and we will prevent patterns like this in the future:

if (priv->has_xgmac)
    do_stuff()
else if (priv->synopsys_id > GMAC_SOME_VERSION)
    do_other_stuff()
else
    do_other_other_stuff()

Thanks and Best Regards,
Jose Miguel Abreu

^ permalink raw reply

* Re: [PATCH] [PATCH net v2] ipv6: remove min MTU check for ipsec tunnels
From: Sergei Shtylyov @ 2018-05-11  9:05 UTC (permalink / raw)
  To: Ashwanth Goli, netdev, davem; +Cc: pabeni, dsahern, eric.dumazet
In-Reply-To: <a7572ee6-17d1-801d-bf9a-0eca7757fb00@cogentembedded.com>

On 5/11/2018 12:02 PM, Sergei Shtylyov wrote:

>> With 749439bfac "fix udpv6 sendmsg crash caused by too small MTU"
> 
>     When you cite a comnmit, you must specify at least 12-digit SHA1 and 
> enclose the summary in (""), not just "".
> 
>> ipsec tunnels that report a MTU less than IPV6_MIN_MTU are broken
>> even for packets that are smaller than IPV6_MIN_MTU.
>>
>> According to rfc2473#section-7.1
>>
>>      if the original IPv6 packet is equal or  smaller  than  the
>>      IPv6 minimum link MTU, the tunnel entry-point node
>>      encapsulates the original packet, and subsequently
>>      fragments the resulting IPv6 tunnel packet into IPv6
>>      fragments that do not exceed the Path MTU to the tunnel
>>      exit-point.
>>
>> Dropping the MTU check for ipsec tunnel destinations.
>>
>> Signed-off-by: Ashwanth Goli <ashwanth@codeaurora.org>
> [...]

    And you need to specify the patch version change log after the --- tear line.

MBR, Sergei

^ permalink raw reply

* Re: [PATCH] [PATCH net v2] ipv6: remove min MTU check for ipsec tunnels
From: Sergei Shtylyov @ 2018-05-11  9:02 UTC (permalink / raw)
  To: Ashwanth Goli, netdev, davem; +Cc: pabeni, dsahern, eric.dumazet
In-Reply-To: <1525972764-30904-1-git-send-email-ashwanth@codeaurora.org>

Hello!

On 5/10/2018 8:19 PM, Ashwanth Goli wrote:

> With 749439bfac "fix udpv6 sendmsg crash caused by too small MTU"

    When you cite a comnmit, you must specify at least 12-digit SHA1 and 
enclose the summary in (""), not just "".

> ipsec tunnels that report a MTU less than IPV6_MIN_MTU are broken
> even for packets that are smaller than IPV6_MIN_MTU.
> 
> According to rfc2473#section-7.1
> 
>      if the original IPv6 packet is equal or  smaller  than  the
>      IPv6 minimum link MTU, the tunnel entry-point node
>      encapsulates the original packet, and subsequently
>      fragments the resulting IPv6 tunnel packet into IPv6
>      fragments that do not exceed the Path MTU to the tunnel
>      exit-point.
> 
> Dropping the MTU check for ipsec tunnel destinations.
> 
> Signed-off-by: Ashwanth Goli <ashwanth@codeaurora.org>
[...]

MBR, Sergei

^ permalink raw reply

* [PATCH net-next 2/2] mlxsw: spectrum_span: Use a more fitting error code
From: Ido Schimmel @ 2018-05-11  8:57 UTC (permalink / raw)
  To: netdev; +Cc: davem, jiri, petrm, mlxsw, Ido Schimmel
In-Reply-To: <20180511085731.9256-1-idosch@mellanox.com>

From: Petr Machata <petrm@mellanox.com>

ENOENT is suitable when an item is looked for in a collection and can't
be found. The failure here is actually a depletion of a resource, where
ENOBUFS is the more fitting error code.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
index adf1f789b166..e5f4f7620ab7 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
@@ -924,7 +924,7 @@ int mlxsw_sp_span_mirror_add(struct mlxsw_sp_port *from,
 
 	span_entry = mlxsw_sp_span_entry_get(mlxsw_sp, to_dev, ops, sparms);
 	if (!span_entry)
-		return -ENOENT;
+		return -ENOBUFS;
 
 	netdev_dbg(from->dev, "Adding inspected port to SPAN entry %d\n",
 		   span_entry->id);
-- 
2.14.3

^ permalink raw reply related

* [PATCH net-next 1/2] mlxsw: spectrum_span: Rename misnamed variable l3edev
From: Ido Schimmel @ 2018-05-11  8:57 UTC (permalink / raw)
  To: netdev; +Cc: davem, jiri, petrm, mlxsw, Ido Schimmel
In-Reply-To: <20180511085731.9256-1-idosch@mellanox.com>

From: Petr Machata <petrm@mellanox.com>

Calling the variable l3edev was relevant when neighbor lookup was the
last stage in the simulated pipeline. Now that mlxsw handles bridges and
vlan devices as well, calling it "L3" is a misnomer.

Thus in mlxsw_sp_span_dmac(), rename to "dev", because that function is
just a service routine where the distinction between tunnel and egress
device isn't necessary.

In mlxsw_sp_span_entry_tunnel_parms_common(), rename to "edev" to
emphasize that the routine traces packet egress.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_span.c    | 32 +++++++++++-----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
index 3b77990df599..adf1f789b166 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
@@ -137,14 +137,14 @@ struct mlxsw_sp_span_entry_ops mlxsw_sp_span_entry_ops_phys = {
 
 static int mlxsw_sp_span_dmac(struct neigh_table *tbl,
 			      const void *pkey,
-			      struct net_device *l3edev,
+			      struct net_device *dev,
 			      unsigned char dmac[ETH_ALEN])
 {
-	struct neighbour *neigh = neigh_lookup(tbl, pkey, l3edev);
+	struct neighbour *neigh = neigh_lookup(tbl, pkey, dev);
 	int err = 0;
 
 	if (!neigh) {
-		neigh = neigh_create(tbl, pkey, l3edev);
+		neigh = neigh_create(tbl, pkey, dev);
 		if (IS_ERR(neigh))
 			return PTR_ERR(neigh);
 	}
@@ -246,7 +246,7 @@ mlxsw_sp_span_entry_vlan(const struct net_device *vlan_dev,
 }
 
 static __maybe_unused int
-mlxsw_sp_span_entry_tunnel_parms_common(struct net_device *l3edev,
+mlxsw_sp_span_entry_tunnel_parms_common(struct net_device *edev,
 					union mlxsw_sp_l3addr saddr,
 					union mlxsw_sp_l3addr daddr,
 					union mlxsw_sp_l3addr gw,
@@ -260,31 +260,31 @@ mlxsw_sp_span_entry_tunnel_parms_common(struct net_device *l3edev,
 	if (mlxsw_sp_l3addr_is_zero(gw))
 		gw = daddr;
 
-	if (!l3edev || mlxsw_sp_span_dmac(tbl, &gw, l3edev, dmac))
+	if (!edev || mlxsw_sp_span_dmac(tbl, &gw, edev, dmac))
 		goto unoffloadable;
 
-	if (is_vlan_dev(l3edev))
-		l3edev = mlxsw_sp_span_entry_vlan(l3edev, &vid);
+	if (is_vlan_dev(edev))
+		edev = mlxsw_sp_span_entry_vlan(edev, &vid);
 
-	if (netif_is_bridge_master(l3edev)) {
-		l3edev = mlxsw_sp_span_entry_bridge(l3edev, dmac, &vid);
-		if (!l3edev)
+	if (netif_is_bridge_master(edev)) {
+		edev = mlxsw_sp_span_entry_bridge(edev, dmac, &vid);
+		if (!edev)
 			goto unoffloadable;
 	}
 
-	if (is_vlan_dev(l3edev)) {
-		if (vid || !(l3edev->flags & IFF_UP))
+	if (is_vlan_dev(edev)) {
+		if (vid || !(edev->flags & IFF_UP))
 			goto unoffloadable;
-		l3edev = mlxsw_sp_span_entry_vlan(l3edev, &vid);
+		edev = mlxsw_sp_span_entry_vlan(edev, &vid);
 	}
 
-	if (!mlxsw_sp_port_dev_check(l3edev))
+	if (!mlxsw_sp_port_dev_check(edev))
 		goto unoffloadable;
 
-	sparmsp->dest_port = netdev_priv(l3edev);
+	sparmsp->dest_port = netdev_priv(edev);
 	sparmsp->ttl = ttl;
 	memcpy(sparmsp->dmac, dmac, ETH_ALEN);
-	memcpy(sparmsp->smac, l3edev->dev_addr, ETH_ALEN);
+	memcpy(sparmsp->smac, edev->dev_addr, ETH_ALEN);
 	sparmsp->saddr = saddr;
 	sparmsp->daddr = daddr;
 	sparmsp->vid = vid;
-- 
2.14.3

^ permalink raw reply related

* [PATCH net-next 0/2] mlxsw: spectrum_span: Two minor adjustments
From: Ido Schimmel @ 2018-05-11  8:57 UTC (permalink / raw)
  To: netdev; +Cc: davem, jiri, petrm, mlxsw, Ido Schimmel

Petr says:

This patch set fixes a couple of nits in mlxsw's SPAN implementation:
two counts of inaccurate variable name and one count of unsuitable error
code, fixed, respectively, in patches #1 and #2.

Petr Machata (2):
  mlxsw: spectrum_span: Rename misnamed variable l3edev
  mlxsw: spectrum_span: Use a more fitting error code

 .../net/ethernet/mellanox/mlxsw/spectrum_span.c    | 34 +++++++++++-----------
 1 file changed, 17 insertions(+), 17 deletions(-)

-- 
2.14.3

^ permalink raw reply

* Re: [PATCH] dt-bindings: net: ravb: Add support for r8a77990 SoC
From: Sergei Shtylyov @ 2018-05-11  8:42 UTC (permalink / raw)
  To: Yoshihiro Shimoda, netdev, linux-renesas-soc, robh+dt,
	mark.rutland
  Cc: devicetree
In-Reply-To: <1526008736-26496-1-git-send-email-yoshihiro.shimoda.uh@renesas.com>

On 5/11/2018 6:18 AM, Yoshihiro Shimoda wrote:

> Add documentation for r8a77990 compatible string to renesas ravb device
> tree bindings documentation.
> 
> Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
[...]

Acked-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>

MBR, Sergei

^ permalink raw reply

* Re: [RFC] net: Add new LoRaWAN subsystem
From: Jiri Pirko @ 2018-05-11  8:16 UTC (permalink / raw)
  To: Jian-Hong Pan
  Cc: David S. Miller, Alexander Aring, Stefan Schmidt, linux-wpan - ML,
	netdev, linux-kernel
In-Reply-To: <CAC=mGzijVyqEG=DXH4v9WkD0kXR2WOJC4KBPzoT7g5wqVrPGXA@mail.gmail.com>

Tue, May 08, 2018 at 05:33:01PM CEST, starnight@g.ncu.edu.tw wrote:
>A Low-Power Wide-Area Network (LPWAN) is a type of wireless
>telecommunication wide area network designed to allow long range
>communications at a low bit rate among things (connected objects), such
>as sensors operated on a battery.  It can be used widely in IoT area.
>LoRaWAN, which is one kind of implementation of LPWAN, is a medium
>access control (MAC) layer protocol for managing communication between
>LPWAN gateways and end-node devices, maintained by the LoRa Alliance.
>LoRaWAN™ Specification could be downloaded at:
>https://lora-alliance.org/lorawan-for-developers
>
>However, LoRaWAN is not implemented in Linux kernel right now, so I am
>trying to develop it.  Here is my repository:
>https://github.com/starnight/LoRa/tree/lorawan-ndo/LoRaWAN

Link to some out-of-tree module is not enough.
If you want anyone to look at this and comment, you need to base your
work on top of kernel git (net-next for example) and send a patch/patchset.


>
>Because it is a kind of network, the ideal usage in an user space
>program should be like "socket(PF_LORAWAN, SOCK_DGRAM, 0)" and with
>other socket APIs.  Therefore, the definitions like AF_LORAWAN,
>PF_LORAWAN ..., must be listed in the header files of glibc.
>For the driver in kernel space, the definitions also must be listed in
>the corresponding Linux socket header files.
>Especially, both are for the testing programs.
>
>Back to the mentioned "LoRaWAN is not implemented in Linux kernel now".
>Could or should we add the definitions into corresponding kernel header
>files now, if LoRaWAN will be accepted as a subsystem in Linux?
>
>Thanks,
>Jian-Hong Pan

^ permalink raw reply

* Re: KASAN: null-ptr-deref Read in rds_ib_get_mr
From: Yanjun Zhu @ 2018-05-11  7:48 UTC (permalink / raw)
  To: DaeRyong Jeong, santosh.shilimkar, davem
  Cc: netdev, linux-rdma, rds-devel, linux-kernel, byoungyoung, kt0755
In-Reply-To: <20180511052056.GA10547@dragonet.kaist.ac.kr>



On 2018/5/11 13:20, DaeRyong Jeong wrote:
> We report the crash: KASAN: null-ptr-deref Read in rds_ib_get_mr
>
> Note that this bug is previously reported by syzkaller.
> https://syzkaller.appspot.com/bug?id=0bb56a5a48b000b52aa2b0d8dd20b1f545214d91
> Nonetheless, this bug has not fixed yet, and we hope that this report and our
> analysis, which gets help by the RaceFuzzer's feature, will helpful to fix the
> crash.
>
> This crash has been found in v4.17-rc1 using RaceFuzzer (a modified
> version of Syzkaller), which we describe more at the end of this
> report. Our analysis shows that the race occurs when invoking two
> syscalls concurrently, bind$rds and setsockopt$RDS_GET_MR.
>
>
> Analysis:
> We think the concurrent execution of __rds_rdma_map() and rds_bind()
> causes the problem. __rds_rdma_map() checks whether rs->rs_bound_addr is 0
> or not. But the concurrent execution with rds_bind() can by-pass this
> check. Therefore, __rds_rdmap_map() calls rs->rs_transport->get_mr() and
> rds_ib_get_mr() causes the null deref at ib_rdma.c:544 in v4.17-rc1, when
> dereferencing rs_conn.
>
>
> Thread interleaving:
> CPU0 (__rds_rdma_map)					CPU1 (rds_bind)
> 							// rds_add_bound() sets rs->bound_addr as none 0
> 							ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port);
> if (rs->rs_bound_addr == 0 || !rs->rs_transport) {
> 	ret = -ENOTCONN; /* XXX not a great errno */
> 	goto out;
> }
> 							if (rs->rs_transport) { /* previously bound */
> 								trans = rs->rs_transport;
> 								if (trans->laddr_check(sock_net(sock->sk),
> 										       sin->sin_addr.s_addr) != 0) {
> 									ret = -ENOPROTOOPT;
> 									// rds_remove_bound() sets rs->bound_addr as 0
> 									rds_remove_bound(rs);
> ...
> trans_private = rs->rs_transport->get_mr(sg, nents, rs,
> 					 &mr->r_key);
> (in rds_ib_get_mr())
> struct rds_ib_connection *ic = rs->rs_conn->c_transport_data;
>
>
> Call sequence (v4.17-rc1):
> CPU0
> rds_setsockopt
> 	rds_get_mr
> 		__rds_rdma_map
> 			rds_ib_get_mr
>
>
> CPU1
> rds_bind
> 	rds_add_bound
> 	...
> 	rds_remove_bound
>
>
> Crash log:
> ==================================================================
> BUG: KASAN: null-ptr-deref in rds_ib_get_mr+0x3a/0x150 net/rds/ib_rdma.c:544
> Read of size 8 at addr 0000000000000068 by task syz-executor0/32067
>
> CPU: 0 PID: 32067 Comm: syz-executor0 Not tainted 4.17.0-rc1 #1
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014
> Call Trace:
>   __dump_stack lib/dump_stack.c:77 [inline]
>   dump_stack+0x166/0x21c lib/dump_stack.c:113
>   kasan_report_error mm/kasan/report.c:352 [inline]
>   kasan_report+0x140/0x360 mm/kasan/report.c:412
>   check_memory_region_inline mm/kasan/kasan.c:260 [inline]
>   __asan_load8+0x54/0x90 mm/kasan/kasan.c:699
>   rds_ib_get_mr+0x3a/0x150 net/rds/ib_rdma.c:544
>   __rds_rdma_map+0x521/0x9d0 net/rds/rdma.c:271
>   rds_get_mr+0xad/0xf0 net/rds/rdma.c:333
>   rds_setsockopt+0x57f/0x720 net/rds/af_rds.c:347
>   __sys_setsockopt+0x147/0x230 net/socket.c:1903
>   __do_sys_setsockopt net/socket.c:1914 [inline]
>   __se_sys_setsockopt net/socket.c:1911 [inline]
>   __x64_sys_setsockopt+0x67/0x80 net/socket.c:1911
>   do_syscall_64+0x15f/0x4a0 arch/x86/entry/common.c:287
>   entry_SYSCALL_64_after_hwframe+0x49/0xbe
> RIP: 0033:0x4563f9
> RSP: 002b:00007f6a2b3c2b28 EFLAGS: 00000246 ORIG_RAX: 0000000000000036
> RAX: ffffffffffffffda RBX: 000000000072bee0 RCX: 00000000004563f9
> RDX: 0000000000000002 RSI: 0000000000000114 RDI: 0000000000000015
> RBP: 0000000000000575 R08: 0000000000000020 R09: 0000000000000000
> R10: 0000000020000140 R11: 0000000000000246 R12: 00007f6a2b3c36d4
> R13: 00000000ffffffff R14: 00000000006fd398 R15: 0000000000000000
> ==================================================================
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index e678699..2228b50 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -539,11 +539,17 @@ void rds_ib_flush_mrs(void)
  void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
                     struct rds_sock *rs, u32 *key_ret)
  {
-       struct rds_ib_device *rds_ibdev;
+       struct rds_ib_device *rds_ibdev = NULL;
         struct rds_ib_mr *ibmr = NULL;
-       struct rds_ib_connection *ic = rs->rs_conn->c_transport_data;
+       struct rds_ib_connection *ic = NULL;
         int ret;

+       if (rs->rs_bound_addr == 0) {
+               ret = -EPERM;
+               goto out;
+       }
+
+       ic = rs->rs_conn->c_transport_data;
         rds_ibdev = rds_ib_get_device(rs->rs_bound_addr);
         if (!rds_ibdev) {
                 ret = -ENODEV;

I made this raw patch. If you can reproduce this bug, please make tests 
with it.

Thanks a lot.
>
> = About RaceFuzzer
>
> RaceFuzzer is a customized version of Syzkaller, specifically tailored
> to find race condition bugs in the Linux kernel. While we leverage
> many different technique, the notable feature of RaceFuzzer is in
> leveraging a custom hypervisor (QEMU/KVM) to interleave the
> scheduling. In particular, we modified the hypervisor to intentionally
> stall a per-core execution, which is similar to supporting per-core
> breakpoint functionality. This allows RaceFuzzer to force the kernel
> to deterministically trigger racy condition (which may rarely happen
> in practice due to randomness in scheduling).
>
> RaceFuzzer's C repro always pinpoints two racy syscalls. Since C
> repro's scheduling synchronization should be performed at the user
> space, its reproducibility is limited (reproduction may take from 1
> second to 10 minutes (or even more), depending on a bug). This is
> because, while RaceFuzzer precisely interleaves the scheduling at the
> kernel's instruction level when finding this bug, C repro cannot fully
> utilize such a feature. Please disregard all code related to
> "should_hypercall" in the C repro, as this is only for our debugging
> purposes using our own hypervisor.
>

^ permalink raw reply related

* [PATCH v2 net] stmmac: strip vlan tag on reception only for 8021q tagged frames
From: Elad Nachman @ 2018-05-11  7:31 UTC (permalink / raw)
  To: Toshiaki Makita, davem, netdev; +Cc: eladv6
In-Reply-To: <f56f7073-46ac-9f9e-f8c0-deaa23ab3aa9@lab.ntt.co.jp>

stmmac reception handler calls stmmac_rx_vlan() to strip the vlan before calling napi_gro_receive().

The function assumes VLAN tagged frames are always tagged with 802.1Q protocol,
and assigns ETH_P_8021Q to the skb by hard-coding the parameter on call to __vlan_hwaccel_put_tag() without checking the actual VLAN tag.

This causes packets not to be passed to the VLAN slave if it was created with 802.1AD protocol
(ip link add link eth0 eth0.100 type vlan proto 802.1ad id 100).

This fix only strips the VLAN tag for 802.1Q tagged protocols. 802.1AD frames will be handled later by skb_vlan_untag() .

Signed-off-by: Elad Nachman <eladn@gilat.com>

---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index b65e2d1..b230ab5 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3293,17 +3293,21 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 
 static void stmmac_rx_vlan(struct net_device *dev, struct sk_buff *skb)
 {
-	struct ethhdr *ehdr;
+	struct vlan_ethhdr *veth;
 	u16 vlanid;
+	__be16 vlan_proto;
 
 	if ((dev->features & NETIF_F_HW_VLAN_CTAG_RX) ==
 	    NETIF_F_HW_VLAN_CTAG_RX &&
 	    !__vlan_get_tag(skb, &vlanid)) {
-		/* pop the vlan tag */
-		ehdr = (struct ethhdr *)skb->data;
-		memmove(skb->data + VLAN_HLEN, ehdr, ETH_ALEN * 2);
-		skb_pull(skb, VLAN_HLEN);
-		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlanid);
+		veth = (struct vlan_ethhdr *)skb->data;
+		vlan_proto = veth->h_vlan_proto;
+		if (likely(vlan_proto == htons(ETH_P_8021Q))) {
+			/* pop the vlan tag */
+			memmove(skb->data + VLAN_HLEN, veth, ETH_ALEN * 2);
+			skb_pull(skb, VLAN_HLEN);
+			__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlanid);
+		}
 	}
 }
 
-- 
2.7.4

^ permalink raw reply related

* Re: [bpf-next v3 8/9] bpf: Provide helper to do forwarding lookups in kernel FIB table
From: David Ahern @ 2018-05-11  6:30 UTC (permalink / raw)
  To: Mathieu Xhonneux
  Cc: netdev, borkmann, ast, David Miller, shm, roopa, brouer, toke,
	john.fastabend
In-Reply-To: <CAKSCvkTQPFZUBfNVfdJYj3PakTMxCwEwoWJWdVfnofzjFtG=3g@mail.gmail.com>

On 5/10/18 12:27 PM, Mathieu Xhonneux wrote:
> I'm quite interested in this helper to implement OAM features (through
> other hooks, e.g. the BPF LWT hook). Do you have an idea about how it
> behaves with ECMP routes (with IPv4 and/or IPv6) ? In IPv6, I'm
> guessing that the returned gateway address is always a link-local
> address ?

ECMP works with BPF and forwarding just like it does with the full stack
-- the traffic should be distributed based on the L3 or L4 hash
algorithm and the number of links.

As for the IPv6 gateway question, it returns the gateway of the route in
the FIB -- ie., link local or global address. No assumptions are
necessary or made for the BPF helper.

^ permalink raw reply

* Re:Re: [PATCH net] net: Correct wrong skb_flow_limit check when enable RPS
From: Gao Feng @ 2018-05-11  6:20 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: davem@davemloft.net, daniel@iogearbox.net,
	jakub.kicinski@netronome.com, David Ahern, netdev@vger.kernel.org
In-Reply-To: <CAF=yD-LVQ_hpQaN9tZ_UmJ3YYqipAaHBLhEsusaOXYJiXfcCrw@mail.gmail.com>

At 2018-05-11 11:54:55, "Willem de Bruijn" <willemdebruijn.kernel@gmail.com> wrote:
>On Thu, May 10, 2018 at 4:28 AM,  <gfree.wind@vip.163.com> wrote:
>> From: Gao Feng <gfree.wind@vip.163.com>
>>
>> The skb flow limit is implemented for each CPU independently. In the
>> current codes, the function skb_flow_limit gets the softnet_data by
>> this_cpu_ptr. But the target cpu of enqueue_to_backlog would be not
>> the current cpu when enable RPS. As the result, the skb_flow_limit checks
>> the stats of current CPU, while the skb is going to append the queue of
>> another CPU. It isn't the expected behavior.
>>
>> Now pass the softnet_data as a param to softnet_data to make consistent.
>
>The local cpu softnet_data is used on purpose. The operations in
>skb_flow_limit() on sd fields could race if not executed on the local cpu.

I think the race doesn't exist because of the rps_lock.
The enqueue_to_backlog has hold the rps_lock before skb_flow_limit.

>
>Flow limit tries to detect large ("elephant") DoS flows with a fixed four-tuple.
>These would always hit the same RPS cpu, so that cpu being backlogged

They may hit the different target CPU when enable RFS. Because the app could be scheduled
to another CPU, then RFS tries to deliver the skb to latest core which has hot cache.

>may be an indication that such a flow is active. But the flow will also always
>arrive on the same initial cpu courtesy of RSS. So storing the lookup table

The RSS couldn't make sure the irq is handled by same cpu. It would be balanced between
the cpus.

>on the initial CPU is also fine. There may be false positives on other CPUs
>with the same RPS destination, but that is unlikely with a highly concurrent
>traffic server mix ("mice").

If my comment is right, the flow couldn't always arrive one the same initial cpu,  although
it may be sent to one same target cpu.

>
>Note that the sysctl net.core.flow_limit_cpu_bitmap enables the feature
>for the cpus on which traffic initially lands, not the RPS destination cpus.
>See also Documentation/networking/scaling.txt
>
>That said, I had to reread the code, as it does seem sensible that the
>same softnet_data is intended to be used both when testing qlen and
>flow_limit.

In most cases, user configures the same RPS map with flow_limit like 0xff.
Because user couldn't predict which core the evil flow would arrive on.

Take an example, there are 2 cores, cpu0 and cpu1. 
One flow is the an evil flow, but the irq is sent to cpu0. After RPS/RFS, the target cpu is cpu1.
Now cpu0 invokes enqueue_to_backlog, then the skb_flow_limit checkes the queue length
of cpu0. Certainly it could pass the check of skb_flow_limit because there is no any evil flow on cpu0.
Then the cpu0 inserts the skb into the queue of cpu1.
As a result, the skb_flow_limit doesn't work as expected.

BTW, I have already sent the v2 patch which only adds the "Fixes: tag".

Best Regards
Feng



^ permalink raw reply

* Re: [PATCH net] macmace: Set platform device coherent_dma_mask
From: Finn Thain @ 2018-05-11  5:28 UTC (permalink / raw)
  To: Michael Schmitz, Geert Uytterhoeven
  Cc: David S. Miller, linux-m68k, netdev, Linux Kernel Mailing List,
	Christoph Hellwig
In-Reply-To: <0b13a08d-ba2d-b9dc-4fd9-1f6bad5cd1ee@gmail.com>

On Fri, 11 May 2018, Michael Schmitz wrote:

> 
> I'm afraid using platform_device_register() (which you already use for 
> the SCC devices) is the only option handling this on a per-device basis 
> without touching platform core code, while at the same time keeping the 
> DMA mask setup out of device drivers

I don't think that will fly. If you call platform_device_register() and 
follow that with a dma mask assignment, you could race with the bus 
matching and driver probe, and we are back to the same WARNING message.

If you want to use platform_device_register(), you'd have to implement 
arch_setup_pdev_archdata() and use that to set up the dma mask.

> (I can see Geert's point there - device driver code might be shared 
> across implementations of the device on platforms with different DMA 
> mask requirements,, something the driver can't be expected to know 
> about).

As I said, these drivers might be expected to be portable between Macs and 
early PowerMacs, but the same dma mask would apply AFAIK.

If a platform driver isn't expected to be portable, I think either method 
is reasonable: arch_setup_pdev_archdata() or the method in the patch.

Anyway, there is this in arch/powerpc/kernel/setup-common.c:

void arch_setup_pdev_archdata(struct platform_device *pdev)
{
        pdev->archdata.dma_mask = DMA_BIT_MASK(32);
        pdev->dev.dma_mask = &pdev->archdata.dma_mask;
	...

I'm inclined to propose something similar for m68k. That should fix the 
problem, since arch_setup_pdev_archdata() is already in the call chain:

	platform_device_register_simple()
		platform_device_register_resndata()
			platform_device_register_full()
				platform_device_alloc()
					arch_setup_pdev_archdata()

Thoughts? Will this have nasty side effects for m68k platforms that use 
smaller dma masks?

-- 

> 
> Cheers,
> 
> 	Michael
> 

^ permalink raw reply

* KASAN: null-ptr-deref Read in rds_ib_get_mr
From: DaeRyong Jeong @ 2018-05-11  5:20 UTC (permalink / raw)
  To: santosh.shilimkar, davem
  Cc: netdev, linux-rdma, rds-devel, linux-kernel, byoungyoung, kt0755

We report the crash: KASAN: null-ptr-deref Read in rds_ib_get_mr

Note that this bug is previously reported by syzkaller.
https://syzkaller.appspot.com/bug?id=0bb56a5a48b000b52aa2b0d8dd20b1f545214d91
Nonetheless, this bug has not fixed yet, and we hope that this report and our
analysis, which gets help by the RaceFuzzer's feature, will helpful to fix the
crash.

This crash has been found in v4.17-rc1 using RaceFuzzer (a modified
version of Syzkaller), which we describe more at the end of this
report. Our analysis shows that the race occurs when invoking two
syscalls concurrently, bind$rds and setsockopt$RDS_GET_MR.


Analysis:
We think the concurrent execution of __rds_rdma_map() and rds_bind()
causes the problem. __rds_rdma_map() checks whether rs->rs_bound_addr is 0
or not. But the concurrent execution with rds_bind() can by-pass this
check. Therefore, __rds_rdmap_map() calls rs->rs_transport->get_mr() and
rds_ib_get_mr() causes the null deref at ib_rdma.c:544 in v4.17-rc1, when
dereferencing rs_conn.


Thread interleaving:
CPU0 (__rds_rdma_map)					CPU1 (rds_bind)
							// rds_add_bound() sets rs->bound_addr as none 0
							ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port);
if (rs->rs_bound_addr == 0 || !rs->rs_transport) {
	ret = -ENOTCONN; /* XXX not a great errno */
	goto out;
}
							if (rs->rs_transport) { /* previously bound */
								trans = rs->rs_transport;
								if (trans->laddr_check(sock_net(sock->sk),
										       sin->sin_addr.s_addr) != 0) {
									ret = -ENOPROTOOPT;
									// rds_remove_bound() sets rs->bound_addr as 0
									rds_remove_bound(rs);
...
trans_private = rs->rs_transport->get_mr(sg, nents, rs,
					 &mr->r_key);
(in rds_ib_get_mr())
struct rds_ib_connection *ic = rs->rs_conn->c_transport_data;


Call sequence (v4.17-rc1):
CPU0
rds_setsockopt
	rds_get_mr
		__rds_rdma_map
			rds_ib_get_mr


CPU1
rds_bind
	rds_add_bound
	...
	rds_remove_bound


Crash log:
==================================================================
BUG: KASAN: null-ptr-deref in rds_ib_get_mr+0x3a/0x150 net/rds/ib_rdma.c:544
Read of size 8 at addr 0000000000000068 by task syz-executor0/32067

CPU: 0 PID: 32067 Comm: syz-executor0 Not tainted 4.17.0-rc1 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x166/0x21c lib/dump_stack.c:113
 kasan_report_error mm/kasan/report.c:352 [inline]
 kasan_report+0x140/0x360 mm/kasan/report.c:412
 check_memory_region_inline mm/kasan/kasan.c:260 [inline]
 __asan_load8+0x54/0x90 mm/kasan/kasan.c:699
 rds_ib_get_mr+0x3a/0x150 net/rds/ib_rdma.c:544
 __rds_rdma_map+0x521/0x9d0 net/rds/rdma.c:271
 rds_get_mr+0xad/0xf0 net/rds/rdma.c:333
 rds_setsockopt+0x57f/0x720 net/rds/af_rds.c:347
 __sys_setsockopt+0x147/0x230 net/socket.c:1903
 __do_sys_setsockopt net/socket.c:1914 [inline]
 __se_sys_setsockopt net/socket.c:1911 [inline]
 __x64_sys_setsockopt+0x67/0x80 net/socket.c:1911
 do_syscall_64+0x15f/0x4a0 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x4563f9
RSP: 002b:00007f6a2b3c2b28 EFLAGS: 00000246 ORIG_RAX: 0000000000000036
RAX: ffffffffffffffda RBX: 000000000072bee0 RCX: 00000000004563f9
RDX: 0000000000000002 RSI: 0000000000000114 RDI: 0000000000000015
RBP: 0000000000000575 R08: 0000000000000020 R09: 0000000000000000
R10: 0000000020000140 R11: 0000000000000246 R12: 00007f6a2b3c36d4
R13: 00000000ffffffff R14: 00000000006fd398 R15: 0000000000000000
==================================================================


= About RaceFuzzer

RaceFuzzer is a customized version of Syzkaller, specifically tailored
to find race condition bugs in the Linux kernel. While we leverage
many different technique, the notable feature of RaceFuzzer is in
leveraging a custom hypervisor (QEMU/KVM) to interleave the
scheduling. In particular, we modified the hypervisor to intentionally
stall a per-core execution, which is similar to supporting per-core
breakpoint functionality. This allows RaceFuzzer to force the kernel
to deterministically trigger racy condition (which may rarely happen
in practice due to randomness in scheduling).

RaceFuzzer's C repro always pinpoints two racy syscalls. Since C
repro's scheduling synchronization should be performed at the user
space, its reproducibility is limited (reproduction may take from 1
second to 10 minutes (or even more), depending on a bug). This is
because, while RaceFuzzer precisely interleaves the scheduling at the
kernel's instruction level when finding this bug, C repro cannot fully
utilize such a feature. Please disregard all code related to
"should_hypercall" in the C repro, as this is only for our debugging
purposes using our own hypervisor.

^ permalink raw reply

* Re: [RFC bpf-next 07/11] bpf: Add helper to retrieve socket in BPF
From: Martin KaFai Lau @ 2018-05-11  5:00 UTC (permalink / raw)
  To: Joe Stringer; +Cc: daniel, netdev, ast, john.fastabend
In-Reply-To: <20180509210709.7201-8-joe@wand.net.nz>

On Wed, May 09, 2018 at 02:07:05PM -0700, Joe Stringer wrote:
> This patch adds a new BPF helper function, sk_lookup() which allows BPF
> programs to find out if there is a socket listening on this host, and
> returns a socket pointer which the BPF program can then access to
> determine, for instance, whether to forward or drop traffic. sk_lookup()
> takes a reference on the socket, so when a BPF program makes use of this
> function, it must subsequently pass the returned pointer into the newly
> added sk_release() to return the reference.
> 
> By way of example, the following pseudocode would filter inbound
> connections at XDP if there is no corresponding service listening for
> the traffic:
> 
>   struct bpf_sock_tuple tuple;
>   struct bpf_sock_ops *sk;
> 
>   populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet
>   sk = bpf_sk_lookup(ctx, &tuple, sizeof tuple, netns, 0);
>   if (!sk) {
>     // Couldn't find a socket listening for this traffic. Drop.
>     return TC_ACT_SHOT;
>   }
>   bpf_sk_release(sk, 0);
>   return TC_ACT_OK;
> 
> Signed-off-by: Joe Stringer <joe@wand.net.nz>
> ---
>  include/uapi/linux/bpf.h                  |  39 +++++++++++-
>  kernel/bpf/verifier.c                     |   8 ++-
>  net/core/filter.c                         | 102 ++++++++++++++++++++++++++++++
>  tools/include/uapi/linux/bpf.h            |  40 +++++++++++-
>  tools/testing/selftests/bpf/bpf_helpers.h |   7 ++
>  5 files changed, 193 insertions(+), 3 deletions(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index d615c777b573..29f38838dbca 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1828,6 +1828,25 @@ union bpf_attr {
>   * 	Return
>   * 		0 on success, or a negative error in case of failure.
>   *
> + * struct bpf_sock_ops *bpf_sk_lookup(ctx, tuple, tuple_size, netns, flags)
> + * 	Decription
> + * 		Look for socket matching 'tuple'. The return value must be checked,
> + * 		and if non-NULL, released via bpf_sk_release().
> + * 		@ctx: pointer to ctx
> + * 		@tuple: pointer to struct bpf_sock_tuple
> + * 		@tuple_size: size of the tuple
> + * 		@flags: flags value
> + * 	Return
> + * 		pointer to socket ops on success, or
> + * 		NULL in case of failure
> + *
> + *  int bpf_sk_release(sock, flags)
> + * 	Description
> + * 		Release the reference held by 'sock'.
> + * 		@sock: Pointer reference to release. Must be found via bpf_sk_lookup().
> + * 		@flags: flags value
> + * 	Return
> + * 		0 on success, or a negative error in case of failure.
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\
> @@ -1898,7 +1917,9 @@ union bpf_attr {
>  	FN(xdp_adjust_tail),		\
>  	FN(skb_get_xfrm_state),		\
>  	FN(get_stack),			\
> -	FN(skb_load_bytes_relative),
> +	FN(skb_load_bytes_relative),	\
> +	FN(sk_lookup),			\
> +	FN(sk_release),
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> @@ -2060,6 +2081,22 @@ struct bpf_sock {
>  				 */
>  };
>  
> +struct bpf_sock_tuple {
> +	union {
> +		__be32 ipv6[4];
> +		__be32 ipv4;
> +	} saddr;
> +	union {
> +		__be32 ipv6[4];
> +		__be32 ipv4;
> +	} daddr;
> +	__be16 sport;
> +	__be16 dport;
> +	__u32 dst_if;
> +	__u8 family;
> +	__u8 proto;
> +};
> +
>  #define XDP_PACKET_HEADROOM 256
>  
>  /* User return codes for XDP prog type.
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 92b9a5dc465a..579012c483e4 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -153,6 +153,12 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
>   * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type
>   * passes through a NULL-check conditional. For the branch wherein the state is
>   * changed to CONST_IMM, the verifier releases the reference.
> + *
> + * For each helper function that allocates a reference, such as bpf_sk_lookup(),
> + * there is a corresponding release function, such as bpf_sk_release(). When
> + * a reference type passes into the release function, the verifier also releases
> + * the reference. If any unchecked or unreleased reference remains at the end of
> + * the program, the verifier rejects it.
>   */
>  
>  /* verifier_state + insn_idx are pushed to stack when branch is encountered */
> @@ -277,7 +283,7 @@ static bool arg_type_is_refcounted(enum bpf_arg_type type)
>   */
>  static bool is_release_function(enum bpf_func_id func_id)
>  {
> -	return false;
> +	return func_id == BPF_FUNC_sk_release;
>  }
>  
>  /* string representation of 'enum bpf_reg_type' */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 4c35152fb3a8..751c255d17d3 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -58,8 +58,12 @@
>  #include <net/busy_poll.h>
>  #include <net/tcp.h>
>  #include <net/xfrm.h>
> +#include <net/udp.h>
>  #include <linux/bpf_trace.h>
>  #include <net/xdp_sock.h>
> +#include <net/inet_hashtables.h>
> +#include <net/inet6_hashtables.h>
> +#include <net/net_namespace.h>
>  
>  /**
>   *	sk_filter_trim_cap - run a packet through a socket filter
> @@ -4032,6 +4036,96 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
>  };
>  #endif
>  
> +struct sock *
> +sk_lookup(struct net *net, struct bpf_sock_tuple *tuple) {
Would it be possible to have another version that
returns a sk without taking its refcnt?
It may have performance benefit.

> +	int dst_if = (int)tuple->dst_if;
> +	struct in6_addr *src6;
> +	struct in6_addr *dst6;
> +
> +	if (tuple->family == AF_INET6) {
> +		src6 = (struct in6_addr *)&tuple->saddr.ipv6;
> +		dst6 = (struct in6_addr *)&tuple->daddr.ipv6;
> +	} else if (tuple->family != AF_INET) {
> +		return ERR_PTR(-EOPNOTSUPP);
> +	}
> +
> +	if (tuple->proto == IPPROTO_TCP) {
> +		if (tuple->family == AF_INET)
> +			return inet_lookup(net, &tcp_hashinfo, NULL, 0,
> +					   tuple->saddr.ipv4, tuple->sport,
> +					   tuple->daddr.ipv4, tuple->dport,
> +					   dst_if);
> +		else
> +			return inet6_lookup(net, &tcp_hashinfo, NULL, 0,
> +					    src6, tuple->sport,
> +					    dst6, tuple->dport, dst_if);
> +	} else if (tuple->proto == IPPROTO_UDP) {
> +		if (tuple->family == AF_INET)
> +			return udp4_lib_lookup(net, tuple->saddr.ipv4,
> +					       tuple->sport, tuple->daddr.ipv4,
> +					       tuple->dport, dst_if);
> +		else
> +			return udp6_lib_lookup(net, src6, tuple->sport,
> +					       dst6, tuple->dport, dst_if);
> +	} else {
> +		return ERR_PTR(-EOPNOTSUPP);
> +	}
> +
> +	return NULL;
> +}
> +
> +BPF_CALL_5(bpf_sk_lookup, struct sk_buff *, skb,
> +	   struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
> +{
> +	struct net *caller_net = dev_net(skb->dev);
> +	struct sock *sk = NULL;
> +	struct net *net;
> +
> +	/* XXX: Perform verification-time checking of tuple size? */
> +	if (unlikely(len != sizeof(struct bpf_sock_tuple) || flags))
> +		goto out;
> +
> +	net = get_net_ns_by_id(caller_net, netns_id);
> +	if (unlikely(!net))
> +		goto out;
> +
> +	sk = sk_lookup(net, tuple);
> +	put_net(net);
> +	if (IS_ERR_OR_NULL(sk))
> +		sk = NULL;
> +	else
> +		sk = sk_to_full_sk(sk);
> +out:
> +	return (unsigned long) sk;
> +}
> +
> +static const struct bpf_func_proto bpf_sk_lookup_proto = {
> +	.func		= bpf_sk_lookup,
> +	.gpl_only	= false,
> +	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
> +	.arg1_type	= ARG_PTR_TO_CTX,
> +	.arg2_type	= ARG_PTR_TO_MEM,
> +	.arg3_type	= ARG_CONST_SIZE,
> +	.arg4_type	= ARG_ANYTHING,
> +	.arg5_type	= ARG_ANYTHING,
> +};
> +
> +BPF_CALL_2(bpf_sk_release, struct sock *, sk, u64, flags)
> +{
> +	sock_gen_put(sk);
> +	if (unlikely(flags))
> +		return -EINVAL;
> +	return 0;
> +}
> +
> +static const struct bpf_func_proto bpf_sk_release_proto = {
> +	.func		= bpf_sk_release,
> +	.gpl_only	= false,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_PTR_TO_SOCKET,
> +	.arg2_type	= ARG_ANYTHING,
> +};
> +
>  static const struct bpf_func_proto *
>  bpf_base_func_proto(enum bpf_func_id func_id)
>  {
> @@ -4181,6 +4275,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  	case BPF_FUNC_skb_get_xfrm_state:
>  		return &bpf_skb_get_xfrm_state_proto;
>  #endif
> +	case BPF_FUNC_sk_lookup:
> +		return &bpf_sk_lookup_proto;
> +	case BPF_FUNC_sk_release:
> +		return &bpf_sk_release_proto;
>  	default:
>  		return bpf_base_func_proto(func_id);
>  	}
> @@ -4292,6 +4390,10 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  		return &bpf_get_socket_uid_proto;
>  	case BPF_FUNC_sk_redirect_map:
>  		return &bpf_sk_redirect_map_proto;
> +	case BPF_FUNC_sk_lookup:
> +		return &bpf_sk_lookup_proto;
> +	case BPF_FUNC_sk_release:
> +		return &bpf_sk_release_proto;
>  	default:
>  		return bpf_base_func_proto(func_id);
>  	}
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index fff51c187d1e..29f38838dbca 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -117,6 +117,7 @@ enum bpf_map_type {
>  	BPF_MAP_TYPE_DEVMAP,
>  	BPF_MAP_TYPE_SOCKMAP,
>  	BPF_MAP_TYPE_CPUMAP,
> +	BPF_MAP_TYPE_XSKMAP,
>  };
>  
>  enum bpf_prog_type {
> @@ -1827,6 +1828,25 @@ union bpf_attr {
>   * 	Return
>   * 		0 on success, or a negative error in case of failure.
>   *
> + * struct bpf_sock_ops *bpf_sk_lookup(ctx, tuple, tuple_size, netns, flags)
> + * 	Decription
> + * 		Look for socket matching 'tuple'. The return value must be checked,
> + * 		and if non-NULL, released via bpf_sk_release().
> + * 		@ctx: pointer to ctx
> + * 		@tuple: pointer to struct bpf_sock_tuple
> + * 		@tuple_size: size of the tuple
> + * 		@flags: flags value
> + * 	Return
> + * 		pointer to socket ops on success, or
> + * 		NULL in case of failure
> + *
> + *  int bpf_sk_release(sock, flags)
> + * 	Description
> + * 		Release the reference held by 'sock'.
> + * 		@sock: Pointer reference to release. Must be found via bpf_sk_lookup().
> + * 		@flags: flags value
> + * 	Return
> + * 		0 on success, or a negative error in case of failure.
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\
> @@ -1897,7 +1917,9 @@ union bpf_attr {
>  	FN(xdp_adjust_tail),		\
>  	FN(skb_get_xfrm_state),		\
>  	FN(get_stack),			\
> -	FN(skb_load_bytes_relative),
> +	FN(skb_load_bytes_relative),	\
> +	FN(sk_lookup),			\
> +	FN(sk_release),
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> @@ -2059,6 +2081,22 @@ struct bpf_sock {
>  				 */
>  };
>  
> +struct bpf_sock_tuple {
> +	union {
> +		__be32 ipv6[4];
> +		__be32 ipv4;
> +	} saddr;
> +	union {
> +		__be32 ipv6[4];
> +		__be32 ipv4;
> +	} daddr;
> +	__be16 sport;
> +	__be16 dport;
> +	__u32 dst_if;
> +	__u8 family;
> +	__u8 proto;
> +};
> +
>  #define XDP_PACKET_HEADROOM 256
>  
>  /* User return codes for XDP prog type.
> diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
> index 265f8e0e8ada..4dc311ea0c16 100644
> --- a/tools/testing/selftests/bpf/bpf_helpers.h
> +++ b/tools/testing/selftests/bpf/bpf_helpers.h
> @@ -103,6 +103,13 @@ static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state,
>  	(void *) BPF_FUNC_skb_get_xfrm_state;
>  static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) =
>  	(void *) BPF_FUNC_get_stack;
> +static struct bpf_sock *(*bpf_sk_lookup)(void *ctx,
> +					 struct bpf_sock_tuple *tuple,
> +					 int size, unsigned int netns_id,
> +					 unsigned long long flags) =
> +	(void *) BPF_FUNC_sk_lookup;
> +static int (*bpf_sk_release)(struct bpf_sock *sk, unsigned long long flags) =
> +	(void *) BPF_FUNC_sk_release;
>  
>  /* llvm builtin functions that eBPF C program may use to
>   * emit BPF_LD_ABS and BPF_LD_IND instructions
> -- 
> 2.14.1
> 

^ permalink raw reply

* Re: [PATCH net] macmace: Set platform device coherent_dma_mask
From: Michael Schmitz @ 2018-05-11  4:18 UTC (permalink / raw)
  To: Finn Thain
  Cc: Geert Uytterhoeven, David S. Miller, linux-m68k, netdev,
	Linux Kernel Mailing List, Christoph Hellwig
In-Reply-To: <alpine.LNX.2.21.1805111221180.8@nippy.intranet>

Hi Finn,

Am 11.05.2018 um 15:28 schrieb Finn Thain:
> On Fri, 11 May 2018, Michael Schmitz wrote:
> 
>>>> Which begs the question: why can' you set up all Nubus bus devices' 
>>>> DMA masks in nubus_device_register(), or nubus_add_board()?
>>>
>>> I am expecting to see the same WARNING from the nubus sonic driver but 
>>> it hasn't happened yet, so I don't have a patch for it yet. In 
>>> anycase, the nubus fix would be a lot like the zorro bus fix, so I 
>>> don't see a problem.
>>
>> That's odd. But what I meant to say is that by setting up 
>> dma_coherent_mask in nubus_add_board(), and pointing dma_mask to that, 
>> ypu won't need any patches to Nubus device drivers.
> 
> Right. I think I've already acknowledged that. But it's off-topic, because 
> the patches under review are for platform drivers. Those patches fix an 
> actual bug that I've observed. Whereas, the nubus driver dma mask issue 
> that you raised is purely theoretical at this stage.

I had lost track of the fact that macsonic can be probed as either Nubus
or platform device. Sorry for the noise.

I'm afraid using platform_device_register() (which you already use for
the SCC devices) is the only option handling this on a per-device basis
without touching platform core code, while at the same time keeping the
DMA mask setup out of device drivers (I can see Geert's point there -
device driver code might be shared across implementations of the device
on platforms with different DMA mask requirements,, something the driver
can't be expected to know about).

Cheers,

	Michael

^ permalink raw reply

* Re: [PATCH net] net: Correct wrong skb_flow_limit check when enable RPS
From: Willem de Bruijn @ 2018-05-11  3:54 UTC (permalink / raw)
  To: gfree.wind
  Cc: David Miller, Daniel Borkmann, jakub.kicinski, David Ahern,
	Network Development
In-Reply-To: <1525940884-21067-1-git-send-email-gfree.wind@vip.163.com>

On Thu, May 10, 2018 at 4:28 AM,  <gfree.wind@vip.163.com> wrote:
> From: Gao Feng <gfree.wind@vip.163.com>
>
> The skb flow limit is implemented for each CPU independently. In the
> current codes, the function skb_flow_limit gets the softnet_data by
> this_cpu_ptr. But the target cpu of enqueue_to_backlog would be not
> the current cpu when enable RPS. As the result, the skb_flow_limit checks
> the stats of current CPU, while the skb is going to append the queue of
> another CPU. It isn't the expected behavior.
>
> Now pass the softnet_data as a param to softnet_data to make consistent.

The local cpu softnet_data is used on purpose. The operations in
skb_flow_limit() on sd fields could race if not executed on the local cpu.

Flow limit tries to detect large ("elephant") DoS flows with a fixed four-tuple.
These would always hit the same RPS cpu, so that cpu being backlogged
may be an indication that such a flow is active. But the flow will also always
arrive on the same initial cpu courtesy of RSS. So storing the lookup table
on the initial CPU is also fine. There may be false positives on other CPUs
with the same RPS destination, but that is unlikely with a highly concurrent
traffic server mix ("mice").

Note that the sysctl net.core.flow_limit_cpu_bitmap enables the feature
for the cpus on which traffic initially lands, not the RPS destination cpus.
See also Documentation/networking/scaling.txt

That said, I had to reread the code, as it does seem sensible that the
same softnet_data is intended to be used both when testing qlen and
flow_limit.

^ permalink raw reply

* Re: [PATCH net-next] udp: avoid refcount_t saturation in __udp_gso_segment()
From: Willem de Bruijn @ 2018-05-11  3:30 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, netdev, Eric Dumazet, Willem de Bruijn,
	Alexander Duyck
In-Reply-To: <20180511020713.159465-1-edumazet@google.com>

On Thu, May 10, 2018 at 10:07 PM, Eric Dumazet <edumazet@google.com> wrote:
> For some reason, Willem thought that the issue we fixed for TCP
> in commit 7ec318feeed1 ("tcp: gso: avoid refcount_t warning from
> tcp_gso_segment()") was not relevant for UDP GSO.
>
> But syzbot found its way.

[..]

> Fixes: ad405857b174 ("udp: better wmem accounting on gso")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Willem de Bruijn <willemb@google.com>
> Cc: Alexander Duyck <alexander.h.duyck@intel.com>
> Reported-by: syzbot <syzkaller@googlegroups.com>

Acked-by: Willem de Bruijn <willemb@google.com>

Thanks Eric. Yep, I was naive here. I am quite curious what kind of
gso packet syzkaller was able to cook that exceeds the truesize
of its segments.

^ permalink raw reply

* Re: [PATCH net] macmace: Set platform device coherent_dma_mask
From: Finn Thain @ 2018-05-11  3:28 UTC (permalink / raw)
  To: Michael Schmitz
  Cc: Geert Uytterhoeven, David S. Miller, linux-m68k, netdev,
	Linux Kernel Mailing List, Christoph Hellwig
In-Reply-To: <CAOmrzkLL0ZNzAh7rwiDy=BkNqqXgbqfv3vkN6K8CTF8VGbT8zA@mail.gmail.com>

On Fri, 11 May 2018, Michael Schmitz wrote:

> > > Which begs the question: why can' you set up all Nubus bus devices' 
> > > DMA masks in nubus_device_register(), or nubus_add_board()?
> >
> > I am expecting to see the same WARNING from the nubus sonic driver but 
> > it hasn't happened yet, so I don't have a patch for it yet. In 
> > anycase, the nubus fix would be a lot like the zorro bus fix, so I 
> > don't see a problem.
> 
> That's odd. But what I meant to say is that by setting up 
> dma_coherent_mask in nubus_add_board(), and pointing dma_mask to that, 
> ypu won't need any patches to Nubus device drivers.

Right. I think I've already acknowledged that. But it's off-topic, because 
the patches under review are for platform drivers. Those patches fix an 
actual bug that I've observed. Whereas, the nubus driver dma mask issue 
that you raised is purely theoretical at this stage.

-- 

^ permalink raw reply

* [PATCH] dt-bindings: net: ravb: Add support for r8a77990 SoC
From: Yoshihiro Shimoda @ 2018-05-11  3:18 UTC (permalink / raw)
  To: netdev, linux-renesas-soc, robh+dt, mark.rutland
  Cc: sergei.shtylyov, devicetree, Yoshihiro Shimoda

Add documentation for r8a77990 compatible string to renesas ravb device
tree bindings documentation.

Signed-off-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
---
 Documentation/devicetree/bindings/net/renesas,ravb.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/net/renesas,ravb.txt b/Documentation/devicetree/bindings/net/renesas,ravb.txt
index 890526d..fac897d 100644
--- a/Documentation/devicetree/bindings/net/renesas,ravb.txt
+++ b/Documentation/devicetree/bindings/net/renesas,ravb.txt
@@ -21,6 +21,7 @@ Required properties:
       - "renesas,etheravb-r8a77965" for the R8A77965 SoC.
       - "renesas,etheravb-r8a77970" for the R8A77970 SoC.
       - "renesas,etheravb-r8a77980" for the R8A77980 SoC.
+      - "renesas,etheravb-r8a77990" for the R8A77990 SoC.
       - "renesas,etheravb-r8a77995" for the R8A77995 SoC.
       - "renesas,etheravb-rcar-gen3" as a fallback for the above
 		R-Car Gen3 devices.
-- 
1.9.1

^ permalink raw reply related

* [PATCH net V2] tun: fix use after free for ptr_ring
From: Jason Wang @ 2018-05-11  2:49 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: xiyou.wangcong, eric.dumazet, mst, Jason Wang

We used to initialize ptr_ring during TUNSETIFF, this is because its
size depends on the tx_queue_len of netdevice. And we try to clean it
up when socket were detached from netdevice. A race were spotted when
trying to do uninit during a read which will lead a use after free for
pointer ring. Solving this by always initialize a zero size ptr_ring
in open() and do resizing during TUNSETIFF, and then we can safely do
cleanup during close(). With this, there's no need for the workaround
that was introduced by commit 4df0bfc79904 ("tun: fix a memory leak
for tfile->tx_array").

Reported-by: syzbot+e8b902c3c3fadf0a9dba@syzkaller.appspotmail.com
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Fixes: 1576d9860599 ("tun: switch to use skb array for tx")
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
Changes from v1:
- free ptr_ring during close()
- use tun_ptr_free() during resie for safety
---
 drivers/net/tun.c | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index ef33950..9fbbb32 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -681,15 +681,6 @@ static void tun_queue_purge(struct tun_file *tfile)
 	skb_queue_purge(&tfile->sk.sk_error_queue);
 }
 
-static void tun_cleanup_tx_ring(struct tun_file *tfile)
-{
-	if (tfile->tx_ring.queue) {
-		ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
-		xdp_rxq_info_unreg(&tfile->xdp_rxq);
-		memset(&tfile->tx_ring, 0, sizeof(tfile->tx_ring));
-	}
-}
-
 static void __tun_detach(struct tun_file *tfile, bool clean)
 {
 	struct tun_file *ntfile;
@@ -736,7 +727,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 			    tun->dev->reg_state == NETREG_REGISTERED)
 				unregister_netdevice(tun->dev);
 		}
-		tun_cleanup_tx_ring(tfile);
+		if (tun)
+			xdp_rxq_info_unreg(&tfile->xdp_rxq);
 		sock_put(&tfile->sk);
 	}
 }
@@ -783,14 +775,14 @@ static void tun_detach_all(struct net_device *dev)
 		tun_napi_del(tun, tfile);
 		/* Drop read queue */
 		tun_queue_purge(tfile);
+		xdp_rxq_info_unreg(&tfile->xdp_rxq);
 		sock_put(&tfile->sk);
-		tun_cleanup_tx_ring(tfile);
 	}
 	list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
 		tun_enable_queue(tfile);
 		tun_queue_purge(tfile);
+		xdp_rxq_info_unreg(&tfile->xdp_rxq);
 		sock_put(&tfile->sk);
-		tun_cleanup_tx_ring(tfile);
 	}
 	BUG_ON(tun->numdisabled != 0);
 
@@ -834,7 +826,8 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
 	}
 
 	if (!tfile->detached &&
-	    ptr_ring_init(&tfile->tx_ring, dev->tx_queue_len, GFP_KERNEL)) {
+	    ptr_ring_resize(&tfile->tx_ring, dev->tx_queue_len,
+			    GFP_KERNEL, tun_ptr_free)) {
 		err = -ENOMEM;
 		goto out;
 	}
@@ -3219,6 +3212,11 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 					    &tun_proto, 0);
 	if (!tfile)
 		return -ENOMEM;
+	if (ptr_ring_init(&tfile->tx_ring, 0, GFP_KERNEL)) {
+		sk_free(&tfile->sk);
+		return -ENOMEM;
+	}
+
 	RCU_INIT_POINTER(tfile->tun, NULL);
 	tfile->flags = 0;
 	tfile->ifindex = 0;
@@ -3239,8 +3237,6 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 
 	sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);
 
-	memset(&tfile->tx_ring, 0, sizeof(tfile->tx_ring));
-
 	return 0;
 }
 
@@ -3249,6 +3245,7 @@ static int tun_chr_close(struct inode *inode, struct file *file)
 	struct tun_file *tfile = file->private_data;
 
 	tun_detach(tfile, true);
+	ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
 
 	return 0;
 }
-- 
2.7.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox