Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH net-next] Add Mellanox BlueField Gigabit Ethernet driver
From: Andrew Lunn @ 2020-07-31 17:42 UTC (permalink / raw)
  To: David Thompson; +Cc: netdev, davem, kuba, jiri, Asmaa Mnebhi
In-Reply-To: <1596047355-28777-1-git-send-email-dthompson@mellanox.com>

Hi David

>> +static int mlxbf_gige_mdio_poll_bit(struct mlxbf_gige *priv, u32 bit_mask)
> +{
> +	unsigned long timeout;
> +	u32 val;
> +
> +	timeout = jiffies + msecs_to_jiffies(MLXBF_GIGE_MDIO_POLL_BUSY_TIMEOUT);
> +	do {
> +		val = readl(priv->mdio_io + MLXBF_GIGE_MDIO_GW_OFFSET);
> +		if (!(val & bit_mask))
> +			return 0;
> +		udelay(MLXBF_GIGE_MDIO_POLL_DELAY_USEC);
> +	} while (time_before(jiffies, timeout));

Please use one of the include/linux/iopoll.h macros. 

> +
> +	return -ETIME;

ETIMEDOUT, not ETIME. But that will automatically be fixed when you
use iopoll.h. Core code has less bugs, which is why you should use it.

> +}
> +
> +static int mlxbf_gige_mdio_read(struct mii_bus *bus, int phy_add, int phy_reg)
> +{
> +	struct mlxbf_gige *priv = bus->priv;
> +	u32 cmd;
> +	u32 ret;
> +
> +	/* If the lock is held by something else, drop the request.
> +	 * If the lock is cleared, that means the busy bit was cleared.
> +	 */

How can this happen? The mdio core has a mutex which prevents parallel
access?

> +	ret = mlxbf_gige_mdio_poll_bit(priv, MLXBF_GIGE_MDIO_GW_LOCK_MASK);
> +	if (ret)
> +		return -EBUSY;

PHY drivers are not going to like that. They are not going to
retry. What is likely to happen is that phylib moves into the ERROR
state, and the PHY driver grinds to a halt.

> +static void mlxbf_gige_mdio_disable_phy_int(struct mlxbf_gige *priv)
> +{
> +	unsigned long flags;
> +	u32 val;
> +
> +	spin_lock_irqsave(&priv->gpio_lock, flags);
> +	val = readl(priv->gpio_io + MLXBF_GIGE_GPIO_CAUSE_OR_EVTEN0);
> +	val &= ~priv->phy_int_gpio_mask;
> +	writel(val, priv->gpio_io + MLXBF_GIGE_GPIO_CAUSE_OR_EVTEN0);
> +	spin_unlock_irqrestore(&priv->gpio_lock, flags);
> +}
> +
> +static void mlxbf_gige_mdio_enable_phy_int(struct mlxbf_gige *priv)
> +{
> +	unsigned long flags;
> +	u32 val;
> +
> +	spin_lock_irqsave(&priv->gpio_lock, flags);
> +	/* The INT_N interrupt level is active low.
> +	 * So enable cause fall bit to detect when GPIO
> +	 * state goes low.
> +	 */
> +	val = readl(priv->gpio_io + MLXBF_GIGE_GPIO_CAUSE_FALL_EN);
> +	val |= priv->phy_int_gpio_mask;
> +	writel(val, priv->gpio_io + MLXBF_GIGE_GPIO_CAUSE_FALL_EN);
> +
> +	/* Enable PHY interrupt by setting the priority level */
> +	val = readl(priv->gpio_io +
> +			MLXBF_GIGE_GPIO_CAUSE_OR_EVTEN0);
> +	val |= priv->phy_int_gpio_mask;
> +	writel(val, priv->gpio_io +
> +			MLXBF_GIGE_GPIO_CAUSE_OR_EVTEN0);
> +	spin_unlock_irqrestore(&priv->gpio_lock, flags);
> +}
> +
> +/* Interrupt handler is called from mlxbf_gige_main.c
> + * driver whenever a phy interrupt is received.
> + */
> +irqreturn_t mlxbf_gige_mdio_handle_phy_interrupt(struct mlxbf_gige *priv)
> +{
> +	u32 val;
> +
> +	/* The YU interrupt is shared between SMBus and GPIOs.
> +	 * So first, determine whether this is a GPIO interrupt.
> +	 */
> +	val = readl(priv->cause_rsh_coalesce0_io);
> +	if (!MLXBF_GIGE_GPIO_CAUSE_IRQ_IS_SET(val)) {
> +		/* Nothing to do here, not a GPIO interrupt */
> +		return IRQ_NONE;
> +	}
> +	/* Then determine which gpio register this interrupt is for.
> +	 * Return if the interrupt is not for gpio block 0.
> +	 */
> +	val = readl(priv->cause_gpio_arm_coalesce0_io);
> +	if (!(val & MLXBF_GIGE_GPIO_BLOCK0_MASK))
> +		return IRQ_NONE;
> +
> +	/* Finally check if this interrupt is from PHY device.
> +	 * Return if it is not.
> +	 */
> +	val = readl(priv->gpio_io +
> +			MLXBF_GIGE_GPIO_CAUSE_OR_CAUSE_EVTEN0);
> +	if (!(val & priv->phy_int_gpio_mask))
> +		return IRQ_NONE;
> +
> +	/* Clear interrupt when done, otherwise, no further interrupt
> +	 * will be triggered.
> +	 * Writing 0x1 to the clear cause register also clears the
> +	 * following registers:
> +	 * cause_gpio_arm_coalesce0
> +	 * cause_rsh_coalesce0
> +	 */
> +	val = readl(priv->gpio_io +
> +			MLXBF_GIGE_GPIO_CAUSE_OR_CLRCAUSE);
> +	val |= priv->phy_int_gpio_mask;
> +	writel(val, priv->gpio_io +
> +			MLXBF_GIGE_GPIO_CAUSE_OR_CLRCAUSE);

Shoudn't there be a call into the PHY driver at this point?

> +
> +	return IRQ_HANDLED;
> +}

So these last three functions seem to be an interrupt controller?  So
why not model it as a Linux interrupt controller? 

> +static void mlxbf_gige_mdio_init_config(struct mlxbf_gige *priv)
> +{
> +	struct device *dev = priv->dev;
> +	u32 mdio_full_drive;
> +	u32 mdio_out_sample;
> +	u32 mdio_in_sample;
> +	u32 mdio_voltage;
> +	u32 mdc_period;
> +	u32 mdio_mode;
> +	u32 mdio_cfg;
> +	int ret;
> +
> +	ret = device_property_read_u32(dev, "mdio-mode", &mdio_mode);
> +	if (ret < 0)
> +		mdio_mode = MLXBF_GIGE_MDIO_MODE_MASTER;
> +
> +	ret = device_property_read_u32(dev, "mdio-voltage", &mdio_voltage);
> +	if (ret < 0)
> +		mdio_voltage = MLXBF_GIGE_MDIO3_3;
> +
> +	ret = device_property_read_u32(dev, "mdio-full-drive", &mdio_full_drive);
> +	if (ret < 0)
> +		mdio_full_drive = MLXBF_GIGE_MDIO_FULL_DRIVE;
> +
> +	ret = device_property_read_u32(dev, "mdc-period", &mdc_period);
> +	if (ret < 0)
> +		mdc_period = MLXBF_GIGE_MDIO_PERIOD;
> +
> +	ret = device_property_read_u32(dev, "mdio-in-sample", &mdio_in_sample);
> +	if (ret < 0)
> +		mdio_in_sample = MLXBF_GIGE_MDIO_IN_SAMP;
> +
> +	ret = device_property_read_u32(dev, "mdio-out-sample", &mdio_out_sample);
> +	if (ret < 0)
> +		mdio_out_sample = MLXBF_GIGE_MDIO_OUT_SAMP;

Please see the discussion going on in the thread:

https://lore.kernel.org/linux-acpi/20200715090400.4733-1-calvin.johnson@oss.nxp.com/T/#t

and in particular

https://lore.kernel.org/linux-acpi/20200715090400.4733-1-calvin.johnson@oss.nxp.com/T/#t

My reading of this is you need to provide a specification of these
properties, and show they really are being used. Please join in on
that thread. Until we make progress on how ACPI should be used, you
might want to drop all these properties and just hard code it as a
standard 2.5Mhz MDIO bus.

> +int mlxbf_gige_mdio_probe(struct platform_device *pdev, struct mlxbf_gige *priv)
> +{
> +
> +	ret = device_property_read_u32(dev, "phy-addr", &phy_addr);
> +	if (ret < 0)
> +		phy_addr = MLXBF_GIGE_MDIO_DEFAULT_PHY_ADDR;

This is going to be problematic. See above.

> +
> +	priv->mdiobus->irq[phy_addr] = PHY_POLL;

That is the default anyway. You can skip this.  But why do you have
interrupt handling code, and then poll it? Maybe just delete all the
interrupt code?

> +
> +	/* Auto probe PHY at the corresponding address */
> +	priv->mdiobus->phy_mask = ~(1 << phy_addr);
> +	ret = mdiobus_register(priv->mdiobus);
> +	if (ret)
> +		dev_err(dev, "Failed to register MDIO bus\n");

Does it break if you scan the whole bus? It would allow you to avoid
some of the ACPI issues.

> +
> +	return ret;
> +}
> +

  Andrew

^ permalink raw reply

* Re: [PATCH bpf-next] bpf: fix compilation warning of selftests
From: Andrii Nakryiko @ 2020-07-31 17:39 UTC (permalink / raw)
  To: Jianlin Lv
  Cc: bpf, David S. Miller, Jakub Kicinski, Alexei Starovoitov,
	Daniel Borkmann, Yonghong Song, Song.Zhu, open list, Networking
In-Reply-To: <20200731061600.18344-1-Jianlin.Lv@arm.com>

On Thu, Jul 30, 2020 at 11:18 PM Jianlin Lv <Jianlin.Lv@arm.com> wrote:
>
> Clang compiler version: 12.0.0
> The following warning appears during the selftests/bpf compilation:
>
> prog_tests/send_signal.c:51:3: warning: ignoring return value of ‘write’,
> declared with attribute warn_unused_result [-Wunused-result]
>    51 |   write(pipe_c2p[1], buf, 1);
>       |   ^~~~~~~~~~~~~~~~~~~~~~~~~~
> prog_tests/send_signal.c:54:3: warning: ignoring return value of ‘read’,
> declared with attribute warn_unused_result [-Wunused-result]
>    54 |   read(pipe_p2c[0], buf, 1);
>       |   ^~~~~~~~~~~~~~~~~~~~~~~~~
> ......
>
> prog_tests/stacktrace_build_id_nmi.c:13:2: warning: ignoring return value
> of ‘fscanf’,declared with attribute warn_unused_result [-Wunused-resul]
>    13 |  fscanf(f, "%llu", &sample_freq);
>       |  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>
> test_tcpnotify_user.c:133:2: warning:ignoring return value of ‘system’,
> declared with attribute warn_unused_result [-Wunused-result]
>   133 |  system(test_script);
>       |  ^~~~~~~~~~~~~~~~~~~
> test_tcpnotify_user.c:138:2: warning:ignoring return value of ‘system’,
> declared with attribute warn_unused_result [-Wunused-result]
>   138 |  system(test_script);
>       |  ^~~~~~~~~~~~~~~~~~~
> test_tcpnotify_user.c:143:2: warning:ignoring return value of ‘system’,
> declared with attribute warn_unused_result [-Wunused-result]
>   143 |  system(test_script);
>       |  ^~~~~~~~~~~~~~~~~~~
>
> Add code that fix compilation warning about ignoring return value and
> handles any errors; Check return value of library`s API make the code
> more secure.
>
> Signed-off-by: Jianlin Lv <Jianlin.Lv@arm.com>
> ---
>  .../selftests/bpf/prog_tests/send_signal.c    | 37 ++++++++++++++-----
>  .../bpf/prog_tests/stacktrace_build_id_nmi.c  |  3 +-
>  .../selftests/bpf/test_tcpnotify_user.c       | 15 ++++++--
>  3 files changed, 41 insertions(+), 14 deletions(-)
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c
> index 504abb7bfb95..7a5272e4e810 100644
> --- a/tools/testing/selftests/bpf/prog_tests/send_signal.c
> +++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c
> @@ -48,22 +48,31 @@ static void test_send_signal_common(struct perf_event_attr *attr,
>                 close(pipe_p2c[1]); /* close write */
>
>                 /* notify parent signal handler is installed */
> -               write(pipe_c2p[1], buf, 1);
> +               if (CHECK_FAIL(write(pipe_c2p[1], buf, 1) != 1)) {
> +                       perror("Child: write pipe error");
> +                       goto close_out;
> +               }

Please don't use CHECK_FAIL. Using CHECK is better for many reasons,
but it will also be shorter here (while still recording failure):


CHECK(write(pipe_c2p[1], buf, 1) != 1, "pipe_write", "err %d\n", -errno);


>
>                 /* make sure parent enabled bpf program to send_signal */
> -               read(pipe_p2c[0], buf, 1);
> +               if (CHECK_FAIL(read(pipe_p2c[0], buf, 1) != 1)) {
> +                       perror("Child: read pipe error");
> +                       goto close_out;
> +               }
>

[...]

^ permalink raw reply

* Re: [PATCH net] net: bridge: clear bridge's private skb space on xmit
From: Nikolay Aleksandrov @ 2020-07-31 17:38 UTC (permalink / raw)
  To: David Ahern, netdev; +Cc: bridge, roopa, davem
In-Reply-To: <181931fb-dc60-7db6-60ac-b8ff1402efec@cumulusnetworks.com>

On 31/07/2020 20:37, Nikolay Aleksandrov wrote:
> On 31/07/2020 20:27, David Ahern wrote:
>> On 7/31/20 10:26 AM, Nikolay Aleksandrov wrote:
>>> We need to clear all of the bridge private skb variables as they can be
>>> stale due to the packet being recirculated through the stack and then
>>> transmitted through the bridge device. Similar memset is already done on
>>> bridge's input. We've seen cases where proxyarp_replied was 1 on routed
>>> multicast packets transmitted through the bridge to ports with neigh
>>> suppress which were getting dropped. Same thing can in theory happen with
>>> the port isolation bit as well.
>>>
>>> Fixes: 821f1b21cabb ("bridge: add new BR_NEIGH_SUPPRESS port flag to suppress arp and nd flood")
>>> Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
>>> ---
>>>  net/bridge/br_device.c | 2 ++
>>>  1 file changed, 2 insertions(+)
>>>
>>> diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
>>> index 8c7b78f8bc23..9a2fb4aa1a10 100644
>>> --- a/net/bridge/br_device.c
>>> +++ b/net/bridge/br_device.c
>>> @@ -36,6 +36,8 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
>>>  	const unsigned char *dest;
>>>  	u16 vid = 0;
>>>  
>>> +	memset(skb->cb, 0, sizeof(struct br_input_skb_cb));
>>> +
>>>  	rcu_read_lock();
>>>  	nf_ops = rcu_dereference(nf_br_ops);
>>>  	if (nf_ops && nf_ops->br_dev_xmit_hook(skb)) {
>>>
>>
>> What's the performance hit of doing this on every packet?
>>
>> Can you just set a flag that tells the code to reset on recirculation?
>> Seems like br_input_skb_cb has space for that.
>>
> 
> Virtually non-existent, we had a patch that turned that field into a 16 byte
> field so that is really 2 8 byte stores. It is already cache hot, we could

err, s/field/struct/

> initialize each individual field separately as br_input does.
> 
> I don't want to waste flags on such thing, this makes it future-proof 
> and I'll remove the individual field zeroing later which will alleviate
> the cost further.
> 
> 


^ permalink raw reply

* Re: [PATCH v3 bpf-next 4/9] tcp: Add unknown_opt arg to tcp_parse_options
From: Martin KaFai Lau @ 2020-07-31 17:37 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: bpf, Alexei Starovoitov, Daniel Borkmann, kernel-team,
	Lawrence Brakmo, Neal Cardwell, netdev, Yuchung Cheng
In-Reply-To: <CANn89i+f4se896OPGx6dPKZuObeJR2gaTExqoAHmDK=r7cTmaw@mail.gmail.com>

On Fri, Jul 31, 2020 at 09:12:10AM -0700, Eric Dumazet wrote:
> On Thu, Jul 30, 2020 at 1:58 PM Martin KaFai Lau <kafai@fb.com> wrote:
> >
> > In the latter patch, the bpf prog only wants to be called to handle
> > a header option if that particular header option cannot be handled by
> > the kernel.  This unknown option could be written by the peer's bpf-prog.
> > It could also be a new standard option that the running kernel does not
> > support it while a bpf-prog can handle it.
> >
> > In a latter patch, the bpf prog will be called from tcp_validate_incoming()
> > if there is unknown option and a flag is set in tp->bpf_sock_ops_cb_flags.
> >
> > Instead of using skb->cb[] in an earlier attempt, this patch
> > adds an optional arg "bool *unknown_opt" to tcp_parse_options().
> > The bool will be set to true if it has encountered an option
> > that the kernel does not recognize.
> >
> > Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> > ---
> >  drivers/infiniband/hw/cxgb4/cm.c |  2 +-
> >  include/net/tcp.h                |  3 ++-
> >  net/ipv4/syncookies.c            |  2 +-
> >  net/ipv4/tcp_input.c             | 40 +++++++++++++++++++++-----------
> >  net/ipv4/tcp_minisocks.c         |  4 ++--
> >  net/ipv6/syncookies.c            |  2 +-
> >  6 files changed, 34 insertions(+), 19 deletions(-)
> >
> > diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
> > index 30e08bcc9afb..dedca6576bb9 100644
> > --- a/drivers/infiniband/hw/cxgb4/cm.c
> > +++ b/drivers/infiniband/hw/cxgb4/cm.c
> > @@ -3949,7 +3949,7 @@ static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos)
> >          */
> >         memset(&tmp_opt, 0, sizeof(tmp_opt));
> >         tcp_clear_options(&tmp_opt);
> > -       tcp_parse_options(&init_net, skb, &tmp_opt, 0, NULL);
> > +       tcp_parse_options(&init_net, skb, &tmp_opt, 0, NULL, NULL);
> >
> >         req = __skb_push(skb, sizeof(*req));
> >         memset(req, 0, sizeof(*req));
> > diff --git a/include/net/tcp.h b/include/net/tcp.h
> > index 895e7aabf136..d49d8f1c961a 100644
> > --- a/include/net/tcp.h
> > +++ b/include/net/tcp.h
> > @@ -413,7 +413,8 @@ int tcp_mmap(struct file *file, struct socket *sock,
> >  #endif
> >  void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
> >                        struct tcp_options_received *opt_rx,
> > -                      int estab, struct tcp_fastopen_cookie *foc);
> > +                      int estab, struct tcp_fastopen_cookie *foc,
> > +                      bool *unknown_opt);
> >  const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
> >
> 
> Instead of changing signatures of many functions (and make future
> stable backports challenging)
> how about adding a field into 'struct tcp_options_received' ?
Sounds good.  There is a one byte hole in 'struct tcp_options_received',
so it won't matter much even there is "rx_opt" in "struct tcp_sock".

^ permalink raw reply

* Re: [PATCH net] net: bridge: clear bridge's private skb space on xmit
From: Nikolay Aleksandrov @ 2020-07-31 17:37 UTC (permalink / raw)
  To: David Ahern, netdev; +Cc: bridge, roopa, davem
In-Reply-To: <07823615-29a8-9553-d56b-1beef55a07bc@gmail.com>

On 31/07/2020 20:27, David Ahern wrote:
> On 7/31/20 10:26 AM, Nikolay Aleksandrov wrote:
>> We need to clear all of the bridge private skb variables as they can be
>> stale due to the packet being recirculated through the stack and then
>> transmitted through the bridge device. Similar memset is already done on
>> bridge's input. We've seen cases where proxyarp_replied was 1 on routed
>> multicast packets transmitted through the bridge to ports with neigh
>> suppress which were getting dropped. Same thing can in theory happen with
>> the port isolation bit as well.
>>
>> Fixes: 821f1b21cabb ("bridge: add new BR_NEIGH_SUPPRESS port flag to suppress arp and nd flood")
>> Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
>> ---
>>  net/bridge/br_device.c | 2 ++
>>  1 file changed, 2 insertions(+)
>>
>> diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
>> index 8c7b78f8bc23..9a2fb4aa1a10 100644
>> --- a/net/bridge/br_device.c
>> +++ b/net/bridge/br_device.c
>> @@ -36,6 +36,8 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
>>  	const unsigned char *dest;
>>  	u16 vid = 0;
>>  
>> +	memset(skb->cb, 0, sizeof(struct br_input_skb_cb));
>> +
>>  	rcu_read_lock();
>>  	nf_ops = rcu_dereference(nf_br_ops);
>>  	if (nf_ops && nf_ops->br_dev_xmit_hook(skb)) {
>>
> 
> What's the performance hit of doing this on every packet?
> 
> Can you just set a flag that tells the code to reset on recirculation?
> Seems like br_input_skb_cb has space for that.
> 

Virtually non-existent, we had a patch that turned that field into a 16 byte
field so that is really 2 8 byte stores. It is already cache hot, we could
initialize each individual field separately as br_input does.

I don't want to waste flags on such thing, this makes it future-proof 
and I'll remove the individual field zeroing later which will alleviate
the cost further.



^ permalink raw reply

* Re: [PATCH v3 bpf-next 1/9] tcp: Use a struct to represent a saved_syn
From: Eric Dumazet @ 2020-07-31 17:31 UTC (permalink / raw)
  To: Eric Dumazet, Martin KaFai Lau
  Cc: bpf, Alexei Starovoitov, Daniel Borkmann, kernel-team,
	Lawrence Brakmo, Neal Cardwell, netdev, Yuchung Cheng
In-Reply-To: <CANn89iK8h8x6oVZ0O0P+3gs1NyxfX0F--+Gw4CjOBhHE0NxqqA@mail.gmail.com>



On 7/31/20 8:57 AM, Eric Dumazet wrote:
> On Thu, Jul 30, 2020 at 1:57 PM Martin KaFai Lau <kafai@fb.com> wrote:
>>
>> The TCP_SAVE_SYN has both the network header and tcp header.
>> The total length of the saved syn packet is currently stored in
>> the first 4 bytes (u32) of an array and the actual packet data is
>> stored after that.
>>
>> A latter patch will add a bpf helper that allows to get the tcp header
> 
> s/latter/later/

Sorry, brain fart. I am using a different wording, that is all ;)


^ permalink raw reply

* Re: [net-next] seg6: using DSCP of inner IPv4 packets
From: Ahmed Abdelsalam @ 2020-07-31 17:31 UTC (permalink / raw)
  To: David Miller; +Cc: kuznet, yoshfuji, kuba, linux-kernel, netdev, andrea.mayer
In-Reply-To: <20200730.164424.85007408369570229.davem@davemloft.net>

I will refactor the code of this function and submit a new patch.
Ahmed

On 31/07/2020 01:44, David Miller wrote:
> From: Ahmed Abdelsalam <ahabdels@gmail.com>
> Date: Tue, 28 Jul 2020 12:20:44 +0000
> 
>> This patch allows copying the DSCP from inner IPv4 header to the
>> outer IPv6 header, when doing SRv6 Encapsulation.
>>
>> This allows forwarding packet across the SRv6 fabric based on their
>> original traffic class.
>>
>> Signed-off-by: Ahmed Abdelsalam <ahabdels@gmail.com>
> 
> The conditionals in this function are now a mess.
> 
>> -	inner_hdr = ipv6_hdr(skb);
>> +	if (skb->protocol == htons(ETH_P_IPV6))
>> +		inner_hdr = ipv6_hdr(skb);
>> +	else
>> +		inner_ipv4_hdr = ip_hdr(skb);
>> +
> 
> You assume that if skb->protocol is not ipv6 then it is ipv4.
> 
>> @@ -138,6 +143,10 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
>>   		ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)),
>>   			     flowlabel);
>>   		hdr->hop_limit = inner_hdr->hop_limit;
>> +	} else if (skb->protocol == htons(ETH_P_IP)) {
>> +		ip6_flow_hdr(hdr, inner_ipv4_hdr->tos, flowlabel);
>> +		hdr->hop_limit = inner_ipv4_hdr->ttl;
>> +		memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
>>   	} else {
>>   		ip6_flow_hdr(hdr, 0, flowlabel);
>>   		hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb));
> 
> But this code did not make that assumption at all.
> 
> Only one of the two can be correct.
> 
> The conditional assignment is also very ugly, you have two pointers
> conditionally initialized.  The compiler is going to have a hard time
> figuring out that each pointer is only used in the code path where it
> is guaranteed to be initialiazed.
> 
> And it can't do that, as far as the compiler knows, skb->protocol can
> change between those two locations.  It MUST assume that can happen if
> there are any functions calls whatsoever between these two code points.
> 
> This function has to be sanitized, with better handling of access to
> the inner protocol header values, before I am willing to apply this.
> 

^ permalink raw reply

* Re: [PATCH net] net: bridge: clear bridge's private skb space on xmit
From: David Ahern @ 2020-07-31 17:27 UTC (permalink / raw)
  To: Nikolay Aleksandrov, netdev; +Cc: bridge, roopa, davem
In-Reply-To: <20200731162616.345380-1-nikolay@cumulusnetworks.com>

On 7/31/20 10:26 AM, Nikolay Aleksandrov wrote:
> We need to clear all of the bridge private skb variables as they can be
> stale due to the packet being recirculated through the stack and then
> transmitted through the bridge device. Similar memset is already done on
> bridge's input. We've seen cases where proxyarp_replied was 1 on routed
> multicast packets transmitted through the bridge to ports with neigh
> suppress which were getting dropped. Same thing can in theory happen with
> the port isolation bit as well.
> 
> Fixes: 821f1b21cabb ("bridge: add new BR_NEIGH_SUPPRESS port flag to suppress arp and nd flood")
> Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
> ---
>  net/bridge/br_device.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
> index 8c7b78f8bc23..9a2fb4aa1a10 100644
> --- a/net/bridge/br_device.c
> +++ b/net/bridge/br_device.c
> @@ -36,6 +36,8 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
>  	const unsigned char *dest;
>  	u16 vid = 0;
>  
> +	memset(skb->cb, 0, sizeof(struct br_input_skb_cb));
> +
>  	rcu_read_lock();
>  	nf_ops = rcu_dereference(nf_br_ops);
>  	if (nf_ops && nf_ops->br_dev_xmit_hook(skb)) {
> 

What's the performance hit of doing this on every packet?

Can you just set a flag that tells the code to reset on recirculation?
Seems like br_input_skb_cb has space for that.

^ permalink raw reply

* Re: [Linux-kernel-mentees] [PATCH net] rds: Prevent kernel-infoleak in rds_notify_queue_get()
From: Greg Kroah-Hartman @ 2020-07-31 17:19 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Leon Romanovsky, Peilin Ye, Santosh Shilimkar, David S. Miller,
	Jakub Kicinski, Dan Carpenter, Arnd Bergmann,
	linux-kernel-mentees, netdev, linux-rdma, rds-devel, linux-kernel
In-Reply-To: <20200731143604.GF24045@ziepe.ca>

On Fri, Jul 31, 2020 at 11:36:04AM -0300, Jason Gunthorpe wrote:
> On Fri, Jul 31, 2020 at 04:21:48PM +0200, Greg Kroah-Hartman wrote:
> 
> > > The spec was updated in C11 to require zero'ing padding when doing
> > > partial initialization of aggregates (eg = {})
> > > 
> > > """if it is an aggregate, every member is initialized (recursively)
> > > according to these rules, and any padding is initialized to zero
> > > bits;"""
> > 
> > But then why does the compilers not do this?
> 
> Do you have an example?

At the moment, no, but we have had them in the past due to security
issues we have had to fix for this.

> > > Considering we have thousands of aggregate initializers it
> > > seems likely to me Linux also requires a compiler with this C11
> > > behavior to operate correctly.
> > 
> > Note that this is not an "operate correctly" thing, it is a "zero out
> > stale data in structure paddings so that data will not leak to
> > userspace" thing.
> 
> Yes, not being insecure is "operate correctly", IMHO :)
>  
> > > Does this patch actually fix anything? My compiler generates identical
> > > assembly code in either case.
> > 
> > What compiler version?
> 
> I tried clang 10 and gcc 9.3 for x86-64.
> 
> #include <string.h>
> 
> void test(void *out)
> {
> 	struct rds_rdma_notify {
> 		unsigned long user_token;
> 		unsigned int status;
> 	} foo = {};
> 	memcpy(out, &foo, sizeof(foo));
> }
> 
> $ gcc -mno-sse2 -O2 -Wall -std=c99 t.c -S
> 
> test:
> 	endbr64
> 	movq	$0, (%rdi)
> 	movq	$0, 8(%rdi)
> 	ret
> 
> Just did this same test with gcc 4.4 and it also gave the same output..
> 
> Made it more complex with this:
> 
> 	struct rds_rdma_notify {
> 		unsigned long user_token;
> 		unsigned char status;
> 		unsigned long user_token1;
> 		unsigned char status1;
> 		unsigned long user_token2;
> 		unsigned char status2;
> 		unsigned long user_token3;
> 		unsigned char status3;
> 		unsigned long user_token4;
> 		unsigned char status4;
> 	} foo;
> 
> And still got the same assembly vs memset on gcc 4.4.
> 
> I tried for a bit and didn't find a way to get even old gcc 4.4 to not
> initialize the holes.

Odd, so it is just the "= {0};" that does not zero out the holes?

thanks,

greg k-h

^ permalink raw reply

* Re: [RFC PATCH bpf-next 2/3] bpf: Add helper to do forwarding lookups in kernel FDB table
From: David Ahern @ 2020-07-31 17:15 UTC (permalink / raw)
  To: Yoshiki Komachi, David S. Miller, Alexei Starovoitov,
	Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
	Jakub Kicinski, Martin KaFai Lau, Song Liu, Yonghong Song,
	Andrii Nakryiko, KP Singh, Roopa Prabhu, Nikolay Aleksandrov,
	David Ahern
  Cc: netdev, bridge, bpf
In-Reply-To: <1596170660-5582-3-git-send-email-komachi.yoshiki@gmail.com>

On 7/30/20 10:44 PM, Yoshiki Komachi wrote:
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 654c346b7d91..68800d1b8cd5 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -5084,6 +5085,46 @@ static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
>  	.arg4_type	= ARG_ANYTHING,
>  };
>  
> +#if IS_ENABLED(CONFIG_BRIDGE)
> +BPF_CALL_4(bpf_xdp_fdb_lookup, struct xdp_buff *, ctx,
> +	   struct bpf_fdb_lookup *, params, int, plen, u32, flags)
> +{
> +	struct net_device *src, *dst;
> +	struct net *net;
> +
> +	if (plen < sizeof(*params))
> +		return -EINVAL;

I need to look at the details more closely, but on first reading 2
things caught me eye:
1. you need to make sure flags is 0 since there are no supported flags
at the moment, and

> +
> +	net = dev_net(ctx->rxq->dev);
> +
> +	if (is_multicast_ether_addr(params->addr) ||
> +	    is_broadcast_ether_addr(params->addr))
> +		return BPF_FDB_LKUP_RET_NOENT;
> +
> +	src = dev_get_by_index_rcu(net, params->ifindex);
> +	if (unlikely(!src))
> +		return -ENODEV;
> +
> +	dst = br_fdb_find_port_xdp(src, params->addr, params->vlan_id);

2. this needs to be done via netdev ops to avoid referencing bridge code
which can be compiled as a module. I suspect the build robots will id
this part soon.


^ permalink raw reply

* KASAN: slab-out-of-bounds Read in qrtr_endpoint_post (2)
From: syzbot @ 2020-07-31 17:04 UTC (permalink / raw)
  To: bjorn.andersson, davem, kuba, linux-kernel, manivannan.sadhasivam,
	netdev, syzkaller-bugs

Hello,

syzbot found the following issue on:

HEAD commit:    83bdc727 random32: remove net_rand_state from the latent e..
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=11b2f56c900000
kernel config:  https://syzkaller.appspot.com/x/.config?x=e59ee776d5aa8d55
dashboard link: https://syzkaller.appspot.com/bug?extid=1917d778024161609247
compiler:       clang version 10.0.0 (https://github.com/llvm/llvm-project/ c2443155a0fb245c8f17f2c1c72b6ea391e86e81)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=14ac9b60900000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=14256c5c900000

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+1917d778024161609247@syzkaller.appspotmail.com

==================================================================
BUG: KASAN: slab-out-of-bounds in skb_put_data include/linux/skbuff.h:2260 [inline]
BUG: KASAN: slab-out-of-bounds in qrtr_endpoint_post+0x659/0x1150 net/qrtr/qrtr.c:492
Read of size 4294967294 at addr ffff8880a201b650 by task syz-executor462/6791

CPU: 1 PID: 6791 Comm: syz-executor462 Not tainted 5.8.0-rc7-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1f0/0x31e lib/dump_stack.c:118
 print_address_description+0x66/0x5a0 mm/kasan/report.c:383
 __kasan_report mm/kasan/report.c:513 [inline]
 kasan_report+0x132/0x1d0 mm/kasan/report.c:530
 check_memory_region_inline mm/kasan/generic.c:183 [inline]
 check_memory_region+0x2b5/0x2f0 mm/kasan/generic.c:192
 memcpy+0x25/0x60 mm/kasan/common.c:105
 skb_put_data include/linux/skbuff.h:2260 [inline]
 qrtr_endpoint_post+0x659/0x1150 net/qrtr/qrtr.c:492
 qrtr_tun_write_iter+0xc6/0x120 net/qrtr/tun.c:92
 call_write_iter include/linux/fs.h:1908 [inline]
 new_sync_write fs/read_write.c:503 [inline]
 vfs_write+0xa08/0xc70 fs/read_write.c:578
 ksys_write+0x11b/0x220 fs/read_write.c:631
 do_syscall_64+0x73/0xe0 arch/x86/entry/common.c:384
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x440259
Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 7b 13 fc ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007ffd2181ec68 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 0000000000440259
RDX: 0000000000000010 RSI: 0000000020000040 RDI: 0000000000000003
RBP: 00000000006ca018 R08: 0000000000000000 R09: 00000000004002c8
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000401a60
R13: 0000000000401af0 R14: 0000000000000000 R15: 0000000000000000

Allocated by task 6791:
 save_stack mm/kasan/common.c:48 [inline]
 set_track mm/kasan/common.c:56 [inline]
 __kasan_kmalloc+0x103/0x140 mm/kasan/common.c:494
 __do_kmalloc mm/slab.c:3656 [inline]
 __kmalloc+0x24b/0x330 mm/slab.c:3665
 kmalloc include/linux/slab.h:560 [inline]
 kzalloc+0x16/0x30 include/linux/slab.h:669
 qrtr_tun_write_iter+0x76/0x120 net/qrtr/tun.c:83
 call_write_iter include/linux/fs.h:1908 [inline]
 new_sync_write fs/read_write.c:503 [inline]
 vfs_write+0xa08/0xc70 fs/read_write.c:578
 ksys_write+0x11b/0x220 fs/read_write.c:631
 do_syscall_64+0x73/0xe0 arch/x86/entry/common.c:384
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Freed by task 4860:
 save_stack mm/kasan/common.c:48 [inline]
 set_track mm/kasan/common.c:56 [inline]
 kasan_set_free_info mm/kasan/common.c:316 [inline]
 __kasan_slab_free+0x114/0x170 mm/kasan/common.c:455
 __cache_free mm/slab.c:3426 [inline]
 kfree+0x10a/0x220 mm/slab.c:3757
 simple_xattr_set+0x5ae/0x5e0 fs/xattr.c:923
 __vfs_removexattr+0x3b9/0x3f0 fs/xattr.c:377
 vfs_removexattr+0xa5/0x190 fs/xattr.c:396
 removexattr fs/xattr.c:691 [inline]
 path_removexattr+0x174/0x240 fs/xattr.c:705
 __do_sys_lremovexattr fs/xattr.c:725 [inline]
 __se_sys_lremovexattr fs/xattr.c:722 [inline]
 __x64_sys_lremovexattr+0x59/0x70 fs/xattr.c:722
 do_syscall_64+0x73/0xe0 arch/x86/entry/common.c:384
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

The buggy address belongs to the object at ffff8880a201b640
 which belongs to the cache kmalloc-32 of size 32
The buggy address is located 16 bytes inside of
 32-byte region [ffff8880a201b640, ffff8880a201b660)
The buggy address belongs to the page:
page:ffffea00028806c0 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff8880a201bfc1
flags: 0xfffe0000000200(slab)
raw: 00fffe0000000200 ffffea00027cd3c8 ffffea00027a46c8 ffff8880aa4001c0
raw: ffff8880a201bfc1 ffff8880a201b000 000000010000003c 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff8880a201b500: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc
 ffff8880a201b580: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc
>ffff8880a201b600: fb fb fb fb fc fc fc fc 00 00 fc fc fc fc fc fc
                                                 ^
 ffff8880a201b680: fb fb fb fb fc fc fc fc fb fb fb fb fc fc fc fc
 ffff8880a201b700: 00 fc fc fc fc fc fc fc fb fb fb fb fc fc fc fc
==================================================================


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
syzbot can test patches for this issue, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* KASAN: null-ptr-deref Write in amp_read_loc_assoc_final_data
From: syzbot @ 2020-07-31 17:04 UTC (permalink / raw)
  To: corbet, coreteam, davem, johan.hedberg, kaber, kadlec, kuba,
	linux-bluetooth, linux-kernel, linux-media, linux, marcel,
	mchehab, mchehab, netdev, netfilter-devel, pablo, syzkaller-bugs

Hello,

syzbot found the following issue on:

HEAD commit:    83bdc727 random32: remove net_rand_state from the latent e..
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=176e5d12900000
kernel config:  https://syzkaller.appspot.com/x/.config?x=e59ee776d5aa8d55
dashboard link: https://syzkaller.appspot.com/bug?extid=f4fb0eaafdb51c32a153
compiler:       clang version 10.0.0 (https://github.com/llvm/llvm-project/ c2443155a0fb245c8f17f2c1c72b6ea391e86e81)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=13d5ed24900000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=1381a56c900000

The issue was bisected to:

commit a4585c31c5018578b4abf699ddfdff719dd1c313
Author: Mauro Carvalho Chehab <mchehab@s-opensource.com>
Date:   Tue Oct 18 19:44:09 2016 +0000

    [media] marvell-ccic: don't break long lines

bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=160d627c900000
final oops:     https://syzkaller.appspot.com/x/report.txt?x=150d627c900000
console output: https://syzkaller.appspot.com/x/log.txt?x=110d627c900000

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+f4fb0eaafdb51c32a153@syzkaller.appspotmail.com
Fixes: a4585c31c501 ("[media] marvell-ccic: don't break long lines")

==================================================================
BUG: KASAN: null-ptr-deref in instrument_atomic_write include/linux/instrumented.h:71 [inline]
BUG: KASAN: null-ptr-deref in set_bit include/asm-generic/bitops/instrumented-atomic.h:28 [inline]
BUG: KASAN: null-ptr-deref in amp_read_loc_assoc_final_data+0x115/0x260 net/bluetooth/amp.c:304
Write of size 8 at addr 0000000000000030 by task kworker/u5:2/6842

CPU: 1 PID: 6842 Comm: kworker/u5:2 Not tainted 5.8.0-rc7-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Workqueue: hci0 hci_rx_work
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1f0/0x31e lib/dump_stack.c:118
 __kasan_report mm/kasan/report.c:517 [inline]
 kasan_report+0x151/0x1d0 mm/kasan/report.c:530
 check_memory_region_inline mm/kasan/generic.c:183 [inline]
 check_memory_region+0x2b5/0x2f0 mm/kasan/generic.c:192
 instrument_atomic_write include/linux/instrumented.h:71 [inline]
 set_bit include/asm-generic/bitops/instrumented-atomic.h:28 [inline]
 amp_read_loc_assoc_final_data+0x115/0x260 net/bluetooth/amp.c:304
 hci_chan_selected_evt net/bluetooth/hci_event.c:4897 [inline]
 hci_event_packet+0x8289/0x18240 net/bluetooth/hci_event.c:6164
 hci_rx_work+0x236/0x9c0 net/bluetooth/hci_core.c:4705
 process_one_work+0x789/0xfc0 kernel/workqueue.c:2269
 worker_thread+0xaa4/0x1460 kernel/workqueue.c:2415
 kthread+0x37e/0x3a0 drivers/block/aoe/aoecmd.c:1234
 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:293
==================================================================
Kernel panic - not syncing: panic_on_warn set ...
CPU: 1 PID: 6842 Comm: kworker/u5:2 Tainted: G    B             5.8.0-rc7-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Workqueue: hci0 hci_rx_work
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1f0/0x31e lib/dump_stack.c:118
 panic+0x264/0x7a0 kernel/panic.c:231
 end_report mm/kasan/report.c:104 [inline]
 __kasan_report mm/kasan/report.c:520 [inline]
 kasan_report+0x1c9/0x1d0 mm/kasan/report.c:530
 check_memory_region_inline mm/kasan/generic.c:183 [inline]
 check_memory_region+0x2b5/0x2f0 mm/kasan/generic.c:192
 instrument_atomic_write include/linux/instrumented.h:71 [inline]
 set_bit include/asm-generic/bitops/instrumented-atomic.h:28 [inline]
 amp_read_loc_assoc_final_data+0x115/0x260 net/bluetooth/amp.c:304
 hci_chan_selected_evt net/bluetooth/hci_event.c:4897 [inline]
 hci_event_packet+0x8289/0x18240 net/bluetooth/hci_event.c:6164
 hci_rx_work+0x236/0x9c0 net/bluetooth/hci_core.c:4705
 process_one_work+0x789/0xfc0 kernel/workqueue.c:2269
 worker_thread+0xaa4/0x1460 kernel/workqueue.c:2415
 kthread+0x37e/0x3a0 drivers/block/aoe/aoecmd.c:1234
 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:293
Kernel Offset: disabled
Rebooting in 86400 seconds..

---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
For information about bisection process see: https://goo.gl/tpsmEJ#bisection
syzbot can test patches for this issue, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* Re: [PATCH] net: Phy: Add PHY lookup support on MDIO bus in case of ACPI probe
From: Andrew Lunn @ 2020-07-31 16:54 UTC (permalink / raw)
  To: Vikas Singh
  Cc: f.fainelli, hkallweit1, linux, netdev, calvin.johnson,
	kuldip.dwivedi, madalin.bucur, vikas.singh
In-Reply-To: <1595938298-13190-1-git-send-email-vikas.singh@puresoftware.com>

On Tue, Jul 28, 2020 at 05:41:38PM +0530, Vikas Singh wrote:
> Auto-probe of c45 devices with extended scanning in xgmac_mdio works
> well but fails to update device "fwnode" while registering PHYs on
> MDIO bus.
> This patch is based on https://www.spinics.net/lists/netdev/msg662173.html
> 
> This change will update the "fwnode" while PHYs get registered and allow
> lookup for registered PHYs on MDIO bus from other drivers while probing.
> 
> Signed-off-by: Vikas Singh <vikas.singh@puresoftware.com>
> ---
>  drivers/net/phy/mdio_bus.c | 19 ++++++++++++++++++-
>  1 file changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c
> index 46b3370..7275eff 100644
> --- a/drivers/net/phy/mdio_bus.c
> +++ b/drivers/net/phy/mdio_bus.c
> @@ -447,8 +447,25 @@ static void of_mdiobus_link_mdiodev(struct mii_bus *bus,

Why would a function called of_mdiobus_link_mdiodev() be poking
around trying to find ACPI properties?

       Andrew

^ permalink raw reply

* INFO: trying to register non-static key in skb_queue_purge
From: syzbot @ 2020-07-31 16:54 UTC (permalink / raw)
  To: davem, johan.hedberg, kuba, linux-bluetooth, linux-kernel, marcel,
	netdev, syzkaller-bugs

Hello,

syzbot found the following issue on:

HEAD commit:    83bdc727 random32: remove net_rand_state from the latent e..
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=11088204900000
kernel config:  https://syzkaller.appspot.com/x/.config?x=e59ee776d5aa8d55
dashboard link: https://syzkaller.appspot.com/bug?extid=99efc1c133eff186721a
compiler:       clang version 10.0.0 (https://github.com/llvm/llvm-project/ c2443155a0fb245c8f17f2c1c72b6ea391e86e81)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=12429014900000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=12dbc404900000

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+99efc1c133eff186721a@syzkaller.appspotmail.com

IPVS: ftp: loaded support on port[0] = 21
INFO: trying to register non-static key.
the code is fine but needs lockdep annotation.
turning off the locking correctness validator.
CPU: 0 PID: 6819 Comm: syz-executor370 Not tainted 5.8.0-rc7-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1f0/0x31e lib/dump_stack.c:118
 register_lock_class+0xf06/0x1520 kernel/locking/lockdep.c:893
 __lock_acquire+0x102/0x2c30 kernel/locking/lockdep.c:4259
 lock_acquire+0x160/0x720 kernel/locking/lockdep.c:4959
 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
 _raw_spin_lock_irqsave+0x9e/0xc0 kernel/locking/spinlock.c:159
 skb_dequeue net/core/skbuff.c:3038 [inline]
 skb_queue_purge+0x2e/0x1c0 net/core/skbuff.c:3076
 l2cap_conn_del+0x3de/0x650 net/bluetooth/l2cap_core.c:1890
 hci_disconn_cfm include/net/bluetooth/hci_core.h:1355 [inline]
 hci_conn_hash_flush+0x127/0x200 net/bluetooth/hci_conn.c:1536
 hci_dev_do_close+0xb7b/0x1040 net/bluetooth/hci_core.c:1761
 hci_unregister_dev+0x16d/0x1590 net/bluetooth/hci_core.c:3606
 vhci_release+0x73/0xc0 drivers/bluetooth/hci_vhci.c:340
 __fput+0x2f0/0x750 fs/file_table.c:281
 task_work_run+0x137/0x1c0 kernel/task_work.c:135
 exit_task_work include/linux/task_work.h:25 [inline]
 do_exit+0x601/0x1f80 kernel/exit.c:805
 do_group_exit+0x161/0x2d0 kernel/exit.c:903
 __do_sys_exit_group+0x13/0x20 kernel/exit.c:914
 __se_sys_exit_group+0x10/0x10 kernel/exit.c:912
 __x64_sys_exit_group+0x37/0x40 kernel/exit.c:912
 do_syscall_64+0x73/0xe0 arch/x86/entry/common.c:384
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x444fe8
Code: Bad RIP value.
RSP: 002b:00007ffe95cf39a8 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7
RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 0000000000444fe8
RDX: 0000000000000001 RSI: 000000000000003c RDI: 0000000000000001
RBP: 00000000004cce10 R08: 00000000000000e7 R09: ffffffffffffffd0
R10: 00007fa6b64e3700 R11: 0000000000000246 R12: 0000000000000001
R13: 00000000006e0200 R14: 00000000011eb850 R15: 0000000000000001

---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
syzbot can test patches for this issue, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* INFO: trying to register non-static key in skb_dequeue
From: syzbot @ 2020-07-31 16:54 UTC (permalink / raw)
  To: davem, johan.hedberg, kuba, linux-bluetooth, linux-kernel, marcel,
	netdev, syzkaller-bugs

Hello,

syzbot found the following issue on:

HEAD commit:    83bdc727 random32: remove net_rand_state from the latent e..
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=119bc404900000
kernel config:  https://syzkaller.appspot.com/x/.config?x=c0cfcf935bcc94d2
dashboard link: https://syzkaller.appspot.com/bug?extid=fadfba6a911f6bf71842
compiler:       gcc (GCC) 10.1.0-syz 20200507
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=16ce9270900000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=1485c092900000

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+fadfba6a911f6bf71842@syzkaller.appspotmail.com

IPVS: ftp: loaded support on port[0] = 21
INFO: trying to register non-static key.
the code is fine but needs lockdep annotation.
turning off the locking correctness validator.
CPU: 1 PID: 6833 Comm: syz-executor596 Not tainted 5.8.0-rc7-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x18f/0x20d lib/dump_stack.c:118
 assign_lock_key kernel/locking/lockdep.c:894 [inline]
 register_lock_class+0x157d/0x1630 kernel/locking/lockdep.c:1206
 __lock_acquire+0xfa/0x56e0 kernel/locking/lockdep.c:4259
 lock_acquire+0x1f1/0xad0 kernel/locking/lockdep.c:4959
 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
 _raw_spin_lock_irqsave+0x8c/0xc0 kernel/locking/spinlock.c:159
 skb_dequeue+0x1c/0x180 net/core/skbuff.c:3038
 skb_queue_purge+0x21/0x30 net/core/skbuff.c:3076
 l2cap_chan_del+0x61d/0x1300 net/bluetooth/l2cap_core.c:657
 l2cap_conn_del+0x46a/0x9e0 net/bluetooth/l2cap_core.c:1890
 l2cap_disconn_cfm net/bluetooth/l2cap_core.c:8159 [inline]
 l2cap_disconn_cfm+0x85/0xa0 net/bluetooth/l2cap_core.c:8152
 hci_disconn_cfm include/net/bluetooth/hci_core.h:1355 [inline]
 hci_conn_hash_flush+0x114/0x220 net/bluetooth/hci_conn.c:1536
 hci_dev_do_close+0x5c6/0x1080 net/bluetooth/hci_core.c:1761
 hci_unregister_dev+0x1a3/0xe20 net/bluetooth/hci_core.c:3606
 vhci_release+0x70/0xe0 drivers/bluetooth/hci_vhci.c:340
 __fput+0x33c/0x880 fs/file_table.c:281
 task_work_run+0xdd/0x190 kernel/task_work.c:135
 exit_task_work include/linux/task_work.h:25 [inline]
 do_exit+0xb72/0x2a40 kernel/exit.c:805
 do_group_exit+0x125/0x310 kernel/exit.c:903
 __do_sys_exit_group kernel/exit.c:914 [inline]
 __se_sys_exit_group kernel/exit.c:912 [inline]
 __x64_sys_exit_group+0x3a/0x50 kernel/exit.c:912
 do_syscall_64+0x60/0xe0 arch/x86/entry/common.c:384
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x444fe8
Code: Bad RIP value.
RSP: 002b:00007ffde50eda98 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7
RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 0000000000444fe8
RDX: 0000000000000001 RSI: 000000000000003c RDI: 0000000000000001
RBP: 00000000004cce10 R08: 00000000000000e7 R09: ffffffffffffffd0
R10: 00007f0c43ebb700 R11: 0000000000000246 R12: 0000000000000001
R13: 00000000006e0200 R14: 0000000001898850 R15: 0000000000000001

---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
syzbot can test patches for this issue, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* pull-request: mac80211-next 2020-07-31
From: Johannes Berg @ 2020-07-31 16:54 UTC (permalink / raw)
  To: netdev; +Cc: linux-wireless

Hi Dave,

Here's a set of patches for -next, in case we get a release
on Sunday :-) If not I may have some more next week, but no
point waiting now with this.

Please pull and let me know if there's any problem.

Thanks,
johannes



The following changes since commit 41d707b7332f1386642c47eb078110ca368a46f5:

  fib: fix fib_rules_ops indirect calls wrappers (2020-07-29 13:26:42 -0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211-next.git tags/mac80211-next-for-davem-2020-07-31

for you to fetch changes up to c8ad010665c0b85c6a35466b2159e907b8dd85d1:

  mac80211: warn only once in check_sdata_in_driver() at each caller (2020-07-31 09:27:02 +0200)

----------------------------------------------------------------
We have a number of changes
 * code cleanups and fixups as usual
 * AQL & internal TXQ improvements from Felix
 * some mesh 802.1X support bits
 * some injection improvements from Mathy of KRACK
   fame, so we'll see what this results in ;-)
 * some more initial S1G supports bits, this time
   (some of?) the userspace APIs

----------------------------------------------------------------
Christophe JAILLET (2):
      nl80211: Remove a misleading label in 'nl80211_trigger_scan()'
      nl80211: Simplify error handling path in 'nl80211_trigger_scan()'

Chung-Hsien Hsu (1):
      nl80211: support 4-way handshake offloading for WPA/WPA2-PSK in AP mode

Colin Ian King (1):
      mac80211: remove the need for variable rates_idx

Emmanuel Grumbach (1):
      cfg80211: allow the low level driver to flush the BSS table

Felix Fietkau (4):
      mac80211: improve AQL tx airtime estimation
      net/fq_impl: use skb_get_hash instead of skb_get_hash_perturb
      mac80211: calculate skb hash early when using itxq
      mac80211: add a function for running rx without passing skbs to the stack

Gustavo A. R. Silva (1):
      mac80211: Use fallthrough pseudo-keyword

Johannes Berg (2):
      cfg80211: invert HE BSS color 'disabled' to 'enabled'
      mac80211: warn only once in check_sdata_in_driver() at each caller

Julian Squires (1):
      cfg80211: allow vendor dumpit to terminate by returning 0

Linus Lüssing (1):
      cfg80211/mac80211: add mesh_param "mesh_nolearn" to skip path discovery

Markus Theil (2):
      cfg80211/mac80211: add connected to auth server to meshconf
      cfg80211/mac80211: add connected to auth server to station info

Mathy Vanhoef (6):
      mac80211: never drop injected frames even if normally not allowed
      mac80211: add radiotap flag to prevent sequence number overwrite
      mac80211: do not overwrite the sequence number if requested
      mac80211: use same flag everywhere to avoid sequence number overwrite
      mac80211: remove unused flags argument in transmit functions
      mac80211: parse radiotap header when selecting Tx queue

P Praneesh (1):
      cfg80211/mac80211: avoid bss color setting in non-HE modes

Randy Dunlap (5):
      net/wireless: nl80211.h: drop duplicate words in comments
      net/wireless: wireless.h: drop duplicate word in comments
      net/wireless: cfg80211.h: drop duplicate words in comments
      net/wireless: mac80211.h: drop duplicate words in comments
      net/wireless: regulatory.h: drop duplicate word in comment

Thomas Pedersen (1):
      nl80211: S1G band and channel definitions

Veerendranath Jakkam (1):
      cfg80211: Add support to advertize OCV support

 drivers/net/wireless/ath/ath10k/mac.c |  9 +---
 drivers/net/wireless/ath/ath11k/mac.c |  2 +-
 drivers/net/wireless/mac80211_hwsim.c |  2 +-
 include/net/cfg80211.h                | 41 +++++++++++++--
 include/net/fq.h                      |  1 -
 include/net/fq_impl.h                 |  3 +-
 include/net/ieee80211_radiotap.h      |  1 +
 include/net/mac80211.h                | 42 +++++++++++++--
 include/net/regulatory.h              |  2 +-
 include/uapi/linux/nl80211.h          | 94 +++++++++++++++++++++++++--------
 include/uapi/linux/wireless.h         |  2 +-
 net/mac80211/airtime.c                | 26 +++++++--
 net/mac80211/cfg.c                    | 21 +++++---
 net/mac80211/chan.c                   |  9 +++-
 net/mac80211/debugfs_netdev.c         |  5 ++
 net/mac80211/driver-ops.h             | 11 ++--
 net/mac80211/ht.c                     |  4 +-
 net/mac80211/ibss.c                   |  4 +-
 net/mac80211/ieee80211_i.h            | 16 +++---
 net/mac80211/iface.c                  | 25 +++++----
 net/mac80211/key.c                    |  2 +-
 net/mac80211/mesh.c                   |  9 ++--
 net/mac80211/mesh_hwmp.c              | 41 ++++++++++++++-
 net/mac80211/mesh_plink.c             |  2 +-
 net/mac80211/mlme.c                   | 14 ++---
 net/mac80211/offchannel.c             |  6 +--
 net/mac80211/rx.c                     | 66 ++++++++++++++---------
 net/mac80211/scan.c                   |  8 +--
 net/mac80211/sta_info.c               |  6 ++-
 net/mac80211/sta_info.h               |  2 +
 net/mac80211/status.c                 |  4 +-
 net/mac80211/tdls.c                   |  8 +--
 net/mac80211/tx.c                     | 99 +++++++++++++++++------------------
 net/mac80211/util.c                   | 20 ++++---
 net/mac80211/wme.c                    |  2 +-
 net/wireless/chan.c                   | 35 +++++++++++++
 net/wireless/core.c                   |  5 +-
 net/wireless/mesh.c                   |  1 +
 net/wireless/nl80211.c                | 74 ++++++++++++++------------
 net/wireless/scan.c                   | 10 ++++
 net/wireless/trace.h                  |  4 +-
 net/wireless/util.c                   |  8 +++
 42 files changed, 511 insertions(+), 235 deletions(-)


^ permalink raw reply

* Re: [PATCH] net: add support for threaded NAPI polling
From: Eric Dumazet @ 2020-07-31 16:36 UTC (permalink / raw)
  To: Sebastian Gottschall, Eric Dumazet, Felix Fietkau, netdev; +Cc: Hillf Danton
In-Reply-To: <e65f8b84-e6f2-7aa0-4920-db44c63b5efc@dd-wrt.com>



On 7/30/20 10:21 AM, Sebastian Gottschall wrote:
> 
> Am 30.07.2020 um 18:08 schrieb Eric Dumazet:
>>
>> On 7/30/20 7:30 AM, Sebastian Gottschall wrote:
>>> Am 29.07.2020 um 19:44 schrieb Eric Dumazet:
>>>> On 7/29/20 9:50 AM, Felix Fietkau wrote:
>>>>> For some drivers (especially 802.11 drivers), doing a lot of work in the NAPI
>>>>> poll function does not perform well. Since NAPI poll is bound to the CPU it
>>>>> was scheduled from, we can easily end up with a few very busy CPUs spending
>>>>> most of their time in softirq/ksoftirqd and some idle ones.
>>>>>
>>>>> Introduce threaded NAPI for such drivers based on a workqueue. The API is the
>>>>> same except for using netif_threaded_napi_add instead of netif_napi_add.
>>>>>
>>>>> In my tests with mt76 on MT7621 using threaded NAPI + a thread for tx scheduling
>>>>> improves LAN->WLAN bridging throughput by 10-50%. Throughput without threaded
>>>>> NAPI is wildly inconsistent, depending on the CPU that runs the tx scheduling
>>>>> thread.
>>>>>
>>>>> With threaded NAPI, throughput seems stable and consistent (and higher than
>>>>> the best results I got without it).
>>>>>
>>>>> Based on a patch by Hillf Danton
>>>>>
>>>>> Cc: Hillf Danton <hdanton@sina.com>
>>>>> Signed-off-by: Felix Fietkau <nbd@nbd.name>
>>>>> ---
>>>>> Changes since RFC v2:
>>>>> - fix unused but set variable reported by kbuild test robot
>>>>>
>>>>> Changes since RFC:
>>>>> - disable softirq around threaded poll functions
>>>>> - reuse most parts of napi_poll()
>>>>> - fix re-schedule condition
>>>>>
>>>>>    include/linux/netdevice.h |  23 ++++++
>>>>>    net/core/dev.c            | 162 ++++++++++++++++++++++++++------------
>>>>>    2 files changed, 133 insertions(+), 52 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>>>> index ac2cd3f49aba..3a39211c7598 100644
>>>>> --- a/include/linux/netdevice.h
>>>>> +++ b/include/linux/netdevice.h
>>>>> @@ -347,6 +347,7 @@ struct napi_struct {
>>>>>        struct list_head    dev_list;
>>>>>        struct hlist_node    napi_hash_node;
>>>>>        unsigned int        napi_id;
>>>>> +    struct work_struct    work;
>>>>>    };
>>>>>      enum {
>>>>> @@ -357,6 +358,7 @@ enum {
>>>>>        NAPI_STATE_HASHED,    /* In NAPI hash (busy polling possible) */
>>>>>        NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
>>>>>        NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
>>>>> +    NAPI_STATE_THREADED,    /* Use threaded NAPI */
>>>>>    };
>>>>>      enum {
>>>>> @@ -367,6 +369,7 @@ enum {
>>>>>        NAPIF_STATE_HASHED     = BIT(NAPI_STATE_HASHED),
>>>>>        NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
>>>>>        NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
>>>>> +    NAPIF_STATE_THREADED     = BIT(NAPI_STATE_THREADED),
>>>>>    };
>>>>>      enum gro_result {
>>>>> @@ -2315,6 +2318,26 @@ static inline void *netdev_priv(const struct net_device *dev)
>>>>>    void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
>>>>>                int (*poll)(struct napi_struct *, int), int weight);
>>>>>    +/**
>>>>> + *    netif_threaded_napi_add - initialize a NAPI context
>>>>> + *    @dev:  network device
>>>>> + *    @napi: NAPI context
>>>>> + *    @poll: polling function
>>>>> + *    @weight: default weight
>>>>> + *
>>>>> + * This variant of netif_napi_add() should be used from drivers using NAPI
>>>>> + * with CPU intensive poll functions.
>>>>> + * This will schedule polling from a high priority workqueue that
>>>>> + */
>>>>> +static inline void netif_threaded_napi_add(struct net_device *dev,
>>>>> +                       struct napi_struct *napi,
>>>>> +                       int (*poll)(struct napi_struct *, int),
>>>>> +                       int weight)
>>>>> +{
>>>>> +    set_bit(NAPI_STATE_THREADED, &napi->state);
>>>>> +    netif_napi_add(dev, napi, poll, weight);
>>>>> +}
>>>>> +
>>>>>    /**
>>>>>     *    netif_tx_napi_add - initialize a NAPI context
>>>>>     *    @dev:  network device
>>>>> diff --git a/net/core/dev.c b/net/core/dev.c
>>>>> index 19f1abc26fcd..11b027f3a2b9 100644
>>>>> --- a/net/core/dev.c
>>>>> +++ b/net/core/dev.c
>>>>> @@ -158,6 +158,7 @@ static DEFINE_SPINLOCK(offload_lock);
>>>>>    struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
>>>>>    struct list_head ptype_all __read_mostly;    /* Taps */
>>>>>    static struct list_head offload_base __read_mostly;
>>>>> +static struct workqueue_struct *napi_workq __read_mostly;
>>>>>      static int netif_rx_internal(struct sk_buff *skb);
>>>>>    static int call_netdevice_notifiers_info(unsigned long val,
>>>>> @@ -6286,6 +6287,11 @@ void __napi_schedule(struct napi_struct *n)
>>>>>    {
>>>>>        unsigned long flags;
>>>>>    +    if (test_bit(NAPI_STATE_THREADED, &n->state)) {
>>>>> +        queue_work(napi_workq, &n->work);
>>>>> +        return;
>>>>> +    }
>>>>> +
>>>> Where is the corresponding cancel_work_sync() or flush_work() at device dismantle ?
>>>>
>>>> Just hoping the thread will eventually run seems optimistic to me.
>>>>
>>>>
>>>> Quite frankly, I do believe this STATE_THREADED status should be a generic NAPI attribute
>>>> that can be changed dynamically, at admin request, instead of having to change/recompile
>>>> a driver.
>>> thats not that easy. wifi devices do use dummy netdev devices. they are not visible to sysfs and other administrative options.
>>> so changing it would just be possible if a special mac80211 based control would be implemented for these drivers.
>>> for standard netdev devices it isnt a big thing to implement a administrative control by sysfs (if you are talking about such a feature)
>> We do not want to add code in fast path only for one device. We need something truly generic.
>>
>> I am not saying only the admin can chose, it is fine if a driver does not give the choice
>> and will simply call netif_threaded_napi_add()
> what could make sense if the feature can be disabled / enabled, but it will only affect drivers using the netif_threaded_napi_add call, but it should not affect drivers
> using the old api in any way since not all drivers will work with this feature.


If we provide something in core NAPI stack, we want to make sure we can test/use it with other drivers.

ethtool, or a /sys/class/net/ethXXX entry could be used.

The argument about not affecting other drivers is misleading, since the patch adds another conditional test in
standard NAPI layer.

Lets keep NAPI generic please.

Lets make sure syzbot will find bugs without having to attach a specific mac80211 hardware.

Another concern I have with this patch is that we no longer can contain NIC processing is done
on a selected set of cpus (as commanded in /proc/irq/XXX/smp_affinity).
Or can we ?


^ permalink raw reply

* Re: [PATCH v2] ravb: Fixed the problem that rmmod can not be done
From: Sergei Shtylyov @ 2020-07-31 16:28 UTC (permalink / raw)
  To: ashiduka@fujitsu.com
  Cc: netdev@vger.kernel.org, linux-renesas-soc@vger.kernel.org
In-Reply-To: <OSAPR01MB3844C77766155CAB10BE296CDF4E0@OSAPR01MB3844.jpnprd01.prod.outlook.com>

Hello!

On 7/31/20 1:18 PM, ashiduka@fujitsu.com wrote:

> I understand that the commit log needs to be corrected.

   The subject also could be more concise...

> (Shimoda-san's point is also correct)
> 
> If there is anything else that needs to be corrected, please point it out.

   OK, I'll try to post a proper patch review...

>>    That seems a common pattern, inlluding the Renesas sh_eth
>> driver...
> 
> Yes.

    Not at all so common as I thought! Only 4 drivers use mdio-bitbang, 2 of them are
for the Renesas SoCs...

> If I can get an R-Car Gen2 board, I will also fix sh_eth driver.

    Do yuo have R-Car V3H at hand, by chance? It does have a GEther controler used for
booting up... 

>>    No, the driver's remove() method calls ravb_mdio_release() and
>> that one calls
>> free_mdio_bitbang() that calls module_put(); the actual reason lies
>> somewehre deeper than this...
> 
> No.
> Running rmmod calls delete_module() in kernel/module.c before ravb_mdio_release() is called.
> delete_module()
>    -> try_stop_module()
>      -> try_release_module_ref()
> In try_release_module_ref(), check refcnt and if it is counted up, ravb_mdio_release() is not
> called and rmmod is terminated.

   Yes, after some rummaging in the module support code, I have to agree here. I was
just surprised with you finding such a critical bug so late in the drivers' life cycle.
Well, due to usually using NFS the EtherAVB (and Ether too) driver is probably alwaysbuilt in-kernel...

> Thanks & Best Regards,
> Yuusuke Ashizuka <ashiduka@fujitsu.com>

   Trim your messages after your goodbye. That original message stuff typically isn't
tolerated in the Linux mailing lists, nearly the same as top-posting...

[...]

MBR, Sergei

^ permalink raw reply

* [PATCH net] net: bridge: clear bridge's private skb space on xmit
From: Nikolay Aleksandrov @ 2020-07-31 16:26 UTC (permalink / raw)
  To: netdev; +Cc: bridge, roopa, davem, Nikolay Aleksandrov

We need to clear all of the bridge private skb variables as they can be
stale due to the packet being recirculated through the stack and then
transmitted through the bridge device. Similar memset is already done on
bridge's input. We've seen cases where proxyarp_replied was 1 on routed
multicast packets transmitted through the bridge to ports with neigh
suppress which were getting dropped. Same thing can in theory happen with
the port isolation bit as well.

Fixes: 821f1b21cabb ("bridge: add new BR_NEIGH_SUPPRESS port flag to suppress arp and nd flood")
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
---
 net/bridge/br_device.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 8c7b78f8bc23..9a2fb4aa1a10 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -36,6 +36,8 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 	const unsigned char *dest;
 	u16 vid = 0;

+	memset(skb->cb, 0, sizeof(struct br_input_skb_cb));
+
 	rcu_read_lock();
 	nf_ops = rcu_dereference(nf_br_ops);
 	if (nf_ops && nf_ops->br_dev_xmit_hook(skb)) {
-- 
2.25.4

^ permalink raw reply related

* Re: [PATCH bpf-next v3] Documentation/bpf: Use valid and new links in index.rst
From: Daniel Borkmann @ 2020-07-31 16:19 UTC (permalink / raw)
  To: Tiezhu Yang, Jonathan Corbet, Alexei Starovoitov
  Cc: Martin KaFai Lau, Song Liu, Yonghong Song, Andrii Nakryiko,
	John Fastabend, KP Singh, David S. Miller, Jakub Kicinski,
	Tobin C. Harding, Mauro Carvalho Chehab, linux-doc, netdev, bpf,
	linux-kernel
In-Reply-To: <1596184142-18476-1-git-send-email-yangtiezhu@loongson.cn>

On 7/31/20 10:29 AM, Tiezhu Yang wrote:
> There exists an error "404 Not Found" when I click the html link of
> "Documentation/networking/filter.rst" in the BPF documentation [1],
> fix it.
> 
> Additionally, use the new links about "BPF and XDP Reference Guide"
> and "bpf(2)" to avoid redirects.
> 
> [1] https://www.kernel.org/doc/html/latest/bpf/
> 
> Fixes: d9b9170a2653 ("docs: bpf: Rename README.rst to index.rst")
> Fixes: cb3f0d56e153 ("docs: networking: convert filter.txt to ReST")
> Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>

That looks better, applied, thanks!

^ permalink raw reply

* Re: pull-request: bpf 2020-07-31
From: Daniel Borkmann @ 2020-07-31 16:12 UTC (permalink / raw)
  To: Jiri Olsa; +Cc: davem, kuba, ast, jolsa, netdev, bpf
In-Reply-To: <20200731152432.GA4296@krava>

On 7/31/20 5:24 PM, Jiri Olsa wrote:
> On Fri, Jul 31, 2020 at 03:51:45PM +0200, Daniel Borkmann wrote:
>> Hi David,
>>
>> The following pull-request contains BPF updates for your *net* tree.
>>
>> We've added 5 non-merge commits during the last 21 day(s) which contain
>> a total of 5 files changed, 126 insertions(+), 18 deletions(-).
>>
>> The main changes are:
>>
>> 1) Fix a map element leak in HASH_OF_MAPS map type, from Andrii Nakryiko.
>>
>> 2) Fix a NULL pointer dereference in __btf_resolve_helper_id() when no
>>     btf_vmlinux is available, from Peilin Ye.
>>
>> 3) Init pos variable in __bpfilter_process_sockopt(), from Christoph Hellwig.
>>
>> 4) Fix a cgroup sockopt verifier test by specifying expected attach type,
>>     from Jean-Philippe Brucker.
>>
>> Please consider pulling these changes from:
>>
>>    git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git
>>
>> Thanks a lot!
>>
>> Note that when net gets merged into net-next later on, there is a small
>> merge conflict in kernel/bpf/btf.c between commit 5b801dfb7feb ("bpf: Fix
>> NULL pointer dereference in __btf_resolve_helper_id()") from the bpf tree
>> and commit 138b9a0511c7 ("bpf: Remove btf_id helpers resolving") from the
>> net-next tree.
>>
>> Resolve as follows: remove the old hunk with the __btf_resolve_helper_id()
>> function. Change the btf_resolve_helper_id() so it actually tests for a
>> NULL btf_vmlinux and bails out:
>>
>> int btf_resolve_helper_id(struct bpf_verifier_log *log,
>>                            const struct bpf_func_proto *fn, int arg)
>> {
>>          int id;
>>
>>          if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID || !btf_vmlinux)
>>                  return -EINVAL;
>>          id = fn->btf_id[arg];
>>          if (!id || id > btf_vmlinux->nr_types)
>>                  return -EINVAL;
>>          return id;
>> }
>>
>> Let me know if you run into any others issues (CC'ing Jiri Olsa so he's in
>> the loop with regards to merge conflict resolution).
> 
> we'll loose the bpf_log message, but I'm fine with that ;-) looks good

Checking again on the fix, even though it was only triggered by syzkaller
so far, I think it's also possible if users don't have BTF debug data set
in the Kconfig but use a helper that expects it, so agree, lets re-add the
log in this case:

int btf_resolve_helper_id(struct bpf_verifier_log *log,
                           const struct bpf_func_proto *fn, int arg)
{
         int id;

         if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID)
                 return -EINVAL;
         if (!btf_vmlinux) {
                 bpf_log(log, "btf_vmlinux doesn't exist\n");
                 return -EINVAL;
         }
         id = fn->btf_id[arg];
         if (!id || id > btf_vmlinux->nr_types)
                 return -EINVAL;
         return id;
}

Thanks,
Daniel

^ permalink raw reply

* Re: [PATCH v3 bpf-next 4/9] tcp: Add unknown_opt arg to tcp_parse_options
From: Eric Dumazet @ 2020-07-31 16:12 UTC (permalink / raw)
  To: Martin KaFai Lau
  Cc: bpf, Alexei Starovoitov, Daniel Borkmann, kernel-team,
	Lawrence Brakmo, Neal Cardwell, netdev, Yuchung Cheng
In-Reply-To: <20200730205723.3353838-1-kafai@fb.com>

On Thu, Jul 30, 2020 at 1:58 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> In the latter patch, the bpf prog only wants to be called to handle
> a header option if that particular header option cannot be handled by
> the kernel.  This unknown option could be written by the peer's bpf-prog.
> It could also be a new standard option that the running kernel does not
> support it while a bpf-prog can handle it.
>
> In a latter patch, the bpf prog will be called from tcp_validate_incoming()
> if there is unknown option and a flag is set in tp->bpf_sock_ops_cb_flags.
>
> Instead of using skb->cb[] in an earlier attempt, this patch
> adds an optional arg "bool *unknown_opt" to tcp_parse_options().
> The bool will be set to true if it has encountered an option
> that the kernel does not recognize.
>
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
>  drivers/infiniband/hw/cxgb4/cm.c |  2 +-
>  include/net/tcp.h                |  3 ++-
>  net/ipv4/syncookies.c            |  2 +-
>  net/ipv4/tcp_input.c             | 40 +++++++++++++++++++++-----------
>  net/ipv4/tcp_minisocks.c         |  4 ++--
>  net/ipv6/syncookies.c            |  2 +-
>  6 files changed, 34 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
> index 30e08bcc9afb..dedca6576bb9 100644
> --- a/drivers/infiniband/hw/cxgb4/cm.c
> +++ b/drivers/infiniband/hw/cxgb4/cm.c
> @@ -3949,7 +3949,7 @@ static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos)
>          */
>         memset(&tmp_opt, 0, sizeof(tmp_opt));
>         tcp_clear_options(&tmp_opt);
> -       tcp_parse_options(&init_net, skb, &tmp_opt, 0, NULL);
> +       tcp_parse_options(&init_net, skb, &tmp_opt, 0, NULL, NULL);
>
>         req = __skb_push(skb, sizeof(*req));
>         memset(req, 0, sizeof(*req));
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 895e7aabf136..d49d8f1c961a 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -413,7 +413,8 @@ int tcp_mmap(struct file *file, struct socket *sock,
>  #endif
>  void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
>                        struct tcp_options_received *opt_rx,
> -                      int estab, struct tcp_fastopen_cookie *foc);
> +                      int estab, struct tcp_fastopen_cookie *foc,
> +                      bool *unknown_opt);
>  const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
>

Instead of changing signatures of many functions (and make future
stable backports challenging)
how about adding a field into 'struct tcp_options_received' ?

Sorry for not suggesting this earlier :/

^ permalink raw reply

* Re: [PATCH v3 bpf-next 6/9] bpf: tcp: Allow bpf prog to write and parse TCP header option
From: Eric Dumazet @ 2020-07-31 16:06 UTC (permalink / raw)
  To: Martin KaFai Lau
  Cc: bpf, Alexei Starovoitov, Daniel Borkmann, kernel-team,
	Lawrence Brakmo, Neal Cardwell, netdev, Yuchung Cheng
In-Reply-To: <20200730205736.3354304-1-kafai@fb.com>

On Thu, Jul 30, 2020 at 1:57 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> The earlier effort in BPF-TCP-CC allows the TCP Congestion Control
> algorithm to be written in BPF.  It opens up opportunities to allow
> a faster turnaround time in testing/releasing new congestion control
> ideas to production environment.
>
> The same flexibility can be extended to writing TCP header option.
> It is not uncommon that people want to test new TCP header option
> to improve the TCP performance.  Another use case is for data-center
> that has a more controlled environment and has more flexibility in
> putting header options for internal only use.
>
> For example, we want to test the idea in putting maximum delay
> ACK in TCP header option which is similar to a draft RFC proposal [1].
>
> This patch introduces the necessary BPF API and use them in the
> TCP stack to allow BPF_PROG_TYPE_SOCK_OPS program to parse
> and write TCP header options.  It currently supports most of
> the TCP packet except RST.
>
> Supported TCP header option:
> ───────────────────────────
> This patch allows the bpf-prog to write any option kind.
> Different bpf-progs can write its own option by calling the new helper
> bpf_store_hdr_opt().  The helper will ensure there is no duplicated
> option in the header.
>
> By allowing bpf-prog to write any option kind, this gives a lot of
> flexibility to the bpf-prog.  Different bpf-prog can write its
> own option kind.  It could also allow the bpf-prog to support a
> recently standardized option on an older kernel.
>
> Sockops Callback Flags:
> ──────────────────────
> The header parsing and writing callback can be turned on
> by enabling a few newly added callback flags:
>
> BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG:
>         Call bpf when kernel has received a header option that
>         the kernel cannot handle.  It is useful when the peer doesn't
>         send bpf-options very often.
>
>         The bpf-prog can inspect the received header by sock_ops->skb_data
>         which covers the whole header (including the fixed fields like
>         ports, flags...etc) or
>         use the new bpf_load_hdr_opt() to search for a particular TCP
>         header option.
>
>
>
>

> [1]: draft-wang-tcpm-low-latency-opt-00
>      https://tools.ietf.org/html/draft-wang-tcpm-low-latency-opt-00
>
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
>  include/linux/bpf-cgroup.h     |  25 +++
>  include/linux/filter.h         |   4 +
>  include/net/tcp.h              |  53 ++++-
>  include/uapi/linux/bpf.h       | 231 ++++++++++++++++++++-
>  net/core/filter.c              | 365 +++++++++++++++++++++++++++++++++
>  net/ipv4/tcp_fastopen.c        |   2 +-
>  net/ipv4/tcp_input.c           |  86 +++++++-
>  net/ipv4/tcp_ipv4.c            |   3 +-
>  net/ipv4/tcp_minisocks.c       |   1 +
>  net/ipv4/tcp_output.c          | 194 ++++++++++++++++--
>  net/ipv6/tcp_ipv6.c            |   3 +-
>  tools/include/uapi/linux/bpf.h | 231 ++++++++++++++++++++-
>  12 files changed, 1171 insertions(+), 27 deletions(-)

This is a truly gigantic patch.

Could you split it in maybe two parts ?

This way I could focus on the TCP changes, and let eBPF experts focus
on BPF changes.

^ permalink raw reply

* Re: [net-next v4 1/6] net: marvell: prestera: Add driver for Prestera family ASIC devices
From: Andy Shevchenko @ 2020-07-31 16:02 UTC (permalink / raw)
  To: Vadym Kochan
  Cc: David S. Miller, Jakub Kicinski, Jiri Pirko, Ido Schimmel,
	Andrew Lunn, Oleksandr Mazur, Serhiy Boiko, Serhiy Pshyk,
	Volodymyr Mytnyk, Taras Chornyi, Andrii Savka, netdev,
	Linux Kernel Mailing List, Mickey Rachamim
In-Reply-To: <20200731152201.GB10391@plvision.eu>

On Fri, Jul 31, 2020 at 6:22 PM Vadym Kochan <vadym.kochan@plvision.eu> wrote:
> On Mon, Jul 27, 2020 at 03:59:13PM +0300, Andy Shevchenko wrote:
> > On Mon, Jul 27, 2020 at 3:23 PM Vadym Kochan <vadym.kochan@plvision.eu> wrote:

...

> > > Signed-off-by: Andrii Savka <andrii.savka@plvision.eu>
> > > Signed-off-by: Oleksandr Mazur <oleksandr.mazur@plvision.eu>
> > > Signed-off-by: Serhiy Boiko <serhiy.boiko@plvision.eu>
> > > Signed-off-by: Serhiy Pshyk <serhiy.pshyk@plvision.eu>
> > > Signed-off-by: Taras Chornyi <taras.chornyi@plvision.eu>
> > > Signed-off-by: Volodymyr Mytnyk <volodymyr.mytnyk@plvision.eu>
> > > Signed-off-by: Vadym Kochan <vadym.kochan@plvision.eu>
> >
> > This needs more work. You have to really understand the role of each
> > person in the above list.
> > I highly recommend (re-)read sections 11-13 of Submitting Patches.
> >
> At least looks like I need to add these persons as Co-author's.

I don't know, you are!
And I think you meant Co-developer's

...

> > > +#include <linux/string.h>
> > > +#include <linux/bitops.h>
> > > +#include <linux/bitfield.h>
> > > +#include <linux/errno.h>
> >
> > Perhaps ordered?
> >
> alphabetical ?

Yes.

...

> > > +       struct prestera_msg_event_port *hw_evt;
> > > +
> > > +       hw_evt = (struct prestera_msg_event_port *)msg;
> >
> > Can be one line I suppose.
> >
> Yes, but I am trying to avoid line-breaking because of 80 chars
> limitation.

We have 100, but okay.

...

> > > +       if (evt->id == PRESTERA_PORT_EVENT_STATE_CHANGED)
> > > +               evt->port_evt.data.oper_state = hw_evt->param.oper_state;
> > > +       else
> > > +               return -EINVAL;
> > > +
> > > +       return 0;
> >
> > Perhaps traditional pattern, i.e.
> >
> >   if (...)
> >     return -EINVAL;
> >   ...
> >   return 0;
> >
> I am not sure if it is applicable here, because the error state here
> is if 'evt->id' did not matched after all checks. Actually this is
> simply a 'switch', but I use 'if' to have shorter code.

Then do it a switch-case. I can see that other reviewers/contributors
may stumble over this.

...

> > > +       /* Only 0xFF mac addrs are supported */
> > > +       if (port->fp_id >= 0xFF)
> > > +               goto err_port_init;
> >
> > You meant 255, right?
> > Otherwise you have to mentioned is it byte limitation or what?
> >
> > ...
> Yes, 255 is a limitation because of max byte value.

But 255 itself is some kind of error value? Perhaps it deserves a definition.

...

> > > +       np = of_find_compatible_node(NULL, NULL, "marvell,prestera");
> > > +       if (np) {
> > > +               base_mac_np = of_parse_phandle(np, "base-mac-provider", 0);
> > > +               if (base_mac_np) {
> > > +                       const char *base_mac;
> > > +
> > > +                       base_mac = of_get_mac_address(base_mac_np);
> > > +                       of_node_put(base_mac_np);
> > > +                       if (!IS_ERR(base_mac))
> > > +                               ether_addr_copy(sw->base_mac, base_mac);
> > > +               }
> > > +       }
> > > +
> > > +       if (!is_valid_ether_addr(sw->base_mac)) {
> > > +               eth_random_addr(sw->base_mac);
> > > +               dev_info(sw->dev->dev, "using random base mac address\n");
> > > +       }
> >
> > Isn't it device_get_mac_address() reimplementation?
> >
> device_get_mac_address() just tries to get mac via fwnode.

Yes, and how is it different from here? (consider
fwnode_get_mac_address() if it suits better).

...

> > > +       new_skb = alloc_skb(len, GFP_ATOMIC | GFP_DMA);
> >
> > Atomic? Why?
> >
> TX path might be called from net_tx_action which is softirq.

Okay, but GFP_DMA is quite a limitation to the memory region. Can't  be 32-bit?

...

> > > +       int tx_retry_num = 10 * tx_ring->max_burst;
> >
> > Magic!
> You mean the code is magic ? Yes, I am trying to relax the
> calling of SDMA engine.

Usually when reviewers tell you about magic it assumes magic numbers
whose meaning is not clear.
(Requires either to be defined or commented)

-- 
With Best Regards,
Andy Shevchenko

^ permalink raw reply

* Re: [PATCH v3 bpf-next 1/9] tcp: Use a struct to represent a saved_syn
From: Eric Dumazet @ 2020-07-31 15:57 UTC (permalink / raw)
  To: Martin KaFai Lau
  Cc: bpf, Alexei Starovoitov, Daniel Borkmann, kernel-team,
	Lawrence Brakmo, Neal Cardwell, netdev, Yuchung Cheng
In-Reply-To: <20200730205704.3352619-1-kafai@fb.com>

On Thu, Jul 30, 2020 at 1:57 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> The TCP_SAVE_SYN has both the network header and tcp header.
> The total length of the saved syn packet is currently stored in
> the first 4 bytes (u32) of an array and the actual packet data is
> stored after that.
>
> A latter patch will add a bpf helper that allows to get the tcp header

s/latter/later/

> alone from the saved syn without the network header.  It will be more
> convenient to have a direct offset to a specific header instead of
> re-parsing it.  This requires to separately store the network hdrlen.
> The total header length (i.e. network + tcp) is still needed for the
> current usage in getsockopt.  Although this total length can be obtained
> by looking into the tcphdr and then get the (th->doff << 2), this patch
> chooses to directly store the tcp hdrlen in the second four bytes of
> this newly created "struct saved_syn".  By using a new struct, it can
> give a readable name to each individual header length.
>
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>


> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index a018bafd7bdf..6c38ca9de17e 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -6598,13 +6598,14 @@ static void tcp_reqsk_record_syn(const struct sock *sk,
>  {
>         if (tcp_sk(sk)->save_syn) {
>                 u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
> -               u32 *copy;
> -
> -               copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
> -               if (copy) {
> -                       copy[0] = len;
> -                       memcpy(&copy[1], skb_network_header(skb), len);
> -                       req->saved_syn = copy;
> +               struct saved_syn *saved_syn;
> +
> +               saved_syn = kmalloc(len + sizeof(*saved_syn), GFP_ATOMIC);

Please use
                  saved_syn = kmalloc(struct_size(saved_syn, data,
len), GFP_ATOMIC)

This will avoid yet another trivial patch in the future.

> +               if (saved_syn) {
> +                       saved_syn->network_hdrlen = skb_network_header_len(skb);
> +                       saved_syn->tcp_hdrlen = tcp_hdrlen(skb);
> +                       memcpy(saved_syn->data, skb_network_header(skb), len);
> +                       req->saved_syn = saved_syn;
>                 }
>         }
>  }
> --
> 2.24.1
>

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox