* [PATCH] [RESEND][for 4.14] net: qcom/emac: add software control for pause frame mode
From: Timur Tabi @ 2017-09-20 20:32 UTC (permalink / raw)
To: David S. Miller, netdev; +Cc: timur
The EMAC has the option of sending only a single pause frame when
flow control is enabled and the RX queue is full. Although sending
only one pause frame has little value, this would allow admins to
enable automatic flow control without having to worry about the EMAC
flooding nearby switches with pause frames if the kernel hangs.
The option is enabled by using the single-pause-mode private flag.
Signed-off-by: Timur Tabi <timur@codeaurora.org>
---
drivers/net/ethernet/qualcomm/emac/emac-ethtool.c | 30 +++++++++++++++++++++++
drivers/net/ethernet/qualcomm/emac/emac-mac.c | 22 +++++++++++++++++
drivers/net/ethernet/qualcomm/emac/emac.c | 3 +++
drivers/net/ethernet/qualcomm/emac/emac.h | 3 +++
4 files changed, 58 insertions(+)
diff --git a/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c b/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c
index bbe24639aa5a..c8c6231b87f3 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c
@@ -88,6 +88,8 @@ static void emac_set_msglevel(struct net_device *netdev, u32 data)
static int emac_get_sset_count(struct net_device *netdev, int sset)
{
switch (sset) {
+ case ETH_SS_PRIV_FLAGS:
+ return 1;
case ETH_SS_STATS:
return EMAC_STATS_LEN;
default:
@@ -100,6 +102,10 @@ static void emac_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
unsigned int i;
switch (stringset) {
+ case ETH_SS_PRIV_FLAGS:
+ strcpy(data, "single-pause-mode");
+ break;
+
case ETH_SS_STATS:
for (i = 0; i < EMAC_STATS_LEN; i++) {
strlcpy(data, emac_ethtool_stat_strings[i],
@@ -230,6 +236,27 @@ static int emac_get_regs_len(struct net_device *netdev)
return EMAC_MAX_REG_SIZE * sizeof(u32);
}
+#define EMAC_PRIV_ENABLE_SINGLE_PAUSE BIT(0)
+
+static int emac_set_priv_flags(struct net_device *netdev, u32 flags)
+{
+ struct emac_adapter *adpt = netdev_priv(netdev);
+
+ adpt->single_pause_mode = !!(flags & EMAC_PRIV_ENABLE_SINGLE_PAUSE);
+
+ if (netif_running(netdev))
+ return emac_reinit_locked(adpt);
+
+ return 0;
+}
+
+static u32 emac_get_priv_flags(struct net_device *netdev)
+{
+ struct emac_adapter *adpt = netdev_priv(netdev);
+
+ return adpt->single_pause_mode ? EMAC_PRIV_ENABLE_SINGLE_PAUSE : 0;
+}
+
static const struct ethtool_ops emac_ethtool_ops = {
.get_link_ksettings = phy_ethtool_get_link_ksettings,
.set_link_ksettings = phy_ethtool_set_link_ksettings,
@@ -253,6 +280,9 @@ static int emac_get_regs_len(struct net_device *netdev)
.get_regs_len = emac_get_regs_len,
.get_regs = emac_get_regs,
+
+ .set_priv_flags = emac_set_priv_flags,
+ .get_priv_flags = emac_get_priv_flags,
};
void emac_set_ethtool_ops(struct net_device *netdev)
diff --git a/drivers/net/ethernet/qualcomm/emac/emac-mac.c b/drivers/net/ethernet/qualcomm/emac/emac-mac.c
index bcd4708b3745..0ea3ca09c689 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac-mac.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac-mac.c
@@ -551,6 +551,28 @@ static void emac_mac_start(struct emac_adapter *adpt)
mac &= ~(HUGEN | VLAN_STRIP | TPAUSE | SIMR | HUGE | MULTI_ALL |
DEBUG_MODE | SINGLE_PAUSE_MODE);
+ /* Enable single-pause-frame mode if requested.
+ *
+ * If enabled, the EMAC will send a single pause frame when the RX
+ * queue is full. This normally leads to packet loss because
+ * the pause frame disables the remote MAC only for 33ms (the quanta),
+ * and then the remote MAC continues sending packets even though
+ * the RX queue is still full.
+ *
+ * If disabled, the EMAC sends a pause frame every 31ms until the RX
+ * queue is no longer full. Normally, this is the preferred
+ * method of operation. However, when the system is hung (e.g.
+ * cores are halted), the EMAC interrupt handler is never called
+ * and so the RX queue fills up quickly and stays full. The resuling
+ * non-stop "flood" of pause frames sometimes has the effect of
+ * disabling nearby switches. In some cases, other nearby switches
+ * are also affected, shutting down the entire network.
+ *
+ * The user can enable or disable single-pause-frame mode
+ * via ethtool.
+ */
+ mac |= adpt->single_pause_mode ? SINGLE_PAUSE_MODE : 0;
+
writel_relaxed(csr1, adpt->csr + EMAC_EMAC_WRAPPER_CSR1);
writel_relaxed(mac, adpt->base + EMAC_MAC_CTRL);
diff --git a/drivers/net/ethernet/qualcomm/emac/emac.c b/drivers/net/ethernet/qualcomm/emac/emac.c
index 60850bfa3d32..759543512117 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac.c
+++ b/drivers/net/ethernet/qualcomm/emac/emac.c
@@ -443,6 +443,9 @@ static void emac_init_adapter(struct emac_adapter *adpt)
/* default to automatic flow control */
adpt->automatic = true;
+
+ /* Disable single-pause-frame mode by default */
+ adpt->single_pause_mode = false;
}
/* Get the clock */
diff --git a/drivers/net/ethernet/qualcomm/emac/emac.h b/drivers/net/ethernet/qualcomm/emac/emac.h
index 8ee4ec6aef2e..d7c9f44209d4 100644
--- a/drivers/net/ethernet/qualcomm/emac/emac.h
+++ b/drivers/net/ethernet/qualcomm/emac/emac.h
@@ -363,6 +363,9 @@ struct emac_adapter {
bool tx_flow_control;
bool rx_flow_control;
+ /* True == use single-pause-frame mode. */
+ bool single_pause_mode;
+
/* Ring parameter */
u8 tpd_burst;
u8 rfd_burst;
--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm
Technologies, Inc. Qualcomm Technologies, Inc. is a member of the
Code Aurora Forum, a Linux Foundation Collaborative Project.
^ permalink raw reply related
* [PATCH iproute2 v2] ip: ipaddress: fix missing space after prefixlen
From: Julien Fortin @ 2017-09-20 20:26 UTC (permalink / raw)
To: netdev; +Cc: roopa, nikolay, dsa, sd, Julien Fortin
From: Julien Fortin <julien@cumulusnetworks.com>
Fixes: d0e720111aad2 ("ip: ipaddress.c: add support for json output")
Reported-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: Julien Fortin <julien@cumulusnetworks.com>
---
ip/ipaddress.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 97971450..fb496bbb 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -1604,7 +1604,7 @@ int print_addrinfo(const struct sockaddr_nl *who, struct nlmsghdr *n,
format_host_rta(ifa->ifa_family,
rta_tb[IFA_ADDRESS]));
}
- print_int(PRINT_ANY, "prefixlen", "/%d", ifa->ifa_prefixlen);
+ print_int(PRINT_ANY, "prefixlen", "/%d ", ifa->ifa_prefixlen);
}
if (brief)
--
2.14.1
^ permalink raw reply related
* Re: [RFC PATCH] can: m_can: Support higher speed CAN-FD bitrates
From: Franklin S Cooper Jr @ 2017-09-20 20:19 UTC (permalink / raw)
To: Yang, Wenyou, Sekhar Nori, wg, mkl, mario.huettel, socketcan,
quentin.schulz, edumazet, linux-can, netdev, linux-kernel
Cc: Wenyou Yang, Dong Aisheng
In-Reply-To: <532e09ae-775d-e8aa-e468-78e148c650da@Microchip.com>
Hi Wenyou,
On 09/17/2017 10:47 PM, Yang, Wenyou wrote:
>
>
> On 2017/9/14 13:06, Sekhar Nori wrote:
>> On Thursday 14 September 2017 03:28 AM, Franklin S Cooper Jr wrote:
>>>
>>> On 08/18/2017 02:39 PM, Franklin S Cooper Jr wrote:
>>>> During test transmitting using CAN-FD at high bitrates (4 Mbps) only
>>>> resulted in errors. Scoping the signals I noticed that only a single
>>>> bit
>>>> was being transmitted and with a bit more investigation realized the
>>>> actual
>>>> MCAN IP would go back to initialization mode automatically.
>>>>
>>>> It appears this issue is due to the MCAN needing to use the Transmitter
>>>> Delay Compensation Mode as defined in the MCAN User's Guide. When this
>>>> mode is used the User's Guide indicates that the Transmitter Delay
>>>> Compensation Offset register should be set. The document mentions
>>>> that this
>>>> register should be set to (1/dbitrate)/2*(Func Clk Freq).
>>>>
>>>> Additional CAN-CIA's "Bit Time Requirements for CAN FD" document
>>>> indicates
>>>> that this TDC mode is only needed for data bit rates above 2.5 Mbps.
>>>> Therefore, only enable this mode and only set TDCO when the data bit
>>>> rate
>>>> is above 2.5 Mbps.
>>>>
>>>> Signed-off-by: Franklin S Cooper Jr <fcooper@ti.com>
>>>> ---
>>>> I'm pretty surprised that this hasn't been implemented already since
>>>> the primary purpose of CAN-FD is to go beyond 1 Mbps and the MCAN IP
>>>> supports up to 10 Mbps.
>>>>
>>>> So it will be nice to get comments from users of this driver to
>>>> understand
>>>> if they have been able to use CAN-FD beyond 2.5 Mbps without this
>>>> patch.
>>>> If they haven't what did they do to get around it if they needed higher
>>>> speeds.
>>>>
>>>> Meanwhile I plan on testing this using a more "realistic" CAN bus to
>>>> insure
>>>> everything still works at 5 Mbps which is the max speed of my CAN
>>>> transceiver.
>>> ping. Anyone has any thoughts on this?
>> I added Dong who authored the m_can driver and Wenyou who added the only
>> in-kernel user of the driver for any help.
> I tested it on SAMA5D2 Xplained board both with and without this patch,
> both work with the 4M bps data bit rate.
Thank you for testing this out. Its interesting that you have been able
to use higher speeds without this patch. What is the CAN transceiver
being used on the SAMA5D2 Xplained board? I tried looking at the
schematic but it seems the CAN signals are used on an extension board
which I can't find the schematic for. Also do you mind sharing your test
setup? Were you doing a short point to point test?
Thank You,
Franklin
>
>>
>> Thanks,
>> Sekhar
>>
>>>> drivers/net/can/m_can/m_can.c | 24 +++++++++++++++++++++++-
>>>> 1 file changed, 23 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/drivers/net/can/m_can/m_can.c
>>>> b/drivers/net/can/m_can/m_can.c
>>>> index f4947a7..720e073 100644
>>>> --- a/drivers/net/can/m_can/m_can.c
>>>> +++ b/drivers/net/can/m_can/m_can.c
>>>> @@ -126,6 +126,12 @@ enum m_can_mram_cfg {
>>>> #define DBTP_DSJW_SHIFT 0
>>>> #define DBTP_DSJW_MASK (0xf << DBTP_DSJW_SHIFT)
>>>> +/* Transmitter Delay Compensation Register (TDCR) */
>>>> +#define TDCR_TDCO_SHIFT 8
>>>> +#define TDCR_TDCO_MASK (0x7F << TDCR_TDCO_SHIFT)
>>>> +#define TDCR_TDCF_SHIFT 0
>>>> +#define TDCR_TDCF_MASK (0x7F << TDCR_TDCO_SHIFT)
>>>> +
>>>> /* Test Register (TEST) */
>>>> #define TEST_LBCK BIT(4)
>>>> @@ -977,6 +983,8 @@ static int m_can_set_bittiming(struct
>>>> net_device *dev)
>>>> const struct can_bittiming *dbt = &priv->can.data_bittiming;
>>>> u16 brp, sjw, tseg1, tseg2;
>>>> u32 reg_btp;
>>>> + u32 enable_tdc = 0;
>>>> + u32 tdco;
>>>> brp = bt->brp - 1;
>>>> sjw = bt->sjw - 1;
>>>> @@ -991,9 +999,23 @@ static int m_can_set_bittiming(struct
>>>> net_device *dev)
>>>> sjw = dbt->sjw - 1;
>>>> tseg1 = dbt->prop_seg + dbt->phase_seg1 - 1;
>>>> tseg2 = dbt->phase_seg2 - 1;
>>>> +
>>>> + /* TDC is only needed for bitrates beyond 2.5 MBit/s
>>>> + * Specified in the "Bit Time Requirements for CAN FD"
>>>> document
>>>> + */
>>>> + if (dbt->bitrate > 2500000) {
>>>> + enable_tdc = DBTP_TDC;
>>>> + /* Equation based on Bosch's M_CAN User Manual's
>>>> + * Transmitter Delay Compensation Section
>>>> + */
>>>> + tdco = priv->can.clock.freq / (dbt->bitrate * 2);
>>>> + m_can_write(priv, M_CAN_TDCR, tdco << TDCR_TDCO_SHIFT);
>>>> + }
>>>> +
>>>> reg_btp = (brp << DBTP_DBRP_SHIFT) | (sjw <<
>>>> DBTP_DSJW_SHIFT) |
>>>> (tseg1 << DBTP_DTSEG1_SHIFT) |
>>>> - (tseg2 << DBTP_DTSEG2_SHIFT);
>>>> + (tseg2 << DBTP_DTSEG2_SHIFT) | enable_tdc;
>>>> +
>>>> m_can_write(priv, M_CAN_DBTP, reg_btp);
>>>> }
>>>>
>
> Regards,
> Wenyou Yang
>
^ permalink raw reply
* Re: [iproute2 json PATCH] ip: ipaddress: fix missing space after prefixlen
From: Julien Fortin @ 2017-09-20 20:13 UTC (permalink / raw)
To: netdev; +Cc: Roopa Prabhu, Nikolay Aleksandrov, David Ahern, sd, Julien Fortin
In-Reply-To: <20170920200421.94514-1-julien@cumulusnetworks.com>
Sorry Sabrina, looks like i forgot the reported-by tag, i'll send a v2...
On Wed, Sep 20, 2017 at 1:04 PM, Julien Fortin
<julien@cumulusnetworks.com> wrote:
> From: Julien Fortin <julien@cumulusnetworks.com>
>
> Fixes: d0e720111aad2 ("ip: ipaddress.c: add support for json output")
>
> Signed-off-by: Julien Fortin <julien@cumulusnetworks.com>
> ---
> ip/ipaddress.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/ip/ipaddress.c b/ip/ipaddress.c
> index 97971450..fb496bbb 100644
> --- a/ip/ipaddress.c
> +++ b/ip/ipaddress.c
> @@ -1604,7 +1604,7 @@ int print_addrinfo(const struct sockaddr_nl *who, struct nlmsghdr *n,
> format_host_rta(ifa->ifa_family,
> rta_tb[IFA_ADDRESS]));
> }
> - print_int(PRINT_ANY, "prefixlen", "/%d", ifa->ifa_prefixlen);
> + print_int(PRINT_ANY, "prefixlen", "/%d ", ifa->ifa_prefixlen);
> }
>
> if (brief)
> --
> 2.14.1
>
^ permalink raw reply
* Re: cross namespace interface notification for tun devices
From: Jason A. Donenfeld @ 2017-09-20 20:13 UTC (permalink / raw)
To: Cong Wang; +Cc: Netdev, Mathias
In-Reply-To: <CAM_iQpXhjuh5NJJiAwBJG4EpkWXWz74WS7RFjt5L1mDV9nKTVw@mail.gmail.com>
On Wed, Sep 20, 2017 at 8:29 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> Sounds like we should set NETIF_F_NETNS_LOCAL for tun
> device.
Absolutely do not do this under any circumstances. This would be a
regression and would break API compatibility. As I wrote in my first
email, it's already possible to sleep-loop for that information using
the tun device's fd; I'm just looking for a better event-based
approach.
> What is your legitimate use case of send/receive packet to/from
> a tun device in a different netns?
Because sometimes it's very nice to be able to move network interfaces
that use tun devices into different namespaces, for some xnamespace
proxying.
What Dan described in the email he just sent is exactly this use case.
In WireGuard (a kernel thing), I have facilities for this --
https://www.wireguard.com/netns/ . Now I'm working on the userspace
version and would like to expose the same utility.
Anyway, the purpose of me sending this message to the list was not to
question the "legitimacy" of my application usage, but rather to
elicit feedback on two specific things:
1. to determine if there's already a mechanism in place for this that
I've overlooked; and
2. to determine particularities of me implementing a mechanism, if
it's not already there.
I'm slightly more convinced that there isn't currently a mechanism for
this. It seems like the easiest way, therefore, would be some kind of
control message that could be poll'd for, using the existing
per-process fd. That way there wouldn't be any violations of the
current namespace situation, yet processes could still get event
notifications as needed.
^ permalink raw reply
* Re: [PATCH RFC V1 net-next 0/6] Time based packet transmission
From: Richard Cochran @ 2017-09-20 20:11 UTC (permalink / raw)
To: levipearson
Cc: rcochran, netdev, linux-kernel, intel-wired-lan, vinicius.gomes,
andre.guedes, john.stultz, jesus.sanchez-palencia, henrik, tglx,
anna-maria, davem
In-Reply-To: <20170920173533.32537-1-levipearson@gmail.com>
On Wed, Sep 20, 2017 at 11:35:33AM -0600, levipearson@gmail.com wrote:
> Anyway, I am wholly in favor of this proposal--in fact, it is very similar to
> a patch set I shared with Eric Mann and others at Intel in early Dec 2016 with
> the intention to get some early feedback before submitting here. I never heard
> back and got busy with other things. I only mention this since you said
> elsewhere that you got this idea from Eric Mann yourself, and I am curious
> whether Eric and I came up with it independently (which I would not be
> surprised at).
Well, I actually thought of placing the Tx time in a CMSG all by
myself, but later I found Eric's talk from 2012,
https://linuxplumbers.ubicast.tv/videos/linux-network-enabling-requirements-for-audiovideo-bridging-avb/
and so I wanted to give him credit.
Thanks,
Richard
^ permalink raw reply
* [iproute2 json PATCH] ip: ipaddress: fix missing space after prefixlen
From: Julien Fortin @ 2017-09-20 20:04 UTC (permalink / raw)
To: netdev; +Cc: roopa, nikolay, dsa, sd, Julien Fortin
From: Julien Fortin <julien@cumulusnetworks.com>
Fixes: d0e720111aad2 ("ip: ipaddress.c: add support for json output")
Signed-off-by: Julien Fortin <julien@cumulusnetworks.com>
---
ip/ipaddress.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 97971450..fb496bbb 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -1604,7 +1604,7 @@ int print_addrinfo(const struct sockaddr_nl *who, struct nlmsghdr *n,
format_host_rta(ifa->ifa_family,
rta_tb[IFA_ADDRESS]));
}
- print_int(PRINT_ANY, "prefixlen", "/%d", ifa->ifa_prefixlen);
+ print_int(PRINT_ANY, "prefixlen", "/%d ", ifa->ifa_prefixlen);
}
if (brief)
--
2.14.1
^ permalink raw reply related
* Re: cross namespace interface notification for tun devices
From: Dan Williams @ 2017-09-20 19:57 UTC (permalink / raw)
To: Cong Wang, Jason A. Donenfeld; +Cc: Netdev, Mathias
In-Reply-To: <CAM_iQpXhjuh5NJJiAwBJG4EpkWXWz74WS7RFjt5L1mDV9nKTVw@mail.gmail.com>
On Wed, 2017-09-20 at 11:29 -0700, Cong Wang wrote:
> On Tue, Sep 19, 2017 at 2:02 PM, Jason A. Donenfeld <Jason@zx2c4.com>
> wrote:
> > On Tue, Sep 19, 2017 at 10:40 PM, Cong Wang <xiyou.wangcong@gmail.c
> > om> wrote:
> > > By "notification" I assume you mean netlink notification.
> >
> > Yes, netlink notification.
> >
> > > The question is why does the process in A still care about
> > > the device sitting in B?
> > >
> > > Also, the process should be able to receive a last notification
> > > on IFF_UP|IFF_RUNNING before device is finally moved to B.
> > > After this point, it should not have any relation to netns A
> > > any more, like the device were completely gone.
> >
> > That's very clearly not the case with a tun device. Tun devices
> > work
> > by letting a userspace process control the inputs (ndo_start_xmit)
> > and
> > outputs (netif_rx) of the actual network device. This controlling
> > userspace process needs to know when its own interface that it
> > controls goes up and down. In the kernel, we can do this by just
> > checking dev->flags&IFF_UP, and receive notifications on ndo_open
> > and
> > ndo_stop. In userspace, the controlling process looses the ability
> > to
> > receive notifications like ndo_open/ndo_stop when the interface is
> > moved to a new namespace. After the interface is moved to a
> > namespace,
> > the process will still control inputs and ouputs (ndo_start_xmit
> > and
> > netif_rx), but it will no longer receive netlink notifications for
> > the
> > equivalent of ndo_open and ndo_stop. This is problematic.
>
> Sounds like we should set NETIF_F_NETNS_LOCAL for tun
> device.
>
> What is your legitimate use case of send/receive packet to/from
> a tun device in a different netns?
One thought: run openvpn in the master netns, but put its tun0
interface into an application's netns. Per-application VPN,
essentially? Or maybe that's not how people do this kind of thing, but
it's a thought.
Dan
^ permalink raw reply
* Re: usb/net/p54: trying to register non-static key in p54_unregister_leds
From: Johannes Berg @ 2017-09-20 19:55 UTC (permalink / raw)
To: Christian Lamparter, Andrey Konovalov
Cc: Kalle Valo, linux-wireless, netdev, LKML, Dmitry Vyukov,
Kostya Serebryany, syzkaller, Stephen Boyd, Tejun Heo, Yong Zhang
In-Reply-To: <2277141.bYDD1vAb9W@debian64>
On Wed, 2017-09-20 at 21:27 +0200, Christian Lamparter wrote:
> It seems this is caused as a result of:
> -> lock_map_acquire(&work->lockdep_map);
> lock_map_release(&work->lockdep_map);
>
> in flush_work() [0]
Agree.
> This was added by:
>
> commit 0976dfc1d0cd80a4e9dfaf87bd8744612bde475a
> Author: Stephen Boyd <sboyd@codeaurora.org>
> Date: Fri Apr 20 17:28:50 2012 -0700
>
> workqueue: Catch more locking problems with flush_work()
Yes, but that doesn't matter.
> Looking at the Stephen's patch, it's clear that it was made
> with "static DECLARE_WORK(work, my_work)" in mind. However
> p54's led_work is "per-device", hence it is stored in the
> devices context p54_common, which is dynamically allocated.
> So, maybe revert Stephen's patch?
I disagree - as the lockdep warning says:
> > INFO: trying to register non-static key.
> > the code is fine but needs lockdep annotation.
> > turning off the locking correctness validator.
What it needs is to actually correctly go through initializing the work
at least once.
Without more information, I can't really say what's going on, but I
assume that something is failing and p54_unregister_leds() is getting
invoked without p54_init_leds() having been invoked, so essentially
it's trying to flush a work that was never initialized?
INIT_DELAYED_WORK() does, after all, initialize the lockdep map
properly via __INIT_WORK().
johannes
^ permalink raw reply
* Re: [PATCH net-next 08/14] gtp: Support encpasulating over IPv6
From: David Miller @ 2017-09-20 19:45 UTC (permalink / raw)
To: tom; +Cc: tom, netdev, pablo, laforge, rohit
In-Reply-To: <CALx6S36cJEU2LkbNf-d8kUHCPj78bt5Y2Bd_rdvjxJ9epOEqAQ@mail.gmail.com>
From: Tom Herbert <tom@herbertland.com>
Date: Wed, 20 Sep 2017 11:03:52 -0700
> On Mon, Sep 18, 2017 at 9:19 PM, David Miller <davem@davemloft.net> wrote:
>> From: Tom Herbert <tom@quantonium.net>
>> Date: Mon, 18 Sep 2017 17:38:58 -0700
>>
>>> Allow peers to be specified by IPv6 addresses.
>>>
>>> Signed-off-by: Tom Herbert <tom@quantonium.net>
>>
>> Hmmm, can you just check the socket family or something like that?
>
> I'm not sure what code you're referring to.
There is a socket associated with the tunnel to do the encapsulation
and it has an address family, right?
^ permalink raw reply
* Re: Regression in throughput between kvm guests over virtual bridge
From: Matthew Rosato @ 2017-09-20 19:38 UTC (permalink / raw)
To: Jason Wang, netdev; +Cc: davem, mst
In-Reply-To: <1173ab1f-e2b6-26b3-8c3c-bd5ceaa1bd8e@redhat.com>
> Seems to make some progress on wakeup mitigation. Previous patch tries
> to reduce the unnecessary traversal of waitqueue during rx. Attached
> patch goes even further which disables rx polling during processing tx.
> Please try it to see if it has any difference.
Unfortunately, this patch doesn't seem to have made a difference. I
tried runs with both this patch and the previous patch applied, as well
as only this patch applied for comparison (numbers from vhost thread of
sending VM):
4.12 4.13 patch1 patch2 patch1+2
2.00% +3.69% +2.55% +2.81% +2.69% [...] __wake_up_sync_key
In each case, the regression in throughput was still present.
> And two questions:
> - Is the issue existed if you do uperf between 2VMs (instead of 4VMs)
Verified that the second set of guests are not actually required, I can
see the regression with only 2 VMs.
> - Can enable batching in the tap of sending VM improve the performance
> (ethtool -C $tap rx-frames 64)
I tried this, but it did not help (actually seemed to make things a
little worse)
^ permalink raw reply
* Re: usb/net/p54: trying to register non-static key in p54_unregister_leds
From: Christian Lamparter @ 2017-09-20 19:27 UTC (permalink / raw)
To: Andrey Konovalov
Cc: Kalle Valo, linux-wireless, netdev, LKML, Dmitry Vyukov,
Kostya Serebryany, syzkaller, Stephen Boyd, Tejun Heo, Yong Zhang
In-Reply-To: <CAAeHK+xPoNn7i5xOzEqHX6tfDpmpb1rd0hcAxZpTHKi6B8xzsw@mail.gmail.com>
On Wednesday, September 20, 2017 8:37:08 PM CEST Andrey Konovalov wrote:
> Hi!
>
> I've got the following report while fuzzing the kernel with syzkaller.
>
> On commit ebb2c2437d8008d46796902ff390653822af6cc4 (Sep 18).
>
> INFO: trying to register non-static key.
> the code is fine but needs lockdep annotation.
> turning off the locking correctness validator.
> CPU: 1 PID: 1404 Comm: kworker/1:1 Not tainted
> 4.14.0-rc1-42251-gebb2c2437d80-dirty #205
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> Workqueue: usb_hub_wq hub_event
> Call Trace:
> __dump_stack lib/dump_stack.c:16
> dump_stack+0x292/0x395 lib/dump_stack.c:52
> register_lock_class+0x6c4/0x1a00 kernel/locking/lockdep.c:769
> __lock_acquire+0x27e/0x4550 kernel/locking/lockdep.c:3385
> lock_acquire+0x259/0x620 kernel/locking/lockdep.c:4002
> flush_work+0xf0/0x8c0 kernel/workqueue.c:2886
> __cancel_work_timer+0x51d/0x870 kernel/workqueue.c:2961
> cancel_delayed_work_sync+0x1f/0x30 kernel/workqueue.c:3081
> p54_unregister_leds+0x6c/0xc0 drivers/net/wireless/intersil/p54/led.c:160
> p54_unregister_common+0x3d/0xb0 drivers/net/wireless/intersil/p54/main.c:856
> p54u_disconnect+0x86/0x120 drivers/net/wireless/intersil/p54/p54usb.c:1073
> usb_unbind_interface+0x21c/0xa90 drivers/usb/core/driver.c:423
> __device_release_driver drivers/base/dd.c:861
> device_release_driver_internal+0x4f4/0x5c0 drivers/base/dd.c:893
> device_release_driver+0x1e/0x30 drivers/base/dd.c:918
> bus_remove_device+0x2f4/0x4b0 drivers/base/bus.c:565
> device_del+0x5c4/0xab0 drivers/base/core.c:1985
> usb_disable_device+0x1e9/0x680 drivers/usb/core/message.c:1170
> usb_disconnect+0x260/0x7a0 drivers/usb/core/hub.c:2124
> hub_port_connect drivers/usb/core/hub.c:4754
> hub_port_connect_change drivers/usb/core/hub.c:5009
> port_event drivers/usb/core/hub.c:5115
> hub_event+0x1318/0x3740 drivers/usb/core/hub.c:5195
> process_one_work+0xc7f/0x1db0 kernel/workqueue.c:2119
> process_scheduled_works kernel/workqueue.c:2179
> worker_thread+0xb2b/0x1850 kernel/workqueue.c:2255
> kthread+0x3a1/0x470 kernel/kthread.c:231
> ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:431
>
It seems this is caused as a result of:
-> lock_map_acquire(&work->lockdep_map);
lock_map_release(&work->lockdep_map);
in flush_work() [0]
This was added by:
commit 0976dfc1d0cd80a4e9dfaf87bd8744612bde475a
Author: Stephen Boyd <sboyd@codeaurora.org>
Date: Fri Apr 20 17:28:50 2012 -0700
workqueue: Catch more locking problems with flush_work()
Looking at the Stephen's patch, it's clear that it was made
with "static DECLARE_WORK(work, my_work)" in mind. However
p54's led_work is "per-device", hence it is stored in the
devices context p54_common, which is dynamically allocated.
So, maybe revert Stephen's patch?
[0] <http://elixir.free-electrons.com/linux/latest/source/kernel/workqueue.c#L2853>
^ permalink raw reply
* Re: Latest net-next from GIT panic
From: Paweł Staszewski @ 2017-09-20 19:23 UTC (permalink / raw)
To: Cong Wang, Eric Dumazet
Cc: Wei Wang, Linux Kernel Network Developers, Eric Dumazet
In-Reply-To: <07f0c800-7ee7-1ebe-9f5f-eaad52a7b42d@itcare.pl>
W dniu 2017-09-20 o 21:13, Paweł Staszewski pisze:
>
>
> W dniu 2017-09-20 o 20:36, Cong Wang pisze:
>> On Wed, Sep 20, 2017 at 11:30 AM, Eric Dumazet
>> <eric.dumazet@gmail.com> wrote:
>>> On Wed, 2017-09-20 at 11:22 -0700, Cong Wang wrote:
>>>> but dmesg at this time shows nothing about interfaces or flaps.
>>>>
>>>> This is very odd.
>>>>
>>>> We only free netdevice in free_netdev() and it is only called when
>>>> we unregister a netdevice. Otherwise pcpu_refcnt is impossible
>>>> to be NULL.
>>> If there is a missing dev_hold() or one dev_put() in excess,
>>> this would allow the netdev to be freed too soon.
>>>
>>> -> Use after free.
>>> memory holding netdev could be reallocated-cleared by some other kernel
>>> user.
>>>
>> Sure, but only unregister could trigger a free. If there is no
>> unregister,
>> like what Pawel claims, then there is no free, the refcnt just goes to
>> 0 but the memory is still there.
>>
> About possible mistake from my side with bisect - i can judge too
> early that some bisect was good
> the road was:
> git bisect start
> # bad: [ac7b75966c9c86426b55fe1c50ae148aa4571075] Merge tag
> 'pinctrl-v4.13-1' of
> git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-pinctrl
> git bisect bad ac7b75966c9c86426b55fe1c50ae148aa4571075
> # good: [e24dd9ee5399747b71c1d982a484fc7601795f31] Merge branch 'next'
> of git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security
> git bisect good e24dd9ee5399747b71c1d982a484fc7601795f31
> # bad: [9cc9a5cb176ccb4f2cda5ac34da5a659926f125f] datapath: Avoid
> using stack larger than 1024.
> git bisect bad 9cc9a5cb176ccb4f2cda5ac34da5a659926f125f
> # good: [073cf9e20c333ab29744717a23f9e43ec7512a20] Merge branch
> 'udp-reduce-cache-pressure'
> git bisect good 073cf9e20c333ab29744717a23f9e43ec7512a20
> # bad: [8abd5599a520e9f188a750f1bde9dde5fb856230] Merge branch
> 's390-net-updates-part-2'
> git bisect bad 8abd5599a520e9f188a750f1bde9dde5fb856230
> # good: [2fae5d0e647c6470d206e72b5fc24972bb900f70] Merge branch
> 'bpf-ctx-narrow'
> git bisect good 2fae5d0e647c6470d206e72b5fc24972bb900f70
> # good: [41500c3e2a19ffcf40a7158fce1774de08e26ba2] rds: tcp: remove
> cp_outgoing
> git bisect good 41500c3e2a19ffcf40a7158fce1774de08e26ba2
> # bad: [8917a777be3ba566377be05117f71b93a5fd909d] tcp: md5: add
> TCP_MD5SIG_EXT socket option to set a key address prefix
> git bisect bad 8917a777be3ba566377be05117f71b93a5fd909d
> # good: [4a6ce2b6f2ecabbddcfe47e7cf61dd0f00b10e36] net: introduce a
> new function dst_dev_put()
>
> And currently have this running for about 4 hours without problems.
>
>
>
> git bisect good 4a6ce2b6f2ecabbddcfe47e7cf61dd0f00b10e36
> # bad: [a4c2fd7f78915a0d7c5275e7612e7793157a01f2] net: remove
> DST_NOCACHE flag
>
> Here for sure - panic
>
> git bisect bad a4c2fd7f78915a0d7c5275e7612e7793157a01f2
> # bad: [ad65a2f05695aced349e308193c6e2a6b1d87112] ipv6: call
> dst_hold_safe() properly
> git bisect bad ad65a2f05695aced349e308193c6e2a6b1d87112
> # good: [9df16efadd2a8a82731dc76ff656c771e261827f] ipv4: call
> dst_hold_safe() properly
> git bisect good 9df16efadd2a8a82731dc76ff656c771e261827f
> # bad: [1cfb71eeb12047bcdbd3e6730ffed66e810a0855] ipv6: take
> dst->__refcnt for insertion into fib6 tree
>
> im not 100% sure tor last two
> Will test them again starting from
> [95c47f9cf5e028d1ae77dc6c767c1edc8a18025b] ipv4: call dst_dev_put()
> properly
>
>
> git bisect bad 1cfb71eeb12047bcdbd3e6730ffed66e810a0855
> # bad: [b838d5e1c5b6e57b10ec8af2268824041e3ea911] ipv4: mark DST_NOGC
> and remove the operation of dst_free()
>
>
>
> git bisect bad b838d5e1c5b6e57b10ec8af2268824041e3ea911
> # first bad commit: [b838d5e1c5b6e57b10ec8af2268824041e3ea911] ipv4:
> mark DST_NOGC and remove the operation of dst_free()
>
>
>
What i can say more
I can reproduce this on any server with similar configuration
the difference can be teamd instead of bonding
ixgbe or i40e and mlx5
Same problems
vlans - more or less prefixes learned from bgp -> zebra -> netlink -> kernel
But normally in lab when using only plain routing no bgpd and about 128
vlans - with 128 routes - cant reproduce this - this apperas only with
bgp - minimum where i can reproduce this was about 130k prefixes with
about 286 nexthops
^ permalink raw reply
* Re: Latest net-next from GIT panic
From: Paweł Staszewski @ 2017-09-20 19:13 UTC (permalink / raw)
To: Cong Wang, Eric Dumazet
Cc: Wei Wang, Linux Kernel Network Developers, Eric Dumazet
In-Reply-To: <CAM_iQpVF3Gf3G=543o8g+5N2dtEL4DcUPN3a6=L8V_G0CXZ65w@mail.gmail.com>
W dniu 2017-09-20 o 20:36, Cong Wang pisze:
> On Wed, Sep 20, 2017 at 11:30 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>> On Wed, 2017-09-20 at 11:22 -0700, Cong Wang wrote:
>>> but dmesg at this time shows nothing about interfaces or flaps.
>>>
>>> This is very odd.
>>>
>>> We only free netdevice in free_netdev() and it is only called when
>>> we unregister a netdevice. Otherwise pcpu_refcnt is impossible
>>> to be NULL.
>> If there is a missing dev_hold() or one dev_put() in excess,
>> this would allow the netdev to be freed too soon.
>>
>> -> Use after free.
>> memory holding netdev could be reallocated-cleared by some other kernel
>> user.
>>
> Sure, but only unregister could trigger a free. If there is no unregister,
> like what Pawel claims, then there is no free, the refcnt just goes to
> 0 but the memory is still there.
>
About possible mistake from my side with bisect - i can judge too early
that some bisect was good
the road was:
git bisect start
# bad: [ac7b75966c9c86426b55fe1c50ae148aa4571075] Merge tag
'pinctrl-v4.13-1' of
git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-pinctrl
git bisect bad ac7b75966c9c86426b55fe1c50ae148aa4571075
# good: [e24dd9ee5399747b71c1d982a484fc7601795f31] Merge branch 'next'
of git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security
git bisect good e24dd9ee5399747b71c1d982a484fc7601795f31
# bad: [9cc9a5cb176ccb4f2cda5ac34da5a659926f125f] datapath: Avoid using
stack larger than 1024.
git bisect bad 9cc9a5cb176ccb4f2cda5ac34da5a659926f125f
# good: [073cf9e20c333ab29744717a23f9e43ec7512a20] Merge branch
'udp-reduce-cache-pressure'
git bisect good 073cf9e20c333ab29744717a23f9e43ec7512a20
# bad: [8abd5599a520e9f188a750f1bde9dde5fb856230] Merge branch
's390-net-updates-part-2'
git bisect bad 8abd5599a520e9f188a750f1bde9dde5fb856230
# good: [2fae5d0e647c6470d206e72b5fc24972bb900f70] Merge branch
'bpf-ctx-narrow'
git bisect good 2fae5d0e647c6470d206e72b5fc24972bb900f70
# good: [41500c3e2a19ffcf40a7158fce1774de08e26ba2] rds: tcp: remove
cp_outgoing
git bisect good 41500c3e2a19ffcf40a7158fce1774de08e26ba2
# bad: [8917a777be3ba566377be05117f71b93a5fd909d] tcp: md5: add
TCP_MD5SIG_EXT socket option to set a key address prefix
git bisect bad 8917a777be3ba566377be05117f71b93a5fd909d
# good: [4a6ce2b6f2ecabbddcfe47e7cf61dd0f00b10e36] net: introduce a new
function dst_dev_put()
And currently have this running for about 4 hours without problems.
git bisect good 4a6ce2b6f2ecabbddcfe47e7cf61dd0f00b10e36
# bad: [a4c2fd7f78915a0d7c5275e7612e7793157a01f2] net: remove
DST_NOCACHE flag
Here for sure - panic
git bisect bad a4c2fd7f78915a0d7c5275e7612e7793157a01f2
# bad: [ad65a2f05695aced349e308193c6e2a6b1d87112] ipv6: call
dst_hold_safe() properly
git bisect bad ad65a2f05695aced349e308193c6e2a6b1d87112
# good: [9df16efadd2a8a82731dc76ff656c771e261827f] ipv4: call
dst_hold_safe() properly
git bisect good 9df16efadd2a8a82731dc76ff656c771e261827f
# bad: [1cfb71eeb12047bcdbd3e6730ffed66e810a0855] ipv6: take
dst->__refcnt for insertion into fib6 tree
im not 100% sure tor last two
Will test them again starting from
[95c47f9cf5e028d1ae77dc6c767c1edc8a18025b] ipv4: call dst_dev_put() properly
git bisect bad 1cfb71eeb12047bcdbd3e6730ffed66e810a0855
# bad: [b838d5e1c5b6e57b10ec8af2268824041e3ea911] ipv4: mark DST_NOGC
and remove the operation of dst_free()
git bisect bad b838d5e1c5b6e57b10ec8af2268824041e3ea911
# first bad commit: [b838d5e1c5b6e57b10ec8af2268824041e3ea911] ipv4:
mark DST_NOGC and remove the operation of dst_free()
^ permalink raw reply
* [PATCH] hv_netvsc: fix send buffer failure on MTU change
From: Stephen Hemminger @ 2017-09-20 18:17 UTC (permalink / raw)
To: davem, kys; +Cc: netdev, Alex Ng, Stephen Hemminger
From: Alex Ng <alexng@microsoft.com>
If MTU is changed the host would reject the send buffer change.
This problem is result of recent change to allow changing send
buffer size.
Every time we change the MTU, we store the previous net_device section
count before destroying the buffer, but we don’t store the previous
section size. When we reinitialize the buffer, its size is calculated
by multiplying the previous count and previous size. Since we
continuously increase the MTU, the host returns us a decreasing count
value while the section size is reinitialized to 1728 bytes every
time.
This eventually leads to a condition where the calculated buf_size is
so small that the host rejects it.
Fixes: 8b5327975ae1 ("netvsc: allow controlling send/recv buffer size")
Signed-off-by: Alex Ng <alexng@microsoft.com>
Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
drivers/net/hyperv/hyperv_net.h | 2 ++
drivers/net/hyperv/netvsc.c | 7 ++-----
drivers/net/hyperv/netvsc_drv.c | 8 ++++++++
3 files changed, 12 insertions(+), 5 deletions(-)
diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index d98cdfb1536b..5176be76ca7d 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -150,6 +150,8 @@ struct netvsc_device_info {
u32 num_chn;
u32 send_sections;
u32 recv_sections;
+ u32 send_section_size;
+ u32 recv_section_size;
};
enum rndis_device_state {
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index a5511b7326af..8d5077fb0492 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -76,9 +76,6 @@ static struct netvsc_device *alloc_net_device(void)
net_device->max_pkt = RNDIS_MAX_PKT_DEFAULT;
net_device->pkt_align = RNDIS_PKT_ALIGN_DEFAULT;
- net_device->recv_section_size = NETVSC_RECV_SECTION_SIZE;
- net_device->send_section_size = NETVSC_SEND_SECTION_SIZE;
-
init_completion(&net_device->channel_init_wait);
init_waitqueue_head(&net_device->subchan_open);
INIT_WORK(&net_device->subchan_work, rndis_set_subchannel);
@@ -262,7 +259,7 @@ static int netvsc_init_buf(struct hv_device *device,
int ret = 0;
/* Get receive buffer area. */
- buf_size = device_info->recv_sections * net_device->recv_section_size;
+ buf_size = device_info->recv_sections * device_info->recv_section_size;
buf_size = roundup(buf_size, PAGE_SIZE);
net_device->recv_buf = vzalloc(buf_size);
@@ -344,7 +341,7 @@ static int netvsc_init_buf(struct hv_device *device,
goto cleanup;
/* Now setup the send buffer. */
- buf_size = device_info->send_sections * net_device->send_section_size;
+ buf_size = device_info->send_sections * device_info->send_section_size;
buf_size = round_up(buf_size, PAGE_SIZE);
net_device->send_buf = vzalloc(buf_size);
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index d4902ee5f260..a32ae02e1b6c 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -848,7 +848,9 @@ static int netvsc_set_channels(struct net_device *net,
device_info.num_chn = count;
device_info.ring_size = ring_size;
device_info.send_sections = nvdev->send_section_cnt;
+ device_info.send_section_size = nvdev->send_section_size;
device_info.recv_sections = nvdev->recv_section_cnt;
+ device_info.recv_section_size = nvdev->recv_section_size;
rndis_filter_device_remove(dev, nvdev);
@@ -963,7 +965,9 @@ static int netvsc_change_mtu(struct net_device *ndev, int mtu)
device_info.ring_size = ring_size;
device_info.num_chn = nvdev->num_chn;
device_info.send_sections = nvdev->send_section_cnt;
+ device_info.send_section_size = nvdev->send_section_size;
device_info.recv_sections = nvdev->recv_section_cnt;
+ device_info.recv_section_size = nvdev->recv_section_size;
rndis_filter_device_remove(hdev, nvdev);
@@ -1485,7 +1489,9 @@ static int netvsc_set_ringparam(struct net_device *ndev,
device_info.num_chn = nvdev->num_chn;
device_info.ring_size = ring_size;
device_info.send_sections = new_tx;
+ device_info.send_section_size = nvdev->send_section_size;
device_info.recv_sections = new_rx;
+ device_info.recv_section_size = nvdev->recv_section_size;
netif_device_detach(ndev);
was_opened = rndis_filter_opened(nvdev);
@@ -1934,7 +1940,9 @@ static int netvsc_probe(struct hv_device *dev,
device_info.ring_size = ring_size;
device_info.num_chn = VRSS_CHANNEL_DEFAULT;
device_info.send_sections = NETVSC_DEFAULT_TX;
+ device_info.send_section_size = NETVSC_SEND_SECTION_SIZE;
device_info.recv_sections = NETVSC_DEFAULT_RX;
+ device_info.recv_section_size = NETVSC_RECV_SECTION_SIZE;
nvdev = rndis_filter_device_add(dev, &device_info);
if (IS_ERR(nvdev)) {
--
2.11.0
^ permalink raw reply related
* Re: [PATCH net-next] bpf: Optimize lpm trie delete
From: Craig Gallek @ 2017-09-20 18:51 UTC (permalink / raw)
To: Daniel Mack; +Cc: Alexei Starovoitov, Daniel Borkmann, David S . Miller, netdev
In-Reply-To: <47e47f0d-d04d-b52b-647a-7883aa5f3268@zonque.org>
On Wed, Sep 20, 2017 at 12:51 PM, Daniel Mack <daniel@zonque.org> wrote:
> Hi Craig,
>
> Thanks, this looks much cleaner already :)
>
> On 09/20/2017 06:22 PM, Craig Gallek wrote:
>> diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
>> index 9d58a576b2ae..b5a7d70ec8b5 100644
>> --- a/kernel/bpf/lpm_trie.c
>> +++ b/kernel/bpf/lpm_trie.c
>> @@ -397,7 +397,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
>> struct lpm_trie_node __rcu **trim;
>> struct lpm_trie_node *node;
>> unsigned long irq_flags;
>> - unsigned int next_bit;
>> + unsigned int next_bit = 0;
>
> This default assignment seems wrong, and I guess you only added it to
> squelch a compiler warning?
Yes, this variable is only initialized after the lookup iterations
below (meaning it will never be initialized the the root-removal
case).
> [...]
>
>> + /* If the node has one child, we may be able to collapse the tree
>> + * while removing this node if the node's child is in the same
>> + * 'next bit' slot as this node was in its parent or if the node
>> + * itself is the root.
>> + */
>> + if (trim == &trie->root) {
>> + next_bit = node->child[0] ? 0 : 1;
>> + rcu_assign_pointer(trie->root, node->child[next_bit]);
>> + kfree_rcu(node, rcu);
>
> I don't think you should treat this 'root' case special.
>
> Instead, move the 'next_bit' assignment outside of the condition ...
I'm not quite sure I follow. Are you saying do something like this:
if (trim == &trie->root) {
next_bit = node->child[0] ? 0 : 1;
}
if (rcu_access_pointer(node->child[next_bit])) {
...
This would save a couple lines of code, but I think the as-is
implementation is slightly easier to understand. I don't have a
strong opinion either way, though.
Thanks for the pointers,
Craig
^ permalink raw reply
* usb/net/p54: trying to register non-static key in p54_unregister_leds
From: Andrey Konovalov @ 2017-09-20 18:37 UTC (permalink / raw)
To: Christian Lamparter, Kalle Valo, linux-wireless, netdev, LKML
Cc: Dmitry Vyukov, Kostya Serebryany, syzkaller
Hi!
I've got the following report while fuzzing the kernel with syzkaller.
On commit ebb2c2437d8008d46796902ff390653822af6cc4 (Sep 18).
INFO: trying to register non-static key.
the code is fine but needs lockdep annotation.
turning off the locking correctness validator.
CPU: 1 PID: 1404 Comm: kworker/1:1 Not tainted
4.14.0-rc1-42251-gebb2c2437d80-dirty #205
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
Workqueue: usb_hub_wq hub_event
Call Trace:
__dump_stack lib/dump_stack.c:16
dump_stack+0x292/0x395 lib/dump_stack.c:52
register_lock_class+0x6c4/0x1a00 kernel/locking/lockdep.c:769
__lock_acquire+0x27e/0x4550 kernel/locking/lockdep.c:3385
lock_acquire+0x259/0x620 kernel/locking/lockdep.c:4002
flush_work+0xf0/0x8c0 kernel/workqueue.c:2886
__cancel_work_timer+0x51d/0x870 kernel/workqueue.c:2961
cancel_delayed_work_sync+0x1f/0x30 kernel/workqueue.c:3081
p54_unregister_leds+0x6c/0xc0 drivers/net/wireless/intersil/p54/led.c:160
p54_unregister_common+0x3d/0xb0 drivers/net/wireless/intersil/p54/main.c:856
p54u_disconnect+0x86/0x120 drivers/net/wireless/intersil/p54/p54usb.c:1073
usb_unbind_interface+0x21c/0xa90 drivers/usb/core/driver.c:423
__device_release_driver drivers/base/dd.c:861
device_release_driver_internal+0x4f4/0x5c0 drivers/base/dd.c:893
device_release_driver+0x1e/0x30 drivers/base/dd.c:918
bus_remove_device+0x2f4/0x4b0 drivers/base/bus.c:565
device_del+0x5c4/0xab0 drivers/base/core.c:1985
usb_disable_device+0x1e9/0x680 drivers/usb/core/message.c:1170
usb_disconnect+0x260/0x7a0 drivers/usb/core/hub.c:2124
hub_port_connect drivers/usb/core/hub.c:4754
hub_port_connect_change drivers/usb/core/hub.c:5009
port_event drivers/usb/core/hub.c:5115
hub_event+0x1318/0x3740 drivers/usb/core/hub.c:5195
process_one_work+0xc7f/0x1db0 kernel/workqueue.c:2119
process_scheduled_works kernel/workqueue.c:2179
worker_thread+0xb2b/0x1850 kernel/workqueue.c:2255
kthread+0x3a1/0x470 kernel/kthread.c:231
ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:431
^ permalink raw reply
* Re: Latest net-next from GIT panic
From: Cong Wang @ 2017-09-20 18:36 UTC (permalink / raw)
To: Eric Dumazet
Cc: Paweł Staszewski, Wei Wang, Linux Kernel Network Developers,
Eric Dumazet
In-Reply-To: <1505932231.29839.106.camel@edumazet-glaptop3.roam.corp.google.com>
On Wed, Sep 20, 2017 at 11:30 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Wed, 2017-09-20 at 11:22 -0700, Cong Wang wrote:
>> but dmesg at this time shows nothing about interfaces or flaps.
>>
>> This is very odd.
>>
>> We only free netdevice in free_netdev() and it is only called when
>> we unregister a netdevice. Otherwise pcpu_refcnt is impossible
>> to be NULL.
>
> If there is a missing dev_hold() or one dev_put() in excess,
> this would allow the netdev to be freed too soon.
>
> -> Use after free.
> memory holding netdev could be reallocated-cleared by some other kernel
> user.
>
Sure, but only unregister could trigger a free. If there is no unregister,
like what Pawel claims, then there is no free, the refcnt just goes to
0 but the memory is still there.
^ permalink raw reply
* Re: Latest net-next from GIT panic
From: Eric Dumazet @ 2017-09-20 18:30 UTC (permalink / raw)
To: Cong Wang
Cc: Paweł Staszewski, Wei Wang, Linux Kernel Network Developers,
Eric Dumazet
In-Reply-To: <CAM_iQpUOOCcES4d_ueK2wizn2oyDZPEzLfWQFu3oOq=BuRzdiw@mail.gmail.com>
On Wed, 2017-09-20 at 11:22 -0700, Cong Wang wrote:
> but dmesg at this time shows nothing about interfaces or flaps.
>
> This is very odd.
>
> We only free netdevice in free_netdev() and it is only called when
> we unregister a netdevice. Otherwise pcpu_refcnt is impossible
> to be NULL.
If there is a missing dev_hold() or one dev_put() in excess,
this would allow the netdev to be freed too soon.
-> Use after free.
memory holding netdev could be reallocated-cleared by some other kernel
user.
^ permalink raw reply
* Re: cross namespace interface notification for tun devices
From: Cong Wang @ 2017-09-20 18:29 UTC (permalink / raw)
To: Jason A. Donenfeld; +Cc: Netdev, Mathias
In-Reply-To: <CAHmME9oSZeAoDtSLxoGiEcTcCbmPLf-nj3OJcJ1ma8fs4BWQAw@mail.gmail.com>
On Tue, Sep 19, 2017 at 2:02 PM, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> On Tue, Sep 19, 2017 at 10:40 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>> By "notification" I assume you mean netlink notification.
>
> Yes, netlink notification.
>
>> The question is why does the process in A still care about
>> the device sitting in B?
>>
>> Also, the process should be able to receive a last notification
>> on IFF_UP|IFF_RUNNING before device is finally moved to B.
>> After this point, it should not have any relation to netns A
>> any more, like the device were completely gone.
>
> That's very clearly not the case with a tun device. Tun devices work
> by letting a userspace process control the inputs (ndo_start_xmit) and
> outputs (netif_rx) of the actual network device. This controlling
> userspace process needs to know when its own interface that it
> controls goes up and down. In the kernel, we can do this by just
> checking dev->flags&IFF_UP, and receive notifications on ndo_open and
> ndo_stop. In userspace, the controlling process looses the ability to
> receive notifications like ndo_open/ndo_stop when the interface is
> moved to a new namespace. After the interface is moved to a namespace,
> the process will still control inputs and ouputs (ndo_start_xmit and
> netif_rx), but it will no longer receive netlink notifications for the
> equivalent of ndo_open and ndo_stop. This is problematic.
Sounds like we should set NETIF_F_NETNS_LOCAL for tun
device.
What is your legitimate use case of send/receive packet to/from
a tun device in a different netns?
^ permalink raw reply
* Re: [PATCH v5 05/10] dt-bindings: net: dwmac-sun8i: update documentation about integrated PHY
From: Corentin Labbe @ 2017-09-20 18:23 UTC (permalink / raw)
To: Rob Herring
Cc: Andrew Lunn, Mark Rutland, Florian Fainelli, Alexandre Torgue,
devicetree@vger.kernel.org, Catalin Marinas, Will Deacon,
Russell King, linux-kernel@vger.kernel.org, Chen-Yu Tsai, netdev,
Giuseppe CAVALLARO, Maxime Ripard,
linux-arm-kernel@lists.infradead.org
In-Reply-To: <CAL_JsqJ2eVA9Zg60Hagqm732ZoOzHWTwVBqOabOhjE5aQ26L+g@mail.gmail.com>
On Tue, Sep 19, 2017 at 09:49:52PM -0500, Rob Herring wrote:
> On Thu, Sep 14, 2017 at 2:19 PM, Andrew Lunn <andrew@lunn.ch> wrote:
> >> > Is the MDIO controller "allwinner,sun8i-h3-emac" or "snps,dwmac-mdio"?
> >> > If the latter, then I think the node is fine, but then the mux should be
> >> > a child node of it. IOW, the child of an MDIO controller should either
> >> > be a mux node or slave devices.
> >
> > Hi Rob
> >
> > Up until now, children of an MDIO bus have been MDIO devices. Those
> > MDIO devices are either Ethernet PHYs, Ethernet Switches, or the
> > oddball devices that Broadcom iProc has, like generic PHYs.
> >
> > We have never had MDIO-muxes as MDIO children. A Mux is not an MDIO
> > device, and does not have the properties of an MDIO device. It is not
> > addressable on the MDIO bus. The current MUXes are addressed via GPIOs
> > or MMIO.
>
> The DT parent/child relationship defines the bus topology. We describe
> MDIO buses in that way and if a mux is sitting between the controller
> and the devices, then the DT hierarchy should reflect that. Now
> sometimes we have 2 options for what interface has the parent/child
> relationship (e.g. an I2C controlled USB hub chip), but in this case
> we don't.
>
Putting mdio-mux as a child of it (the mdio node) give me:
[ 18.175338] libphy: stmmac: probed
[ 18.175379] mdio_bus stmmac-0: /soc/ethernet@1c30000/mdio/mdio-mux has invalid PHY address
[ 18.175408] mdio_bus stmmac-0: scan phy mdio-mux at address 0
[ 18.175450] mdio_bus stmmac-0: scan phy mdio-mux at address 1
[ 18.175482] mdio_bus stmmac-0: scan phy mdio-mux at address 2
[ 18.175513] mdio_bus stmmac-0: scan phy mdio-mux at address 3
[ 18.175544] mdio_bus stmmac-0: scan phy mdio-mux at address 4
[ 18.175575] mdio_bus stmmac-0: scan phy mdio-mux at address 5
[ 18.175607] mdio_bus stmmac-0: scan phy mdio-mux at address 6
[ 18.175638] mdio_bus stmmac-0: scan phy mdio-mux at address 7
[ 18.175669] mdio_bus stmmac-0: scan phy mdio-mux at address 8
[ 18.175700] mdio_bus stmmac-0: scan phy mdio-mux at address 9
[ 18.175731] mdio_bus stmmac-0: scan phy mdio-mux at address 10
[ 18.175762] mdio_bus stmmac-0: scan phy mdio-mux at address 11
[ 18.175795] mdio_bus stmmac-0: scan phy mdio-mux at address 12
[ 18.175827] mdio_bus stmmac-0: scan phy mdio-mux at address 13
[ 18.175858] mdio_bus stmmac-0: scan phy mdio-mux at address 14
[ 18.175889] mdio_bus stmmac-0: scan phy mdio-mux at address 15
[ 18.175919] mdio_bus stmmac-0: scan phy mdio-mux at address 16
[ 18.175951] mdio_bus stmmac-0: scan phy mdio-mux at address 17
[ 18.175982] mdio_bus stmmac-0: scan phy mdio-mux at address 18
[ 18.176014] mdio_bus stmmac-0: scan phy mdio-mux at address 19
[ 18.176045] mdio_bus stmmac-0: scan phy mdio-mux at address 20
[ 18.176076] mdio_bus stmmac-0: scan phy mdio-mux at address 21
[ 18.176107] mdio_bus stmmac-0: scan phy mdio-mux at address 22
[ 18.176139] mdio_bus stmmac-0: scan phy mdio-mux at address 23
[ 18.176170] mdio_bus stmmac-0: scan phy mdio-mux at address 24
[ 18.176202] mdio_bus stmmac-0: scan phy mdio-mux at address 25
[ 18.176233] mdio_bus stmmac-0: scan phy mdio-mux at address 26
[ 18.176271] mdio_bus stmmac-0: scan phy mdio-mux at address 27
[ 18.176320] mdio_bus stmmac-0: scan phy mdio-mux at address 28
[ 18.176371] mdio_bus stmmac-0: scan phy mdio-mux at address 29
[ 18.176420] mdio_bus stmmac-0: scan phy mdio-mux at address 30
[ 18.176452] mdio_bus stmmac-0: scan phy mdio-mux at address 31
Adding a fake <reg> to mdio-mux remove it, but I found that a bit ugly.
Or perhaps patching of_mdiobus_register() to not scan node with compatible "mdio-mux".
What do you think ?
Regards
^ permalink raw reply
* Re: Latest net-next from GIT panic
From: Cong Wang @ 2017-09-20 18:22 UTC (permalink / raw)
To: Paweł Staszewski
Cc: Eric Dumazet, Wei Wang, Linux Kernel Network Developers,
Eric Dumazet
In-Reply-To: <3c227be7-a954-a406-1987-24e908cf214c@itcare.pl>
On Wed, Sep 20, 2017 at 10:55 AM, Paweł Staszewski
<pstaszewski@itcare.pl> wrote:
>
>
> W dniu 2017-09-20 o 19:50, Cong Wang pisze:
>
> On Wed, Sep 20, 2017 at 6:11 AM, Eric Dumazet <eric.dumazet@gmail.com>
> wrote:
>
> Sorry for top-posting, but this is to give context to Wei, since Pawel
> used a top posting way to report his bisection.
>
> Wei, can you take a look at Pawel report ?
>
> Crash happens in dst_destroy() at following :
>
> if (dst->dev)
> dev_put(dst->dev); <<CRASH>>
>
>
> dst->dev is not NULL, but netdev->pcpu_refcnt is NULL
>
> 65 ff 08 decl %gs:(%rax) // CRASH since rax = NULL
>
>
>
> Pawel, please share your netdevices and routing setup ?
>
> Looks like a double dev_put() on some dev...
>
> Pawel, do you have any idea how this is triggered? Does your
> test try to remove some network device? If so which one?
> I noticed you have at least multiple vlan, bond and ixgbe
> devices.
>
> Just after i start bgp sessions
> So when host is starting i have all bgp sessions to upstreams shutdown
>
> To trigger panic i just enable all 6x bgp sessions at once to upstreams -
> and zebra is start to pull prefixes and push them to the kernel
>
> Then some traffic is generated from test hosts thru this backup router and
> panic is generated - every time after 10 to 15 seconds after bgp sessions
> are connected.
>
> I'm not removing any interface at this time or do anything with interfaces -
> just wait.
>
> And yes there are vlans attached to the bond devices
> but dmesg at this time shows nothing about interfaces or flaps.
This is very odd.
We only free netdevice in free_netdev() and it is only called when
we unregister a netdevice. Otherwise pcpu_refcnt is impossible
to be NULL.
^ permalink raw reply
* Re: [PATCH net-next 12/14] gtp: Configuration for zero UDP checksum
From: Tom Herbert @ 2017-09-20 18:09 UTC (permalink / raw)
To: David Miller
Cc: Tom Herbert, Linux Kernel Network Developers, Pablo Neira Ayuso,
Harald Welte, Rohit Seth
In-Reply-To: <20170918.212445.1680068602644706922.davem@davemloft.net>
On Mon, Sep 18, 2017 at 9:24 PM, David Miller <davem@davemloft.net> wrote:
> From: Tom Herbert <tom@quantonium.net>
> Date: Mon, 18 Sep 2017 17:39:02 -0700
>
>> Add configuration to control use of zero checksums on transmit for both
>> IPv4 and IPv6, and control over accepting zero IPv6 checksums on
>> receive.
>>
>> Signed-off-by: Tom Herbert <tom@quantonium.net>
>
> I thought we were trying to move away from this special case of allowing
> zero UDP checksums with tunnels, especially for ipv6.
I don't have a strong preference either way. I like consistency with
VXLAN and foo/UDP, but I guess it's not required. Interestingly, since
GTP only carries IP, IPv6 zero checksums are actually safer here than
VXLAN or GRE/UDP.
Tom
^ permalink raw reply
* Re: [PATCH net-next 08/14] gtp: Support encpasulating over IPv6
From: Tom Herbert @ 2017-09-20 18:03 UTC (permalink / raw)
To: David Miller
Cc: Tom Herbert, Linux Kernel Network Developers, Pablo Neira Ayuso,
Harald Welte, Rohit Seth
In-Reply-To: <20170918.211952.1397675528248642600.davem@davemloft.net>
On Mon, Sep 18, 2017 at 9:19 PM, David Miller <davem@davemloft.net> wrote:
> From: Tom Herbert <tom@quantonium.net>
> Date: Mon, 18 Sep 2017 17:38:58 -0700
>
>> Allow peers to be specified by IPv6 addresses.
>>
>> Signed-off-by: Tom Herbert <tom@quantonium.net>
>
> Hmmm, can you just check the socket family or something like that?
I'm not sure what code you're referring to.
Thanks
^ permalink raw reply
* Re: Latest net-next from GIT panic
From: Paweł Staszewski @ 2017-09-20 17:58 UTC (permalink / raw)
To: Wei Wang; +Cc: Eric Dumazet, Linux Kernel Network Developers, Eric Dumazet
In-Reply-To: <CAEA6p_D8Av0QAJ46Gzx=Bt3uP-NshTmz2QoBzYaO8OK9+CFQNA@mail.gmail.com>
W dniu 2017-09-20 o 19:46, Wei Wang pisze:
>>> This is why I suggested to replace the BUG() in another mail
>>>
>>> So :
>>>
>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>> index
>>> f535779d9dc1dfe36934c2abba4e43d053ac5d6f..220cd12456754876edf2d3ef13195e82d70d5c74
>>> 100644
>>> --- a/include/linux/netdevice.h
>>> +++ b/include/linux/netdevice.h
>>> @@ -3331,7 +3331,15 @@ void netdev_run_todo(void);
>>> */
>>> static inline void dev_put(struct net_device *dev)
>>> {
>>> - this_cpu_dec(*dev->pcpu_refcnt);
>>> + int __percpu *pref = READ_ONCE(dev->pcpu_refcnt);
>>> +
>>> + if (!pref) {
>>> + pr_err("no pcpu_refcnt on dev %p(%s) state %d dismantle
>>> %d\n",
>>> + dev, dev->name, dev->reg_state, dev->dismantle);
>>> + for (;;)
>>> + cpu_relax();
>>> + }
>>> + this_cpu_dec(*pref);
>>> }
>>> /**
>>>
> Thanks a lot Eric for the debug patch.
>
> Pawel,
>
> I want to confirm with you about the last good commit when you did bisection.
> You mentioned:
>
>> And the last one
>>
>> git bisect good
>> Bisecting: 1 revision left to test after this (roughly 1 step)
>> [1cfb71eeb12047bcdbd3e6730ffed66e810a0855] ipv6: take dst->__refcnt for
>> insertion into fib6 tree
>>
>> With this have kernel panic same as always
>>
>> git bisect bad
>> Bisecting: 0 revisions left to test after this (roughly 0 steps)
>> [b838d5e1c5b6e57b10ec8af2268824041e3ea911] ipv4: mark DST_NOGC and
>> remove the operation of dst_free()
>
> So it breaks right at:
> [b838d5e1c5b6e57b10ec8af2268824041e3ea911] ipv4: mark DST_NOGC and
> remove the operation of dst_free()
> Right?
> If you sync the image to one commit before the above one:
> [9df16efadd2a8a82731dc76ff656c771e261827f] ipv4: call dst_hold_safe() properly
> Does it crash?
Later today i will repeat last three steps - in about next 3 hours after
rush hours of internet traffic - now i cant touch backup router :)
>
> And could you confirm that your config does not have any IPv6
> addresses or routes configured?
There is ipv6 enabled
And yes there are some ipv6 ip's
One interface have ipv6 enabled with one static route
but no ipv6 bgp sessions - so nt many ipv6 prefixes and ipv6 fib is
almost empty
ip -6 r ls | wc -l
57
>
> Thanks.
> Wei
>
>
> 6:03 +0200, Paweł Staszewski wrote:
>>>> Nit much more after adding this patch
>>>>
>>>> https://bugzilla.kernel.org/attachment.cgi?id=258529
>>>>
>>> This is why I suggested to replace the BUG() in another mail
>>>
>>> So :
>>>
>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>> index
>>> f535779d9dc1dfe36934c2abba4e43d053ac5d6f..220cd12456754876edf2d3ef13195e82d70d5c74
>>> 100644
>>> --- a/include/linux/netdevice.h
>>> +++ b/include/linux/netdevice.h
>>> @@ -3331,7 +3331,15 @@ void netdev_run_todo(void);
>>> */
>>> static inline void dev_put(struct net_device *dev)
>>> {
>>> - this_cpu_dec(*dev->pcpu_refcnt);
>>> + int __percpu *pref = READ_ONCE(dev->pcpu_refcnt);
>>> +
>>> + if (!pref) {
>>> + pr_err("no pcpu_refcnt on dev %p(%s) state %d dismantle
>>> %d\n",
>>> + dev, dev->name, dev->reg_state, dev->dismantle);
>>> + for (;;)
>>> + cpu_relax();
>>> + }
>>> + this_cpu_dec(*pref);
>>> }
>>> /**
>>>
>>>
>>>
>> Full panic
>>
>> https://bugzilla.kernel.org/attachment.cgi?id=258531
>>
>>
>> I will change patch and apply but later today cause now cant use backup
>> router as testlab - Internet rush hours if something happens this will be
>> bed when second router will have bugged kernel :)
>>
>>
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox