Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH RFC 0/3] tun zerocopy stats
From: David Miller @ 2017-10-10 17:39 UTC (permalink / raw)
  To: willemdebruijn.kernel; +Cc: netdev, mst, jasowang, willemb
In-Reply-To: <CAF=yD-Jc_ikdYLydYwkw+h_+izVEg2Qp66=5+URokyqM=E=zJg@mail.gmail.com>

From: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
Date: Tue, 10 Oct 2017 11:29:33 -0400

> If there is a way to expose these stats through vhost_net directly,
> instead of through tun, that may be better. But I did not see a
> suitable interface. Perhaps debugfs.

Please don't use debugfs, thank you :-)

^ permalink raw reply

* Re: [Patch net-next] tcp: add a tracepoint for tcp_retransmit_skb()
From: Alexei Starovoitov @ 2017-10-10 17:38 UTC (permalink / raw)
  To: Cong Wang
  Cc: netdev, Eric Dumazet, Yuchung Cheng, Neal Cardwell,
	Martin KaFai Lau, Brendan Gregg, hannes, Song Liu
In-Reply-To: <20171010053547.21162-1-xiyou.wangcong@gmail.com>

On Mon, Oct 09, 2017 at 10:35:47PM -0700, Cong Wang wrote:
> We need a real-time notification for tcp retransmission
> for monitoring.
> 
> Of course we could use ftrace to dynamically instrument this
> kernel function too, however we can't retrieve the connection
> information at the same time, for example perf-tools [1] reads
> /proc/net/tcp for socket details, which is slow when we have
> a lots of connections.
> 
> Therefore, this patch adds a tracepoint for tcp_retransmit_skb()
> and exposes src/dst IP addresses and ports of the connection.
> This also makes it easier to integrate into perf.
> 
> Note, I expose both IPv4 and IPv6 addresses at the same time:
> for a IPv4 socket, v4 mapped address is used as IPv6 addresses,
> for a IPv6 socket, LOOPBACK4_IPV6 is already filled by kernel.
> 
> Perhaps there are other interfaces to use (for example netlink),
> but tracepoint is the quickest way I can think of.
> 
> 1. https://github.com/brendangregg/perf-tools/blob/master/net/tcpretrans
> 
> Cc: Eric Dumazet <edumazet@google.com>
> Cc: Yuchung Cheng <ycheng@google.com>
> Cc: Neal Cardwell <ncardwell@google.com>
> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
> ---
>  include/trace/events/tcp.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++
>  net/core/net-traces.c      |  1 +
>  net/ipv4/tcp_output.c      |  3 +++
>  3 files changed, 67 insertions(+)
>  create mode 100644 include/trace/events/tcp.h
> 
> diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
> new file mode 100644
> index 000000000000..cb22acc8aacd
> --- /dev/null
> +++ b/include/trace/events/tcp.h
> @@ -0,0 +1,63 @@
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM tcp
> +
> +#if !defined(_TRACE_TCP_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_TCP_H
> +
> +#include <linux/ipv6.h>
> +#include <linux/tcp.h>
> +#include <linux/tracepoint.h>
> +#include <net/ipv6.h>
> +
> +TRACE_EVENT(tcp_retransmit_skb,
> +
> +	TP_PROTO(const struct sock *sk, struct sk_buff *skb, int segs),
> +
> +	TP_ARGS(sk, skb, segs),
> +
> +	TP_STRUCT__entry(
> +		__field(__u16, sport)
> +		__field(__u16, dport)
> +		__array(__u8, saddr, 4)
> +		__array(__u8, daddr, 4)
> +		__array(__u8, saddr_v6, 16)
> +		__array(__u8, daddr_v6, 16)
> +	),
> +
> +	TP_fast_assign(
> +		struct ipv6_pinfo *np = inet6_sk(sk);
> +		struct inet_sock *inet = inet_sk(sk);
> +		struct in6_addr *pin6;
> +		__be32 *p32;
> +
> +		__entry->sport = ntohs(inet->inet_sport);
> +		__entry->dport = ntohs(inet->inet_dport);
> +
> +		p32 = (__be32 *) __entry->saddr;
> +		*p32 = inet->inet_saddr;
> +
> +		p32 = (__be32 *) __entry->daddr;
> +		*p32 =  inet->inet_daddr;
> +
> +		if (np) {
> +			pin6 = (struct in6_addr *)__entry->saddr_v6;
> +			*pin6 = np->saddr;
> +			pin6 = (struct in6_addr *)__entry->daddr_v6;
> +			*pin6 = *(np->daddr_cache);
> +		} else {
> +			pin6 = (struct in6_addr *)__entry->saddr_v6;
> +			ipv6_addr_set_v4mapped(inet->inet_saddr, pin6);
> +			pin6 = (struct in6_addr *)__entry->daddr_v6;
> +			ipv6_addr_set_v4mapped(inet->inet_daddr, pin6);
> +		}
> +	),
> +
> +	TP_printk("sport=%hu, dport=%hu, saddr=%pI4, daddr=%pI4, saddrv6=%pI6, daddrv6=%pI6",
> +		  __entry->sport, __entry->dport, __entry->saddr, __entry->daddr,
> +		  __entry->saddr_v6, __entry->daddr_v6)
> +);
> +
> +#endif /* _TRACE_TCP_H */
> +
> +/* This part must be outside protection */
> +#include <trace/define_trace.h>
> diff --git a/net/core/net-traces.c b/net/core/net-traces.c
> index 1132820c8e62..f4e4fa2db505 100644
> --- a/net/core/net-traces.c
> +++ b/net/core/net-traces.c
> @@ -31,6 +31,7 @@
>  #include <trace/events/napi.h>
>  #include <trace/events/sock.h>
>  #include <trace/events/udp.h>
> +#include <trace/events/tcp.h>
>  #include <trace/events/fib.h>
>  #include <trace/events/qdisc.h>
>  #if IS_ENABLED(CONFIG_IPV6)
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 696b0a168f16..e6d6e1393578 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -42,6 +42,8 @@
>  #include <linux/gfp.h>
>  #include <linux/module.h>
>  
> +#include <trace/events/tcp.h>
> +
>  /* People can turn this off for buggy TCP's found in printers etc. */
>  int sysctl_tcp_retrans_collapse __read_mostly = 1;
>  
> @@ -2899,6 +2901,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
>  		if (!tp->retrans_stamp)
>  			tp->retrans_stamp = tcp_skb_timestamp(skb);
>  
> +		trace_tcp_retransmit_skb(sk, skb, segs);

I'm happy to see new tracepoints being added to tcp stack, but I'm concerned
with practical usability of them.
Like the above tracepoint definition makes it not very useful from bpf point of view,
since 'sk' pointer is not recored by as part of the tracepoint.
In bpf/tracing world we prefer tracepoints to have raw pointers recorded
in TP_STRUCT__entry() and _not_ printed in TP_printk()
(since pointers are useless for userspace).
Like trace_kfree_skb() tracepoint records raw 'skb' pointer and we can
walk whatever sk_buff fields we need inside the program.
Such approach allows tracepoint to be usable in many more scenarios, since
bpf program can examine kernel datastructures.
Over the last few years we've been running tcp statistics framework (similar to web10g)
using 8 kprobes in tcp stack with bpf programs extracting the data and now we're
ready to share this experience with the community. Right now we're working on a set
of tracepoints for tcp stack to make the interface more accurate, faster and more stable.
We're planning to send an RFC patch with these new tracepoints in the comming weeks.

More concrete, if you can make this trace_tcp_retransmit_skb() to record
sk, skb pointers and err code at the end of __tcp_retransmit_skb() it will solve
our need as well.

So far our list of kprobes is:
int kprobe__tcp_validate_incoming
int kprobe__tcp_send_active_reset
int kprobe__tcp_v4_send_reset
int kprobe__tcp_v6_send_reset
int kprobe__tcp_v4_destroy_sock
int kprobe__tcp_set_state
int kprobe__tcp_retransmit_skb
int kprobe__tcp_rtx_synack

with tracepoints we can consolidate two of them into one and drop
another one for sure. Notice that tcp_retransmit_skb is on our list too
and currently we're doing extra work inside the program to make it more
accurate which will be unnecessary if this tracepoint is at the end
of __tcp_retransmit_skb().

Thanks

^ permalink raw reply

* Re: [PATCH][net-next] ipv6: fix dereference of rt6_ex before null check error
From: Eric Dumazet @ 2017-10-10 17:26 UTC (permalink / raw)
  To: Colin King
  Cc: David S . Miller, Alexey Kuznetsov, Hideaki YOSHIFUJI, netdev,
	kernel-janitors, linux-kernel
In-Reply-To: <20171010170116.21124-1-colin.king@canonical.com>

On Tue, 2017-10-10 at 18:01 +0100, Colin King wrote:
> From: Colin Ian King <colin.king@canonical.com>
> 
> Currently rt6_ex is being dereferenced before it is null checked
> hence there is a possible null dereference bug. Fix this by only
> dereferencing rt6_ex after it has been null checked.
> 
> Detected by CoverityScan, CID#1457749 ("Dereference before null check")
> 
> Fixes: 81eb8447daae ("ipv6: take care of rt6_stats")
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
> ---

Reviewed-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply

* Re: [patch net-next 0/4] net: sched: get rid of cls_flower->egress_dev
From: Or Gerlitz @ 2017-10-10 17:25 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Linux Netdev List, David Miller, Jamal Hadi Salim, Cong Wang,
	Saeed Mahameed, Matan Barak, Leon Romanovsky, mlxsw
In-Reply-To: <20171010073016.3682-1-jiri@resnulli.us>

On Tue, Oct 10, 2017 at 10:30 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>
>
>  drivers/net/ethernet/mellanox/mlx5/core/en.h      |   3 +
>  drivers/net/ethernet/mellanox/mlx5/core/en_main.c |   4 +-
>  drivers/net/ethernet/mellanox/mlx5/core/en_rep.c  |  31 ++--

Jiri,

FWIW, as I reported to you earlier, I was playing with tc encap/decap
rules on 4.14-rc+ (net) before
applying any patch of this series, and something is messy w.r.t to
decap rules. I don't see
them removed at all when user space attempts to do so. It might
(probably) mlx5 bug, which
we will have to fix for net and later rebase net-next over net. We
have short WW here so
I will not be able to do RCA this week.

^ permalink raw reply

* Re: [PATCH net-next] openvswitch: add ct_clear action
From: Joe Stringer @ 2017-10-10 17:24 UTC (permalink / raw)
  To: Eric Garver, Joe Stringer, Pravin Shelar,
	Linux Kernel Network Developers, ovs dev
In-Reply-To: <20171010150949.GG26353@dev-rhel7>

On 10 October 2017 at 08:09, Eric Garver <e@erig.me> wrote:
> On Tue, Oct 10, 2017 at 05:33:48AM -0700, Joe Stringer wrote:
>> On 9 October 2017 at 21:41, Pravin Shelar <pshelar-LZ6Gd1LRuIk@public.gmane.org> wrote:
>> > On Fri, Oct 6, 2017 at 9:44 AM, Eric Garver <e@erig.me> wrote:
>> >> This adds a ct_clear action for clearing conntrack state. ct_clear is
>> >> currently implemented in OVS userspace, but is not backed by an action
>> >> in the kernel datapath. This is useful for flows that may modify a
>> >> packet tuple after a ct lookup has already occurred.
>> >>
>> >> Signed-off-by: Eric Garver <e@erig.me>
>> > Patch mostly looks good. I have following comments.
>> >
>> >> ---
>> >>  include/uapi/linux/openvswitch.h |  2 ++
>> >>  net/openvswitch/actions.c        |  5 +++++
>> >>  net/openvswitch/conntrack.c      | 12 ++++++++++++
>> >>  net/openvswitch/conntrack.h      |  7 +++++++
>> >>  net/openvswitch/flow_netlink.c   |  5 +++++
>> >>  5 files changed, 31 insertions(+)
>> >>
>> >> diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
>> >> index 156ee4cab82e..1b6e510e2cc6 100644
>> >> --- a/include/uapi/linux/openvswitch.h
>> >> +++ b/include/uapi/linux/openvswitch.h
>> >> @@ -806,6 +806,7 @@ struct ovs_action_push_eth {
>> >>   * packet.
>> >>   * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the
>> >>   * packet.
>> >> + * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet.
>> >>   *
>> >>   * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
>> >>   * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
>> >> @@ -835,6 +836,7 @@ enum ovs_action_attr {
>> >>         OVS_ACTION_ATTR_TRUNC,        /* u32 struct ovs_action_trunc. */
>> >>         OVS_ACTION_ATTR_PUSH_ETH,     /* struct ovs_action_push_eth. */
>> >>         OVS_ACTION_ATTR_POP_ETH,      /* No argument. */
>> >> +       OVS_ACTION_ATTR_CT_CLEAR,     /* No argument. */
>> >>
>> >>         __OVS_ACTION_ATTR_MAX,        /* Nothing past this will be accepted
>> >>                                        * from userspace. */
>> >> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
>> >> index a54a556fcdb5..db9c7f2e662b 100644
>> >> --- a/net/openvswitch/actions.c
>> >> +++ b/net/openvswitch/actions.c
>> >> @@ -1203,6 +1203,10 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
>> >>                                 return err == -EINPROGRESS ? 0 : err;
>> >>                         break;
>> >>
>> >> +               case OVS_ACTION_ATTR_CT_CLEAR:
>> >> +                       err = ovs_ct_clear(skb, key);
>> >> +                       break;
>> >> +
>> >>                 case OVS_ACTION_ATTR_PUSH_ETH:
>> >>                         err = push_eth(skb, key, nla_data(a));
>> >>                         break;
>> >> @@ -1210,6 +1214,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
>> >>                 case OVS_ACTION_ATTR_POP_ETH:
>> >>                         err = pop_eth(skb, key);
>> >>                         break;
>> >> +
>> >>                 }
>> > Unrelated change.
>> >
>> >>
>> >>                 if (unlikely(err)) {
>> >> diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
>> >> index d558e882ca0c..f9b73c726ad7 100644
>> >> --- a/net/openvswitch/conntrack.c
>> >> +++ b/net/openvswitch/conntrack.c
>> >> @@ -1129,6 +1129,18 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
>> >>         return err;
>> >>  }
>> >>
>> >> +int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key)
>> >> +{
>> >> +       if (skb_nfct(skb)) {
>> >> +               nf_conntrack_put(skb_nfct(skb));
>> >> +               nf_ct_set(skb, NULL, 0);
>> > Can the new conntract state be appropriate? may be IP_CT_UNTRACKED?
>> >
>> >> +       }
>> >> +
>> >> +       ovs_ct_fill_key(skb, key);
>> >> +
>> > I do not see need to refill the key if there is no skb-nf-ct.
>>
>> Really this is trying to just zero the CT key fields, but reuses
>> existing functions, right? This means that subsequent upcalls, for
>
> Right.
>
>> instance, won't have the outdated view of the CT state from the
>> previous lookup (that was prior to the ct_clear). I'd expect these key
>> fields to be cleared.
>
> I assumed Pravin was saying that we don't need to clear them if there is
> no conntrack state. They should already be zero.

The conntrack calls aren't going to clear it, so I don't see what else
would clear it?

If you execute ct(),ct_clear(), then the first ct will set the
values.. what will zero them?

^ permalink raw reply

* Re: [PATCH RFC 0/3] tun zerocopy stats
From: Stephen Hemminger @ 2017-10-10 17:23 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: David Miller, Network Development, Michael S. Tsirkin, Jason Wang,
	Willem de Bruijn
In-Reply-To: <CAF=yD-Jc_ikdYLydYwkw+h_+izVEg2Qp66=5+URokyqM=E=zJg@mail.gmail.com>

On Tue, 10 Oct 2017 11:29:33 -0400
Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:

> On Mon, Oct 9, 2017 at 11:52 PM, David Miller <davem@davemloft.net> wrote:
> > From: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
> > Date: Fri,  6 Oct 2017 18:25:13 -0400
> >  
> >> From: Willem de Bruijn <willemb@google.com>
> >>
> >> Add zerocopy transfer statistics to the vhost_net/tun zerocopy path.
> >>
> >> I've been using this to verify recent changes to zerocopy tuning [1].
> >> Sharing more widely, as it may be useful in similar future work.
> >>
> >> Use ethtool stats as interface, as these are defined per device
> >> driver and can easily be extended.
> >>
> >> Make the zerocopy release callback take an extra hop through the tun
> >> driver to allow the driver to increment its counters.
> >>
> >> Care must be taken to avoid adding an alloc/free to this hot path.
> >> Since the caller already must allocate a ubuf_info, make it allocate
> >> two at a time and grant one to the tun device.
> >>
> >>  1/3: introduce ethtool stats (`ethtool -S $DEV`) for tun devices
> >>  2/3: add zerocopy tx and tx_err counters
> >>  3/3: convert vhost_net to pass a pair of ubuf_info to tun
> >>
> >> [1] http://patchwork.ozlabs.org/patch/822613/  
> >
> > This looks mostly fine to me, but I don't know enough about how vhost
> > and tap interact to tell whether this makes sense to upstream.  
> 
> Thanks for taking a look. The need for monitoring these stats has
> come up in a couple of patch evaluation discussions, so I wanted
> to share at least one implementation to get the data.
> 
> Because the choice to use zerocopy is based on heuristics and
> there is a cost if it mispredicts, I think we even want to being able
> to continuously monitor this in production.
> 
> The implementation is probably not ready for that as is.

Another alternative is to use tracepoints for this.
If you need statistics in production then per-cpu (or per-queue) stats
would have less impact. Tracepoints have no visible impact unless used.

^ permalink raw reply

* [net-next 9/9] igb: check memory allocation failure
From: Jeff Kirsher @ 2017-10-10 17:21 UTC (permalink / raw)
  To: davem; +Cc: Christophe JAILLET, netdev, nhorman, sassmann, jogreene,
	Jeff Kirsher
In-Reply-To: <20171010172139.77914-1-jeffrey.t.kirsher@intel.com>

From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>

Check memory allocation failures and return -ENOMEM in such cases, as
already done for other memory allocations in this function.

This avoids NULL pointers dereference.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Tested-by: Aaron Brown <aaron.f.brown@intel.com
Acked-by: PJ Waskiewicz <peter.waskiewicz.jr@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index fd4a46b03cc8..837d9b46a390 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -3162,6 +3162,8 @@ static int igb_sw_init(struct igb_adapter *adapter)
 	/* Setup and initialize a copy of the hw vlan table array */
 	adapter->shadow_vfta = kcalloc(E1000_VLAN_FILTER_TBL_SIZE, sizeof(u32),
 				       GFP_ATOMIC);
+	if (!adapter->shadow_vfta)
+		return -ENOMEM;
 
 	/* This call may decrease the number of queues */
 	if (igb_init_interrupt_scheme(adapter, true)) {
-- 
2.14.2

^ permalink raw reply related

* [net-next 8/9] e1000e: Be drop monitor friendly
From: Jeff Kirsher @ 2017-10-10 17:21 UTC (permalink / raw)
  To: davem; +Cc: Florian Fainelli, netdev, nhorman, sassmann, jogreene,
	Jeff Kirsher
In-Reply-To: <20171010172139.77914-1-jeffrey.t.kirsher@intel.com>

From: Florian Fainelli <f.fainelli@gmail.com>

e1000e_put_txbuf() can be called from normal reclamation path as well as
when a DMA mapping failure, so we need to differentiate these two cases
when freeing SKBs to be drop monitor friendly. e1000e_tx_hwtstamp_work()
and e1000_remove() are processing TX timestamped SKBs and those should
not be accounted as drops either.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000e/netdev.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 00f48d4cabec..bf8f38f76953 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -1071,7 +1071,8 @@ static bool e1000_clean_rx_irq(struct e1000_ring *rx_ring, int *work_done,
 }
 
 static void e1000_put_txbuf(struct e1000_ring *tx_ring,
-			    struct e1000_buffer *buffer_info)
+			    struct e1000_buffer *buffer_info,
+			    bool drop)
 {
 	struct e1000_adapter *adapter = tx_ring->adapter;
 
@@ -1085,7 +1086,10 @@ static void e1000_put_txbuf(struct e1000_ring *tx_ring,
 		buffer_info->dma = 0;
 	}
 	if (buffer_info->skb) {
-		dev_kfree_skb_any(buffer_info->skb);
+		if (drop)
+			dev_kfree_skb_any(buffer_info->skb);
+		else
+			dev_consume_skb_any(buffer_info->skb);
 		buffer_info->skb = NULL;
 	}
 	buffer_info->time_stamp = 0;
@@ -1199,7 +1203,7 @@ static void e1000e_tx_hwtstamp_work(struct work_struct *work)
 		wmb(); /* force write prior to skb_tstamp_tx */
 
 		skb_tstamp_tx(skb, &shhwtstamps);
-		dev_kfree_skb_any(skb);
+		dev_consume_skb_any(skb);
 	} else if (time_after(jiffies, adapter->tx_hwtstamp_start
 			      + adapter->tx_timeout_factor * HZ)) {
 		dev_kfree_skb_any(adapter->tx_hwtstamp_skb);
@@ -1254,7 +1258,7 @@ static bool e1000_clean_tx_irq(struct e1000_ring *tx_ring)
 				}
 			}
 
-			e1000_put_txbuf(tx_ring, buffer_info);
+			e1000_put_txbuf(tx_ring, buffer_info, false);
 			tx_desc->upper.data = 0;
 
 			i++;
@@ -2437,7 +2441,7 @@ static void e1000_clean_tx_ring(struct e1000_ring *tx_ring)
 
 	for (i = 0; i < tx_ring->count; i++) {
 		buffer_info = &tx_ring->buffer_info[i];
-		e1000_put_txbuf(tx_ring, buffer_info);
+		e1000_put_txbuf(tx_ring, buffer_info, false);
 	}
 
 	netdev_reset_queue(adapter->netdev);
@@ -5625,7 +5629,7 @@ static int e1000_tx_map(struct e1000_ring *tx_ring, struct sk_buff *skb,
 			i += tx_ring->count;
 		i--;
 		buffer_info = &tx_ring->buffer_info[i];
-		e1000_put_txbuf(tx_ring, buffer_info);
+		e1000_put_txbuf(tx_ring, buffer_info, true);
 	}
 
 	return 0;
@@ -7419,7 +7423,7 @@ static void e1000_remove(struct pci_dev *pdev)
 	if (adapter->flags & FLAG_HAS_HW_TIMESTAMP) {
 		cancel_work_sync(&adapter->tx_hwtstamp_work);
 		if (adapter->tx_hwtstamp_skb) {
-			dev_kfree_skb_any(adapter->tx_hwtstamp_skb);
+			dev_consume_skb_any(adapter->tx_hwtstamp_skb);
 			adapter->tx_hwtstamp_skb = NULL;
 		}
 	}
-- 
2.14.2

^ permalink raw reply related

* [net-next 7/9] e1000e: apply burst mode settings only on default
From: Jeff Kirsher @ 2017-10-10 17:21 UTC (permalink / raw)
  To: davem; +Cc: Willem de Bruijn, netdev, nhorman, sassmann, jogreene,
	Jeff Kirsher
In-Reply-To: <20171010172139.77914-1-jeffrey.t.kirsher@intel.com>

From: Willem de Bruijn <willemb@google.com>

Devices that support FLAG2_DMA_BURST have different default values
for RDTR and RADV. Apply burst mode default settings only when no
explicit value was passed at module load.

The RDTR default is zero. If the module is loaded for low latency
operation with RxIntDelay=0, do not override this value with a burst
default of 32.

Move the decision to apply burst values earlier, where explicitly
initialized module variables can be distinguished from defaults.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000e/e1000.h  |  4 ----
 drivers/net/ethernet/intel/e1000e/netdev.c |  8 --------
 drivers/net/ethernet/intel/e1000e/param.c  | 16 +++++++++++++++-
 3 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h
index 98e68888abb1..2311b31bdcac 100644
--- a/drivers/net/ethernet/intel/e1000e/e1000.h
+++ b/drivers/net/ethernet/intel/e1000e/e1000.h
@@ -94,10 +94,6 @@ struct e1000_info;
  */
 #define E1000_CHECK_RESET_COUNT		25
 
-#define DEFAULT_RDTR			0
-#define DEFAULT_RADV			8
-#define BURST_RDTR			0x20
-#define BURST_RADV			0x20
 #define PCICFG_DESC_RING_STATUS		0xe4
 #define FLUSH_DESC_REQUIRED		0x100
 
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 14b096f3d1da..00f48d4cabec 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -3242,14 +3242,6 @@ static void e1000_configure_rx(struct e1000_adapter *adapter)
 		 */
 		ew32(RXDCTL(0), E1000_RXDCTL_DMA_BURST_ENABLE);
 		ew32(RXDCTL(1), E1000_RXDCTL_DMA_BURST_ENABLE);
-
-		/* override the delay timers for enabling bursting, only if
-		 * the value was not set by the user via module options
-		 */
-		if (adapter->rx_int_delay == DEFAULT_RDTR)
-			adapter->rx_int_delay = BURST_RDTR;
-		if (adapter->rx_abs_int_delay == DEFAULT_RADV)
-			adapter->rx_abs_int_delay = BURST_RADV;
 	}
 
 	/* set the Receive Delay Timer Register */
diff --git a/drivers/net/ethernet/intel/e1000e/param.c b/drivers/net/ethernet/intel/e1000e/param.c
index 6d8c39abee16..47da51864543 100644
--- a/drivers/net/ethernet/intel/e1000e/param.c
+++ b/drivers/net/ethernet/intel/e1000e/param.c
@@ -73,17 +73,25 @@ E1000_PARAM(TxAbsIntDelay, "Transmit Absolute Interrupt Delay");
 /* Receive Interrupt Delay in units of 1.024 microseconds
  * hardware will likely hang if you set this to anything but zero.
  *
+ * Burst variant is used as default if device has FLAG2_DMA_BURST.
+ *
  * Valid Range: 0-65535
  */
 E1000_PARAM(RxIntDelay, "Receive Interrupt Delay");
+#define DEFAULT_RDTR	0
+#define BURST_RDTR	0x20
 #define MAX_RXDELAY 0xFFFF
 #define MIN_RXDELAY 0
 
 /* Receive Absolute Interrupt Delay in units of 1.024 microseconds
+ *
+ * Burst variant is used as default if device has FLAG2_DMA_BURST.
  *
  * Valid Range: 0-65535
  */
 E1000_PARAM(RxAbsIntDelay, "Receive Absolute Interrupt Delay");
+#define DEFAULT_RADV	8
+#define BURST_RADV	0x20
 #define MAX_RXABSDELAY 0xFFFF
 #define MIN_RXABSDELAY 0
 
@@ -297,6 +305,9 @@ void e1000e_check_options(struct e1000_adapter *adapter)
 					 .max = MAX_RXDELAY } }
 		};
 
+		if (adapter->flags2 & FLAG2_DMA_BURST)
+			opt.def = BURST_RDTR;
+
 		if (num_RxIntDelay > bd) {
 			adapter->rx_int_delay = RxIntDelay[bd];
 			e1000_validate_option(&adapter->rx_int_delay, &opt,
@@ -307,7 +318,7 @@ void e1000e_check_options(struct e1000_adapter *adapter)
 	}
 	/* Receive Absolute Interrupt Delay */
 	{
-		static const struct e1000_option opt = {
+		static struct e1000_option opt = {
 			.type = range_option,
 			.name = "Receive Absolute Interrupt Delay",
 			.err  = "using default of "
@@ -317,6 +328,9 @@ void e1000e_check_options(struct e1000_adapter *adapter)
 					 .max = MAX_RXABSDELAY } }
 		};
 
+		if (adapter->flags2 & FLAG2_DMA_BURST)
+			opt.def = BURST_RADV;
+
 		if (num_RxAbsIntDelay > bd) {
 			adapter->rx_abs_int_delay = RxAbsIntDelay[bd];
 			e1000_validate_option(&adapter->rx_abs_int_delay, &opt,
-- 
2.14.2

^ permalink raw reply related

* [net-next 6/9] e1000e: fix buffer overrun while the I219 is processing DMA transactions
From: Jeff Kirsher @ 2017-10-10 17:21 UTC (permalink / raw)
  To: davem; +Cc: Sasha Neftin, netdev, nhorman, sassmann, jogreene, Jeff Kirsher
In-Reply-To: <20171010172139.77914-1-jeffrey.t.kirsher@intel.com>

From: Sasha Neftin <sasha.neftin@intel.com>

Intel® 100/200 Series Chipset platforms reduced the round-trip
latency for the LAN Controller DMA accesses, causing in some high
performance cases a buffer overrun while the I219 LAN Connected
Device is processing the DMA transactions. I219LM and I219V devices
can fall into unrecovered Tx hang under very stressfully UDP traffic
and multiple reconnection of Ethernet cable. This Tx hang of the LAN
Controller is only recovered if the system is rebooted. Slightly slow
down DMA access by reducing the number of outstanding requests.
This workaround could have an impact on TCP traffic performance
on the platform. Disabling TSO eliminates performance loss for TCP
traffic without a noticeable impact on CPU performance.

Please, refer to I218/I219 specification update:
https://www.intel.com/content/www/us/en/embedded/products/networking/
ethernet-connection-i218-family-documentation.html

Signed-off-by: Sasha Neftin <sasha.neftin@intel.com>
Reviewed-by: Dima Ruinskiy <dima.ruinskiy@intel.com>
Reviewed-by: Raanan Avargil <raanan.avargil@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000e/netdev.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index ee9de3500331..14b096f3d1da 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -3021,8 +3021,8 @@ static void e1000_configure_tx(struct e1000_adapter *adapter)
 
 	hw->mac.ops.config_collision_dist(hw);
 
-	/* SPT and CNP Si errata workaround to avoid data corruption */
-	if (hw->mac.type >= e1000_pch_spt) {
+	/* SPT and KBL Si errata workaround to avoid data corruption */
+	if (hw->mac.type == e1000_pch_spt) {
 		u32 reg_val;
 
 		reg_val = er32(IOSFPC);
@@ -3030,7 +3030,9 @@ static void e1000_configure_tx(struct e1000_adapter *adapter)
 		ew32(IOSFPC, reg_val);
 
 		reg_val = er32(TARC(0));
-		reg_val |= E1000_TARC0_CB_MULTIQ_3_REQ;
+		/* SPT and KBL Si errata workaround to avoid Tx hang */
+		reg_val &= ~BIT(28);
+		reg_val |= BIT(29);
 		ew32(TARC(0), reg_val);
 	}
 }
-- 
2.14.2

^ permalink raw reply related

* [net-next 5/9] e1000e: Avoid receiver overrun interrupt bursts
From: Jeff Kirsher @ 2017-10-10 17:21 UTC (permalink / raw)
  To: davem; +Cc: Benjamin Poirier, netdev, nhorman, sassmann, jogreene,
	Jeff Kirsher
In-Reply-To: <20171010172139.77914-1-jeffrey.t.kirsher@intel.com>

From: Benjamin Poirier <bpoirier@suse.com>

When e1000e_poll() is not fast enough to keep up with incoming traffic, the
adapter (when operating in msix mode) raises the Other interrupt to signal
Receiver Overrun.

This is a double problem because 1) at the moment e1000_msix_other()
assumes that it is only called in case of Link Status Change and 2) if the
condition persists, the interrupt is repeatedly raised again in quick
succession.

Ideally we would configure the Other interrupt to not be raised in case of
receiver overrun but this doesn't seem possible on this adapter. Instead,
we handle the first part of the problem by reverting to the practice of
reading ICR in the other interrupt handler, like before commit 16ecba59bc33
("e1000e: Do not read ICR in Other interrupt"). Thanks to commit
0a8047ac68e5 ("e1000e: Fix msi-x interrupt automask") which cleared IAME
from CTRL_EXT, reading ICR doesn't interfere with RxQ0, TxQ0 interrupts
anymore. We handle the second part of the problem by not re-enabling the
Other interrupt right away when there is overrun. Instead, we wait until
traffic subsides, napi polling mode is exited and interrupts are
re-enabled.

Reported-by: Lennart Sorensen <lsorense@csclub.uwaterloo.ca>
Fixes: 16ecba59bc33 ("e1000e: Do not read ICR in Other interrupt")
Signed-off-by: Benjamin Poirier <bpoirier@suse.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000e/defines.h |  1 +
 drivers/net/ethernet/intel/e1000e/netdev.c  | 33 ++++++++++++++++++++++-------
 2 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/defines.h b/drivers/net/ethernet/intel/e1000e/defines.h
index 0641c0098738..afb7ebe20b24 100644
--- a/drivers/net/ethernet/intel/e1000e/defines.h
+++ b/drivers/net/ethernet/intel/e1000e/defines.h
@@ -398,6 +398,7 @@
 #define E1000_ICR_LSC           0x00000004 /* Link Status Change */
 #define E1000_ICR_RXSEQ         0x00000008 /* Rx sequence error */
 #define E1000_ICR_RXDMT0        0x00000010 /* Rx desc min. threshold (0) */
+#define E1000_ICR_RXO           0x00000040 /* Receiver Overrun */
 #define E1000_ICR_RXT0          0x00000080 /* Rx timer intr (ring 0) */
 #define E1000_ICR_ECCER         0x00400000 /* Uncorrectable ECC Error */
 /* If this bit asserted, the driver should claim the interrupt */
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 0a5f95ab0d3c..ee9de3500331 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -1910,14 +1910,30 @@ static irqreturn_t e1000_msix_other(int __always_unused irq, void *data)
 	struct net_device *netdev = data;
 	struct e1000_adapter *adapter = netdev_priv(netdev);
 	struct e1000_hw *hw = &adapter->hw;
+	u32 icr;
+	bool enable = true;
+
+	icr = er32(ICR);
+	if (icr & E1000_ICR_RXO) {
+		ew32(ICR, E1000_ICR_RXO);
+		enable = false;
+		/* napi poll will re-enable Other, make sure it runs */
+		if (napi_schedule_prep(&adapter->napi)) {
+			adapter->total_rx_bytes = 0;
+			adapter->total_rx_packets = 0;
+			__napi_schedule(&adapter->napi);
+		}
+	}
+	if (icr & E1000_ICR_LSC) {
+		ew32(ICR, E1000_ICR_LSC);
+		hw->mac.get_link_status = true;
+		/* guard against interrupt when we're going down */
+		if (!test_bit(__E1000_DOWN, &adapter->state))
+			mod_timer(&adapter->watchdog_timer, jiffies + 1);
+	}
 
-	hw->mac.get_link_status = true;
-
-	/* guard against interrupt when we're going down */
-	if (!test_bit(__E1000_DOWN, &adapter->state)) {
-		mod_timer(&adapter->watchdog_timer, jiffies + 1);
+	if (enable && !test_bit(__E1000_DOWN, &adapter->state))
 		ew32(IMS, E1000_IMS_OTHER);
-	}
 
 	return IRQ_HANDLED;
 }
@@ -2687,7 +2703,8 @@ static int e1000e_poll(struct napi_struct *napi, int weight)
 		napi_complete_done(napi, work_done);
 		if (!test_bit(__E1000_DOWN, &adapter->state)) {
 			if (adapter->msix_entries)
-				ew32(IMS, adapter->rx_ring->ims_val);
+				ew32(IMS, adapter->rx_ring->ims_val |
+				     E1000_IMS_OTHER);
 			else
 				e1000_irq_enable(adapter);
 		}
@@ -4204,7 +4221,7 @@ static void e1000e_trigger_lsc(struct e1000_adapter *adapter)
 	struct e1000_hw *hw = &adapter->hw;
 
 	if (adapter->msix_entries)
-		ew32(ICS, E1000_ICS_OTHER);
+		ew32(ICS, E1000_ICS_LSC | E1000_ICS_OTHER);
 	else
 		ew32(ICS, E1000_ICS_LSC);
 }
-- 
2.14.2

^ permalink raw reply related

* [net-next 3/9] e1000e: Fix return value test
From: Jeff Kirsher @ 2017-10-10 17:21 UTC (permalink / raw)
  To: davem; +Cc: Benjamin Poirier, netdev, nhorman, sassmann, jogreene,
	Jeff Kirsher
In-Reply-To: <20171010172139.77914-1-jeffrey.t.kirsher@intel.com>

From: Benjamin Poirier <bpoirier@suse.com>

All the helpers return -E1000_ERR_PHY.

Signed-off-by: Benjamin Poirier <bpoirier@suse.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000e/netdev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index ead4c112580e..a740de6a30b0 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -5099,7 +5099,7 @@ static bool e1000e_has_link(struct e1000_adapter *adapter)
 		break;
 	}
 
-	if ((ret_val == E1000_ERR_PHY) && (hw->phy.type == e1000_phy_igp_3) &&
+	if ((ret_val == -E1000_ERR_PHY) && (hw->phy.type == e1000_phy_igp_3) &&
 	    (er32(CTRL) & E1000_PHY_CTRL_GBE_DISABLE)) {
 		/* See e1000_kmrn_lock_loss_workaround_ich8lan() */
 		e_info("Gigabit has been disabled, downgrading speed\n");
-- 
2.14.2

^ permalink raw reply related

* [net-next 4/9] e1000e: Separate signaling for link check/link up
From: Jeff Kirsher @ 2017-10-10 17:21 UTC (permalink / raw)
  To: davem; +Cc: Benjamin Poirier, netdev, nhorman, sassmann, jogreene,
	Jeff Kirsher
In-Reply-To: <20171010172139.77914-1-jeffrey.t.kirsher@intel.com>

From: Benjamin Poirier <bpoirier@suse.com>

Lennart reported the following race condition:

\ e1000_watchdog_task
    \ e1000e_has_link
        \ hw->mac.ops.check_for_link() === e1000e_check_for_copper_link
            /* link is up */
            mac->get_link_status = false;

                            /* interrupt */
                            \ e1000_msix_other
                                hw->mac.get_link_status = true;

        link_active = !hw->mac.get_link_status
        /* link_active is false, wrongly */

This problem arises because the single flag get_link_status is used to
signal two different states: link status needs checking and link status is
down.

Avoid the problem by using the return value of .check_for_link to signal
the link status to e1000e_has_link().

Reported-by: Lennart Sorensen <lsorense@csclub.uwaterloo.ca>
Signed-off-by: Benjamin Poirier <bpoirier@suse.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000e/mac.c    | 11 ++++++++---
 drivers/net/ethernet/intel/e1000e/netdev.c |  2 +-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/mac.c b/drivers/net/ethernet/intel/e1000e/mac.c
index b322011ec282..f457c5703d0c 100644
--- a/drivers/net/ethernet/intel/e1000e/mac.c
+++ b/drivers/net/ethernet/intel/e1000e/mac.c
@@ -410,6 +410,9 @@ void e1000e_clear_hw_cntrs_base(struct e1000_hw *hw)
  *  Checks to see of the link status of the hardware has changed.  If a
  *  change in link status has been detected, then we read the PHY registers
  *  to get the current speed/duplex if link exists.
+ *
+ *  Returns a negative error code (-E1000_ERR_*) or 0 (link down) or 1 (link
+ *  up).
  **/
 s32 e1000e_check_for_copper_link(struct e1000_hw *hw)
 {
@@ -423,7 +426,7 @@ s32 e1000e_check_for_copper_link(struct e1000_hw *hw)
 	 * Change or Rx Sequence Error interrupt.
 	 */
 	if (!mac->get_link_status)
-		return 0;
+		return 1;
 
 	/* First we want to see if the MII Status Register reports
 	 * link.  If so, then we want to get the current speed/duplex
@@ -461,10 +464,12 @@ s32 e1000e_check_for_copper_link(struct e1000_hw *hw)
 	 * different link partner.
 	 */
 	ret_val = e1000e_config_fc_after_link_up(hw);
-	if (ret_val)
+	if (ret_val) {
 		e_dbg("Error configuring flow control\n");
+		return ret_val;
+	}
 
-	return ret_val;
+	return 1;
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index a740de6a30b0..0a5f95ab0d3c 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -5081,7 +5081,7 @@ static bool e1000e_has_link(struct e1000_adapter *adapter)
 	case e1000_media_type_copper:
 		if (hw->mac.get_link_status) {
 			ret_val = hw->mac.ops.check_for_link(hw);
-			link_active = !hw->mac.get_link_status;
+			link_active = ret_val > 0;
 		} else {
 			link_active = true;
 		}
-- 
2.14.2

^ permalink raw reply related

* [net-next 1/9] e1000e: Fix error path in link detection
From: Jeff Kirsher @ 2017-10-10 17:21 UTC (permalink / raw)
  To: davem; +Cc: Benjamin Poirier, netdev, nhorman, sassmann, jogreene,
	Jeff Kirsher
In-Reply-To: <20171010172139.77914-1-jeffrey.t.kirsher@intel.com>

From: Benjamin Poirier <bpoirier@suse.com>

In case of error from e1e_rphy(), the loop will exit early and "success"
will be set to true erroneously.

Signed-off-by: Benjamin Poirier <bpoirier@suse.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000e/phy.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/phy.c b/drivers/net/ethernet/intel/e1000e/phy.c
index d78d47b41a71..86ff0969efb6 100644
--- a/drivers/net/ethernet/intel/e1000e/phy.c
+++ b/drivers/net/ethernet/intel/e1000e/phy.c
@@ -1744,6 +1744,7 @@ s32 e1000e_phy_has_link_generic(struct e1000_hw *hw, u32 iterations,
 	s32 ret_val = 0;
 	u16 i, phy_status;
 
+	*success = false;
 	for (i = 0; i < iterations; i++) {
 		/* Some PHYs require the MII_BMSR register to be read
 		 * twice due to the link bit being sticky.  No harm doing
@@ -1763,16 +1764,16 @@ s32 e1000e_phy_has_link_generic(struct e1000_hw *hw, u32 iterations,
 		ret_val = e1e_rphy(hw, MII_BMSR, &phy_status);
 		if (ret_val)
 			break;
-		if (phy_status & BMSR_LSTATUS)
+		if (phy_status & BMSR_LSTATUS) {
+			*success = true;
 			break;
+		}
 		if (usec_interval >= 1000)
 			msleep(usec_interval / 1000);
 		else
 			udelay(usec_interval);
 	}
 
-	*success = (i < iterations);
-
 	return ret_val;
 }
 
-- 
2.14.2

^ permalink raw reply related

* [net-next 2/9] e1000e: Fix wrong comment related to link detection
From: Jeff Kirsher @ 2017-10-10 17:21 UTC (permalink / raw)
  To: davem; +Cc: Benjamin Poirier, netdev, nhorman, sassmann, jogreene,
	Jeff Kirsher
In-Reply-To: <20171010172139.77914-1-jeffrey.t.kirsher@intel.com>

From: Benjamin Poirier <bpoirier@suse.com>

Reading e1000e_check_for_copper_link() shows that get_link_status is set to
false after link has been detected. Therefore, it stays TRUE until then.

Signed-off-by: Benjamin Poirier <bpoirier@suse.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000e/netdev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 8436c5f2c3e8..ead4c112580e 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -5074,7 +5074,7 @@ static bool e1000e_has_link(struct e1000_adapter *adapter)
 
 	/* get_link_status is set on LSC (link status) interrupt or
 	 * Rx sequence error interrupt.  get_link_status will stay
-	 * false until the check_for_link establishes link
+	 * true until the check_for_link establishes link
 	 * for copper adapters ONLY
 	 */
 	switch (hw->phy.media_type) {
@@ -5092,7 +5092,7 @@ static bool e1000e_has_link(struct e1000_adapter *adapter)
 		break;
 	case e1000_media_type_internal_serdes:
 		ret_val = hw->mac.ops.check_for_link(hw);
-		link_active = adapter->hw.mac.serdes_has_link;
+		link_active = hw->mac.serdes_has_link;
 		break;
 	default:
 	case e1000_media_type_unknown:
-- 
2.14.2

^ permalink raw reply related

* [net-next 0/9][pull request] 1GbE Intel Wired LAN Driver Updates 2017-10-10
From: Jeff Kirsher @ 2017-10-10 17:21 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, nhorman, sassmann, jogreene

This series contains updates to e1000e and igb.

Benjamin Poirier provides several fixes for e1000e, starting with a
correction to the return status which was always returning success even
if it was not successful.  Fixed code comments to reflect the actual
code behavior.  Fixed the conditional test for the correct return
value.  Fixed a potential race condition reported by Lennart Sorensen,
where the single flag get_link_status is used to signal two different
states.

Sasha fixes a buffer overrun for i219 devices, where the chipset had
reduced the round-trip latency for the LAN controller DMA accesses
which in some high performance cases caused a buffer overrun while
processing the DMA transactions.

Willem de Bruijn changes the default behavior of e1000e to use the
burst mode settings by default unless the user specifies the
receive interrupt delay (RxIntDelay).

Florian Fainelli updates the driver to differentiate between when
e1000e_put_txbuf() is called from normal reclamation or when a
DMA mapping failure to make the driver more "drop monitor friendly".

Christophe JAILLET fixes a potential NULL pointer dereference by
properly returning -ENOMEM on memory allocation failures.

The following are changes since commit 812b5ca7d376e7e008ac0c897d1ef94eb05ddc3b:
  Add a driver for Renesas uPD60620 and uPD60620A PHYs
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 1GbE

Benjamin Poirier (5):
  e1000e: Fix error path in link detection
  e1000e: Fix wrong comment related to link detection
  e1000e: Fix return value test
  e1000e: Separate signaling for link check/link up
  e1000e: Avoid receiver overrun interrupt bursts

Christophe JAILLET (1):
  igb: check memory allocation failure

Florian Fainelli (1):
  e1000e: Be drop monitor friendly

Sasha Neftin (1):
  e1000e: fix buffer overrun while the I219 is processing DMA
    transactions

Willem de Bruijn (1):
  e1000e: apply burst mode settings only on default

 drivers/net/ethernet/intel/e1000e/defines.h |  1 +
 drivers/net/ethernet/intel/e1000e/e1000.h   |  4 --
 drivers/net/ethernet/intel/e1000e/mac.c     | 11 +++--
 drivers/net/ethernet/intel/e1000e/netdev.c  | 75 +++++++++++++++++------------
 drivers/net/ethernet/intel/e1000e/param.c   | 16 +++++-
 drivers/net/ethernet/intel/e1000e/phy.c     |  7 +--
 drivers/net/ethernet/intel/igb/igb_main.c   |  2 +
 7 files changed, 75 insertions(+), 41 deletions(-)

-- 
2.14.2

^ permalink raw reply

* Re: [PATCH v2] xdp: Sample xdp program implementing ip forward
From: Stephen Hemminger @ 2017-10-10 17:21 UTC (permalink / raw)
  To: Christina Jacob
  Cc: netdev, linux-kernel, linux-arm-kernel, brouer, Sunil.Goutham,
	daniel, dsahern, Christina Jacob
In-Reply-To: <1507620532-25804-2-git-send-email-Christina.Jacob@cavium.com>

On Tue, 10 Oct 2017 12:58:52 +0530
Christina Jacob <christina.jacob.koikara@gmail.com> wrote:

> +		bzero(&route, sizeof(route));
> +		bzero(dsts, sizeof(dsts));
> +		bzero(dsts_len, sizeof(dsts_len));
> +		bzero(gws, sizeof(gws));
> +		bzero(ifs, sizeof(ifs));
> +		bzero(&route, sizeof(route));

This is all unnecessary.
It looks like you write OpenBSD security code.
Only a security person would zero stack variables before return
and only BSD people use bzero(), Linux style is to use memset.

^ permalink raw reply

* Re: [PATCH v2] xdp: Sample xdp program implementing ip forward
From: Stephen Hemminger @ 2017-10-10 17:19 UTC (permalink / raw)
  To: Christina Jacob
  Cc: netdev, linux-kernel, linux-arm-kernel, brouer, Sunil.Goutham,
	daniel, dsahern, Christina Jacob
In-Reply-To: <1507620532-25804-2-git-send-email-Christina.Jacob@cavium.com>

On Tue, 10 Oct 2017 12:58:52 +0530
Christina Jacob <christina.jacob.koikara@gmail.com> wrote:

> +/* Get the mac address of the interface given interface name */
> +static long *getmac(char *iface)
> +{
> +	int fd;
> +	struct ifreq ifr;
> +	long *mac = NULL;
> +
> +	fd = socket(AF_INET, SOCK_DGRAM, 0);
> +	ifr.ifr_addr.sa_family = AF_INET;
> +	strncpy(ifr.ifr_name, iface, IFNAMSIZ - 1);
> +	ioctl(fd, SIOCGIFHWADDR, &ifr);
> +	mac = (long *)ifr.ifr_hwaddr.sa_data;
> +	close(fd);
> +	return mac;

Always check return value of ioctl.
You are assuming sizeof(long) > 6 bytes.
Also the byte order.

^ permalink raw reply

* Re: [Patch net-next] tcp: add a tracepoint for tcp_retransmit_skb()
From: Cong Wang @ 2017-10-10 17:16 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Yuchung Cheng, Neal Cardwell
In-Reply-To: <CANn89i+vtkP+bisnTURSazuKwtjeYJjLba6wZOLg5AjsqjJPqQ@mail.gmail.com>

On Tue, Oct 10, 2017 at 4:21 AM, Eric Dumazet <edumazet@google.com> wrote:
> On Mon, Oct 9, 2017 at 10:35 PM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>> We need a real-time notification for tcp retransmission
>> for monitoring.
>
> Hi Cong
>
> This seems to not be well defined.
>
> For example, why a retransmission triggered by TLP is not traced ?

Oh, it should, seems I should just put the trace in __tcp_retransmit_skb()
instead...

I will update the patch after waiting for other feedbacks.

Thanks.

^ permalink raw reply

* Re: Patch "udp: perform source validation for mcast early demux" has been added to the 4.13-stable tree
From: David Miller @ 2017-10-10 17:09 UTC (permalink / raw)
  To: gregkh; +Cc: pabeni, stable, stable-commits, netdev
In-Reply-To: <20171010073143.GA19540@kroah.com>

From: Greg KH <gregkh@linuxfoundation.org>
Date: Tue, 10 Oct 2017 09:31:43 +0200

> On Tue, Oct 10, 2017 at 09:16:06AM +0200, Paolo Abeni wrote:
>> On Mon, 2017-10-09 at 10:54 +0200, Greg KH wrote:
>> > On Mon, Oct 09, 2017 at 10:02:14AM +0200, Paolo Abeni wrote:
>> > > On Mon, 2017-10-09 at 09:57 +0200, Greg KH wrote:
>> > > > On Mon, Oct 09, 2017 at 09:37:31AM +0200, Paolo Abeni wrote:
>> > > > > On Mon, 2017-10-09 at 09:35 +0200, gregkh@linuxfoundation.org wrote:
>> > > > > > This is a note to let you know that I've just added the patch titled
>> > > > > > 
>> > > > > >     udp: perform source validation for mcast early demux
>> > > > > > 
>> > > > > > to the 4.13-stable tree which can be found at:
>> > > > > >     http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
>> > > > > > 
>> > > > > > The filename of the patch is:
>> > > > > >      udp-perform-source-validation-for-mcast-early-demux.patch
>> > > > > > and it can be found in the queue-4.13 subdirectory.
>> > > > > > 
>> > > > > > If you, or anyone else, feels it should not be added to the stable tree,
>> > > > > > please let <stable@vger.kernel.org> know about it.
>> > > > > 
>> > > > > Please, keep this one on-hold. It needs a relevant follow-up I'm going
>> > > > > to post soon!
>> > > > 
>> > > > Can I keep the patch before this one in the series "IPv4: early demux
>> > > > can return an error code"?  Or should I hold off on both of these for
>> > > > now?
>> > > 
>> > > AFAIK the patch "IPv4: early demux can return an error code" does not
>> > > have any issue - it's just useless without this one - I guess it can
>> > > stay in.
>> > 
>> > Ok, I've now moved this one out, thanks for letting me know.
>> > 
>> > And if you happen to remember when/if a fix for this goes into the tree,
>> > that would be most helpful :)
>> 
>> Sure! the fix just entered Linus's tree: commit 996b44fcef8f ("udp: fix
>> bcast packet reception")
> 
> Great!  Dave, mind if I take this now, or do you want me to wait for the
> next round of networking patches.

Feel free to take this now, thanks!

^ permalink raw reply

* RE: [PATCH net 2/2] net/smc: dev_put for netdev after usage of ib_query_gid()
From: Parav Pandit @ 2017-10-10 17:09 UTC (permalink / raw)
  To: Ursula Braun, davem@davemloft.net
  Cc: netdev@vger.kernel.org, linux-rdma@vger.kernel.org,
	linux-s390@vger.kernel.org, jwi@linux.vnet.ibm.com,
	schwidefsky@de.ibm.com, heiko.carstens@de.ibm.com,
	raspl@linux.vnet.ibm.com
In-Reply-To: <20171010141351.87700-3-ubraun@linux.vnet.ibm.com>



> -----Original Message-----
> From: Ursula Braun [mailto:ubraun@linux.vnet.ibm.com]
> Sent: Tuesday, October 10, 2017 9:14 AM
> To: davem@davemloft.net
> Cc: netdev@vger.kernel.org; linux-rdma@vger.kernel.org; linux-
> s390@vger.kernel.org; jwi@linux.vnet.ibm.com; schwidefsky@de.ibm.com;
> heiko.carstens@de.ibm.com; raspl@linux.vnet.ibm.com;
> ubraun@linux.vnet.ibm.com; Parav Pandit <parav@mellanox.com>
> Subject: [PATCH net 2/2] net/smc: dev_put for netdev after usage of
> ib_query_gid()
> 
> For ROCEs ib_query_gid() takes a reference count on the net_device.
> This reference count must be decreased by the caller.
> 
> Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Reported-by: Parav Pandit <parav@mellanox.com>

> ---
>  net/smc/smc_core.c | 11 +++++++----
>  1 file changed, 7 insertions(+), 4 deletions(-)
> 
> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index
> 20b66e79c5d6..e93a31ec3cc2 100644
> --- a/net/smc/smc_core.c
> +++ b/net/smc/smc_core.c
> @@ -380,10 +380,13 @@ static int smc_link_determine_gid(struct
> smc_link_group *lgr)
>  		if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid,
>  				 &gattr))
>  			continue;
> -		if (gattr.ndev &&
> -		    (vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id)) {
> -			lnk->gid = gid;
> -			return 0;
> +		if (gattr.ndev) {
> +			if (vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id) {

This needs to be changed to 
If (gattr.ndev) {
	if (is_vlan_dev(gattr.ndev)) && 
	    vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id) {
	...
}

Without this check, on vlan disabled kernel at compile time, kernel will crash on vlan_dev_vlan_id().
Please fix this part, rest code looks fine.

^ permalink raw reply

* RE: [PATCH net 1/2] net/smc: replace function pointer get_netdev()
From: Parav Pandit @ 2017-10-10 17:01 UTC (permalink / raw)
  To: Ursula Braun, davem@davemloft.net
  Cc: netdev@vger.kernel.org, linux-rdma@vger.kernel.org,
	linux-s390@vger.kernel.org, jwi@linux.vnet.ibm.com,
	schwidefsky@de.ibm.com, heiko.carstens@de.ibm.com,
	raspl@linux.vnet.ibm.com
In-Reply-To: <20171010141351.87700-2-ubraun@linux.vnet.ibm.com>



> -----Original Message-----
> From: Ursula Braun [mailto:ubraun@linux.vnet.ibm.com]
> Sent: Tuesday, October 10, 2017 9:14 AM
> To: davem@davemloft.net
> Cc: netdev@vger.kernel.org; linux-rdma@vger.kernel.org; linux-
> s390@vger.kernel.org; jwi@linux.vnet.ibm.com; schwidefsky@de.ibm.com;
> heiko.carstens@de.ibm.com; raspl@linux.vnet.ibm.com;
> ubraun@linux.vnet.ibm.com; Parav Pandit <parav@mellanox.com>
> Subject: [PATCH net 1/2] net/smc: replace function pointer get_netdev()
> 
> SMC should not open code the function pointer get_netdev of the IB device.
> Replacing ib_query_gid(..., NULL) with ib_query_gid(..., gid_attr) allows access
> to the netdev.
> 
> Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
> Suggested-by: Parav Pandit <parav@mellanox.com>
Looks fine to me. Minor nit below.
Reviewed-by: Parav Pandit <parav@mellanox.com>

> ---
>  net/smc/smc_ib.c | 26 +++++++++-----------------
>  1 file changed, 9 insertions(+), 17 deletions(-)
> 
> diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index
> 0b5852299158..b428c0f6c782 100644
> --- a/net/smc/smc_ib.c
> +++ b/net/smc/smc_ib.c
> @@ -369,26 +369,17 @@ void smc_ib_buf_unmap_sg(struct smc_ib_device
> *smcibdev,
> 
>  static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
> {
> -	struct net_device *ndev;
> +	struct ib_gid_attr gattr;
>  	int rc;
> 
>  	rc = ib_query_gid(smcibdev->ibdev, ibport, 0,
> -			  &smcibdev->gid[ibport - 1], NULL);
> -	/* the SMC protocol requires specification of the roce MAC address;
> -	 * if net_device cannot be determined, it can be derived from gid 0
> -	 */
> -	ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport);
> -	if (ndev) {
> -		memcpy(&smcibdev->mac, ndev->dev_addr, ETH_ALEN);
> -		dev_put(ndev);
> -	} else if (!rc) {
> -		memcpy(&smcibdev->mac[ibport - 1][0],
> -		       &smcibdev->gid[ibport - 1].raw[8], 3);
> -		memcpy(&smcibdev->mac[ibport - 1][3],
> -		       &smcibdev->gid[ibport - 1].raw[13], 3);
> -		smcibdev->mac[ibport - 1][0] &= ~0x02;
> -	}
> -	return rc;
> +			  &smcibdev->gid[ibport - 1], &gattr);
> +	if (rc || !gattr.ndev)
> +		return -ENODEV;
> +
> +	memcpy(smcibdev->mac[ibport - 1], gattr.ndev->dev_addr, ETH_ALEN);
> +	dev_put(gattr.ndev);
> +	return 0;
>  }
> 
>  /* Create an identifier unique for this instance of SMC-R.
> @@ -419,6 +410,7 @@ int smc_ib_remember_port_attr(struct smc_ib_device
> *smcibdev, u8 ibport)
>  			   &smcibdev->pattr[ibport - 1]);
>  	if (rc)
>  		goto out;
> +	/* the SMC protocol requires specification of the roce MAC address */
RoCE MAC address

>  	rc = smc_ib_fill_gid_and_mac(smcibdev, ibport);
>  	if (rc)
>  		goto out;
> --
> 2.13.5

^ permalink raw reply

* [PATCH][net-next] ipv6: fix dereference of rt6_ex before null check error
From: Colin King @ 2017-10-10 17:01 UTC (permalink / raw)
  To: David S . Miller, Alexey Kuznetsov, Hideaki YOSHIFUJI, netdev
  Cc: kernel-janitors, linux-kernel

From: Colin Ian King <colin.king@canonical.com>

Currently rt6_ex is being dereferenced before it is null checked
hence there is a possible null dereference bug. Fix this by only
dereferencing rt6_ex after it has been null checked.

Detected by CoverityScan, CID#1457749 ("Dereference before null check")

Fixes: 81eb8447daae ("ipv6: take care of rt6_stats")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
 net/ipv6/route.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 606e80325b21..6db1541eaa7b 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1152,10 +1152,12 @@ static DEFINE_SPINLOCK(rt6_exception_lock);
 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
 				 struct rt6_exception *rt6_ex)
 {
-	struct net *net = dev_net(rt6_ex->rt6i->dst.dev);
+	struct net *net;

 	if (!bucket || !rt6_ex)
 		return;
+
+	net = dev_net(rt6_ex->rt6i->dst.dev);
 	rt6_ex->rt6i->rt6i_node = NULL;
 	hlist_del_rcu(&rt6_ex->hlist);
 	rt6_release(rt6_ex->rt6i);
-- 
2.14.1

^ permalink raw reply related

* Re: [PATCHv4 iproute2 2/2] lib/libnetlink: update rtnl_talk to support malloc buff at run time
From: Stephen Hemminger @ 2017-10-10 16:47 UTC (permalink / raw)
  To: Michal Kubecek; +Cc: Phil Sutter, Hangbin Liu, netdev, Hangbin Liu
In-Reply-To: <20171010064117.ipyarf5ml7fwnzdv@unicorn.suse.cz>

On Tue, 10 Oct 2017 08:41:17 +0200
Michal Kubecek <mkubecek@suse.cz> wrote:

> On Mon, Oct 09, 2017 at 10:25:25PM +0200, Phil Sutter wrote:
> > Hi Stephen,
> > 
> > On Mon, Oct 02, 2017 at 10:37:08AM -0700, Stephen Hemminger wrote:  
> > > On Thu, 28 Sep 2017 21:33:46 +0800
> > > Hangbin Liu <haliu@redhat.com> wrote:
> > >   
> > > > From: Hangbin Liu <liuhangbin@gmail.com>
> > > > 
> > > > This is an update for 460c03f3f3cc ("iplink: double the buffer size also in
> > > > iplink_get()"). After update, we will not need to double the buffer size
> > > > every time when VFs number increased.
> > > > 
> > > > With call like rtnl_talk(&rth, &req.n, NULL, 0), we can simply remove the
> > > > length parameter.
> > > > 
> > > > With call like rtnl_talk(&rth, nlh, nlh, sizeof(req), I add a new variable
> > > > answer to avoid overwrite data in nlh, because it may has more info after
> > > > nlh. also this will avoid nlh buffer not enough issue.
> > > > 
> > > > We need to free answer after using.
> > > > 
> > > > Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
> > > > Signed-off-by: Phil Sutter <phil@nwl.cc>
> > > > ---  
> > > 
> > > Most of the uses of rtnl_talk() don't need to this peek and dynamic sizing.
> > > Can only those places that need that be targeted?  
> > 
> > We could probably do that, by having a buffer on stack in __rtnl_talk()
> > which will be used instead of the allocated one if 'answer' is NULL. Or
> > maybe even introduce a dedicated API call for the dynamically allocated
> > receive buffer. But I really doubt that's feasible: AFAICT, that stack
> > buffer still needs to be reasonably sized since the reply might be
> > larger than the request (reusing the request buffer would be the most
> > simple way to tackle this), also there is support for extack which may
> > bloat the response to arbitrary size. Hangbin has shown in his benchmark
> > that the overhead of the second syscall is negligible, so why care about
> > that and increase code complexity even further?
> > 
> > Not saying it's not possible, but I just doubt it's worth the effort.  
> 
> Agreed. Current code is based on the assumption that we can estimate the
> maximum reply length in advance and the reason for this series is that
> this assumption turned out to be wrong. I'm afraid that if we replace
> it by an assumption that we can estimate the maximum reply length for
> most requests with only few exceptions, it's only matter of time for us
> to be proven wrong again.
> 
> Michal Kubecek
> 

For query responses, yes the response may be large. But for the common cases of
add address or add route, the response should just be ack or error.

^ permalink raw reply

* [RFC net-next 4/4] mlxsw: spectrum_router: Add extack message for RIF and VRF overflow
From: David Ahern @ 2017-10-10 16:41 UTC (permalink / raw)
  To: netdev; +Cc: jiri, idosch, kjlx, David Ahern
In-Reply-To: <1507653665-20540-1-git-send-email-dsahern@gmail.com>

Add extack argument down to mlxsw_sp_rif_create and mlxsw_sp_vr_create
to set an error message on RIF or VR overflow. Now an overflow of
either resource the use gets an informative message as opposed to
failing with EBUSY.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_router.c  | 114 +++++++++++++--------
 1 file changed, 69 insertions(+), 45 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 7d53fdf2c0a8..ec4d313b9eca 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -731,14 +731,17 @@ static struct mlxsw_sp_fib *mlxsw_sp_vr_fib(const struct mlxsw_sp_vr *vr,
 }
 
 static struct mlxsw_sp_vr *mlxsw_sp_vr_create(struct mlxsw_sp *mlxsw_sp,
-					      u32 tb_id)
+					      u32 tb_id,
+					      struct netlink_ext_ack *extack)
 {
 	struct mlxsw_sp_vr *vr;
 	int err;
 
 	vr = mlxsw_sp_vr_find_unused(mlxsw_sp);
-	if (!vr)
+	if (!vr) {
+		NL_SET_ERR_MSG(extack, "spectrum: Exceeded number of supported VRF");
 		return ERR_PTR(-EBUSY);
+	}
 	vr->fib4 = mlxsw_sp_fib_create(vr, MLXSW_SP_L3_PROTO_IPV4);
 	if (IS_ERR(vr->fib4))
 		return ERR_CAST(vr->fib4);
@@ -775,14 +778,15 @@ static void mlxsw_sp_vr_destroy(struct mlxsw_sp_vr *vr)
 	vr->fib4 = NULL;
 }
 
-static struct mlxsw_sp_vr *mlxsw_sp_vr_get(struct mlxsw_sp *mlxsw_sp, u32 tb_id)
+static struct mlxsw_sp_vr *mlxsw_sp_vr_get(struct mlxsw_sp *mlxsw_sp, u32 tb_id,
+					   struct netlink_ext_ack *extack)
 {
 	struct mlxsw_sp_vr *vr;
 
 	tb_id = mlxsw_sp_fix_tb_id(tb_id);
 	vr = mlxsw_sp_vr_find(mlxsw_sp, tb_id);
 	if (!vr)
-		vr = mlxsw_sp_vr_create(mlxsw_sp, tb_id);
+		vr = mlxsw_sp_vr_create(mlxsw_sp, tb_id, extack);
 	return vr;
 }
 
@@ -948,7 +952,8 @@ static u32 mlxsw_sp_ipip_dev_ul_tb_id(const struct net_device *ol_dev)
 
 static struct mlxsw_sp_rif *
 mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp,
-		    const struct mlxsw_sp_rif_params *params);
+		    const struct mlxsw_sp_rif_params *params,
+		    struct netlink_ext_ack *extack);
 
 static struct mlxsw_sp_rif_ipip_lb *
 mlxsw_sp_ipip_ol_ipip_lb_create(struct mlxsw_sp *mlxsw_sp,
@@ -966,7 +971,7 @@ mlxsw_sp_ipip_ol_ipip_lb_create(struct mlxsw_sp *mlxsw_sp,
 		.lb_config = ipip_ops->ol_loopback_config(mlxsw_sp, ol_dev),
 	};
 
-	rif = mlxsw_sp_rif_create(mlxsw_sp, &lb_params.common);
+	rif = mlxsw_sp_rif_create(mlxsw_sp, &lb_params.common, NULL);
 	if (IS_ERR(rif))
 		return ERR_CAST(rif);
 	return container_of(rif, struct mlxsw_sp_rif_ipip_lb, common);
@@ -3711,7 +3716,7 @@ mlxsw_sp_fib_node_get(struct mlxsw_sp *mlxsw_sp, u32 tb_id, const void *addr,
 	struct mlxsw_sp_vr *vr;
 	int err;
 
-	vr = mlxsw_sp_vr_get(mlxsw_sp, tb_id);
+	vr = mlxsw_sp_vr_get(mlxsw_sp, tb_id, NULL);
 	if (IS_ERR(vr))
 		return ERR_CAST(vr);
 	fib = mlxsw_sp_vr_fib(vr, proto);
@@ -4750,7 +4755,7 @@ static int mlxsw_sp_router_fibmr_add(struct mlxsw_sp *mlxsw_sp,
 	if (mlxsw_sp->router->aborted)
 		return 0;
 
-	vr = mlxsw_sp_vr_get(mlxsw_sp, men_info->tb_id);
+	vr = mlxsw_sp_vr_get(mlxsw_sp, men_info->tb_id, NULL);
 	if (IS_ERR(vr))
 		return PTR_ERR(vr);
 
@@ -4783,7 +4788,7 @@ mlxsw_sp_router_fibmr_vif_add(struct mlxsw_sp *mlxsw_sp,
 	if (mlxsw_sp->router->aborted)
 		return 0;
 
-	vr = mlxsw_sp_vr_get(mlxsw_sp, ven_info->tb_id);
+	vr = mlxsw_sp_vr_get(mlxsw_sp, ven_info->tb_id, NULL);
 	if (IS_ERR(vr))
 		return PTR_ERR(vr);
 
@@ -5346,7 +5351,8 @@ const struct net_device *mlxsw_sp_rif_dev(const struct mlxsw_sp_rif *rif)
 
 static struct mlxsw_sp_rif *
 mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp,
-		    const struct mlxsw_sp_rif_params *params)
+		    const struct mlxsw_sp_rif_params *params,
+		    struct netlink_ext_ack *extack)
 {
 	u32 tb_id = l3mdev_fib_table(params->dev);
 	const struct mlxsw_sp_rif_ops *ops;
@@ -5360,14 +5366,16 @@ mlxsw_sp_rif_create(struct mlxsw_sp *mlxsw_sp,
 	type = mlxsw_sp_dev_rif_type(mlxsw_sp, params->dev);
 	ops = mlxsw_sp->router->rif_ops_arr[type];
 
-	vr = mlxsw_sp_vr_get(mlxsw_sp, tb_id ? : RT_TABLE_MAIN);
+	vr = mlxsw_sp_vr_get(mlxsw_sp, tb_id ? : RT_TABLE_MAIN, extack);
 	if (IS_ERR(vr))
 		return ERR_CAST(vr);
 	vr->rif_count++;
 
 	err = mlxsw_sp_rif_index_alloc(mlxsw_sp, &rif_index);
-	if (err)
+	if (err) {
+		NL_SET_ERR_MSG(extack, "spectrum: Exceeded number of supported router interfaces");
 		goto err_rif_index_alloc;
+	}
 
 	rif = mlxsw_sp_rif_alloc(ops->rif_size, rif_index, vr->id, params->dev);
 	if (!rif) {
@@ -5454,7 +5462,8 @@ mlxsw_sp_rif_subport_params_init(struct mlxsw_sp_rif_params *params,
 
 static int
 mlxsw_sp_port_vlan_router_join(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan,
-			       struct net_device *l3_dev)
+			       struct net_device *l3_dev,
+			       struct netlink_ext_ack *extack)
 {
 	struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp_port_vlan->mlxsw_sp_port;
 	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
@@ -5470,7 +5479,7 @@ mlxsw_sp_port_vlan_router_join(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan,
 		};
 
 		mlxsw_sp_rif_subport_params_init(&params, mlxsw_sp_port_vlan);
-		rif = mlxsw_sp_rif_create(mlxsw_sp, &params);
+		rif = mlxsw_sp_rif_create(mlxsw_sp, &params, extack);
 		if (IS_ERR(rif))
 			return PTR_ERR(rif);
 	}
@@ -5525,7 +5534,8 @@ mlxsw_sp_port_vlan_router_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan)
 
 static int mlxsw_sp_inetaddr_port_vlan_event(struct net_device *l3_dev,
 					     struct net_device *port_dev,
-					     unsigned long event, u16 vid)
+					     unsigned long event, u16 vid,
+					     struct netlink_ext_ack *extack)
 {
 	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(port_dev);
 	struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan;
@@ -5537,7 +5547,7 @@ static int mlxsw_sp_inetaddr_port_vlan_event(struct net_device *l3_dev,
 	switch (event) {
 	case NETDEV_UP:
 		return mlxsw_sp_port_vlan_router_join(mlxsw_sp_port_vlan,
-						      l3_dev);
+						      l3_dev, extack);
 	case NETDEV_DOWN:
 		mlxsw_sp_port_vlan_router_leave(mlxsw_sp_port_vlan);
 		break;
@@ -5547,19 +5557,22 @@ static int mlxsw_sp_inetaddr_port_vlan_event(struct net_device *l3_dev,
 }
 
 static int mlxsw_sp_inetaddr_port_event(struct net_device *port_dev,
-					unsigned long event)
+					unsigned long event,
+					struct netlink_ext_ack *extack)
 {
 	if (netif_is_bridge_port(port_dev) ||
 	    netif_is_lag_port(port_dev) ||
 	    netif_is_ovs_port(port_dev))
 		return 0;
 
-	return mlxsw_sp_inetaddr_port_vlan_event(port_dev, port_dev, event, 1);
+	return mlxsw_sp_inetaddr_port_vlan_event(port_dev, port_dev, event, 1,
+						 extack);
 }
 
 static int __mlxsw_sp_inetaddr_lag_event(struct net_device *l3_dev,
 					 struct net_device *lag_dev,
-					 unsigned long event, u16 vid)
+					 unsigned long event, u16 vid,
+					 struct netlink_ext_ack *extack)
 {
 	struct net_device *port_dev;
 	struct list_head *iter;
@@ -5569,7 +5582,8 @@ static int __mlxsw_sp_inetaddr_lag_event(struct net_device *l3_dev,
 		if (mlxsw_sp_port_dev_check(port_dev)) {
 			err = mlxsw_sp_inetaddr_port_vlan_event(l3_dev,
 								port_dev,
-								event, vid);
+								event, vid,
+								extack);
 			if (err)
 				return err;
 		}
@@ -5579,16 +5593,19 @@ static int __mlxsw_sp_inetaddr_lag_event(struct net_device *l3_dev,
 }
 
 static int mlxsw_sp_inetaddr_lag_event(struct net_device *lag_dev,
-				       unsigned long event)
+				       unsigned long event,
+				       struct netlink_ext_ack *extack)
 {
 	if (netif_is_bridge_port(lag_dev))
 		return 0;
 
-	return __mlxsw_sp_inetaddr_lag_event(lag_dev, lag_dev, event, 1);
+	return __mlxsw_sp_inetaddr_lag_event(lag_dev, lag_dev, event, 1,
+					     extack);
 }
 
 static int mlxsw_sp_inetaddr_bridge_event(struct net_device *l3_dev,
-					  unsigned long event)
+					  unsigned long event,
+					  struct netlink_ext_ack *extack)
 {
 	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_lower_get(l3_dev);
 	struct mlxsw_sp_rif_params params = {
@@ -5598,7 +5615,7 @@ static int mlxsw_sp_inetaddr_bridge_event(struct net_device *l3_dev,
 
 	switch (event) {
 	case NETDEV_UP:
-		rif = mlxsw_sp_rif_create(mlxsw_sp, &params);
+		rif = mlxsw_sp_rif_create(mlxsw_sp, &params, extack);
 		if (IS_ERR(rif))
 			return PTR_ERR(rif);
 		break;
@@ -5612,7 +5629,8 @@ static int mlxsw_sp_inetaddr_bridge_event(struct net_device *l3_dev,
 }
 
 static int mlxsw_sp_inetaddr_vlan_event(struct net_device *vlan_dev,
-					unsigned long event)
+					unsigned long event,
+					struct netlink_ext_ack *extack)
 {
 	struct net_device *real_dev = vlan_dev_real_dev(vlan_dev);
 	u16 vid = vlan_dev_vlan_id(vlan_dev);
@@ -5622,27 +5640,28 @@ static int mlxsw_sp_inetaddr_vlan_event(struct net_device *vlan_dev,
 
 	if (mlxsw_sp_port_dev_check(real_dev))
 		return mlxsw_sp_inetaddr_port_vlan_event(vlan_dev, real_dev,
-							 event, vid);
+							 event, vid, extack);
 	else if (netif_is_lag_master(real_dev))
 		return __mlxsw_sp_inetaddr_lag_event(vlan_dev, real_dev, event,
-						     vid);
+						     vid, extack);
 	else if (netif_is_bridge_master(real_dev) && br_vlan_enabled(real_dev))
-		return mlxsw_sp_inetaddr_bridge_event(vlan_dev, event);
+		return mlxsw_sp_inetaddr_bridge_event(vlan_dev, event, extack);
 
 	return 0;
 }
 
 static int __mlxsw_sp_inetaddr_event(struct net_device *dev,
-				     unsigned long event)
+				     unsigned long event,
+				     struct netlink_ext_ack *extack)
 {
 	if (mlxsw_sp_port_dev_check(dev))
-		return mlxsw_sp_inetaddr_port_event(dev, event);
+		return mlxsw_sp_inetaddr_port_event(dev, event, extack);
 	else if (netif_is_lag_master(dev))
-		return mlxsw_sp_inetaddr_lag_event(dev, event);
+		return mlxsw_sp_inetaddr_lag_event(dev, event, extack);
 	else if (netif_is_bridge_master(dev))
-		return mlxsw_sp_inetaddr_bridge_event(dev, event);
+		return mlxsw_sp_inetaddr_bridge_event(dev, event, extack);
 	else if (is_vlan_dev(dev))
-		return mlxsw_sp_inetaddr_vlan_event(dev, event);
+		return mlxsw_sp_inetaddr_vlan_event(dev, event, extack);
 	else
 		return 0;
 }
@@ -5668,7 +5687,7 @@ int mlxsw_sp_inetaddr_event(struct notifier_block *unused,
 	if (!mlxsw_sp_rif_should_config(rif, dev, event))
 		goto out;
 
-	err = __mlxsw_sp_inetaddr_event(dev, event);
+	err = __mlxsw_sp_inetaddr_event(dev, event, NULL);
 out:
 	return notifier_from_errno(err);
 }
@@ -5691,7 +5710,7 @@ int mlxsw_sp_inetaddr_valid_event(struct notifier_block *unused,
 	if (!mlxsw_sp_rif_should_config(rif, dev, event))
 		goto out;
 
-	err = __mlxsw_sp_inetaddr_event(dev, event);
+	err = __mlxsw_sp_inetaddr_event(dev, event, ivi->extack);
 out:
 	return notifier_from_errno(err);
 }
@@ -5720,7 +5739,7 @@ static void mlxsw_sp_inet6addr_event_work(struct work_struct *work)
 	if (!mlxsw_sp_rif_should_config(rif, dev, event))
 		goto out;
 
-	__mlxsw_sp_inetaddr_event(dev, event);
+	__mlxsw_sp_inetaddr_event(dev, event, NULL);
 out:
 	rtnl_unlock();
 	dev_put(dev);
@@ -5772,7 +5791,7 @@ int mlxsw_sp_inet6addr_valid_event(struct notifier_block *unused,
 	if (!mlxsw_sp_rif_should_config(rif, dev, event))
 		goto out;
 
-	err = __mlxsw_sp_inetaddr_event(dev, event);
+	err = __mlxsw_sp_inetaddr_event(dev, event, i6vi->extack);
 out:
 	return notifier_from_errno(err);
 }
@@ -5849,7 +5868,8 @@ int mlxsw_sp_netdevice_router_port_event(struct net_device *dev)
 }
 
 static int mlxsw_sp_port_vrf_join(struct mlxsw_sp *mlxsw_sp,
-				  struct net_device *l3_dev)
+				  struct net_device *l3_dev,
+				  struct netlink_ext_ack *extack)
 {
 	struct mlxsw_sp_rif *rif;
 
@@ -5858,9 +5878,9 @@ static int mlxsw_sp_port_vrf_join(struct mlxsw_sp *mlxsw_sp,
 	 */
 	rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, l3_dev);
 	if (rif)
-		__mlxsw_sp_inetaddr_event(l3_dev, NETDEV_DOWN);
+		__mlxsw_sp_inetaddr_event(l3_dev, NETDEV_DOWN, extack);
 
-	return __mlxsw_sp_inetaddr_event(l3_dev, NETDEV_UP);
+	return __mlxsw_sp_inetaddr_event(l3_dev, NETDEV_UP, extack);
 }
 
 static void mlxsw_sp_port_vrf_leave(struct mlxsw_sp *mlxsw_sp,
@@ -5871,7 +5891,7 @@ static void mlxsw_sp_port_vrf_leave(struct mlxsw_sp *mlxsw_sp,
 	rif = mlxsw_sp_rif_find_by_dev(mlxsw_sp, l3_dev);
 	if (!rif)
 		return;
-	__mlxsw_sp_inetaddr_event(l3_dev, NETDEV_DOWN);
+	__mlxsw_sp_inetaddr_event(l3_dev, NETDEV_DOWN, NULL);
 }
 
 int mlxsw_sp_netdevice_vrf_event(struct net_device *l3_dev, unsigned long event,
@@ -5887,10 +5907,14 @@ int mlxsw_sp_netdevice_vrf_event(struct net_device *l3_dev, unsigned long event,
 	case NETDEV_PRECHANGEUPPER:
 		return 0;
 	case NETDEV_CHANGEUPPER:
-		if (info->linking)
-			err = mlxsw_sp_port_vrf_join(mlxsw_sp, l3_dev);
-		else
+		if (info->linking) {
+			struct netlink_ext_ack *extack;
+
+			extack = netdev_notifier_info_to_extack(&info->info);
+			err = mlxsw_sp_port_vrf_join(mlxsw_sp, l3_dev, extack);
+		} else {
 			mlxsw_sp_port_vrf_leave(mlxsw_sp, l3_dev);
+		}
 		break;
 	}
 
@@ -6197,7 +6221,7 @@ mlxsw_sp_rif_ipip_lb_configure(struct mlxsw_sp_rif *rif)
 	struct mlxsw_sp_vr *ul_vr;
 	int err;
 
-	ul_vr = mlxsw_sp_vr_get(mlxsw_sp, ul_tb_id);
+	ul_vr = mlxsw_sp_vr_get(mlxsw_sp, ul_tb_id, NULL);
 	if (IS_ERR(ul_vr))
 		return PTR_ERR(ul_vr);
 
-- 
2.1.4

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox