Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net] tcp: Avoid preprocessor directives in tracepoint macro args
From: Mat Martineau @ 2017-12-21 18:29 UTC (permalink / raw)
  To: netdev; +Cc: Mat Martineau, David Ahern

Using a preprocessor directive to check for CONFIG_IPV6 in the middle of
a DECLARE_EVENT_CLASS macro's arg list causes sparse to report a series
of errors:

./include/trace/events/tcp.h:68:1: error: directive in argument list
./include/trace/events/tcp.h:75:1: error: directive in argument list
./include/trace/events/tcp.h:144:1: error: directive in argument list
./include/trace/events/tcp.h:151:1: error: directive in argument list
./include/trace/events/tcp.h:216:1: error: directive in argument list
./include/trace/events/tcp.h:223:1: error: directive in argument list
./include/trace/events/tcp.h:274:1: error: directive in argument list
./include/trace/events/tcp.h:281:1: error: directive in argument list

Once sparse finds an error, it stops printing warnings for the file it
is checking. This masks any sparse warnings that would normally be
reported for the core TCP code.

Instead, handle the preprocessor conditionals in a couple of auxiliary
macros. This also has the benefit of reducing duplicate code.

Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
---
 include/trace/events/tcp.h | 97 ++++++++++++++++++----------------------------
 1 file changed, 37 insertions(+), 60 deletions(-)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 07cccca6cbf1..ab34c561f26b 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -25,6 +25,35 @@
 		tcp_state_name(TCP_CLOSING),		\
 		tcp_state_name(TCP_NEW_SYN_RECV))
 
+#define TP_STORE_V4MAPPED(__entry, saddr, daddr)		\
+	do {							\
+		struct in6_addr *pin6;				\
+								\
+		pin6 = (struct in6_addr *)__entry->saddr_v6;	\
+		ipv6_addr_set_v4mapped(saddr, pin6);		\
+		pin6 = (struct in6_addr *)__entry->daddr_v6;	\
+		ipv6_addr_set_v4mapped(daddr, pin6);		\
+	} while (0)
+
+#if IS_ENABLED(CONFIG_IPV6)
+#define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6)		\
+	do {								\
+		if (sk->sk_family == AF_INET6) {			\
+			struct in6_addr *pin6;				\
+									\
+			pin6 = (struct in6_addr *)__entry->saddr_v6;	\
+			*pin6 = saddr6;					\
+			pin6 = (struct in6_addr *)__entry->daddr_v6;	\
+			*pin6 = daddr6;					\
+		} else {						\
+			TP_STORE_V4MAPPED(__entry, saddr, daddr);	\
+		}							\
+	} while (0)
+#else
+#define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6)	\
+	TP_STORE_V4MAPPED(__entry, saddr, daddr)
+#endif
+
 /*
  * tcp event with arguments sk and skb
  *
@@ -50,7 +79,6 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb,
 
 	TP_fast_assign(
 		struct inet_sock *inet = inet_sk(sk);
-		struct in6_addr *pin6;
 		__be32 *p32;
 
 		__entry->skbaddr = skb;
@@ -65,20 +93,8 @@ DECLARE_EVENT_CLASS(tcp_event_sk_skb,
 		p32 = (__be32 *) __entry->daddr;
 		*p32 =  inet->inet_daddr;
 
-#if IS_ENABLED(CONFIG_IPV6)
-		if (sk->sk_family == AF_INET6) {
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			*pin6 = sk->sk_v6_rcv_saddr;
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			*pin6 = sk->sk_v6_daddr;
-		} else
-#endif
-		{
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			ipv6_addr_set_v4mapped(inet->inet_saddr, pin6);
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			ipv6_addr_set_v4mapped(inet->inet_daddr, pin6);
-		}
+		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+			      sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
 	),
 
 	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c",
@@ -127,7 +143,6 @@ DECLARE_EVENT_CLASS(tcp_event_sk,
 
 	TP_fast_assign(
 		struct inet_sock *inet = inet_sk(sk);
-		struct in6_addr *pin6;
 		__be32 *p32;
 
 		__entry->skaddr = sk;
@@ -141,20 +156,8 @@ DECLARE_EVENT_CLASS(tcp_event_sk,
 		p32 = (__be32 *) __entry->daddr;
 		*p32 =  inet->inet_daddr;
 
-#if IS_ENABLED(CONFIG_IPV6)
-		if (sk->sk_family == AF_INET6) {
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			*pin6 = sk->sk_v6_rcv_saddr;
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			*pin6 = sk->sk_v6_daddr;
-		} else
-#endif
-		{
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			ipv6_addr_set_v4mapped(inet->inet_saddr, pin6);
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			ipv6_addr_set_v4mapped(inet->inet_daddr, pin6);
-		}
+		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+			       sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
 	),
 
 	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c",
@@ -197,7 +200,6 @@ TRACE_EVENT(tcp_set_state,
 
 	TP_fast_assign(
 		struct inet_sock *inet = inet_sk(sk);
-		struct in6_addr *pin6;
 		__be32 *p32;
 
 		__entry->skaddr = sk;
@@ -213,20 +215,8 @@ TRACE_EVENT(tcp_set_state,
 		p32 = (__be32 *) __entry->daddr;
 		*p32 =  inet->inet_daddr;
 
-#if IS_ENABLED(CONFIG_IPV6)
-		if (sk->sk_family == AF_INET6) {
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			*pin6 = sk->sk_v6_rcv_saddr;
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			*pin6 = sk->sk_v6_daddr;
-		} else
-#endif
-		{
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			ipv6_addr_set_v4mapped(inet->inet_saddr, pin6);
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			ipv6_addr_set_v4mapped(inet->inet_daddr, pin6);
-		}
+		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+			       sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
 	),
 
 	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s",
@@ -256,7 +246,6 @@ TRACE_EVENT(tcp_retransmit_synack,
 
 	TP_fast_assign(
 		struct inet_request_sock *ireq = inet_rsk(req);
-		struct in6_addr *pin6;
 		__be32 *p32;
 
 		__entry->skaddr = sk;
@@ -271,20 +260,8 @@ TRACE_EVENT(tcp_retransmit_synack,
 		p32 = (__be32 *) __entry->daddr;
 		*p32 = ireq->ir_rmt_addr;
 
-#if IS_ENABLED(CONFIG_IPV6)
-		if (sk->sk_family == AF_INET6) {
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			*pin6 = ireq->ir_v6_loc_addr;
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			*pin6 = ireq->ir_v6_rmt_addr;
-		} else
-#endif
-		{
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			ipv6_addr_set_v4mapped(ireq->ir_loc_addr, pin6);
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			ipv6_addr_set_v4mapped(ireq->ir_rmt_addr, pin6);
-		}
+		TP_STORE_ADDRS(__entry, ireq->ir_loc_addr, ireq->ir_rmt_addr,
+			      ireq->ir_v6_loc_addr, ireq->ir_v6_rmt_addr);
 	),
 
 	TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c",
-- 
2.15.1

^ permalink raw reply related

* [PATCH net-next] tcp: md5: Handle RCU dereference of md5sig_info
From: Mat Martineau @ 2017-12-21 18:29 UTC (permalink / raw)
  To: netdev; +Cc: Mat Martineau
In-Reply-To: <20171221182910.4785-1-mathew.j.martineau@linux.intel.com>

Dereference tp->md5sig_info in tcp_v4_destroy_sock() the same way it is
done in the adjacent call to tcp_clear_md5_list().

Resolves this sparse warning:

net/ipv4/tcp_ipv4.c:1914:17: warning: incorrect type in argument 1 (different address spaces)
net/ipv4/tcp_ipv4.c:1914:17:    expected struct callback_head *head
net/ipv4/tcp_ipv4.c:1914:17:    got struct callback_head [noderef] <asn:4>*<noident>

Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
---
 net/ipv4/tcp_ipv4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index dd945b114215..5d203248123e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1911,7 +1911,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
 	/* Clean up the MD5 key list, if any */
 	if (tp->md5sig_info) {
 		tcp_clear_md5_list(sk);
-		kfree_rcu(tp->md5sig_info, rcu);
+		kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
 		tp->md5sig_info = NULL;
 	}
 #endif
-- 
2.15.1

^ permalink raw reply related

* [PATCH next-queue] ixgbe: no ipsec offload for 82598
From: Shannon Nelson @ 2017-12-21 18:21 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher; +Cc: steffen.klassert, netdev

Don't try to set up ipsec offload on the oldest part of
the ixgbe family.

Suggested-by: Yanjun Zhu <yanjun.zhu@oracle.com>
Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 424dbf7..12c7132 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -863,6 +863,9 @@ void ixgbe_init_ipsec_offload(struct ixgbe_adapter *adapter)
 	struct ixgbe_ipsec *ipsec;
 	size_t size;
 
+	if (adapter->hw.mac.type == ixgbe_mac_82598EB)
+		return;
+
 	ipsec = kzalloc(sizeof(*ipsec), GFP_KERNEL);
 	if (!ipsec)
 		goto err1;
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH v3 3/3] net: ibm: emac: support RGMII-[RX|TX]ID phymode
From: David Miller @ 2017-12-21 18:10 UTC (permalink / raw)
  To: chunkeey; +Cc: netdev, andrew, christophe.jaillet
In-Reply-To: <de458d66be6e804a45e1bd96e57aa5907bb98e03.1513806256.git.chunkeey@gmail.com>

From: Christian Lamparter <chunkeey@gmail.com>
Date: Wed, 20 Dec 2017 23:01:50 +0100

> The RGMII spec allows compliance for devices that implement an internal
> delay on TXC and/or RXC inside the transmitter. This patch adds the
> necessary RGMII_[RX|TX]ID mode code to handle such PHYs with the
> emac driver.
> 
> Signed-off-by: Christian Lamparter <chunkeey@gmail.com>

Applied to net-next.

^ permalink raw reply

* Re: [PATCH v3 2/3] net: ibm: emac: replace custom PHY_MODE_* macros
From: David Miller @ 2017-12-21 18:10 UTC (permalink / raw)
  To: chunkeey; +Cc: netdev, andrew, christophe.jaillet
In-Reply-To: <2cb74d50c22d01873d1d976ec384917dc799be08.1513806256.git.chunkeey@gmail.com>

From: Christian Lamparter <chunkeey@gmail.com>
Date: Wed, 20 Dec 2017 23:01:49 +0100

> The ibm_emac driver predates the PHY_INTERFACE_MODE_*
> enums by a few years.
> 
> And while the driver has been retrofitted to use the PHYLIB,
> the old definitions have stuck around to this day.
> 
> This patch replaces all occurrences of PHY_MODE_* with
> the respective equivalent PHY_INTERFACE_MODE_* enum.
> And finally, it purges the old macros for good.
> 
> Signed-off-by: Christian Lamparter <chunkeey@gmail.com>

Applied to net-next.

^ permalink raw reply

* Re: [PATCH v3 1/3] net: ibm: emac: replace custom rgmii_mode_name with phy_modes
From: David Miller @ 2017-12-21 18:10 UTC (permalink / raw)
  To: chunkeey; +Cc: netdev, andrew, christophe.jaillet
In-Reply-To: <a9482f4f4037f6eb732de327290a432539648bcd.1513806256.git.chunkeey@gmail.com>

From: Christian Lamparter <chunkeey@gmail.com>
Date: Wed, 20 Dec 2017 23:01:48 +0100

> phy_modes() in the common phy.h already defines the same phy mode
> names in lower case. The deleted rgmii_mode_name() is used only
> in one place and for a "notice-level" printk. Hence, it will not
> be missed.
> 
> Signed-off-by: Christian Lamparter <chunkeey@gmail.com>

Applied to net-next.

^ permalink raw reply

* Re: [PATCH v3 next-queue 00/10] ixgbe: Add ipsec offload
From: Shannon Nelson @ 2017-12-21 17:55 UTC (permalink / raw)
  To: Yanjun Zhu, intel-wired-lan, jeffrey.t.kirsher
  Cc: steffen.klassert, sowmini.varadhan, netdev
In-Reply-To: <e253e271-eda9-8707-af57-3a5cf33cb097@oracle.com>

On 12/20/2017 11:09 PM, Yanjun Zhu wrote:
> On 2017/12/21 14:39, Yanjun Zhu wrote:
>> On 2017/12/20 7:59, Shannon Nelson wrote:
>>> This is an implementation of the ipsec hardware offload feature for
>>> the ixgbe driver and Intel's 10Gbe series NICs: x540, x550, 82599.
>> Hi, Nelson
>>
>> I notice that the ipsec feature is based on x540, x550, 82599. But 
>> this ixgbe driver
>> will also work with 82598.
>>
>> Does this ipsec feature also work with 82598?
> Sorry. I mean, after these ipsec patches are applied, whether ipsec 
> offload enabled or not,
> can this ixgbe driver still work well with 82598?

Hmm... I don't have one to test on, but I suspect the 82598 might not be 
happy with this.  I'll send a followup patch to catch this case.

Thanks!
sln


> 
> Zhu Yanjun
>>
>> Thanks a lot.
>> Zhu Yanjun
>>> These patches apply to net-next v4.14 as well as Jeff Kirsher's 
>>> next-queue
>>> v4.15-rc1-206-ge47375b.
>>>
>>> The ixgbe NICs support ipsec offload for 1024 Rx and 1024 Tx Security
>>> Associations (SAs), using up to 128 inbound IP addresses, and using the
>>> rfc4106(gcm(aes)) encryption.  This code does not yet support IPv6,
>>> checksum offload, or TSO in conjunction with the ipsec offload - those
>>> will be added in the future.
>>>
>>> This code shows improvements in both packet throughput and CPU 
>>> utilization.
>>> For example, here are some quicky numbers that show the magnitude of the
>>> performance gain on a single run of "iperf -c <dest>" with the ipsec
>>> offload on both ends of a point-to-point connection:
>>>
>>>     9.4 Gbps - normal case
>>>     7.6 Gbps - ipsec with offload
>>>     343 Mbps - ipsec no offload
>>>
>>> To set up a similar test case, you first need to be sure you have a 
>>> recent
>>> version of iproute2 that supports the ipsec offload tag, probably 
>>> something
>>> from ip 4.12 or newer would be best.  I have a shell script that builds
>>> up the appropriate commands for me, but here are the resulting commands
>>> for all tcp traffic between 14.0.0.52 and 14.0.0.70:
>>>
>>> For the left side (14.0.0.52):
>>>    ip x p add dir out src 14.0.0.52/24 dst 14.0.0.70/24 proto tcp tmpl \
>>>       proto esp src 14.0.0.52 dst 14.0.0.70 spi 0x07 mode transport 
>>> reqid 0x07
>>>    ip x p add dir in src 14.0.0.70/24 dst 14.0.0.52/24 proto tcp tmpl \
>>>       proto esp dst 14.0.0.52 src 14.0.0.70 spi 0x07 mode transport 
>>> reqid 0x07
>>>    ip x s add proto esp src 14.0.0.52 dst 14.0.0.70 spi 0x07 mode 
>>> transport \
>>>       reqid 0x07 replay-window 32 \
>>>       aead 'rfc4106(gcm(aes))' 
>>> 0x44434241343332312423222114131211f4f3f2f1 128 \
>>>       sel src 14.0.0.52/24 dst 14.0.0.70/24 proto tcp offload dev 
>>> eth4 dir out
>>>    ip x s add proto esp dst 14.0.0.52 src 14.0.0.70 spi 0x07 mode 
>>> transport \
>>>       reqid 0x07 replay-window 32 \
>>>       aead 'rfc4106(gcm(aes))' 
>>> 0x44434241343332312423222114131211f4f3f2f1 128 \
>>>       sel src 14.0.0.70/24 dst 14.0.0.52/24 proto tcp offload dev 
>>> eth4 dir in
>>>   For the right side (14.0.0.70):
>>>    ip x p add dir out src 14.0.0.70/24 dst 14.0.0.52/24 proto tcp tmpl \
>>>       proto esp src 14.0.0.70 dst 14.0.0.52 spi 0x07 mode transport 
>>> reqid 0x07
>>>    ip x p add dir in src 14.0.0.52/24 dst 14.0.0.70/24 proto tcp tmpl \
>>>       proto esp dst 14.0.0.70 src 14.0.0.52 spi 0x07 mode transport 
>>> reqid 0x07
>>>    ip x s add proto esp src 14.0.0.70 dst 14.0.0.52 spi 0x07 mode 
>>> transport \
>>>       reqid 0x07 replay-window 32 \
>>>       aead 'rfc4106(gcm(aes))' 
>>> 0x44434241343332312423222114131211f4f3f2f1 128 \
>>>       sel src 14.0.0.70/24 dst 14.0.0.52/24 proto tcp offload dev 
>>> eth4 dir out
>>>    ip x s add proto esp dst 14.0.0.70 src 14.0.0.52 spi 0x07 mode 
>>> transport \
>>>       reqid 0x07 replay-window 32 \
>>>       aead 'rfc4106(gcm(aes))' 
>>> 0x44434241343332312423222114131211f4f3f2f1 128 \
>>>       sel src 14.0.0.52/24 dst 14.0.0.70/24 proto tcp offload dev 
>>> eth4 dir in
>>>
>>> In both cases, the command "ip x s flush ; ip x p flush" will clean
>>> it all out and remove the offloads.
>>>
>>> Lastly, thanks to Alex Duyck for his early comments.
>>>
>>> Please see the individual patches for specific update info.
>>>
>>> v3: fixes after comments from those wonderfully pesky kbuild robots
>>> v2: fixes after comments from Alex
>>>
>>> Shannon Nelson (10):
>>>    ixgbe: clean up ipsec defines
>>>    ixgbe: add ipsec register access routines
>>>    ixgbe: add ipsec engine start and stop routines
>>>    ixgbe: add ipsec data structures
>>>    ixgbe: add ipsec offload add and remove SA
>>>    ixgbe: restore offloaded SAs after a reset
>>>    ixgbe: process the Rx ipsec offload
>>>    ixgbe: process the Tx ipsec offload
>>>    ixgbe: ipsec offload stats
>>>    ixgbe: register ipsec offload with the xfrm subsystem
>>>
>>>   drivers/net/ethernet/intel/ixgbe/Makefile        |   1 +
>>>   drivers/net/ethernet/intel/ixgbe/ixgbe.h         |  33 +-
>>>   drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c |   2 +
>>>   drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c   | 923 
>>> +++++++++++++++++++++++
>>>   drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.h   |  92 +++
>>>   drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c     |   4 +-
>>>   drivers/net/ethernet/intel/ixgbe/ixgbe_main.c    |  39 +-
>>>   drivers/net/ethernet/intel/ixgbe/ixgbe_type.h    |  22 +-
>>>   8 files changed, 1093 insertions(+), 23 deletions(-)
>>>   create mode 100644 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
>>>   create mode 100644 drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.h
>>>
>>
>>
> 

^ permalink raw reply

* Re: [PATCH net V3] net: reevalulate autoflowlabel setting after sysctl setting
From: David Miller @ 2017-12-21 18:07 UTC (permalink / raw)
  To: shli; +Cc: netdev, Kernel-team, shli, kafai, eric.dumazet, tom
In-Reply-To: <321216a522a3b46e77125a5b9df41c2b64821cf3.1513799711.git.shli@fb.com>

From: Shaohua Li <shli@kernel.org>
Date: Wed, 20 Dec 2017 12:10:21 -0800

> From: Shaohua Li <shli@fb.com>
> 
> sysctl.ip6.auto_flowlabels is default 1. In our hosts, we set it to 2.
> If sockopt doesn't set autoflowlabel, outcome packets from the hosts are
> supposed to not include flowlabel. This is true for normal packet, but
> not for reset packet.
> 
> The reason is ipv6_pinfo.autoflowlabel is set in sock creation. Later if
> we change sysctl.ip6.auto_flowlabels, the ipv6_pinfo.autoflowlabel isn't
> changed, so the sock will keep the old behavior in terms of auto
> flowlabel. Reset packet is suffering from this problem, because reset
> packet is sent from a special control socket, which is created at boot
> time. Since sysctl.ipv6.auto_flowlabels is 1 by default, the control
> socket will always have its ipv6_pinfo.autoflowlabel set, even after
> user set sysctl.ipv6.auto_flowlabels to 1, so reset packet will always
> have flowlabel. Normal sock created before sysctl setting suffers from
> the same issue. We can't even turn off autoflowlabel unless we kill all
> socks in the hosts.
> 
> To fix this, if IPV6_AUTOFLOWLABEL sockopt is used, we use the
> autoflowlabel setting from user, otherwise we always call
> ip6_default_np_autolabel() which has the new settings of sysctl.
> 
> Note, this changes behavior a little bit. Before commit 42240901f7c4
> (ipv6: Implement different admin modes for automatic flow labels), the
> autoflowlabel behavior of a sock isn't sticky, eg, if sysctl changes,
> existing connection will change autoflowlabel behavior. After that
> commit, autoflowlabel behavior is sticky in the whole life of the sock.
> With this patch, the behavior isn't sticky again.
> 
> Cc: Martin KaFai Lau <kafai@fb.com>
> Cc: Eric Dumazet <eric.dumazet@gmail.com>
> Cc: Tom Herbert <tom@quantonium.net>
> Signed-off-by: Shaohua Li <shli@fb.com>

This looks a lot better, applied, thanks.

^ permalink raw reply

* Re: [PATCH net v3] openvswitch: Fix pop_vlan action for double tagged frames
From: David Miller @ 2017-12-21 18:05 UTC (permalink / raw)
  To: e; +Cc: netdev, ovs-dev, jbenc
In-Reply-To: <20171220200922.29415-1-e@erig.me>

From: Eric Garver <e@erig.me>
Date: Wed, 20 Dec 2017 15:09:22 -0500

> skb_vlan_pop() expects skb->protocol to be a valid TPID for double
> tagged frames. So set skb->protocol to the TPID and let skb_vlan_pop()
> shift the true ethertype into position for us.
> 
> Fixes: 5108bbaddc37 ("openvswitch: add processing of L3 packets")
> Signed-off-by: Eric Garver <e@erig.me>

Applied and queued up for -stable, thanks.

^ permalink raw reply

* Re: [PATCH net v3] openvswitch: Fix pop_vlan action for double tagged frames
From: David Miller @ 2017-12-21 18:05 UTC (permalink / raw)
  To: e; +Cc: netdev, ovs-dev, jbenc
In-Reply-To: <20171220200922.29415-1-e@erig.me>

From: Eric Garver <e@erig.me>
Date: Wed, 20 Dec 2017 15:09:22 -0500

> skb_vlan_pop() expects skb->protocol to be a valid TPID for double
> tagged frames. So set skb->protocol to the TPID and let skb_vlan_pop()
> shift the true ethertype into position for us.
> 
> Fixes: 5108bbaddc37 ("openvswitch: add processing of L3 packets")
> Signed-off-by: Eric Garver <e@erig.me>

Applied and queued up for -stable, thanks.

^ permalink raw reply

* Re: [PATCHv4 net-next 00/14] net: sched: sch: introduce extack support
From: David Miller @ 2017-12-21 17:42 UTC (permalink / raw)
  To: aring; +Cc: jhs, xiyou.wangcong, jiri, netdev, kernel, dsahern
In-Reply-To: <20171220173524.25874-1-aring@mojatatu.com>

From: Alexander Aring <aring@mojatatu.com>
Date: Wed, 20 Dec 2017 12:35:10 -0500

> This patch series basically adds support for extack in common qdisc handling.
> Additionally, it adds an extack pointer to the common qdisc callback handling;
> this lets each qdisc implementation set the extack message for each
> failure over netlink.

Series applied.

^ permalink raw reply

* Re: [PATCH 1/3] net: Fix possible race in peernet2id_alloc()
From: Eric W. Biederman @ 2017-12-21 17:39 UTC (permalink / raw)
  To: Kirill Tkhai; +Cc: netdev, davem, eric.dumazet
In-Reply-To: <151386201910.3724.7199367937841370542.stgit@localhost.localdomain>

Kirill Tkhai <ktkhai@virtuozzo.com> writes:

> peernet2id_alloc() is racy without rtnl_lock() as atomic_read(&peer->count)
> under net->nsid_lock does not guarantee, peer is alive:
>
> rcu_read_lock()
> peernet2id_alloc()                            ..
>   spin_lock_bh(&net->nsid_lock)               ..
>   atomic_read(&peer->count) == 1              ..
>   ..                                          put_net()
>   ..                                            cleanup_net()
>   ..                                              for_each_net(tmp)
>   ..                                                spin_lock_bh(&tmp->nsid_lock)
>   ..                                                __peernet2id(tmp, net) == -1
>   ..                                                    ..
>   ..                                                    ..
>     __peernet2id_alloc(alloc == true)                   ..
>   ..                                                    ..
> rcu_read_unlock()                                       ..
> ..                                                synchronize_rcu()
> ..                                                kmem_cache_free(net)
>
> After the above situation, net::netns_id contains id pointing to freed memory,
> and any other dereferencing by the id will operate with this freed memory.
>
> Currently, peernet2id_alloc() is used under rtnl_lock() everywhere except
> ovs_vport_cmd_fill_info(), and this race can't occur. But peernet2id_alloc()
> is a generic interface, and we had better fix it before someone really starts
> using it in the wrong context.

So it comes down to this piece of code from ovs and just let me say ick.
	if (!net_eq(net, dev_net(vport->dev))) {
		int id = peernet2id_alloc(net, dev_net(vport->dev));

		if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id))
			goto nla_put_failure;
	}

Without the rtnl lock dev_net can change between the test and the
call of peernet2id_alloc.

At first glance it looks like the bug is that we are running a control
path of the networking stack without the rtnl lock. So it may be that
ASSERT_RTNL() is the better fix.

Given that it would be nice to reduce the scope of the rtnl lock this
might not be a bad direction.  Let me see.

Is rtnl_notify safe without the rtnl lock?


>
> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> ---
>  net/core/net_namespace.c |   23 +++++++++++++++++++----
>  1 file changed, 19 insertions(+), 4 deletions(-)
>
> diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
> index 60a71be75aea..6a4eab438221 100644
> --- a/net/core/net_namespace.c
> +++ b/net/core/net_namespace.c
> @@ -221,17 +221,32 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id);
>   */
>  int peernet2id_alloc(struct net *net, struct net *peer)
>  {
> -	bool alloc;
> +	bool alloc = false, alive = false;
>  	int id;

        ^^^ Perhaps we want "ASSERT_RTNL();" here?
>  
> -	if (atomic_read(&net->count) == 0)
> -		return NETNSA_NSID_NOT_ASSIGNED;

Moving this hunk is of no benefit.  The code must be called with a valid
reference to net.   Which means net->count is a fancy way of testing to
see if the code is in cleanup_net.  In all other cases net->count should
be non-zero and it should remain that way because our caller must
keep a reference.

>  	spin_lock_bh(&net->nsid_lock);
> -	alloc = atomic_read(&peer->count) == 0 ? false : true;
> +	/* Spinlock guarantees we never hash a peer to net->netns_ids
> +	 * after idr_destroy(&net->netns_ids) occurs in cleanup_net().
> +	 */
> +	if (atomic_read(&net->count) == 0) {
> +		id = NETNSA_NSID_NOT_ASSIGNED;
> +		goto unlock;
> +	}
> +	/*
> +	 * When peer is obtained from RCU lists, we may race with
> +	 * its cleanup. Check whether it's alive, and this guarantees
> +	 * we never hash a peer back to net->netns_ids, after it has
> +	 * just been idr_remove()'d from there in cleanup_net().
> +	 */
> +	if (maybe_get_net(peer))
> +		alive = alloc = true;

Yes this does seem reasonable.  The more obvious looking code which
would return NETNSA_NSID_NOT_ASSIGNED if the peer has a count of 0, is
silly as it would make it appear that a peer is momentarily outside
of a network namespace when the peer is in fact moving from one network
namespace to another.
        
>  	id = __peernet2id_alloc(net, peer, &alloc);
> +unlock:
>  	spin_unlock_bh(&net->nsid_lock);
>  	if (alloc && id >= 0)
>  		rtnl_net_notifyid(net, RTM_NEWNSID, id);
                ^^^^^^
                Is this safe without the rtnl lock?
> +	if (alive)
> +		put_net(peer);
>  	return id;
>  }
>  EXPORT_SYMBOL_GPL(peernet2id_alloc);

Eric

^ permalink raw reply

* [PATCH v2 net-next] net: dsa: lan9303: lan9303_csr_reg_wait cleanups
From: Egil Hjelmeland @ 2017-12-21 17:34 UTC (permalink / raw)
  To: andrew, vivien.didelot, f.fainelli, netdev, linux-kernel; +Cc: Egil Hjelmeland

Non-functional cleanups in lan9303_csr_reg_wait():
 - Change type of param 'mask' from int to u32.
 - Remove param 'value' (will probably never be used)
 - Reduced retries from 1000 to 25, consistent with lan9303_read_wait.
 - Removed comments

Signed-off-by: Egil Hjelmeland <privat@egil-hjelmeland.no>

Changes v1 -> v2:
 - Removed comments
---
 drivers/net/dsa/lan9303-core.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c
index f412aad58253..944901f03f8b 100644
--- a/drivers/net/dsa/lan9303-core.c
+++ b/drivers/net/dsa/lan9303-core.c
@@ -249,7 +249,6 @@ static int lan9303_read(struct regmap *regmap, unsigned int offset, u32 *reg)
 	return -EIO;
 }
 
-/* Wait a while until mask & reg == value. Otherwise return timeout. */
 static int lan9303_read_wait(struct lan9303 *chip, int offset, u32 mask)
 {
 	int i;
@@ -541,20 +540,19 @@ lan9303_alr_cache_find_mac(struct lan9303 *chip, const u8 *mac_addr)
 	return NULL;
 }
 
-/* Wait a while until mask & reg == value. Otherwise return timeout. */
-static int lan9303_csr_reg_wait(struct lan9303 *chip, int regno,
-				int mask, char value)
+static int lan9303_csr_reg_wait(struct lan9303 *chip, int regno, u32 mask)
 {
 	int i;
 
-	for (i = 0; i < 0x1000; i++) {
+	for (i = 0; i < 25; i++) {
 		u32 reg;
 
 		lan9303_read_switch_reg(chip, regno, &reg);
-		if ((reg & mask) == value)
+		if (!(reg & mask))
 			return 0;
 		usleep_range(1000, 2000);
 	}
+
 	return -ETIMEDOUT;
 }
 
@@ -564,8 +562,7 @@ static int lan9303_alr_make_entry_raw(struct lan9303 *chip, u32 dat0, u32 dat1)
 	lan9303_write_switch_reg(chip, LAN9303_SWE_ALR_WR_DAT_1, dat1);
 	lan9303_write_switch_reg(chip, LAN9303_SWE_ALR_CMD,
 				 LAN9303_ALR_CMD_MAKE_ENTRY);
-	lan9303_csr_reg_wait(chip, LAN9303_SWE_ALR_CMD_STS, ALR_STS_MAKE_PEND,
-			     0);
+	lan9303_csr_reg_wait(chip, LAN9303_SWE_ALR_CMD_STS, ALR_STS_MAKE_PEND);
 	lan9303_write_switch_reg(chip, LAN9303_SWE_ALR_CMD, 0);
 
 	return 0;
-- 
2.14.1

^ permalink raw reply related

* Re: [RFC PATCH net-next] tools/bpftool: use version from the kernel source tree
From: Jakub Kicinski @ 2017-12-21 17:34 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: netdev, linux-kernel, kernel-team, scientist, Alexei Starovoitov,
	Daniel Borkmann, Arnaldo Carvalho de Melo
In-Reply-To: <20171221120736.GA7054@castle.DHCP.thefacebook.com>

On Thu, 21 Dec 2017 12:07:42 +0000, Roman Gushchin wrote:
> On Wed, Dec 20, 2017 at 01:52:18PM -0800, Jakub Kicinski wrote:
> > On Wed, 20 Dec 2017 20:53:41 +0000, Roman Gushchin wrote:  
> > > On Wed, Dec 20, 2017 at 12:29:21PM -0800, Jakub Kicinski wrote:  
> > > Hm, why it's better? It's not only about the kernel version,
> > > IMO it's generally better to use includes from the source tree,
> > > rather than system-wide installed kernel headers.  
> > 
> > Right I agree the kernel headers are preferred.  I'm not entirely sure
> > why we don't use them, if it was OK to assume usr/ is there we wouldn't
> > need the tools/include/uapi/ contraption.  Maybe Arnaldo could explain?
> >   
> > > I've got about out-of-source builds, but do we support it in general?
> > > How can I build bpftool outside of the kernel tree?
> > > I've tried a bit, but failed.  
> > 
> > This is what I do:
> > 
> > make -C tools/bpf/bpftool/ W=1 O=/tmp/builds/bpftool  
> 
> This works perfectly with my patch:
> 
> $ make -C ~/linux/tools/bpf/ W=1 O=/home/guro/build/ --trace
> <...>
> echo '  CC       '/home/guro/build/main.o;gcc -O2 -W -Wall -Wextra -Wno-unused-parameter -Wshadow -D__EXPORTED_HEADERS__ -I/home/guro/linux/tools/include/uapi -I/home/guro/linux/tools/include -I/home/guro/linux/tools/lib/bpf -I/home/guro/linux/kernel/bpf/ -I/home/guro/linux/usr/include -DNEW_DISSASSEMBLER_SIGNATURE   -c -MMD -o /home/guro/build/main.o main.c
> <...>
> echo '  LINK     '/home/guro/build/bpftool;gcc -O2 -W -Wall -Wextra -Wno-unused-parameter -Wshadow -D__EXPORTED_HEADERS__ -I/home/guro/linux/tools/include/uapi -I/home/guro/linux/tools/include -I/home/guro/linux/tools/lib/bpf -I/home/guro/linux/kernel/bpf/ -I/home/guro/linux/usr/include -DNEW_DISSASSEMBLER_SIGNATURE -o /home/guro/build/bpftool /home/guro/build/common.o /home/guro/build/cgroup.o /home/guro/build/main.o /home/guro/build/json_writer.o /home/guro/build/prog.o /home/guro/build/map.o /home/guro/build/jit_disasm.o /home/guro/build/disasm.o /home/guro/build/libbpf.a -lelf -lbfd -lopcodes /home/guro/build/libbpf.a
>   LINK     /home/guro/build/bpftool
> make[1]: Leaving directory '/home/guro/linux/tools/bpf/bpftool'
> make: Leaving directory '/home/guro/linux/tools/bpf'
> 
> $ ./build/bpftool version
> ./build/bpftool v4.15.0

Argh, sorry for the confusion: you need to build the kernel out-of-source
as well.  In my case I build both the kernel and bpftool out of source, and
then usr/ in the source tree doesn't actually contain the auto-generated
headers:

$ ls ~/devel/linux/usr/
gen_init_cpio.c  initramfs_data.S  Kconfig  Makefile

Only the build directory does:

$ ls /tmp/builds/usr/
built-in.o  gen_init_cpio  include  initramfs_data.cpio  initramfs_data.o  modules.builtin  modules.order

Let me reiterate, the user space headers we need should all be already
included in -I$(srctree)/tools/include/uapi, and make kernelversion is
nice because it also adds the -rc tags.

^ permalink raw reply

* RE: [PATCH net-next v2] xen-netback: make copy batch size configurable
From: Paul Durrant @ 2017-12-21 17:29 UTC (permalink / raw)
  To: 'Joao Martins', netdev@vger.kernel.org
  Cc: Wei Liu, xen-devel@lists.xenproject.org
In-Reply-To: <20171221172428.32676-1-joao.m.martins@oracle.com>

> -----Original Message-----
> From: Joao Martins [mailto:joao.m.martins@oracle.com]
> Sent: 21 December 2017 17:24
> To: netdev@vger.kernel.org
> Cc: Joao Martins <joao.m.martins@oracle.com>; Wei Liu
> <wei.liu2@citrix.com>; Paul Durrant <Paul.Durrant@citrix.com>; xen-
> devel@lists.xenproject.org
> Subject: [PATCH net-next v2] xen-netback: make copy batch size
> configurable
> 
> Commit eb1723a29b9a ("xen-netback: refactor guest rx") refactored Rx
> handling and as a result decreased max grant copy ops from 4352 to 64.
> Before this commit it would drain the rx_queue (while there are
> enough slots in the ring to put packets) then copy to all pages and write
> responses on the ring. With the refactor we do almost the same albeit
> the last two steps are done every COPY_BATCH_SIZE (64) copies.
> 
> For big packets, the value of 64 means copying 3 packets best case scenario
> (17 copies) and worst-case only 1 packet (34 copies, i.e. if all frags
> plus head cross the 4k grant boundary) which could be the case when
> packets go from local backend process.
> 
> Instead of making it static to 64 grant copies, lets allow the user to
> select its value (while keeping the current as default) by introducing
> the `copy_batch_size` module parameter. This allows users to select
> the higher batches (i.e. for better throughput with big packets) as it
> was prior to the above mentioned commit.
> 
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>

Reviewed-by: Paul Durrant <paul.durrant@citrix.com>

> ---
> Changes since v1:
>  * move rx_copy.{idx,op} reallocation to separate helper
>  Addressed Paul's comments:
>  * rename xenvif_copy_state#size field to batch_size
>  * argument `size` should be unsigned int
>  * vfree is safe with NULL
>  * realloc rx_copy.{idx,op} after copy op flush
> ---
>  drivers/net/xen-netback/common.h    |  7 +++++--
>  drivers/net/xen-netback/interface.c | 16 +++++++++++++++-
>  drivers/net/xen-netback/netback.c   |  5 +++++
>  drivers/net/xen-netback/rx.c        | 35
> ++++++++++++++++++++++++++++++++++-
>  4 files changed, 59 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-
> netback/common.h
> index a46a1e94505d..8e4eaf3a507d 100644
> --- a/drivers/net/xen-netback/common.h
> +++ b/drivers/net/xen-netback/common.h
> @@ -129,8 +129,9 @@ struct xenvif_stats {
>  #define COPY_BATCH_SIZE 64
> 
>  struct xenvif_copy_state {
> -	struct gnttab_copy op[COPY_BATCH_SIZE];
> -	RING_IDX idx[COPY_BATCH_SIZE];
> +	struct gnttab_copy *op;
> +	RING_IDX *idx;
> +	unsigned int batch_size;
>  	unsigned int num;
>  	struct sk_buff_head *completed;
>  };
> @@ -358,6 +359,7 @@ irqreturn_t xenvif_ctrl_irq_fn(int irq, void *data);
> 
>  void xenvif_rx_action(struct xenvif_queue *queue);
>  void xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff
> *skb);
> +int xenvif_rx_copy_realloc(struct xenvif_queue *queue, unsigned int size);
> 
>  void xenvif_carrier_on(struct xenvif *vif);
> 
> @@ -381,6 +383,7 @@ extern unsigned int rx_drain_timeout_msecs;
>  extern unsigned int rx_stall_timeout_msecs;
>  extern unsigned int xenvif_max_queues;
>  extern unsigned int xenvif_hash_cache_size;
> +extern unsigned int xenvif_copy_batch_size;
> 
>  #ifdef CONFIG_DEBUG_FS
>  extern struct dentry *xen_netback_dbg_root;
> diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-
> netback/interface.c
> index 78ebe494fef0..e12eb64ab0a9 100644
> --- a/drivers/net/xen-netback/interface.c
> +++ b/drivers/net/xen-netback/interface.c
> @@ -518,6 +518,12 @@ int xenvif_init_queue(struct xenvif_queue *queue)
>  {
>  	int err, i;
> 
> +	err = xenvif_rx_copy_realloc(queue, xenvif_copy_batch_size);
> +	if (err) {
> +		netdev_err(queue->vif->dev, "Could not alloc rx_copy\n");
> +		goto err;
> +	}
> +
>  	queue->credit_bytes = queue->remaining_credit = ~0UL;
>  	queue->credit_usec  = 0UL;
>  	timer_setup(&queue->credit_timeout, xenvif_tx_credit_callback, 0);
> @@ -544,7 +550,7 @@ int xenvif_init_queue(struct xenvif_queue *queue)
>  				 queue->mmap_pages);
>  	if (err) {
>  		netdev_err(queue->vif->dev, "Could not reserve
> mmap_pages\n");
> -		return -ENOMEM;
> +		goto err;
>  	}
> 
>  	for (i = 0; i < MAX_PENDING_REQS; i++) {
> @@ -556,6 +562,11 @@ int xenvif_init_queue(struct xenvif_queue *queue)
>  	}
> 
>  	return 0;
> +
> +err:
> +	vfree(queue->rx_copy.op);
> +	vfree(queue->rx_copy.idx);
> +	return -ENOMEM;
>  }
> 
>  void xenvif_carrier_on(struct xenvif *vif)
> @@ -788,6 +799,9 @@ void xenvif_disconnect_ctrl(struct xenvif *vif)
>   */
>  void xenvif_deinit_queue(struct xenvif_queue *queue)
>  {
> +	vfree(queue->rx_copy.op);
> +	vfree(queue->rx_copy.idx);
> +	queue->rx_copy.batch_size = 0;
>  	gnttab_free_pages(MAX_PENDING_REQS, queue->mmap_pages);
>  }
> 
> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-
> netback/netback.c
> index a27daa23c9dc..3a5e1d7ac2f4 100644
> --- a/drivers/net/xen-netback/netback.c
> +++ b/drivers/net/xen-netback/netback.c
> @@ -96,6 +96,11 @@ unsigned int xenvif_hash_cache_size =
> XENVIF_HASH_CACHE_SIZE_DEFAULT;
>  module_param_named(hash_cache_size, xenvif_hash_cache_size, uint,
> 0644);
>  MODULE_PARM_DESC(hash_cache_size, "Number of flows in the hash
> cache");
> 
> +/* This is the maximum batch of grant copies on Rx */
> +unsigned int xenvif_copy_batch_size = COPY_BATCH_SIZE;
> +module_param_named(copy_batch_size, xenvif_copy_batch_size, uint,
> 0644);
> +MODULE_PARM_DESC(copy_batch_size, "Maximum batch of grant copies
> on Rx");
> +
>  static void xenvif_idx_release(struct xenvif_queue *queue, u16
> pending_idx,
>  			       u8 status);
> 
> diff --git a/drivers/net/xen-netback/rx.c b/drivers/net/xen-netback/rx.c
> index b1cf7c6f407a..07eebd75e668 100644
> --- a/drivers/net/xen-netback/rx.c
> +++ b/drivers/net/xen-netback/rx.c
> @@ -130,6 +130,36 @@ static void xenvif_rx_queue_drop_expired(struct
> xenvif_queue *queue)
>  	}
>  }
> 
> +int xenvif_rx_copy_realloc(struct xenvif_queue *queue, unsigned int size)
> +{
> +	void *op = NULL, *idx = NULL;
> +
> +	if (!size || queue->rx_copy.num)
> +		return -EINVAL;
> +
> +	op = vzalloc(size * sizeof(struct gnttab_copy));
> +	if (!op)
> +		goto err;
> +
> +	idx = vzalloc(size * sizeof(RING_IDX));
> +	if (!idx)
> +		goto err;
> +
> +	vfree(queue->rx_copy.op);
> +	vfree(queue->rx_copy.idx);
> +
> +	queue->rx_copy.op = op;
> +	queue->rx_copy.idx = idx;
> +	queue->rx_copy.batch_size = size;
> +	netdev_dbg(queue->vif->dev, "Reallocated rx_copy for batch size
> %u\n",
> +		   size);
> +	return 0;
> +
> +err:
> +	vfree(op);
> +	return -ENOMEM;
> +}
> +
>  static void xenvif_rx_copy_flush(struct xenvif_queue *queue)
>  {
>  	unsigned int i;
> @@ -162,6 +192,9 @@ static void xenvif_rx_copy_flush(struct xenvif_queue
> *queue)
>  		notify_remote_via_irq(queue->rx_irq);
> 
>  	__skb_queue_purge(queue->rx_copy.completed);
> +
> +	if (unlikely(xenvif_copy_batch_size != queue->rx_copy.batch_size))
> +		xenvif_rx_copy_realloc(queue, xenvif_copy_batch_size);
>  }
> 
>  static void xenvif_rx_copy_add(struct xenvif_queue *queue,
> @@ -172,7 +205,7 @@ static void xenvif_rx_copy_add(struct xenvif_queue
> *queue,
>  	struct page *page;
>  	struct xen_page_foreign *foreign;
> 
> -	if (queue->rx_copy.num == COPY_BATCH_SIZE)
> +	if (queue->rx_copy.num == queue->rx_copy.batch_size)
>  		xenvif_rx_copy_flush(queue);
> 
>  	op = &queue->rx_copy.op[queue->rx_copy.num];
> --
> 2.11.0

^ permalink raw reply

* Re: Distress Call Please don't ignore
From: Sandra Younes @ 2017-12-21 16:43 UTC (permalink / raw)


Good Day,

Forgive my indignation if this message comes to you as a surprise and may offend your personality for contacting you without your prior consent and writing through this channel.

I came across your name and contact on the course of my personal searching when i was searching for a foreign reliable partner. I was assured of your capability and reliability after going true your profile.

I'm (Miss. Sandra) from Benghazi libya, My father of blessed memory by name late General Abdel Fattah Younes who was shot death by Islamist-linked militia within the anti-Gaddafi forces on 28th July, 2011 and after two days later my mother with my two brothers was killed one early morning by the rebels as result of civil war that is going on in my country Libya, then after the burial of my parents, my uncles conspired and sold my father's properties and left nothing for me. On a faithful morning, I opened my father's briefcase and discover a document which he has deposited ($6.250M USD) in a bank in a Turkish Bank which has a small branch in Canada with my name as the legitimate/next of kin. Meanwhile i have located the bank,and have also discussed the possiblity of transfering the fund. M
 y father left a clause to the bank that i must introduce a trusted foreign partner who would be my trustee to help me invest this fund; hence the need for your assistance,i request that you be my t
rustee and assist me in e

You will also be responsible for the investment and management of the fund for me and also you will help me get a good school where i will further my education.
I agreed to give you 40% of the $6.250M once the transfer is done. this is my true life story, I will be glad to receive your respond soonest for more details to enable us start and champion the transfer less than 14 banking days as i was informed by the bank manager.

Thanks for giving me your attention,

Yours sincerely,
Miss. Sandra Younes

^ permalink raw reply

* [PATCH net-next v2] xen-netback: make copy batch size configurable
From: Joao Martins @ 2017-12-21 17:24 UTC (permalink / raw)
  To: netdev; +Cc: Joao Martins, Wei Liu, Paul Durrant, xen-devel

Commit eb1723a29b9a ("xen-netback: refactor guest rx") refactored Rx
handling and as a result decreased max grant copy ops from 4352 to 64.
Before this commit it would drain the rx_queue (while there are
enough slots in the ring to put packets) then copy to all pages and write
responses on the ring. With the refactor we do almost the same albeit
the last two steps are done every COPY_BATCH_SIZE (64) copies.

For big packets, the value of 64 means copying 3 packets best case scenario
(17 copies) and worst-case only 1 packet (34 copies, i.e. if all frags
plus head cross the 4k grant boundary) which could be the case when
packets go from local backend process.

Instead of making it static to 64 grant copies, let's allow the user to
select its value (while keeping the current as default) by introducing
the `copy_batch_size` module parameter. This allows users to select
the higher batches (i.e. for better throughput with big packets) as it
was prior to the above mentioned commit.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
Changes since v1:
 * move rx_copy.{idx,op} reallocation to separate helper
 Addressed Paul's comments:
 * rename xenvif_copy_state#size field to batch_size
 * argument `size` should be unsigned int
 * vfree is safe with NULL
 * realloc rx_copy.{idx,op} after copy op flush
---
 drivers/net/xen-netback/common.h    |  7 +++++--
 drivers/net/xen-netback/interface.c | 16 +++++++++++++++-
 drivers/net/xen-netback/netback.c   |  5 +++++
 drivers/net/xen-netback/rx.c        | 35 ++++++++++++++++++++++++++++++++++-
 4 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index a46a1e94505d..8e4eaf3a507d 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -129,8 +129,9 @@ struct xenvif_stats {
 #define COPY_BATCH_SIZE 64
 
 struct xenvif_copy_state {
-	struct gnttab_copy op[COPY_BATCH_SIZE];
-	RING_IDX idx[COPY_BATCH_SIZE];
+	struct gnttab_copy *op;
+	RING_IDX *idx;
+	unsigned int batch_size;
 	unsigned int num;
 	struct sk_buff_head *completed;
 };
@@ -358,6 +359,7 @@ irqreturn_t xenvif_ctrl_irq_fn(int irq, void *data);
 
 void xenvif_rx_action(struct xenvif_queue *queue);
 void xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff *skb);
+int xenvif_rx_copy_realloc(struct xenvif_queue *queue, unsigned int size);
 
 void xenvif_carrier_on(struct xenvif *vif);
 
@@ -381,6 +383,7 @@ extern unsigned int rx_drain_timeout_msecs;
 extern unsigned int rx_stall_timeout_msecs;
 extern unsigned int xenvif_max_queues;
 extern unsigned int xenvif_hash_cache_size;
+extern unsigned int xenvif_copy_batch_size;
 
 #ifdef CONFIG_DEBUG_FS
 extern struct dentry *xen_netback_dbg_root;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 78ebe494fef0..e12eb64ab0a9 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -518,6 +518,12 @@ int xenvif_init_queue(struct xenvif_queue *queue)
 {
 	int err, i;
 
+	err = xenvif_rx_copy_realloc(queue, xenvif_copy_batch_size);
+	if (err) {
+		netdev_err(queue->vif->dev, "Could not alloc rx_copy\n");
+		goto err;
+	}
+
 	queue->credit_bytes = queue->remaining_credit = ~0UL;
 	queue->credit_usec  = 0UL;
 	timer_setup(&queue->credit_timeout, xenvif_tx_credit_callback, 0);
@@ -544,7 +550,7 @@ int xenvif_init_queue(struct xenvif_queue *queue)
 				 queue->mmap_pages);
 	if (err) {
 		netdev_err(queue->vif->dev, "Could not reserve mmap_pages\n");
-		return -ENOMEM;
+		goto err;
 	}
 
 	for (i = 0; i < MAX_PENDING_REQS; i++) {
@@ -556,6 +562,11 @@ int xenvif_init_queue(struct xenvif_queue *queue)
 	}
 
 	return 0;
+
+err:
+	vfree(queue->rx_copy.op);
+	vfree(queue->rx_copy.idx);
+	return -ENOMEM;
 }
 
 void xenvif_carrier_on(struct xenvif *vif)
@@ -788,6 +799,9 @@ void xenvif_disconnect_ctrl(struct xenvif *vif)
  */
 void xenvif_deinit_queue(struct xenvif_queue *queue)
 {
+	vfree(queue->rx_copy.op);
+	vfree(queue->rx_copy.idx);
+	queue->rx_copy.batch_size = 0;
 	gnttab_free_pages(MAX_PENDING_REQS, queue->mmap_pages);
 }
 
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index a27daa23c9dc..3a5e1d7ac2f4 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -96,6 +96,11 @@ unsigned int xenvif_hash_cache_size = XENVIF_HASH_CACHE_SIZE_DEFAULT;
 module_param_named(hash_cache_size, xenvif_hash_cache_size, uint, 0644);
 MODULE_PARM_DESC(hash_cache_size, "Number of flows in the hash cache");
 
+/* This is the maximum batch of grant copies on Rx */
+unsigned int xenvif_copy_batch_size = COPY_BATCH_SIZE;
+module_param_named(copy_batch_size, xenvif_copy_batch_size, uint, 0644);
+MODULE_PARM_DESC(copy_batch_size, "Maximum batch of grant copies on Rx");
+
 static void xenvif_idx_release(struct xenvif_queue *queue, u16 pending_idx,
 			       u8 status);
 
diff --git a/drivers/net/xen-netback/rx.c b/drivers/net/xen-netback/rx.c
index b1cf7c6f407a..07eebd75e668 100644
--- a/drivers/net/xen-netback/rx.c
+++ b/drivers/net/xen-netback/rx.c
@@ -130,6 +130,36 @@ static void xenvif_rx_queue_drop_expired(struct xenvif_queue *queue)
 	}
 }
 
+int xenvif_rx_copy_realloc(struct xenvif_queue *queue, unsigned int size)
+{
+	void *op = NULL, *idx = NULL;
+
+	if (!size || queue->rx_copy.num)
+		return -EINVAL;
+
+	op = vzalloc(size * sizeof(struct gnttab_copy));
+	if (!op)
+		goto err;
+
+	idx = vzalloc(size * sizeof(RING_IDX));
+	if (!idx)
+		goto err;
+
+	vfree(queue->rx_copy.op);
+	vfree(queue->rx_copy.idx);
+
+	queue->rx_copy.op = op;
+	queue->rx_copy.idx = idx;
+	queue->rx_copy.batch_size = size;
+	netdev_dbg(queue->vif->dev, "Reallocated rx_copy for batch size %u\n",
+		   size);
+	return 0;
+
+err:
+	vfree(op);
+	return -ENOMEM;
+}
+
 static void xenvif_rx_copy_flush(struct xenvif_queue *queue)
 {
 	unsigned int i;
@@ -162,6 +192,9 @@ static void xenvif_rx_copy_flush(struct xenvif_queue *queue)
 		notify_remote_via_irq(queue->rx_irq);
 
 	__skb_queue_purge(queue->rx_copy.completed);
+
+	if (unlikely(xenvif_copy_batch_size != queue->rx_copy.batch_size))
+		xenvif_rx_copy_realloc(queue, xenvif_copy_batch_size);
 }
 
 static void xenvif_rx_copy_add(struct xenvif_queue *queue,
@@ -172,7 +205,7 @@ static void xenvif_rx_copy_add(struct xenvif_queue *queue,
 	struct page *page;
 	struct xen_page_foreign *foreign;
 
-	if (queue->rx_copy.num == COPY_BATCH_SIZE)
+	if (queue->rx_copy.num == queue->rx_copy.batch_size)
 		xenvif_rx_copy_flush(queue);
 
 	op = &queue->rx_copy.op[queue->rx_copy.num];
-- 
2.11.0

^ permalink raw reply related

* Re: Linux 4.14 - regression: broken tun/tap / bridge network with virtio - bisected
From: Willem de Bruijn @ 2017-12-21 17:11 UTC (permalink / raw)
  To: Andreas Hartmann
  Cc: Michal Kubecek, Jason Wang, David Miller, Network Development
In-Reply-To: <b10b506b-1fb1-2e85-1905-83d3ef091be1@01019freenet.de>

On Thu, Dec 21, 2017 at 12:05 PM, Andreas Hartmann
<andihartmann@01019freenet.de> wrote:
> On 12/20/2017 at 11:44 PM Willem de Bruijn wrote:
>>
>> On Wed, Dec 20, 2017 at 10:56 AM, Andreas Hartmann
>> <andihartmann@01019freenet.de> wrote:
>>>
>>> On 12/18/2017 at 06:11 PM Andreas Hartmann wrote:
>>>>
>>>> On 12/17/2017 at 11:33 PM Willem de Bruijn wrote:
>>>
>>> [...]
>>>>>
>>>>> I have been able to reproduce the hang by sending a UFO packet
>>>>> between two guests running v4.13 on a host running v4.15-rc1.
>>>>>
>>>>> The vhost_net_ubuf_ref refcount indeed hits overflow (-1) from
>>>>> vhost_zerocopy_callback being called for each segment of a
>>>>> segmented UFO skb. This refcount is decremented then on each
>>>>> segment, but incremented only once for the entire UFO skb.
>>>>>
>>>>> Before v4.14, these packets would be converted in skb_segment to
>>>>> regular copy packets with skb_orphan_frags and the callback function
>>>>> called once at this point. v4.14 added support for reference counted
>>>>> zerocopy skb that can pass through skb_orphan_frags unmodified and
>>>>> have their zerocopy state safely cloned with skb_zerocopy_clone.
>>>>>
>>>>> The call to skb_zerocopy_clone must come after skb_orphan_frags
>>>>> to limit cloning of this state to those skbs that can do so safely.
>>>>>
>>>>> Please try a host with the following patch. This fixes it for me. I
>>>>> intend to
>>>>> send it to net.
>>>>>
>>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>>>> index a592ca025fc4..d2d985418819 100644
>>>>> --- a/net/core/skbuff.c
>>>>> +++ b/net/core/skbuff.c
>>>>> @@ -3654,8 +3654,6 @@ struct sk_buff *skb_segment(struct sk_buff
>>>>> *head_skb,
>>>>>
>>>>>                  skb_shinfo(nskb)->tx_flags |=
>>>>> skb_shinfo(head_skb)->tx_flags &
>>>>>                                                SKBTX_SHARED_FRAG;
>>>>> -               if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC))
>>>>> -                       goto err;
>>>>>
>>>>>                  while (pos < offset + len) {
>>>>>                          if (i >= nfrags) {
>>>>> @@ -3681,6 +3679,8 @@ struct sk_buff *skb_segment(struct sk_buff
>>>>> *head_skb,
>>>>>
>>>>>                          if (unlikely(skb_orphan_frags(frag_skb,
>>>>> GFP_ATOMIC)))
>>>>>                                  goto err;
>>>>> +                       if (skb_zerocopy_clone(nskb, frag_skb,
>>>>> GFP_ATOMIC))
>>>>> +                               goto err;
>>>>>
>>>>>                          *nskb_frag = *frag;
>>>>>                          __skb_frag_ref(nskb_frag);
>>>>>
>>>>>
>>>>> This is relatively inefficient, as it calls skb_zerocopy_clone for each
>>>>> frag
>>>>> in the frags[] array. I will follow-up with a patch to net-next that
>>>>> only
>>>>> checks once per skb:
>>>>>
>>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>>>> index 466581cf4cdc..a293a33604ec 100644
>>>>> --- a/net/core/skbuff.c
>>>>> +++ b/net/core/skbuff.c
>>>>> @@ -3662,7 +3662,8 @@ struct sk_buff *skb_segment(struct sk_buff
>>>>> *head_skb,
>>>>>
>>>>>                  skb_shinfo(nskb)->tx_flags |=
>>>>> skb_shinfo(head_skb)->tx_flags &
>>>>>                                                SKBTX_SHARED_FRAG;
>>>>> -               if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC))
>>>>> +               if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
>>>>> +                   skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
>>>>>                          goto err;
>>>>>
>>>>>                  while (pos < offset + len) {
>>>>> @@ -3676,6 +3677,11 @@ struct sk_buff *skb_segment(struct sk_buff
>>>>> *head_skb,
>>>>>
>>>>>                                  BUG_ON(!nfrags);
>>>>>
>>>>> +                               if (skb_orphan_frags(frag_skb,
>>>>> GFP_ATOMIC) ||
>>>>> +                                   skb_zerocopy_clone(nskb, frag_skb,
>>>>> +                                                      GFP_ATOMIC))
>>>>> +                                       goto err;
>>>>> +
>>>>>                                  list_skb = list_skb->next;
>>>>>                          }
>>>>>
>>>>> @@ -3687,9 +3693,6 @@ struct sk_buff *skb_segment(struct sk_buff
>>>>> *head_skb,
>>>>>                                  goto err;
>>>>>                          }
>>>>>
>>>>> -                       if (unlikely(skb_orphan_frags(frag_skb,
>>>>> GFP_ATOMIC)))
>>>>> -                               goto err;
>>>>> -
>>>>
>>>>
>>>> I'm currently testing this one.
>>>>
>>>
>>> Test is in progress. I'm testing w/ 4.14.7, which already contains "net:
>>> accept UFO datagrams from tuntap and packet".
>>>
>>> At first, I tested an unpatched 4.14.7 - the problem (no more killable
>>> qemu-process) did occur promptly on shutdown of the machine. This was
>>> expected.
>>>
>>> Next, I applied the above patch (the second one). Until now, I didn't
>>> face any problem any more on shutdown of VMs. Looks promising.
>>
>>
>> Thanks for testing.
>>
>> I sent the first, simpler, one to net together with another fix.
>>
>>    http://patchwork.ozlabs.org/patch/851715/
>>
>
> If I'm using the second patch above (the more efficient one and not
> "[net,1/2] skbuff: orphan frags before zerocopy clone"), which I'm already
> testing here: Is it still necessary to apply this patch "[net,2/2] skbuff:
> skb_copy_ubufs must release uarg even without user frags"?

Not for this issue. It is an unrelated bug and not triggered by virtio_net
as configured normally.

^ permalink raw reply

* Re: Linux 4.14 - regression: broken tun/tap / bridge network with virtio - bisected
From: Andreas Hartmann @ 2017-12-21 17:05 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Michal Kubecek, Jason Wang, David Miller, Network Development
In-Reply-To: <CAF=yD-KFD3++koAi3mzYbH75D526QXZXBaKT5jfbNqniGMTL8w@mail.gmail.com>

On 12/20/2017 at 11:44 PM Willem de Bruijn wrote:
> On Wed, Dec 20, 2017 at 10:56 AM, Andreas Hartmann
> <andihartmann@01019freenet.de> wrote:
>> On 12/18/2017 at 06:11 PM Andreas Hartmann wrote:
>>> On 12/17/2017 at 11:33 PM Willem de Bruijn wrote:
>> [...]
>>>> I have been able to reproduce the hang by sending a UFO packet
>>>> between two guests running v4.13 on a host running v4.15-rc1.
>>>>
>>>> The vhost_net_ubuf_ref refcount indeed hits overflow (-1) from
>>>> vhost_zerocopy_callback being called for each segment of a
>>>> segmented UFO skb. This refcount is decremented then on each
>>>> segment, but incremented only once for the entire UFO skb.
>>>>
>>>> Before v4.14, these packets would be converted in skb_segment to
>>>> regular copy packets with skb_orphan_frags and the callback function
>>>> called once at this point. v4.14 added support for reference counted
>>>> zerocopy skb that can pass through skb_orphan_frags unmodified and
>>>> have their zerocopy state safely cloned with skb_zerocopy_clone.
>>>>
>>>> The call to skb_zerocopy_clone must come after skb_orphan_frags
>>>> to limit cloning of this state to those skbs that can do so safely.
>>>>
>>>> Please try a host with the following patch. This fixes it for me. I intend to
>>>> send it to net.
>>>>
>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>>> index a592ca025fc4..d2d985418819 100644
>>>> --- a/net/core/skbuff.c
>>>> +++ b/net/core/skbuff.c
>>>> @@ -3654,8 +3654,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
>>>>
>>>>                  skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
>>>>                                                SKBTX_SHARED_FRAG;
>>>> -               if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC))
>>>> -                       goto err;
>>>>
>>>>                  while (pos < offset + len) {
>>>>                          if (i >= nfrags) {
>>>> @@ -3681,6 +3679,8 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
>>>>
>>>>                          if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
>>>>                                  goto err;
>>>> +                       if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
>>>> +                               goto err;
>>>>
>>>>                          *nskb_frag = *frag;
>>>>                          __skb_frag_ref(nskb_frag);
>>>>
>>>>
>>>> This is relatively inefficient, as it calls skb_zerocopy_clone for each frag
>>>> in the frags[] array. I will follow-up with a patch to net-next that only
>>>> checks once per skb:
>>>>
>>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>>> index 466581cf4cdc..a293a33604ec 100644
>>>> --- a/net/core/skbuff.c
>>>> +++ b/net/core/skbuff.c
>>>> @@ -3662,7 +3662,8 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
>>>>
>>>>                  skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
>>>>                                                SKBTX_SHARED_FRAG;
>>>> -               if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC))
>>>> +               if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
>>>> +                   skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
>>>>                          goto err;
>>>>
>>>>                  while (pos < offset + len) {
>>>> @@ -3676,6 +3677,11 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
>>>>
>>>>                                  BUG_ON(!nfrags);
>>>>
>>>> +                               if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
>>>> +                                   skb_zerocopy_clone(nskb, frag_skb,
>>>> +                                                      GFP_ATOMIC))
>>>> +                                       goto err;
>>>> +
>>>>                                  list_skb = list_skb->next;
>>>>                          }
>>>>
>>>> @@ -3687,9 +3693,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
>>>>                                  goto err;
>>>>                          }
>>>>
>>>> -                       if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
>>>> -                               goto err;
>>>> -
>>>
>>> I'm currently testing this one.
>>>
>>
>> Test is in progress. I'm testing w/ 4.14.7, which already contains "net:
>> accept UFO datagrams from tuntap and packet".
>>
>> At first, I tested an unpatched 4.14.7 - the problem (no more killable
>> qemu-process) did occur promptly on shutdown of the machine. This was
>> expected.
>>
>> Next, I applied the above patch (the second one). Until now, I didn't
>> face any problem any more on shutdown of VMs. Looks promising.
> 
> Thanks for testing.
> 
> I sent the first, simpler, one to net together with another fix.
> 
>    http://patchwork.ozlabs.org/patch/851715/
> 

If I'm using the second patch above (the more efficient one and not 
"[net,1/2] skbuff: orphan frags before zerocopy clone"), which I'm 
already testing here: Is it still necessary to apply this patch 
"[net,2/2] skbuff: skb_copy_ubufs must release uarg even without user 
frags"?


Thanks,
Andreas

^ permalink raw reply

* Re: [bpf-next V1-RFC PATCH 01/14] xdp: base API for new XDP rx-queue info concept
From: Jesper Dangaard Brouer @ 2017-12-21 16:59 UTC (permalink / raw)
  To: David Ahern
  Cc: Daniel Borkmann, Alexei Starovoitov, netdev, gospo, bjorn.topel,
	michael.chan, brouer, Saeed Mahameed
In-Reply-To: <20171218115501.3f1fcf36@redhat.com>

On Mon, 18 Dec 2017 11:55:01 +0100
Jesper Dangaard Brouer <brouer@redhat.com> wrote:

> On Wed, 13 Dec 2017 19:34:40 -0700
> David Ahern <dsahern@gmail.com> wrote:
> 
> > On 12/13/17 4:19 AM, Jesper Dangaard Brouer wrote:  
> > > +
> > > +void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
> > > +{
> > > +	xdp_rxq->reg_state = REG_STATE_UNREGISTRED;
> > > +}
> > > +EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg);
> > > +
> > > +void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq)
> > > +{
> > > +	if (xdp_rxq->reg_state == REG_STATE_REGISTRED) {
> > > +		WARN(1, "Missing unregister, handled but fix driver\n");
> > > +		xdp_rxq_info_unreg(xdp_rxq);
> > > +	}
> > > +	memset(xdp_rxq, 0, sizeof(*xdp_rxq));
> > > +	xdp_rxq->queue_index = U32_MAX;
> > > +	xdp_rxq->reg_state = REG_STATE_NEW;
> > > +}
> > > +EXPORT_SYMBOL_GPL(xdp_rxq_info_init);
> > > +
> > > +void xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq)
> > > +{
> > > +	WARN(!xdp_rxq->dev, "Missing net_device from driver");
> > > +	WARN(xdp_rxq->queue_index == U32_MAX, "Miss queue_index from driver");
> > > +	WARN(!(xdp_rxq->reg_state == REG_STATE_NEW),"API violation, miss init");
> > > +	xdp_rxq->reg_state = REG_STATE_REGISTRED;
> > > +}
> > > +EXPORT_SYMBOL_GPL(xdp_rxq_info_reg);
> > >     
> > 
> > Rather than WARN()'s why not make the _reg and _init functions return an
> > int that indicates an error? For example you don't want to continue if
> > the dev is expected but missing.  
> 
> Handling return-errors in the drivers complicated the driver code, as it
> involves unraveling and deallocating other RX-rings etc (that were
> already allocated) if the reg fails. (Also notice next patch will allow
> dev == NULL, if right ptype is set).
> 
> I'm not completely rejecting your idea, as this is a good optimization
> trick, which is to move validation checks to setup-time, thus allowing
> less validation checks at runtime.  I sort-of actually already did
> this, as I allow bpf to deref dev without NULL check.  I would argue
> this is good enough, as we will crash in a predictable way, as above
> WARN will point to which driver violated the API.
> 
> If people think it is valuable I can change this API to return an err?

I will take Ahern's suggestion of returning an err-code, but only from
xdp_rxq_info_reg().  And I'm going to move xdp_rxq_info_init to be an
internal function (which Saeed also implicitly suggested).
I'm working through the drivers now, and only two drivers don't have a
proper error-return path for handling a failing xdp_rxq_info_reg().

I've also extended xdp_rxq_info_reg() to take args dev + idx, to reduce
the code-lines (given we now also have to check return code, this got
too big).  Thus, reg is a single call with if-return-check.


> I guess, it would be more future-proof to do this, as we (Bjørn,
> Michael, Andy) want to extend this to implement a XDP frame/mem return
> code-path.  And the register call will likely have to allocate some
> resource that could fail, which need to be handled...

I'm mostly doing it for the above reason, as I'm hoping to avoid touching
every XDP driver once again.  It is a real pain.

> If we do this, we might as well (slab) alloc the xdp_rxq_info
> structure to reduce the bloat in the drivers RX-rings to a single
> pointer (and a pointer to xdp_rxq_info is what xdp_buff.rxq need).

I've dropped my idea of (slab) allocating the xdp_rxq_info structure.
I started coding this up, but realized the number of lines added per
driver got too excessive for no apparent gain. (e.g. I also needed to
take the numa-node into account in some drivers).

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: KASAN: stack-out-of-bounds Read in rds_sendmsg
From: Santosh Shilimkar @ 2017-12-21 16:44 UTC (permalink / raw)
  To: syzkaller-bugs, Avinash Repaka
  Cc: syzbot, davem, linux-kernel, linux-rdma, netdev, rds-devel
In-Reply-To: <089e08263e589121d90560d610a5@google.com>

+Avinash

On 12/21/2017 1:10 AM, syzbot wrote:
> syzkaller has found reproducer for the following crash on 

[..]

> 
> audit: type=1400 audit(1513847224.110:7): avc:  denied  { map } for  
> pid=3157 comm="syzkaller455006" path="/root/syzkaller455006870" 
> dev="sda1" ino=16481 
> scontext=unconfined_u:system_r:insmod_t:s0-s0:c0.c1023 
> tcontext=unconfined_u:object_r:user_home_t:s0 tclass=file permissive=1
> ==================================================================
> BUG: KASAN: stack-out-of-bounds in rds_rdma_bytes net/rds/send.c:1013 
> [inline]

Could you please post the discussed fix if you are ready with it?
This new report is the same as the last one, and the cmsg length check
should address it.

Regards,
Santosh

^ permalink raw reply

* Re: [PATCH v4 5/5] flow_dissector: Parse batman-adv unicast headers
From: Willem de Bruijn @ 2017-12-21 16:58 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Sven Eckelmann, b.a.t.m.a.n, Network Development, Tom Herbert,
	Jiri Pirko, LKML, Eric Dumazet
In-Reply-To: <20171221122436.GD1930@nanopsycho>

On Thu, Dec 21, 2017 at 7:24 AM, Jiri Pirko <jiri@resnulli.us> wrote:
> Thu, Dec 21, 2017 at 10:17:42AM CET, sven.eckelmann@openmesh.com wrote:
>>The batman-adv unicast packets contain a full layer 2 frame in encapsulated
>>form. The flow dissector must therefore be able to parse the batman-adv
>>unicast header to reach the layer 2+3 information.
>>
>>  +--------------------+
>>  | ip(v6)hdr          |
>>  +--------------------+
>>  | inner ethhdr       |
>>  +--------------------+
>>  | batadv unicast hdr |
>>  +--------------------+
>>  | outer ethhdr       |
>>  +--------------------+
>>
>>The obtained information from the upper layer can then be used by RPS to
>>schedule the processing on separate cores. This allows better distribution
>>of multiple flows from the same neighbor to different cores.
>>
>>Signed-off-by: Sven Eckelmann <sven.eckelmann@openmesh.com>
>
> Reviewed-by: Jiri Pirko <jiri@mellanox.com>

Acked-by: Willem de Bruijn <willemb@google.com>

^ permalink raw reply

* Re: RCU callback crashes
From: Jakub Kicinski @ 2017-12-21 16:56 UTC (permalink / raw)
  To: John Fastabend; +Cc: Cong Wang, Jiri Pirko, netdev@vger.kernel.org
In-Reply-To: <97c5063d-fa28-c02f-2ad7-95a08e8d3cee@gmail.com>

On Thu, 21 Dec 2017 08:26:56 -0800, John Fastabend wrote:
> @Jakub, does your test have traffic generator running or just control
> path? My theory would be a bit odd if you didn't have traffic, but
> something is kicking the dequeue so must be some traffic.

It was just control traffic, but it's the first time I've seen it so it
may be very unlikely to trigger...

^ permalink raw reply

* [PATCH bpf] selftests/bpf: fix Makefile for passing LLC to the command line
From: Jakub Kicinski @ 2017-12-21 16:52 UTC (permalink / raw)
  To: netdev, alexei.starovoitov, daniel
  Cc: oss-drivers, Quentin Monnet, Jakub Kicinski

From: Quentin Monnet <quentin.monnet@netronome.com>

The Makefile has an LLC variable that is initialised to "llc", but can
theoretically be overridden from the command line ("make LLC=llc-6.0").
However, this fails because for the LLVM probe check, "llc" is called
directly. Use the $(LLC) variable instead to fix this.

Fixes: 22c8852624fc ("bpf: improve selftests and add tests for meta pointer")
Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 tools/testing/selftests/bpf/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 05fc4e2e7b3a..9316e648a880 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -39,7 +39,7 @@ $(BPFOBJ): force
 CLANG ?= clang
 LLC   ?= llc
 
-PROBE := $(shell llc -march=bpf -mcpu=probe -filetype=null /dev/null 2>&1)
+PROBE := $(shell $(LLC) -march=bpf -mcpu=probe -filetype=null /dev/null 2>&1)
 
 # Let newer LLVM versions transparently probe the kernel for availability
 # of full BPF instruction set.
-- 
2.15.1

^ permalink raw reply related

* Re: [PATCH net] ipv6: Honor specified parameters in fibmatch lookup
From: David Miller @ 2017-12-21 16:51 UTC (permalink / raw)
  To: idosch; +Cc: netdev, roopa, dsahern, mlxsw
In-Reply-To: <20171220102825.28234-1-idosch@mellanox.com>

From: Ido Schimmel <idosch@mellanox.com>
Date: Wed, 20 Dec 2017 12:28:25 +0200

> Currently, parameters such as oif and source address are not taken into
> account during fibmatch lookup. Example (IPv4 for reference) before
> patch:
 ...
> The problem stems from the fact that the necessary route lookup flags
> are not set based on these parameters.
> 
> Instead of duplicating the same logic for fibmatch, we can simply
> resolve the original route from its copy and dump it instead.
> 
> Fixes: 18c3a61c4264 ("net: ipv6: RTM_GETROUTE: return matched fib result when requested")
> Signed-off-by: Ido Schimmel <idosch@mellanox.com>

Applied and queued up for -stable, thanks.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox