Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: 862591bf4("xfrm: skip policies marked as dead while rehashing")
From: Greg KH @ 2018-07-01 13:28 UTC (permalink / raw)
  To: Zubin Mithra; +Cc: netdev, groeck, stable
In-Reply-To: <20180620214118.GA37804@zsmcros.c.googlers.com>

On Wed, Jun 20, 2018 at 05:42:51PM -0400, Zubin Mithra wrote:
> Hello,
> 
> Syzkaller has reported a crash here[1] for a slab OOB read in
> xfrm_hash_rebuild.
> 
> Could the following 2 patches be applied in order to on 4.4.y?
> 
> 6916fb3b10("xfrm: Ignore socket policies when rebuilding hash tables")
> 862591bf4f("xfrm: skip policies marked as dead while rehashing")
> 
> [1] https://syzkaller.appspot.com/bug?id=1c11a638b7d27e871aa297f3b4d5fd5bc90f0cb4

Both now queued up, thanks.

greg k-h

^ permalink raw reply

* [PATCH] net: expose sk wmem in sock_exceed_buf_limit tracepoint
From: Yafang Shao @ 2018-07-01 15:31 UTC (permalink / raw)
  To: davem, satoru.moriya; +Cc: netdev, linux-kernel, Yafang Shao

Currently trace_sock_exceed_buf_limit() only show rmem info,
but wmem limit may also be hit.
So expose wmem info in this tracepoint as well.

Regarding memcg, I think it is better to introduce a new tracepoint(if
that is needed), i.e. trace_memcg_limit_hit other than show memcg info in
trace_sock_exceed_buf_limit.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
 include/trace/events/sock.h | 30 +++++++++++++++++++++++++-----
 net/core/sock.c             |  6 ++++--
 2 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
index 3176a39..a0c4b8a 100644
--- a/include/trace/events/sock.h
+++ b/include/trace/events/sock.h
@@ -35,6 +35,10 @@
 		EM(TCP_CLOSING)			\
 		EMe(TCP_NEW_SYN_RECV)
 
+#define skmem_kind_names			\
+		EM(SK_MEM_SEND)			\
+		EMe(SK_MEM_RECV)
+
 /* enums need to be exported to user space */
 #undef EM
 #undef EMe
@@ -44,6 +48,7 @@
 family_names
 inet_protocol_names
 tcp_state_names
+skmem_kind_names
 
 #undef EM
 #undef EMe
@@ -59,6 +64,9 @@
 #define show_tcp_state_name(val)        \
 	__print_symbolic(val, tcp_state_names)
 
+#define show_skmem_kind_names(val)	\
+	__print_symbolic(val, skmem_kind_names)
+
 TRACE_EVENT(sock_rcvqueue_full,
 
 	TP_PROTO(struct sock *sk, struct sk_buff *skb),
@@ -83,9 +91,9 @@
 
 TRACE_EVENT(sock_exceed_buf_limit,
 
-	TP_PROTO(struct sock *sk, struct proto *prot, long allocated),
+	TP_PROTO(struct sock *sk, struct proto *prot, long allocated, int kind),
 
-	TP_ARGS(sk, prot, allocated),
+	TP_ARGS(sk, prot, allocated, kind),
 
 	TP_STRUCT__entry(
 		__array(char, name, 32)
@@ -93,6 +101,10 @@
 		__field(long, allocated)
 		__field(int, sysctl_rmem)
 		__field(int, rmem_alloc)
+		__field(int, sysctl_wmem)
+		__field(int, wmem_alloc)
+		__field(int, wmem_queued)
+		__field(int, kind)
 	),
 
 	TP_fast_assign(
@@ -101,17 +113,25 @@
 		__entry->allocated = allocated;
 		__entry->sysctl_rmem = sk_get_rmem0(sk, prot);
 		__entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
+		__entry->sysctl_wmem = sk_get_wmem0(sk, prot);
+		__entry->wmem_alloc = refcount_read(&sk->sk_wmem_alloc);
+		__entry->wmem_queued = sk->sk_wmem_queued;
+		__entry->kind = kind;
 	),
 
-	TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld "
-		"sysctl_rmem=%d rmem_alloc=%d",
+	TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld sysctl_rmem=%d rmem_alloc=%d sysctl_wmem=%d wmem_alloc=%d wmem_queued=%d kind=%s",
 		__entry->name,
 		__entry->sysctl_mem[0],
 		__entry->sysctl_mem[1],
 		__entry->sysctl_mem[2],
 		__entry->allocated,
 		__entry->sysctl_rmem,
-		__entry->rmem_alloc)
+		__entry->rmem_alloc,
+		__entry->sysctl_wmem,
+		__entry->wmem_alloc,
+		__entry->wmem_queued,
+		show_skmem_kind_names(__entry->kind)
+	)
 );
 
 TRACE_EVENT(inet_sock_set_state,
diff --git a/net/core/sock.c b/net/core/sock.c
index bcc4182..65350ed 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2401,9 +2401,10 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
 {
 	struct proto *prot = sk->sk_prot;
 	long allocated = sk_memory_allocated_add(sk, amt);
+	bool charged = true;
 
 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
-	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
+	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
 		goto suppress_allocation;
 
 	/* Under limit. */
@@ -2461,7 +2462,8 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
 			return 1;
 	}
 
-	trace_sock_exceed_buf_limit(sk, prot, allocated);
+	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
+		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
 
 	sk_memory_allocated_sub(sk, amt);
 
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH bpf] bpf: hash_map: decrement counter on error
From: Mauricio Vasquez @ 2018-07-01 16:33 UTC (permalink / raw)
  To: Daniel Borkmann, Alexei Starovoitov; +Cc: netdev
In-Reply-To: <5f2040dc-f661-8346-c32e-5ba11ad93ae9@iogearbox.net>


On 06/30/2018 06:20 PM, Daniel Borkmann wrote:
> On 06/29/2018 02:48 PM, Mauricio Vasquez B wrote:
>> Decrement the number of elements in the map in case the allocation
>> of a new node fails.
>>
>> Signed-off-by: Mauricio Vasquez B <mauricio.vasquez@polito.it>
> Thanks for the fix, Mauricio!
>
> Could you reply with a Fixes: tag in order to track the commit originally
> introducing this bug?
>
> Thanks,
> Daniel
>

Sure Daniel,

Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements")

Thanks,
Mauricio

^ permalink raw reply

* [PATCH net-next] net: phy: realtek: add missing entry for RTL8211 to mdio_device_id table
From: Heiner Kallweit @ 2018-07-01 17:14 UTC (permalink / raw)
  To: David Miller, Andrew Lunn, Florian Fainelli; +Cc: netdev@vger.kernel.org

When adding support for RTL8211 I forgot to update the mdio_device_id
table.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Fixes: d241d4aac93f ("net: phy: realtek: add support for RTL8211")
---
 drivers/net/phy/realtek.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c
index 9757b162..7b516e5d 100644
--- a/drivers/net/phy/realtek.c
+++ b/drivers/net/phy/realtek.c
@@ -260,6 +260,7 @@ module_phy_driver(realtek_drvs);
 
 static struct mdio_device_id __maybe_unused realtek_tbl[] = {
 	{ 0x001cc816, 0x001fffff },
+	{ 0x001cc910, 0x001fffff },
 	{ 0x001cc912, 0x001fffff },
 	{ 0x001cc914, 0x001fffff },
 	{ 0x001cc915, 0x001fffff },
-- 
2.18.0

^ permalink raw reply related

* Re: [PATCH][next] netdevsim: fix sa_idx out of bounds check
From: Shannon Nelson @ 2018-07-01 19:21 UTC (permalink / raw)
  To: Colin King, Jakub Kicinski, David S . Miller, netdev
  Cc: kernel-janitors, linux-kernel
In-Reply-To: <20180630203924.5121-1-colin.king@canonical.com>

On 6/30/2018 1:39 PM, Colin King wrote:
> From: Colin Ian King <colin.king@canonical.com>
> 
> Currently if sa_idx is equal to NSIM_IPSEC_MAX_SA_COUNT then
> an out-of-bounds read on ipsec->sa will occur. Fix the
> incorrect bounds check by using >= rather than >.
> 
> Detected by CoverityScan, CID#1470226 ("Out-of-bounds-read")
> 
> Fixes: 7699353da875 ("netdevsim: add ipsec offload testing")
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
> ---
>   drivers/net/netdevsim/ipsec.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/netdevsim/ipsec.c b/drivers/net/netdevsim/ipsec.c
> index ceff544510b9..2dcf6cc269d0 100644
> --- a/drivers/net/netdevsim/ipsec.c
> +++ b/drivers/net/netdevsim/ipsec.c
> @@ -249,7 +249,7 @@ bool nsim_ipsec_tx(struct netdevsim *ns, struct sk_buff *skb)
>   	}
>   
>   	sa_idx = xs->xso.offload_handle & ~NSIM_IPSEC_VALID;
> -	if (unlikely(sa_idx > NSIM_IPSEC_MAX_SA_COUNT)) {
> +	if (unlikely(sa_idx >= NSIM_IPSEC_MAX_SA_COUNT)) {
>   		netdev_err(ns->netdev, "bad sa_idx=%d max=%d\n",
>   			   sa_idx, NSIM_IPSEC_MAX_SA_COUNT);
>   		return false;
> 

Good catch - thanks!

Acked-by: Shannon Nelson <shannon.nelson@oracle.com>

^ permalink raw reply

* [PATCH v5 net-next] net:sched: add action inheritdsfield to skbedit
From: Qiaobin Fu @ 2018-07-01 19:16 UTC (permalink / raw)
  To: davem
  Cc: marcelo.leitner, dcaratti, michel, netdev, jhs, xiyou.wangcong,
	qiaobinf

The new action inheritdsfield copies the field DS of
IPv4 and IPv6 packets into skb->priority. This enables
later classification of packets based on the DS field.

v5:
*Update the drop counter for TC_ACT_SHOT

v4:
*Not allow setting flags other than the expected ones.

*Allow dumping the pure flags.

v3:
*Use optional flags, so that it won't break old versions of tc.

*Allow users to set both SKBEDIT_F_PRIORITY and SKBEDIT_F_INHERITDSFIELD flags.

v2:
*Fix the style issue

*Move the code from skbmod to skbedit

Original idea by Jamal Hadi Salim <jhs@mojatatu.com>

Signed-off-by: Qiaobin Fu <qiaobinf@bu.edu>
Reviewed-by: Michel Machado <michel@digirati.com.br>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Davide Caratti <dcaratti@redhat.com>
---

Note that the motivation for this patch is found in the following discussion:
https://www.spinics.net/lists/netdev/msg501061.html
---
 include/uapi/linux/tc_act/tc_skbedit.h |  2 ++
 net/sched/act_skbedit.c                | 41 ++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/include/uapi/linux/tc_act/tc_skbedit.h b/include/uapi/linux/tc_act/tc_skbedit.h
index fbcfe27a4e6c..6de6071ebed6 100644
--- a/include/uapi/linux/tc_act/tc_skbedit.h
+++ b/include/uapi/linux/tc_act/tc_skbedit.h
@@ -30,6 +30,7 @@
 #define SKBEDIT_F_MARK			0x4
 #define SKBEDIT_F_PTYPE			0x8
 #define SKBEDIT_F_MASK			0x10
+#define SKBEDIT_F_INHERITDSFIELD	0x20
 
 struct tc_skbedit {
 	tc_gen;
@@ -45,6 +46,7 @@ enum {
 	TCA_SKBEDIT_PAD,
 	TCA_SKBEDIT_PTYPE,
 	TCA_SKBEDIT_MASK,
+	TCA_SKBEDIT_FLAGS,
 	__TCA_SKBEDIT_MAX
 };
 #define TCA_SKBEDIT_MAX (__TCA_SKBEDIT_MAX - 1)
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 6138d1d71900..dfaf5d8028dd 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -23,6 +23,9 @@
 #include <linux/rtnetlink.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/dsfield.h>
 
 #include <linux/tc_act/tc_skbedit.h>
 #include <net/tc_act/tc_skbedit.h>
@@ -41,6 +44,25 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
 
 	if (d->flags & SKBEDIT_F_PRIORITY)
 		skb->priority = d->priority;
+	if (d->flags & SKBEDIT_F_INHERITDSFIELD) {
+		int wlen = skb_network_offset(skb);
+
+		switch (tc_skb_protocol(skb)) {
+		case htons(ETH_P_IP):
+			wlen += sizeof(struct iphdr);
+			if (!pskb_may_pull(skb, wlen))
+				goto err;
+			skb->priority = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
+			break;
+
+		case htons(ETH_P_IPV6):
+			wlen += sizeof(struct ipv6hdr);
+			if (!pskb_may_pull(skb, wlen))
+				goto err;
+			skb->priority = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
+			break;
+		}
+	}
 	if (d->flags & SKBEDIT_F_QUEUE_MAPPING &&
 	    skb->dev->real_num_tx_queues > d->queue_mapping)
 		skb_set_queue_mapping(skb, d->queue_mapping);
@@ -53,6 +75,11 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
 
 	spin_unlock(&d->tcf_lock);
 	return d->tcf_action;
+
+err:
+	d->tcf_qstats.drops++;
+	spin_unlock(&d->tcf_lock);
+	return TC_ACT_SHOT;
 }
 
 static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
@@ -62,6 +89,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
 	[TCA_SKBEDIT_MARK]		= { .len = sizeof(u32) },
 	[TCA_SKBEDIT_PTYPE]		= { .len = sizeof(u16) },
 	[TCA_SKBEDIT_MASK]		= { .len = sizeof(u32) },
+	[TCA_SKBEDIT_FLAGS]		= { .len = sizeof(u64) },
 };
 
 static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
@@ -114,6 +142,13 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 		mask = nla_data(tb[TCA_SKBEDIT_MASK]);
 	}
 
+	if (tb[TCA_SKBEDIT_FLAGS] != NULL) {
+		u64 *pure_flags = nla_data(tb[TCA_SKBEDIT_FLAGS]);
+
+		if (*pure_flags & SKBEDIT_F_INHERITDSFIELD)
+			flags |= SKBEDIT_F_INHERITDSFIELD;
+	}
+
 	parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
 
 	exists = tcf_idr_check(tn, parm->index, a, bind);
@@ -178,6 +213,7 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
 		.action  = d->tcf_action,
 	};
 	struct tcf_t t;
+	u64 pure_flags = 0;
 
 	if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt))
 		goto nla_put_failure;
@@ -196,6 +232,11 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
 	if ((d->flags & SKBEDIT_F_MASK) &&
 	    nla_put_u32(skb, TCA_SKBEDIT_MASK, d->mask))
 		goto nla_put_failure;
+	if (d->flags & SKBEDIT_F_INHERITDSFIELD)
+		pure_flags |= SKBEDIT_F_INHERITDSFIELD;
+	if (pure_flags != 0 &&
+	    nla_put(skb, TCA_SKBEDIT_FLAGS, sizeof(pure_flags), &pure_flags))
+		goto nla_put_failure;
 
 	tcf_tm_dump(&t, &d->tcf_tm);
 	if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD))
-- 
2.17.1

^ permalink raw reply related

* Compiler warnings in kernel 4.14.51
From: Enrico Mioso @ 2018-07-01 20:35 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Daniel Borkmann, Kirill Tkhai, Jakub Kicinski,
	Alexei Starovoitov, Rasmus Villemoes, John Fastabend,
	Jesper Dangaard Brouer, David Ahern

Hello!

While compiling kernel 4.14.51 I got the following warnings:
CC      net/core/dev.o
net/core/dev.c: In function 'validate_xmit_skb_list':
net/core/dev.c:3121:15: warning: 'tail' may be used uninitialized in this function [-Wmaybe-uninitialized]
...

CC      net/ipv4/fib_trie.o
net/ipv4/fib_trie.c: In function 'fib_trie_unmerge':
net/ipv4/fib_trie.c:1749:8: warning: 'local_tp' may be used uninitialized in this function [-Wmaybe-uninitialized]

The kernel has been compiled for the MIPS architecture, little-endian 32-bit.
My /prov/version file content is:
Linux version 4.14.51 (mrkiko@mStation) (gcc version 7.3.0 (OpenWrt GCC 7.3.0 r7360-e15565a01c)) #0 Fri Jun 29 05:54:17 2018

thank you very much to all of you for your great work.

Enrico

^ permalink raw reply

* [PATCH] TTY: isdn: Replace strncpy with memcpy
From: Guenter Roeck @ 2018-07-01 20:57 UTC (permalink / raw)
  To: Karsten Keil; +Cc: Kees Cook, linux-kernel, netdev, Guenter Roeck

gcc 8.1.0 complains:

drivers/isdn/i4l/isdn_tty.c: In function 'isdn_tty_suspend.isra.1':
drivers/isdn/i4l/isdn_tty.c:790:3: warning:
	'strncpy' output truncated before terminating nul copying
	as many bytes from a string as its length
drivers/isdn/i4l/isdn_tty.c:778:6: note: length computed here

drivers/isdn/i4l/isdn_tty.c: In function 'isdn_tty_resume':
drivers/isdn/i4l/isdn_tty.c:880:3: warning:
	'strncpy' output truncated before terminating nul copying
	as many bytes from a string as its length
drivers/isdn/i4l/isdn_tty.c:817:6: note: length computed here

Using strncpy() is indeed less than perfect since the length of data to
be copied has already been determined with strlen(). Replace strncpy()
with memcpy() to address the warning and optimize the code a little.

Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 drivers/isdn/i4l/isdn_tty.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c
index 960f26348bb5..b730037a0e2d 100644
--- a/drivers/isdn/i4l/isdn_tty.c
+++ b/drivers/isdn/i4l/isdn_tty.c
@@ -787,7 +787,7 @@ isdn_tty_suspend(char *id, modem_info *info, atemu *m)
 		cmd.parm.cmsg.para[3] = 4; /* 16 bit 0x0004 Suspend */
 		cmd.parm.cmsg.para[4] = 0;
 		cmd.parm.cmsg.para[5] = l;
-		strncpy(&cmd.parm.cmsg.para[6], id, l);
+		memcpy(&cmd.parm.cmsg.para[6], id, l);
 		cmd.command = CAPI_PUT_MESSAGE;
 		cmd.driver = info->isdn_driver;
 		cmd.arg = info->isdn_channel;
@@ -877,7 +877,7 @@ isdn_tty_resume(char *id, modem_info *info, atemu *m)
 		cmd.parm.cmsg.para[3] = 5; /* 16 bit 0x0005 Resume */
 		cmd.parm.cmsg.para[4] = 0;
 		cmd.parm.cmsg.para[5] = l;
-		strncpy(&cmd.parm.cmsg.para[6], id, l);
+		memcpy(&cmd.parm.cmsg.para[6], id, l);
 		cmd.command = CAPI_PUT_MESSAGE;
 		info->dialing = 1;
 //		strcpy(dev->num[i], n);
-- 
2.7.4

^ permalink raw reply related

* Reminder: Linux Plumbers Networking Track CFP
From: David Miller @ 2018-07-02  0:10 UTC (permalink / raw)
  To: netdev; +Cc: linux-wireless, netfilter-devel

The deadline is only 10 days away, and slots are filling up quickly.

This is a call for proposals for the networking track at the 2018
edition of the Linux Plumbers Conference which will be held in
Vancouver on November 13th and November 14th.

The LPC Networking Track is a community event, open to everyone, and
does not require an invitation.

We are seeking talks of 40 minutes in length, accompanied by papers of
2 to 10 pages in length.

Although proposals on finished work are perfectly acceptable, there is
even more value for talks on problems, proposals, and proof-of-concept
solutions that require face-to-face discussions and debate.

Please submit your proposals to the LPC Networking Technical Committee
at:

	lpc-netdev@vger.kernel.org

Proposals must be submitted by July 11th, and submitters will be
notified of acceptance by August 15th.

The format of the submission and other details can be found at:

	http://vger.kernel.org/lpc-networking.html

We are looking forward to seeing everyone in November!

^ permalink raw reply

* Re: [net-next PATCH v6 0/7] Symmetric queue selection using XPS for Rx queues
From: David Miller @ 2018-07-02  0:11 UTC (permalink / raw)
  To: amritha.nambiar
  Cc: netdev, alexander.h.duyck, willemdebruijn.kernel,
	sridhar.samudrala, alexander.duyck, edumazet, hannes, tom, tom
In-Reply-To: <153033254062.8297.3821413056768389263.stgit@anamhost.jf.intel.com>

From: Amritha Nambiar <amritha.nambiar@intel.com>
Date: Fri, 29 Jun 2018 21:26:35 -0700

> This patch series implements support for Tx queue selection based on
> Rx queue(s) map. This is done by configuring Rx queue(s) map per Tx-queue
> using sysfs attribute. If the user configuration for Rx queues does
> not apply, then the Tx queue selection falls back to XPS using CPUs and
> finally to hashing.
 ...

Series applied, thanks.

^ permalink raw reply

* linux-next: manual merge of the net-next tree with the net tree
From: Stephen Rothwell @ 2018-07-02  0:15 UTC (permalink / raw)
  To: David Miller, Networking
  Cc: Linux-Next Mailing List, Linux Kernel Mailing List, Jose Abreu

[-- Attachment #1: Type: text/plain, Size: 3545 bytes --]

Hi all,

Today's linux-next merge of the net-next tree got conflicts in:

  drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
  drivers/net/ethernet/stmicro/stmmac/hwif.h

between commit:

  4205c88eaf17 ("net: stmmac: Set DMA buffer size in HW")

from the net tree and commit:

  1f705bc61aee ("net: stmmac: Add support for CBS QDISC")

from the net-next tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
index 65bc3556bd8f,6e32f8a3710b..000000000000
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
@@@ -407,16 -407,19 +407,29 @@@ static void dwmac4_enable_tso(void __io
  	}
  }
  
 +static void dwmac4_set_bfsize(void __iomem *ioaddr, int bfsize, u32 chan)
 +{
 +	u32 value = readl(ioaddr + DMA_CHAN_RX_CONTROL(chan));
 +
 +	value &= ~DMA_RBSZ_MASK;
 +	value |= (bfsize << DMA_RBSZ_SHIFT) & DMA_RBSZ_MASK;
 +
 +	writel(value, ioaddr + DMA_CHAN_RX_CONTROL(chan));
 +}
 +
+ static void dwmac4_qmode(void __iomem *ioaddr, u32 channel, u8 qmode)
+ {
+ 	u32 mtl_tx_op = readl(ioaddr + MTL_CHAN_TX_OP_MODE(channel));
+ 
+ 	mtl_tx_op &= ~MTL_OP_MODE_TXQEN_MASK;
+ 	if (qmode != MTL_QUEUE_AVB)
+ 		mtl_tx_op |= MTL_OP_MODE_TXQEN;
+ 	else
+ 		mtl_tx_op |= MTL_OP_MODE_TXQEN_AV;
+ 
+ 	writel(mtl_tx_op, ioaddr +  MTL_CHAN_TX_OP_MODE(channel));
+ }
+ 
  const struct stmmac_dma_ops dwmac4_dma_ops = {
  	.reset = dwmac4_dma_reset,
  	.init = dwmac4_dma_init,
@@@ -441,7 -444,7 +454,8 @@@
  	.set_rx_tail_ptr = dwmac4_set_rx_tail_ptr,
  	.set_tx_tail_ptr = dwmac4_set_tx_tail_ptr,
  	.enable_tso = dwmac4_enable_tso,
 +	.set_bfsize = dwmac4_set_bfsize,
+ 	.qmode = dwmac4_qmode,
  };
  
  const struct stmmac_dma_ops dwmac410_dma_ops = {
@@@ -468,5 -471,5 +482,6 @@@
  	.set_rx_tail_ptr = dwmac4_set_rx_tail_ptr,
  	.set_tx_tail_ptr = dwmac4_set_tx_tail_ptr,
  	.enable_tso = dwmac4_enable_tso,
 +	.set_bfsize = dwmac4_set_bfsize,
+ 	.qmode = dwmac4_qmode,
  };
diff --cc drivers/net/ethernet/stmicro/stmmac/hwif.h
index fe8b536b13f8,e2a965790648..000000000000
--- a/drivers/net/ethernet/stmicro/stmmac/hwif.h
+++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h
@@@ -183,7 -183,7 +183,8 @@@ struct stmmac_dma_ops 
  	void (*set_rx_tail_ptr)(void __iomem *ioaddr, u32 tail_ptr, u32 chan);
  	void (*set_tx_tail_ptr)(void __iomem *ioaddr, u32 tail_ptr, u32 chan);
  	void (*enable_tso)(void __iomem *ioaddr, bool en, u32 chan);
 +	void (*set_bfsize)(void __iomem *ioaddr, int bfsize, u32 chan);
+ 	void (*qmode)(void __iomem *ioaddr, u32 channel, u8 qmode);
  };
  
  #define stmmac_reset(__priv, __args...) \
@@@ -236,8 -236,8 +237,10 @@@
  	stmmac_do_void_callback(__priv, dma, set_tx_tail_ptr, __args)
  #define stmmac_enable_tso(__priv, __args...) \
  	stmmac_do_void_callback(__priv, dma, enable_tso, __args)
 +#define stmmac_set_dma_bfsize(__priv, __args...) \
 +	stmmac_do_void_callback(__priv, dma, set_bfsize, __args)
+ #define stmmac_dma_qmode(__priv, __args...) \
+ 	stmmac_do_void_callback(__priv, dma, qmode, __args)
  
  struct mac_device_info;
  struct net_device;

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* linux-next: manual merge of the net-next tree with the rdma tree
From: Stephen Rothwell @ 2018-07-02  0:21 UTC (permalink / raw)
  To: David Miller, Networking, Doug Ledford, Jason Gunthorpe
  Cc: Linux-Next Mailing List, Linux Kernel Mailing List, Parav Pandit

[-- Attachment #1: Type: text/plain, Size: 3302 bytes --]

Hi all,

Today's linux-next merge of the net-next tree got a conflict in:

  net/smc/smc_ib.c

between commit:

  ddb457c6993b ("net/smc: Replace ib_query_gid with rdma_get_gid_attr")

from the rdma tree and commit:

  be6a3f38ff2a ("net/smc: determine port attributes independent from pnet table")

from the net-next tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc net/smc/smc_ib.c
index 74f29f814ec1,36de2fd76170..000000000000
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@@ -144,6 -143,62 +144,66 @@@ out
  	return rc;
  }
  
+ static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
+ {
 -	struct ib_gid_attr gattr;
 -	int rc;
 -
 -	rc = ib_query_gid(smcibdev->ibdev, ibport, 0,
 -			  &smcibdev->gid[ibport - 1], &gattr);
 -	if (rc || !gattr.ndev)
 -		return -ENODEV;
++	const struct ib_gid_attr *gattr;
++	int rc = 0;
+ 
 -	memcpy(smcibdev->mac[ibport - 1], gattr.ndev->dev_addr, ETH_ALEN);
 -	dev_put(gattr.ndev);
 -	return 0;
++	gattr = rdma_get_gid_attr(smcibdev->ibdev, ibport, 0);
++	if (IS_ERR(gattr))
++		return PTR_ERR(gattr);
++	if (!gattr->ndev) {
++		rc = -ENODEV;
++		goto done;
++	}
++	smcibdev->gid[ibport - 1] = gattr->gid;
++	memcpy(smcibdev->mac[ibport - 1], gattr->ndev->dev_addr, ETH_ALEN);
++done:
++	rdma_put_gid_attr(gattr);
++	return rc;
+ }
+ 
+ /* Create an identifier unique for this instance of SMC-R.
+  * The MAC-address of the first active registered IB device
+  * plus a random 2-byte number is used to create this identifier.
+  * This name is delivered to the peer during connection initialization.
+  */
+ static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
+ 						u8 ibport)
+ {
+ 	memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
+ 	       sizeof(smcibdev->mac[ibport - 1]));
+ 	get_random_bytes(&local_systemid[0], 2);
+ }
+ 
+ bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
+ {
+ 	return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
+ }
+ 
+ static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
+ {
+ 	int rc;
+ 
+ 	memset(&smcibdev->pattr[ibport - 1], 0,
+ 	       sizeof(smcibdev->pattr[ibport - 1]));
+ 	rc = ib_query_port(smcibdev->ibdev, ibport,
+ 			   &smcibdev->pattr[ibport - 1]);
+ 	if (rc)
+ 		goto out;
+ 	/* the SMC protocol requires specification of the RoCE MAC address */
+ 	rc = smc_ib_fill_gid_and_mac(smcibdev, ibport);
+ 	if (rc)
+ 		goto out;
+ 	if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET,
+ 		     sizeof(local_systemid)) &&
+ 	    smc_ib_port_active(smcibdev, ibport))
+ 		/* create unique system identifier */
+ 		smc_ib_define_local_systemid(smcibdev, ibport);
+ out:
+ 	return rc;
+ }
+ 
  /* process context wrapper for might_sleep smc_ib_remember_port_attr */
  static void smc_ib_port_event_work(struct work_struct *work)
  {

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* Re: [PATCH] lib: rhashtable: Correct self-assignment in rhashtable.c
From: NeilBrown @ 2018-07-02  1:32 UTC (permalink / raw)
  To: Linux Kernel Network Developers, Rishabh Bhatnagar, tgraf,
	linux-arm-msm
  Cc: herbert, linux-kernel, Rishabh Bhatnagar
In-Reply-To: <1530493906-19711-1-git-send-email-rishabhb@codeaurora.org>

[-- Attachment #1: Type: text/plain, Size: 1261 bytes --]


(added netdev@vger.kernel.org)

On Sun, Jul 01 2018, Rishabh Bhatnagar wrote:

> In file lib/rhashtable.c line 777, skip variable is assigned to
> itself. The following error was observed:
>
> lib/rhashtable.c:777:41: warning: explicitly assigning value of
> variable of type 'int' to itself [-Wself-assign] error, forbidden
> warning: rhashtable.c:777
> This error was found when compiling with Clang 6.0. Change it to iter->skip.
>
> Change-Id: I5abd1ce5ba76737a73bd6eca94b07b1bd5267523
> Signed-off-by: Rishabh Bhatnagar <rishabhb@codeaurora.org>

Thanks for catching that!
Reviewed-by: NeilBrown <neilb@suse.com>

Thanks,
NeilBrown

> ---
>  lib/rhashtable.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/lib/rhashtable.c b/lib/rhashtable.c
> index 9427b57..3109b2e 100644
> --- a/lib/rhashtable.c
> +++ b/lib/rhashtable.c
> @@ -774,7 +774,7 @@ int rhashtable_walk_start_check(struct rhashtable_iter *iter)
>  				skip++;
>  				if (list == iter->list) {
>  					iter->p = p;
> -					skip = skip;
> +					iter->skip = skip;
>  					goto found;
>  				}
>  			}
> -- 
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> a Linux Foundation Collaborative Project

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* Re: [net-next 01/12] net/mlx5e: Add UDP GSO support
From: Willem de Bruijn @ 2018-07-02  1:45 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: borisp, David Miller, Network Development, Saeed Mahameed,
	ogerlitz, yossiku
In-Reply-To: <CAKgT0UfFMG+i9z_nxB0vkJesDE4CHWbudCh6kJ_1+=Vd1hv=wA@mail.gmail.com>

>> I've noticed that we could get cleaner code in our driver if we remove
>> these two lines from net/ipv4/udp_offload.c:
>> if (skb_is_gso(segs))
>>               mss *= skb_shinfo(segs)->gso_segs;
>>
>> I think that this is correct in case of GSO_PARTIAL segmentation for the
>> following reasons:
>> 1. After this change the UDP payload field is consistent with the IP
>> header payload length field. Currently, IPv4 length is 1500 and UDP
>> total length is the full unsegmented length.

How does this simplify the driver? Does it currently have to
change the udph->length field to the mss on the wire, because the
device only splits + replicates the headers + computes the csum?

> I don’t recall that the UDP header length field will match the IP length
> field. I had intentionally left it at the original value used to compute
> the UDP header checksum. That way you could just adjust it by
> cancelling out the length from the partial checksum.

>> 2. AFAIU, in GSO_PARTIAL no tunnel headers should be modified except the
>> IP ID field, including the UDP length field.
>> What do you think?
>
>
> For outer headers this is correct. For inner headers this is not. The inner
> UDP header will need to have the length updated and the checksum
> recomputed as a part of the segmentation.

^ permalink raw reply

* Re: [PATCH net-next v3 1/4] net: vhost: lock the vqs one by one
From: Jason Wang @ 2018-07-02  2:21 UTC (permalink / raw)
  To: xiangxia.m.yue
  Cc: mst, makita.toshiaki, virtualization, netdev, Tonghao Zhang
In-Reply-To: <1530340438-3039-2-git-send-email-xiangxia.m.yue@gmail.com>



On 2018年06月30日 14:33, xiangxia.m.yue@gmail.com wrote:
> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>
> This patch changes the way that lock all vqs
> at the same, to lock them one by one. It will
> be used for next patch to avoid the deadlock.
>
> Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
> ---
>   drivers/vhost/vhost.c | 24 +++++++-----------------
>   1 file changed, 7 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 895eaa2..4ca9383 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -294,8 +294,11 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
>   {
>   	int i;
>   
> -	for (i = 0; i < d->nvqs; ++i)
> +	for (i = 0; i < d->nvqs; ++i) {
> +		mutex_lock(&d->vqs[i]->mutex);
>   		__vhost_vq_meta_reset(d->vqs[i]);
> +		mutex_unlock(&d->vqs[i]->mutex);
> +	}
>   }
>   
>   static void vhost_vq_reset(struct vhost_dev *dev,
> @@ -887,20 +890,6 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
>   #define vhost_get_used(vq, x, ptr) \
>   	vhost_get_user(vq, x, ptr, VHOST_ADDR_USED)
>   
> -static void vhost_dev_lock_vqs(struct vhost_dev *d)
> -{
> -	int i = 0;
> -	for (i = 0; i < d->nvqs; ++i)
> -		mutex_lock_nested(&d->vqs[i]->mutex, i);
> -}
> -
> -static void vhost_dev_unlock_vqs(struct vhost_dev *d)
> -{
> -	int i = 0;
> -	for (i = 0; i < d->nvqs; ++i)
> -		mutex_unlock(&d->vqs[i]->mutex);
> -}
> -
>   static int vhost_new_umem_range(struct vhost_umem *umem,
>   				u64 start, u64 size, u64 end,
>   				u64 userspace_addr, int perm)
> @@ -950,7 +939,10 @@ static void vhost_iotlb_notify_vq(struct vhost_dev *d,
>   		if (msg->iova <= vq_msg->iova &&
>   		    msg->iova + msg->size - 1 > vq_msg->iova &&
>   		    vq_msg->type == VHOST_IOTLB_MISS) {
> +			mutex_lock(&node->vq->mutex);
>   			vhost_poll_queue(&node->vq->poll);
> +			mutex_unlock(&node->vq->mutex);
> +
>   			list_del(&node->node);
>   			kfree(node);
>   		}
> @@ -982,7 +974,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
>   	int ret = 0;
>   
>   	mutex_lock(&dev->mutex);
> -	vhost_dev_lock_vqs(dev);
>   	switch (msg->type) {
>   	case VHOST_IOTLB_UPDATE:
>   		if (!dev->iotlb) {
> @@ -1016,7 +1007,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
>   		break;
>   	}
>   
> -	vhost_dev_unlock_vqs(dev);
>   	mutex_unlock(&dev->mutex);
>   
>   	return ret;

Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>

Thanks

^ permalink raw reply

* Re: [PATCH net-next v3 2/4] net: vhost: replace magic number of lock annotation
From: Jason Wang @ 2018-07-02  2:21 UTC (permalink / raw)
  To: xiangxia.m.yue; +Cc: netdev, virtualization, Tonghao Zhang, mst
In-Reply-To: <1530340438-3039-3-git-send-email-xiangxia.m.yue@gmail.com>



On 2018年06月30日 14:33, xiangxia.m.yue@gmail.com wrote:
> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>
> Use the VHOST_NET_VQ_XXX as a subclass for mutex_lock_nested.
>
> Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
> ---
>   drivers/vhost/net.c | 6 +++---
>   1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index e7cf7d2..62bb8e8 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -484,7 +484,7 @@ static void handle_tx(struct vhost_net *net)
>   	bool zcopy, zcopy_used;
>   	int sent_pkts = 0;
>   
> -	mutex_lock(&vq->mutex);
> +	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
>   	sock = vq->private_data;
>   	if (!sock)
>   		goto out;
> @@ -655,7 +655,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
>   		/* Flush batched heads first */
>   		vhost_rx_signal_used(rvq);
>   		/* Both tx vq and rx socket were polled here */
> -		mutex_lock_nested(&vq->mutex, 1);
> +		mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
>   		vhost_disable_notify(&net->dev, vq);
>   
>   		preempt_disable();
> @@ -789,7 +789,7 @@ static void handle_rx(struct vhost_net *net)
>   	__virtio16 num_buffers;
>   	int recv_pkts = 0;
>   
> -	mutex_lock_nested(&vq->mutex, 0);
> +	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
>   	sock = vq->private_data;
>   	if (!sock)
>   		goto out;

Acked-by: Jason Wang <jasowang@redhat.com>

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next v3 3/4] net: vhost: factor out busy polling logic to vhost_net_busy_poll()
From: Jason Wang @ 2018-07-02  2:29 UTC (permalink / raw)
  To: xiangxia.m.yue; +Cc: netdev, virtualization, Tonghao Zhang, mst
In-Reply-To: <1530340438-3039-4-git-send-email-xiangxia.m.yue@gmail.com>



On 2018年06月30日 14:33, xiangxia.m.yue@gmail.com wrote:
> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>
> Factor out generic busy polling logic and will be
> used for tx path in the next patch. And with the patch,
> qemu can set differently the busyloop_timeout for rx queue.
>
> Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
> ---
>   drivers/vhost/net.c | 92 ++++++++++++++++++++++++++++++-----------------------
>   1 file changed, 53 insertions(+), 39 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 62bb8e8..458f81d 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -429,6 +429,50 @@ static int vhost_net_enable_vq(struct vhost_net *n,
>   	return vhost_poll_start(poll, sock->file);
>   }
>   
> +static int sk_has_rx_data(struct sock *sk)
> +{
> +	struct socket *sock = sk->sk_socket;
> +
> +	if (sock->ops->peek_len)
> +		return sock->ops->peek_len(sock);
> +
> +	return skb_queue_empty(&sk->sk_receive_queue);
> +}
> +
> +static void vhost_net_busy_poll(struct vhost_net *net,
> +				struct vhost_virtqueue *rvq,
> +				struct vhost_virtqueue *tvq,
> +				bool rx)
> +{
> +	unsigned long uninitialized_var(endtime);
> +	struct socket *sock = rvq->private_data;
> +	struct vhost_virtqueue *vq = rx ? tvq : rvq;
> +	unsigned long busyloop_timeout = rx ? rvq->busyloop_timeout :
> +					      tvq->busyloop_timeout;

As simple as vq->busyloop_timeout?

> +
> +	mutex_lock_nested(&vq->mutex, rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);

We need move sock = rvq->private_data under the protection of vq mutex 
if rx is false.

> +	vhost_disable_notify(&net->dev, vq);
> +
> +	preempt_disable();
> +	endtime = busy_clock() + busyloop_timeout;
> +	while (vhost_can_busy_poll(tvq->dev, endtime) &&
> +	       !(sock && sk_has_rx_data(sock->sk)) &&
> +	       vhost_vq_avail_empty(tvq->dev, tvq))
> +		cpu_relax();
> +	preempt_enable();
> +
> +	if ((rx && !vhost_vq_avail_empty(&net->dev, vq)) ||
> +	    (!rx && (sock && sk_has_rx_data(sock->sk)))) {
> +		vhost_poll_queue(&vq->poll);
> +	} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> +		vhost_disable_notify(&net->dev, vq);
> +		vhost_poll_queue(&vq->poll);
> +	}
> +
> +	mutex_unlock(&vq->mutex);
> +}
> +
> +
>   static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
>   				    struct vhost_virtqueue *vq,
>   				    struct iovec iov[], unsigned int iov_size,
> @@ -621,16 +665,6 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
>   	return len;
>   }
>   
> -static int sk_has_rx_data(struct sock *sk)
> -{
> -	struct socket *sock = sk->sk_socket;
> -
> -	if (sock->ops->peek_len)
> -		return sock->ops->peek_len(sock);
> -
> -	return skb_queue_empty(&sk->sk_receive_queue);
> -}
> -
>   static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
>   {
>   	struct vhost_virtqueue *vq = &nvq->vq;
> @@ -645,39 +679,19 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
>   
>   static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
>   {
> -	struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
> -	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
> -	struct vhost_virtqueue *vq = &nvq->vq;
> -	unsigned long uninitialized_var(endtime);
> -	int len = peek_head_len(rvq, sk);
> +	struct vhost_net_virtqueue *nvq_rx = &net->vqs[VHOST_NET_VQ_RX];
> +	struct vhost_net_virtqueue *nvq_tx = &net->vqs[VHOST_NET_VQ_TX];

It looks to me rnvq and tnvq is slightly better.

Other looks good to me.

Thanks

>   
> -	if (!len && vq->busyloop_timeout) {
> -		/* Flush batched heads first */
> -		vhost_rx_signal_used(rvq);
> -		/* Both tx vq and rx socket were polled here */
> -		mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
> -		vhost_disable_notify(&net->dev, vq);
> +	int len = peek_head_len(nvq_rx, sk);
>   
> -		preempt_disable();
> -		endtime = busy_clock() + vq->busyloop_timeout;
> -
> -		while (vhost_can_busy_poll(&net->dev, endtime) &&
> -		       !sk_has_rx_data(sk) &&
> -		       vhost_vq_avail_empty(&net->dev, vq))
> -			cpu_relax();
> -
> -		preempt_enable();
> -
> -		if (!vhost_vq_avail_empty(&net->dev, vq))
> -			vhost_poll_queue(&vq->poll);
> -		else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> -			vhost_disable_notify(&net->dev, vq);
> -			vhost_poll_queue(&vq->poll);
> -		}
> +	if (!len && nvq_rx->vq.busyloop_timeout) {
> +		/* Flush batched heads first */
> +		vhost_rx_signal_used(nvq_rx);
>   
> -		mutex_unlock(&vq->mutex);
> +		/* Both tx vq and rx socket were polled here */
> +		vhost_net_busy_poll(net, &nvq_rx->vq, &nvq_tx->vq, true);
>   
> -		len = peek_head_len(rvq, sk);
> +		len = peek_head_len(nvq_rx, sk);
>   	}
>   
>   	return len;

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next v3 4/4] net: vhost: add rx busy polling in tx path
From: Jason Wang @ 2018-07-02  2:32 UTC (permalink / raw)
  To: xiangxia.m.yue; +Cc: netdev, virtualization, Tonghao Zhang, mst
In-Reply-To: <1530340438-3039-5-git-send-email-xiangxia.m.yue@gmail.com>



On 2018年06月30日 14:33, xiangxia.m.yue@gmail.com wrote:
> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>
> This patch improves the guest receive and transmit performance.
> On the handle_tx side, we poll the sock receive queue at the
> same time. handle_rx do that in the same way.
>
> We set the poll-us=100us and use the iperf3 to test
> its bandwidth, use the netperf to test throughput and mean
> latency. When running the tests, the vhost-net kthread of
> that VM, is alway 100% CPU. The commands are shown as below.
>
> iperf3  -s -D
> iperf3  -c IP -i 1 -P 1 -t 20 -M 1400
>
> or
> netserver
> netperf -H IP -t TCP_RR -l 20 -- -O "THROUGHPUT,MEAN_LATENCY"
>
> host -> guest:
> iperf3:
> * With the patch:     27.0 Gbits/sec
> * Without the patch:  14.4 Gbits/sec
>
> netperf (TCP_RR):
> * With the patch:     48039.56 trans/s, 20.64us mean latency
> * Without the patch:  46027.07 trans/s, 21.58us mean latency
>
> This patch also improves the guest transmit performance.
>
> guest -> host:
> iperf3:
> * With the patch:     27.2 Gbits/sec
> * Without the patch:  24.4 Gbits/sec
>
> netperf (TCP_RR):
> * With the patch:     47963.25 trans/s, 20.71us mean latency
> * Without the patch:  45796.70 trans/s, 21.68us mean latency
>
> Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
> ---
>   drivers/vhost/net.c | 10 +++-------
>   1 file changed, 3 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 458f81d..fb43d82 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -478,17 +478,13 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
>   				    struct iovec iov[], unsigned int iov_size,
>   				    unsigned int *out_num, unsigned int *in_num)
>   {
> -	unsigned long uninitialized_var(endtime);
> +	struct vhost_net_virtqueue *nvq_rx = &net->vqs[VHOST_NET_VQ_RX];
>   	int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
>   				  out_num, in_num, NULL, NULL);
>   
>   	if (r == vq->num && vq->busyloop_timeout) {
> -		preempt_disable();
> -		endtime = busy_clock() + vq->busyloop_timeout;
> -		while (vhost_can_busy_poll(vq->dev, endtime) &&
> -		       vhost_vq_avail_empty(vq->dev, vq))
> -			cpu_relax();
> -		preempt_enable();
> +		vhost_net_busy_poll(net, &nvq_rx->vq, vq, false);
> +
>   		r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
>   				      out_num, in_num, NULL, NULL);
>   	}

Looks good to me.

A nit is "rnvq" looks better.

Thanks

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Jason Wang @ 2018-07-02  2:41 UTC (permalink / raw)
  To: Michael S. Tsirkin, Toshiaki Makita; +Cc: netdev, kvm, virtualization
In-Reply-To: <20180629193448-mutt-send-email-mst@kernel.org>

On 2018年06月30日 00:38, Michael S. Tsirkin wrote:
> On Fri, Jun 29, 2018 at 05:09:50PM +0900, Toshiaki Makita wrote:
>> Under heavy load vhost busypoll may run without suppressing
>> notification. For example tx zerocopy callback can push tx work while
>> handle_tx() is running, then busyloop exits due to vhost_has_work()
>> condition and enables notification but immediately reenters handle_tx()
>> because the pushed work was tx. In this case handle_tx() tries to
>> disable notification again, but when using event_idx it by design
>> cannot. Then busyloop will run without suppressing notification.
>> Another example is the case where handle_tx() tries to enable
>> notification but avail idx is advanced so disables it again. This case
>> also lead to the same situation with event_idx.
>>
>> The problem is that once we enter this situation busyloop does not work
>> under heavy load for considerable amount of time, because notification
>> is likely to happen during busyloop and handle_tx() immediately enables
>> notification after notification happens. Specifically busyloop detects
>> notification by vhost_has_work() and then handle_tx() calls
>> vhost_enable_notify().
> I'd like to understand the problem a bit better.
> Why does this happen?
> Doesn't this only happen if ring is empty?
>

My understanding is:

vhost_zerocopy_callback() try to poll vhost virtqueue. This will cause 
the busy loop in vhost_net_tx_get_vq_desc() to exit because of 
vhost_has_work() return true. Then handle_tx() tends to enable 
notification. Then guest may kick us even if handle_tx() call 
vhost_disable_notify() which in fact did nothing for even index.

Maybe we can try to call vhost_zerocopy_signal_used() if we found 
there's pending used from zerocopy instead.

Thanks

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Toshiaki Makita @ 2018-07-02  2:45 UTC (permalink / raw)
  To: Jason Wang, Michael S. Tsirkin; +Cc: netdev, Tonghao Zhang, kvm, virtualization
In-Reply-To: <62908af8-ae65-c8d3-6f37-941471e0780e@redhat.com>

Hi Jason,

On 2018/06/29 18:30, Jason Wang wrote:
> On 2018年06月29日 16:09, Toshiaki Makita wrote:
...
>> To fix this, poll the work instead of enabling notification when
>> busypoll is interrupted by something. IMHO signal_pending() and
>> vhost_has_work() are kind of interruptions rather than signals to
>> completely cancel the busypoll, so let's run busypoll after the
>> necessary work is done. In order to avoid too long busyloop due to
>> interruption, save the endtime in vq field and use it when reentering
>> the work function.
> 
> I think we don't care long busyloop unless e.g tx can starve rx?

I just want to keep it user-controllable. Unless memorizing it busypoll
can run unexpectedly long.

>> I observed this problem makes tx performance poor but does not rx. I
>> guess it is because rx notification from socket is not that heavy so
>> did not impact the performance even when we failed to mask the
>> notification.
> 
> I think the reason is:
> 
> For tx, we may only process used ring under handle_tx(). Busy polling
> code can not recognize this case.
> For rx, we call peek_head_len() after exiting busy loop, so if the work
> is for rx, it can be processed in handle_rx() immediately.

Sorry but I don't see the difference. Tx busypoll calls
vhost_get_vq_desc() immediately after busypoll exits in
vhost_net_tx_get_vq_desc().

The scenario I described above (in 2nd paragraph) is a bit simplified.
To be clearer what I think is happening is:

1. handle_tx() runs busypoll with notification enabled (due to zerocopy
   callback or something).
2. Guest produces a packet in vring.
3. handle_tx() busypoll detects the produced packet and get it.
4. While vhost is processing the packet, guest kicks vring because it
   produced the packet. Vring notification is disabled automatically by
   event_idx and tx work is queued.
5. After processing the packet vhost enters busyloop again, but detects
   tx work and exits busyloop immediately. Then handle_tx() exits after
   enabling the notification.
6. Because the queued work was tx, handle_tx() is called immediately
   again. handle_tx() runs busypoll with notification enabled.
7. Repeat 2-6.

> 
>> Anyway for consistency I fixed rx routine as well as tx.
>>
>> Performance numbers:
>>
>> - Bulk transfer from guest to external physical server.
>>      [Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)--> [Server]
> 
> Just to confirm in this case since zerocopy is enabled, we are in fact
> use the generic XDP datapath?

For some reason zerocopy was not applied for most packets, so in most
cases driver XDP was used. I was going to dig into it but not yet.

> 
>> - Set 10us busypoll.
>> - Guest disables checksum and TSO because of host XDP.
>> - Measured single flow Mbps by netperf, and kicks by perf kvm stat
>>    (EPT_MISCONFIG event).
>>
>>                              Before              After
>>                            Mbps  kicks/s      Mbps  kicks/s
>> UDP_STREAM 1472byte              247758                 27
>>                  Send   3645.37            6958.10
>>                  Recv   3588.56            6958.10
>>                1byte                9865                 37
>>                  Send      4.34               5.43
>>                  Recv      4.17               5.26
>> TCP_STREAM             8801.03    45794   9592.77     2884
> 
> Impressive numbers.
> 
>>
>> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
>> ---
...
>> +static bool vhost_busy_poll_interrupted(struct vhost_dev *dev)
>> +{
>> +    return unlikely(signal_pending(current)) || vhost_has_work(dev);
>>   }
>>     static void vhost_net_disable_vq(struct vhost_net *n,
>> @@ -437,10 +438,21 @@ static int vhost_net_tx_get_vq_desc(struct
>> vhost_net *net,
>>         if (r == vq->num && vq->busyloop_timeout) {
>>           preempt_disable();
>> -        endtime = busy_clock() + vq->busyloop_timeout;
>> -        while (vhost_can_busy_poll(vq->dev, endtime) &&
>> -               vhost_vq_avail_empty(vq->dev, vq))
>> +        if (vq->busyloop_endtime) {
>> +            endtime = vq->busyloop_endtime;
>> +            vq->busyloop_endtime = 0;
> 
> Looks like endtime may be before current time. So we skip busy loop in
> this case.

Sure, I'll add a condition.

> 
>> +        } else {
>> +            endtime = busy_clock() + vq->busyloop_timeout;
>> +        }
>> +        while (vhost_can_busy_poll(endtime)) {
>> +            if (vhost_busy_poll_interrupted(vq->dev)) {
>> +                vq->busyloop_endtime = endtime;
>> +                break;
>> +            }
>> +            if (!vhost_vq_avail_empty(vq->dev, vq))
>> +                break;
>>               cpu_relax();
>> +        }
>>           preempt_enable();
>>           r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
>>                         out_num, in_num, NULL, NULL);
...
>> @@ -642,39 +658,51 @@ static void vhost_rx_signal_used(struct
>> vhost_net_virtqueue *nvq)
>>     static int vhost_net_rx_peek_head_len(struct vhost_net *net,
>> struct sock *sk)
>>   {
>> -    struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
>> -    struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
>> -    struct vhost_virtqueue *vq = &nvq->vq;
>> +    struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
>> +    struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
>> +    struct vhost_virtqueue *rvq = &rnvq->vq;
>> +    struct vhost_virtqueue *tvq = &tnvq->vq;
> 
> NIt: I admit the variable name is confusing. It would be better to do
> the renaming in a separate patch to ease review.

OK.

> 
>>       unsigned long uninitialized_var(endtime);
>> -    int len = peek_head_len(rvq, sk);
>> +    int len = peek_head_len(rnvq, sk);
>>   -    if (!len && vq->busyloop_timeout) {
>> +    if (!len && tvq->busyloop_timeout) {
>>           /* Flush batched heads first */
>> -        vhost_rx_signal_used(rvq);
>> +        vhost_rx_signal_used(rnvq);
>>           /* Both tx vq and rx socket were polled here */
>> -        mutex_lock_nested(&vq->mutex, 1);
>> -        vhost_disable_notify(&net->dev, vq);
>> +        mutex_lock_nested(&tvq->mutex, 1);
>> +        vhost_disable_notify(&net->dev, tvq);
>>             preempt_disable();
>> -        endtime = busy_clock() + vq->busyloop_timeout;
>> +        if (rvq->busyloop_endtime) {
>> +            endtime = rvq->busyloop_endtime;
>> +            rvq->busyloop_endtime = 0;
>> +        } else {
>> +            endtime = busy_clock() + tvq->busyloop_timeout;
>> +        }
>>   -        while (vhost_can_busy_poll(&net->dev, endtime) &&
>> -               !sk_has_rx_data(sk) &&
>> -               vhost_vq_avail_empty(&net->dev, vq))
>> +        while (vhost_can_busy_poll(endtime)) {
>> +            if (vhost_busy_poll_interrupted(&net->dev)) {
>> +                rvq->busyloop_endtime = endtime;
>> +                break;
>> +            }
>> +            if (sk_has_rx_data(sk) ||
>> +                !vhost_vq_avail_empty(&net->dev, tvq))
>> +                break;
>>               cpu_relax();
> 
> Actually, I plan to poll guest rx refilling as well here to avoid rx
> kicks. Want to draft another patch to do it as well? It's as simple as
> add a vhost_vq_avail_empty for rvq above.

OK. will try it.

> 
>> +        }
>>             preempt_enable();
>>   -        if (!vhost_vq_avail_empty(&net->dev, vq))
>> -            vhost_poll_queue(&vq->poll);
>> -        else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
>> -            vhost_disable_notify(&net->dev, vq);
>> -            vhost_poll_queue(&vq->poll);
>> +        if (!vhost_vq_avail_empty(&net->dev, tvq)) {
>> +            vhost_poll_queue(&tvq->poll);
>> +        } else if (unlikely(vhost_enable_notify(&net->dev, tvq))) {
>> +            vhost_disable_notify(&net->dev, tvq);
>> +            vhost_poll_queue(&tvq->poll);
>>           }
>>   -        mutex_unlock(&vq->mutex);
>> +        mutex_unlock(&tvq->mutex);
>>   -        len = peek_head_len(rvq, sk);
>> +        len = peek_head_len(rnvq, sk);
>>       }
>>         return len;
...
>> @@ -889,7 +918,12 @@ static void handle_rx(struct vhost_net *net)
>>               goto out;
>>           }
>>       }
>> -    vhost_net_enable_vq(net, vq);
>> +    if (unlikely(vq->busyloop_endtime)) {
>> +        /* Busy poll is interrupted. */
>> +        vhost_poll_queue(&vq->poll);
>> +    } else {
>> +        vhost_net_enable_vq(net, vq);
>> +    }
> 
> This is to reduce the rx wake ups. Better with another patch.
> 
> So I suggest to split the patches into:
> 
> 1 better naming of variable of vhost_net_rx_peek_head_len().
> 2 avoid tx kicks (most of what this patch did)
> 3 avoid rx wakeups (exactly the above 6 lines did)
> 4 avoid rx kicks

OK, I'll reorganize the patch.
Thank you for your feedback!

> 
> Btw, tonghao is doing some refactoring of busy polling as well. Depends
> on the order of being merged, one of you may need rebasing.

Thanks for sharing. I'll take a look.

-- 
Toshiaki Makita

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* [net-next:master 75/95] drivers/net/ethernet/stmicro/stmmac/.tmp_gl_stmmac_tc.o:undefined reference to `__udivdi3'
From: kbuild test robot @ 2018-07-02  2:45 UTC (permalink / raw)
  To: Jose Abreu; +Cc: kbuild-all, netdev

[-- Attachment #1: Type: text/plain, Size: 2431 bytes --]

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master
head:   97680ade43dc6a8ad2451389d66b0492458196d4
commit: 1f705bc61aee5fab2826bcf6de152a5d92378a85 [75/95] net: stmmac: Add support for CBS QDISC
config: microblaze-allyesconfig (attached as .config)
compiler: microblaze-linux-gcc (GCC) 8.1.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        git checkout 1f705bc61aee5fab2826bcf6de152a5d92378a85
        # save the attached .config to linux build tree
        GCC_VERSION=8.1.0 make.cross ARCH=microblaze 

All errors (new ones prefixed by >>):

   `.exit.data' referenced in section `.exit.text' of drivers/tty/n_hdlc.o: defined in discarded section `.exit.data' of drivers/tty/n_hdlc.o
   `.exit.data' referenced in section `.exit.text' of drivers/tty/n_hdlc.o: defined in discarded section `.exit.data' of drivers/tty/n_hdlc.o
   drivers/net/ethernet/stmicro/stmmac/stmmac_tc.o: In function `tc_setup_cbs':
>> drivers/net/ethernet/stmicro/stmmac/.tmp_gl_stmmac_tc.o:(.text+0x318): undefined reference to `__udivdi3'
   drivers/net/ethernet/stmicro/stmmac/.tmp_gl_stmmac_tc.o:(.text+0x458): undefined reference to `__udivdi3'
   drivers/net/ethernet/stmicro/stmmac/.tmp_gl_stmmac_tc.o:(.text+0x840): undefined reference to `__udivdi3'
   drivers/net/ethernet/stmicro/stmmac/.tmp_gl_stmmac_tc.o:(.text+0x860): undefined reference to `__udivdi3'
   drivers/net/ethernet/stmicro/stmmac/.tmp_gl_stmmac_tc.o:(.text+0x89c): undefined reference to `__udivdi3'
   drivers/net/ethernet/stmicro/stmmac/stmmac_tc.o:drivers/net/ethernet/stmicro/stmmac/.tmp_gl_stmmac_tc.o:(.text+0x934): more undefined references to `__udivdi3' follow
   drivers/android/binder.o: In function `binder_thread_write':
   drivers/android/.tmp_gl_binder.o:(.text+0xcba8): undefined reference to `__user_bad'
   drivers/android/.tmp_gl_binder.o:(.text+0xcbd4): undefined reference to `__user_bad'
   drivers/android/.tmp_gl_binder.o:(.text+0xcfbc): undefined reference to `__user_bad'
   drivers/android/.tmp_gl_binder.o:(.text+0xd648): undefined reference to `__user_bad'
   drivers/android/.tmp_gl_binder.o:(.text+0xdbc0): undefined reference to `__user_bad'

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 53698 bytes --]

^ permalink raw reply

* Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Toshiaki Makita @ 2018-07-02  2:52 UTC (permalink / raw)
  To: Jason Wang, Michael S. Tsirkin; +Cc: netdev, kvm, virtualization
In-Reply-To: <c21a4671-df85-4805-a4ec-2441859e4e1b@redhat.com>

On 2018/07/02 11:41, Jason Wang wrote:
> On 2018年06月30日 00:38, Michael S. Tsirkin wrote:
>> On Fri, Jun 29, 2018 at 05:09:50PM +0900, Toshiaki Makita wrote:
>>> Under heavy load vhost busypoll may run without suppressing
>>> notification. For example tx zerocopy callback can push tx work while
>>> handle_tx() is running, then busyloop exits due to vhost_has_work()
>>> condition and enables notification but immediately reenters handle_tx()
>>> because the pushed work was tx. In this case handle_tx() tries to
>>> disable notification again, but when using event_idx it by design
>>> cannot. Then busyloop will run without suppressing notification.
>>> Another example is the case where handle_tx() tries to enable
>>> notification but avail idx is advanced so disables it again. This case
>>> also lead to the same situation with event_idx.
>>>
>>> The problem is that once we enter this situation busyloop does not work
>>> under heavy load for considerable amount of time, because notification
>>> is likely to happen during busyloop and handle_tx() immediately enables
>>> notification after notification happens. Specifically busyloop detects
>>> notification by vhost_has_work() and then handle_tx() calls
>>> vhost_enable_notify().
>> I'd like to understand the problem a bit better.
>> Why does this happen?
>> Doesn't this only happen if ring is empty?
>>
> 
> My understanding is:
> 
> vhost_zerocopy_callback() try to poll vhost virtqueue. This will cause
> the busy loop in vhost_net_tx_get_vq_desc() to exit because of
> vhost_has_work() return true. Then handle_tx() tends to enable
> notification. Then guest may kick us even if handle_tx() call
> vhost_disable_notify() which in fact did nothing for even index.

Yes.

> Maybe we can try to call vhost_zerocopy_signal_used() if we found
> there's pending used from zerocopy instead.

Note that even when zerocopy is disabled the problem happens as I wrote.
When vhost_enable_notify() detects avail_idx advanced it tries to
disable notification again but it fails.

-- 
Toshiaki Makita

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Jason Wang @ 2018-07-02  2:54 UTC (permalink / raw)
  To: Toshiaki Makita, Michael S. Tsirkin
  Cc: netdev, Tonghao Zhang, kvm, virtualization
In-Reply-To: <4a5b4456-0f3e-40b3-849c-930ebb97ef3c@lab.ntt.co.jp>



On 2018年07月02日 10:45, Toshiaki Makita wrote:
> Hi Jason,
>
> On 2018/06/29 18:30, Jason Wang wrote:
>> On 2018年06月29日 16:09, Toshiaki Makita wrote:
> ...
>>> To fix this, poll the work instead of enabling notification when
>>> busypoll is interrupted by something. IMHO signal_pending() and
>>> vhost_has_work() are kind of interruptions rather than signals to
>>> completely cancel the busypoll, so let's run busypoll after the
>>> necessary work is done. In order to avoid too long busyloop due to
>>> interruption, save the endtime in vq field and use it when reentering
>>> the work function.
>> I think we don't care long busyloop unless e.g tx can starve rx?
> I just want to keep it user-controllable. Unless memorizing it busypoll
> can run unexpectedly long.

I think the total amount of time for busy polling is bounded. If I was 
wrong, it should be a bug somewhere.

>
>>> I observed this problem makes tx performance poor but does not rx. I
>>> guess it is because rx notification from socket is not that heavy so
>>> did not impact the performance even when we failed to mask the
>>> notification.
>> I think the reason is:
>>
>> For tx, we may only process used ring under handle_tx(). Busy polling
>> code can not recognize this case.
>> For rx, we call peek_head_len() after exiting busy loop, so if the work
>> is for rx, it can be processed in handle_rx() immediately.
> Sorry but I don't see the difference. Tx busypoll calls
> vhost_get_vq_desc() immediately after busypoll exits in
> vhost_net_tx_get_vq_desc().

Yes, so for the problem of zerocopy callback issue is we don't poll used 
(done_idx != upend_idx). Maybe we can try to add this and avoid the 
check of vhost_has_worker().

>
> The scenario I described above (in 2nd paragraph) is a bit simplified.
> To be clearer what I think is happening is:
>
> 1. handle_tx() runs busypoll with notification enabled (due to zerocopy
>     callback or something).

I think it was due to we enable notification when we exit handle_tx().

> 2. Guest produces a packet in vring.
> 3. handle_tx() busypoll detects the produced packet and get it.
> 4. While vhost is processing the packet, guest kicks vring because it
>     produced the packet. Vring notification is disabled automatically by
>     event_idx and tx work is queued.
> 5. After processing the packet vhost enters busyloop again, but detects
>     tx work and exits busyloop immediately. Then handle_tx() exits after
>     enabling the notification.
> 6. Because the queued work was tx, handle_tx() is called immediately
>     again. handle_tx() runs busypoll with notification enabled.
> 7. Repeat 2-6.

Exactly what I understand.

>
>>> Anyway for consistency I fixed rx routine as well as tx.
>>>
>>> Performance numbers:
>>>
>>> - Bulk transfer from guest to external physical server.
>>>       [Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)--> [Server]
>> Just to confirm in this case since zerocopy is enabled, we are in fact
>> use the generic XDP datapath?
> For some reason zerocopy was not applied for most packets, so in most
> cases driver XDP was used. I was going to dig into it but not yet.

Right, just to confirm this. This is expected.

In tuntap, we do native XDP only for small and non zerocopy packets. See 
tun_can_build_skb(). The reason is XDP may adjust packet header which is 
not supported by zercopy. We can only use XDP generic for zerocopy in 
this case.

>
>>> - Set 10us busypoll.
>>> - Guest disables checksum and TSO because of host XDP.
>>> - Measured single flow Mbps by netperf, and kicks by perf kvm stat
>>>     (EPT_MISCONFIG event).
>>>
>>>                               Before              After
>>>                             Mbps  kicks/s      Mbps  kicks/s
>>> UDP_STREAM 1472byte              247758                 27
>>>                   Send   3645.37            6958.10
>>>                   Recv   3588.56            6958.10
>>>                 1byte                9865                 37
>>>                   Send      4.34               5.43
>>>                   Recv      4.17               5.26
>>> TCP_STREAM             8801.03    45794   9592.77     2884
>> Impressive numbers.
>>
>>> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
>>> ---
> ...
>>> +static bool vhost_busy_poll_interrupted(struct vhost_dev *dev)
>>> +{
>>> +    return unlikely(signal_pending(current)) || vhost_has_work(dev);
>>>    }
>>>      static void vhost_net_disable_vq(struct vhost_net *n,
>>> @@ -437,10 +438,21 @@ static int vhost_net_tx_get_vq_desc(struct
>>> vhost_net *net,
>>>          if (r == vq->num && vq->busyloop_timeout) {
>>>            preempt_disable();
>>> -        endtime = busy_clock() + vq->busyloop_timeout;
>>> -        while (vhost_can_busy_poll(vq->dev, endtime) &&
>>> -               vhost_vq_avail_empty(vq->dev, vq))
>>> +        if (vq->busyloop_endtime) {
>>> +            endtime = vq->busyloop_endtime;
>>> +            vq->busyloop_endtime = 0;
>> Looks like endtime may be before current time. So we skip busy loop in
>> this case.
> Sure, I'll add a condition.
>
>>> +        } else {
>>> +            endtime = busy_clock() + vq->busyloop_timeout;
>>> +        }
>>> +        while (vhost_can_busy_poll(endtime)) {
>>> +            if (vhost_busy_poll_interrupted(vq->dev)) {
>>> +                vq->busyloop_endtime = endtime;
>>> +                break;
>>> +            }
>>> +            if (!vhost_vq_avail_empty(vq->dev, vq))
>>> +                break;
>>>                cpu_relax();
>>> +        }
>>>            preempt_enable();
>>>            r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
>>>                          out_num, in_num, NULL, NULL);
> ...
>>> @@ -642,39 +658,51 @@ static void vhost_rx_signal_used(struct
>>> vhost_net_virtqueue *nvq)
>>>      static int vhost_net_rx_peek_head_len(struct vhost_net *net,
>>> struct sock *sk)
>>>    {
>>> -    struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
>>> -    struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
>>> -    struct vhost_virtqueue *vq = &nvq->vq;
>>> +    struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
>>> +    struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
>>> +    struct vhost_virtqueue *rvq = &rnvq->vq;
>>> +    struct vhost_virtqueue *tvq = &tnvq->vq;
>> NIt: I admit the variable name is confusing. It would be better to do
>> the renaming in a separate patch to ease review.
> OK.
>
>>>        unsigned long uninitialized_var(endtime);
>>> -    int len = peek_head_len(rvq, sk);
>>> +    int len = peek_head_len(rnvq, sk);
>>>    -    if (!len && vq->busyloop_timeout) {
>>> +    if (!len && tvq->busyloop_timeout) {
>>>            /* Flush batched heads first */
>>> -        vhost_rx_signal_used(rvq);
>>> +        vhost_rx_signal_used(rnvq);
>>>            /* Both tx vq and rx socket were polled here */
>>> -        mutex_lock_nested(&vq->mutex, 1);
>>> -        vhost_disable_notify(&net->dev, vq);
>>> +        mutex_lock_nested(&tvq->mutex, 1);
>>> +        vhost_disable_notify(&net->dev, tvq);
>>>              preempt_disable();
>>> -        endtime = busy_clock() + vq->busyloop_timeout;
>>> +        if (rvq->busyloop_endtime) {
>>> +            endtime = rvq->busyloop_endtime;
>>> +            rvq->busyloop_endtime = 0;
>>> +        } else {
>>> +            endtime = busy_clock() + tvq->busyloop_timeout;
>>> +        }
>>>    -        while (vhost_can_busy_poll(&net->dev, endtime) &&
>>> -               !sk_has_rx_data(sk) &&
>>> -               vhost_vq_avail_empty(&net->dev, vq))
>>> +        while (vhost_can_busy_poll(endtime)) {
>>> +            if (vhost_busy_poll_interrupted(&net->dev)) {
>>> +                rvq->busyloop_endtime = endtime;
>>> +                break;
>>> +            }
>>> +            if (sk_has_rx_data(sk) ||
>>> +                !vhost_vq_avail_empty(&net->dev, tvq))
>>> +                break;
>>>                cpu_relax();
>> Actually, I plan to poll guest rx refilling as well here to avoid rx
>> kicks. Want to draft another patch to do it as well? It's as simple as
>> add a vhost_vq_avail_empty for rvq above.
> OK. will try it.
>
>>> +        }
>>>              preempt_enable();
>>>    -        if (!vhost_vq_avail_empty(&net->dev, vq))
>>> -            vhost_poll_queue(&vq->poll);
>>> -        else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
>>> -            vhost_disable_notify(&net->dev, vq);
>>> -            vhost_poll_queue(&vq->poll);
>>> +        if (!vhost_vq_avail_empty(&net->dev, tvq)) {
>>> +            vhost_poll_queue(&tvq->poll);
>>> +        } else if (unlikely(vhost_enable_notify(&net->dev, tvq))) {
>>> +            vhost_disable_notify(&net->dev, tvq);
>>> +            vhost_poll_queue(&tvq->poll);
>>>            }
>>>    -        mutex_unlock(&vq->mutex);
>>> +        mutex_unlock(&tvq->mutex);
>>>    -        len = peek_head_len(rvq, sk);
>>> +        len = peek_head_len(rnvq, sk);
>>>        }
>>>          return len;
> ...
>>> @@ -889,7 +918,12 @@ static void handle_rx(struct vhost_net *net)
>>>                goto out;
>>>            }
>>>        }
>>> -    vhost_net_enable_vq(net, vq);
>>> +    if (unlikely(vq->busyloop_endtime)) {
>>> +        /* Busy poll is interrupted. */
>>> +        vhost_poll_queue(&vq->poll);
>>> +    } else {
>>> +        vhost_net_enable_vq(net, vq);
>>> +    }
>> This is to reduce the rx wake ups. Better with another patch.
>>
>> So I suggest to split the patches into:
>>
>> 1 better naming of variable of vhost_net_rx_peek_head_len().
>> 2 avoid tx kicks (most of what this patch did)
>> 3 avoid rx wakeups (exactly the above 6 lines did)
>> 4 avoid rx kicks
> OK, I'll reorganize the patch.
> Thank you for your feedback!
>
>> Btw, tonghao is doing some refactoring of busy polling as well. Depends
>> on the order of being merged, one of you may need rebasing.
> Thanks for sharing. I'll take a look.
>

Thanks.

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Toshiaki Makita @ 2018-07-02  2:59 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, kvm, virtualization
In-Reply-To: <20180629193448-mutt-send-email-mst@kernel.org>

On 2018/06/30 1:38, Michael S. Tsirkin wrote:
...
>> Performance numbers:
>>
>> - Bulk transfer from guest to external physical server.
>>     [Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)--> [Server]
>> - Set 10us busypoll.
>> - Guest disables checksum and TSO because of host XDP.
>> - Measured single flow Mbps by netperf, and kicks by perf kvm stat
>>   (EPT_MISCONFIG event).
>>
>>                             Before              After
>>                           Mbps  kicks/s      Mbps  kicks/s
>> UDP_STREAM 1472byte              247758                 27
>>                 Send   3645.37            6958.10
>>                 Recv   3588.56            6958.10
>>               1byte                9865                 37
>>                 Send      4.34               5.43
>>                 Recv      4.17               5.26
>> TCP_STREAM             8801.03    45794   9592.77     2884
>>
>> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
> 
> Is this with busy poll enabled?

Yes, as I wrote "Set 10us busypoll" above.

> Are there CPU utilization #s?

I used one cpu for one vcpu and one cpu for one vhost.
Each host cpu for vcpu/vhost was like this:

- Before
 vcpu cpu : %guest 70 %sys 30
 vhost cpu: %sys 100

- After
 vcpu cpu : %guest 100
 vhost cpu: %sys 100

I think %sys before patch was caused by vring kick.

-- 
Toshiaki Makita

^ permalink raw reply

* [RFC PATCH] ipv6: make ipv6_renew_options() interrupt/kernel safe
From: Paul Moore @ 2018-07-02  3:01 UTC (permalink / raw)
  To: netdev; +Cc: Al Viro, selinux, linux-security-module

From: Paul Moore <paul@paul-moore.com>

At present the ipv6_renew_options_kern() function ends up calling into
access_ok() which is problematic if done from inside an interrupt as
access_ok() calls WARN_ON_IN_IRQ() on some (all?) architectures
(x86-64 is affected).  Example warning/backtrace is shown below:

 WARNING: CPU: 1 PID: 3144 at lib/usercopy.c:11 _copy_from_user+0x85/0x90
 ...
 Call Trace:
  <IRQ>
  ipv6_renew_option+0xb2/0xf0
  ipv6_renew_options+0x26a/0x340
  ipv6_renew_options_kern+0x2c/0x40
  calipso_req_setattr+0x72/0xe0
  netlbl_req_setattr+0x126/0x1b0
  selinux_netlbl_inet_conn_request+0x80/0x100
  selinux_inet_conn_request+0x6d/0xb0
  security_inet_conn_request+0x32/0x50
  tcp_conn_request+0x35f/0xe00
  ? __lock_acquire+0x250/0x16c0
  ? selinux_socket_sock_rcv_skb+0x1ae/0x210
  ? tcp_rcv_state_process+0x289/0x106b
  tcp_rcv_state_process+0x289/0x106b
  ? tcp_v6_do_rcv+0x1a7/0x3c0
  tcp_v6_do_rcv+0x1a7/0x3c0
  tcp_v6_rcv+0xc82/0xcf0
  ip6_input_finish+0x10d/0x690
  ip6_input+0x45/0x1e0
  ? ip6_rcv_finish+0x1d0/0x1d0
  ipv6_rcv+0x32b/0x880
  ? ip6_make_skb+0x1e0/0x1e0
  __netif_receive_skb_core+0x6f2/0xdf0
  ? process_backlog+0x85/0x250
  ? process_backlog+0x85/0x250
  ? process_backlog+0xec/0x250
  process_backlog+0xec/0x250
  net_rx_action+0x153/0x480
  __do_softirq+0xd9/0x4f7
  do_softirq_own_stack+0x2a/0x40
  </IRQ>
  ...

While not present in the backtrace, ipv6_renew_option() ends up calling
access_ok() via the following chain:

  access_ok()
  _copy_from_user()
  copy_from_user()
  ipv6_renew_option()

The fix presented in this patch is to perform the userspace copy
earlier in the call chain such that it is only called when the option
data is actually coming from userspace; that place is
do_ipv6_setsockopt().  Not only does this solve the problem seen in
the backtrace above, it also allows us to simplify the code quite a
bit by removing ipv6_renew_options_kern() completely.  We also take
this opportunity to cleanup ipv6_renew_options()/ipv6_renew_option()
a small amount as well.

This patch is heavily based on a rough patch by Al Viro.  I've taken
his original patch, converted a kmemdup() call in do_ipv6_setsockopt()
to a memdup_user() call, made better use of the e_inval jump target in
the same function, and cleaned up the use ipv6_renew_option() by
ipv6_renew_options().

CC: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/net/ipv6.h       |    9 ----
 net/ipv6/calipso.c       |    9 +---
 net/ipv6/exthdrs.c       |  108 ++++++++++++----------------------------------
 net/ipv6/ipv6_sockglue.c |   27 ++++++++----
 4 files changed, 50 insertions(+), 103 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 16475c269749..d02881e4ad1f 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -355,14 +355,7 @@ struct ipv6_txoptions *ipv6_dup_options(struct sock *sk,
 struct ipv6_txoptions *ipv6_renew_options(struct sock *sk,
 					  struct ipv6_txoptions *opt,
 					  int newtype,
-					  struct ipv6_opt_hdr __user *newopt,
-					  int newoptlen);
-struct ipv6_txoptions *
-ipv6_renew_options_kern(struct sock *sk,
-			struct ipv6_txoptions *opt,
-			int newtype,
-			struct ipv6_opt_hdr *newopt,
-			int newoptlen);
+					  struct ipv6_opt_hdr *newopt);
 struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
 					  struct ipv6_txoptions *opt);
 
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
index 1323b9679cf7..1c0bb9fb76e6 100644
--- a/net/ipv6/calipso.c
+++ b/net/ipv6/calipso.c
@@ -799,8 +799,7 @@ static int calipso_opt_update(struct sock *sk, struct ipv6_opt_hdr *hop)
 {
 	struct ipv6_txoptions *old = txopt_get(inet6_sk(sk)), *txopts;
 
-	txopts = ipv6_renew_options_kern(sk, old, IPV6_HOPOPTS,
-					 hop, hop ? ipv6_optlen(hop) : 0);
+	txopts = ipv6_renew_options(sk, old, IPV6_HOPOPTS, hop);
 	txopt_put(old);
 	if (IS_ERR(txopts))
 		return PTR_ERR(txopts);
@@ -1222,8 +1221,7 @@ static int calipso_req_setattr(struct request_sock *req,
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
-	txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS,
-					 new, new ? ipv6_optlen(new) : 0);
+	txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new);
 
 	kfree(new);
 
@@ -1260,8 +1258,7 @@ static void calipso_req_delattr(struct request_sock *req)
 	if (calipso_opt_del(req_inet->ipv6_opt->hopopt, &new))
 		return; /* Nothing to do */
 
-	txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS,
-					 new, new ? ipv6_optlen(new) : 0);
+	txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new);
 
 	if (!IS_ERR(txopts)) {
 		txopts = xchg(&req_inet->ipv6_opt, txopts);
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 5bc2bf3733ab..1e1d9bc2fd3d 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -1015,29 +1015,21 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt)
 }
 EXPORT_SYMBOL_GPL(ipv6_dup_options);
 
-static int ipv6_renew_option(void *ohdr,
-			     struct ipv6_opt_hdr __user *newopt, int newoptlen,
-			     int inherit,
-			     struct ipv6_opt_hdr **hdr,
-			     char **p)
+static void ipv6_renew_option(int renewtype,
+			      struct ipv6_opt_hdr **dest,
+			      struct ipv6_opt_hdr *old,
+			      struct ipv6_opt_hdr *new,
+			      int newtype, char **p)
 {
-	if (inherit) {
-		if (ohdr) {
-			memcpy(*p, ohdr, ipv6_optlen((struct ipv6_opt_hdr *)ohdr));
-			*hdr = (struct ipv6_opt_hdr *)*p;
-			*p += CMSG_ALIGN(ipv6_optlen(*hdr));
-		}
-	} else {
-		if (newopt) {
-			if (copy_from_user(*p, newopt, newoptlen))
-				return -EFAULT;
-			*hdr = (struct ipv6_opt_hdr *)*p;
-			if (ipv6_optlen(*hdr) > newoptlen)
-				return -EINVAL;
-			*p += CMSG_ALIGN(newoptlen);
-		}
-	}
-	return 0;
+	struct ipv6_opt_hdr *src;
+
+	src = (renewtype == newtype ? new : old);
+	if (!src)
+		return;
+
+	memcpy(*p, src, ipv6_optlen(src));
+	*dest = (struct ipv6_opt_hdr *)*p;
+	p += CMSG_ALIGN(ipv6_optlen(*dest));
 }
 
 /**
@@ -1063,13 +1055,11 @@ static int ipv6_renew_option(void *ohdr,
  */
 struct ipv6_txoptions *
 ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
-		   int newtype,
-		   struct ipv6_opt_hdr __user *newopt, int newoptlen)
+		   int newtype, struct ipv6_opt_hdr *newopt)
 {
 	int tot_len = 0;
 	char *p;
 	struct ipv6_txoptions *opt2;
-	int err;
 
 	if (opt) {
 		if (newtype != IPV6_HOPOPTS && opt->hopopt)
@@ -1082,8 +1072,8 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
 			tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst1opt));
 	}
 
-	if (newopt && newoptlen)
-		tot_len += CMSG_ALIGN(newoptlen);
+	if (newopt)
+		tot_len += CMSG_ALIGN(ipv6_optlen(newopt));
 
 	if (!tot_len)
 		return NULL;
@@ -1098,29 +1088,16 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
 	opt2->tot_len = tot_len;
 	p = (char *)(opt2 + 1);
 
-	err = ipv6_renew_option(opt ? opt->hopopt : NULL, newopt, newoptlen,
-				newtype != IPV6_HOPOPTS,
-				&opt2->hopopt, &p);
-	if (err)
-		goto out;
-
-	err = ipv6_renew_option(opt ? opt->dst0opt : NULL, newopt, newoptlen,
-				newtype != IPV6_RTHDRDSTOPTS,
-				&opt2->dst0opt, &p);
-	if (err)
-		goto out;
-
-	err = ipv6_renew_option(opt ? opt->srcrt : NULL, newopt, newoptlen,
-				newtype != IPV6_RTHDR,
-				(struct ipv6_opt_hdr **)&opt2->srcrt, &p);
-	if (err)
-		goto out;
-
-	err = ipv6_renew_option(opt ? opt->dst1opt : NULL, newopt, newoptlen,
-				newtype != IPV6_DSTOPTS,
-				&opt2->dst1opt, &p);
-	if (err)
-		goto out;
+	ipv6_renew_option(IPV6_HOPOPTS, &opt2->hopopt, opt->hopopt,
+			  newopt, newtype, &p);
+	ipv6_renew_option(IPV6_RTHDRDSTOPTS, &opt2->dst0opt, opt->dst0opt,
+			  newopt, newtype, &p);
+	ipv6_renew_option(IPV6_RTHDR,
+			  (struct ipv6_opt_hdr **)&opt2->srcrt,
+			  (struct ipv6_opt_hdr *)opt->srcrt,
+			  newopt, newtype, &p);
+	ipv6_renew_option(IPV6_DSTOPTS, &opt2->dst1opt, opt->dst1opt,
+			  newopt, newtype, &p);
 
 	opt2->opt_nflen = (opt2->hopopt ? ipv6_optlen(opt2->hopopt) : 0) +
 			  (opt2->dst0opt ? ipv6_optlen(opt2->dst0opt) : 0) +
@@ -1128,37 +1105,6 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
 	opt2->opt_flen = (opt2->dst1opt ? ipv6_optlen(opt2->dst1opt) : 0);
 
 	return opt2;
-out:
-	sock_kfree_s(sk, opt2, opt2->tot_len);
-	return ERR_PTR(err);
-}
-
-/**
- * ipv6_renew_options_kern - replace a specific ext hdr with a new one.
- *
- * @sk: sock from which to allocate memory
- * @opt: original options
- * @newtype: option type to replace in @opt
- * @newopt: new option of type @newtype to replace (kernel-mem)
- * @newoptlen: length of @newopt
- *
- * See ipv6_renew_options().  The difference is that @newopt is
- * kernel memory, rather than user memory.
- */
-struct ipv6_txoptions *
-ipv6_renew_options_kern(struct sock *sk, struct ipv6_txoptions *opt,
-			int newtype, struct ipv6_opt_hdr *newopt,
-			int newoptlen)
-{
-	struct ipv6_txoptions *ret_val;
-	const mm_segment_t old_fs = get_fs();
-
-	set_fs(KERNEL_DS);
-	ret_val = ipv6_renew_options(sk, opt, newtype,
-				     (struct ipv6_opt_hdr __user *)newopt,
-				     newoptlen);
-	set_fs(old_fs);
-	return ret_val;
 }
 
 struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 4d780c7f0130..c95c3486d904 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -398,6 +398,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 	case IPV6_DSTOPTS:
 	{
 		struct ipv6_txoptions *opt;
+		struct ipv6_opt_hdr *new = NULL;
+
+		/* hop-by-hop / destination options are privileged option */
+		retv = -EPERM;
+		if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW))
+			break;
 
 		/* remove any sticky options header with a zero option
 		 * length, per RFC3542.
@@ -409,17 +415,22 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		else if (optlen < sizeof(struct ipv6_opt_hdr) ||
 			 optlen & 0x7 || optlen > 8 * 255)
 			goto e_inval;
-
-		/* hop-by-hop / destination options are privileged option */
-		retv = -EPERM;
-		if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW))
-			break;
+		else {
+			new = memdup_user(optval, optlen);
+			if (IS_ERR(new)) {
+				retv = PTR_ERR(new);
+				break;
+			}
+			if (unlikely(ipv6_optlen(new) > optlen)) {
+				kfree(new);
+				goto e_inval;
+			}
+		}
 
 		opt = rcu_dereference_protected(np->opt,
 						lockdep_sock_is_held(sk));
-		opt = ipv6_renew_options(sk, opt, optname,
-					 (struct ipv6_opt_hdr __user *)optval,
-					 optlen);
+		opt = ipv6_renew_options(sk, opt, optname, new);
+		kfree(new);
 		if (IS_ERR(opt)) {
 			retv = PTR_ERR(opt);
 			break;

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox