Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH iproute2 net-next 0/3] ss: Allow selection of columns to be displayed
From: Stefano Brivio @ 2018-10-30 15:05 UTC (permalink / raw)
  To: David Ahern; +Cc: Yoann P., Stephen Hemminger, netdev

Now that we have an abstraction for columns, it's relatively easy to
selectively display only some of them, and Yoann has a use case for it.

Patch 1/3 fixes a rendering issue that shows up only when display of
arbitrary columns is disabled. Patch 2/3 implements the relevant option,
and patch 3/3 makes the output more readable when some columns are
disabled.

Stefano Brivio (3):
  ss: Discard empty descriptor at the end of buffer, if any, before
    rendering
  ss: Introduce option to display selected columns only
  ss: Beautify output when arbitrary columns are hidden

 man/man8/ss.8 |  5 +++
 misc/ss.c     | 85 +++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 77 insertions(+), 13 deletions(-)

-- 
2.19.1

^ permalink raw reply

* [PATCH iproute2 net-next 1/3] ss: Discard empty descriptor at the end of buffer, if any, before rendering
From: Stefano Brivio @ 2018-10-30 15:05 UTC (permalink / raw)
  To: David Ahern; +Cc: Yoann P., Stephen Hemminger, netdev
In-Reply-To: <cover.1540910943.git.sbrivio@redhat.com>

This will allow us to disable display of any given column.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 misc/ss.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/misc/ss.c b/misc/ss.c
index c8970438ce73..c3f61ef66258 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -1245,8 +1245,15 @@ static void render(void)
 
 	token = (struct buf_token *)buffer.head->data;
 
-	/* Ensure end alignment of last token, it wasn't necessarily flushed */
-	buffer.tail->end += buffer.cur->len % 2;
+	if (!buffer.cur->len) {
+		/* Last token was flushed, a new empty descriptor was appended:
+		 * discard it
+		 */
+		buffer.tail->end -= sizeof(buffer.cur->len);
+	} else {
+		/* Last token wasn't flushed: ensure end alignment */
+		buffer.tail->end += buffer.cur->len % 2;
+	}
 
 	render_calc_width();
 
-- 
2.19.1

^ permalink raw reply related

* [PATCH iproute2 net-next 2/3] ss: Introduce option to display selected columns only
From: Stefano Brivio @ 2018-10-30 15:05 UTC (permalink / raw)
  To: David Ahern; +Cc: Yoann P., Stephen Hemminger, netdev
In-Reply-To: <cover.1540910943.git.sbrivio@redhat.com>

The new option --columns (short: -c) allows selecting the columns to be
displayed. Note that this doesn't affect the order in which columns are
displayed.

Reported-by: Yoann P. <yoann.p.public@gmail.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 man/man8/ss.8 |  5 +++++
 misc/ss.c     | 62 ++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 57 insertions(+), 10 deletions(-)

diff --git a/man/man8/ss.8 b/man/man8/ss.8
index 7a6572b17364..c987dec6bcd7 100644
--- a/man/man8/ss.8
+++ b/man/man8/ss.8
@@ -24,6 +24,11 @@ Output version information.
 .B \-H, \-\-no-header
 Suppress header line.
 .TP
+.B \-c COLS, \-\-columns=COLS
+Only display selected columns, separated by commas. The following column names
+are understood: netid, state, recvq, sendq, local, lport, peer, pport, ext.
+This does not define the order of columns.
+.TP
 .B \-n, \-\-numeric
 Do not try to resolve service names.
 .TP
diff --git a/misc/ss.c b/misc/ss.c
index c3f61ef66258..91be3c6db151 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -132,6 +132,7 @@ enum col_align {
 
 struct column {
 	const enum col_align align;
+	const char *optname;
 	const char *header;
 	const char *ldelim;
 	int disabled;
@@ -140,15 +141,15 @@ struct column {
 };
 
 static struct column columns[] = {
-	{ ALIGN_LEFT,	"Netid",		"",	0, 0, 0 },
-	{ ALIGN_LEFT,	"State",		" ",	0, 0, 0 },
-	{ ALIGN_LEFT,	"Recv-Q",		" ",	0, 0, 0 },
-	{ ALIGN_LEFT,	"Send-Q",		" ",	0, 0, 0 },
-	{ ALIGN_RIGHT,	"Local Address:",	" ",	0, 0, 0 },
-	{ ALIGN_LEFT,	"Port",			"",	0, 0, 0 },
-	{ ALIGN_RIGHT,	"Peer Address:",	" ",	0, 0, 0 },
-	{ ALIGN_LEFT,	"Port",			"",	0, 0, 0 },
-	{ ALIGN_LEFT,	"",			"",	0, 0, 0 },
+	{ ALIGN_LEFT,	"netid",	"Netid",		"",  0, 0, 0 },
+	{ ALIGN_LEFT,	"state",	"State",		" ", 0, 0, 0 },
+	{ ALIGN_LEFT,	"recvq",	"Recv-Q",		" ", 0, 0, 0 },
+	{ ALIGN_LEFT,	"sendq",	"Send-Q",		" ", 0, 0, 0 },
+	{ ALIGN_RIGHT,	"local",	"Local Address:",	" ", 0, 0, 0 },
+	{ ALIGN_LEFT,	"lport",	"Port",			"",  0, 0, 0 },
+	{ ALIGN_RIGHT,	"peer",		"Peer Address:",	" ", 0, 0, 0 },
+	{ ALIGN_LEFT,	"pport",	"Port",			"",  0, 0, 0 },
+	{ ALIGN_LEFT,	"ext",		"",			"",  0, 0, 0 },
 };
 
 static struct column *current_field = columns;
@@ -1073,6 +1074,11 @@ static int field_is_last(struct column *f)
 	return f - columns == COL_MAX - 1;
 }
 
+static int field_is_valid(struct column *f)
+{
+	return f >= columns && f - columns < COL_MAX;
+}
+
 static void field_next(void)
 {
 	field_flush(current_field);
@@ -4666,6 +4672,8 @@ static void _usage(FILE *dest)
 "\n"
 "   -K, --kill          forcibly close sockets, display what was closed\n"
 "   -H, --no-header     Suppress header line\n"
+"   -c, --columns=COLS  display only COLS columns\n"
+"       COLS := {netid|state|local|lport|peer|pport|ext}[,COLS]\n"
 "\n"
 "   -A, --query=QUERY, --socket=QUERY\n"
 "       QUERY := {all|inet|tcp|udp|raw|unix|unix_dgram|unix_stream|unix_seqpacket|packet|netlink|vsock_stream|vsock_dgram|tipc}[,QUERY]\n"
@@ -4785,6 +4793,7 @@ static const struct option long_opts[] = {
 	{ "tipcinfo", 0, 0, OPT_TIPCINFO},
 	{ "kill", 0, 0, 'K' },
 	{ "no-header", 0, 0, 'H' },
+	{ "columns", 1, 0, 'c' },
 	{ 0 }
 
 };
@@ -4800,7 +4809,7 @@ int main(int argc, char *argv[])
 	int state_filter = 0;
 
 	while ((ch = getopt_long(argc, argv,
-				 "dhaletuwxnro460spbEf:miA:D:F:vVzZN:KHS",
+				 "dhaletuwxnro460spbEf:miA:D:F:vVzZN:KHc:S",
 				 long_opts, NULL)) != EOF) {
 		switch (ch) {
 		case 'n':
@@ -4966,6 +4975,39 @@ int main(int argc, char *argv[])
 		case 'H':
 			show_header = 0;
 			break;
+		case 'c':
+		{
+			struct column *f;
+			char *p, *p1;
+
+			if (!optarg) {
+				fprintf(stderr, "ss: No columns given.\n");
+				usage();
+			}
+
+			for (f = columns; field_is_valid(f); f++)
+				f->disabled = 1;
+
+			p = optarg;
+			do {
+				p1 = strchr(p, ',');
+				if (p1)
+					*p1 = 0;
+				for (f = columns; field_is_valid(f); f++) {
+					if (!strcmp(f->optname, p)) {
+						f->disabled = 0;
+						break;
+					}
+				}
+				if (!field_is_valid(f)) {
+					fprintf(stderr, "ss: No column %s\n",
+						p);
+					usage();
+				}
+				p = p1 + 1;
+			} while (p1);
+			break;
+		}
 		case 'h':
 			help();
 		case '?':
-- 
2.19.1

^ permalink raw reply related

* [PATCH iproute2 net-next 3/3] ss: Beautify output when arbitrary columns are hidden
From: Stefano Brivio @ 2018-10-30 15:05 UTC (permalink / raw)
  To: David Ahern; +Cc: Yoann P., Stephen Hemminger, netdev
In-Reply-To: <cover.1540910943.git.sbrivio@redhat.com>

Define a secondary alignment for columns in case the next column is
hidden, this avoids awkward outputs if e.g. the local address is shown,
but not the local port.

Omit embedded delimiter in socket specifiers if the port or service field
is hidden.

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 misc/ss.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/misc/ss.c b/misc/ss.c
index 91be3c6db151..d489233681e9 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -131,7 +131,8 @@ enum col_align {
 };
 
 struct column {
-	const enum col_align align;
+	enum col_align align;
+	const enum col_align align_without_next;
 	const char *optname;
 	const char *header;
 	const char *ldelim;
@@ -141,15 +142,15 @@ struct column {
 };
 
 static struct column columns[] = {
-	{ ALIGN_LEFT,	"netid",	"Netid",		"",  0, 0, 0 },
-	{ ALIGN_LEFT,	"state",	"State",		" ", 0, 0, 0 },
-	{ ALIGN_LEFT,	"recvq",	"Recv-Q",		" ", 0, 0, 0 },
-	{ ALIGN_LEFT,	"sendq",	"Send-Q",		" ", 0, 0, 0 },
-	{ ALIGN_RIGHT,	"local",	"Local Address:",	" ", 0, 0, 0 },
-	{ ALIGN_LEFT,	"lport",	"Port",			"",  0, 0, 0 },
-	{ ALIGN_RIGHT,	"peer",		"Peer Address:",	" ", 0, 0, 0 },
-	{ ALIGN_LEFT,	"pport",	"Port",			"",  0, 0, 0 },
-	{ ALIGN_LEFT,	"ext",		"",			"",  0, 0, 0 },
+	{ ALIGN_LEFT,  ALIGN_LEFT, "netid", "Netid",          "",  0, 0, 0 },
+	{ ALIGN_LEFT,  ALIGN_LEFT, "state", "State",          " ", 0, 0, 0 },
+	{ ALIGN_LEFT,  ALIGN_LEFT, "recvq", "Recv-Q",         " ", 0, 0, 0 },
+	{ ALIGN_LEFT,  ALIGN_LEFT, "sendq", "Send-Q",         " ", 0, 0, 0 },
+	{ ALIGN_RIGHT, ALIGN_LEFT, "local", "Local Address:", " ", 0, 0, 0 },
+	{ ALIGN_LEFT,  ALIGN_LEFT, "lport", "Port",           "",  0, 0, 0 },
+	{ ALIGN_RIGHT, ALIGN_LEFT, "peer",  "Peer Address:",  " ", 0, 0, 0 },
+	{ ALIGN_LEFT,  ALIGN_LEFT, "pport", "Port",           "",  0, 0, 0 },
+	{ ALIGN_LEFT,  ALIGN_LEFT, "ext",   "",               "",  0, 0, 0 },
 };
 
 static struct column *current_field = columns;
@@ -1374,6 +1375,9 @@ static void sock_details_print(struct sockstat *s)
 static void sock_addr_print(const char *addr, char *delim, const char *port,
 		const char *ifname)
 {
+	if ((current_field + 1)->disabled)
+		delim = "";
+
 	if (ifname)
 		out("%s" "%%" "%s%s", addr, ifname, delim);
 	else
@@ -5006,6 +5010,12 @@ int main(int argc, char *argv[])
 				}
 				p = p1 + 1;
 			} while (p1);
+
+			for (f = columns; field_is_valid(f + 1); f++) {
+				if ((f + 1)->disabled)
+					f->align = f->align_without_next;
+			}
+
 			break;
 		}
 		case 'h':
-- 
2.19.1

^ permalink raw reply related

* Re: [BUG] MVPP2 driver exploding in presence of a tap interface
From: Thomas Petazzoni @ 2018-10-30 15:10 UTC (permalink / raw)
  To: Marc Zyngier
  Cc: Marcin Wojtas, Antoine Tenart, Maxime Chevallier,
	linux-arm-kernel, netdev, Grzegorz Jaszczyk, Tomasz Nowicki
In-Reply-To: <6bf82e04-5463-aa7d-bbac-f09519ff9815@arm.com>

Hello,

On Tue, 30 Oct 2018 14:55:01 +0000, Marc Zyngier wrote:

> > I.e, isn't the firmware fix papering over a bug that should be fixed in
> > Linux mvpp2 driver anyway ?  
> 
> Absolutely. Leaving this unpatched in the kernel, with a 100% chance of
> memory corruption is just mad.
> 
> I'm pretty sure there should be a way to sanely reset the interface
> before it starts repainting the memory.

I agree here. Do you still have an image of that old firmware version,
so that we can try to reproduce, and see if we can come up with a way
to reset the BM on boot up that would avoid this issue ?

Thanks,

Thomas
-- 
Thomas Petazzoni, CTO, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com

^ permalink raw reply

* Re: [BUG] MVPP2 driver exploding in presence of a tap interface
From: Marc Zyngier @ 2018-10-30 15:22 UTC (permalink / raw)
  To: Thomas Petazzoni
  Cc: Marcin Wojtas, Antoine Tenart, Maxime Chevallier,
	linux-arm-kernel, netdev, Grzegorz Jaszczyk, Tomasz Nowicki
In-Reply-To: <20181030161007.360d5a53@windsurf>

On 30/10/18 15:10, Thomas Petazzoni wrote:
> Hello,
> 
> On Tue, 30 Oct 2018 14:55:01 +0000, Marc Zyngier wrote:
> 
>>> I.e, isn't the firmware fix papering over a bug that should be fixed in
>>> Linux mvpp2 driver anyway ?  
>>
>> Absolutely. Leaving this unpatched in the kernel, with a 100% chance of
>> memory corruption is just mad.
>>
>> I'm pretty sure there should be a way to sanely reset the interface
>> before it starts repainting the memory.
> 
> I agree here. Do you still have an image of that old firmware version,
> so that we can try to reproduce, and see if we can come up with a way
> to reset the BM on boot up that would avoid this issue ?

Yup. I still have both the original build tree as well as the sdcard, so
you should be able to trigger on demand.

I'll email you the stuff separately, unless you want another delivery
method.

Thanks,

	M.
-- 
Jazz is not dead. It just smells funny...

^ permalink raw reply

* [RFC bpf-next] libbpf: increase rlimit before trying to create BPF maps
From: Quentin Monnet @ 2018-10-30 15:23 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann; +Cc: netdev, oss-drivers, Quentin Monnet

The limit for memory locked in the kernel by a process is usually set to
64 kilobytes by default. This can be an issue when creating large BPF maps.
A workaround is to raise this limit for the current process before
trying to create a new BPF map. Changing the hard limit requires the
CAP_SYS_RESOURCE capability and can usually only be done by the root user
(but then only root can create BPF maps).

As far as I know there is no API to get the current amount of memory
locked for a user, therefore we cannot raise the limit only when
required. One solution, used by bcc, is to try to create the map, and on
getting an EPERM error, raise the limit to infinity before giving it
another try. Another approach, used in iproute2, is to raise the limit in
all cases, before trying to create the map.

Here we do the same as in iproute2: the rlimit is raised to infinity
before trying to load the map.

I am sending this patch as an RFC to see if people would prefer the bcc
approach instead, or the rlimit change to be in bpftool rather than in
libbpf.

Signed-off-by: Quentin Monnet <quentin.monnet@netronome.com>
---
 tools/lib/bpf/bpf.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 03f9bcc4ef50..456a5a7b112c 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -26,6 +26,8 @@
 #include <unistd.h>
 #include <asm/unistd.h>
 #include <linux/bpf.h>
+#include <sys/resource.h>
+#include <sys/types.h>
 #include "bpf.h"
 #include "libbpf.h"
 #include <errno.h>
@@ -68,8 +70,11 @@ static inline int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr,
 int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr)
 {
 	__u32 name_len = create_attr->name ? strlen(create_attr->name) : 0;
+	struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY };
 	union bpf_attr attr;

+	setrlimit(RLIMIT_MEMLOCK, &rinf);
+
 	memset(&attr, '\0', sizeof(attr));

 	attr.map_type = create_attr->map_type;
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH iproute2 net-next 0/3] ss: Allow selection of columns to be displayed
From: David Ahern @ 2018-10-30 16:34 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: Yoann P., Stephen Hemminger, netdev
In-Reply-To: <cover.1540910943.git.sbrivio@redhat.com>

On 10/30/18 9:05 AM, Stefano Brivio wrote:
> Now that we have an abstraction for columns, it's relatively easy to
> selectively display only some of them, and Yoann has a use case for it.
> 
> Patch 1/3 fixes a rendering issue that shows up only when display of
> arbitrary columns is disabled. Patch 2/3 implements the relevant option,
> and patch 3/3 makes the output more readable when some columns are
> disabled.
> 
>

I like the intent, and I have prototyped something similar for 'ip'.

A more flexible approach is to use format strings to allow users to
customize the output order and whitespace as well. So for ss and your
column list (winging it here):

    netid          = %N
    state          = %S
    recv Q         = %Qr
    send Q         = %Qs
    local address  = %Al
    local port     = %Pl
    remote address = %Ar
    remote port    = %Pr
    process data   = %p
    ...

then a format string could be: "%S  %Qr %Qs  %Al:%Pl %Ar:%Pr  %p\n"

or for csv output: "%S,%Qr,%Qs,%Al,%Pl,%Ar,%Pr,%p\n"

I have not had time to look into an implementation for ip. Conceptually
- and scanning the kernel's vsprintf code - it does not look that
difficult, just time consuming on the frontend with the initial setup.

^ permalink raw reply

* Re: [PATCH iproute2 net-next 0/3] ss: Allow selection of columns to be displayed
From: Stephen Hemminger @ 2018-10-30 16:38 UTC (permalink / raw)
  To: David Ahern; +Cc: Stefano Brivio, Yoann P., netdev
In-Reply-To: <7ffc00c8-bdf6-5c75-564e-2663494bda5d@gmail.com>

On Tue, 30 Oct 2018 10:34:45 -0600
David Ahern <dsahern@gmail.com> wrote:

> On 10/30/18 9:05 AM, Stefano Brivio wrote:
> > Now that we have an abstraction for columns, it's relatively easy to
> > selectively display only some of them, and Yoann has a use case for it.
> > 
> > Patch 1/3 fixes a rendering issue that shows up only when display of
> > arbitrary columns is disabled. Patch 2/3 implements the relevant option,
> > and patch 3/3 makes the output more readable when some columns are
> > disabled.
> > 
> >  
> 
> I like the intent, and I have prototyped something similar for 'ip'.
> 
> A more flexible approach is to use format strings to allow users to
> customize the output order and whitespace as well. So for ss and your
> column list (winging it here):
> 
>     netid          = %N
>     state          = %S
>     recv Q         = %Qr
>     send Q         = %Qs
>     local address  = %Al
>     local port     = %Pl
>     remote address = %Ar
>     remote port    = %Pr
>     process data   = %p
>     ...
> 
> then a format string could be: "%S  %Qr %Qs  %Al:%Pl %Ar:%Pr  %p\n"
> 
> or for csv output: "%S,%Qr,%Qs,%Al,%Pl,%Ar,%Pr,%p\n"
> 
> I have not had time to look into an implementation for ip. Conceptually
> - and scanning the kernel's vsprintf code - it does not look that
> difficult, just time consuming on the frontend with the initial setup.

The problem with custom formats is that you lose all ability for GCC
to check format strings.

^ permalink raw reply

* Re: [PATCH iproute2 net-next 0/3] ss: Allow selection of columns to be displayed
From: David Ahern @ 2018-10-30 16:45 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Stefano Brivio, Yoann P., netdev
In-Reply-To: <20181030093842.0e174ea6@xeon-e3>

On 10/30/18 10:38 AM, Stephen Hemminger wrote:
> On Tue, 30 Oct 2018 10:34:45 -0600
> David Ahern <dsahern@gmail.com> wrote:
> 
>> On 10/30/18 9:05 AM, Stefano Brivio wrote:
>>> Now that we have an abstraction for columns, it's relatively easy to
>>> selectively display only some of them, and Yoann has a use case for it.
>>>
>>> Patch 1/3 fixes a rendering issue that shows up only when display of
>>> arbitrary columns is disabled. Patch 2/3 implements the relevant option,
>>> and patch 3/3 makes the output more readable when some columns are
>>> disabled.
>>>
>>>  
>>
>> I like the intent, and I have prototyped something similar for 'ip'.
>>
>> A more flexible approach is to use format strings to allow users to
>> customize the output order and whitespace as well. So for ss and your
>> column list (winging it here):
>>
>>     netid          = %N
>>     state          = %S
>>     recv Q         = %Qr
>>     send Q         = %Qs
>>     local address  = %Al
>>     local port     = %Pl
>>     remote address = %Ar
>>     remote port    = %Pr
>>     process data   = %p
>>     ...
>>
>> then a format string could be: "%S  %Qr %Qs  %Al:%Pl %Ar:%Pr  %p\n"
>>
>> or for csv output: "%S,%Qr,%Qs,%Al,%Pl,%Ar,%Pr,%p\n"
>>
>> I have not had time to look into an implementation for ip. Conceptually
>> - and scanning the kernel's vsprintf code - it does not look that
>> difficult, just time consuming on the frontend with the initial setup.
> 
> The problem with custom formats is that you lose all ability for Gcc
> to check format strings.
> 

Sure, trade-offs. A custom print string is powerful.

While selecting columns is an improvement, column ordering is also
important - even handling other output formats (csv).

^ permalink raw reply

* [RFC PATCH v3 00/10] udp: implement GRO support
From: Paolo Abeni @ 2018-10-30 17:24 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Willem de Bruijn, Steffen Klassert,
	Subash Abhinov Kasiviswanathan

This series implements GRO support for UDP sockets, as the RX counterpart
of commit bec1f6f69736 ("udp: generate gso with UDP_SEGMENT").
The core functionality is implemented by the second patch, introducing a new
sockopt to enable UDP_GRO, while patch 3 implements support for passing the
segment size to the user space via a new cmsg.
UDP GRO performs a socket lookup for each ingress packet and aggregates datagrams
directed to UDP GRO enabled sockets with constant l4 tuple.

UDP GRO packets can land on non GRO-enabled sockets, e.g. due to iptables NAT
rules, and that could potentially confuse existing applications.

The solution adopted here is to de-segment the GRO packet before enqueuing
as needed. Since we must cope with packet reinsertion after de-segmentation,
the relevant code is factored-out in ipv4 and ipv6 specific helpers and exposed
to UDP usage.

While the current code can probably be improved, this safeguard, implemented in
patches 4-7, allows future enhancements to enable UDP GSO offload on more
virtual devices, eventually even on forwarded packets.

The last 4 patches implement some performance and functional self-tests,
re-using the existing udpgso infrastructure. The problematic scenario described
above is explicitly tested.

This revision of the series tries to address the feedback provided by Willem,
Steffen and Subash, fixing several bugs along the way.

rfc v2 - rfc v3:
 - cope better with exceptional conditions
 - test cases cleanup

rfc v1 - rfc v2:
 - use a new option to enable UDP GRO
 - use static keys to protect the UDP GRO socket lookup
 - cope with UDP GRO misdirection
 - add self-tests

Paolo Abeni (10):
  udp: implement complete book-keeping for encap_needed
  udp: implement GRO for plain UDP sockets.
  udp: add support for UDP_GRO cmsg
  ip: factor out protocol delivery helper
  ipv6: factor out protocol delivery helper
  udp: cope with UDP GRO packet misdirection
  selftests: add GRO support to udp bench rx program
  selftests: conditionally enable XDP support in udpgso_bench_rx
  selftests: add some benchmark for UDP GRO
  selftests: add functionals test for UDP GRO

 include/linux/udp.h                           |  25 ++-
 include/net/udp.h                             |  51 ++++-
 include/net/udp_tunnel.h                      |   6 +
 include/uapi/linux/udp.h                      |   1 +
 net/ipv4/ip_input.c                           |  73 ++++---
 net/ipv4/udp.c                                |  54 ++++-
 net/ipv4/udp_offload.c                        | 109 ++++++++--
 net/ipv6/ip6_input.c                          |  28 +--
 net/ipv6/udp.c                                |  44 +++-
 net/ipv6/udp_offload.c                        |   6 +-
 tools/testing/selftests/net/Makefile          |  70 +++++++
 tools/testing/selftests/net/udpgro.sh         | 147 +++++++++++++
 tools/testing/selftests/net/udpgro_bench.sh   |  94 +++++++++
 tools/testing/selftests/net/udpgso_bench.sh   |   2 +-
 tools/testing/selftests/net/udpgso_bench_rx.c | 193 ++++++++++++++++--
 tools/testing/selftests/net/udpgso_bench_tx.c |  22 +-
 tools/testing/selftests/net/xdp_dummy.c       |  13 ++
 17 files changed, 816 insertions(+), 122 deletions(-)
 create mode 100755 tools/testing/selftests/net/udpgro.sh
 create mode 100755 tools/testing/selftests/net/udpgro_bench.sh
 create mode 100644 tools/testing/selftests/net/xdp_dummy.c

-- 
2.17.2

^ permalink raw reply

* [RFC PATCH v3 01/10] udp: implement complete book-keeping for encap_needed
From: Paolo Abeni @ 2018-10-30 17:24 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Willem de Bruijn, Steffen Klassert,
	Subash Abhinov Kasiviswanathan
In-Reply-To: <cover.1540920083.git.pabeni@redhat.com>

The *encap_needed static keys are enabled by UDP tunnels
and several UDP encapsulation types, but they are never
turned off. This can cause unneeded overall performance
degradation for systems where such features are used
transiently.

This patch introduces complete book-keeping for such keys,
decreasing the usage at socket destruction time, if needed,
and avoiding that the same socket could increase the key
usage multiple times.

rfc v2 - rfc v3:
 - use udp_tunnel_encap_enable() in setsockopt()

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h      |  7 ++++++-
 include/net/udp_tunnel.h |  6 ++++++
 net/ipv4/udp.c           | 17 +++++++++++------
 net/ipv6/udp.c           | 14 +++++++++-----
 4 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 320d49d85484..a4dafff407fb 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -49,7 +49,12 @@ struct udp_sock {
 	unsigned int	 corkflag;	/* Cork is required */
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
 	unsigned char	 no_check6_tx:1,/* Send zero UDP6 checksums on TX? */
-			 no_check6_rx:1;/* Allow zero UDP6 checksums on RX? */
+			 no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
+			 encap_enabled:1; /* This socket enabled encap
+					   * processing; UDP tunnels and
+					   * different encapsulation layer set
+					   * this
+					   */
 	/*
 	 * Following member retains the information to create a UDP header
 	 * when the socket is uncorked.
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index fe680ab6b15a..3fbe56430e3b 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -165,6 +165,12 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum)
 
 static inline void udp_tunnel_encap_enable(struct socket *sock)
 {
+	struct udp_sock *up = udp_sk(sock->sk);
+
+	if (up->encap_enabled)
+		return;
+
+	up->encap_enabled = 1;
 #if IS_ENABLED(CONFIG_IPV6)
 	if (sock->sk->sk_family == PF_INET6)
 		ipv6_stub->udpv6_encap_enable();
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ca3ed931f2a9..c51721fb293a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -115,6 +115,7 @@
 #include "udp_impl.h"
 #include <net/sock_reuseport.h>
 #include <net/addrconf.h>
+#include <net/udp_tunnel.h>
 
 struct udp_table udp_table __read_mostly;
 EXPORT_SYMBOL(udp_table);
@@ -2398,11 +2399,15 @@ void udp_destroy_sock(struct sock *sk)
 	bool slow = lock_sock_fast(sk);
 	udp_flush_pending_frames(sk);
 	unlock_sock_fast(sk, slow);
-	if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
-		void (*encap_destroy)(struct sock *sk);
-		encap_destroy = READ_ONCE(up->encap_destroy);
-		if (encap_destroy)
-			encap_destroy(sk);
+	if (static_branch_unlikely(&udp_encap_needed_key)) {
+		if (up->encap_type) {
+			void (*encap_destroy)(struct sock *sk);
+			encap_destroy = READ_ONCE(up->encap_destroy);
+			if (encap_destroy)
+				encap_destroy(sk);
+		}
+		if (up->encap_enabled)
+			static_branch_disable(&udp_encap_needed_key);
 	}
 }
 
@@ -2447,7 +2452,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 			/* FALLTHROUGH */
 		case UDP_ENCAP_L2TPINUDP:
 			up->encap_type = val;
-			udp_encap_enable();
+			udp_tunnel_encap_enable(sk->sk_socket);
 			break;
 		default:
 			err = -ENOPROTOOPT;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index d2d97d07ef27..fc0ce6c59ebb 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1458,11 +1458,15 @@ void udpv6_destroy_sock(struct sock *sk)
 	udp_v6_flush_pending_frames(sk);
 	release_sock(sk);
 
-	if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) {
-		void (*encap_destroy)(struct sock *sk);
-		encap_destroy = READ_ONCE(up->encap_destroy);
-		if (encap_destroy)
-			encap_destroy(sk);
+	if (static_branch_unlikely(&udpv6_encap_needed_key)) {
+		if (up->encap_type) {
+			void (*encap_destroy)(struct sock *sk);
+			encap_destroy = READ_ONCE(up->encap_destroy);
+			if (encap_destroy)
+				encap_destroy(sk);
+		}
+		if (up->encap_enabled)
+			static_branch_disable(&udpv6_encap_needed_key);
 	}
 
 	inet6_destroy_sock(sk);
-- 
2.17.2

^ permalink raw reply related

* [RFC PATCH v3 02/10] udp: implement GRO for plain UDP sockets.
From: Paolo Abeni @ 2018-10-30 17:24 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Willem de Bruijn, Steffen Klassert,
	Subash Abhinov Kasiviswanathan
In-Reply-To: <cover.1540920083.git.pabeni@redhat.com>

This is the RX counterpart of commit bec1f6f69736 ("udp: generate gso
with UDP_SEGMENT"). When UDP_GRO is enabled, such socket is also
eligible for GRO in the rx path: UDP segments directed to such socket
are assembled into a larger GSO_UDP_L4 packet.

The core UDP GRO support is enabled with setsockopt(UDP_GRO).

Initial benchmark numbers:

Before:
udp rx:   1079 MB/s   769065 calls/s

After:
udp rx:   1466 MB/s    24877 calls/s

This change introduces a side effect with respect to UDP tunnels:
after a UDP tunnel creation, now the kernel performs a lookup per ingress
UDP packet, while before such lookup happened only if the ingress packet
carried a valid internal header csum.

rfc v2 -> rfc v3:
 - fixed typos in macro name and comments
 - really enforce UDP_GRO_CNT_MAX, instead of UDP_GRO_CNT_MAX + 1
 - acquire socket lock in UDP_GRO setsockopt

rfc v1 -> rfc v2:
 - use a new option to enable UDP GRO
 - use static keys to protect the UDP GRO socket lookup

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
--
Note: I opted for acquiring the socket lock only for the newly introduced
setsockopt instead of for every option, despite the previous conversation on
this topic, to avoid introducing somewhat larger and unrelated changes.
---
 include/linux/udp.h      |   3 +-
 include/uapi/linux/udp.h |   1 +
 net/ipv4/udp.c           |   8 +++
 net/ipv4/udp_offload.c   | 109 +++++++++++++++++++++++++++++++--------
 net/ipv6/udp_offload.c   |   6 +--
 5 files changed, 99 insertions(+), 28 deletions(-)

diff --git a/include/linux/udp.h b/include/linux/udp.h
index a4dafff407fb..f613b329852e 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -50,11 +50,12 @@ struct udp_sock {
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
 	unsigned char	 no_check6_tx:1,/* Send zero UDP6 checksums on TX? */
 			 no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
-			 encap_enabled:1; /* This socket enabled encap
+			 encap_enabled:1, /* This socket enabled encap
 					   * processing; UDP tunnels and
 					   * different encapsulation layer set
 					   * this
 					   */
+			 gro_enabled:1;	/* Can accept GRO packets */
 	/*
 	 * Following member retains the information to create a UDP header
 	 * when the socket is uncorked.
diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h
index 09502de447f5..30baccb6c9c4 100644
--- a/include/uapi/linux/udp.h
+++ b/include/uapi/linux/udp.h
@@ -33,6 +33,7 @@ struct udphdr {
 #define UDP_NO_CHECK6_TX 101	/* Disable sending checksum for UDP6X */
 #define UDP_NO_CHECK6_RX 102	/* Disable accpeting checksum for UDP6 */
 #define UDP_SEGMENT	103	/* Set GSO segmentation size */
+#define UDP_GRO		104	/* This socket can receive UDP GRO packets */
 
 /* UDP encapsulation types */
 #define UDP_ENCAP_ESPINUDP_NON_IKE	1 /* draft-ietf-ipsec-nat-t-ike-00/01 */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c51721fb293a..4d4f4d044c28 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2474,6 +2474,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 		up->gso_size = val;
 		break;
 
+	case UDP_GRO:
+		lock_sock(sk);
+		if (valbool)
+			udp_tunnel_encap_enable(sk->sk_socket);
+		up->gro_enabled = valbool;
+		release_sock(sk);
+		break;
+
 	/*
 	 * 	UDP-Lite's partial checksum coverage (RFC 3828).
 	 */
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 802f2bc00d69..0646d61f4fa8 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -343,6 +343,54 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 	return segs;
 }
 
+#define UDP_GRO_CNT_MAX 64
+static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
+					       struct sk_buff *skb)
+{
+	struct udphdr *uh = udp_hdr(skb);
+	struct sk_buff *pp = NULL;
+	struct udphdr *uh2;
+	struct sk_buff *p;
+
+	/* requires non zero csum, for symmetry with GSO */
+	if (!uh->check) {
+		NAPI_GRO_CB(skb)->flush = 1;
+		return NULL;
+	}
+
+	/* pull encapsulating udp header */
+	skb_gro_pull(skb, sizeof(struct udphdr));
+	skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
+
+	list_for_each_entry(p, head, list) {
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		uh2 = udp_hdr(p);
+
+		/* Match ports only, as csum is always non zero */
+		if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+
+		/* Terminate the flow on len mismatch or if it grow "too much".
+		 * Under small packet flood GRO count could elsewhere grow a lot
+		 * leading to execessive truesize values
+		 */
+		if (!skb_gro_receive(p, skb) &&
+		    NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX)
+			pp = p;
+		else if (uh->len != uh2->len)
+			pp = p;
+
+		return pp;
+	}
+
+	/* mismatch, but we never need to flush */
+	return NULL;
+}
+
 struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
 				struct udphdr *uh, udp_lookup_t lookup)
 {
@@ -353,23 +401,27 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
 	int flush = 1;
 	struct sock *sk;
 
+	rcu_read_lock();
+	sk = (*lookup)(skb, uh->source, uh->dest);
+	if (!sk)
+		goto out_unlock;
+
+	if (udp_sk(sk)->gro_enabled) {
+		pp = call_gro_receive(udp_gro_receive_segment, head, skb);
+		rcu_read_unlock();
+		return pp;
+	}
+
 	if (NAPI_GRO_CB(skb)->encap_mark ||
 	    (skb->ip_summed != CHECKSUM_PARTIAL &&
 	     NAPI_GRO_CB(skb)->csum_cnt == 0 &&
-	     !NAPI_GRO_CB(skb)->csum_valid))
-		goto out;
+	     !NAPI_GRO_CB(skb)->csum_valid) ||
+	    !udp_sk(sk)->gro_receive)
+		goto out_unlock;
 
 	/* mark that this skb passed once through the tunnel gro layer */
 	NAPI_GRO_CB(skb)->encap_mark = 1;
 
-	rcu_read_lock();
-	sk = (*lookup)(skb, uh->source, uh->dest);
-
-	if (sk && udp_sk(sk)->gro_receive)
-		goto unflush;
-	goto out_unlock;
-
-unflush:
 	flush = 0;
 
 	list_for_each_entry(p, head, list) {
@@ -394,7 +446,6 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
 
 out_unlock:
 	rcu_read_unlock();
-out:
 	skb_gro_flush_final(skb, pp, flush);
 	return pp;
 }
@@ -427,6 +478,19 @@ static struct sk_buff *udp4_gro_receive(struct list_head *head,
 	return NULL;
 }
 
+static int udp_gro_complete_segment(struct sk_buff *skb)
+{
+	struct udphdr *uh = udp_hdr(skb);
+
+	skb->csum_start = (unsigned char *)uh - skb->head;
+	skb->csum_offset = offsetof(struct udphdr, check);
+	skb->ip_summed = CHECKSUM_PARTIAL;
+
+	skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+	skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4;
+	return 0;
+}
+
 int udp_gro_complete(struct sk_buff *skb, int nhoff,
 		     udp_lookup_t lookup)
 {
@@ -437,16 +501,21 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
 
 	uh->len = newlen;
 
-	/* Set encapsulation before calling into inner gro_complete() functions
-	 * to make them set up the inner offsets.
-	 */
-	skb->encapsulation = 1;
-
 	rcu_read_lock();
 	sk = (*lookup)(skb, uh->source, uh->dest);
-	if (sk && udp_sk(sk)->gro_complete)
+	if (sk && udp_sk(sk)->gro_enabled) {
+		err = udp_gro_complete_segment(skb);
+	} else if (sk && udp_sk(sk)->gro_complete) {
+		skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM
+					: SKB_GSO_UDP_TUNNEL;
+
+		/* Set encapsulation before calling into inner gro_complete()
+		 * functions to make them set up the inner offsets.
+		 */
+		skb->encapsulation = 1;
 		err = udp_sk(sk)->gro_complete(sk, skb,
 				nhoff + sizeof(struct udphdr));
+	}
 	rcu_read_unlock();
 
 	if (skb->remcsum_offload)
@@ -461,13 +530,9 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
 	const struct iphdr *iph = ip_hdr(skb);
 	struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
 
-	if (uh->check) {
-		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+	if (uh->check)
 		uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr,
 					  iph->daddr, 0);
-	} else {
-		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
-	}
 
 	return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
 }
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 1b8e161ac527..828b2457f97b 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -147,13 +147,9 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff)
 	const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
 	struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
 
-	if (uh->check) {
-		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+	if (uh->check)
 		uh->check = ~udp_v6_check(skb->len - nhoff, &ipv6h->saddr,
 					  &ipv6h->daddr, 0);
-	} else {
-		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
-	}
 
 	return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb);
 }
-- 
2.17.2

^ permalink raw reply related

* [RFC PATCH v3 03/10] udp: add support for UDP_GRO cmsg
From: Paolo Abeni @ 2018-10-30 17:24 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Willem de Bruijn, Steffen Klassert,
	Subash Abhinov Kasiviswanathan
In-Reply-To: <cover.1540920083.git.pabeni@redhat.com>

When UDP GRO is enabled, the UDP_GRO cmsg will carry the ingress
datagram size. User-space can use such info to compute the original
packets layout.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
Note: I avoided setting a bit in cmsg_flag for UDP_GRO, as that
attempt produced some uglification, especially on the ipv6 side,
with no measurable performance benefit.
---
 include/linux/udp.h | 11 +++++++++++
 net/ipv4/udp.c      |  4 ++++
 net/ipv6/udp.c      |  3 +++
 3 files changed, 18 insertions(+)

diff --git a/include/linux/udp.h b/include/linux/udp.h
index f613b329852e..e23d5024f42f 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -121,6 +121,17 @@ static inline bool udp_get_no_check6_rx(struct sock *sk)
 	return udp_sk(sk)->no_check6_rx;
 }
 
+static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk,
+				 struct sk_buff *skb)
+{
+	int gso_size;
+
+	if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
+		gso_size = skb_shinfo(skb)->gso_size;
+		put_cmsg(msg, SOL_UDP, UDP_GRO, sizeof(gso_size), &gso_size);
+	}
+}
+
 #define udp_portaddr_for_each_entry(__sk, list) \
 	hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node)
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 4d4f4d044c28..b345f71b1cbb 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1714,6 +1714,10 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
 		*addr_len = sizeof(*sin);
 	}
+
+	if (udp_sk(sk)->gro_enabled)
+		udp_cmsg_recv(msg, sk, skb);
+
 	if (inet->cmsg_flags)
 		ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off);
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index fc0ce6c59ebb..8e76e719305c 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -421,6 +421,9 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		*addr_len = sizeof(*sin6);
 	}
 
+	if (udp_sk(sk)->gro_enabled)
+		udp_cmsg_recv(msg, sk, skb);
+
 	if (np->rxopt.all)
 		ip6_datagram_recv_common_ctl(sk, msg, skb);
 
-- 
2.17.2

^ permalink raw reply related

* [RFC PATCH v3 04/10] ip: factor out protocol delivery helper
From: Paolo Abeni @ 2018-10-30 17:24 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Willem de Bruijn, Steffen Klassert,
	Subash Abhinov Kasiviswanathan
In-Reply-To: <cover.1540920083.git.pabeni@redhat.com>

So that we can re-use it at the UDP level in a later patch.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/ipv4/ip_input.c | 73 ++++++++++++++++++++++-----------------------
 1 file changed, 36 insertions(+), 37 deletions(-)

diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 35a786c0aaa0..72250b4e466d 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -188,51 +188,50 @@ bool ip_call_ra_chain(struct sk_buff *skb)
 	return false;
 }
 
-static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
 {
-	__skb_pull(skb, skb_network_header_len(skb));
-
-	rcu_read_lock();
-	{
-		int protocol = ip_hdr(skb)->protocol;
-		const struct net_protocol *ipprot;
-		int raw;
+	const struct net_protocol *ipprot;
+	int raw, ret;
 
-	resubmit:
-		raw = raw_local_deliver(skb, protocol);
+resubmit:
+	raw = raw_local_deliver(skb, protocol);
 
-		ipprot = rcu_dereference(inet_protos[protocol]);
-		if (ipprot) {
-			int ret;
-
-			if (!ipprot->no_policy) {
-				if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
-					kfree_skb(skb);
-					goto out;
-				}
-				nf_reset(skb);
+	ipprot = rcu_dereference(inet_protos[protocol]);
+	if (ipprot) {
+		if (!ipprot->no_policy) {
+			if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+				kfree_skb(skb);
+				return;
 			}
-			ret = ipprot->handler(skb);
-			if (ret < 0) {
-				protocol = -ret;
-				goto resubmit;
+			nf_reset(skb);
+		}
+		ret = ipprot->handler(skb);
+		if (ret < 0) {
+			protocol = -ret;
+			goto resubmit;
+		}
+		__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+	} else {
+		if (!raw) {
+			if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+				__IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
+				icmp_send(skb, ICMP_DEST_UNREACH,
+					  ICMP_PROT_UNREACH, 0);
 			}
-			__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+			kfree_skb(skb);
 		} else {
-			if (!raw) {
-				if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
-					__IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
-					icmp_send(skb, ICMP_DEST_UNREACH,
-						  ICMP_PROT_UNREACH, 0);
-				}
-				kfree_skb(skb);
-			} else {
-				__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
-				consume_skb(skb);
-			}
+			__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+			consume_skb(skb);
 		}
 	}
- out:
+}
+
+static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	__skb_pull(skb, skb_network_header_len(skb));
+
+	rcu_read_lock();
+	ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol);
 	rcu_read_unlock();
 
 	return 0;
-- 
2.17.2

^ permalink raw reply related

* [RFC PATCH v3 05/10] ipv6: factor out protocol delivery helper
From: Paolo Abeni @ 2018-10-30 17:24 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Willem de Bruijn, Steffen Klassert,
	Subash Abhinov Kasiviswanathan
In-Reply-To: <cover.1540920083.git.pabeni@redhat.com>

So that we can re-use it at the UDP level in the next patch.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/ipv6/ip6_input.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 96577e742afd..3065226bdc57 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -319,28 +319,26 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
 /*
  *	Deliver the packet to the host
  */
-
-
-static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr,
+			      bool have_final)
 {
 	const struct inet6_protocol *ipprot;
 	struct inet6_dev *idev;
 	unsigned int nhoff;
-	int nexthdr;
 	bool raw;
-	bool have_final = false;
 
 	/*
 	 *	Parse extension headers
 	 */
 
-	rcu_read_lock();
 resubmit:
 	idev = ip6_dst_idev(skb_dst(skb));
-	if (!pskb_pull(skb, skb_transport_offset(skb)))
-		goto discard;
 	nhoff = IP6CB(skb)->nhoff;
-	nexthdr = skb_network_header(skb)[nhoff];
+	if (!have_final) {
+		if (!pskb_pull(skb, skb_transport_offset(skb)))
+			goto discard;
+		nexthdr = skb_network_header(skb)[nhoff];
+	}
 
 resubmit_final:
 	raw = raw6_local_deliver(skb, nexthdr);
@@ -411,13 +409,19 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk
 			consume_skb(skb);
 		}
 	}
-	rcu_read_unlock();
-	return 0;
+	return;
 
 discard:
 	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
-	rcu_read_unlock();
 	kfree_skb(skb);
+}
+
+static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	rcu_read_lock();
+	ip6_protocol_deliver_rcu(net, skb, 0, false);
+	rcu_read_unlock();
+
 	return 0;
 }
 
-- 
2.17.2

^ permalink raw reply related

* [RFC PATCH v3 06/10] udp: cope with UDP GRO packet misdirection
From: Paolo Abeni @ 2018-10-30 17:24 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Willem de Bruijn, Steffen Klassert,
	Subash Abhinov Kasiviswanathan
In-Reply-To: <cover.1540920083.git.pabeni@redhat.com>

In some scenarios, the GRO engine can assemble a UDP GRO packet
that ultimately lands on a non GRO-enabled socket.
This patch tries to address the issue by explicitly checking for the UDP
socket features before enqueuing the packet, and segmenting the
unexpected GRO packet, as needed.

We must also cope with re-insertion requests: after segmentation the
UDP code calls the helper introduced by the previous patches, as needed.

Segmentation is performed by a common helper, which takes care of
updating socket and protocol stats in case of failure.

rfc v2 -> rfc v3
 - moved udp_rcv_segment() into net/udp.h, account errors to socket
   and ns, always return NULL or segs list

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/udp.h |  6 ++++++
 include/net/udp.h   | 51 ++++++++++++++++++++++++++++++++++++++-------
 net/ipv4/udp.c      | 25 +++++++++++++++++++++-
 net/ipv6/udp.c      | 27 +++++++++++++++++++++++-
 4 files changed, 99 insertions(+), 10 deletions(-)

diff --git a/include/linux/udp.h b/include/linux/udp.h
index e23d5024f42f..0a9c54e76305 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -132,6 +132,12 @@ static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk,
 	}
 }
 
+static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb)
+{
+	return !udp_sk(sk)->gro_enabled && skb_is_gso(skb) &&
+	       skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4;
+}
+
 #define udp_portaddr_for_each_entry(__sk, list) \
 	hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node)
 
diff --git a/include/net/udp.h b/include/net/udp.h
index 9e82cb391dea..f94aed316a04 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -406,17 +406,24 @@ static inline int copy_linear_skb(struct sk_buff *skb, int len, int off,
 } while(0)
 
 #if IS_ENABLED(CONFIG_IPV6)
-#define __UDPX_INC_STATS(sk, field)					\
-do {									\
-	if ((sk)->sk_family == AF_INET)					\
-		__UDP_INC_STATS(sock_net(sk), field, 0);		\
-	else								\
-		__UDP6_INC_STATS(sock_net(sk), field, 0);		\
-} while (0)
+#define __UDPX_MIB(sk, ipv4)						\
+({									\
+	ipv4 ? (IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_statistics :	\
+				 sock_net(sk)->mib.udp_statistics) :	\
+		(IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_stats_in6 :	\
+				 sock_net(sk)->mib.udp_stats_in6);	\
+})
 #else
-#define __UDPX_INC_STATS(sk, field) __UDP_INC_STATS(sock_net(sk), field, 0)
+#define __UDPX_MIB(sk, ipv4)						\
+({									\
+	IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_statistics :		\
+			 sock_net(sk)->mib.udp_statistics;		\
+})
 #endif
 
+#define __UDPX_INC_STATS(sk, field) \
+	__SNMP_INC_STATS(__UDPX_MIB(sk, (sk)->sk_family == AF_INET), field)
+
 #ifdef CONFIG_PROC_FS
 struct udp_seq_afinfo {
 	sa_family_t			family;
@@ -450,4 +457,32 @@ DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
 void udpv6_encap_enable(void);
 #endif
 
+static inline struct sk_buff *udp_rcv_segment(struct sock *sk,
+					      struct sk_buff *skb)
+{
+	bool ipv4 = skb->protocol == htons(ETH_P_IP);
+	int segs_nr = skb_shinfo(skb)->gso_segs;
+	struct sk_buff *segs;
+
+	/* the GSO CB lays after the UDP one, no need to save and restore any
+	 * CB fragment
+	 */
+	segs = __skb_gso_segment(skb, NETIF_F_SG, false);
+	if (unlikely(IS_ERR(segs))) {
+		kfree_skb(skb);
+		goto drop;
+	}
+
+	if (unlikely(!segs))
+		goto drop;
+
+	consume_skb(skb);
+	return segs;
+
+drop:
+	atomic_add(segs_nr, &sk->sk_drops);
+	SNMP_ADD_STATS(__UDPX_MIB(sk, ipv4), UDP_MIB_INERRORS, segs_nr);
+	return NULL;
+}
+
 #endif	/* _UDP_H */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index b345f71b1cbb..b45033f63673 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1909,7 +1909,7 @@ EXPORT_SYMBOL(udp_encap_enable);
  * Note that in the success and error cases, the skb is assumed to
  * have either been requeued or freed.
  */
-static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct udp_sock *up = udp_sk(sk);
 	int is_udplite = IS_UDPLITE(sk);
@@ -2012,6 +2012,29 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	return -1;
 }
 
+void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int proto);
+
+static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct sk_buff *next, *segs;
+	int ret;
+
+	if (likely(!udp_unexpected_gso(sk, skb)))
+		return udp_queue_rcv_one_skb(sk, skb);
+
+	BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET);
+	__skb_push(skb, -skb_mac_offset(skb));
+	segs = udp_rcv_segment(sk, skb);
+	for (skb = segs; skb; skb = next) {
+		next = skb->next;
+		__skb_pull(skb, skb_transport_offset(skb));
+		ret = udp_queue_rcv_one_skb(sk, skb);
+		if (ret > 0)
+			ip_protocol_deliver_rcu(dev_net(skb->dev), skb, -ret);
+	}
+	return 0;
+}
+
 /* For TCP sockets, sk_rx_dst is protected by socket lock
  * For UDP, we use xchg() to guard against concurrent changes.
  */
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 8e76e719305c..137c421bef82 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -558,7 +558,7 @@ void udpv6_encap_enable(void)
 }
 EXPORT_SYMBOL(udpv6_encap_enable);
 
-static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct udp_sock *up = udp_sk(sk);
 	int is_udplite = IS_UDPLITE(sk);
@@ -641,6 +641,31 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	return -1;
 }
 
+void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr,
+			      bool have_final);
+
+static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct sk_buff *next, *segs;
+	int ret;
+
+	if (likely(!udp_unexpected_gso(sk, skb)))
+		return udpv6_queue_rcv_one_skb(sk, skb);
+
+	__skb_push(skb, -skb_mac_offset(skb));
+	segs = udp_rcv_segment(sk, skb);
+	for (skb = segs; skb; skb = next) {
+		next = skb->next;
+		__skb_pull(skb, skb_transport_offset(skb));
+
+		ret = udpv6_queue_rcv_one_skb(sk, skb);
+		if (ret > 0)
+			ip6_protocol_deliver_rcu(dev_net(skb->dev), skb, ret,
+						 true);
+	}
+	return 0;
+}
+
 static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
 				   __be16 loc_port, const struct in6_addr *loc_addr,
 				   __be16 rmt_port, const struct in6_addr *rmt_addr,
-- 
2.17.2

^ permalink raw reply related

* [RFC PATCH v3 07/10] selftests: add GRO support to udp bench rx program
From: Paolo Abeni @ 2018-10-30 17:24 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Willem de Bruijn, Steffen Klassert,
	Subash Abhinov Kasiviswanathan
In-Reply-To: <cover.1540920083.git.pabeni@redhat.com>

And fix a couple of buglets (port option processing,
clean termination on SIGINT). This is preparatory work
for GRO tests.

rfc v2 -> rfc v3:
 - use ETH_MAX_MTU

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/udpgso_bench_rx.c | 37 +++++++++++++++----
 1 file changed, 30 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c
index 727cf67a3f75..8f48d7fb32cf 100644
--- a/tools/testing/selftests/net/udpgso_bench_rx.c
+++ b/tools/testing/selftests/net/udpgso_bench_rx.c
@@ -31,9 +31,15 @@
 #include <sys/wait.h>
 #include <unistd.h>
 
+#ifndef UDP_GRO
+#define UDP_GRO		104
+#endif
+
 static int  cfg_port		= 8000;
 static bool cfg_tcp;
 static bool cfg_verify;
+static bool cfg_read_all;
+static bool cfg_gro_segment;
 
 static bool interrupted;
 static unsigned long packets, bytes;
@@ -63,6 +69,8 @@ static void do_poll(int fd)
 
 	do {
 		ret = poll(&pfd, 1, 10);
+		if (interrupted)
+			break;
 		if (ret == -1)
 			error(1, errno, "poll");
 		if (ret == 0)
@@ -70,7 +78,7 @@ static void do_poll(int fd)
 		if (pfd.revents != POLLIN)
 			error(1, errno, "poll: 0x%x expected 0x%x\n",
 					pfd.revents, POLLIN);
-	} while (!ret && !interrupted);
+	} while (!ret);
 }
 
 static int do_socket(bool do_tcp)
@@ -102,6 +110,8 @@ static int do_socket(bool do_tcp)
 			error(1, errno, "listen");
 
 		do_poll(accept_fd);
+		if (interrupted)
+			exit(0);
 
 		fd = accept(accept_fd, NULL, NULL);
 		if (fd == -1)
@@ -167,10 +177,10 @@ static void do_verify_udp(const char *data, int len)
 /* Flush all outstanding datagrams. Verify first few bytes of each. */
 static void do_flush_udp(int fd)
 {
-	static char rbuf[ETH_DATA_LEN];
+	static char rbuf[ETH_MAX_MTU];
 	int ret, len, budget = 256;
 
-	len = cfg_verify ? sizeof(rbuf) : 0;
+	len = cfg_read_all ? sizeof(rbuf) : 0;
 	while (budget--) {
 		/* MSG_TRUNC will make return value full datagram length */
 		ret = recv(fd, rbuf, len, MSG_TRUNC | MSG_DONTWAIT);
@@ -178,7 +188,7 @@ static void do_flush_udp(int fd)
 			return;
 		if (ret == -1)
 			error(1, errno, "recv");
-		if (len) {
+		if (len && cfg_verify) {
 			if (ret == 0)
 				error(1, errno, "recv: 0 byte datagram\n");
 
@@ -192,23 +202,30 @@ static void do_flush_udp(int fd)
 
 static void usage(const char *filepath)
 {
-	error(1, 0, "Usage: %s [-tv] [-p port]", filepath);
+	error(1, 0, "Usage: %s [-Grtv] [-p port]", filepath);
 }
 
 static void parse_opts(int argc, char **argv)
 {
 	int c;
 
-	while ((c = getopt(argc, argv, "ptv")) != -1) {
+	while ((c = getopt(argc, argv, "Gp:rtv")) != -1) {
 		switch (c) {
+		case 'G':
+			cfg_gro_segment = true;
+			break;
 		case 'p':
-			cfg_port = htons(strtoul(optarg, NULL, 0));
+			cfg_port = strtoul(optarg, NULL, 0);
+			break;
+		case 'r':
+			cfg_read_all = true;
 			break;
 		case 't':
 			cfg_tcp = true;
 			break;
 		case 'v':
 			cfg_verify = true;
+			cfg_read_all = true;
 			break;
 		}
 	}
@@ -227,6 +244,12 @@ static void do_recv(void)
 
 	fd = do_socket(cfg_tcp);
 
+	if (cfg_gro_segment && !cfg_tcp) {
+		int val = 1;
+		if (setsockopt(fd, IPPROTO_UDP, UDP_GRO, &val, sizeof(val)))
+			error(1, errno, "setsockopt UDP_GRO");
+	}
+
 	treport = gettimeofday_ms() + 1000;
 	do {
 		do_poll(fd);
-- 
2.17.2

^ permalink raw reply related

* [RFC PATCH v3 08/10] selftests: conditionally enable XDP support in udpgso_bench_rx
From: Paolo Abeni @ 2018-10-30 17:24 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Willem de Bruijn, Steffen Klassert,
	Subash Abhinov Kasiviswanathan
In-Reply-To: <cover.1540920083.git.pabeni@redhat.com>

XDP support will be used by a later patch to test the GRO path
in a net namespace, leveraging the veth XDP implementation.
To avoid breaking existing setups, XDP support is conditionally
enabled and built only if llc is locally available.

rfc v2 -> rfc v3:
 - move 'x' option handling here

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/Makefile          | 69 +++++++++++++++++++
 tools/testing/selftests/net/udpgso_bench_rx.c | 41 ++++++++++-
 tools/testing/selftests/net/xdp_dummy.c       | 13 ++++
 3 files changed, 121 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/net/xdp_dummy.c

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 256d82d5fa87..176459b7c4d6 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -16,8 +16,77 @@ TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
 TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls
 
 KSFT_KHDR_INSTALL := 1
+
+# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
+#  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
+LLC ?= llc
+CLANG ?= clang
+LLVM_OBJCOPY ?= llvm-objcopy
+BTF_PAHOLE ?= pahole
+HAS_LLC := $(shell which $(LLC) 2>/dev/null)
+
+# conditionally enable tests requiring llc
+ifneq (, $(HAS_LLC))
+TEST_GEN_FILES += xdp_dummy.o
+endif
+
 include ../lib.mk
 
+ifneq (, $(HAS_LLC))
+
+# Detect that we're cross compiling and use the cross compiler
+ifdef CROSS_COMPILE
+CLANG_ARCH_ARGS = -target $(ARCH)
+endif
+
+PROBE := $(shell $(LLC) -march=bpf -mcpu=probe -filetype=null /dev/null 2>&1)
+
+# Let newer LLVM versions transparently probe the kernel for availability
+# of full BPF instruction set.
+ifeq ($(PROBE),)
+  CPU ?= probe
+else
+  CPU ?= generic
+endif
+
+SRC_PATH := $(abspath ../../../..)
+LIB_PATH := $(SRC_PATH)/tools/lib
+XDP_CFLAGS := -D SUPPORT_XDP=1 -I$(LIB_PATH)
+LIBBPF = $(LIB_PATH)/bpf/libbpf.a
+BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris)
+BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF)
+BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 'usage.*llvm')
+CLANG_SYS_INCLUDES := $(shell $(CLANG) -v -E - </dev/null 2>&1 \
+        | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
+CLANG_FLAGS = -I. -I$(SRC_PATH)/include -I../bpf/ \
+	      $(CLANG_SYS_INCLUDES) -Wno-compare-distinct-pointer-types
+
+ifneq ($(and $(BTF_LLC_PROBE),$(BTF_PAHOLE_PROBE),$(BTF_OBJCOPY_PROBE)),)
+	CLANG_CFLAGS += -g
+	LLC_FLAGS += -mattr=dwarfris
+	DWARF2BTF = y
+endif
+
+$(LIBBPF): FORCE
+# Fix up variables inherited from Kbuild that tools/ build system won't like
+	$(MAKE) -C $(dir $@) RM='rm -rf' LDFLAGS= srctree=$(SRC_PATH) O= $(nodir $@)
+
+$(OUTPUT)/udpgso_bench_rx: $(OUTPUT)/udpgso_bench_rx.c $(LIBBPF)
+	$(CC) -o $@ $(XDP_CFLAGS) $(CFLAGS) $(LOADLIBES) $(LDLIBS) $^ -lelf
+
+FORCE:
+
+# bpf program[s] generation
+$(OUTPUT)/%.o: %.c
+	$(CLANG) $(CLANG_FLAGS) \
+		 -O2 -target bpf -emit-llvm -c $< -o - |      \
+	$(LLC) -march=bpf -mcpu=$(CPU) $(LLC_FLAGS) -filetype=obj -o $@
+ifeq ($(DWARF2BTF),y)
+	$(BTF_PAHOLE) -J $@
+endif
+
+endif
+
 $(OUTPUT)/reuseport_bpf_numa: LDFLAGS += -lnuma
 $(OUTPUT)/tcp_mmap: LDFLAGS += -lpthread
 $(OUTPUT)/tcp_inq: LDFLAGS += -lpthread
diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c
index 8f48d7fb32cf..5dcb719abe04 100644
--- a/tools/testing/selftests/net/udpgso_bench_rx.c
+++ b/tools/testing/selftests/net/udpgso_bench_rx.c
@@ -31,6 +31,10 @@
 #include <sys/wait.h>
 #include <unistd.h>
 
+#ifdef SUPPORT_XDP
+#include "bpf/libbpf.h"
+#endif
+
 #ifndef UDP_GRO
 #define UDP_GRO		104
 #endif
@@ -40,6 +44,9 @@ static bool cfg_tcp;
 static bool cfg_verify;
 static bool cfg_read_all;
 static bool cfg_gro_segment;
+#ifdef SUPPORT_XDP
+static int cfg_xdp_iface;
+#endif
 
 static bool interrupted;
 static unsigned long packets, bytes;
@@ -202,14 +209,14 @@ static void do_flush_udp(int fd)
 
 static void usage(const char *filepath)
 {
-	error(1, 0, "Usage: %s [-Grtv] [-p port]", filepath);
+	error(1, 0, "Usage: %s [-Grtv] [-p port] [-x device]", filepath);
 }
 
 static void parse_opts(int argc, char **argv)
 {
 	int c;
 
-	while ((c = getopt(argc, argv, "Gp:rtv")) != -1) {
+	while ((c = getopt(argc, argv, "Gp:rtvx:")) != -1) {
 		switch (c) {
 		case 'G':
 			cfg_gro_segment = true;
@@ -227,6 +234,13 @@ static void parse_opts(int argc, char **argv)
 			cfg_verify = true;
 			cfg_read_all = true;
 			break;
+#ifdef SUPPORT_XDP
+		case 'x':
+			cfg_xdp_iface = if_nametoindex(optarg);
+			if (!cfg_xdp_iface)
+				error(1, errno, "unknown interface %s", optarg);
+			break;
+#endif
 		}
 	}
 
@@ -240,6 +254,9 @@ static void parse_opts(int argc, char **argv)
 static void do_recv(void)
 {
 	unsigned long tnow, treport;
+#ifdef SUPPORT_XDP
+	int prog_fd = -1;
+#endif
 	int fd;
 
 	fd = do_socket(cfg_tcp);
@@ -250,6 +267,22 @@ static void do_recv(void)
 			error(1, errno, "setsockopt UDP_GRO");
 	}
 
+#ifdef SUPPORT_XDP
+	if (cfg_xdp_iface) {
+		struct bpf_prog_load_attr prog_load_attr = {
+			.prog_type	= BPF_PROG_TYPE_XDP,
+			.file 		= "xdp_dummy.o",
+		};
+		struct bpf_object *obj;
+
+		if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+			error(1, errno, "xdp program load failed\n");
+
+		if (bpf_set_link_xdp_fd(cfg_xdp_iface, prog_fd, 0) < 0)
+			error(1, errno, "link set xdp fd failed\n");
+	}
+#endif
+
 	treport = gettimeofday_ms() + 1000;
 	do {
 		do_poll(fd);
@@ -274,6 +307,10 @@ static void do_recv(void)
 
 	if (close(fd))
 		error(1, errno, "close");
+#ifdef SUPPORT_XDP
+	if (cfg_xdp_iface && bpf_set_link_xdp_fd(cfg_xdp_iface, -1, 0))
+		error(1, errno, "removing xdp program");
+#endif
 }
 
 int main(int argc, char **argv)
diff --git a/tools/testing/selftests/net/xdp_dummy.c b/tools/testing/selftests/net/xdp_dummy.c
new file mode 100644
index 000000000000..1a64cf5099ed
--- /dev/null
+++ b/tools/testing/selftests/net/xdp_dummy.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define KBUILD_MODNAME "xdp_dummy"
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+SEC("xdp_dummy")
+int xdp_dummy_prog(struct xdp_md *ctx)
+{
+	return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
2.17.2

^ permalink raw reply related

* [RFC PATCH v3 09/10] selftests: add some benchmark for UDP GRO
From: Paolo Abeni @ 2018-10-30 17:24 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Willem de Bruijn, Steffen Klassert,
	Subash Abhinov Kasiviswanathan
In-Reply-To: <cover.1540920083.git.pabeni@redhat.com>

Run on top of veth pair, using a dummy XDP program to enable the GRO.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/Makefile        |  1 +
 tools/testing/selftests/net/udpgro_bench.sh | 92 +++++++++++++++++++++
 2 files changed, 93 insertions(+)
 create mode 100755 tools/testing/selftests/net/udpgro_bench.sh

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 176459b7c4d6..ac999354af54 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -7,6 +7,7 @@ CFLAGS += -I../../../../usr/include/
 TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh
 TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh ip_defrag.sh
 TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh
+TEST_PROGS += udpgro_bench.sh
 TEST_PROGS_EXTENDED := in_netns.sh
 TEST_GEN_FILES =  socket
 TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
diff --git a/tools/testing/selftests/net/udpgro_bench.sh b/tools/testing/selftests/net/udpgro_bench.sh
new file mode 100755
index 000000000000..03d37e5e7424
--- /dev/null
+++ b/tools/testing/selftests/net/udpgro_bench.sh
@@ -0,0 +1,92 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run a series of udpgro benchmarks
+
+readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)"
+
+cleanup() {
+	local -r jobs="$(jobs -p)"
+	local -r ns="$(ip netns list|grep $PEER_NS)"
+
+	[ -n "${jobs}" ] && kill -INT ${jobs} 2>/dev/null
+	[ -n "$ns" ] && ip netns del $ns 2>/dev/null
+}
+trap cleanup EXIT
+
+run_one() {
+	# use 'rx' as separator between sender args and receiver args
+	local -r all="$@"
+	local -r tx_args=${all%rx*}
+	local -r rx_args=${all#*rx}
+
+	ip netns add "${PEER_NS}"
+	ip -netns "${PEER_NS}" link set lo up
+	ip link add type veth
+	ip link set dev veth0 up
+	ip addr add dev veth0 192.168.1.2/24
+	ip addr add dev veth0 2001:db8::2/64 nodad
+
+	ip link set dev veth1 netns "${PEER_NS}"
+	ip -netns "${PEER_NS}" addr add dev veth1 192.168.1.1/24
+	ip -netns "${PEER_NS}" addr add dev veth1 2001:db8::1/64 nodad
+	ip -netns "${PEER_NS}" link set dev veth1 up
+
+	ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} -r -x veth1 &
+	ip netns exec "${PEER_NS}" ./udpgso_bench_rx -t ${rx_args} -r &
+
+	# Hack: let bg programs complete the startup
+	sleep 0.1
+	./udpgso_bench_tx ${tx_args}
+}
+
+run_in_netns() {
+	local -r args=$@
+
+	./in_netns.sh $0 __subprocess ${args}
+}
+
+run_udp() {
+	local -r args=$@
+
+	echo "udp gso - over veth touching data"
+	run_in_netns ${args} -S rx
+
+	echo "udp gso and gro - over veth touching data"
+	run_in_netns ${args} -S rx -G
+}
+
+run_tcp() {
+	local -r args=$@
+
+	echo "tcp - over veth touching data"
+	run_in_netns ${args} -t rx
+}
+
+run_all() {
+	local -r core_args="-l 4"
+	local -r ipv4_args="${core_args} -4 -D 192.168.1.1"
+	local -r ipv6_args="${core_args} -6 -D 2001:db8::1"
+
+	echo "ipv4"
+	run_tcp "${ipv4_args}"
+	run_udp "${ipv4_args}"
+
+	echo "ipv6"
+	run_tcp "${ipv6_args}"
+	run_udp "${ipv6_args}"
+}
+
+if [ ! -f xdp_dummy.o ]; then
+	echo "Skipping GRO benchmarks - missing LLC"
+	exit 0
+fi
+
+if [[ $# -eq 0 ]]; then
+	run_all
+elif [[ $1 == "__subprocess" ]]; then
+	shift
+	run_one $@
+else
+	run_in_netns $@
+fi
-- 
2.17.2

^ permalink raw reply related

* [RFC PATCH v3 10/10] selftests: add functionals test for UDP GRO
From: Paolo Abeni @ 2018-10-30 17:24 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Willem de Bruijn, Steffen Klassert,
	Subash Abhinov Kasiviswanathan
In-Reply-To: <cover.1540920083.git.pabeni@redhat.com>

Extends the existing udp programs to allow checking for proper
GRO aggregation/GSO size, and run the tests via a shell script, using
a veth pair with XDP program attached to trigger the GRO code path.

rfc v2 -> rfc v3:
 - add missing test program options documentation
 - fix sporadic test failures (receiver faster than sender)

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 tools/testing/selftests/net/Makefile          |   2 +-
 tools/testing/selftests/net/udpgro.sh         | 147 ++++++++++++++++++
 tools/testing/selftests/net/udpgro_bench.sh   |   8 +-
 tools/testing/selftests/net/udpgso_bench.sh   |   2 +-
 tools/testing/selftests/net/udpgso_bench_rx.c | 123 +++++++++++++--
 tools/testing/selftests/net/udpgso_bench_tx.c |  22 ++-
 6 files changed, 281 insertions(+), 23 deletions(-)
 create mode 100755 tools/testing/selftests/net/udpgro.sh

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index ac999354af54..a8a0d256aafb 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -7,7 +7,7 @@ CFLAGS += -I../../../../usr/include/
 TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh
 TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh ip_defrag.sh
 TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh
-TEST_PROGS += udpgro_bench.sh
+TEST_PROGS += udpgro_bench.sh udpgro.sh
 TEST_PROGS_EXTENDED := in_netns.sh
 TEST_GEN_FILES =  socket
 TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
diff --git a/tools/testing/selftests/net/udpgro.sh b/tools/testing/selftests/net/udpgro.sh
new file mode 100755
index 000000000000..3f12b72a3568
--- /dev/null
+++ b/tools/testing/selftests/net/udpgro.sh
@@ -0,0 +1,147 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run a series of udpgro functional tests.
+
+readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)"
+
+cleanup() {
+	local -r jobs="$(jobs -p)"
+	local -r ns="$(ip netns list|grep $PEER_NS)"
+
+	[ -n "${jobs}" ] && kill -1 ${jobs} 2>/dev/null
+	[ -n "$ns" ] && ip netns del $ns 2>/dev/null
+}
+trap cleanup EXIT
+
+cfg_veth() {
+	ip netns add "${PEER_NS}"
+	ip -netns "${PEER_NS}" link set lo up
+	ip link add type veth
+	ip link set dev veth0 up
+	ip addr add dev veth0 192.168.1.2/24
+	ip addr add dev veth0 2001:db8::2/64 nodad
+
+	ip link set dev veth1 netns "${PEER_NS}"
+	ip -netns "${PEER_NS}" addr add dev veth1 192.168.1.1/24
+	ip -netns "${PEER_NS}" addr add dev veth1 2001:db8::1/64 nodad
+	ip -netns "${PEER_NS}" link set dev veth1 up
+}
+
+run_one() {
+	# use 'rx' as separator between sender args and receiver args
+	local -r all="$@"
+	local -r tx_args=${all%rx*}
+	local -r rx_args=${all#*rx}
+
+	cfg_veth
+
+	ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} && \
+		echo "ok" || \
+		echo "failed" &
+
+	# Hack: let bg programs complete the startup
+	sleep 0.1
+	./udpgso_bench_tx ${tx_args}
+	wait $(jobs -p)
+}
+
+run_test() {
+	local -r args=$@
+
+	printf " %-40s" "$1"
+	./in_netns.sh $0 __subprocess $2 rx -G -r -x veth1 $3
+}
+
+run_one_nat() {
+	# use 'rx' as separator between sender args and receiver args
+	local addr1 addr2 pid family="" ipt_cmd=ip6tables
+	local -r all="$@"
+	local -r tx_args=${all%rx*}
+	local -r rx_args=${all#*rx}
+
+	if [[ ${tx_args} = *-4* ]]; then
+		ipt_cmd=iptables
+		family=-4
+		addr1=192.168.1.1
+		addr2=192.168.1.3/24
+	else
+		addr1=2001:db8::1
+		addr2="2001:db8::3/64 nodad"
+	fi
+
+	cfg_veth
+	ip -netns "${PEER_NS}" addr add dev veth1 ${addr2}
+
+	# fool the GRO engine changing the destination address ...
+	ip netns exec "${PEER_NS}" $ipt_cmd -t nat -I PREROUTING -d ${addr1} -j DNAT --to-destination ${addr2%/*}
+
+	# ... so that GRO will match the UDP_GRO enabled socket, but packets
+	# will land on the 'plain' one
+	ip netns exec "${PEER_NS}" ./udpgso_bench_rx -G ${family} -x veth1 -b ${addr1} -n 0 &
+	pid=$!
+	ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${family} -b ${addr2%/*} ${rx_args} && \
+		echo "ok" || \
+		echo "failed"&
+
+	sleep 0.1
+	./udpgso_bench_tx ${tx_args}
+	kill -INT $pid
+	wait $(jobs -p)
+}
+
+run_nat_test() {
+	local -r args=$@
+
+	printf " %-40s" "$1"
+	./in_netns.sh $0 __subprocess_nat $2 rx -r $3
+}
+
+run_all() {
+	local -r core_args="-l 4"
+	local -r ipv4_args="${core_args} -4 -D 192.168.1.1"
+	local -r ipv6_args="${core_args} -6 -D 2001:db8::1"
+
+	echo "ipv4"
+	run_test "no GRO" "${ipv4_args} -M 10 -s 1400" "-4 -n 10 -l 1400"
+
+	# explicitly check we are not receiving UDP_SEGMENT cmsg (-S -1)
+	# when GRO does not take place
+	run_test "no GRO chk cmsg" "${ipv4_args} -M 10 -s 1400" "-4 -n 10 -l 1400 -S -1"
+
+	# the GSO packets are aggregated because:
+	# * veth schedule napi after each xmit
+	# * segmentation happens in BH context, veth napi poll is delayed after
+	#   the transmission of the last segment
+	run_test "GRO" "${ipv4_args} -M 1 -s 14720 -S 0 " "-4 -n 1 -l 14720"
+	run_test "GRO chk cmsg" "${ipv4_args} -M 1 -s 14720 -S 0 " "-4 -n 1 -l 14720 -S 1472"
+	run_test "GRO with custom segment size" "${ipv4_args} -M 1 -s 14720 -S 500 " "-4 -n 1 -l 14720"
+	run_test "GRO with custom segment size cmsg" "${ipv4_args} -M 1 -s 14720 -S 500 " "-4 -n 1 -l 14720 -S 500"
+
+	run_nat_test "bad GRO lookup" "${ipv4_args} -M 1 -s 14720 -S 0" "-n 10 -l 1472"
+
+	echo "ipv6"
+	run_test "no GRO" "${ipv6_args} -M 10 -s 1400" "-n 10 -l 1400"
+	run_test "no GRO chk cmsg" "${ipv6_args} -M 10 -s 1400" "-n 10 -l 1400 -S -1"
+	run_test "GRO" "${ipv6_args} -M 1 -s 14520 -S 0" "-n 1 -l 14520"
+	run_test "GRO chk cmsg" "${ipv6_args} -M 1 -s 14520 -S 0" "-n 1 -l 14520 -S 1452"
+	run_test "GRO with custom segment size" "${ipv6_args} -M 1 -s 14520 -S 500" "-n 1 -l 14520"
+	run_test "GRO with custom segment size cmsg" "${ipv6_args} -M 1 -s 14520 -S 500" "-n 1 -l 14520 -S 500"
+
+	run_nat_test "bad GRO lookup" "${ipv6_args} -M 1 -s 14520 -S 0" "-n 10 -l 1452"
+}
+
+if [ ! -f xdp_dummy.o ]; then
+	echo "Skipping GRO tests - missing LLC"
+	exit 0
+fi
+
+if [[ $# -eq 0 ]]; then
+	run_all
+elif [[ $1 == "__subprocess" ]]; then
+	shift
+	run_one $@
+elif [[ $1 == "__subprocess_nat" ]]; then
+	shift
+	run_one_nat $@
+fi
diff --git a/tools/testing/selftests/net/udpgro_bench.sh b/tools/testing/selftests/net/udpgro_bench.sh
index 03d37e5e7424..77a1fb0ae0bc 100755
--- a/tools/testing/selftests/net/udpgro_bench.sh
+++ b/tools/testing/selftests/net/udpgro_bench.sh
@@ -18,7 +18,9 @@ run_one() {
 	# use 'rx' as separator between sender args and receiver args
 	local -r all="$@"
 	local -r tx_args=${all%rx*}
-	local -r rx_args=${all#*rx}
+	local rx_args=${all#*rx}
+
+	[[ "${tx_args}" == *"-4"* ]] && rx_args="${rx_args} -4"
 
 	ip netns add "${PEER_NS}"
 	ip -netns "${PEER_NS}" link set lo up
@@ -50,10 +52,10 @@ run_udp() {
 	local -r args=$@
 
 	echo "udp gso - over veth touching data"
-	run_in_netns ${args} -S rx
+	run_in_netns ${args} -S 0 rx
 
 	echo "udp gso and gro - over veth touching data"
-	run_in_netns ${args} -S rx -G
+	run_in_netns ${args} -S 0 rx -G
 }
 
 run_tcp() {
diff --git a/tools/testing/selftests/net/udpgso_bench.sh b/tools/testing/selftests/net/udpgso_bench.sh
index 99e537ab5ad9..0f0628613f81 100755
--- a/tools/testing/selftests/net/udpgso_bench.sh
+++ b/tools/testing/selftests/net/udpgso_bench.sh
@@ -34,7 +34,7 @@ run_udp() {
 	run_in_netns ${args}
 
 	echo "udp gso"
-	run_in_netns ${args} -S
+	run_in_netns ${args} -S 0
 }
 
 run_tcp() {
diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c
index 5dcb719abe04..9657c6988f26 100644
--- a/tools/testing/selftests/net/udpgso_bench_rx.c
+++ b/tools/testing/selftests/net/udpgso_bench_rx.c
@@ -44,6 +44,12 @@ static bool cfg_tcp;
 static bool cfg_verify;
 static bool cfg_read_all;
 static bool cfg_gro_segment;
+static int  cfg_family		= PF_INET6;
+static int  cfg_alen 		= sizeof(struct sockaddr_in6);
+static int  cfg_expected_pkt_nr;
+static int  cfg_expected_pkt_len;
+static int  cfg_expected_gso_size;
+static struct sockaddr_storage cfg_bind_addr;
 #ifdef SUPPORT_XDP
 static int cfg_xdp_iface;
 #endif
@@ -57,6 +63,29 @@ static void sigint_handler(int signum)
 		interrupted = true;
 }
 
+static void setup_sockaddr(int domain, const char *str_addr, void *sockaddr)
+{
+	struct sockaddr_in6 *addr6 = (void *) sockaddr;
+	struct sockaddr_in *addr4 = (void *) sockaddr;
+
+	switch (domain) {
+	case PF_INET:
+		addr4->sin_family = AF_INET;
+		addr4->sin_port = htons(cfg_port);
+		if (inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1)
+			error(1, 0, "ipv4 parse error: %s", str_addr);
+		break;
+	case PF_INET6:
+		addr6->sin6_family = AF_INET6;
+		addr6->sin6_port = htons(cfg_port);
+		if (inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1)
+			error(1, 0, "ipv6 parse error: %s", str_addr);
+		break;
+	default:
+		error(1, 0, "illegal domain");
+	}
+}
+
 static unsigned long gettimeofday_ms(void)
 {
 	struct timeval tv;
@@ -90,10 +119,9 @@ static void do_poll(int fd)
 
 static int do_socket(bool do_tcp)
 {
-	struct sockaddr_in6 addr = {0};
 	int fd, val;
 
-	fd = socket(PF_INET6, cfg_tcp ? SOCK_STREAM : SOCK_DGRAM, 0);
+	fd = socket(cfg_family, cfg_tcp ? SOCK_STREAM : SOCK_DGRAM, 0);
 	if (fd == -1)
 		error(1, errno, "socket");
 
@@ -104,10 +132,7 @@ static int do_socket(bool do_tcp)
 	if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &val, sizeof(val)))
 		error(1, errno, "setsockopt reuseport");
 
-	addr.sin6_family =	PF_INET6;
-	addr.sin6_port =	htons(cfg_port);
-	addr.sin6_addr =	in6addr_any;
-	if (bind(fd, (void *) &addr, sizeof(addr)))
+	if (bind(fd, (void *)&cfg_bind_addr, cfg_alen))
 		error(1, errno, "bind");
 
 	if (do_tcp) {
@@ -181,52 +206,117 @@ static void do_verify_udp(const char *data, int len)
 	}
 }
 
+static int recv_msg(int fd, char *buf, int len, int *gso_size)
+{
+	char control[CMSG_SPACE(sizeof(uint16_t))] = {0};
+	struct msghdr msg = {0};
+	struct iovec iov = {0};
+	struct cmsghdr *cmsg;
+	uint16_t *gsosizeptr;
+	int ret;
+
+	iov.iov_base = buf;
+	iov.iov_len = len;
+
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+
+	msg.msg_control = control;
+	msg.msg_controllen = sizeof(control);
+
+	*gso_size = -1;
+	ret = recvmsg(fd, &msg, MSG_TRUNC | MSG_DONTWAIT);
+	if (ret != -1) {
+		for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
+		     cmsg = CMSG_NXTHDR(&msg, cmsg)) {
+			if (cmsg->cmsg_level == SOL_UDP
+			    && cmsg->cmsg_type == UDP_GRO) {
+				gsosizeptr = (uint16_t *) CMSG_DATA(cmsg);
+				*gso_size = *gsosizeptr;
+				break;
+			}
+		}
+	}
+	return ret;
+}
+
 /* Flush all outstanding datagrams. Verify first few bytes of each. */
 static void do_flush_udp(int fd)
 {
 	static char rbuf[ETH_MAX_MTU];
-	int ret, len, budget = 256;
+	int ret, len, gso_size, budget = 256;
 
 	len = cfg_read_all ? sizeof(rbuf) : 0;
 	while (budget--) {
 		/* MSG_TRUNC will make return value full datagram length */
-		ret = recv(fd, rbuf, len, MSG_TRUNC | MSG_DONTWAIT);
+		if (!cfg_expected_gso_size)
+			ret = recv(fd, rbuf, len, MSG_TRUNC | MSG_DONTWAIT);
+		else
+			ret = recv_msg(fd, rbuf, len, &gso_size);
 		if (ret == -1 && errno == EAGAIN)
-			return;
+			break;
 		if (ret == -1)
 			error(1, errno, "recv");
+		if (cfg_expected_pkt_len && ret != cfg_expected_pkt_len)
+			error(1, 0, "recv: bad packet len, got %d,"
+			      " expected %d\n", ret, cfg_expected_pkt_len);
 		if (len && cfg_verify) {
 			if (ret == 0)
 				error(1, errno, "recv: 0 byte datagram\n");
 
 			do_verify_udp(rbuf, ret);
 		}
+		if (cfg_expected_gso_size && cfg_expected_gso_size != gso_size)
+			error(1, 0, "recv: bad gso size, got %d, expected %d "
+			      "(-1 == no gso cmsg))\n", gso_size,
+			      cfg_expected_gso_size);
 
 		packets++;
 		bytes += ret;
+		if (cfg_expected_pkt_nr && packets >= cfg_expected_pkt_nr)
+			break;
 	}
 }
 
 static void usage(const char *filepath)
 {
-	error(1, 0, "Usage: %s [-Grtv] [-p port] [-x device]", filepath);
+	error(1, 0, "Usage: %s [-Grtv] [-b addr] [-p port] [-l pktlen] [-n packetnr] [-S gsosize] [-x device]", filepath);
 }
 
 static void parse_opts(int argc, char **argv)
 {
 	int c;
 
-	while ((c = getopt(argc, argv, "Gp:rtvx:")) != -1) {
+	/* bind to any by default */
+	setup_sockaddr(PF_INET6, "::", &cfg_bind_addr);
+	while ((c = getopt(argc, argv, "4b:Gl:n:p:rS:tvx:")) != -1) {
 		switch (c) {
+		case '4':
+			cfg_family = PF_INET;
+			cfg_alen = sizeof(struct sockaddr_in);
+			setup_sockaddr(PF_INET, "0.0.0.0", &cfg_bind_addr);
+			break;
+		case 'b':
+			setup_sockaddr(cfg_family, optarg, &cfg_bind_addr);
+			break;
 		case 'G':
 			cfg_gro_segment = true;
 			break;
+		case 'l':
+			cfg_expected_pkt_len = strtoul(optarg, NULL, 0);
+			break;
+		case 'n':
+			cfg_expected_pkt_nr = strtoul(optarg, NULL, 0);
+			break;
 		case 'p':
 			cfg_port = strtoul(optarg, NULL, 0);
 			break;
 		case 'r':
 			cfg_read_all = true;
 			break;
+		case 'S':
+			cfg_expected_gso_size = strtol(optarg, NULL, 0);
+			break;
 		case 't':
 			cfg_tcp = true;
 			break;
@@ -253,7 +343,7 @@ static void parse_opts(int argc, char **argv)
 
 static void do_recv(void)
 {
-	unsigned long tnow, treport;
+	unsigned long tnow, treport, loop = 0;
 #ifdef SUPPORT_XDP
 	int prog_fd = -1;
 #endif
@@ -285,6 +375,11 @@ static void do_recv(void)
 
 	treport = gettimeofday_ms() + 1000;
 	do {
+		/* force termination after the second poll(); this cope both
+		 * with sender slower than receiver and missing packet errors
+		 */
+		if (cfg_expected_pkt_nr && loop++)
+			interrupted = true;
 		do_poll(fd);
 
 		if (cfg_tcp)
@@ -305,6 +400,10 @@ static void do_recv(void)
 
 	} while (!interrupted);
 
+	if (cfg_expected_pkt_nr && (packets != cfg_expected_pkt_nr))
+		error(1, 0, "wrong packet number! got %ld, expected %d\n",
+		      packets, cfg_expected_pkt_nr);
+
 	if (close(fd))
 		error(1, errno, "close");
 #ifdef SUPPORT_XDP
diff --git a/tools/testing/selftests/net/udpgso_bench_tx.c b/tools/testing/selftests/net/udpgso_bench_tx.c
index e821564053cf..2b24de666750 100644
--- a/tools/testing/selftests/net/udpgso_bench_tx.c
+++ b/tools/testing/selftests/net/udpgso_bench_tx.c
@@ -52,6 +52,8 @@ static bool	cfg_segment;
 static bool	cfg_sendmmsg;
 static bool	cfg_tcp;
 static bool	cfg_zerocopy;
+static int	cfg_msg_nr;
+static uint16_t	cfg_gso_size;
 
 static socklen_t cfg_alen;
 static struct sockaddr_storage cfg_dst_addr;
@@ -205,14 +207,14 @@ static void send_udp_segment_cmsg(struct cmsghdr *cm)
 
 	cm->cmsg_level = SOL_UDP;
 	cm->cmsg_type = UDP_SEGMENT;
-	cm->cmsg_len = CMSG_LEN(sizeof(cfg_mss));
+	cm->cmsg_len = CMSG_LEN(sizeof(cfg_gso_size));
 	valp = (void *)CMSG_DATA(cm);
-	*valp = cfg_mss;
+	*valp = cfg_gso_size;
 }
 
 static int send_udp_segment(int fd, char *data)
 {
-	char control[CMSG_SPACE(sizeof(cfg_mss))] = {0};
+	char control[CMSG_SPACE(sizeof(cfg_gso_size))] = {0};
 	struct msghdr msg = {0};
 	struct iovec iov = {0};
 	int ret;
@@ -241,7 +243,7 @@ static int send_udp_segment(int fd, char *data)
 
 static void usage(const char *filepath)
 {
-	error(1, 0, "Usage: %s [-46cmStuz] [-C cpu] [-D dst ip] [-l secs] [-p port] [-s sendsize]",
+	error(1, 0, "Usage: %s [-46cmtuz] [-C cpu] [-D dst ip] [-l secs] [-M messagenr] [-p port] [-s sendsize] [-S gsosize]",
 		    filepath);
 }
 
@@ -250,7 +252,7 @@ static void parse_opts(int argc, char **argv)
 	int max_len, hdrlen;
 	int c;
 
-	while ((c = getopt(argc, argv, "46cC:D:l:mp:s:Stuz")) != -1) {
+	while ((c = getopt(argc, argv, "46cC:D:l:mM:p:s:S:tuz")) != -1) {
 		switch (c) {
 		case '4':
 			if (cfg_family != PF_UNSPEC)
@@ -279,6 +281,9 @@ static void parse_opts(int argc, char **argv)
 		case 'm':
 			cfg_sendmmsg = true;
 			break;
+		case 'M':
+			cfg_msg_nr = strtoul(optarg, NULL, 10);
+			break;
 		case 'p':
 			cfg_port = strtoul(optarg, NULL, 0);
 			break;
@@ -286,6 +291,7 @@ static void parse_opts(int argc, char **argv)
 			cfg_payload_len = strtoul(optarg, NULL, 0);
 			break;
 		case 'S':
+			cfg_gso_size = strtoul(optarg, NULL, 0);
 			cfg_segment = true;
 			break;
 		case 't':
@@ -317,6 +323,8 @@ static void parse_opts(int argc, char **argv)
 
 	cfg_mss = ETH_DATA_LEN - hdrlen;
 	max_len = ETH_MAX_MTU - hdrlen;
+	if (!cfg_gso_size)
+		cfg_gso_size = cfg_mss;
 
 	if (cfg_payload_len > max_len)
 		error(1, 0, "payload length %u exceeds max %u",
@@ -392,10 +400,12 @@ int main(int argc, char **argv)
 		else
 			num_sends += send_udp(fd, buf[i]);
 		num_msgs++;
-
 		if (cfg_zerocopy && ((num_msgs & 0xF) == 0))
 			flush_zerocopy(fd);
 
+		if (cfg_msg_nr && num_msgs >= cfg_msg_nr)
+			break;
+
 		tnow = gettimeofday_ms();
 		if (tnow > treport) {
 			fprintf(stderr,
-- 
2.17.2

^ permalink raw reply related

* Re: Latest net-next kernel 4.19.0+
From: Cong Wang @ 2018-10-30 17:32 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Paweł Staszewski, dmichail, Linux Kernel Network Developers
In-Reply-To: <76dfbbda-d7f1-b13a-5921-c12c3b0f8e3e@gmail.com>

On Tue, Oct 30, 2018 at 7:16 AM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
>
>
> On 10/30/2018 01:09 AM, Paweł Staszewski wrote:
> >
> >
> > W dniu 30.10.2018 o 08:29, Eric Dumazet pisze:
> >>
> >> On 10/29/2018 11:09 PM, Dimitris Michailidis wrote:
> >>
> >>> Indeed this is a bug. I would expect it to produce frequent errors
> >>> though as many odd-length
> >>> packets would trigger it. Do you have RXFCS? Regardless, how
> >>> frequently do you see the problem?
> >>>
> >> Old kernels (before 88078d98d1bb) were simply resetting ip_summed to CHECKSUM_NONE
> >>
> >> And before your fix (commit d55bef5059dd057bd), mlx5 bug was canceling the bug you fixed.
> >>
> >> So we now need to also fix mlx5.
> >>
> >> And of course use skb_header_pointer() in mlx5e_get_fcs() as I mentioned earlier,
> >> plus __get_unaligned_cpu32() as you hinted.
> >>
> >>
> >>
> >>
> >
> > No RXFCS


Same with Pawel, RXFCS is disabled by default.


> >
> > And this trace is rly frequently like once per 3/4 seconds
> > like below:
> > [28965.776864] vlan1490: hw csum failure
>
> Might be vlan related.

Unlike Pawel's case, we don't use vlan at all, maybe this is why we see
it much less frequently than Pawel.

Also, it is probably not specific to mlx5, as there is another report which
is probably a non-mlx5 driver.

Thanks.

^ permalink raw reply

* Re: [PATCH iproute2 net-next 0/3] ss: Allow selection of columns to be displayed
From: Stefano Brivio @ 2018-10-30 17:34 UTC (permalink / raw)
  To: David Ahern; +Cc: Yoann P., Stephen Hemminger, netdev
In-Reply-To: <7ffc00c8-bdf6-5c75-564e-2663494bda5d@gmail.com>

On Tue, 30 Oct 2018 10:34:45 -0600
David Ahern <dsahern@gmail.com> wrote:

> A more flexible approach is to use format strings to allow users to
> customize the output order and whitespace as well. So for ss and your
> column list (winging it here):
> 
>     netid          = %N
>     state          = %S
>     recv Q         = %Qr
>     send Q         = %Qs
>     local address  = %Al
>     lport port     = %Pl
>     remote address = %Ar
>     remote port    = %Pr
>     process data   = %p
>     ...
> 
> then a format string could be: "%S  %Qr %Qs  %Al:%Pl %Ar:%Pr  %p\n"

I like the idea indeed, but I see two issues with ss:

- the current column abstraction is rather lightweight, things are
  already buffered in the defined column order so we don't have to jump
  back and forth in the buffer while rendering. Doing that needs some
  extra care to avoid a performance hit, but it's probably doable, I
  can put that on my to-do list

- how would you model automatic spacing in a format string? Should we
  support width specifiers? Disable automatic spacing if a format
  string is given? It might even make sense to allow partial automatic
  spacing with a special character in the format string, that is:

	"%S.%Qr.%Qs  %Al:%Pl %Ar:%Pr  %p\n"

  would mean "align everything to the right, distribute remaining
  whitespace between %S, %Qr and %Qs". But it looks rather complicated
  at a glance.

-- 
Stefano

^ permalink raw reply

* Re: Latest net-next kernel 4.19.0+
From: Eric Dumazet @ 2018-10-30 17:50 UTC (permalink / raw)
  To: Cong Wang
  Cc: Paweł Staszewski, dmichail, Linux Kernel Network Developers
In-Reply-To: <CAM_iQpUKTh51maAzht8M3LuJAYDRMRnsGn_+Db0rGG-scW2SnA@mail.gmail.com>



On 10/30/2018 10:32 AM, Cong Wang wrote:

> Unlike Pawel's case, we don't use vlan at all, maybe this is why we see
> it much less frequently than Pawel.
> 
> Also, it is probably not specific to mlx5, as there is another report which
> is probably a non-mlx5 driver.

Not sure if you provided a stack trace ?

Have you tried IPv6 frags maybe ?

^ permalink raw reply

* Re: Latest net-next kernel 4.19.0+
From: Cong Wang @ 2018-10-30 17:54 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Paweł Staszewski, dmichail, Linux Kernel Network Developers
In-Reply-To: <473bee73-b40e-2038-35e2-2c03482f7b75@gmail.com>

On Tue, Oct 30, 2018 at 10:50 AM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
>
>
> On 10/30/2018 10:32 AM, Cong Wang wrote:
>
> > Unlike Pawel's case, we don't use vlan at all, maybe this is why we see
> > it much less frequently than Pawel.
> >
> > Also, it is probably not specific to mlx5, as there is another report which
> > is probably a non-mlx5 driver.
>
> Not sure if you provided a stack trace ?

I said it is the same as Pawel's. Here it is anyway:

[ 3731.075989] eth0: hw csum failure
[ 3731.079316] CPU: 15 PID: 0 Comm: swapper/15 Not tainted 4.14.74.x86_64 #1
[ 3731.086703] Hardware name: Wiwynn F4WW/Y 300-0284/F4WW MAIN BOARD,
BIOS F4WWP02 10/19/2018
[ 3731.094961] Call Trace:
[ 3731.097408]  <IRQ>
[ 3731.099432]  dump_stack+0x46/0x59
[ 3731.102751]  __skb_checksum_complete+0xb8/0xd0
[ 3731.107194]  tcp_v4_rcv+0x116/0xa30
[ 3731.110688]  ip_local_deliver_finish+0x5d/0x1f0
[ 3731.115218]  ip_local_deliver+0x6b/0xe0
[ 3731.119056]  ? ip_rcv_finish+0x400/0x400
[ 3731.122973]  ip_rcv+0x287/0x360
[ 3731.126112]  ? inet_del_offload+0x40/0x40
[ 3731.130124]  __netif_receive_skb_core+0x404/0xc10
[ 3731.134831]  ? netif_receive_skb_internal+0x34/0xd0
[ 3731.139709]  netif_receive_skb_internal+0x34/0xd0
[ 3731.144415]  napi_gro_receive+0xb8/0xe0
[ 3731.148271]  mlx5e_handle_rx_cqe_mpwrq+0x4e3/0x7f0 [mlx5_core]
[ 3731.154099]  ? enqueue_entity+0x103/0x7f0
[ 3731.158114]  mlx5e_poll_rx_cq+0xba/0x850 [mlx5_core]
[ 3731.163080]  mlx5e_napi_poll+0x91/0x290 [mlx5_core]
[ 3731.167955]  net_rx_action+0x14a/0x3e0
[ 3731.171707]  ? credit_entropy_bits+0x23d/0x260
[ 3731.176153]  __do_softirq+0xe2/0x2c3
[ 3731.179734]  irq_exit+0xbc/0xd0
[ 3731.182878]  do_IRQ+0x89/0xd0
[ 3731.185851]  common_interrupt+0x7a/0x7a
[ 3731.189690]  </IRQ>
[ 3731.191799] RIP: 0010:cpuidle_enter_state+0xa6/0x2d0
[ 3731.196761] RSP: 0018:ffffbb950c6f7eb0 EFLAGS: 00000246 ORIG_RAX:
ffffffffffffff60
[ 3731.204328] RAX: ffff9fe25fbe14c0 RBX: 00000364b57553af RCX: 000000000000001f
[ 3731.211459] RDX: 20c49ba5e353f7cf RSI: ffff68294248f469 RDI: 0000000000000000
[ 3731.218583] RBP: ffffdb7d003c3300 R08: 000000000000c3be R09: 0000000000008612
[ 3731.225709] R10: ffffbb950c6f7e98 R11: 000000000000c3be R12: 0000000000000003
[ 3731.232841] R13: ffffffff912c9d18 R14: 0000000000000000 R15: 00000364b396207a
[ 3731.239968]  do_idle+0x166/0x1a0
[ 3731.243199]  cpu_startup_entry+0x6f/0x80
[ 3731.247128]  start_secondary+0x19c/0x1f0
[ 3731.251052]  secondary_startup_64+0xa5/0xb0



>
> Have you tried IPv6 frags maybe ?
>

We have no IPv6 traffic. I asked people to try to generate IPv4 fragment
traffic to see if it would be more reproducible, no progress yet.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox