* [RFC iproute2-next 2/5] ss: make tcp_mem long
From: Stephen Hemminger @ 2018-05-02 20:27 UTC (permalink / raw)
To: netdev; +Cc: Stephen Hemminger
In-Reply-To: <20180502202801.5255-1-stephen@networkplumber.org>
The tcp_memory field in /proc/net/sockstat is formatted as
a long value by kernel. Change ss to keep this as full value.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
misc/ss.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/misc/ss.c b/misc/ss.c
index 22c76e34f83b..c88a25581755 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -4589,7 +4589,7 @@ static int get_snmp_int(const char *proto, const char *key, int *result)
struct ssummary {
int socks;
- int tcp_mem;
+ long tcp_mem;
int tcp_total;
int tcp_orphans;
int tcp_tws;
@@ -4629,7 +4629,7 @@ static void get_sockstat_line(char *line, struct ssummary *s)
else if (strcmp(id, "FRAG6:") == 0)
sscanf(rem, "%*s%d%*s%d", &s->frag6, &s->frag6_mem);
else if (strcmp(id, "TCP:") == 0)
- sscanf(rem, "%*s%d%*s%d%*s%d%*s%d%*s%d",
+ sscanf(rem, "%*s%d%*s%d%*s%d%*s%d%*s%ld",
&s->tcp4_hashed,
&s->tcp_orphans, &s->tcp_tws, &s->tcp_total, &s->tcp_mem);
}
--
2.17.0
^ permalink raw reply related
* [RFC iproute2-next 3/5] ss: use sockstat to get TCP bind ports
From: Stephen Hemminger @ 2018-05-02 20:27 UTC (permalink / raw)
To: netdev; +Cc: Stephen Hemminger, Stephen Hemminger
In-Reply-To: <20180502202801.5255-1-stephen@networkplumber.org>
From: Stephen Hemminger <sthemmin@microsoft.com>
Using slabinfo to try and get the number of bind_buckets no longer
works because of slab cache merging. Instead use proposed enhancment
of /proc/net/sockstat to get the same data.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
misc/ss.c | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/misc/ss.c b/misc/ss.c
index c88a25581755..4f76999c0fee 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -732,7 +732,6 @@ next:
struct slabstat {
int socks;
- int tcp_ports;
int tcp_tws;
int tcp_syns;
int skbs;
@@ -748,7 +747,6 @@ static int get_slabstat(struct slabstat *s)
static int slabstat_valid;
static const char * const slabstat_ids[] = {
"sock",
- "tcp_bind_bucket",
"tcp_tw_bucket",
"tcp_open_request",
"skbuff_head_cache",
@@ -4594,6 +4592,7 @@ struct ssummary {
int tcp_orphans;
int tcp_tws;
int tcp4_hashed;
+ int tcp_ports;
int udp4;
int raw4;
int frag4;
@@ -4629,9 +4628,9 @@ static void get_sockstat_line(char *line, struct ssummary *s)
else if (strcmp(id, "FRAG6:") == 0)
sscanf(rem, "%*s%d%*s%d", &s->frag6, &s->frag6_mem);
else if (strcmp(id, "TCP:") == 0)
- sscanf(rem, "%*s%d%*s%d%*s%d%*s%d%*s%ld",
+ sscanf(rem, "%*s%d%*s%d%*s%d%*s%d%*s%ld%*s%d",
&s->tcp4_hashed,
- &s->tcp_orphans, &s->tcp_tws, &s->tcp_total, &s->tcp_mem);
+ &s->tcp_orphans, &s->tcp_tws, &s->tcp_total, &s->tcp_mem, &s->tcp_ports);
}
static int get_sockstat(struct ssummary *s)
@@ -4676,8 +4675,7 @@ static int print_summary(void)
s.tcp_total - (s.tcp4_hashed+s.tcp6_hashed-s.tcp_tws),
s.tcp_orphans,
slabstat.tcp_syns,
- s.tcp_tws, slabstat.tcp_tws,
- slabstat.tcp_ports
+ s.tcp_tws, slabstat.tcp_tws, s.tcp_ports
);
printf("\n");
--
2.17.0
^ permalink raw reply related
* [RFC iproute2-next 4/5] ss: don't look for skbuff_head_cache
From: Stephen Hemminger @ 2018-05-02 20:28 UTC (permalink / raw)
To: netdev; +Cc: Stephen Hemminger, Stephen Hemminger
In-Reply-To: <20180502202801.5255-1-stephen@networkplumber.org>
From: Stephen Hemminger <sthemmin@microsoft.com>
Not used in current code.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
misc/ss.c | 2 --
1 file changed, 2 deletions(-)
diff --git a/misc/ss.c b/misc/ss.c
index 4f76999c0fee..97304cd8abfc 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -734,7 +734,6 @@ struct slabstat {
int socks;
int tcp_tws;
int tcp_syns;
- int skbs;
};
static struct slabstat slabstat;
@@ -749,7 +748,6 @@ static int get_slabstat(struct slabstat *s)
"sock",
"tcp_tw_bucket",
"tcp_open_request",
- "skbuff_head_cache",
};
if (slabstat_valid)
--
2.17.0
^ permalink raw reply related
* [RFC iproute2-next 5/5] ss: use correct slab statistics
From: Stephen Hemminger @ 2018-05-02 20:28 UTC (permalink / raw)
To: netdev; +Cc: Stephen Hemminger, Stephen Hemminger
In-Reply-To: <20180502202801.5255-1-stephen@networkplumber.org>
From: Stephen Hemminger <sthemmin@microsoft.com>
The slabinfo names changed years ago, and ss statistics were broken.
This changes to use current slab names and handle TCP IPv6.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
misc/ss.c | 23 +++++++++++------------
1 file changed, 11 insertions(+), 12 deletions(-)
diff --git a/misc/ss.c b/misc/ss.c
index 97304cd8abfc..66c767cc415b 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -742,12 +742,12 @@ static int get_slabstat(struct slabstat *s)
{
char buf[256];
FILE *fp;
- int cnt;
+ int *stats = (int *) s;
static int slabstat_valid;
static const char * const slabstat_ids[] = {
- "sock",
- "tcp_tw_bucket",
- "tcp_open_request",
+ "sock_inode_cache",
+ "tw_sock_TCP",
+ "request_sock_TCP",
};
if (slabstat_valid)
@@ -759,24 +759,23 @@ static int get_slabstat(struct slabstat *s)
if (!fp)
return -1;
- cnt = sizeof(*s)/sizeof(int);
-
if (!fgets(buf, sizeof(buf), fp)) {
fclose(fp);
return -1;
}
+
while (fgets(buf, sizeof(buf), fp) != NULL) {
- int i;
+ int i, v;
for (i = 0; i < ARRAY_SIZE(slabstat_ids); i++) {
- if (memcmp(buf, slabstat_ids[i], strlen(slabstat_ids[i])) == 0) {
- sscanf(buf, "%*s%d", ((int *)s) + i);
- cnt--;
+ if (memcmp(buf, slabstat_ids[i], strlen(slabstat_ids[i])) != 0)
+ continue;
+
+ if (sscanf(buf, "%*s%d", &v) == 1) {
+ stats[i] += v;
break;
}
}
- if (cnt <= 0)
- break;
}
slabstat_valid = 1;
--
2.17.0
^ permalink raw reply related
* Re: [PATCH net-next v9 2/4] net: Introduce generic failover module
From: Michael S. Tsirkin @ 2018-05-02 20:30 UTC (permalink / raw)
To: Samudrala, Sridhar
Cc: Jiri Pirko, stephen, davem, netdev, virtualization, virtio-dev,
jesse.brandeburg, alexander.h.duyck, kubakici, jasowang,
loseweigh, aaron.f.brown
In-Reply-To: <052e2c60-dc4c-d6a0-0159-fc1821cb4365@intel.com>
On Wed, May 02, 2018 at 10:51:12AM -0700, Samudrala, Sridhar wrote:
>
>
> On 5/2/2018 9:15 AM, Jiri Pirko wrote:
> > Sat, Apr 28, 2018 at 11:06:01AM CEST, jiri@resnulli.us wrote:
> > > Fri, Apr 27, 2018 at 07:06:58PM CEST, sridhar.samudrala@intel.com wrote:
> > [...]
> >
> >
> > > > +
> > > > + err = netdev_rx_handler_register(slave_dev, net_failover_handle_frame,
> > > > + failover_dev);
> > > > + if (err) {
> > > > + netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n",
> > > > + err);
> > > > + goto err_handler_register;
> > > > + }
> > > > +
> > > > + err = netdev_upper_dev_link(slave_dev, failover_dev, NULL);
> > > Please use netdev_master_upper_dev_link().
> > Don't forget to fillup struct netdev_lag_upper_info - NETDEV_LAG_TX_TYPE_ACTIVEBACKUP
> >
> >
> > Also, please call netdev_lower_state_changed() when the active slave
> > device changes from primary->backup of backup->primary and whenever link
> > state of a slave changes
> >
> Sure. will look into it. Do you think this will help with the issue
> you saw with having to change mac on standy twice to get the init scripts
> working? We are now going to block changing the mac on both standby and
> failover.
>
> Also, i was wondering if we should set dev->flags to IFF_MASTER on failover
> and IFF_SLAVE on primary and standby.
We do need a way to find things out, that's for sure.
How does userspace know it's a failover
config and find the failover device right now?
> netvsc does this.
> Does this help with the init scripts and network manager to skip slave
> devices for dhcp requests?
Try it?
^ permalink raw reply
* Re: [PATCH net] ipv6: Revert "ipv6: Allow non-gateway ECMP for IPv6"
From: David Miller @ 2018-05-02 20:33 UTC (permalink / raw)
To: idosch; +Cc: netdev, dsahern, eric.dumazet, Thomas.Winter, mlxsw
In-Reply-To: <20180502194156.9275-1-idosch@mellanox.com>
From: Ido Schimmel <idosch@mellanox.com>
Date: Wed, 2 May 2018 22:41:56 +0300
> This reverts commit edd7ceb78296 ("ipv6: Allow non-gateway ECMP for
> IPv6").
>
> Eric reported a division by zero in rt6_multipath_rebalance() which is
> caused by above commit that considers identical local routes to be
> siblings. The division by zero happens because a nexthop weight is not
> set for local routes.
>
> Revert the commit as it does not fix a bug and has side effects.
>
> To reproduce:
>
> # ip -6 address add 2001:db8::1/64 dev dummy0
> # ip -6 address add 2001:db8::1/64 dev dummy1
>
> Fixes: edd7ceb78296 ("ipv6: Allow non-gateway ECMP for IPv6")
> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Applied, thank you.
^ permalink raw reply
* Re: [PATCH net] net_sched: fq: take care of throttled flows before reuse
From: David Miller @ 2018-05-02 20:38 UTC (permalink / raw)
To: edumazet; +Cc: netdev, eric.dumazet
In-Reply-To: <20180502170330.55458-1-edumazet@google.com>
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 2 May 2018 10:03:30 -0700
> Normally, a socket can not be freed/reused unless all its TX packets
> left qdisc and were TX-completed. However connect(AF_UNSPEC) allows
> this to happen.
>
> With commit fc59d5bdf1e3 ("pkt_sched: fq: clear time_next_packet for
> reused flows") we cleared f->time_next_packet but took no special
> action if the flow was still in the throttled rb-tree.
>
> Since f->time_next_packet is the key used in the rb-tree searches,
> blindly clearing it might break rb-tree integrity. We need to make
> sure the flow is no longer in the rb-tree to avoid this problem.
>
> Fixes: fc59d5bdf1e3 ("pkt_sched: fq: clear time_next_packet for reused flows")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Applied and queued up for -stable, thanks Eric.
^ permalink raw reply
* Re: [PATCH 0/2] sh_eth: complain on access to unimplemented TSU registers
From: David Miller @ 2018-05-02 20:41 UTC (permalink / raw)
To: sergei.shtylyov; +Cc: netdev, linux-renesas-soc, linux-sh
In-Reply-To: <4a17ab65-4d9c-897e-c340-7d86e0296f2f@cogentembedded.com>
From: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>
Date: Wed, 2 May 2018 22:53:23 +0300
> Here's a set of 2 patches against DaveM's 'net-next.git' repo. The 1st patch
> routes TSU_POST<n> register accesses thru sh_eth_tsu_{read|write}() and the 2nd
> added WARN_ON() unimplemented register to those functions. I'm going to deal with
> TSU_ADR{H|L}<n> registers in a later series...
>
> [1/2] sh_eth: use TSU register accessors for TSU_POST<n>
> [2/2] sh_eth: WARN_ON() access to unimplemented TSU register
Series applied to net-next, thanks.
^ permalink raw reply
* Re: [PATCH net-next 1/4] ipv6: Calculate hash thresholds for IPv6 nexthops
From: Thomas Winter @ 2018-05-02 20:48 UTC (permalink / raw)
To: Ido Schimmel, David Ahern
Cc: Eric Dumazet, davem@davemloft.net, Ido Schimmel,
netdev@vger.kernel.org, roopa@cumulusnetworks.com,
nikolay@cumulusnetworks.com, pch@ordbogen.com, jkbs@redhat.com,
yoshfuji@linux-ipv6.org, mlxsw@mellanox.com
In-Reply-To: <20180502190401.GA470@splinter>
> On Wed, May 02, 2018 at 12:58:56PM -0600, David Ahern wrote:
> > On 5/2/18 12:53 PM, Ido Schimmel wrote:
> > >
> > > So this fixes the issue for me. To reproduce:
> > >
> > > # ip -6 address add 2001:db8::1/64 dev dummy0
> > > # ip -6 address add 2001:db8::1/64 dev dummy1
> > >
> > > This reproduces the issue because due to above commit both local routes
> > > are considered siblings... :/
> > >
> > > local 2001:db8::1 proto kernel metric 0
> > > nexthop dev dummy0 weight 1
> > > nexthop dev dummy1 weight 1 pref medium
> > >
> > > I think it's best to revert the patch and have Thomas submit a fixed
> > > version to net-next. I was actually surprised to see it applied to net.
> >
> > ugly side effect of the way ecmp routes are managed in IPv6. I think
> > revert is the best option for now.
>
> OK. I'll send a patch.
fe80::/64 proto kernel metric 256
nexthop dev vlan1 weight 1
nexthop dev vlan10 weight 1
nexthop dev vlan30 weight 1
nexthop dev tunnel11 weight 1
nexthop dev tunnel12 weight 1
Sorry I completely missed that, I was always looking at other route tables.
Should I look at reworking this? It would be great to have these ECMP routes for other purposes.
ip -6 ro show table 601
default metric 1024
nexthop dev tunnel11 weight 1
nexthop dev tunnel12 weight 1
^ permalink raw reply
* [bpf PATCH v2 0/3] sockmap error path fixes
From: John Fastabend @ 2018-05-02 20:50 UTC (permalink / raw)
To: borkmann, ast; +Cc: netdev
When I added the test_sockmap to selftests I mistakenly changed the
test logic a bit. The result of this was on redirect cases we ended up
choosing the wrong sock from the BPF program and ended up sending to a
socket that had no receive handler. The result was the actual receive
handler, running on a different socket, is timing out and closing the
socket. This results in errors (-EPIPE to be specific) on the sending
side. Typically happening if the sender does not complete the send
before the receive side times out. So depending on timing and the size
of the send we may get errors. This exposed some bugs in the sockmap
error path handling.
This series fixes the errors. The primary issue is we did not do proper
memory accounting in these cases which resulted in missing a
sk_mem_uncharge(). This happened in the redirect path and in one case
on the normal send path. See the three patches for the details.
The other take-away from this is we need to fix the test_sockmap and
also add more negative test cases. That will happen in bpf-next.
Finally, I tested this using the existing test_sockmap program, the
older sockmap sample test script, and a few real use cases with
Cilium. All of these seem to be in working correctly.
v2: fix compiler warning, drop iterator variable 'i' that is no longer
used in patch 3.
---
John Fastabend (3):
bpf: sockmap, fix scatterlist update on error path in send with apply
bpf: sockmap, zero sg_size on error when buffer is released
bpf: sockmap, fix error handling in redirect failures
kernel/bpf/sockmap.c | 48 ++++++++++++++++++++++++++----------------------
1 file changed, 26 insertions(+), 22 deletions(-)
^ permalink raw reply
* [bpf PATCH v2 1/3] bpf: sockmap, fix scatterlist update on error path in send with apply
From: John Fastabend @ 2018-05-02 20:50 UTC (permalink / raw)
To: borkmann, ast; +Cc: netdev
In-Reply-To: <20180502204748.12776.80509.stgit@john-Precision-Tower-5810>
When the call to do_tcp_sendpage() fails to send the complete block
requested we either retry if only a partial send was completed or
abort if we receive a error less than or equal to zero. Before
returning though we must update the scatterlist length/offset to
account for any partial send completed.
Before this patch we did this at the end of the retry loop, but
this was buggy when used while applying a verdict to fewer bytes
than in the scatterlist. When the scatterlist length was being set
we forgot to account for the apply logic reducing the size variable.
So the result was we chopped off some bytes in the scatterlist without
doing proper cleanup on them. This results in a WARNING when the
sock is tore down because the bytes have previously been charged to
the socket but are never uncharged.
The simple fix is to simply do the accounting inside the retry loop
subtracting from the absolute scatterlist values rather than trying
to accumulate the totals and subtract at the end.
Reported-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
kernel/bpf/sockmap.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 634415c..943929a 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -326,6 +326,9 @@ static int bpf_tcp_push(struct sock *sk, int apply_bytes,
if (ret > 0) {
if (apply)
apply_bytes -= ret;
+
+ sg->offset += ret;
+ sg->length -= ret;
size -= ret;
offset += ret;
if (uncharge)
@@ -333,8 +336,6 @@ static int bpf_tcp_push(struct sock *sk, int apply_bytes,
goto retry;
}
- sg->length = size;
- sg->offset = offset;
return ret;
}
^ permalink raw reply related
* [bpf PATCH v2 2/3] bpf: sockmap, zero sg_size on error when buffer is released
From: John Fastabend @ 2018-05-02 20:50 UTC (permalink / raw)
To: borkmann, ast; +Cc: netdev
In-Reply-To: <20180502204748.12776.80509.stgit@john-Precision-Tower-5810>
When an error occurs during a redirect we have two cases that need
to be handled (i) we have a cork'ed buffer (ii) we have a normal
sendmsg buffer.
In the cork'ed buffer case we don't currently support recovering from
errors in a redirect action. So the buffer is released and the error
should _not_ be pushed back to the caller of sendmsg/sendpage. The
rationale here is the user will get an error that relates to old
data that may have been sent by some arbitrary thread on that sock.
Instead we simple consume the data and tell the user that the data
has been consumed. We may add proper error recovery in the future.
However, this patch fixes a bug where the bytes outstanding counter
sg_size was not zeroed. This could result in a case where if the user
has both a cork'ed action and apply action in progress we may
incorrectly call into the BPF program when the user expected an
old verdict to be applied via the apply action. I don't have a use
case where using apply and cork at the same time is valid but we
never explicitly reject it because it should work fine. This patch
ensures the sg_size is zeroed so we don't have this case.
In the normal sendmsg buffer case (no cork data) we also do not
zero sg_size. Again this can confuse the apply logic when the logic
calls into the BPF program when the BPF programmer expected the old
verdict to remain. So ensure we set sg_size to zero here as well. And
additionally to keep the psock state in-sync with the sk_msg_buff
release all the memory as well. Previously we did this before
returning to the user but this left a gap where psock and sk_msg_buff
states were out of sync which seems fragile. No additional overhead
is taken here except for a call to check the length and realize its
already been freed. This is in the error path as well so in my
opinion lets have robust code over optimized error paths.
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
kernel/bpf/sockmap.c | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 943929a..052c313 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -701,15 +701,22 @@ static int bpf_exec_tx_verdict(struct smap_psock *psock,
err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags);
lock_sock(sk);
+ if (unlikely(err < 0)) {
+ free_start_sg(sk, m);
+ psock->sg_size = 0;
+ if (!cork)
+ *copied -= send;
+ } else {
+ psock->sg_size -= send;
+ }
+
if (cork) {
free_start_sg(sk, m);
+ psock->sg_size = 0;
kfree(m);
m = NULL;
+ err = 0;
}
- if (unlikely(err))
- *copied -= err;
- else
- psock->sg_size -= send;
break;
case __SK_DROP:
default:
^ permalink raw reply related
* [bpf PATCH v2 3/3] bpf: sockmap, fix error handling in redirect failures
From: John Fastabend @ 2018-05-02 20:50 UTC (permalink / raw)
To: borkmann, ast; +Cc: netdev
In-Reply-To: <20180502204748.12776.80509.stgit@john-Precision-Tower-5810>
When a redirect failure happens we release the buffers in-flight
without calling a sk_mem_uncharge(), the uncharge is called before
dropping the sock lock for the redirecte, however we missed updating
the ring start index. When no apply actions are in progress this
is OK because we uncharge the entire buffer before the redirect.
But, when we have apply logic running its possible that only a
portion of the buffer is being redirected. In this case we only
do memory accounting for the buffer slice being redirected and
expect to be able to loop over the BPF program again and/or if
a sock is closed uncharge the memory at sock destruct time.
With an invalid start index however the program logic looks at
the start pointer index, checks the length, and when seeing the
length is zero (from the initial release and failure to update
the pointer) aborts without uncharging/releasing the remaining
memory.
The fix for this is simply to update the start index. To avoid
fixing this error in two locations we do a small refactor and
remove one case where it is open-coded. Then fix it in the
single function.
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
kernel/bpf/sockmap.c | 28 ++++++++++++----------------
1 file changed, 12 insertions(+), 16 deletions(-)
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 052c313..098eca5 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -393,7 +393,8 @@ static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
} while (i != md->sg_end);
}
-static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
+static void free_bytes_sg(struct sock *sk, int bytes,
+ struct sk_msg_buff *md, bool charge)
{
struct scatterlist *sg = md->sg_data;
int i = md->sg_start, free;
@@ -403,11 +404,13 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
if (bytes < free) {
sg[i].length -= bytes;
sg[i].offset += bytes;
- sk_mem_uncharge(sk, bytes);
+ if (charge)
+ sk_mem_uncharge(sk, bytes);
break;
}
- sk_mem_uncharge(sk, sg[i].length);
+ if (charge)
+ sk_mem_uncharge(sk, sg[i].length);
put_page(sg_page(&sg[i]));
bytes -= sg[i].length;
sg[i].length = 0;
@@ -418,6 +421,7 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
if (i == MAX_SKB_FRAGS)
i = 0;
}
+ md->sg_start = i;
}
static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
@@ -576,10 +580,10 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
struct sk_msg_buff *md,
int flags)
{
+ bool ingress = !!(md->flags & BPF_F_INGRESS);
struct smap_psock *psock;
struct scatterlist *sg;
- int i, err, free = 0;
- bool ingress = !!(md->flags & BPF_F_INGRESS);
+ int err = 0;
sg = md->sg_data;
@@ -607,16 +611,8 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
out_rcu:
rcu_read_unlock();
out:
- i = md->sg_start;
- while (sg[i].length) {
- free += sg[i].length;
- put_page(sg_page(&sg[i]));
- sg[i].length = 0;
- i++;
- if (i == MAX_SKB_FRAGS)
- i = 0;
- }
- return free;
+ free_bytes_sg(NULL, send, md, false);
+ return err;
}
static inline void bpf_md_init(struct smap_psock *psock)
@@ -720,7 +716,7 @@ static int bpf_exec_tx_verdict(struct smap_psock *psock,
break;
case __SK_DROP:
default:
- free_bytes_sg(sk, send, m);
+ free_bytes_sg(sk, send, m, true);
apply_bytes_dec(psock, send);
*copied -= send;
psock->sg_size -= send;
^ permalink raw reply related
* Re: [PATCH bpf-next] bpf/verifier: enable ctx + const + 0.
From: Jakub Kicinski @ 2018-05-02 20:51 UTC (permalink / raw)
To: William Tu
Cc: Daniel Borkmann, Alexei Starovoitov,
Linux Kernel Network Developers, Yonghong Song, Yifeng Sun
In-Reply-To: <CALDO+Sbx_fkW19N5F=vHfZCawoYCvoyS7jp_sTMp3Bz0FPQ8aw@mail.gmail.com>
On Wed, 2 May 2018 10:54:56 -0700, William Tu wrote:
> On Wed, May 2, 2018 at 1:29 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> > On 05/02/2018 06:52 AM, Alexei Starovoitov wrote:
> >> On Tue, May 01, 2018 at 09:35:29PM -0700, William Tu wrote:
> >> Please test it with real program and you'll see crashes and garbage returned.
> >
> > +1, *convert_ctx_access() use bpf_insn's off to determine what to rewrite,
> > so this is definitely buggy, and wasn't properly tested as it should have
> > been. The test case is also way too simple, just the LDX and then doing a
> > return 0 will get you past verifier, but won't give you anything in terms
> > of runtime testing that test_verifier is doing. A single test case for a
> > non trivial verifier change like this is also _completely insufficient_,
> > this really needs to test all sort of weird corner cases (involving out of
> > bounds accesses, overflows, etc).
>
> Thanks, now I understand.
> It's much more complicated than I thought.
FWIW NFP JIT would also have to be updated, similarly to
*convert_ctx_access() in mem_ldx_skb()/mem_ldx_xdp() we are currently
looking at insn.off. In case you find a way to solve this.. :)
^ permalink raw reply
* Re: DSA switch
From: Andrew Lunn @ 2018-05-02 20:56 UTC (permalink / raw)
To: Ran Shalit; +Cc: netdev
In-Reply-To: <CAJ2oMh+=1VKbJmj8v5s__xoJEb1JcUPRwrvJP3hQA6gBLiCLgg@mail.gmail.com>
On Wed, May 02, 2018 at 11:20:05PM +0300, Ran Shalit wrote:
> Hello,
>
> Is it possible to use switch just like external real switch,
> connecting all ports to the same subnet ?
Yes. Just bridge all ports/interfaces together and put your host IP
address on the bridge.
Andrew
^ permalink raw reply
* Re: [PATCH net-next 1/4] ipv6: Calculate hash thresholds for IPv6 nexthops
From: David Ahern @ 2018-05-02 20:56 UTC (permalink / raw)
To: Thomas Winter, Ido Schimmel
Cc: Eric Dumazet, davem@davemloft.net, Ido Schimmel,
netdev@vger.kernel.org, roopa@cumulusnetworks.com,
nikolay@cumulusnetworks.com, pch@ordbogen.com, jkbs@redhat.com,
yoshfuji@linux-ipv6.org, mlxsw@mellanox.com
In-Reply-To: <1525294137082.85951@alliedtelesis.co.nz>
On 5/2/18 2:48 PM, Thomas Winter wrote:
> Should I look at reworking this? It would be great to have these ECMP routes for other purposes.
Looking at my IPv6 bug list this change is on it -- allowing ECMP routes
to have a device only hop.
Let me take a look at it at the same time as a few other bugs.
^ permalink raw reply
* Re: [PATCH bpf-next v3 15/15] samples/bpf: sample application and documentation for AF_XDP sockets
From: Jesper Dangaard Brouer @ 2018-05-02 20:59 UTC (permalink / raw)
To: Björn Töpel
Cc: magnus.karlsson, alexander.h.duyck, alexander.duyck,
john.fastabend, ast, willemdebruijn.kernel, daniel, mst, netdev,
michael.lundkvist, jesse.brandeburg, anjali.singhai, qi.z.zhang,
Björn Töpel, brouer
In-Reply-To: <20180502110136.3738-16-bjorn.topel@gmail.com>
On Wed, 2 May 2018 13:01:36 +0200 Björn Töpel <bjorn.topel@gmail.com> wrote:
> +static void rx_drop(struct xdpsock *xsk)
> +{
> + struct xdp_desc descs[BATCH_SIZE];
> + unsigned int rcvd, i;
> +
> + rcvd = xq_deq(&xsk->rx, descs, BATCH_SIZE);
> + if (!rcvd)
> + return;
> +
> + for (i = 0; i < rcvd; i++) {
> + u32 idx = descs[i].idx;
> +
> + lassert(idx < NUM_FRAMES);
> +#if DEBUG_HEXDUMP
> + char *pkt;
> + char buf[32];
> +
> + pkt = xq_get_data(xsk, idx, descs[i].offset);
> + sprintf(buf, "idx=%d", idx);
> + hex_dump(pkt, descs[i].len, buf);
> +#endif
> + }
> +
> + xsk->rx_npkts += rcvd;
> +
> + umem_fill_to_kernel_ex(&xsk->umem->fq, descs, rcvd);
> +}
I would really like to see an option that can enable reading the
data/memory in the packet. Else the test is rather fake...
I hacked it myself manually to read first u32.
- Before: 10,771,083 pps
- After: 9,430,741 pps
The slowdown is not as big as I expected, which is good :-)
With perf stat I can see more LLC-load's, but not misses. It is not
getting registered as a cache-miss that I read data on the remote CPPU.
p.s. these tests are with mlx5 (which only have XDP_REDIRECT RX-side).
- -
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
LinkedIn: http://www.linkedin.com/in/brouer
Before:
sudo ~/perf stat -C3 -e L1-icache-load-misses -e cycles -e instructions -e cache-misses -e cache-references -e LLC-store-misses -e LLC-store -e LLC-load-misses -e LLC-load -r 3 sleep 1
Performance counter stats for 'CPU(s) 3' (3 runs):
200,020 L1-icache-load-misses ( +- 0.76% ) (33.31%)
3,920,754,587 cycles ( +- 0.14% ) (44.50%)
3,062,308,209 instructions # 0.78 insn per cycle ( +- 0.28% ) (55.65%)
823 cache-misses # 0.011 % of all cache refs ( +- 70.81% ) (66.74%)
7,587,132 cache-references ( +- 0.48% ) (77.83%)
0 LLC-store-misses (77.83%)
384,401 LLC-store ( +- 2.97% ) (77.83%)
15 LLC-load-misses # 0.00% of all LL-cache hits ( +-100.00% ) (22.17%)
3,192,312 LLC-load ( +- 0.35% ) (22.17%)
1.001199221 seconds time elapsed ( +- 0.00% )
After:
$ sudo ~/perf stat -C3 -e L1-icache-load-misses -e cycles -e instructions -e cache-misses -e cache-references -e LLC-store-misses -e LLC-store -e LLC-load-misses -e LLC-load -r 3 sleep 1
Performance counter stats for 'CPU(s) 3' (3 runs):
154,921 L1-icache-load-misses ( +- 3.88% ) (33.31%)
3,924,791,213 cycles ( +- 0.10% ) (44.50%)
2,930,116,185 instructions # 0.75 insn per cycle ( +- 0.33% ) (55.65%)
342 cache-misses # 0.002 % of all cache refs ( +- 65.52% ) (66.74%)
15,810,892 cache-references ( +- 0.13% ) (77.83%)
0 LLC-store-misses (77.83%)
925,544 LLC-store ( +- 2.33% ) (77.83%)
155 LLC-load-misses # 0.00% of all LL-cache hits ( +- 67.22% ) (22.17%)
12,791,264 LLC-load ( +- 0.04% ) (22.17%)
1.001206058 seconds time elapsed ( +- 0.00% )
^ permalink raw reply
* Re: [RFC iproute2-next 2/5] ss: make tcp_mem long
From: Eric Dumazet @ 2018-05-02 21:08 UTC (permalink / raw)
To: Stephen Hemminger, netdev
In-Reply-To: <20180502202801.5255-3-stephen@networkplumber.org>
On 05/02/2018 01:27 PM, Stephen Hemminger wrote:
> The tcp_memory field in /proc/net/sockstat is formatted as
> a long value by kernel. Change ss to keep this as full value.
>
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
> misc/ss.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/misc/ss.c b/misc/ss.c
> index 22c76e34f83b..c88a25581755 100644
> --- a/misc/ss.c
> +++ b/misc/ss.c
> @@ -4589,7 +4589,7 @@ static int get_snmp_int(const char *proto, const char *key, int *result)
>
> struct ssummary {
> int socks;
> - int tcp_mem;
> + long tcp_mem;
> int tcp_total;
> int tcp_orphans;
> int tcp_tws;
> @@ -4629,7 +4629,7 @@ static void get_sockstat_line(char *line, struct ssummary *s)
> else if (strcmp(id, "FRAG6:") == 0)
> sscanf(rem, "%*s%d%*s%d", &s->frag6, &s->frag6_mem);
> else if (strcmp(id, "TCP:") == 0)
> - sscanf(rem, "%*s%d%*s%d%*s%d%*s%d%*s%d",
> + sscanf(rem, "%*s%d%*s%d%*s%d%*s%d%*s%ld",
> &s->tcp4_hashed,
> &s->tcp_orphans, &s->tcp_tws, &s->tcp_total, &s->tcp_mem);
> }
>
Hi Stephen
It seems nothing uses yet the value ?
Also, do we care of iproute2 being compiled in 32bit mode, but eventually running on 64bit kernel ?
^ permalink raw reply
* Re: [RFC iproute2-next 2/5] ss: make tcp_mem long
From: Stephen Hemminger @ 2018-05-02 21:29 UTC (permalink / raw)
To: Eric Dumazet; +Cc: netdev
In-Reply-To: <537df45c-f5d2-e06a-f66c-fe7cd322a255@gmail.com>
On Wed, 2 May 2018 14:08:53 -0700
Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On 05/02/2018 01:27 PM, Stephen Hemminger wrote:
> > The tcp_memory field in /proc/net/sockstat is formatted as
> > a long value by kernel. Change ss to keep this as full value.
> >
> > Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> > ---
> > misc/ss.c | 4 ++--
> > 1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/misc/ss.c b/misc/ss.c
> > index 22c76e34f83b..c88a25581755 100644
> > --- a/misc/ss.c
> > +++ b/misc/ss.c
> > @@ -4589,7 +4589,7 @@ static int get_snmp_int(const char *proto, const char *key, int *result)
> >
> > struct ssummary {
> > int socks;
> > - int tcp_mem;
> > + long tcp_mem;
> > int tcp_total;
> > int tcp_orphans;
> > int tcp_tws;
> > @@ -4629,7 +4629,7 @@ static void get_sockstat_line(char *line, struct ssummary *s)
> > else if (strcmp(id, "FRAG6:") == 0)
> > sscanf(rem, "%*s%d%*s%d", &s->frag6, &s->frag6_mem);
> > else if (strcmp(id, "TCP:") == 0)
> > - sscanf(rem, "%*s%d%*s%d%*s%d%*s%d%*s%d",
> > + sscanf(rem, "%*s%d%*s%d%*s%d%*s%d%*s%ld",
> > &s->tcp4_hashed,
> > &s->tcp_orphans, &s->tcp_tws, &s->tcp_total, &s->tcp_mem);
> > }
> >
>
> Hi Stephen
>
> It seems nothing uses yet the value ?
Yup. let's just drop it from the scan
^ permalink raw reply
* Re: [PATCH net-next v9 2/4] net: Introduce generic failover module
From: Samudrala, Sridhar @ 2018-05-02 21:36 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: alexander.h.duyck, virtio-dev, Jiri Pirko, kubakici, netdev,
virtualization, loseweigh, aaron.f.brown, davem
In-Reply-To: <20180502232907-mutt-send-email-mst@kernel.org>
On 5/2/2018 1:30 PM, Michael S. Tsirkin wrote:
> On Wed, May 02, 2018 at 10:51:12AM -0700, Samudrala, Sridhar wrote:
>>
>> On 5/2/2018 9:15 AM, Jiri Pirko wrote:
>>> Sat, Apr 28, 2018 at 11:06:01AM CEST, jiri@resnulli.us wrote:
>>>> Fri, Apr 27, 2018 at 07:06:58PM CEST, sridhar.samudrala@intel.com wrote:
>>> [...]
>>>
>>>
>>>>> +
>>>>> + err = netdev_rx_handler_register(slave_dev, net_failover_handle_frame,
>>>>> + failover_dev);
>>>>> + if (err) {
>>>>> + netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n",
>>>>> + err);
>>>>> + goto err_handler_register;
>>>>> + }
>>>>> +
>>>>> + err = netdev_upper_dev_link(slave_dev, failover_dev, NULL);
>>>> Please use netdev_master_upper_dev_link().
>>> Don't forget to fillup struct netdev_lag_upper_info - NETDEV_LAG_TX_TYPE_ACTIVEBACKUP
>>>
>>>
>>> Also, please call netdev_lower_state_changed() when the active slave
>>> device changes from primary->backup of backup->primary and whenever link
>>> state of a slave changes
>>>
>> Sure. will look into it. Do you think this will help with the issue
>> you saw with having to change mac on standy twice to get the init scripts
>> working? We are now going to block changing the mac on both standby and
>> failover.
>>
>> Also, i was wondering if we should set dev->flags to IFF_MASTER on failover
>> and IFF_SLAVE on primary and standby.
> We do need a way to find things out, that's for sure.
> How does userspace know it's a failover
> config and find the failover device right now?
# ethtool -i ens12|grep driver
driver: failover
# ethtool -i ens12n_sby|grep driver
driver: virtio_net
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH net-next v9 2/4] net: Introduce generic failover module
From: Jiri Pirko @ 2018-05-02 21:39 UTC (permalink / raw)
To: Samudrala, Sridhar
Cc: mst, stephen, davem, netdev, virtualization, virtio-dev,
jesse.brandeburg, alexander.h.duyck, kubakici, jasowang,
loseweigh, aaron.f.brown
In-Reply-To: <052e2c60-dc4c-d6a0-0159-fc1821cb4365@intel.com>
Wed, May 02, 2018 at 07:51:12PM CEST, sridhar.samudrala@intel.com wrote:
>
>
>On 5/2/2018 9:15 AM, Jiri Pirko wrote:
>> Sat, Apr 28, 2018 at 11:06:01AM CEST, jiri@resnulli.us wrote:
>> > Fri, Apr 27, 2018 at 07:06:58PM CEST, sridhar.samudrala@intel.com wrote:
>> [...]
>>
>>
>> > > +
>> > > + err = netdev_rx_handler_register(slave_dev, net_failover_handle_frame,
>> > > + failover_dev);
>> > > + if (err) {
>> > > + netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n",
>> > > + err);
>> > > + goto err_handler_register;
>> > > + }
>> > > +
>> > > + err = netdev_upper_dev_link(slave_dev, failover_dev, NULL);
>> > Please use netdev_master_upper_dev_link().
>> Don't forget to fillup struct netdev_lag_upper_info - NETDEV_LAG_TX_TYPE_ACTIVEBACKUP
>>
>>
>> Also, please call netdev_lower_state_changed() when the active slave
>> device changes from primary->backup of backup->primary and whenever link
>> state of a slave changes
>>
>Sure. will look into it. Do you think this will help with the issue
>you saw with having to change mac on standy twice to get the init scripts
>working? We are now going to block changing the mac on both standby and
>failover.
>
>Also, i was wondering if we should set dev->flags to IFF_MASTER on failover
>and IFF_SLAVE on primary and standby. netvsc does this.
No. Don't set it. It is wrong.
>Does this help with the init scripts and network manager to skip slave
>devices for dhcp requests?
>
^ permalink raw reply
* Re: [PATCH net-next v9 2/4] net: Introduce generic failover module
From: Jiri Pirko @ 2018-05-02 21:39 UTC (permalink / raw)
To: Samudrala, Sridhar
Cc: mst, stephen, davem, netdev, virtualization, virtio-dev,
jesse.brandeburg, alexander.h.duyck, kubakici, jasowang,
loseweigh, aaron.f.brown
In-Reply-To: <052e2c60-dc4c-d6a0-0159-fc1821cb4365@intel.com>
Wed, May 02, 2018 at 07:51:12PM CEST, sridhar.samudrala@intel.com wrote:
>
>
>On 5/2/2018 9:15 AM, Jiri Pirko wrote:
>> Sat, Apr 28, 2018 at 11:06:01AM CEST, jiri@resnulli.us wrote:
>> > Fri, Apr 27, 2018 at 07:06:58PM CEST, sridhar.samudrala@intel.com wrote:
>> [...]
>>
>>
>> > > +
>> > > + err = netdev_rx_handler_register(slave_dev, net_failover_handle_frame,
>> > > + failover_dev);
>> > > + if (err) {
>> > > + netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n",
>> > > + err);
>> > > + goto err_handler_register;
>> > > + }
>> > > +
>> > > + err = netdev_upper_dev_link(slave_dev, failover_dev, NULL);
>> > Please use netdev_master_upper_dev_link().
>> Don't forget to fillup struct netdev_lag_upper_info - NETDEV_LAG_TX_TYPE_ACTIVEBACKUP
>>
>>
>> Also, please call netdev_lower_state_changed() when the active slave
>> device changes from primary->backup of backup->primary and whenever link
>> state of a slave changes
>>
>Sure. will look into it. Do you think this will help with the issue
>you saw with having to change mac on standy twice to get the init scripts
>working? We are now going to block changing the mac on both standby and
>failover.
I don't see any relation to that.
^ permalink raw reply
* RE: Performance regressions in TCP_STREAM tests in Linux 4.15 (and later)
From: Michael Wenig @ 2018-05-02 21:47 UTC (permalink / raw)
To: Eric Dumazet, Ben Greear, Steven Rostedt
Cc: netdev@vger.kernel.org, Shilpi Agarwal, Boon Ang, Darren Hart,
Steven Rostedt, Abdul Anshad Azeez, Rajender M, Michael Wenig
In-Reply-To: <544a8c5e-2aec-7a64-1414-e8d9b86b9311@gmail.com>
After applying Eric's proposed change (see below) to a 4.17 RC3 kernel, the regressions that we had observed in our TCP_STREAM small message tests with TCP_NODELAY enabled are now drastically reduced. Instead of the original 3x thruput and cpu cost regressions, the regression depth is now < 10% for thruput and between 10% - 20% for cpu cost. The improvements in the TCP_RR tests that we had observed after Eric's original commit are not impacted by the change. It would be great if this change could make it into a patch.
Michael Wenig
VMware Performance Engineering
-----Original Message-----
From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
Sent: Monday, April 30, 2018 10:48 AM
To: Ben Greear <greearb@candelatech.com>; Steven Rostedt <rostedt@goodmis.org>; Michael Wenig <mwenig@vmware.com>
Cc: netdev@vger.kernel.org; Shilpi Agarwal <sagarwal@vmware.com>; Boon Ang <bang@vmware.com>; Darren Hart <dvhart@vmware.com>; Steven Rostedt <srostedt@vmware.com>; Abdul Anshad Azeez <aazees@vmware.com>
Subject: Re: Performance regressions in TCP_STREAM tests in Linux 4.15 (and later)
On 04/30/2018 09:36 AM, Eric Dumazet wrote:
>
>
> On 04/30/2018 09:14 AM, Ben Greear wrote:
>> On 04/27/2018 08:11 PM, Steven Rostedt wrote:
>>>
>>> We'd like this email archived in netdev list, but since netdev is
>>> notorious for blocking outlook email as spam, it didn't go through.
>>> So I'm replying here to help get it into the archives.
>>>
>>> Thanks!
>>>
>>> -- Steve
>>>
>>>
>>> On Fri, 27 Apr 2018 23:05:46 +0000
>>> Michael Wenig <mwenig@vmware.com> wrote:
>>>
>>>> As part of VMware's performance testing with the Linux 4.15 kernel,
>>>> we identified CPU cost and throughput regressions when comparing to
>>>> the Linux 4.14 kernel. The impacted test cases are mostly
>>>> TCP_STREAM send tests when using small message sizes. The
>>>> regressions are significant (up 3x) and were tracked down to be a
>>>> side effect of Eric Dumazat's RB tree changes that went into the Linux 4.15 kernel.
>>>> Further investigation showed our use of the TCP_NODELAY flag in
>>>> conjunction with Eric's change caused the regressions to show and
>>>> simply disabling TCP_NODELAY brought performance back to normal.
>>>> Eric's change also resulted into significant improvements in our
>>>> TCP_RR test cases.
>>>>
>>>>
>>>>
>>>> Based on these results, our theory is that Eric's change made the
>>>> system overall faster (reduced latency) but as a side effect less
>>>> aggregation is happening (with TCP_NODELAY) and that results in
>>>> lower throughput. Previously even though TCP_NODELAY was set,
>>>> system was slower and we still got some benefit of aggregation.
>>>> Aggregation helps in better efficiency and higher throughput
>>>> although it can increase the latency. If you are seeing a
>>>> regression in your application throughput after this change, using
>>>> TCP_NODELAY might help bring performance back however that might increase latency.
>>
>> I guess you mean _disabling_ TCP_NODELAY instead of _using_ TCP_NODELAY?
>>
>
> Yeah, I guess auto-corking does not work as intended.
I would try the following patch :
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 44be7f43455e4aefde8db61e2d941a69abcc642a..c9d00ef54deca15d5760bcbe154001a96fa1e2a7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -697,7 +697,7 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, {
return skb->len < size_goal &&
sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
- skb != tcp_write_queue_head(sk) &&
+ !tcp_rtx_queue_empty(sk) &&
refcount_read(&sk->sk_wmem_alloc) > skb->truesize; }
^ permalink raw reply
* [PATCH net] rds: do not leak kernel memory to user land
From: Eric Dumazet @ 2018-05-02 21:53 UTC (permalink / raw)
To: David S . Miller
Cc: netdev, Eric Dumazet, Eric Dumazet, Santosh Shilimkar, linux-rdma
syzbot/KMSAN reported an uninit-value in put_cmsg(), originating
from rds_cmsg_recv().
Simply clear the structure, since we have holes there, or since
rx_traces might be smaller than RDS_MSG_RX_DGRAM_TRACE_MAX.
BUG: KMSAN: uninit-value in copy_to_user include/linux/uaccess.h:184 [inline]
BUG: KMSAN: uninit-value in put_cmsg+0x600/0x870 net/core/scm.c:242
CPU: 0 PID: 4459 Comm: syz-executor582 Not tainted 4.16.0+ #87
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:17 [inline]
dump_stack+0x185/0x1d0 lib/dump_stack.c:53
kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067
kmsan_internal_check_memory+0x135/0x1e0 mm/kmsan/kmsan.c:1157
kmsan_copy_to_user+0x69/0x160 mm/kmsan/kmsan.c:1199
copy_to_user include/linux/uaccess.h:184 [inline]
put_cmsg+0x600/0x870 net/core/scm.c:242
rds_cmsg_recv net/rds/recv.c:570 [inline]
rds_recvmsg+0x2db5/0x3170 net/rds/recv.c:657
sock_recvmsg_nosec net/socket.c:803 [inline]
sock_recvmsg+0x1d0/0x230 net/socket.c:810
___sys_recvmsg+0x3fb/0x810 net/socket.c:2205
__sys_recvmsg net/socket.c:2250 [inline]
SYSC_recvmsg+0x298/0x3c0 net/socket.c:2262
SyS_recvmsg+0x54/0x80 net/socket.c:2257
do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
entry_SYSCALL_64_after_hwframe+0x3d/0xa2
Fixes: 3289025aedc0 ("RDS: add receive message trace used by application")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Cc: linux-rdma <linux-rdma@vger.kernel.org>
---
net/rds/recv.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/net/rds/recv.c b/net/rds/recv.c
index de50e2126e404aed541b8d268a28da08154bf08d..dc67458b52f0043c2328d4a77a43536e7c62b0ed 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -558,6 +558,7 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
struct rds_cmsg_rx_trace t;
int i, j;
+ memset(&t, 0, sizeof(t));
inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock();
t.rx_traces = rs->rs_rx_traces;
for (i = 0; i < rs->rs_rx_traces; i++) {
--
2.17.0.441.gb46fe60e1d-goog
^ permalink raw reply related
* Re: [PATCH V2 net-next 0/6] virtio-net: Add SCTP checksum offload support
From: Marcelo Ricardo Leitner @ 2018-05-02 21:57 UTC (permalink / raw)
To: Vladislav Yasevich
Cc: netdev, linux-sctp, virtualization, virtio-dev, mst, jasowang,
nhorman, Vladislav Yasevich
In-Reply-To: <20180502020739.19239-1-vyasevic@redhat.com>
On Tue, May 01, 2018 at 10:07:33PM -0400, Vladislav Yasevich wrote:
> Now that we have SCTP offload capabilities in the kernel, we can add
> them to virtio as well. First step is SCTP checksum.
SCTP-wise, LGTM:
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox