* [bpf PATCH v2 2/3] bpf: sockmap, zero sg_size on error when buffer is released
From: John Fastabend @ 2018-05-02 20:50 UTC (permalink / raw)
To: borkmann, ast; +Cc: netdev
In-Reply-To: <20180502204748.12776.80509.stgit@john-Precision-Tower-5810>
When an error occurs during a redirect we have two cases that need
to be handled (i) we have a cork'ed buffer (ii) we have a normal
sendmsg buffer.
In the cork'ed buffer case we don't currently support recovering from
errors in a redirect action. So the buffer is released and the error
should _not_ be pushed back to the caller of sendmsg/sendpage. The
rationale here is the user will get an error that relates to old
data that may have been sent by some arbitrary thread on that sock.
Instead we simple consume the data and tell the user that the data
has been consumed. We may add proper error recovery in the future.
However, this patch fixes a bug where the bytes outstanding counter
sg_size was not zeroed. This could result in a case where if the user
has both a cork'ed action and apply action in progress we may
incorrectly call into the BPF program when the user expected an
old verdict to be applied via the apply action. I don't have a use
case where using apply and cork at the same time is valid but we
never explicitly reject it because it should work fine. This patch
ensures the sg_size is zeroed so we don't have this case.
In the normal sendmsg buffer case (no cork data) we also do not
zero sg_size. Again this can confuse the apply logic when the logic
calls into the BPF program when the BPF programmer expected the old
verdict to remain. So ensure we set sg_size to zero here as well. And
additionally to keep the psock state in-sync with the sk_msg_buff
release all the memory as well. Previously we did this before
returning to the user but this left a gap where psock and sk_msg_buff
states were out of sync which seems fragile. No additional overhead
is taken here except for a call to check the length and realize its
already been freed. This is in the error path as well so in my
opinion lets have robust code over optimized error paths.
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
kernel/bpf/sockmap.c | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 943929a..052c313 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -701,15 +701,22 @@ static int bpf_exec_tx_verdict(struct smap_psock *psock,
err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags);
lock_sock(sk);
+ if (unlikely(err < 0)) {
+ free_start_sg(sk, m);
+ psock->sg_size = 0;
+ if (!cork)
+ *copied -= send;
+ } else {
+ psock->sg_size -= send;
+ }
+
if (cork) {
free_start_sg(sk, m);
+ psock->sg_size = 0;
kfree(m);
m = NULL;
+ err = 0;
}
- if (unlikely(err))
- *copied -= err;
- else
- psock->sg_size -= send;
break;
case __SK_DROP:
default:
^ permalink raw reply related
* [bpf PATCH v2 3/3] bpf: sockmap, fix error handling in redirect failures
From: John Fastabend @ 2018-05-02 20:50 UTC (permalink / raw)
To: borkmann, ast; +Cc: netdev
In-Reply-To: <20180502204748.12776.80509.stgit@john-Precision-Tower-5810>
When a redirect failure happens we release the buffers in-flight
without calling a sk_mem_uncharge(), the uncharge is called before
dropping the sock lock for the redirecte, however we missed updating
the ring start index. When no apply actions are in progress this
is OK because we uncharge the entire buffer before the redirect.
But, when we have apply logic running its possible that only a
portion of the buffer is being redirected. In this case we only
do memory accounting for the buffer slice being redirected and
expect to be able to loop over the BPF program again and/or if
a sock is closed uncharge the memory at sock destruct time.
With an invalid start index however the program logic looks at
the start pointer index, checks the length, and when seeing the
length is zero (from the initial release and failure to update
the pointer) aborts without uncharging/releasing the remaining
memory.
The fix for this is simply to update the start index. To avoid
fixing this error in two locations we do a small refactor and
remove one case where it is open-coded. Then fix it in the
single function.
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
kernel/bpf/sockmap.c | 28 ++++++++++++----------------
1 file changed, 12 insertions(+), 16 deletions(-)
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 052c313..098eca5 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -393,7 +393,8 @@ static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
} while (i != md->sg_end);
}
-static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
+static void free_bytes_sg(struct sock *sk, int bytes,
+ struct sk_msg_buff *md, bool charge)
{
struct scatterlist *sg = md->sg_data;
int i = md->sg_start, free;
@@ -403,11 +404,13 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
if (bytes < free) {
sg[i].length -= bytes;
sg[i].offset += bytes;
- sk_mem_uncharge(sk, bytes);
+ if (charge)
+ sk_mem_uncharge(sk, bytes);
break;
}
- sk_mem_uncharge(sk, sg[i].length);
+ if (charge)
+ sk_mem_uncharge(sk, sg[i].length);
put_page(sg_page(&sg[i]));
bytes -= sg[i].length;
sg[i].length = 0;
@@ -418,6 +421,7 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
if (i == MAX_SKB_FRAGS)
i = 0;
}
+ md->sg_start = i;
}
static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
@@ -576,10 +580,10 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
struct sk_msg_buff *md,
int flags)
{
+ bool ingress = !!(md->flags & BPF_F_INGRESS);
struct smap_psock *psock;
struct scatterlist *sg;
- int i, err, free = 0;
- bool ingress = !!(md->flags & BPF_F_INGRESS);
+ int err = 0;
sg = md->sg_data;
@@ -607,16 +611,8 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
out_rcu:
rcu_read_unlock();
out:
- i = md->sg_start;
- while (sg[i].length) {
- free += sg[i].length;
- put_page(sg_page(&sg[i]));
- sg[i].length = 0;
- i++;
- if (i == MAX_SKB_FRAGS)
- i = 0;
- }
- return free;
+ free_bytes_sg(NULL, send, md, false);
+ return err;
}
static inline void bpf_md_init(struct smap_psock *psock)
@@ -720,7 +716,7 @@ static int bpf_exec_tx_verdict(struct smap_psock *psock,
break;
case __SK_DROP:
default:
- free_bytes_sg(sk, send, m);
+ free_bytes_sg(sk, send, m, true);
apply_bytes_dec(psock, send);
*copied -= send;
psock->sg_size -= send;
^ permalink raw reply related
* Re: [PATCH bpf-next] bpf/verifier: enable ctx + const + 0.
From: Jakub Kicinski @ 2018-05-02 20:51 UTC (permalink / raw)
To: William Tu
Cc: Daniel Borkmann, Alexei Starovoitov,
Linux Kernel Network Developers, Yonghong Song, Yifeng Sun
In-Reply-To: <CALDO+Sbx_fkW19N5F=vHfZCawoYCvoyS7jp_sTMp3Bz0FPQ8aw@mail.gmail.com>
On Wed, 2 May 2018 10:54:56 -0700, William Tu wrote:
> On Wed, May 2, 2018 at 1:29 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> > On 05/02/2018 06:52 AM, Alexei Starovoitov wrote:
> >> On Tue, May 01, 2018 at 09:35:29PM -0700, William Tu wrote:
> >> Please test it with real program and you'll see crashes and garbage returned.
> >
> > +1, *convert_ctx_access() use bpf_insn's off to determine what to rewrite,
> > so this is definitely buggy, and wasn't properly tested as it should have
> > been. The test case is also way too simple, just the LDX and then doing a
> > return 0 will get you past verifier, but won't give you anything in terms
> > of runtime testing that test_verifier is doing. A single test case for a
> > non trivial verifier change like this is also _completely insufficient_,
> > this really needs to test all sort of weird corner cases (involving out of
> > bounds accesses, overflows, etc).
>
> Thanks, now I understand.
> It's much more complicated than I thought.
FWIW NFP JIT would also have to be updated, similarly to
*convert_ctx_access() in mem_ldx_skb()/mem_ldx_xdp() we are currently
looking at insn.off. In case you find a way to solve this.. :)
^ permalink raw reply
* Re: DSA switch
From: Andrew Lunn @ 2018-05-02 20:56 UTC (permalink / raw)
To: Ran Shalit; +Cc: netdev
In-Reply-To: <CAJ2oMh+=1VKbJmj8v5s__xoJEb1JcUPRwrvJP3hQA6gBLiCLgg@mail.gmail.com>
On Wed, May 02, 2018 at 11:20:05PM +0300, Ran Shalit wrote:
> Hello,
>
> Is it possible to use switch just like external real switch,
> connecting all ports to the same subnet ?
Yes. Just bridge all ports/interfaces together and put your host IP
address on the bridge.
Andrew
^ permalink raw reply
* Re: [PATCH net-next 1/4] ipv6: Calculate hash thresholds for IPv6 nexthops
From: David Ahern @ 2018-05-02 20:56 UTC (permalink / raw)
To: Thomas Winter, Ido Schimmel
Cc: Eric Dumazet, davem@davemloft.net, Ido Schimmel,
netdev@vger.kernel.org, roopa@cumulusnetworks.com,
nikolay@cumulusnetworks.com, pch@ordbogen.com, jkbs@redhat.com,
yoshfuji@linux-ipv6.org, mlxsw@mellanox.com
In-Reply-To: <1525294137082.85951@alliedtelesis.co.nz>
On 5/2/18 2:48 PM, Thomas Winter wrote:
> Should I look at reworking this? It would be great to have these ECMP routes for other purposes.
Looking at my IPv6 bug list this change is on it -- allowing ECMP routes
to have a device only hop.
Let me take a look at it at the same time as a few other bugs.
^ permalink raw reply
* Re: [PATCH bpf-next v3 15/15] samples/bpf: sample application and documentation for AF_XDP sockets
From: Jesper Dangaard Brouer @ 2018-05-02 20:59 UTC (permalink / raw)
To: Björn Töpel
Cc: magnus.karlsson, alexander.h.duyck, alexander.duyck,
john.fastabend, ast, willemdebruijn.kernel, daniel, mst, netdev,
michael.lundkvist, jesse.brandeburg, anjali.singhai, qi.z.zhang,
Björn Töpel, brouer
In-Reply-To: <20180502110136.3738-16-bjorn.topel@gmail.com>
On Wed, 2 May 2018 13:01:36 +0200 Björn Töpel <bjorn.topel@gmail.com> wrote:
> +static void rx_drop(struct xdpsock *xsk)
> +{
> + struct xdp_desc descs[BATCH_SIZE];
> + unsigned int rcvd, i;
> +
> + rcvd = xq_deq(&xsk->rx, descs, BATCH_SIZE);
> + if (!rcvd)
> + return;
> +
> + for (i = 0; i < rcvd; i++) {
> + u32 idx = descs[i].idx;
> +
> + lassert(idx < NUM_FRAMES);
> +#if DEBUG_HEXDUMP
> + char *pkt;
> + char buf[32];
> +
> + pkt = xq_get_data(xsk, idx, descs[i].offset);
> + sprintf(buf, "idx=%d", idx);
> + hex_dump(pkt, descs[i].len, buf);
> +#endif
> + }
> +
> + xsk->rx_npkts += rcvd;
> +
> + umem_fill_to_kernel_ex(&xsk->umem->fq, descs, rcvd);
> +}
I would really like to see an option that can enable reading the
data/memory in the packet. Else the test is rather fake...
I hacked it myself manually to read first u32.
- Before: 10,771,083 pps
- After: 9,430,741 pps
The slowdown is not as big as I expected, which is good :-)
With perf stat I can see more LLC-load's, but not misses. It is not
getting registered as a cache-miss that I read data on the remote CPPU.
p.s. these tests are with mlx5 (which only have XDP_REDIRECT RX-side).
- -
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
LinkedIn: http://www.linkedin.com/in/brouer
Before:
sudo ~/perf stat -C3 -e L1-icache-load-misses -e cycles -e instructions -e cache-misses -e cache-references -e LLC-store-misses -e LLC-store -e LLC-load-misses -e LLC-load -r 3 sleep 1
Performance counter stats for 'CPU(s) 3' (3 runs):
200,020 L1-icache-load-misses ( +- 0.76% ) (33.31%)
3,920,754,587 cycles ( +- 0.14% ) (44.50%)
3,062,308,209 instructions # 0.78 insn per cycle ( +- 0.28% ) (55.65%)
823 cache-misses # 0.011 % of all cache refs ( +- 70.81% ) (66.74%)
7,587,132 cache-references ( +- 0.48% ) (77.83%)
0 LLC-store-misses (77.83%)
384,401 LLC-store ( +- 2.97% ) (77.83%)
15 LLC-load-misses # 0.00% of all LL-cache hits ( +-100.00% ) (22.17%)
3,192,312 LLC-load ( +- 0.35% ) (22.17%)
1.001199221 seconds time elapsed ( +- 0.00% )
After:
$ sudo ~/perf stat -C3 -e L1-icache-load-misses -e cycles -e instructions -e cache-misses -e cache-references -e LLC-store-misses -e LLC-store -e LLC-load-misses -e LLC-load -r 3 sleep 1
Performance counter stats for 'CPU(s) 3' (3 runs):
154,921 L1-icache-load-misses ( +- 3.88% ) (33.31%)
3,924,791,213 cycles ( +- 0.10% ) (44.50%)
2,930,116,185 instructions # 0.75 insn per cycle ( +- 0.33% ) (55.65%)
342 cache-misses # 0.002 % of all cache refs ( +- 65.52% ) (66.74%)
15,810,892 cache-references ( +- 0.13% ) (77.83%)
0 LLC-store-misses (77.83%)
925,544 LLC-store ( +- 2.33% ) (77.83%)
155 LLC-load-misses # 0.00% of all LL-cache hits ( +- 67.22% ) (22.17%)
12,791,264 LLC-load ( +- 0.04% ) (22.17%)
1.001206058 seconds time elapsed ( +- 0.00% )
^ permalink raw reply
* Re: [RFC iproute2-next 2/5] ss: make tcp_mem long
From: Eric Dumazet @ 2018-05-02 21:08 UTC (permalink / raw)
To: Stephen Hemminger, netdev
In-Reply-To: <20180502202801.5255-3-stephen@networkplumber.org>
On 05/02/2018 01:27 PM, Stephen Hemminger wrote:
> The tcp_memory field in /proc/net/sockstat is formatted as
> a long value by kernel. Change ss to keep this as full value.
>
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
> misc/ss.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/misc/ss.c b/misc/ss.c
> index 22c76e34f83b..c88a25581755 100644
> --- a/misc/ss.c
> +++ b/misc/ss.c
> @@ -4589,7 +4589,7 @@ static int get_snmp_int(const char *proto, const char *key, int *result)
>
> struct ssummary {
> int socks;
> - int tcp_mem;
> + long tcp_mem;
> int tcp_total;
> int tcp_orphans;
> int tcp_tws;
> @@ -4629,7 +4629,7 @@ static void get_sockstat_line(char *line, struct ssummary *s)
> else if (strcmp(id, "FRAG6:") == 0)
> sscanf(rem, "%*s%d%*s%d", &s->frag6, &s->frag6_mem);
> else if (strcmp(id, "TCP:") == 0)
> - sscanf(rem, "%*s%d%*s%d%*s%d%*s%d%*s%d",
> + sscanf(rem, "%*s%d%*s%d%*s%d%*s%d%*s%ld",
> &s->tcp4_hashed,
> &s->tcp_orphans, &s->tcp_tws, &s->tcp_total, &s->tcp_mem);
> }
>
Hi Stephen
It seems nothing uses yet the value ?
Also, do we care of iproute2 being compiled in 32bit mode, but eventually running on 64bit kernel ?
^ permalink raw reply
* Re: [RFC iproute2-next 2/5] ss: make tcp_mem long
From: Stephen Hemminger @ 2018-05-02 21:29 UTC (permalink / raw)
To: Eric Dumazet; +Cc: netdev
In-Reply-To: <537df45c-f5d2-e06a-f66c-fe7cd322a255@gmail.com>
On Wed, 2 May 2018 14:08:53 -0700
Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On 05/02/2018 01:27 PM, Stephen Hemminger wrote:
> > The tcp_memory field in /proc/net/sockstat is formatted as
> > a long value by kernel. Change ss to keep this as full value.
> >
> > Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> > ---
> > misc/ss.c | 4 ++--
> > 1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/misc/ss.c b/misc/ss.c
> > index 22c76e34f83b..c88a25581755 100644
> > --- a/misc/ss.c
> > +++ b/misc/ss.c
> > @@ -4589,7 +4589,7 @@ static int get_snmp_int(const char *proto, const char *key, int *result)
> >
> > struct ssummary {
> > int socks;
> > - int tcp_mem;
> > + long tcp_mem;
> > int tcp_total;
> > int tcp_orphans;
> > int tcp_tws;
> > @@ -4629,7 +4629,7 @@ static void get_sockstat_line(char *line, struct ssummary *s)
> > else if (strcmp(id, "FRAG6:") == 0)
> > sscanf(rem, "%*s%d%*s%d", &s->frag6, &s->frag6_mem);
> > else if (strcmp(id, "TCP:") == 0)
> > - sscanf(rem, "%*s%d%*s%d%*s%d%*s%d%*s%d",
> > + sscanf(rem, "%*s%d%*s%d%*s%d%*s%d%*s%ld",
> > &s->tcp4_hashed,
> > &s->tcp_orphans, &s->tcp_tws, &s->tcp_total, &s->tcp_mem);
> > }
> >
>
> Hi Stephen
>
> It seems nothing uses yet the value ?
Yup. let's just drop it from the scan
^ permalink raw reply
* Re: [PATCH net-next v9 2/4] net: Introduce generic failover module
From: Samudrala, Sridhar @ 2018-05-02 21:36 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: alexander.h.duyck, virtio-dev, Jiri Pirko, kubakici, netdev,
virtualization, loseweigh, aaron.f.brown, davem
In-Reply-To: <20180502232907-mutt-send-email-mst@kernel.org>
On 5/2/2018 1:30 PM, Michael S. Tsirkin wrote:
> On Wed, May 02, 2018 at 10:51:12AM -0700, Samudrala, Sridhar wrote:
>>
>> On 5/2/2018 9:15 AM, Jiri Pirko wrote:
>>> Sat, Apr 28, 2018 at 11:06:01AM CEST, jiri@resnulli.us wrote:
>>>> Fri, Apr 27, 2018 at 07:06:58PM CEST, sridhar.samudrala@intel.com wrote:
>>> [...]
>>>
>>>
>>>>> +
>>>>> + err = netdev_rx_handler_register(slave_dev, net_failover_handle_frame,
>>>>> + failover_dev);
>>>>> + if (err) {
>>>>> + netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n",
>>>>> + err);
>>>>> + goto err_handler_register;
>>>>> + }
>>>>> +
>>>>> + err = netdev_upper_dev_link(slave_dev, failover_dev, NULL);
>>>> Please use netdev_master_upper_dev_link().
>>> Don't forget to fillup struct netdev_lag_upper_info - NETDEV_LAG_TX_TYPE_ACTIVEBACKUP
>>>
>>>
>>> Also, please call netdev_lower_state_changed() when the active slave
>>> device changes from primary->backup of backup->primary and whenever link
>>> state of a slave changes
>>>
>> Sure. will look into it. Do you think this will help with the issue
>> you saw with having to change mac on standy twice to get the init scripts
>> working? We are now going to block changing the mac on both standby and
>> failover.
>>
>> Also, i was wondering if we should set dev->flags to IFF_MASTER on failover
>> and IFF_SLAVE on primary and standby.
> We do need a way to find things out, that's for sure.
> How does userspace know it's a failover
> config and find the failover device right now?
# ethtool -i ens12|grep driver
driver: failover
# ethtool -i ens12n_sby|grep driver
driver: virtio_net
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH net-next v9 2/4] net: Introduce generic failover module
From: Jiri Pirko @ 2018-05-02 21:39 UTC (permalink / raw)
To: Samudrala, Sridhar
Cc: mst, stephen, davem, netdev, virtualization, virtio-dev,
jesse.brandeburg, alexander.h.duyck, kubakici, jasowang,
loseweigh, aaron.f.brown
In-Reply-To: <052e2c60-dc4c-d6a0-0159-fc1821cb4365@intel.com>
Wed, May 02, 2018 at 07:51:12PM CEST, sridhar.samudrala@intel.com wrote:
>
>
>On 5/2/2018 9:15 AM, Jiri Pirko wrote:
>> Sat, Apr 28, 2018 at 11:06:01AM CEST, jiri@resnulli.us wrote:
>> > Fri, Apr 27, 2018 at 07:06:58PM CEST, sridhar.samudrala@intel.com wrote:
>> [...]
>>
>>
>> > > +
>> > > + err = netdev_rx_handler_register(slave_dev, net_failover_handle_frame,
>> > > + failover_dev);
>> > > + if (err) {
>> > > + netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n",
>> > > + err);
>> > > + goto err_handler_register;
>> > > + }
>> > > +
>> > > + err = netdev_upper_dev_link(slave_dev, failover_dev, NULL);
>> > Please use netdev_master_upper_dev_link().
>> Don't forget to fillup struct netdev_lag_upper_info - NETDEV_LAG_TX_TYPE_ACTIVEBACKUP
>>
>>
>> Also, please call netdev_lower_state_changed() when the active slave
>> device changes from primary->backup of backup->primary and whenever link
>> state of a slave changes
>>
>Sure. will look into it. Do you think this will help with the issue
>you saw with having to change mac on standy twice to get the init scripts
>working? We are now going to block changing the mac on both standby and
>failover.
>
>Also, i was wondering if we should set dev->flags to IFF_MASTER on failover
>and IFF_SLAVE on primary and standby. netvsc does this.
No. Don't set it. It is wrong.
>Does this help with the init scripts and network manager to skip slave
>devices for dhcp requests?
>
^ permalink raw reply
* Re: [PATCH net-next v9 2/4] net: Introduce generic failover module
From: Jiri Pirko @ 2018-05-02 21:39 UTC (permalink / raw)
To: Samudrala, Sridhar
Cc: mst, stephen, davem, netdev, virtualization, virtio-dev,
jesse.brandeburg, alexander.h.duyck, kubakici, jasowang,
loseweigh, aaron.f.brown
In-Reply-To: <052e2c60-dc4c-d6a0-0159-fc1821cb4365@intel.com>
Wed, May 02, 2018 at 07:51:12PM CEST, sridhar.samudrala@intel.com wrote:
>
>
>On 5/2/2018 9:15 AM, Jiri Pirko wrote:
>> Sat, Apr 28, 2018 at 11:06:01AM CEST, jiri@resnulli.us wrote:
>> > Fri, Apr 27, 2018 at 07:06:58PM CEST, sridhar.samudrala@intel.com wrote:
>> [...]
>>
>>
>> > > +
>> > > + err = netdev_rx_handler_register(slave_dev, net_failover_handle_frame,
>> > > + failover_dev);
>> > > + if (err) {
>> > > + netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n",
>> > > + err);
>> > > + goto err_handler_register;
>> > > + }
>> > > +
>> > > + err = netdev_upper_dev_link(slave_dev, failover_dev, NULL);
>> > Please use netdev_master_upper_dev_link().
>> Don't forget to fillup struct netdev_lag_upper_info - NETDEV_LAG_TX_TYPE_ACTIVEBACKUP
>>
>>
>> Also, please call netdev_lower_state_changed() when the active slave
>> device changes from primary->backup of backup->primary and whenever link
>> state of a slave changes
>>
>Sure. will look into it. Do you think this will help with the issue
>you saw with having to change mac on standy twice to get the init scripts
>working? We are now going to block changing the mac on both standby and
>failover.
I don't see any relation to that.
^ permalink raw reply
* RE: Performance regressions in TCP_STREAM tests in Linux 4.15 (and later)
From: Michael Wenig @ 2018-05-02 21:47 UTC (permalink / raw)
To: Eric Dumazet, Ben Greear, Steven Rostedt
Cc: netdev@vger.kernel.org, Shilpi Agarwal, Boon Ang, Darren Hart,
Steven Rostedt, Abdul Anshad Azeez, Rajender M, Michael Wenig
In-Reply-To: <544a8c5e-2aec-7a64-1414-e8d9b86b9311@gmail.com>
After applying Eric's proposed change (see below) to a 4.17 RC3 kernel, the regressions that we had observed in our TCP_STREAM small message tests with TCP_NODELAY enabled are now drastically reduced. Instead of the original 3x thruput and cpu cost regressions, the regression depth is now < 10% for thruput and between 10% - 20% for cpu cost. The improvements in the TCP_RR tests that we had observed after Eric's original commit are not impacted by the change. It would be great if this change could make it into a patch.
Michael Wenig
VMware Performance Engineering
-----Original Message-----
From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
Sent: Monday, April 30, 2018 10:48 AM
To: Ben Greear <greearb@candelatech.com>; Steven Rostedt <rostedt@goodmis.org>; Michael Wenig <mwenig@vmware.com>
Cc: netdev@vger.kernel.org; Shilpi Agarwal <sagarwal@vmware.com>; Boon Ang <bang@vmware.com>; Darren Hart <dvhart@vmware.com>; Steven Rostedt <srostedt@vmware.com>; Abdul Anshad Azeez <aazees@vmware.com>
Subject: Re: Performance regressions in TCP_STREAM tests in Linux 4.15 (and later)
On 04/30/2018 09:36 AM, Eric Dumazet wrote:
>
>
> On 04/30/2018 09:14 AM, Ben Greear wrote:
>> On 04/27/2018 08:11 PM, Steven Rostedt wrote:
>>>
>>> We'd like this email archived in netdev list, but since netdev is
>>> notorious for blocking outlook email as spam, it didn't go through.
>>> So I'm replying here to help get it into the archives.
>>>
>>> Thanks!
>>>
>>> -- Steve
>>>
>>>
>>> On Fri, 27 Apr 2018 23:05:46 +0000
>>> Michael Wenig <mwenig@vmware.com> wrote:
>>>
>>>> As part of VMware's performance testing with the Linux 4.15 kernel,
>>>> we identified CPU cost and throughput regressions when comparing to
>>>> the Linux 4.14 kernel. The impacted test cases are mostly
>>>> TCP_STREAM send tests when using small message sizes. The
>>>> regressions are significant (up 3x) and were tracked down to be a
>>>> side effect of Eric Dumazat's RB tree changes that went into the Linux 4.15 kernel.
>>>> Further investigation showed our use of the TCP_NODELAY flag in
>>>> conjunction with Eric's change caused the regressions to show and
>>>> simply disabling TCP_NODELAY brought performance back to normal.
>>>> Eric's change also resulted into significant improvements in our
>>>> TCP_RR test cases.
>>>>
>>>>
>>>>
>>>> Based on these results, our theory is that Eric's change made the
>>>> system overall faster (reduced latency) but as a side effect less
>>>> aggregation is happening (with TCP_NODELAY) and that results in
>>>> lower throughput. Previously even though TCP_NODELAY was set,
>>>> system was slower and we still got some benefit of aggregation.
>>>> Aggregation helps in better efficiency and higher throughput
>>>> although it can increase the latency. If you are seeing a
>>>> regression in your application throughput after this change, using
>>>> TCP_NODELAY might help bring performance back however that might increase latency.
>>
>> I guess you mean _disabling_ TCP_NODELAY instead of _using_ TCP_NODELAY?
>>
>
> Yeah, I guess auto-corking does not work as intended.
I would try the following patch :
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 44be7f43455e4aefde8db61e2d941a69abcc642a..c9d00ef54deca15d5760bcbe154001a96fa1e2a7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -697,7 +697,7 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, {
return skb->len < size_goal &&
sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
- skb != tcp_write_queue_head(sk) &&
+ !tcp_rtx_queue_empty(sk) &&
refcount_read(&sk->sk_wmem_alloc) > skb->truesize; }
^ permalink raw reply
* [PATCH net] rds: do not leak kernel memory to user land
From: Eric Dumazet @ 2018-05-02 21:53 UTC (permalink / raw)
To: David S . Miller
Cc: netdev, Eric Dumazet, Eric Dumazet, Santosh Shilimkar, linux-rdma
syzbot/KMSAN reported an uninit-value in put_cmsg(), originating
from rds_cmsg_recv().
Simply clear the structure, since we have holes there, or since
rx_traces might be smaller than RDS_MSG_RX_DGRAM_TRACE_MAX.
BUG: KMSAN: uninit-value in copy_to_user include/linux/uaccess.h:184 [inline]
BUG: KMSAN: uninit-value in put_cmsg+0x600/0x870 net/core/scm.c:242
CPU: 0 PID: 4459 Comm: syz-executor582 Not tainted 4.16.0+ #87
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:17 [inline]
dump_stack+0x185/0x1d0 lib/dump_stack.c:53
kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067
kmsan_internal_check_memory+0x135/0x1e0 mm/kmsan/kmsan.c:1157
kmsan_copy_to_user+0x69/0x160 mm/kmsan/kmsan.c:1199
copy_to_user include/linux/uaccess.h:184 [inline]
put_cmsg+0x600/0x870 net/core/scm.c:242
rds_cmsg_recv net/rds/recv.c:570 [inline]
rds_recvmsg+0x2db5/0x3170 net/rds/recv.c:657
sock_recvmsg_nosec net/socket.c:803 [inline]
sock_recvmsg+0x1d0/0x230 net/socket.c:810
___sys_recvmsg+0x3fb/0x810 net/socket.c:2205
__sys_recvmsg net/socket.c:2250 [inline]
SYSC_recvmsg+0x298/0x3c0 net/socket.c:2262
SyS_recvmsg+0x54/0x80 net/socket.c:2257
do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
entry_SYSCALL_64_after_hwframe+0x3d/0xa2
Fixes: 3289025aedc0 ("RDS: add receive message trace used by application")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Cc: linux-rdma <linux-rdma@vger.kernel.org>
---
net/rds/recv.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/net/rds/recv.c b/net/rds/recv.c
index de50e2126e404aed541b8d268a28da08154bf08d..dc67458b52f0043c2328d4a77a43536e7c62b0ed 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -558,6 +558,7 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
struct rds_cmsg_rx_trace t;
int i, j;
+ memset(&t, 0, sizeof(t));
inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock();
t.rx_traces = rs->rs_rx_traces;
for (i = 0; i < rs->rs_rx_traces; i++) {
--
2.17.0.441.gb46fe60e1d-goog
^ permalink raw reply related
* Re: [PATCH V2 net-next 0/6] virtio-net: Add SCTP checksum offload support
From: Marcelo Ricardo Leitner @ 2018-05-02 21:57 UTC (permalink / raw)
To: Vladislav Yasevich
Cc: netdev, linux-sctp, virtualization, virtio-dev, mst, jasowang,
nhorman, Vladislav Yasevich
In-Reply-To: <20180502020739.19239-1-vyasevic@redhat.com>
On Tue, May 01, 2018 at 10:07:33PM -0400, Vladislav Yasevich wrote:
> Now that we have SCTP offload capabilities in the kernel, we can add
> them to virtio as well. First step is SCTP checksum.
SCTP-wise, LGTM:
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
^ permalink raw reply
* [PATCH net-next] inet: add bound ports statistic
From: Stephen Hemminger @ 2018-05-02 22:11 UTC (permalink / raw)
To: netdev; +Cc: Stephen Hemminger, Stephen Hemminger
This adds a number of bound ports which fixes socket summary
command. The ss -s has been broken since changes to slab info
and this is one way to recover the missing value by adding a
field onto /proc/net/sockstat.
Since this is an informational value only, there is no need
for locking.
Overhead of keeping count in hash bucket head is minimal.
It is cache hot already, and the same thing is already done for
listen buckets.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
v2 - use unsigned for count
get rid of leftover increment
include/net/inet_hashtables.h | 3 +++
include/net/inet_timewait_sock.h | 2 ++
net/dccp/proto.c | 1 +
net/ipv4/inet_hashtables.c | 22 +++++++++++++++++++---
net/ipv4/inet_timewait_sock.c | 8 +++++---
net/ipv4/proc.c | 5 +++--
net/ipv4/tcp.c | 1 +
7 files changed, 34 insertions(+), 8 deletions(-)
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 9141e95529e7..2302ae0f7818 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -103,6 +103,7 @@ static inline struct net *ib_net(struct inet_bind_bucket *ib)
struct inet_bind_hashbucket {
spinlock_t lock;
+ unsigned int count;
struct hlist_head chain;
};
@@ -193,7 +194,9 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
struct inet_bind_hashbucket *head,
const unsigned short snum);
void inet_bind_bucket_destroy(struct kmem_cache *cachep,
+ struct inet_bind_hashbucket *head,
struct inet_bind_bucket *tb);
+int inet_bind_bucket_count(struct proto *prot);
static inline u32 inet_bhashfn(const struct net *net, const __u16 lport,
const u32 bhash_size)
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index c7be1ca8e562..4cdb8034ad80 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -87,7 +87,9 @@ static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
void inet_twsk_free(struct inet_timewait_sock *tw);
void inet_twsk_put(struct inet_timewait_sock *tw);
+struct inet_bind_hashbucket;
void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
+ struct inet_bind_hashbucket *head,
struct inet_hashinfo *hashinfo);
struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 84cd4e3fd01b..25f03e62cfea 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1208,6 +1208,7 @@ static int __init dccp_init(void)
for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
spin_lock_init(&dccp_hashinfo.bhash[i].lock);
INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
+ dccp_hashinfo.bhash[i].count = 0;
}
rc = dccp_mib_init();
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 31ff46daae97..aac6de8e5381 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -58,6 +58,18 @@ static u32 sk_ehashfn(const struct sock *sk)
sk->sk_daddr, sk->sk_dport);
}
+/* Count how many any entries are in the bind hash table */
+unsigned int inet_bind_bucket_count(struct proto *prot)
+{
+ struct inet_hashinfo *hinfo = prot->h.hashinfo;
+ unsigned int i, ports = 0;
+
+ for (i = 0; i < hinfo->bhash_size; i++)
+ ports += hinfo->bhash[i].count;
+
+ return ports;
+}
+
/*
* Allocate and initialize a new local port bind bucket.
* The bindhash mutex for snum's hash chain must be held here.
@@ -76,6 +88,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
tb->fastreuseport = 0;
INIT_HLIST_HEAD(&tb->owners);
hlist_add_head(&tb->node, &head->chain);
+ ++head->count;
}
return tb;
}
@@ -83,10 +96,13 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
/*
* Caller must hold hashbucket lock for this tb with local BH disabled
*/
-void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
+void inet_bind_bucket_destroy(struct kmem_cache *cachep,
+ struct inet_bind_hashbucket *head,
+ struct inet_bind_bucket *tb)
{
if (hlist_empty(&tb->owners)) {
__hlist_del(&tb->node);
+ --head->count;
kmem_cache_free(cachep, tb);
}
}
@@ -115,7 +131,7 @@ static void __inet_put_port(struct sock *sk)
__sk_del_bind_node(sk);
inet_csk(sk)->icsk_bind_hash = NULL;
inet_sk(sk)->inet_num = 0;
- inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, head, tb);
spin_unlock(&head->lock);
}
@@ -756,7 +772,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
inet_ehash_nolisten(sk, (struct sock *)tw);
}
if (tw)
- inet_twsk_bind_unhash(tw, hinfo);
+ inet_twsk_bind_unhash(tw, head, hinfo);
spin_unlock(&head->lock);
if (tw)
inet_twsk_deschedule_put(tw);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 88c5069b5d20..dd888c52f958 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -26,7 +26,8 @@
* Returns 1 if caller should call inet_twsk_put() after lock release.
*/
void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
- struct inet_hashinfo *hashinfo)
+ struct inet_bind_hashbucket *head,
+ struct inet_hashinfo *hashinfo)
{
struct inet_bind_bucket *tb = tw->tw_tb;
@@ -35,7 +36,8 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
__hlist_del(&tw->tw_bind_node);
tw->tw_tb = NULL;
- inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep,
+ head, tb);
__sock_put((struct sock *)tw);
}
@@ -55,7 +57,7 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
hashinfo->bhash_size)];
spin_lock(&bhead->lock);
- inet_twsk_bind_unhash(tw, hashinfo);
+ inet_twsk_bind_unhash(tw, bhead, hashinfo);
spin_unlock(&bhead->lock);
atomic_dec(&tw->tw_dr->tw_count);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 261b71d0ccc5..83bc9a0f2785 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -60,10 +60,11 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
socket_seq_show(seq);
- seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
+ seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld ports %u\n",
sock_prot_inuse_get(net, &tcp_prot), orphans,
atomic_read(&net->ipv4.tcp_death_row.tw_count), sockets,
- proto_memory_allocated(&tcp_prot));
+ proto_memory_allocated(&tcp_prot),
+ inet_bind_bucket_count(&tcp_prot));
seq_printf(seq, "UDP: inuse %d mem %ld\n",
sock_prot_inuse_get(net, &udp_prot),
proto_memory_allocated(&udp_prot));
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 868ed74a76a8..f62e2fb02fdf 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3836,6 +3836,7 @@ void __init tcp_init(void)
for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
spin_lock_init(&tcp_hashinfo.bhash[i].lock);
INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
+ tcp_hashinfo.bhash[i].count = 0;
}
--
2.17.0
^ permalink raw reply related
* Re: [PATCH v5] bpf, x86_32: add eBPF JIT compiler for ia32
From: Daniel Borkmann @ 2018-05-02 22:12 UTC (permalink / raw)
To: Wang YanQing, ast, illusionist.neo, tglx, mingo, hpa, davem, x86,
netdev, linux-kernel
In-Reply-To: <20180429123723.GA7309@udknight>
Hi Wang,
On 04/29/2018 02:37 PM, Wang YanQing wrote:
> The JIT compiler emits ia32 bit instructions. Currently, It supports eBPF
> only. Classic BPF is supported because of the conversion by BPF core.
>
> Almost all instructions from eBPF ISA supported except the following:
> BPF_ALU64 | BPF_DIV | BPF_K
> BPF_ALU64 | BPF_DIV | BPF_X
> BPF_ALU64 | BPF_MOD | BPF_K
> BPF_ALU64 | BPF_MOD | BPF_X
> BPF_STX | BPF_XADD | BPF_W
> BPF_STX | BPF_XADD | BPF_DW
>
> It doesn't support BPF_JMP|BPF_CALL with BPF_PSEUDO_CALL at the moment.
>
> IA32 has few general purpose registers, EAX|EDX|ECX|EBX|ESI|EDI. I use
> EAX|EDX|ECX|EBX as temporary registers to simulate instructions in eBPF
> ISA, and allocate ESI|EDI to BPF_REG_AX for constant blinding, all others
> eBPF registers, R0-R10, are simulated through scratch space on stack.
>
[...]
>
> The numbers show we get 30%~50% improvement.
>
> See Documentation/networking/filter.txt for more information.
>
> Signed-off-by: Wang YanQing <udknight@gmail.com>
Sorry for the delay. There's still a memory leak in this patch I found
while reviewing, more below and how to fix it. Otherwise few small nits
that would be nice to address in the respin along with it.
> ---
> Changes v4-v5:
> 1:Delete is_on_stack, BPF_REG_AX is the only one
> on real hardware registers, so just check with
> it.
> 2:Apply commit 1612a981b766 ("bpf, x64: fix JIT emission
> for dead code"), suggested by Daniel Borkmann.
>
> Changes v3-v4:
> 1:Fix changelog in commit.
> I install llvm-6.0, then test_progs willn't report errors.
> I submit another patch:
> "bpf: fix misaligned access for BPF_PROG_TYPE_PERF_EVENT program type on x86_32 platform"
> to fix another problem, after that patch, test_verifier willn't report errors too.
> 2:Fix clear r0[1] twice unnecessarily in *BPF_IND|BPF_ABS* simulation.
>
> Changes v2-v3:
> 1:Move BPF_REG_AX to real hardware registers for performance reason.
> 3:Using bpf_load_pointer instead of bpf_jit32.S, suggested by Daniel Borkmann.
> 4:Delete partial codes in 1c2a088a6626, suggested by Daniel Borkmann.
> 5:Some bug fixes and comments improvement.
>
> Changes v1-v2:
> 1:Fix bug in emit_ia32_neg64.
> 2:Fix bug in emit_ia32_arsh_r64.
> 3:Delete filename in top level comment, suggested by Thomas Gleixner.
> 4:Delete unnecessary boiler plate text, suggested by Thomas Gleixner.
> 5:Rewrite some words in changelog.
> 6:CodingSytle improvement and a little more comments.
>
> arch/x86/Kconfig | 2 +-
> arch/x86/include/asm/nospec-branch.h | 26 +-
> arch/x86/net/Makefile | 9 +-
> arch/x86/net/bpf_jit_comp32.c | 2527 ++++++++++++++++++++++++++++++++++
> 4 files changed, 2559 insertions(+), 5 deletions(-)
> create mode 100644 arch/x86/net/bpf_jit_comp32.c
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 00fcf81..1f5fa2f 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -137,7 +137,7 @@ config X86
> select HAVE_DMA_CONTIGUOUS
> select HAVE_DYNAMIC_FTRACE
> select HAVE_DYNAMIC_FTRACE_WITH_REGS
> - select HAVE_EBPF_JIT if X86_64
> + select HAVE_EBPF_JIT
> select HAVE_EFFICIENT_UNALIGNED_ACCESS
> select HAVE_EXIT_THREAD
> select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
> diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
> index f928ad9..a4c7ca4 100644
> --- a/arch/x86/include/asm/nospec-branch.h
> +++ b/arch/x86/include/asm/nospec-branch.h
> @@ -291,14 +291,17 @@ static inline void indirect_branch_prediction_barrier(void)
> * lfence
> * jmp spec_trap
> * do_rop:
> - * mov %rax,(%rsp)
> + * mov %rax,(%rsp) for x86_64
> + * mov %edx,(%esp) for x86_32
> * retq
> *
> * Without retpolines configured:
> *
> - * jmp *%rax
> + * jmp *%rax for x86_64
> + * jmp *%edx for x86_32
> */
> #ifdef CONFIG_RETPOLINE
> +#ifdef CONFIG_X86_64
> # define RETPOLINE_RAX_BPF_JIT_SIZE 17
> # define RETPOLINE_RAX_BPF_JIT() \
> EMIT1_off32(0xE8, 7); /* callq do_rop */ \
> @@ -310,9 +313,28 @@ static inline void indirect_branch_prediction_barrier(void)
> EMIT4(0x48, 0x89, 0x04, 0x24); /* mov %rax,(%rsp) */ \
> EMIT1(0xC3); /* retq */
> #else
> +# define RETPOLINE_EDX_BPF_JIT() \
> +do { \
> + EMIT1_off32(0xE8, 7); /* call do_rop */ \
> + /* spec_trap: */ \
> + EMIT2(0xF3, 0x90); /* pause */ \
> + EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \
> + EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \
> + /* do_rop: */ \
> + EMIT3(0x89, 0x14, 0x24); /* mov %edx,(%esp) */ \
> + EMIT1(0xC3); /* ret */ \
> +} while (0)
Nit: by the way, the RETPOLINE_RAX_BPF_JIT() doesn't have a do {} while (0)
construct but for RETPOLINE_EDX_BPF_JIT(), you add it. Could you make both
consistent to each other? I don't really mind which way as long as they're
both consistent.
> +#endif
> +#else /* !CONFIG_RETPOLINE */
> +
> +#ifdef CONFIG_X86_64
> # define RETPOLINE_RAX_BPF_JIT_SIZE 2
> # define RETPOLINE_RAX_BPF_JIT() \
> EMIT2(0xFF, 0xE0); /* jmp *%rax */
> +#else
> +# define RETPOLINE_EDX_BPF_JIT() \
> + EMIT2(0xFF, 0xE2) /* jmp *%edx */
> +#endif
> #endif
>
> #endif /* _ASM_X86_NOSPEC_BRANCH_H_ */
[...]
> +struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
> +{
> + struct bpf_binary_header *header = NULL;
> + struct bpf_prog *tmp, *orig_prog = prog;
> + int proglen, oldproglen = 0;
> + struct jit_context ctx = {};
> + bool tmp_blinded = false;
> + u8 *image = NULL;
> + int *addrs;
> + int pass;
> + int i;
> +
> + if (!prog->jit_requested)
> + return orig_prog;
> +
> + tmp = bpf_jit_blind_constants(prog);
> + /* If blinding was requested and we failed during blinding,
> + * we must fall back to the interpreter.
> + */
> + if (IS_ERR(tmp))
> + return orig_prog;
> + if (tmp != prog) {
> + tmp_blinded = true;
> + prog = tmp;
> + }
> +
> + addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL);
> + if (!addrs) {
> + prog = orig_prog;
> + goto out;
> + }
> +
> + /* Before first pass, make a rough estimation of addrs[]
> + * each bpf instruction is translated to less than 64 bytes
> + */
> + for (proglen = 0, i = 0; i < prog->len; i++) {
> + proglen += 64;
> + addrs[i] = proglen;
> + }
> + ctx.cleanup_addr = proglen;
> +
> + /* JITed image shrinks with every pass and the loop iterates
> + * until the image stops shrinking. Very large bpf programs
> + * may converge on the last pass. In such case do one more
> + * pass to emit the final image
> + */
> + for (pass = 0; pass < 20 || image; pass++) {
> + proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
> + if (proglen <= 0) {
> + image = NULL;
> + if (header)
> + bpf_jit_binary_free(header);
> + prog = orig_prog;
> + goto out_addrs;
> + }
> + if (image) {
> + if (proglen != oldproglen) {
> + pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
> + proglen, oldproglen);
> + prog = orig_prog;
> + goto out_addrs;
This one above will leak image, you need to also free the header by
calling bpf_jit_binary_free(). You've copied this from x86-64 JIT,
fixed there:
https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git/commit/?id=3aab8884c9eb99189a3569ac4e6b205371c9ac0b
We've recently merged this cleanup as well for x86-64 JIT, could you
please adapt the x86-32 JIT with similar cleanup as here:
https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=a2c7a98301d9f9bcfc4244de04252a04c0b68a79
Would be nice to address in one go so we don't need a follow-up cleanup.
Rest looks good from my side.
Thanks,
Daniel
> + }
> + break;
> + }
> + if (proglen == oldproglen) {
> + header = bpf_jit_binary_alloc(proglen, &image,
> + 1, jit_fill_hole);
> + if (!header) {
> + prog = orig_prog;
> + goto out_addrs;
> + }
> + }
> + oldproglen = proglen;
> + cond_resched();
> + }
> +
> + if (bpf_jit_enable > 1)
> + bpf_jit_dump(prog->len, proglen, pass + 1, image);
> +
> + if (image) {
> + bpf_jit_binary_lock_ro(header);
> + prog->bpf_func = (void *)image;
> + prog->jited = 1;
> + prog->jited_len = proglen;
> + } else {
> + prog = orig_prog;
> + }
> +
> +out_addrs:
> + kfree(addrs);
> +out:
> + if (tmp_blinded)
> + bpf_jit_prog_release_other(prog, prog == orig_prog ?
> + tmp : orig_prog);
> + return prog;
> +}
>
^ permalink raw reply
* [PATCH] sctp: fix a potential missing-check bug
From: Wenwen Wang @ 2018-05-02 22:12 UTC (permalink / raw)
To: Wenwen Wang
Cc: Kangjie Lu, Vlad Yasevich, Neil Horman, David S. Miller,
open list:SCTP PROTOCOL, open list:NETWORKING [GENERAL],
open list
In sctp_setsockopt_maxseg(), the integer 'val' is compared against min_len
and max_len to check whether it is in the appropriate range. If it is not,
an error code -EINVAL will be returned. This is enforced by a security
check. But, this check is only executed when 'val' is not 0. In fact, if
'val' is 0, it will be assigned with a new value (if the return value of
the function sctp_id2assoc() is not 0) in the following execution. However,
this new value of 'val' is not checked before it is used to assigned to
asoc->user_frag. That means it is possible that the new value of 'val'
could be out of the expected range. This can cause security issues
such as buffer overflows, e.g., the new value of 'val' is used as an index
to access a buffer.
This patch inserts a check for the new value of 'val' to see if it is in
the expected range. If it is not, an error code -EINVAL will be returned.
Signed-off-by: Wenwen Wang <wang6495@umn.edu>
---
net/sctp/socket.c | 21 ++++++++++-----------
1 file changed, 10 insertions(+), 11 deletions(-)
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 80835ac..2beb601 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3212,6 +3212,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
struct sctp_af *af = sp->pf->af;
struct sctp_assoc_value params;
struct sctp_association *asoc;
+ int min_len, max_len;
int val;
if (optlen == sizeof(int)) {
@@ -3231,19 +3232,15 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
return -EINVAL;
}
- if (val) {
- int min_len, max_len;
+ min_len = SCTP_DEFAULT_MINSEGMENT - af->net_header_len;
+ min_len -= af->ip_options_len(sk);
+ min_len -= sizeof(struct sctphdr) +
+ sizeof(struct sctp_data_chunk);
- min_len = SCTP_DEFAULT_MINSEGMENT - af->net_header_len;
- min_len -= af->ip_options_len(sk);
- min_len -= sizeof(struct sctphdr) +
- sizeof(struct sctp_data_chunk);
+ max_len = SCTP_MAX_CHUNK_LEN - sizeof(struct sctp_data_chunk);
- max_len = SCTP_MAX_CHUNK_LEN - sizeof(struct sctp_data_chunk);
-
- if (val < min_len || val > max_len)
- return -EINVAL;
- }
+ if (val && (val < min_len || val > max_len))
+ return -EINVAL;
asoc = sctp_id2assoc(sk, params.assoc_id);
if (asoc) {
@@ -3253,6 +3250,8 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
val -= sizeof(struct sctphdr) +
sctp_datachk_len(&asoc->stream);
}
+ if (val < min_len || val > max_len)
+ return -EINVAL;
asoc->user_frag = val;
asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu);
} else {
--
2.7.4
^ permalink raw reply related
* Re: [bpf PATCH v2 0/3] sockmap error path fixes
From: Alexei Starovoitov @ 2018-05-02 22:34 UTC (permalink / raw)
To: John Fastabend; +Cc: borkmann, ast, netdev
In-Reply-To: <20180502204748.12776.80509.stgit@john-Precision-Tower-5810>
On Wed, May 02, 2018 at 01:50:14PM -0700, John Fastabend wrote:
> When I added the test_sockmap to selftests I mistakenly changed the
> test logic a bit. The result of this was on redirect cases we ended up
> choosing the wrong sock from the BPF program and ended up sending to a
> socket that had no receive handler. The result was the actual receive
> handler, running on a different socket, is timing out and closing the
> socket. This results in errors (-EPIPE to be specific) on the sending
> side. Typically happening if the sender does not complete the send
> before the receive side times out. So depending on timing and the size
> of the send we may get errors. This exposed some bugs in the sockmap
> error path handling.
>
> This series fixes the errors. The primary issue is we did not do proper
> memory accounting in these cases which resulted in missing a
> sk_mem_uncharge(). This happened in the redirect path and in one case
> on the normal send path. See the three patches for the details.
>
> The other take-away from this is we need to fix the test_sockmap and
> also add more negative test cases. That will happen in bpf-next.
>
> Finally, I tested this using the existing test_sockmap program, the
> older sockmap sample test script, and a few real use cases with
> Cilium. All of these seem to be in working correctly.
>
> v2: fix compiler warning, drop iterator variable 'i' that is no longer
> used in patch 3.
Applied, Thanks.
^ permalink raw reply
* Re: Performance regressions in TCP_STREAM tests in Linux 4.15 (and later)
From: Eric Dumazet @ 2018-05-02 22:41 UTC (permalink / raw)
To: Michael Wenig, Eric Dumazet, Ben Greear, Steven Rostedt
Cc: netdev@vger.kernel.org, Shilpi Agarwal, Boon Ang, Darren Hart,
Steven Rostedt, Abdul Anshad Azeez, Rajender M
In-Reply-To: <BN3PR0501MB1425A937390695B738419E9CB0800@BN3PR0501MB1425.namprd05.prod.outlook.com>
On 05/02/2018 02:47 PM, Michael Wenig wrote:
> After applying Eric's proposed change (see below) to a 4.17 RC3 kernel, the regressions that we had observed in our TCP_STREAM small message tests with TCP_NODELAY enabled are now drastically reduced. Instead of the original 3x thruput and cpu cost regressions, the regression depth is now < 10% for thruput and between 10% - 20% for cpu cost. The improvements in the TCP_RR tests that we had observed after Eric's original commit are not impacted by the change. It would be great if this change could make it into a patch.
>
Thanks for a lot testing, I will submit this patch after more tests from my side.
> Michael Wenig
> VMware Performance Engineering
>
> -----Original Message-----
> From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
> Sent: Monday, April 30, 2018 10:48 AM
> To: Ben Greear <greearb@candelatech.com>; Steven Rostedt <rostedt@goodmis.org>; Michael Wenig <mwenig@vmware.com>
> Cc: netdev@vger.kernel.org; Shilpi Agarwal <sagarwal@vmware.com>; Boon Ang <bang@vmware.com>; Darren Hart <dvhart@vmware.com>; Steven Rostedt <srostedt@vmware.com>; Abdul Anshad Azeez <aazees@vmware.com>
> Subject: Re: Performance regressions in TCP_STREAM tests in Linux 4.15 (and later)
>
>
>
> On 04/30/2018 09:36 AM, Eric Dumazet wrote:
>>
>>
>> On 04/30/2018 09:14 AM, Ben Greear wrote:
>>> On 04/27/2018 08:11 PM, Steven Rostedt wrote:
>>>>
>>>> We'd like this email archived in netdev list, but since netdev is
>>>> notorious for blocking outlook email as spam, it didn't go through.
>>>> So I'm replying here to help get it into the archives.
>>>>
>>>> Thanks!
>>>>
>>>> -- Steve
>>>>
>>>>
>>>> On Fri, 27 Apr 2018 23:05:46 +0000
>>>> Michael Wenig <mwenig@vmware.com> wrote:
>>>>
>>>>> As part of VMware's performance testing with the Linux 4.15 kernel,
>>>>> we identified CPU cost and throughput regressions when comparing to
>>>>> the Linux 4.14 kernel. The impacted test cases are mostly
>>>>> TCP_STREAM send tests when using small message sizes. The
>>>>> regressions are significant (up 3x) and were tracked down to be a
>>>>> side effect of Eric Dumazat's RB tree changes that went into the Linux 4.15 kernel.
>>>>> Further investigation showed our use of the TCP_NODELAY flag in
>>>>> conjunction with Eric's change caused the regressions to show and
>>>>> simply disabling TCP_NODELAY brought performance back to normal.
>>>>> Eric's change also resulted into significant improvements in our
>>>>> TCP_RR test cases.
>>>>>
>>>>>
>>>>>
>>>>> Based on these results, our theory is that Eric's change made the
>>>>> system overall faster (reduced latency) but as a side effect less
>>>>> aggregation is happening (with TCP_NODELAY) and that results in
>>>>> lower throughput. Previously even though TCP_NODELAY was set,
>>>>> system was slower and we still got some benefit of aggregation.
>>>>> Aggregation helps in better efficiency and higher throughput
>>>>> although it can increase the latency. If you are seeing a
>>>>> regression in your application throughput after this change, using
>>>>> TCP_NODELAY might help bring performance back however that might increase latency.
>>>
>>> I guess you mean _disabling_ TCP_NODELAY instead of _using_ TCP_NODELAY?
>>>
>>
>> Yeah, I guess auto-corking does not work as intended.
>
> I would try the following patch :
>
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 44be7f43455e4aefde8db61e2d941a69abcc642a..c9d00ef54deca15d5760bcbe154001a96fa1e2a7 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -697,7 +697,7 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, {
> return skb->len < size_goal &&
> sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
> - skb != tcp_write_queue_head(sk) &&
> + !tcp_rtx_queue_empty(sk) &&
> refcount_read(&sk->sk_wmem_alloc) > skb->truesize; }
>
>
^ permalink raw reply
* Re: [PATCH net] ipv4: fix fnhe usage by non-cached routes
From: David Ahern @ 2018-05-02 22:49 UTC (permalink / raw)
To: Julian Anastasov, David Miller
Cc: netdev, Martin KaFai Lau, kernel-team, Xin Long
In-Reply-To: <20180502064119.4552-1-ja@ssi.bg>
On 5/2/18 12:41 AM, Julian Anastasov wrote:
> Allow some non-cached routes to use non-expired fnhe:
>
> 1. ip_del_fnhe: moved above and now called by find_exception.
> The 4.5+ commit deed49df7390 expires fnhe only when caching
> routes. Change that to:
>
> 1.1. use fnhe for non-cached local output routes, with the help
> from (2)
>
> 1.2. allow __mkroute_input to detect expired fnhe (outdated
> fnhe_gw, for example) when do_cache is false, eg. when itag!=0
> for unicast destinations.
>
> 2. __mkroute_output: keep fi to allow local routes with orig_oif != 0
> to use fnhe info even when the new route will not be cached into fnhe.
> After commit 839da4d98960 ("net: ipv4: set orig_oif based on fib
> result for local traffic") it means all local routes will be affected
> because they are not cached. This change is used to solve a PMTU
> problem with IPVS (and probably Netfilter DNAT) setups that redirect
> local clients from target local IP (local route to Virtual IP)
> to new remote IP target, eg. IPVS TUN real server. Loopback has
> 64K MTU and we need to create fnhe on the local route that will
> keep the reduced PMTU for the Virtual IP. Without this change
> fnhe_pmtu is updated from ICMP but never exposed to non-cached
> local routes. This includes routes with flowi4_oif!=0 for 4.6+ and
> with flowi4_oif=any for 4.14+).
Can you add a test case to tools/testing/selftests/net/pmtu.sh to cover
this situation?
> @@ -1310,8 +1340,14 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
>
> for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
> fnhe = rcu_dereference(fnhe->fnhe_next)) {
> - if (fnhe->fnhe_daddr == daddr)
> + if (fnhe->fnhe_daddr == daddr) {
> + if (fnhe->fnhe_expires &&
> + time_after(jiffies, fnhe->fnhe_expires)) {
> + ip_del_fnhe(nh, daddr);
I'm surprised this is done in the fast path vs gc time. (the existing
code does as well; your change is only moving the call to make the input
and output paths the same)
The change looks correct to me and all of my functional tests passed.
Acked-by: David Ahern <dsahern@gmail.com>
^ permalink raw reply
* [PATCH v2 bpf-next 0/2] bpf: enable stackmap with build_id in nmi
From: Song Liu @ 2018-05-02 23:20 UTC (permalink / raw)
To: netdev; +Cc: Song Liu, kernel-team, qinteng
Changes v1 -> v2:
1. Rename some variables to (hopefully) reduce confusion;
2. Check irq_work status with IRQ_WORK_BUSY (instead of work->sem);
3. In Kconfig, let BPF_SYSCALL select IRQ_WORK;
4. Add static to DEFINE_PER_CPU();
5. Remove pr_info() in stack_map_init().
Song Liu (2):
bpf: enable stackmap with build_id in nmi context
bpf: add selftest for stackmap with build_id in NMI context
init/Kconfig | 1 +
kernel/bpf/stackmap.c | 59 +++++++++++--
tools/testing/selftests/bpf/test_progs.c | 137 +++++++++++++++++++++++++++++
tools/testing/selftests/bpf/urandom_read.c | 10 ++-
4 files changed, 199 insertions(+), 8 deletions(-)
^ permalink raw reply
* [PATCH v2 bpf-next 2/2] bpf: add selftest for stackmap with build_id in NMI context
From: Song Liu @ 2018-05-02 23:20 UTC (permalink / raw)
To: netdev; +Cc: Song Liu, kernel-team, qinteng
In-Reply-To: <20180502232030.3788284-1-songliubraving@fb.com>
This new test captures stackmap with build_id with hardware event
PERF_COUNT_HW_CPU_CYCLES.
Because we only support one ips-to-build_id lookup per cpu in NMI
context, stack_amap will not be able to do the lookup in this test.
Therefore, we didn't do compare_stack_ips(), as it will alwasy fail.
urandom_read.c is extended to run configurable cycles so that it can be
caught by the perf event.
Signed-off-by: Song Liu <songliubraving@fb.com>
---
tools/testing/selftests/bpf/test_progs.c | 137 +++++++++++++++++++++++++++++
tools/testing/selftests/bpf/urandom_read.c | 10 ++-
2 files changed, 145 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index aa336f0..00bb08c 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1272,6 +1272,142 @@ static void test_stacktrace_build_id(void)
return;
}
+static void test_stacktrace_build_id_nmi(void)
+{
+ int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
+ const char *file = "./test_stacktrace_build_id.o";
+ int err, pmu_fd, prog_fd;
+ struct perf_event_attr attr = {
+ .sample_freq = 5000,
+ .freq = 1,
+ .type = PERF_TYPE_HARDWARE,
+ .config = PERF_COUNT_HW_CPU_CYCLES,
+ };
+ __u32 key, previous_key, val, duration = 0;
+ struct bpf_object *obj;
+ char buf[256];
+ int i, j;
+ struct bpf_stack_build_id id_offs[PERF_MAX_STACK_DEPTH];
+ int build_id_matches = 0;
+
+ err = bpf_prog_load(file, BPF_PROG_TYPE_PERF_EVENT, &obj, &prog_fd);
+ if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno))
+ goto out;
+
+ pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+ 0 /* cpu 0 */, -1 /* group id */,
+ 0 /* flags */);
+ if (CHECK(pmu_fd < 0, "perf_event_open",
+ "err %d errno %d. Does the test host support PERF_COUNT_HW_CPU_CYCLES?\n",
+ pmu_fd, errno))
+ goto close_prog;
+
+ err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+ if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n",
+ err, errno))
+ goto close_pmu;
+
+ err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+ if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n",
+ err, errno))
+ goto disable_pmu;
+
+ /* find map fds */
+ control_map_fd = bpf_find_map(__func__, obj, "control_map");
+ if (CHECK(control_map_fd < 0, "bpf_find_map control_map",
+ "err %d errno %d\n", err, errno))
+ goto disable_pmu;
+
+ stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap");
+ if (CHECK(stackid_hmap_fd < 0, "bpf_find_map stackid_hmap",
+ "err %d errno %d\n", err, errno))
+ goto disable_pmu;
+
+ stackmap_fd = bpf_find_map(__func__, obj, "stackmap");
+ if (CHECK(stackmap_fd < 0, "bpf_find_map stackmap", "err %d errno %d\n",
+ err, errno))
+ goto disable_pmu;
+
+ stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap");
+ if (CHECK(stack_amap_fd < 0, "bpf_find_map stack_amap",
+ "err %d errno %d\n", err, errno))
+ goto disable_pmu;
+
+ assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")
+ == 0);
+ assert(system("taskset 0x1 ./urandom_read 100000") == 0);
+ /* disable stack trace collection */
+ key = 0;
+ val = 1;
+ bpf_map_update_elem(control_map_fd, &key, &val, 0);
+
+ /* for every element in stackid_hmap, we can find a corresponding one
+ * in stackmap, and vise versa.
+ */
+ err = compare_map_keys(stackid_hmap_fd, stackmap_fd);
+ if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap",
+ "err %d errno %d\n", err, errno))
+ goto disable_pmu;
+
+ err = compare_map_keys(stackmap_fd, stackid_hmap_fd);
+ if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap",
+ "err %d errno %d\n", err, errno))
+ goto disable_pmu;
+
+ err = extract_build_id(buf, 256);
+
+ if (CHECK(err, "get build_id with readelf",
+ "err %d errno %d\n", err, errno))
+ goto disable_pmu;
+
+ err = bpf_map_get_next_key(stackmap_fd, NULL, &key);
+ if (CHECK(err, "get_next_key from stackmap",
+ "err %d, errno %d\n", err, errno))
+ goto disable_pmu;
+
+ do {
+ char build_id[64];
+
+ err = bpf_map_lookup_elem(stackmap_fd, &key, id_offs);
+ if (CHECK(err, "lookup_elem from stackmap",
+ "err %d, errno %d\n", err, errno))
+ goto disable_pmu;
+ for (i = 0; i < PERF_MAX_STACK_DEPTH; ++i)
+ if (id_offs[i].status == BPF_STACK_BUILD_ID_VALID &&
+ id_offs[i].offset != 0) {
+ for (j = 0; j < 20; ++j)
+ sprintf(build_id + 2 * j, "%02x",
+ id_offs[i].build_id[j] & 0xff);
+ if (strstr(buf, build_id) != NULL)
+ build_id_matches = 1;
+ }
+ previous_key = key;
+ } while (bpf_map_get_next_key(stackmap_fd, &previous_key, &key) == 0);
+
+ if (CHECK(build_id_matches < 1, "build id match",
+ "Didn't find expected build ID from the map\n"))
+ goto disable_pmu;
+
+ /*
+ * We intentionally skip compare_stack_ips(). This is because we
+ * only support one in_nmi() ips-to-build_id translation per cpu
+ * at any time, thus stack_amap here will always fallback to
+ * BPF_STACK_BUILD_ID_IP;
+ */
+
+disable_pmu:
+ ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE);
+
+close_pmu:
+ close(pmu_fd);
+
+close_prog:
+ bpf_object__close(obj);
+
+out:
+ return;
+}
+
#define MAX_CNT_RAWTP 10ull
#define MAX_STACK_RAWTP 100
struct get_stack_trace_t {
@@ -1425,6 +1561,7 @@ int main(void)
test_tp_attach_query();
test_stacktrace_map();
test_stacktrace_build_id();
+ test_stacktrace_build_id_nmi();
test_stacktrace_map_raw_tp();
test_get_stack_raw_tp();
diff --git a/tools/testing/selftests/bpf/urandom_read.c b/tools/testing/selftests/bpf/urandom_read.c
index 4acfdeb..9de8b7c 100644
--- a/tools/testing/selftests/bpf/urandom_read.c
+++ b/tools/testing/selftests/bpf/urandom_read.c
@@ -6,15 +6,21 @@
#include <stdlib.h>
#define BUF_SIZE 256
-int main(void)
+
+int main(int argc, char *argv[])
{
int fd = open("/dev/urandom", O_RDONLY);
int i;
char buf[BUF_SIZE];
+ int count = 4;
if (fd < 0)
return 1;
- for (i = 0; i < 4; ++i)
+
+ if (argc == 2)
+ count = atoi(argv[1]);
+
+ for (i = 0; i < count; ++i)
read(fd, buf, BUF_SIZE);
close(fd);
--
2.9.5
^ permalink raw reply related
* [PATCH v2 bpf-next 1/2] bpf: enable stackmap with build_id in nmi context
From: Song Liu @ 2018-05-02 23:20 UTC (permalink / raw)
To: netdev
Cc: Song Liu, kernel-team, qinteng, Alexei Starovoitov,
Daniel Borkmann, Peter Zijlstra
In-Reply-To: <20180502232030.3788284-1-songliubraving@fb.com>
Currently, we cannot parse build_id in nmi context because of
up_read(¤t->mm->mmap_sem), this makes stackmap with build_id
less useful. This patch enables parsing build_id in nmi by putting
the up_read() call in irq_work. To avoid memory allocation in nmi
context, we use per cpu variable for the irq_work. As a result, only
one irq_work per cpu is allowed. If the irq_work is in-use, we
fallback to only report ips.
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
init/Kconfig | 1 +
kernel/bpf/stackmap.c | 59 +++++++++++++++++++++++++++++++++++++++++++++------
2 files changed, 54 insertions(+), 6 deletions(-)
diff --git a/init/Kconfig b/init/Kconfig
index f013afc..480a4f2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1391,6 +1391,7 @@ config BPF_SYSCALL
bool "Enable bpf() system call"
select ANON_INODES
select BPF
+ select IRQ_WORK
default n
help
Enable the bpf() system call that allows to manipulate eBPF
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 3ba102b..51d4aea 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -11,6 +11,7 @@
#include <linux/perf_event.h>
#include <linux/elf.h>
#include <linux/pagemap.h>
+#include <linux/irq_work.h>
#include "percpu_freelist.h"
#define STACK_CREATE_FLAG_MASK \
@@ -32,6 +33,23 @@ struct bpf_stack_map {
struct stack_map_bucket *buckets[];
};
+/* irq_work to run up_read() for build_id lookup in nmi context */
+struct stack_map_irq_work {
+ struct irq_work irq_work;
+ struct rw_semaphore *sem;
+};
+
+static void do_up_read(struct irq_work *entry)
+{
+ struct stack_map_irq_work *work = container_of(entry,
+ struct stack_map_irq_work, irq_work);
+
+ up_read(work->sem);
+ work->sem = NULL;
+}
+
+static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
+
static inline bool stack_map_use_build_id(struct bpf_map *map)
{
return (map->map_flags & BPF_F_STACK_BUILD_ID);
@@ -267,17 +285,27 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
{
int i;
struct vm_area_struct *vma;
+ bool in_nmi_ctx = in_nmi();
+ bool irq_work_busy = false;
+ struct stack_map_irq_work *work;
+
+ if (in_nmi_ctx) {
+ work = this_cpu_ptr(&up_read_work);
+ if (work->irq_work.flags & IRQ_WORK_BUSY)
+ /* cannot queue more up_read, fallback */
+ irq_work_busy = true;
+ }
/*
- * We cannot do up_read() in nmi context, so build_id lookup is
- * only supported for non-nmi events. If at some point, it is
- * possible to run find_vma() without taking the semaphore, we
- * would like to allow build_id lookup in nmi context.
+ * We cannot do up_read() in nmi context. To do build_id lookup
+ * in nmi context, we need to run up_read() in irq_work. We use
+ * a percpu variable to do the irq_work. If the irq_work is
+ * already used by another lookup, we fall back to report ips.
*
* Same fallback is used for kernel stack (!user) on a stackmap
* with build_id.
*/
- if (!user || !current || !current->mm || in_nmi() ||
+ if (!user || !current || !current->mm || irq_work_busy ||
down_read_trylock(¤t->mm->mmap_sem) == 0) {
/* cannot access current->mm, fall back to ips */
for (i = 0; i < trace_nr; i++) {
@@ -299,7 +327,13 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
- vma->vm_start;
id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
}
- up_read(¤t->mm->mmap_sem);
+
+ if (!in_nmi_ctx)
+ up_read(¤t->mm->mmap_sem);
+ else {
+ work->sem = ¤t->mm->mmap_sem;
+ irq_work_queue(&work->irq_work);
+ }
}
BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
@@ -575,3 +609,16 @@ const struct bpf_map_ops stack_map_ops = {
.map_update_elem = stack_map_update_elem,
.map_delete_elem = stack_map_delete_elem,
};
+
+static int __init stack_map_init(void)
+{
+ int cpu;
+ struct stack_map_irq_work *work;
+
+ for_each_possible_cpu(cpu) {
+ work = per_cpu_ptr(&up_read_work, cpu);
+ init_irq_work(&work->irq_work, do_up_read);
+ }
+ return 0;
+}
+subsys_initcall(stack_map_init);
--
2.9.5
^ permalink raw reply related
* Re: [PATCH] sctp: fix a potential missing-check bug
From: Marcelo Ricardo Leitner @ 2018-05-02 23:23 UTC (permalink / raw)
To: Wenwen Wang
Cc: Kangjie Lu, Vlad Yasevich, Neil Horman, David S. Miller,
open list:SCTP PROTOCOL, open list:NETWORKING [GENERAL],
open list
In-Reply-To: <1525299165-27098-1-git-send-email-wang6495@umn.edu>
Hi Wenwen,
On Wed, May 02, 2018 at 05:12:45PM -0500, Wenwen Wang wrote:
> In sctp_setsockopt_maxseg(), the integer 'val' is compared against min_len
> and max_len to check whether it is in the appropriate range. If it is not,
> an error code -EINVAL will be returned. This is enforced by a security
> check. But, this check is only executed when 'val' is not 0. In fact, if
Which makes sense, no? Especially if considering that 0 should be an
allowed value as it turns off the user limit.
> 'val' is 0, it will be assigned with a new value (if the return value of
> the function sctp_id2assoc() is not 0) in the following execution. However,
> this new value of 'val' is not checked before it is used to assigned to
Which 'new value'? val is not set to something new during the
function. It always contains the user supplied value.
> asoc->user_frag. That means it is possible that the new value of 'val'
> could be out of the expected range. This can cause security issues
> such as buffer overflows, e.g., the new value of 'val' is used as an index
> to access a buffer.
>
> This patch inserts a check for the new value of 'val' to see if it is in
> the expected range. If it is not, an error code -EINVAL will be returned.
>
> Signed-off-by: Wenwen Wang <wang6495@umn.edu>
> ---
> net/sctp/socket.c | 21 ++++++++++-----------
> 1 file changed, 10 insertions(+), 11 deletions(-)
>
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index 80835ac..2beb601 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -3212,6 +3212,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
> struct sctp_af *af = sp->pf->af;
> struct sctp_assoc_value params;
> struct sctp_association *asoc;
> + int min_len, max_len;
> int val;
>
> if (optlen == sizeof(int)) {
> @@ -3231,19 +3232,15 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
> return -EINVAL;
> }
>
> - if (val) {
> - int min_len, max_len;
> + min_len = SCTP_DEFAULT_MINSEGMENT - af->net_header_len;
> + min_len -= af->ip_options_len(sk);
> + min_len -= sizeof(struct sctphdr) +
> + sizeof(struct sctp_data_chunk);
On which tree did you base your patch on? Your patch lacks a tag so it
defaults to net-next, and I reworked this section on current net-next
and these MTU calculcations are now handled by sctp_mtu_payload().
But even for net tree, I don't understand which issue you're fixing
here. Actually it seems to me that both codes seems to do the same
thing.
>
> - min_len = SCTP_DEFAULT_MINSEGMENT - af->net_header_len;
> - min_len -= af->ip_options_len(sk);
> - min_len -= sizeof(struct sctphdr) +
> - sizeof(struct sctp_data_chunk);
> + max_len = SCTP_MAX_CHUNK_LEN - sizeof(struct sctp_data_chunk);
>
> - max_len = SCTP_MAX_CHUNK_LEN - sizeof(struct sctp_data_chunk);
> -
> - if (val < min_len || val > max_len)
> - return -EINVAL;
> - }
> + if (val && (val < min_len || val > max_len))
> + return -EINVAL;
>
> asoc = sctp_id2assoc(sk, params.assoc_id);
> if (asoc) {
> @@ -3253,6 +3250,8 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned
> val -= sizeof(struct sctphdr) +
> sctp_datachk_len(&asoc->stream);
> }
> + if (val < min_len || val > max_len)
> + return -EINVAL;
> asoc->user_frag = val;
> asoc->frag_point = sctp_frag_point(asoc, asoc->pathmtu);
> } else {
> --
> 2.7.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-sctp" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
^ permalink raw reply
* Re: Silently dropped UDP packets on kernel 4.14
From: Kristian Evensen @ 2018-05-02 23:54 UTC (permalink / raw)
To: Florian Westphal; +Cc: Netfilter Development Mailing list, Network Development
In-Reply-To: <CAKfDRXhkXvF==6LQFOG3Rf4dH4QVVUxR=1cXZoM2FPQksGdxwg@mail.gmail.com>
[-- Attachment #1: Type: text/plain, Size: 12216 bytes --]
Hello,
On Wed, May 2, 2018 at 12:42 AM, Kristian Evensen
<kristian.evensen@gmail.com> wrote:
> My knowledge of the conntrack/nat subsystem is not that great, and I
> don't know the implications of what I am about to suggest. However,
> considering that the two packets represent the same flow, wouldn't it
> be possible to apply the existing nat-mapping to the second packet,
> and then let the second packet pass?
I have spent the day today trying to solve my problem and I think I am
almost there. I have attached my work in progress patch to this email
if anyone wants to take a look (for kernel 4.14).
I went for the early-insert approached and have patched
nfnetlink_queue to perform an insert if no conntrack entry is found
when the verdict is passed from user-space. If a conntrack entry is
found, then I replace the ct attached to the skb with the existing
conntrack entry. I have verified that my approach works by
artificially delaying the verdict from my application for the second
packet (so that the first packet has passed all netfilter hooks).
After replacing the ct, the second packet is handled correctly and
sent over the wire.
However, something goes wrong after the conntrack entry has been
inserted (using nf_conntrack_hash_check_insert()). At some random (but
very short) time after I see a couple of "early insert ..."/"early
insert confirmed", I get an RCU-stall. The trace for the stall looks
as follows:
[ 105.420024] INFO: rcu_sched self-detected stall on CPU
[ 105.425191] 2-...: (5999 ticks this GP) idle=12a/140000000000001/0
softirq=2674/2674 fqs=2543
[ 105.433845] (t=6001 jiffies g=587 c=586 q=5896)
[ 105.438545] CPU: 2 PID: 3632 Comm: dlb Not tainted 4.14.36 #0
[ 105.444261] Stack : 00000000 00000000 00000000 00000000 805c7ada
00000031 00000000 8052c588
[ 105.452610] 8fd48ff4 805668e7 804f5a2c 00000002 00000e30
00000001 8fc11d20 00000007
[ 105.460957] 00000000 00000000 805c0000 00007448 00000000
0000016e 00000007 00000000
[ 105.469303] 00000000 80570000 0006b111 00000000 80000000
00000000 80590000 804608e4
[ 105.477650] 8056c2c0 8056408c 000000e0 80560000 00000000
8027c418 00000008 805c0008
[ 105.485996] ...
[ 105.488437] Call Trace:
[ 105.490909] [<800103f8>] show_stack+0x58/0x100
[ 105.495363] [<8043c1dc>] dump_stack+0x9c/0xe0
[ 105.499707] [<8000d938>] arch_trigger_cpumask_backtrace+0x50/0x78
[ 105.505785] [<80083540>] rcu_dump_cpu_stacks+0xc4/0x134
[ 105.510991] [<800829c4>] rcu_check_callbacks+0x310/0x814
[ 105.516295] [<80085f04>] update_process_times+0x34/0x70
[ 105.521522] [<8009691c>] tick_handle_periodic+0x34/0xd0
[ 105.526749] [<802f6e98>] gic_compare_interrupt+0x48/0x58
[ 105.532047] [<80076450>] handle_percpu_devid_irq+0xbc/0x1a8
[ 105.537619] [<800707c0>] generic_handle_irq+0x40/0x58
[ 105.542679] [<8023a274>] gic_handle_local_int+0x84/0xd0
[ 105.547886] [<8023a434>] gic_irq_dispatch+0x10/0x20
[ 105.552747] [<800707c0>] generic_handle_irq+0x40/0x58
[ 105.557806] [<804591b4>] do_IRQ+0x1c/0x2c
[ 105.561805] [<802394bc>] plat_irq_dispatch+0xfc/0x138
[ 105.566839] [<8000b508>] except_vec_vi_end+0xb8/0xc4
[ 105.571904] [<8f200ca4>] nf_conntrack_lock+0x28c/0x440 [nf_conntrack]
[ 105.578341] ------------[ cut here ]------------
[ 105.582948] WARNING: CPU: 2 PID: 3632 at kernel/smp.c:416
smp_call_function_many+0xc8/0x3bc
[ 105.591257] Modules linked in: rt2800pci rt2800mmio rt2800lib
qcserial ppp_async option usb_wwan rt2x00pci rt2x00mmio rt2x00lib
rndis_host qmi_wwan ppp_generic nf_nat_pptp nf_conntrack_pptp
nf_conntrack_ipv6p
[ 105.662308] nf_nat_snmp_basic nf_nat_sip nf_nat_redirect
nf_nat_proto_gre nf_nat_masquerade_ipv4 nf_nat_irc nf_conntrack_ipv4
nf_nat_ipv4 nf_nat_h323 nf_nat_ftp nf_nat_amanda nf_nat nf_log_ipv4
nf_flow_tablt
[ 105.733288] ip_set_hash_netiface ip_set_hash_netport
ip_set_hash_netnet ip_set_hash_net ip_set_hash_netportnet
ip_set_hash_mac ip_set_hash_ipportnet ip_set_hash_ipportip
ip_set_hash_ipport ip_set_hash_ipmarm
[ 105.804724] ohci_hcd ehci_platform sd_mod scsi_mod ehci_hcd
gpio_button_hotplug usbcore nls_base usb_common mii
[ 105.814899] CPU: 2 PID: 3632 Comm: dlb Not tainted 4.14.36 #0
[ 105.820615] Stack : 00000000 00000000 00000000 00000000 805c7ada
00000031 00000000 8052c588
[ 105.828961] 8fd48ff4 805668e7 804f5a2c 00000002 00000e30
00000001 8fc11c88 00000007
[ 105.837308] 00000000 00000000 805c0000 00008590 00000000
0000018d 00000007 00000000
[ 105.845654] 00000000 80570000 000c6f33 00000000 80000000
00000000 80590000 8009e304
[ 105.854001] 00000009 000001a0 8000d0fc 00000000 00000000
8027c418 00000008 805c0008
[ 105.862348] ...
[ 105.864786] Call Trace:
[ 105.867230] [<800103f8>] show_stack+0x58/0x100
[ 105.871662] [<8043c1dc>] dump_stack+0x9c/0xe0
[ 105.876009] [<8002e190>] __warn+0xe0/0x114
[ 105.880091] [<8002e254>] warn_slowpath_null+0x1c/0x28
[ 105.885124] [<8009e304>] smp_call_function_many+0xc8/0x3bc
[ 105.890591] [<8000d950>] arch_trigger_cpumask_backtrace+0x68/0x78
[ 105.896663] [<80083540>] rcu_dump_cpu_stacks+0xc4/0x134
[ 105.901869] [<800829c4>] rcu_check_callbacks+0x310/0x814
[ 105.907164] [<80085f04>] update_process_times+0x34/0x70
[ 105.912374] [<8009691c>] tick_handle_periodic+0x34/0xd0
[ 105.917585] [<802f6e98>] gic_compare_interrupt+0x48/0x58
[ 105.922878] [<80076450>] handle_percpu_devid_irq+0xbc/0x1a8
[ 105.928434] [<800707c0>] generic_handle_irq+0x40/0x58
[ 105.933474] [<8023a274>] gic_handle_local_int+0x84/0xd0
[ 105.938681] [<8023a434>] gic_irq_dispatch+0x10/0x20
[ 105.943542] [<800707c0>] generic_handle_irq+0x40/0x58
[ 105.948581] [<804591b4>] do_IRQ+0x1c/0x2c
[ 105.952580] [<802394bc>] plat_irq_dispatch+0xfc/0x138
[ 105.957613] [<8000b508>] except_vec_vi_end+0xb8/0xc4
[ 105.962602] [<8f200ca4>] nf_conntrack_lock+0x28c/0x440 [nf_conntrack]
[ 105.969038] ---[ end trace 0b9e3b8a6909a0fb ]---
[ 105.973638] INFO: rcu_sched detected stalls on CPUs/tasks:
[ 105.979149] 2-...: (6054 ticks this GP) idle=12a/140000000000000/0
softirq=2674/2674 fqs=2544
[ 105.987818] (detected by 3, t=6056 jiffies, g=587, c=586, q=5896)
[ 105.993996] ------------[ cut here ]------------
[ 105.998615] WARNING: CPU: 3 PID: 0 at kernel/smp.c:291
smp_call_function_single+0xc0/0x18c
[ 106.006841] Modules linked in: rt2800pci rt2800mmio rt2800lib
qcserial ppp_async option usb_wwan rt2x00pci rt2x00mmio rt2x00lib
rndis_host qmi_wwan ppp_generic nf_nat_pptp nf_conntrack_pptp
nf_conntrack_ipv6p
[ 106.077960] nf_nat_snmp_basic nf_nat_sip nf_nat_redirect
nf_nat_proto_gre nf_nat_masquerade_ipv4 nf_nat_irc nf_conntrack_ipv4
nf_nat_ipv4 nf_nat_h323 nf_nat_ftp nf_nat_amanda nf_nat nf_log_ipv4
nf_flow_tablt
[ 106.148997] ip_set_hash_netiface ip_set_hash_netport
ip_set_hash_netnet ip_set_hash_net ip_set_hash_netportnet
ip_set_hash_mac ip_set_hash_ipportnet ip_set_hash_ipportip
ip_set_hash_ipport ip_set_hash_ipmarm
[ 106.220489] ohci_hcd ehci_platform sd_mod scsi_mod ehci_hcd
gpio_button_hotplug usbcore nls_base usb_common mii
[ 106.230686] CPU: 3 PID: 0 Comm: swapper/3 Tainted: G W
4.14.36 #0
[ 106.237876] Stack : 00000000 00000000 00000000 00000000 805c7ada
00000042 00000000 8052c588
[ 106.246232] 8fc44e74 805668e7 804f5a2c 00000003 00000000
00000001 8fc15c80 00000007
[ 106.254587] 00000000 00000000 805c0000 000098c0 00000000
000001b3 00000007 00000000
[ 106.262942] 00000000 80570000 0003851e 00000000 00000000
00000000 80590000 8009dfb0
[ 106.271299] 00000009 00000123 000000e0 80560000 00000001
8027c418 0000000c 805c000c
[ 106.279653] ...
[ 106.282097] Call Trace:
[ 106.284563] [<800103f8>] show_stack+0x58/0x100
[ 106.289026] [<8043c1dc>] dump_stack+0x9c/0xe0
[ 106.293380] [<8002e190>] __warn+0xe0/0x114
[ 106.297467] [<8002e254>] warn_slowpath_null+0x1c/0x28
[ 106.302506] [<8009dfb0>] smp_call_function_single+0xc0/0x18c
[ 106.308150] [<8000d950>] arch_trigger_cpumask_backtrace+0x68/0x78
[ 106.314235] [<80083540>] rcu_dump_cpu_stacks+0xc4/0x134
[ 106.319445] [<80082c8c>] rcu_check_callbacks+0x5d8/0x814
[ 106.324757] [<80085f04>] update_process_times+0x34/0x70
[ 106.329992] [<8009691c>] tick_handle_periodic+0x34/0xd0
[ 106.335230] [<802f6e98>] gic_compare_interrupt+0x48/0x58
[ 106.340537] [<80076450>] handle_percpu_devid_irq+0xbc/0x1a8
[ 106.346118] [<800707c0>] generic_handle_irq+0x40/0x58
[ 106.351190] [<8023a274>] gic_handle_local_int+0x84/0xd0
[ 106.356402] [<8023a434>] gic_irq_dispatch+0x10/0x20
[ 106.361270] [<800707c0>] generic_handle_irq+0x40/0x58
[ 106.366339] [<804591b4>] do_IRQ+0x1c/0x2c
[ 106.370344] [<802394bc>] plat_irq_dispatch+0xfc/0x138
[ 106.375383] [<8000b508>] except_vec_vi_end+0xb8/0xc4
[ 106.380336] [<8000ced0>] r4k_wait_irqoff+0x1c/0x24
[ 106.385146] [<80065f6c>] do_idle+0xe4/0x168
[ 106.389322] [<800661e8>] cpu_startup_entry+0x24/0x2c
[ 106.394275] ---[ end trace 0b9e3b8a6909a0fc ]---
[ 106.398888] CPU: 2 PID: 3632 Comm: dlb Tainted: G W 4.14.36 #0
[ 106.405819] task: 8fd48c80 task.stack: 8e274000
[ 106.410324] $ 0 : 00000000 00000001 814b7588 00000001
[ 106.415544] $ 4 : 09036633 80564198 71804fce d0db5a72
[ 106.420763] $ 8 : 00000000 8aee9400 814bee18 00157558
[ 106.425982] $12 : 00000000 00000000 fffffffe 00000114
[ 106.431201] $16 : 00005b78 8e275998 8e874540 80560000
[ 106.436421] $20 : 00000001 8f210000 00005a31 8f210000
[ 106.441639] $24 : 00000000 80015e5c
[ 106.446857] $28 : 8e274000 8e275940 8f210000 8f201634
[ 106.452078] Hi : 00005a31
[ 106.454942] Lo : 6e620000
[ 106.457912] epc : 8f2017f4 nf_conntrack_tuple_taken+0x258/0x308
[nf_conntrack]
[ 106.465299] ra : 8f201634 nf_conntrack_tuple_taken+0x98/0x308
[nf_conntrack]
[ 106.472571] Status: 11007c03 KERNEL EXL IE
[ 106.476748] Cause : 50800400 (ExcCode 00)
[ 106.480738] PrId : 0001992f (MIPS 1004Kc)
[ 106.484819] CPU: 2 PID: 3632 Comm: dlb Tainted: G W 4.14.36 #0
[ 106.491742] Stack : 00000000 00000000 00000000 00000000 805c7ada
0000003f 00000000 8052c588
[ 106.500088] 8fd48ff4 805668e7 804f5a2c 00000002 00000e30
00000001 8fc11d68 00000007
[ 106.508435] 00000000 00000000 805c0000 0000a498 00000000
000001e3 00000007 00000000
[ 106.516779] 00000000 80570000 000765d3 00000000 80000000
00000000 80590000 8000d0fc
[ 106.525126] 804fce50 805574e0 804faa5c 80570000 00000002
8027c418 00000008 805c0008
[ 106.533473] ...
[ 106.535911] Call Trace:
[ 106.538356] [<800103f8>] show_stack+0x58/0x100
[ 106.542788] [<8043c1dc>] dump_stack+0x9c/0xe0
[ 106.547130] [<8009dccc>] flush_smp_call_function_queue+0x144/0x1d8
[ 106.553294] [<80015830>] ipi_call_interrupt+0x10/0x20
[ 106.558336] [<80071380>] __handle_irq_event_percpu+0x78/0x18c
[ 106.564061] [<800714b4>] handle_irq_event_percpu+0x20/0x64
[ 106.569528] [<80071558>] handle_irq_event+0x60/0xa8
[ 106.574388] [<800757d0>] handle_edge_irq+0x204/0x25c
[ 106.579337] [<800707c0>] generic_handle_irq+0x40/0x58
[ 106.584377] [<8023a3c8>] gic_handle_shared_int+0x108/0x164
[ 106.589844] [<800707c0>] generic_handle_irq+0x40/0x58
[ 106.594883] [<804591b4>] do_IRQ+0x1c/0x2c
[ 106.598880] [<802394bc>] plat_irq_dispatch+0xfc/0x138
[ 106.603913] [<8000b508>] except_vec_vi_end+0xb8/0xc4
[ 106.608901] [<8f2017f4>] nf_conntrack_tuple_taken+0x258/0x308 [nf_conntrack]
[ 106.615963] [<8f1fcd88>] nf_ct_nat_ext_add+0x7ac/0x908 [nf_nat]
I instrumented nf_conntrack_tuple_taken() and it seems that what
happens, is that tuple_taken() spins on the "goto begin"-part. I am
probably doing something wrong with my insert, but I am not able to
see what. Is anyone able to see what is wrong, or can provide me with
pointers on where to look? When looking at the syslog, I also noticed
the following:
[ 45.394817] early insert: tuple 17 192.168.6.2:36376 -> 1.1.1.1:53
[ 45.401056] early insert confirmed
[ 45.405468] early insert: tuple 17 192.168.6.2:36376 -> 1.1.1.1:53
[ 45.411996] early insert confirmed
I.e., the entry created by the first packet is not found when the
second packet is handled. Could it be some synchronization step that I
am missing?
Thanks in advance for any help,
Kristian
[-- Attachment #2: 0001-NFQUEUE-early-insert.patch --]
[-- Type: text/x-patch, Size: 3148 bytes --]
From cfb4e8187940b89acf163a7f4181bf04abd333d0 Mon Sep 17 00:00:00 2001
From: Kristian Evensen <kristian.evensen@gmail.com>
Date: Tue, 1 May 2018 22:44:16 +0200
Subject: [PATCH] NFQUEUE early insert
---
net/netfilter/nfnetlink_queue.c | 69 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 69 insertions(+)
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index c97966298..254866d5c 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -43,6 +43,9 @@
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_l4proto.h>
#endif
#define NFQNL_QMAX_DEFAULT 1024
@@ -1151,6 +1154,64 @@ static int nfqa_parse_bridge(struct nf_queue_entry *entry,
return 0;
}
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+static void nfqnl_update_ct(struct net *net, struct sk_buff *skb)
+{
+ struct nf_conn *ct = NULL;
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conntrack_l3proto *l3proto;
+ const struct nf_conntrack_l4proto *l4proto;
+ u_int16_t l3num;
+ unsigned int dataoff;
+ u_int8_t l4num;
+ struct nf_conntrack_tuple tuple;
+ struct nf_conntrack_tuple_hash *h;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ l3num = nf_ct_l3num(ct);
+ l3proto = nf_ct_l3proto_find_get(l3num);
+
+ if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff,
+ &l4num) <= 0) {
+ pr_info("Failed to get l4 protonum\n");
+ return;
+ }
+
+ l4proto = nf_ct_l4proto_find_get(l3num, l4num);
+
+ if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
+ l4num, net, &tuple, l3proto, l4proto)) {
+ pr_info("Failed to get tuple\n");
+ return;
+ }
+
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)
+ h = nf_conntrack_find_get(net, &(ct->zone), &tuple);
+#else
+ h = nf_conntrack_find_get(net, NULL, &tuple);
+#endif
+
+ if (!h) {
+ pr_info("early insert: tuple %u %pI4:%hu -> %pI4:%hu\n",
+ tuple.dst.protonum, &(tuple.src.u3.ip), ntohs(tuple.src.u.all),
+ &(tuple.dst.u3.ip), ntohs(tuple.dst.u.all));
+
+ rcu_read_lock();
+ if (!nf_conntrack_hash_check_insert(ct)) {
+ pr_info("early insert confirmed\n");
+ }
+ rcu_read_unlock();
+ } else {
+ pr_info("replace: tuple %u %pI4:%hu -> %pI4:%hu\n",
+ tuple.dst.protonum, &(tuple.src.u3.ip), ntohs(tuple.src.u.all),
+ &(tuple.dst.u3.ip), ntohs(tuple.dst.u.all));
+ nf_ct_put(ct);
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ nf_ct_set(skb, ct, IP_CT_NEW);
+ }
+}
+#endif
+
static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
struct sk_buff *skb,
const struct nlmsghdr *nlh,
@@ -1213,6 +1274,14 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
if (nfqa[NFQA_MARK])
entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ nf_ct_get(entry->skb, &ctinfo);
+
+ if (ctinfo == IP_CT_NEW && verdict != NF_STOLEN && verdict != NF_DROP) {
+ nfqnl_update_ct(net, entry->skb);
+ }
+#endif
+
nf_reinject(entry, verdict);
return 0;
}
--
2.14.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox