* [PATCH bpf-next v4 2/4] bpf: support cloning sk storage on accept()
From: Stanislav Fomichev @ 2019-08-14 17:37 UTC (permalink / raw)
To: netdev, bpf
Cc: davem, ast, daniel, Stanislav Fomichev, Martin KaFai Lau,
Yonghong Song
In-Reply-To: <20190814173751.31806-1-sdf@google.com>
Add new helper bpf_sk_storage_clone which optionally clones sk storage
and call it from sk_clone_lock.
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
include/net/bpf_sk_storage.h | 10 ++++
include/uapi/linux/bpf.h | 3 +
net/core/bpf_sk_storage.c | 104 ++++++++++++++++++++++++++++++++++-
net/core/sock.c | 9 ++-
4 files changed, 120 insertions(+), 6 deletions(-)
diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
index b9dcb02e756b..8e4f831d2e52 100644
--- a/include/net/bpf_sk_storage.h
+++ b/include/net/bpf_sk_storage.h
@@ -10,4 +10,14 @@ void bpf_sk_storage_free(struct sock *sk);
extern const struct bpf_func_proto bpf_sk_storage_get_proto;
extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
+#ifdef CONFIG_BPF_SYSCALL
+int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
+#else
+static inline int bpf_sk_storage_clone(const struct sock *sk,
+ struct sock *newsk)
+{
+ return 0;
+}
+#endif
+
#endif /* _BPF_SK_STORAGE_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4393bd4b2419..0ef594ac3899 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -337,6 +337,9 @@ enum bpf_attach_type {
#define BPF_F_RDONLY_PROG (1U << 7)
#define BPF_F_WRONLY_PROG (1U << 8)
+/* Clone map from listener for newly accepted socket */
+#define BPF_F_CLONE (1U << 9)
+
/* flags for BPF_PROG_QUERY */
#define BPF_F_QUERY_EFFECTIVE (1U << 0)
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 94c7f77ecb6b..da5639a5bd3b 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -12,6 +12,9 @@
static atomic_t cache_idx;
+#define SK_STORAGE_CREATE_FLAG_MASK \
+ (BPF_F_NO_PREALLOC | BPF_F_CLONE)
+
struct bucket {
struct hlist_head list;
raw_spinlock_t lock;
@@ -209,7 +212,6 @@ static void selem_unlink_sk(struct bpf_sk_storage_elem *selem)
kfree_rcu(sk_storage, rcu);
}
-/* sk_storage->lock must be held and sk_storage->list cannot be empty */
static void __selem_link_sk(struct bpf_sk_storage *sk_storage,
struct bpf_sk_storage_elem *selem)
{
@@ -509,7 +511,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
return 0;
}
-/* Called by __sk_destruct() */
+/* Called by __sk_destruct() & bpf_sk_storage_clone() */
void bpf_sk_storage_free(struct sock *sk)
{
struct bpf_sk_storage_elem *selem;
@@ -557,6 +559,11 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
smap = (struct bpf_sk_storage_map *)map;
+ /* Note that this map might be concurrently cloned from
+ * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
+ * RCU read section to finish before proceeding. New RCU
+ * read sections should be prevented via bpf_map_inc_not_zero.
+ */
synchronize_rcu();
/* bpf prog and the userspace can no longer access this map
@@ -601,7 +608,9 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
{
- if (attr->map_flags != BPF_F_NO_PREALLOC || attr->max_entries ||
+ if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK ||
+ !(attr->map_flags & BPF_F_NO_PREALLOC) ||
+ attr->max_entries ||
attr->key_size != sizeof(int) || !attr->value_size ||
/* Enforce BTF for userspace sk dumping */
!attr->btf_key_type_id || !attr->btf_value_type_id)
@@ -739,6 +748,95 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
return err;
}
+static struct bpf_sk_storage_elem *
+bpf_sk_storage_clone_elem(struct sock *newsk,
+ struct bpf_sk_storage_map *smap,
+ struct bpf_sk_storage_elem *selem)
+{
+ struct bpf_sk_storage_elem *copy_selem;
+
+ copy_selem = selem_alloc(smap, newsk, NULL, true);
+ if (!copy_selem)
+ return NULL;
+
+ if (map_value_has_spin_lock(&smap->map))
+ copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
+ SDATA(selem)->data, true);
+ else
+ copy_map_value(&smap->map, SDATA(copy_selem)->data,
+ SDATA(selem)->data);
+
+ return copy_selem;
+}
+
+int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
+{
+ struct bpf_sk_storage *new_sk_storage = NULL;
+ struct bpf_sk_storage *sk_storage;
+ struct bpf_sk_storage_elem *selem;
+ int ret = 0;
+
+ RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
+
+ rcu_read_lock();
+ sk_storage = rcu_dereference(sk->sk_bpf_storage);
+
+ if (!sk_storage || hlist_empty(&sk_storage->list))
+ goto out;
+
+ hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
+ struct bpf_sk_storage_elem *copy_selem;
+ struct bpf_sk_storage_map *smap;
+ struct bpf_map *map;
+
+ smap = rcu_dereference(SDATA(selem)->smap);
+ if (!(smap->map.map_flags & BPF_F_CLONE))
+ continue;
+
+ /* Note that for lockless listeners adding new element
+ * here can race with cleanup in bpf_sk_storage_map_free.
+ * Try to grab map refcnt to make sure that it's still
+ * alive and prevent concurrent removal.
+ */
+ map = bpf_map_inc_not_zero(&smap->map, false);
+ if (IS_ERR(map))
+ continue;
+
+ copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
+ if (!copy_selem) {
+ ret = -ENOMEM;
+ bpf_map_put(map);
+ goto out;
+ }
+
+ if (new_sk_storage) {
+ selem_link_map(smap, copy_selem);
+ __selem_link_sk(new_sk_storage, copy_selem);
+ } else {
+ ret = sk_storage_alloc(newsk, smap, copy_selem);
+ if (ret) {
+ kfree(copy_selem);
+ atomic_sub(smap->elem_size,
+ &newsk->sk_omem_alloc);
+ bpf_map_put(map);
+ goto out;
+ }
+
+ new_sk_storage = rcu_dereference(copy_selem->sk_storage);
+ }
+ bpf_map_put(map);
+ }
+
+out:
+ rcu_read_unlock();
+
+ /* In case of an error, don't free anything explicitly here, the
+ * caller is responsible to call bpf_sk_storage_free.
+ */
+
+ return ret;
+}
+
BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
void *, value, u64, flags)
{
diff --git a/net/core/sock.c b/net/core/sock.c
index d57b0cc995a0..f5e801a9cea4 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1851,9 +1851,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
goto out;
}
RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
-#ifdef CONFIG_BPF_SYSCALL
- RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
-#endif
+
+ if (bpf_sk_storage_clone(sk, newsk)) {
+ sk_free_unlock_clone(newsk);
+ newsk = NULL;
+ goto out;
+ }
newsk->sk_err = 0;
newsk->sk_err_soft = 0;
--
2.23.0.rc1.153.gdeed80330f-goog
^ permalink raw reply related
* [PATCH bpf-next v4 1/4] bpf: export bpf_map_inc_not_zero
From: Stanislav Fomichev @ 2019-08-14 17:37 UTC (permalink / raw)
To: netdev, bpf
Cc: davem, ast, daniel, Stanislav Fomichev, Martin KaFai Lau,
Yonghong Song
In-Reply-To: <20190814173751.31806-1-sdf@google.com>
Rename existing bpf_map_inc_not_zero to __bpf_map_inc_not_zero to
indicate that it's caller's responsibility to do proper locking.
Create and export bpf_map_inc_not_zero wrapper that properly
locks map_idr_lock. Will be used in the next commit to
hold a map while cloning a socket.
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
include/linux/bpf.h | 2 ++
kernel/bpf/syscall.c | 16 +++++++++++++---
2 files changed, 15 insertions(+), 3 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f9a506147c8a..15ae49862b82 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -647,6 +647,8 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock);
struct bpf_map *bpf_map_get_with_uref(u32 ufd);
struct bpf_map *__bpf_map_get(struct fd f);
struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref);
+struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map,
+ bool uref);
void bpf_map_put_with_uref(struct bpf_map *map);
void bpf_map_put(struct bpf_map *map);
int bpf_map_charge_memlock(struct bpf_map *map, u32 pages);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5d141f16f6fa..cf8052b016e7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -683,8 +683,8 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
}
/* map_idr_lock should have been held */
-static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map,
- bool uref)
+static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map,
+ bool uref)
{
int refold;
@@ -704,6 +704,16 @@ static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map,
return map;
}
+struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
+{
+ spin_lock_bh(&map_idr_lock);
+ map = __bpf_map_inc_not_zero(map, uref);
+ spin_unlock_bh(&map_idr_lock);
+
+ return map;
+}
+EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
+
int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
return -ENOTSUPP;
@@ -2177,7 +2187,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
spin_lock_bh(&map_idr_lock);
map = idr_find(&map_idr, id);
if (map)
- map = bpf_map_inc_not_zero(map, true);
+ map = __bpf_map_inc_not_zero(map, true);
else
map = ERR_PTR(-ENOENT);
spin_unlock_bh(&map_idr_lock);
--
2.23.0.rc1.153.gdeed80330f-goog
^ permalink raw reply related
* [PATCH bpf-next v4 0/4] bpf: support cloning sk storage on accept()
From: Stanislav Fomichev @ 2019-08-14 17:37 UTC (permalink / raw)
To: netdev, bpf
Cc: davem, ast, daniel, Stanislav Fomichev, Martin KaFai Lau,
Yonghong Song
Currently there is no way to propagate sk storage from the listener
socket to a newly accepted one. Consider the following use case:
fd = socket();
setsockopt(fd, SOL_IP, IP_TOS,...);
/* ^^^ setsockopt BPF program triggers here and saves something
* into sk storage of the listener.
*/
listen(fd, ...);
while (client = accept(fd)) {
/* At this point all association between listener
* socket and newly accepted one is gone. New
* socket will not have any sk storage attached.
*/
}
Let's add new BPF_F_CLONE flag that can be specified when creating
a socket storage map. This new flag indicates that map contents
should be cloned when the socket is cloned.
v4:
* drop 'goto err' in bpf_sk_storage_clone (Yonghong Song)
* add comment about race with bpf_sk_storage_map_free to the
bpf_sk_storage_clone side as well (Daniel Borkmann)
v3:
* make sure BPF_F_NO_PREALLOC is always present when creating
a map (Martin KaFai Lau)
* don't call bpf_sk_storage_free explicitly, rely on
sk_free_unlock_clone to do the cleanup (Martin KaFai Lau)
v2:
* remove spinlocks around selem_link_map/sk (Martin KaFai Lau)
* BPF_F_CLONE on a map, not selem (Martin KaFai Lau)
* hold a map while cloning (Martin KaFai Lau)
* use BTF maps in selftests (Yonghong Song)
* do proper cleanup selftests; don't call close(-1) (Yonghong Song)
* export bpf_map_inc_not_zero
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Stanislav Fomichev (4):
bpf: export bpf_map_inc_not_zero
bpf: support cloning sk storage on accept()
bpf: sync bpf.h to tools/
selftests/bpf: add sockopt clone/inheritance test
include/linux/bpf.h | 2 +
include/net/bpf_sk_storage.h | 10 +
include/uapi/linux/bpf.h | 3 +
kernel/bpf/syscall.c | 16 +-
net/core/bpf_sk_storage.c | 104 ++++++-
net/core/sock.c | 9 +-
tools/include/uapi/linux/bpf.h | 3 +
tools/testing/selftests/bpf/.gitignore | 1 +
tools/testing/selftests/bpf/Makefile | 3 +-
.../selftests/bpf/progs/sockopt_inherit.c | 97 +++++++
.../selftests/bpf/test_sockopt_inherit.c | 253 ++++++++++++++++++
11 files changed, 491 insertions(+), 10 deletions(-)
create mode 100644 tools/testing/selftests/bpf/progs/sockopt_inherit.c
create mode 100644 tools/testing/selftests/bpf/test_sockopt_inherit.c
--
2.23.0.rc1.153.gdeed80330f-goog
^ permalink raw reply
* Re: [PATCH bpf-next v3 2/4] bpf: support cloning sk storage on accept()
From: Stanislav Fomichev @ 2019-08-14 17:35 UTC (permalink / raw)
To: Martin Lau
Cc: netdev@vger.kernel.org, bpf@vger.kernel.org, davem@davemloft.net,
ast@kernel.org, daniel@iogearbox.net, Yonghong Song
In-Reply-To: <20190814172819.syz5skzil2ekdu5g@kafai-mbp>
On Wed, Aug 14, 2019 at 10:28 AM Martin Lau <kafai@fb.com> wrote:
>
> On Tue, Aug 13, 2019 at 09:26:28AM -0700, Stanislav Fomichev wrote:
> > Add new helper bpf_sk_storage_clone which optionally clones sk storage
> > and call it from sk_clone_lock.
> Acked-by: Martin KaFai Lau <kafai@fb.com>
Thanks! Will send out a v4 to address Yonghong's and Daniel's suggestions.
^ permalink raw reply
* Re: [PATCH bpf-next v3 2/4] bpf: support cloning sk storage on accept()
From: Martin Lau @ 2019-08-14 17:28 UTC (permalink / raw)
To: Stanislav Fomichev
Cc: netdev@vger.kernel.org, bpf@vger.kernel.org, davem@davemloft.net,
ast@kernel.org, daniel@iogearbox.net, Yonghong Song
In-Reply-To: <20190813162630.124544-3-sdf@google.com>
On Tue, Aug 13, 2019 at 09:26:28AM -0700, Stanislav Fomichev wrote:
> Add new helper bpf_sk_storage_clone which optionally clones sk storage
> and call it from sk_clone_lock.
Acked-by: Martin KaFai Lau <kafai@fb.com>
^ permalink raw reply
* Re: fallout from net-next netfilter changes
From: Pablo Neira Ayuso @ 2019-08-14 17:28 UTC (permalink / raw)
To: David Miller; +Cc: netfilter-devel, netdev
In-Reply-To: <20190814172758.xtf6ioke4qztzzqi@salvia>
On Wed, Aug 14, 2019 at 07:27:58PM +0200, Pablo Neira Ayuso wrote:
> On Wed, Aug 14, 2019 at 12:53:30PM -0400, David Miller wrote:
> >
> > This started happening after Jakub's pull of your net-next changes
> > yesterday:
> >
> > ./include/uapi/linux/netfilter_ipv6/ip6t_LOG.h:5:2: warning: #warning "Please update iptables, this file will be removed soon!" [-Wcpp]
> > #warning "Please update iptables, this file will be removed soon!"
> > ^~~~~~~
> > In file included from <command-line>:
> > ./include/uapi/linux/netfilter_ipv4/ipt_LOG.h:5:2: warning: #warning "Please update iptables, this file will be removed soon!" [-Wcpp]
> > #warning "Please update iptables, this file will be removed soon!"
> > ^~~~~~~
> >
> > It's probably from the standard kernel build UAPI header checks.
> >
> > Please fix this.
>
> Would you apply this patch that Jeremy posted via net-next instead of
> nf-next?
>
> http://patchwork.ozlabs.org/patch/1146821/
Else I can prepare a small pull request with a few patches for
net-next later today, your choice.
^ permalink raw reply
* Re: [PATCH bpf-next] libbpf: make libbpf.map source of truth for libbpf version
From: Andrii Nakryiko @ 2019-08-14 17:28 UTC (permalink / raw)
To: Andrey Ignatov
Cc: Andrii Nakryiko, bpf@vger.kernel.org, netdev@vger.kernel.org,
Alexei Starovoitov, daniel@iogearbox.net, Kernel Team
In-Reply-To: <20190814071242.GA41688@rdna-mbp>
On Wed, Aug 14, 2019 at 12:12 AM Andrey Ignatov <rdna@fb.com> wrote:
>
> Andrii Nakryiko <andrii.nakryiko@gmail.com> [Tue, 2019-08-13 21:46 -0700]:
> > On Tue, Aug 13, 2019 at 5:28 PM Andrey Ignatov <rdna@fb.com> wrote:
> > >
> > > Andrii Nakryiko <andriin@fb.com> [Tue, 2019-08-13 16:24 -0700]:
> > > > Currently libbpf version is specified in 2 places: libbpf.map and
> > > > Makefile. They easily get out of sync and it's very easy to update one,
> > > > but forget to update another one. In addition, Github projection of
> > > > libbpf has to maintain its own version which has to be remembered to be
> > > > kept in sync manually, which is a very error-prone approach.
> > > >
> > > > This patch makes libbpf.map a source of truth for libbpf version and
> > > > uses shell invocation to parse out correct full and major libbpf version
> > > > to use during build. Now we need to make sure that, once a new release
> > > > cycle starts, we add an (initially) empty section to libbpf.map
> > > > with the correct latest version.
> > > >
> > > > This also will make it possible to keep Github projection consistent
> > > > with kernel sources version of libbpf by adopting similar parsing of
> > > > version from libbpf.map.
> > >
> > > Thanks for taking care of this!
> > >
> > >
> > > > Cc: Andrey Ignatov <rdna@fb.com>
> > > > Signed-off-by: Andrii Nakryiko <andriin@fb.com>
> > > > ---
> > > > tools/lib/bpf/Makefile | 12 +++++-------
> > > > tools/lib/bpf/libbpf.map | 3 +++
> > > > 2 files changed, 8 insertions(+), 7 deletions(-)
> > > >
> > > > diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile
> > > > index 9312066a1ae3..d9afc8509725 100644
> > > > --- a/tools/lib/bpf/Makefile
> > > > +++ b/tools/lib/bpf/Makefile
> > > > @@ -1,9 +1,10 @@
> > > > # SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
> > > > # Most of this file is copied from tools/lib/traceevent/Makefile
> > > >
> > > > -BPF_VERSION = 0
> > > > -BPF_PATCHLEVEL = 0
> > > > -BPF_EXTRAVERSION = 4
> > > > +BPF_FULL_VERSION = $(shell \
> > >
> > > Nit: Should it be LIBBPF_VERSION? IMO it's more descriptive name.
> >
> > LIBBPF_VERSION is used below, but combining your suggestion with
> > Jakub's eager evaluation, I can use just LIBBPF_VERSION and drop
> > BPF_FULL_VERSION altogether.
> >
> > >
> > > > + grep -E 'LIBBPF_([0-9]+)\.([0-9]+)\.([0-9]+) \{' libbpf.map | \
> > > > + tail -n1 | cut -d'_' -f2 | cut -d' ' -f1)
> > >
> > > It can be done simpler and IMO versions should be sorted before taking
> > > the last one (just in case), something like:
> > >
> > > grep -oE '^LIBBPF_[0-9.]+' libbpf.map | cut -d_ -f 2 | sort -nr | head -n 1
> >
> > Ah, you mean making regex simpler? Yeah, I originally intended to
> > extract major, patch, and extra version, but realized patch and extra
> > are not used for anything. I'll simplify regex. But second `cut -d' '
> > -f1` is still needed to drop " {".
>
> Yeah, regex, but not only. Note `-o' in the `grep' arguments, it returns
> only matched piece of a string and the second `cut' is not needed.
Oh, TIL, will do -o as well, didn't notice it first time.
>
>
> > Regarding sorting. I don't think it's necessary, as I can't imagine
> > having non-ordered libbpf.map. Even more so, sort -nr doesn't sort
> > versions like these correctly anyway:
> >
> > 0.1.2
> > 0.1.12
> >
> > So this will just give us a false sense of correctness, while being a "time bomb".
>
> Right, `-n' is not a good one, `-V' is much better since it's intended
> to sort specifically versions:
>
> % printf "0.1.2\n0.1.12\n0.1.11\n"
> 0.1.2
> 0.1.12
> 0.1.11
> % printf "0.1.2\n0.1.12\n0.1.11\n" | sort -cV
> sort: -:3: disorder: 0.1.11
> % printf "0.1.2\n0.1.12\n0.1.11\n" | sort -V
> 0.1.2
> 0.1.11
> 0.1.12
>
>
> The reason I brought this up is the version string can be an arbitrary string
> and for example glibc does this:
>
> % grep -Eo '^\s+GLIBC_\S+' sysdeps/unix/sysv/linux/Versions | tail -n 3
> GLIBC_2.29
> GLIBC_2.30
> GLIBC_PRIVATE
>
> I agree though that it's not a problem with the current version script
> structure and it should be fine to postpone adding some kind of sorting till
> the time this structure is changed (if at all).
I like sort -V, will use that in v3, thanks!
>
> > > > +BPF_VERSION = $(firstword $(subst ., ,$(BPF_FULL_VERSION)))
> > > >
> > > > MAKEFLAGS += --no-print-directory
> > > >
> > > > @@ -79,15 +80,12 @@ export prefix libdir src obj
> > > > libdir_SQ = $(subst ','\'',$(libdir))
> > > > libdir_relative_SQ = $(subst ','\'',$(libdir_relative))
> > > >
> > > > +LIBBPF_VERSION = $(BPF_FULL_VERSION)
> > > > VERSION = $(BPF_VERSION)
> > > > -PATCHLEVEL = $(BPF_PATCHLEVEL)
> > > > -EXTRAVERSION = $(BPF_EXTRAVERSION)
> > > >
> > > > OBJ = $@
> > > > N =
> > > >
> > > > -LIBBPF_VERSION = $(BPF_VERSION).$(BPF_PATCHLEVEL).$(BPF_EXTRAVERSION)
> > > > -
> > > > LIB_TARGET = libbpf.a libbpf.so.$(LIBBPF_VERSION)
> > > > LIB_FILE = libbpf.a libbpf.so*
> > > > PC_FILE = libbpf.pc
> > > > diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
> > > > index f9d316e873d8..4e72df8e98ba 100644
> > > > --- a/tools/lib/bpf/libbpf.map
> > > > +++ b/tools/lib/bpf/libbpf.map
> > > > @@ -184,3 +184,6 @@ LIBBPF_0.0.4 {
> > > > perf_buffer__new_raw;
> > > > perf_buffer__poll;
> > > > } LIBBPF_0.0.3;
> > > > +
> > > > +LIBBPF_0.0.5 {
> > > > +} LIBBPF_0.0.4;
> > >
> > > I'm not sure version should be bumped in this patch since this patch is
> > > about keeping the version in one place, not about bumping it, right?
> >
> > This is actually fixing a version. Current libbpf version in bpf-next
> > is 0.0.5, it just was never updated in Makefile.
> >
> > >
> > >
> > > > --
> > > > 2.17.1
> > > >
> > >
> > > --
> > > Andrey Ignatov
>
> --
> Andrey Ignatov
^ permalink raw reply
* Re: fallout from net-next netfilter changes
From: Pablo Neira Ayuso @ 2019-08-14 17:27 UTC (permalink / raw)
To: David Miller; +Cc: netfilter-devel, netdev
In-Reply-To: <20190814.125330.1934256694306164517.davem@davemloft.net>
On Wed, Aug 14, 2019 at 12:53:30PM -0400, David Miller wrote:
>
> This started happening after Jakub's pull of your net-next changes
> yesterday:
>
> ./include/uapi/linux/netfilter_ipv6/ip6t_LOG.h:5:2: warning: #warning "Please update iptables, this file will be removed soon!" [-Wcpp]
> #warning "Please update iptables, this file will be removed soon!"
> ^~~~~~~
> In file included from <command-line>:
> ./include/uapi/linux/netfilter_ipv4/ipt_LOG.h:5:2: warning: #warning "Please update iptables, this file will be removed soon!" [-Wcpp]
> #warning "Please update iptables, this file will be removed soon!"
> ^~~~~~~
>
> It's probably from the standard kernel build UAPI header checks.
>
> Please fix this.
Would you apply this patch that Jeremy posted via net-next instead of
nf-next?
http://patchwork.ozlabs.org/patch/1146821/
Thanks.
^ permalink raw reply
* Re: [PATCH net-next] net: phy: realtek: add NBase-T PHY auto-detection
From: David Miller @ 2019-08-14 17:26 UTC (permalink / raw)
To: hkallweit1; +Cc: andrew, f.fainelli, netdev
In-Reply-To: <e69e636d-9109-aec9-4d8a-e36af37a706b@gmail.com>
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Tue, 13 Aug 2019 08:09:32 +0200
> Realtek provided information on how the new NIC-integrated PHYs
> expose whether they support 2.5G/5G/10G. This allows us to automatically
> differentiate 1Gbps and 2.5Gbps PHYs, and therefore to
> remove the fake PHY ID mechanism for RTL8125.
> So far RTL8125 supports 2.5Gbps only, but register layout for faster
> modes has been defined already, so let's use this information to be
> future-proof.
>
> Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Applied.
^ permalink raw reply
* [PATCH net-next v2 1/4] net: bridge: mdb: move vlan comments
From: Nikolay Aleksandrov @ 2019-08-14 17:04 UTC (permalink / raw)
To: netdev; +Cc: davem, roopa, bridge, Nikolay Aleksandrov
In-Reply-To: <20190814170501.1808-1-nikolay@cumulusnetworks.com>
Trivial patch to move the vlan comments in their proper places above the
vid 0 checks.
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
---
net/bridge/br_mdb.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 428af1abf8cc..ee6208c6d946 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -653,9 +653,6 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
br = netdev_priv(dev);
- /* If vlan filtering is enabled and VLAN is not specified
- * install mdb entry on all vlans configured on the port.
- */
pdev = __dev_get_by_index(net, entry->ifindex);
if (!pdev)
return -ENODEV;
@@ -665,6 +662,9 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
return -EINVAL;
vg = nbp_vlan_group(p);
+ /* If vlan filtering is enabled and VLAN is not specified
+ * install mdb entry on all vlans configured on the port.
+ */
if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) {
list_for_each_entry(v, &vg->vlan_list, vlist) {
entry->vid = v->vid;
@@ -745,9 +745,6 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
br = netdev_priv(dev);
- /* If vlan filtering is enabled and VLAN is not specified
- * delete mdb entry on all vlans configured on the port.
- */
pdev = __dev_get_by_index(net, entry->ifindex);
if (!pdev)
return -ENODEV;
@@ -757,6 +754,9 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
return -EINVAL;
vg = nbp_vlan_group(p);
+ /* If vlan filtering is enabled and VLAN is not specified
+ * delete mdb entry on all vlans configured on the port.
+ */
if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) {
list_for_each_entry(v, &vg->vlan_list, vlist) {
entry->vid = v->vid;
--
2.21.0
^ permalink raw reply related
* [PATCH net-next v2 2/4] net: bridge: mdb: factor out mdb filling
From: Nikolay Aleksandrov @ 2019-08-14 17:04 UTC (permalink / raw)
To: netdev; +Cc: davem, roopa, bridge, Nikolay Aleksandrov
In-Reply-To: <20190814170501.1808-1-nikolay@cumulusnetworks.com>
We have to factor out the mdb fill portion in order to re-use it later for
the bridge mdb entries. No functional changes intended.
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
---
net/bridge/br_mdb.c | 68 ++++++++++++++++++++++++---------------------
1 file changed, 37 insertions(+), 31 deletions(-)
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index ee6208c6d946..77730983097e 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -77,6 +77,40 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip)
#endif
}
+static int __mdb_fill_info(struct sk_buff *skb,
+ struct net_bridge_port_group *p)
+{
+ struct nlattr *nest_ent;
+ struct br_mdb_entry e;
+
+ memset(&e, 0, sizeof(e));
+ __mdb_entry_fill_flags(&e, p->flags);
+ e.ifindex = p->port->dev->ifindex;
+ e.vid = p->addr.vid;
+ if (p->addr.proto == htons(ETH_P_IP))
+ e.addr.u.ip4 = p->addr.u.ip4;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (p->addr.proto == htons(ETH_P_IPV6))
+ e.addr.u.ip6 = p->addr.u.ip6;
+#endif
+ e.addr.proto = p->addr.proto;
+ nest_ent = nla_nest_start_noflag(skb,
+ MDBA_MDB_ENTRY_INFO);
+ if (!nest_ent)
+ return -EMSGSIZE;
+
+ if (nla_put_nohdr(skb, sizeof(e), &e) ||
+ nla_put_u32(skb,
+ MDBA_MDB_EATTR_TIMER,
+ br_timer_value(&p->timer))) {
+ nla_nest_cancel(skb, nest_ent);
+ return -EMSGSIZE;
+ }
+ nla_nest_end(skb, nest_ent);
+
+ return 0;
+}
+
static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
struct net_device *dev)
{
@@ -95,7 +129,6 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) {
struct net_bridge_port_group *p;
struct net_bridge_port_group __rcu **pp;
- struct net_bridge_port *port;
if (idx < s_idx)
goto skip;
@@ -108,41 +141,14 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL;
pp = &p->next) {
- struct nlattr *nest_ent;
- struct br_mdb_entry e;
-
- port = p->port;
- if (!port)
+ if (!p->port)
continue;
- memset(&e, 0, sizeof(e));
- e.ifindex = port->dev->ifindex;
- e.vid = p->addr.vid;
- __mdb_entry_fill_flags(&e, p->flags);
- if (p->addr.proto == htons(ETH_P_IP))
- e.addr.u.ip4 = p->addr.u.ip4;
-#if IS_ENABLED(CONFIG_IPV6)
- if (p->addr.proto == htons(ETH_P_IPV6))
- e.addr.u.ip6 = p->addr.u.ip6;
-#endif
- e.addr.proto = p->addr.proto;
- nest_ent = nla_nest_start_noflag(skb,
- MDBA_MDB_ENTRY_INFO);
- if (!nest_ent) {
- nla_nest_cancel(skb, nest2);
- err = -EMSGSIZE;
- goto out;
- }
- if (nla_put_nohdr(skb, sizeof(e), &e) ||
- nla_put_u32(skb,
- MDBA_MDB_EATTR_TIMER,
- br_timer_value(&p->timer))) {
- nla_nest_cancel(skb, nest_ent);
+ err = __mdb_fill_info(skb, p);
+ if (err) {
nla_nest_cancel(skb, nest2);
- err = -EMSGSIZE;
goto out;
}
- nla_nest_end(skb, nest_ent);
}
nla_nest_end(skb, nest2);
skip:
--
2.21.0
^ permalink raw reply related
* [PATCH net-next v2 4/4] net: bridge: mdb: allow add/delete for host-joined groups
From: Nikolay Aleksandrov @ 2019-08-14 17:05 UTC (permalink / raw)
To: netdev; +Cc: davem, roopa, bridge, Nikolay Aleksandrov
In-Reply-To: <20190814170501.1808-1-nikolay@cumulusnetworks.com>
Currently this is needed only for user-space compatibility, so similar
object adds/deletes as the dumped ones would succeed. Later it can be
used for L2 mcast MAC add/delete.
v2: don't send a notification when used from user-space, arm the group
timer if no ports are left after host entry del
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
---
net/bridge/br_mdb.c | 76 +++++++++++++++++++++++++++------------
net/bridge/br_multicast.c | 30 ++++++++++++----
net/bridge/br_private.h | 2 ++
3 files changed, 79 insertions(+), 29 deletions(-)
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 985273425117..e0f789296920 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -616,6 +616,19 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
return err;
}
+ /* host join */
+ if (!port) {
+ /* don't allow any flags for host-joined groups */
+ if (state)
+ return -EINVAL;
+ if (mp->host_joined)
+ return -EEXIST;
+
+ br_multicast_host_join(mp, false);
+
+ return 0;
+ }
+
for (pp = &mp->ports;
(p = mlock_dereference(*pp, br)) != NULL;
pp = &p->next) {
@@ -640,19 +653,21 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br,
{
struct br_ip ip;
struct net_device *dev;
- struct net_bridge_port *p;
+ struct net_bridge_port *p = NULL;
int ret;
if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED))
return -EINVAL;
- dev = __dev_get_by_index(net, entry->ifindex);
- if (!dev)
- return -ENODEV;
+ if (entry->ifindex != br->dev->ifindex) {
+ dev = __dev_get_by_index(net, entry->ifindex);
+ if (!dev)
+ return -ENODEV;
- p = br_port_get_rtnl(dev);
- if (!p || p->br != br || p->state == BR_STATE_DISABLED)
- return -EINVAL;
+ p = br_port_get_rtnl(dev);
+ if (!p || p->br != br || p->state == BR_STATE_DISABLED)
+ return -EINVAL;
+ }
__mdb_entry_to_br_ip(entry, &ip);
@@ -680,15 +695,19 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
br = netdev_priv(dev);
- pdev = __dev_get_by_index(net, entry->ifindex);
- if (!pdev)
- return -ENODEV;
+ if (entry->ifindex != br->dev->ifindex) {
+ pdev = __dev_get_by_index(net, entry->ifindex);
+ if (!pdev)
+ return -ENODEV;
- p = br_port_get_rtnl(pdev);
- if (!p || p->br != br || p->state == BR_STATE_DISABLED)
- return -EINVAL;
+ p = br_port_get_rtnl(pdev);
+ if (!p || p->br != br || p->state == BR_STATE_DISABLED)
+ return -EINVAL;
+ vg = nbp_vlan_group(p);
+ } else {
+ vg = br_vlan_group(br);
+ }
- vg = nbp_vlan_group(p);
/* If vlan filtering is enabled and VLAN is not specified
* install mdb entry on all vlans configured on the port.
*/
@@ -727,6 +746,15 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
if (!mp)
goto unlock;
+ /* host leave */
+ if (entry->ifindex == mp->br->dev->ifindex && mp->host_joined) {
+ br_multicast_host_leave(mp, false);
+ err = 0;
+ if (!mp->ports && netif_running(br->dev))
+ mod_timer(&mp->timer, jiffies);
+ goto unlock;
+ }
+
for (pp = &mp->ports;
(p = mlock_dereference(*pp, br)) != NULL;
pp = &p->next) {
@@ -759,9 +787,9 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
{
struct net *net = sock_net(skb->sk);
struct net_bridge_vlan_group *vg;
+ struct net_bridge_port *p = NULL;
struct net_device *dev, *pdev;
struct br_mdb_entry *entry;
- struct net_bridge_port *p;
struct net_bridge_vlan *v;
struct net_bridge *br;
int err;
@@ -772,15 +800,19 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
br = netdev_priv(dev);
- pdev = __dev_get_by_index(net, entry->ifindex);
- if (!pdev)
- return -ENODEV;
+ if (entry->ifindex != br->dev->ifindex) {
+ pdev = __dev_get_by_index(net, entry->ifindex);
+ if (!pdev)
+ return -ENODEV;
- p = br_port_get_rtnl(pdev);
- if (!p || p->br != br || p->state == BR_STATE_DISABLED)
- return -EINVAL;
+ p = br_port_get_rtnl(pdev);
+ if (!p || p->br != br || p->state == BR_STATE_DISABLED)
+ return -EINVAL;
+ vg = nbp_vlan_group(p);
+ } else {
+ vg = br_vlan_group(br);
+ }
- vg = nbp_vlan_group(p);
/* If vlan filtering is enabled and VLAN is not specified
* delete mdb entry on all vlans configured on the port.
*/
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 9b379e110129..ad12fe3fca8c 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -148,8 +148,7 @@ static void br_multicast_group_expired(struct timer_list *t)
if (!netif_running(br->dev) || timer_pending(&mp->timer))
goto out;
- mp->host_joined = false;
- br_mdb_notify(br->dev, NULL, &mp->addr, RTM_DELMDB, 0);
+ br_multicast_host_leave(mp, true);
if (mp->ports)
goto out;
@@ -512,6 +511,27 @@ static bool br_port_group_equal(struct net_bridge_port_group *p,
return ether_addr_equal(src, p->eth_addr);
}
+void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify)
+{
+ if (!mp->host_joined) {
+ mp->host_joined = true;
+ if (notify)
+ br_mdb_notify(mp->br->dev, NULL, &mp->addr,
+ RTM_NEWMDB, 0);
+ }
+ mod_timer(&mp->timer, jiffies + mp->br->multicast_membership_interval);
+}
+
+void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify)
+{
+ if (!mp->host_joined)
+ return;
+
+ mp->host_joined = false;
+ if (notify)
+ br_mdb_notify(mp->br->dev, NULL, &mp->addr, RTM_DELMDB, 0);
+}
+
static int br_multicast_add_group(struct net_bridge *br,
struct net_bridge_port *port,
struct br_ip *group,
@@ -534,11 +554,7 @@ static int br_multicast_add_group(struct net_bridge *br,
goto err;
if (!port) {
- if (!mp->host_joined) {
- mp->host_joined = true;
- br_mdb_notify(br->dev, NULL, &mp->addr, RTM_NEWMDB, 0);
- }
- mod_timer(&mp->timer, now + br->multicast_membership_interval);
+ br_multicast_host_join(mp, true);
goto out;
}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index b7a4942ff1b3..ce2ab14ee605 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -702,6 +702,8 @@ void br_multicast_get_stats(const struct net_bridge *br,
struct br_mcast_stats *dest);
void br_mdb_init(void);
void br_mdb_uninit(void);
+void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify);
+void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify);
#define mlock_dereference(X, br) \
rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock))
--
2.21.0
^ permalink raw reply related
* [PATCH net-next v2 3/4] net: bridge: mdb: dump host-joined entries as well
From: Nikolay Aleksandrov @ 2019-08-14 17:05 UTC (permalink / raw)
To: netdev; +Cc: davem, roopa, bridge, Nikolay Aleksandrov
In-Reply-To: <20190814170501.1808-1-nikolay@cumulusnetworks.com>
Currently we dump only the port mdb entries but we can have host-joined
entries on the bridge itself and they should be treated as normal temp
mdbs, they're already notified:
$ bridge monitor all
[MDB]dev br0 port br0 grp ff02::8 temp
The group will not be shown in the bridge mdb output, but it takes 1 slot
and it's timing out. If it's only host-joined then the mdb show output
can even be empty.
After this patch we show the host-joined groups:
$ bridge mdb show
dev br0 port br0 grp ff02::8 temp
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
---
net/bridge/br_mdb.c | 41 +++++++++++++++++++++++++++++++----------
1 file changed, 31 insertions(+), 10 deletions(-)
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 77730983097e..985273425117 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -78,22 +78,35 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip)
}
static int __mdb_fill_info(struct sk_buff *skb,
+ struct net_bridge_mdb_entry *mp,
struct net_bridge_port_group *p)
{
+ struct timer_list *mtimer;
struct nlattr *nest_ent;
struct br_mdb_entry e;
+ u8 flags = 0;
+ int ifindex;
memset(&e, 0, sizeof(e));
- __mdb_entry_fill_flags(&e, p->flags);
- e.ifindex = p->port->dev->ifindex;
- e.vid = p->addr.vid;
- if (p->addr.proto == htons(ETH_P_IP))
- e.addr.u.ip4 = p->addr.u.ip4;
+ if (p) {
+ ifindex = p->port->dev->ifindex;
+ mtimer = &p->timer;
+ flags = p->flags;
+ } else {
+ ifindex = mp->br->dev->ifindex;
+ mtimer = &mp->timer;
+ }
+
+ __mdb_entry_fill_flags(&e, flags);
+ e.ifindex = ifindex;
+ e.vid = mp->addr.vid;
+ if (mp->addr.proto == htons(ETH_P_IP))
+ e.addr.u.ip4 = mp->addr.u.ip4;
#if IS_ENABLED(CONFIG_IPV6)
- if (p->addr.proto == htons(ETH_P_IPV6))
- e.addr.u.ip6 = p->addr.u.ip6;
+ if (mp->addr.proto == htons(ETH_P_IPV6))
+ e.addr.u.ip6 = mp->addr.u.ip6;
#endif
- e.addr.proto = p->addr.proto;
+ e.addr.proto = mp->addr.proto;
nest_ent = nla_nest_start_noflag(skb,
MDBA_MDB_ENTRY_INFO);
if (!nest_ent)
@@ -102,7 +115,7 @@ static int __mdb_fill_info(struct sk_buff *skb,
if (nla_put_nohdr(skb, sizeof(e), &e) ||
nla_put_u32(skb,
MDBA_MDB_EATTR_TIMER,
- br_timer_value(&p->timer))) {
+ br_timer_value(mtimer))) {
nla_nest_cancel(skb, nest_ent);
return -EMSGSIZE;
}
@@ -139,12 +152,20 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
break;
}
+ if (mp->host_joined) {
+ err = __mdb_fill_info(skb, mp, NULL);
+ if (err) {
+ nla_nest_cancel(skb, nest2);
+ break;
+ }
+ }
+
for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL;
pp = &p->next) {
if (!p->port)
continue;
- err = __mdb_fill_info(skb, p);
+ err = __mdb_fill_info(skb, mp, p);
if (err) {
nla_nest_cancel(skb, nest2);
goto out;
--
2.21.0
^ permalink raw reply related
* Re: [PATCH] MAINTAINERS: net_failover: Fix typo in a filepath
From: David Miller @ 2019-08-14 17:24 UTC (permalink / raw)
To: efremov; +Cc: linux-kernel, joe, sridhar.samudrala, netdev
In-Reply-To: <20190813060530.13138-1-efremov@linux.com>
From: Denis Efremov <efremov@linux.com>
Date: Tue, 13 Aug 2019 09:05:30 +0300
> Replace "driver" with "drivers" in the filepath to net_failover.c
>
> Cc: Sridhar Samudrala <sridhar.samudrala@intel.com>
> Cc: David S. Miller <davem@davemloft.net>
> Cc: netdev@vger.kernel.org
> Fixes: cfc80d9a1163 ("net: Introduce net_failover driver")
> Signed-off-by: Denis Efremov <efremov@linux.com>
Applied.
^ permalink raw reply
* Re: [RFC PATCH bpf-next 00/14] xdp_flow: Flow offload to XDP
From: Stanislav Fomichev @ 2019-08-14 17:07 UTC (permalink / raw)
To: Toshiaki Makita
Cc: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
Yonghong Song, David S. Miller, Jakub Kicinski,
Jesper Dangaard Brouer, John Fastabend, Jamal Hadi Salim,
Cong Wang, Jiri Pirko, netdev, bpf, William Tu
In-Reply-To: <20190813120558.6151-1-toshiaki.makita1@gmail.com>
On 08/13, Toshiaki Makita wrote:
> * Implementation
>
> xdp_flow makes use of UMH to load an eBPF program for XDP, similar to
> bpfilter. The difference is that xdp_flow does not generate the eBPF
> program dynamically but a prebuilt program is embedded in UMH. This is
> mainly because flow insertion is considerably frequent. If we generate
> and load an eBPF program on each insertion of a flow, the latency of the
> first packet of ping in above test will increase, which I want to avoid.
Can this be instead implemented with a new hook that will be called
for TC events? This hook can write to perf event buffer and control
plane will insert/remove/modify flow tables in the BPF maps (control
plane will also install xdp program).
Why do we need UMH? What am I missing?
^ permalink raw reply
* Re: [net PATCH] net: tls, fix sk_write_space NULL write when tx disabled
From: Jakub Kicinski @ 2019-08-14 17:08 UTC (permalink / raw)
To: John Fastabend; +Cc: davem, ying.xue, netdev, andreyknvl
In-Reply-To: <156576071416.1402.5907777786031481705.stgit@ubuntu3-kvm1>
On Wed, 14 Aug 2019 05:31:54 +0000, John Fastabend wrote:
> The ctx->sk_write_space pointer is only set when TLS tx mode is enabled.
> When running without TX mode it's a null pointer but we still set the
> sk sk_write_space pointer on close().
>
> Fix the close path to only overwrite sk->sk_write_space when the current
> pointer is to the tls_write_space function indicating the tls module should
> clean it up properly as well.
>
> Reported-by: Hillf Danton <hdanton@sina.com>
> Cc: Ying Xue <ying.xue@windriver.com>
> Cc: Andrey Konovalov <andreyknvl@google.com>
> Fixes: 57c722e932cfb ("net/tls: swap sk_write_space on close")
> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Thanks!
^ permalink raw reply
* [PATCH net-next] selftests: Fix get_ifidx and callers in nettest.c
From: David Ahern @ 2019-08-14 17:11 UTC (permalink / raw)
To: davem; +Cc: netdev, dan.carpenter, David Ahern
From: David Ahern <dsahern@gmail.com>
Dan reported:
The patch acda655fefae: "selftests: Add nettest" from Aug 1, 2019,
leads to the following static checker warning:
./tools/testing/selftests/net/nettest.c:1690 main()
warn: unsigned 'tmp' is never less than zero.
./tools/testing/selftests/net/nettest.c
1680 case '1':
1681 args.has_expected_raddr = 1;
1682 if (convert_addr(&args, optarg,
1683 ADDR_TYPE_EXPECTED_REMOTE))
1684 return 1;
1685
1686 break;
1687 case '2':
1688 if (str_to_uint(optarg, 0, 0x7ffffff, &tmp) != 0) {
1689 tmp = get_ifidx(optarg);
1690 if (tmp < 0) {
"tmp" is unsigned so it can't be negative. Also all the callers assume
that get_ifidx() returns negatives on error but it looks like it really
returns zero on error so it's a bit unclear to me.
Update get_ifidx to return -1 on errors and clean up callers of it.
Fixes: acda655fefae ("selftests: Add nettest")
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
---
tools/testing/selftests/net/nettest.c | 25 +++++++++++++------------
1 file changed, 13 insertions(+), 12 deletions(-)
diff --git a/tools/testing/selftests/net/nettest.c b/tools/testing/selftests/net/nettest.c
index 83515e5ea4dc..c08f4db8330d 100644
--- a/tools/testing/selftests/net/nettest.c
+++ b/tools/testing/selftests/net/nettest.c
@@ -266,7 +266,7 @@ static int get_ifidx(const char *ifname)
int sd, rc;
if (!ifname || *ifname == '\0')
- return 0;
+ return -1;
memset(&ifdata, 0, sizeof(ifdata));
@@ -275,14 +275,14 @@ static int get_ifidx(const char *ifname)
sd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
if (sd < 0) {
log_err_errno("socket failed");
- return 0;
+ return -1;
}
rc = ioctl(sd, SIOCGIFINDEX, (char *)&ifdata);
close(sd);
if (rc != 0) {
log_err_errno("ioctl(SIOCGIFINDEX) failed");
- return 0;
+ return -1;
}
return ifdata.ifr_ifindex;
@@ -419,20 +419,20 @@ static int set_multicast_if(int sd, int ifindex)
return rc;
}
-static int set_membership(int sd, uint32_t grp, uint32_t addr, const char *dev)
+static int set_membership(int sd, uint32_t grp, uint32_t addr, int ifindex)
{
uint32_t if_addr = addr;
struct ip_mreqn mreq;
int rc;
- if (addr == htonl(INADDR_ANY) && !dev) {
+ if (addr == htonl(INADDR_ANY) && !ifindex) {
log_error("Either local address or device needs to be given for multicast membership\n");
return -1;
}
mreq.imr_multiaddr.s_addr = grp;
mreq.imr_address.s_addr = if_addr;
- mreq.imr_ifindex = dev ? get_ifidx(dev) : 0;
+ mreq.imr_ifindex = ifindex;
rc = setsockopt(sd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
if (rc < 0) {
@@ -1048,7 +1048,7 @@ static int msock_init(struct sock_args *args, int server)
if (server &&
set_membership(sd, args->grp.s_addr,
- args->local_addr.in.s_addr, args->dev))
+ args->local_addr.in.s_addr, args->ifindex))
goto out_err;
return sd;
@@ -1685,15 +1685,16 @@ int main(int argc, char *argv[])
break;
case '2':
- if (str_to_uint(optarg, 0, 0x7ffffff, &tmp) != 0) {
- tmp = get_ifidx(optarg);
- if (tmp < 0) {
+ if (str_to_uint(optarg, 0, INT_MAX, &tmp) == 0) {
+ args.expected_ifindex = (int)tmp;
+ } else {
+ args.expected_ifindex = get_ifidx(optarg);
+ if (args.expected_ifindex < 0) {
fprintf(stderr,
- "Invalid device index\n");
+ "Invalid expected device\n");
return 1;
}
}
- args.expected_ifindex = (int)tmp;
break;
case 'q':
quiet = 1;
--
2.11.0
^ permalink raw reply related
* Re: fallout from net-next netfilter changes
From: Florian Westphal @ 2019-08-14 17:18 UTC (permalink / raw)
To: David Miller; +Cc: pablo, netfilter-devel, netdev
In-Reply-To: <20190814.125330.1934256694306164517.davem@davemloft.net>
David Miller <davem@davemloft.net> wrote:
> This started happening after Jakub's pull of your net-next changes
> yesterday:
>
> ./include/uapi/linux/netfilter_ipv6/ip6t_LOG.h:5:2: warning: #warning "Please update iptables, this file will be removed soon!" [-Wcpp]
> #warning "Please update iptables, this file will be removed soon!"
> ^~~~~~~
> In file included from <command-line>:
> ./include/uapi/linux/netfilter_ipv4/ipt_LOG.h:5:2: warning: #warning "Please update iptables, this file will be removed soon!" [-Wcpp]
> #warning "Please update iptables, this file will be removed soon!"
> ^~~~~~~
>
> It's probably from the standard kernel build UAPI header checks.
A patch that removes those #warning from the kernel is sitting in
the netfilter patchwork queue already.
^ permalink raw reply
* Re: [RFC bpf-next 0/3] tools: bpftool: add subcommand to count map entries
From: Edward Cree @ 2019-08-14 17:14 UTC (permalink / raw)
To: Quentin Monnet, Alexei Starovoitov
Cc: Alexei Starovoitov, Daniel Borkmann, bpf, netdev, oss-drivers
In-Reply-To: <18f887ec-99fd-20ae-f5d6-a1f4117b2d77@netronome.com>
On 14/08/2019 17:58, Quentin Monnet wrote:
> 2019-08-14 17:45 UTC+0100 ~ Edward Cree <ecree@solarflare.com>
>> This might be a really dumb suggestion, but: you're wanting to collect a
>> summary statistic over an in-kernel data structure in a single syscall,
>> because making a series of syscalls to examine every entry is slow and
>> racy. Isn't that exactly a job for an in-kernel virtual machine, and
>> could you not supply an eBPF program which the kernel runs on each entry
>> in the map, thus supporting people who want to calculate something else
>> (mean, min and max, whatever) instead of count?
>>
> Hi Edward, I like the approach, thanks for the suggestion.
>
> But I did not mention that we were using offloaded maps: Tracing the
> kernel would probably work for programs running on the host, but this is
> not a solution we could extend to hardware offload.
I don't see where "tracing" comes into it; this is a new program type and
a new map op under the bpf() syscall.
Could the user-supplied BPF program not then be passed down to the device
for it to run against its offloaded maps?
^ permalink raw reply
* Re: [RFC bpf-next 0/3] tools: bpftool: add subcommand to count map entries
From: Quentin Monnet @ 2019-08-14 17:12 UTC (permalink / raw)
To: Alexei Starovoitov, Edward Cree
Cc: Alexei Starovoitov, Daniel Borkmann, bpf, Network Development,
oss-drivers
In-Reply-To: <CAADnVQJE2DCU0J2_d4Z-1cmXZsb_q2FODcbC1S24C0f=_b2ffg@mail.gmail.com>
2019-08-14 09:58 UTC-0700 ~ Alexei Starovoitov
<alexei.starovoitov@gmail.com>
> On Wed, Aug 14, 2019 at 9:45 AM Edward Cree <ecree@solarflare.com> wrote:
>>
>> On 14/08/2019 10:42, Quentin Monnet wrote:
>>> 2019-08-13 18:51 UTC-0700 ~ Alexei Starovoitov
>>> <alexei.starovoitov@gmail.com>
>>>> The same can be achieved by 'bpftool map dump|grep key|wc -l', no?
>>> To some extent (with subtleties for some other map types); and we use a
>>> similar command line as a workaround for now. But because of the rate of
>>> inserts/deletes in the map, the process often reports a number higher
>>> than the max number of entries (we observed up to ~750k when max_entries
>>> is 500k), even if the map is only half-full on average during the count.
>>> In the worst case (though not frequent), an entry is deleted just before
>>> we get the next key from it, and iteration starts all over again. This
>>> is not reliable to determine how much space is left in the map.
>>>
>>> I cannot see a solution that would provide a more accurate count from
>>> user space, when the map is under pressure?
>> This might be a really dumb suggestion, but: you're wanting to collect a
>> summary statistic over an in-kernel data structure in a single syscall,
>> because making a series of syscalls to examine every entry is slow and
>> racy. Isn't that exactly a job for an in-kernel virtual machine, and
>> could you not supply an eBPF program which the kernel runs on each entry
>> in the map, thus supporting people who want to calculate something else
>> (mean, min and max, whatever) instead of count?
>
> Pretty much my suggestion as well :)
>
> It seems the better fix for your nat threshold is to keep count of
> elements in the map in a separate global variable that
> bpf program manually increments and decrements.
> bpftool will dump it just as regular map of single element.
> (I believe it doesn't recognize global variables properly yet)
> and BTF will be there to pick exactly that 'count' variable.
>
It would be with an offloaded map, but yes, I suppose we could keep
track of the numbers in a separate map. We'll have a look into this.
Thanks to both of you for the suggestions.
Quentin
^ permalink raw reply
* Re: [PATCH bpf-next 1/4] selftests/bpf: test_progs: change formatting of the condenced output
From: Stanislav Fomichev @ 2019-08-14 17:07 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Stanislav Fomichev, Network Development, bpf, David S. Miller,
Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko
In-Reply-To: <CAADnVQJk=qSLR1A=1poPY85wNqiye3dMvXZOZ+1OFZSA78VARg@mail.gmail.com>
On 08/14, Alexei Starovoitov wrote:
> On Wed, Aug 14, 2019 at 9:47 AM Stanislav Fomichev <sdf@google.com> wrote:
> >
> > This makes it visually simpler to follow the output.
> > Also, highlight with red color failures when outputting to tty.
> >
> > Before:
> > #1 attach_probe:FAIL
> > #2 bpf_obj_id:OK
> > #3/1 bpf_verif_scale:loop3.o:OK
> > #3/2 bpf_verif_scale:test_verif_scale1.o:OK
> > #3/3 bpf_verif_scale:test_verif_scale2.o:OK
> > #3/4 bpf_verif_scale:test_verif_scale3.o:OK
> > #3/5 bpf_verif_scale:pyperf50.o:OK
> > #3/6 bpf_verif_scale:pyperf100.o:OK
> > #3/7 bpf_verif_scale:pyperf180.o:OK
> > #3/8 bpf_verif_scale:pyperf600.o:OK
> > #3/9 bpf_verif_scale:pyperf600_nounroll.o:OK
> > #3/10 bpf_verif_scale:loop1.o:OK
> > #3/11 bpf_verif_scale:loop2.o:OK
> > #3/12 bpf_verif_scale:loop4.o:OK
> > #3/13 bpf_verif_scale:loop5.o:OK
> > #3/14 bpf_verif_scale:strobemeta.o:OK
> > #3/15 bpf_verif_scale:strobemeta_nounroll1.o:OK
> > #3/16 bpf_verif_scale:strobemeta_nounroll2.o:OK
> > #3/17 bpf_verif_scale:test_sysctl_loop1.o:OK
> > #3/18 bpf_verif_scale:test_sysctl_loop2.o:OK
> > #3/19 bpf_verif_scale:test_xdp_loop.o:OK
> > #3/20 bpf_verif_scale:test_seg6_loop.o:OK
> > #3 bpf_verif_scale:OK
> > #4 flow_dissector:OK
> >
> > After:
> > # 1 FAIL attach_probe
> > # 2 OK bpf_obj_id
> > # 3/1 OK bpf_verif_scale:loop3.o
> > # 3/2 OK bpf_verif_scale:test_verif_scale1.o
> > # 3/3 OK bpf_verif_scale:test_verif_scale2.o
> > # 3/4 OK bpf_verif_scale:test_verif_scale3.o
> > # 3/5 OK bpf_verif_scale:pyperf50.o
> > # 3/6 OK bpf_verif_scale:pyperf100.o
> > # 3/7 OK bpf_verif_scale:pyperf180.o
> > # 3/8 OK bpf_verif_scale:pyperf600.o
> > # 3/9 OK bpf_verif_scale:pyperf600_nounroll.o
> > # 3/10 OK bpf_verif_scale:loop1.o
> > # 3/11 OK bpf_verif_scale:loop2.o
> > # 3/12 OK bpf_verif_scale:loop4.o
> > # 3/13 OK bpf_verif_scale:loop5.o
> > # 3/14 OK bpf_verif_scale:strobemeta.o
> > # 3/15 OK bpf_verif_scale:strobemeta_nounroll1.o
> > # 3/16 OK bpf_verif_scale:strobemeta_nounroll2.o
> > # 3/17 OK bpf_verif_scale:test_sysctl_loop1.o
> > # 3/18 OK bpf_verif_scale:test_sysctl_loop2.o
> > # 3/19 OK bpf_verif_scale:test_xdp_loop.o
> > # 3/20 OK bpf_verif_scale:test_seg6_loop.o
> > # 3 OK bpf_verif_scale
> > # 4 OK flow_dissector
>
> sorry this is nack.
> I prefer consistency with test_verifier output.
No problem, let me know how you feel about the other patches
in the series, can drop this one.
^ permalink raw reply
* [PATCH net-next v2 0/4] net: bridge: mdb: allow dump/add/del of host-joined entries
From: Nikolay Aleksandrov @ 2019-08-14 17:04 UTC (permalink / raw)
To: netdev; +Cc: davem, roopa, bridge, Nikolay Aleksandrov
In-Reply-To: <81258876-5f03-002c-5aa8-2d6d00e6d99e@cumulusnetworks.com>
Hi,
This set makes the bridge dump host-joined mdb entries, they should be
treated as normal entries since they take a slot and are aging out.
We already have notifications for them but we couldn't dump them until
now so they remained hidden. We dump them similar to how they're
notified, in order to keep user-space compatibility with the dumped
objects (e.g. iproute2 dumps mdbs in a format which can be fed into
add/del commands) we allow host-joined groups also to be added/deleted via
mdb commands. That can later be used for L2 mcast MAC manipulation as
was recently discussed. Note that iproute2 changes are not necessary,
this set will work with the current user-space mdb code.
Patch 01 - a trivial comment move
Patch 02 - factors out the mdb filling code so it can be
re-used for the host-joined entries
Patch 03 - dumps host-joined entries
Patch 04 - allows manipulation of host-joined entries via standard mdb
calls
v2: change patch 04 to avoid double notification and improve host group
manual removal if no ports are present in the group
Thanks,
Nik
Nikolay Aleksandrov (4):
net: bridge: mdb: move vlan comments
net: bridge: mdb: factor out mdb filling
net: bridge: mdb: dump host-joined entries as well
net: bridge: mdb: allow add/delete for host-joined groups
net/bridge/br_mdb.c | 173 +++++++++++++++++++++++++-------------
net/bridge/br_multicast.c | 30 +++++--
net/bridge/br_private.h | 2 +
3 files changed, 141 insertions(+), 64 deletions(-)
--
2.21.0
^ permalink raw reply
* Re: [PATCH AUTOSEL 4.19 04/42] netfilter: conntrack: always store window size un-scaled
From: Sasha Levin @ 2019-08-14 17:01 UTC (permalink / raw)
To: Jakub Jankowski
Cc: Reindl Harald, Thomas Jarosch, linux-kernel, stable,
Florian Westphal, Jozsef Kadlecsik, Pablo Neira Ayuso,
netfilter-devel, coreteam, netdev
In-Reply-To: <alpine.LNX.2.21.1908141316420.1803@kich.toxcorp.com>
On Wed, Aug 14, 2019 at 01:17:30PM +0200, Jakub Jankowski wrote:
>On 2019-08-14, Reindl Harald wrote:
>
>>that's still not in 5.2.8
>
>It will make its way into next 5.2.x release, as it is now in the
>pending queue: https://git.kernel.org/pub/scm/linux/kernel/git/stable/stable-queue.git/tree/queue-5.2
In general, AUTOSEL patches soak for much longer before they make it to
the queue.
If there's an urgent need for a fix to go in, please make it explicit.
--
Thanks,
Sasha
^ permalink raw reply
* Re: [PATCH bpf-next 1/4] selftests/bpf: test_progs: change formatting of the condenced output
From: Alexei Starovoitov @ 2019-08-14 17:00 UTC (permalink / raw)
To: Stanislav Fomichev
Cc: Network Development, bpf, David S. Miller, Alexei Starovoitov,
Daniel Borkmann, Andrii Nakryiko
In-Reply-To: <20190814164742.208909-2-sdf@google.com>
On Wed, Aug 14, 2019 at 9:47 AM Stanislav Fomichev <sdf@google.com> wrote:
>
> This makes it visually simpler to follow the output.
> Also, highlight with red color failures when outputting to tty.
>
> Before:
> #1 attach_probe:FAIL
> #2 bpf_obj_id:OK
> #3/1 bpf_verif_scale:loop3.o:OK
> #3/2 bpf_verif_scale:test_verif_scale1.o:OK
> #3/3 bpf_verif_scale:test_verif_scale2.o:OK
> #3/4 bpf_verif_scale:test_verif_scale3.o:OK
> #3/5 bpf_verif_scale:pyperf50.o:OK
> #3/6 bpf_verif_scale:pyperf100.o:OK
> #3/7 bpf_verif_scale:pyperf180.o:OK
> #3/8 bpf_verif_scale:pyperf600.o:OK
> #3/9 bpf_verif_scale:pyperf600_nounroll.o:OK
> #3/10 bpf_verif_scale:loop1.o:OK
> #3/11 bpf_verif_scale:loop2.o:OK
> #3/12 bpf_verif_scale:loop4.o:OK
> #3/13 bpf_verif_scale:loop5.o:OK
> #3/14 bpf_verif_scale:strobemeta.o:OK
> #3/15 bpf_verif_scale:strobemeta_nounroll1.o:OK
> #3/16 bpf_verif_scale:strobemeta_nounroll2.o:OK
> #3/17 bpf_verif_scale:test_sysctl_loop1.o:OK
> #3/18 bpf_verif_scale:test_sysctl_loop2.o:OK
> #3/19 bpf_verif_scale:test_xdp_loop.o:OK
> #3/20 bpf_verif_scale:test_seg6_loop.o:OK
> #3 bpf_verif_scale:OK
> #4 flow_dissector:OK
>
> After:
> # 1 FAIL attach_probe
> # 2 OK bpf_obj_id
> # 3/1 OK bpf_verif_scale:loop3.o
> # 3/2 OK bpf_verif_scale:test_verif_scale1.o
> # 3/3 OK bpf_verif_scale:test_verif_scale2.o
> # 3/4 OK bpf_verif_scale:test_verif_scale3.o
> # 3/5 OK bpf_verif_scale:pyperf50.o
> # 3/6 OK bpf_verif_scale:pyperf100.o
> # 3/7 OK bpf_verif_scale:pyperf180.o
> # 3/8 OK bpf_verif_scale:pyperf600.o
> # 3/9 OK bpf_verif_scale:pyperf600_nounroll.o
> # 3/10 OK bpf_verif_scale:loop1.o
> # 3/11 OK bpf_verif_scale:loop2.o
> # 3/12 OK bpf_verif_scale:loop4.o
> # 3/13 OK bpf_verif_scale:loop5.o
> # 3/14 OK bpf_verif_scale:strobemeta.o
> # 3/15 OK bpf_verif_scale:strobemeta_nounroll1.o
> # 3/16 OK bpf_verif_scale:strobemeta_nounroll2.o
> # 3/17 OK bpf_verif_scale:test_sysctl_loop1.o
> # 3/18 OK bpf_verif_scale:test_sysctl_loop2.o
> # 3/19 OK bpf_verif_scale:test_xdp_loop.o
> # 3/20 OK bpf_verif_scale:test_seg6_loop.o
> # 3 OK bpf_verif_scale
> # 4 OK flow_dissector
sorry this is nack.
I prefer consistency with test_verifier output.
^ permalink raw reply
* Re: [PATCH net-next] mcast: ensure L-L IPv6 packets are accepted by bridge
From: Nikolay Aleksandrov @ 2019-08-14 16:58 UTC (permalink / raw)
To: pruddy, Ido Schimmel; +Cc: netdev, roopa, linus.luessing
In-Reply-To: <620d3cfbe58e3ae87ef1d5e7f2aa1588cac3e64a.camel@vyatta.att-mail.com>
On 8/14/19 7:40 PM, Patrick Ruddy wrote:
> Thanks both for the quick replies, answers inline...
>
> On Wed, 2019-08-14 at 02:55 +0300, Nikolay Aleksandrov wrote:
>> On 8/13/19 10:53 PM, Ido Schimmel wrote:
>>> + Bridge maintainers, Linus
>>>
>>
>> Good catch Ido, thanks!
>> First I'd say the subject needs to reflect that this is a bridge change
>> better, please rearrange it like so - bridge: mcast: ...
>> More below,
>>
>>> On Tue, Aug 13, 2019 at 03:18:04PM +0100, Patrick Ruddy wrote:
>>>> At present only all-nodes IPv6 multicast packets are accepted by
>>>> a bridge interface that is not in multicast router mode. Since
>>>> other protocols can be running in the absence of multicast
>>>> forwarding, e.g. OSPFv3 and IPv6 ND, change the test to allow
>>>> all of the FFx2::/16 range to be accepted when not in multicast
>>>> router mode. This aligns the code with IPv4 link-local reception
>>>> and RFC4291
>>>
>>> Can you please quote the relevant part from RFC 4291?
>>>
>>>> Signed-off-by: Patrick Ruddy <pruddy@vyatta.att-mail.com>
>>>> ---
>>>> include/net/addrconf.h | 15 +++++++++++++++
>>>> net/bridge/br_multicast.c | 2 +-
>>>> 2 files changed, 16 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/include/net/addrconf.h b/include/net/addrconf.h
>>>> index becdad576859..05b42867e969 100644
>>>> --- a/include/net/addrconf.h
>>>> +++ b/include/net/addrconf.h
>>>> @@ -434,6 +434,21 @@ static inline void addrconf_addr_solict_mult(const struct in6_addr *addr,
>>>> htonl(0xFF000000) | addr->s6_addr32[3]);
>>>> }
>>>>
>>>> +/*
>>>> + * link local multicast address range ffx2::/16 rfc4291
>>>> + */
>>>> +static inline bool ipv6_addr_is_ll_mcast(const struct in6_addr *addr)
>>>> +{
>>>> +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
>>>> + __be64 *p = (__be64 *)addr;
>>>> + return ((p[0] & cpu_to_be64(0xff0f000000000000UL))
>>>> + ^ cpu_to_be64(0xff02000000000000UL)) == 0UL;
>>>> +#else
>>>> + return ((addr->s6_addr32[0] & htonl(0xff0f0000)) ^
>>>> + htonl(0xff020000)) == 0;
>>>> +#endif
>>>> +}
>>>> +
>>>> static inline bool ipv6_addr_is_ll_all_nodes(const struct in6_addr *addr)
>>>> {
>>>> #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
>>>> diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
>>>> index 9b379e110129..ed3957381fa2 100644
>>>> --- a/net/bridge/br_multicast.c
>>>> +++ b/net/bridge/br_multicast.c
>>>> @@ -1664,7 +1664,7 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
>>>> err = ipv6_mc_check_mld(skb);
>>>>
>>>> if (err == -ENOMSG) {
>>>> - if (!ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr))
>>>> + if (!ipv6_addr_is_ll_mcast(&ipv6_hdr(skb)->daddr))
>>>> BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
>>>
>>> IIUC, you want IPv6 link-local packets to be locally received, but this
>>> also changes how these packets are flooded. RFC 4541 says that packets
>>
>> Indeed, we'll start flooding them all, not just the all hosts address.
>> If that is at all required it'll definitely have to be optional.
>>
>>> addressed to the all hosts address are a special case and should be
>>> forwarded to all ports:
>>>
>>> "In IPv6, the data forwarding rules are more straight forward because MLD is
>>> mandated for addresses with scope 2 (link-scope) or greater. The only exception
>>> is the address FF02::1 which is the all hosts link-scope address for which MLD
>>> messages are never sent. Packets with the all hosts link-scope address should
>>> be forwarded on all ports."
>>>
>>
>> I wonder what is the problem for the host to join such group on behalf of the bridge ?
>> Then you'll receive the traffic at least locally and the RFC says it itself - MLD is mandated
>> for the other link-local addresses.
>> It's very late here and maybe I'm missing something.. :)
>>
> The group is being joined by MLD at the L3 level but the packets are
> not being passed up to the l3 interface because there is an MLD querier
> on the network
>
That shouldn't matter if the host has joined the group, there is a specific
check for that. If the host has joined the group and we have an mdst then
we'll hit this code:
mdst = br_mdb_get(br, skb, vid);
if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
br_multicast_querier_exists(br, eth_hdr(skb))) {
if ((mdst && mdst->host_joined) ||
br_multicast_is_router(br)) {
local_rcv = true;
br->dev->stats.multicast++;
}
mcast_hit = true;
} else {
local_rcv becomes true and the packet is passed up, so what is the problem?
Have you perhaps failed to refresh the group, so that it has expired in the bridge?
> snippet from /proc/net/igmp6
> ...
> 40 sw1 ff0200000000000000000001ff008700 1 00000004 0
> 40 sw1 ff020000000000000000000000000002 1 00000004 0
> 40 sw1 ff020000000000000000000000000001 1 0000000C 0
> 40 sw1 ff010000000000000000000000000001 1 00000008 0
> 41 lo1 ff020000000000000000000000000001 1 0000000C 0
> 41 lo1 ff010000000000000000000000000001 1 00000008 0
> 42 sw1.1 ff020000000000000000000000000006 1 00000004 0
> 42 sw1.1 ff020000000000000000000000000005 1 00000004 0
> 42 sw1.1 ff0200000000000000000001ff000000 2 00000004 0
> 42 sw1.1 ff0200000000000000000001ff008700 1 00000004 0
> 42 sw1.1 ff0200000000000000000001ff000099 1 00000004 0
> 42 sw1.1 ff020000000000000000000000000002 1 00000004 0
> 42 sw1.1 ff020000000000000000000000000001 1 0000000C 0
> 42 sw1.1 ff010000000000000000000000000001 1 00000008 0
> ...
>
> the bridge is sw1 and the l3 interface is sw1.1
>
> Ido is correct about the flooding - I will update the patch with the
> comments and reissue.
>
> Thanks again
>
> -pr
>>
>>> Maybe you want something like:
>>>
>>
>> I think we can do without the new field, either pass local_rcv into br_multicast_rcv() or
>> set it based on return value. The extra test will have to remain unfortunately, but we
>> can reduce the tests by one if carefully done.
>>
>>> diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
>>> index 09b1dd8cd853..9f312a73f61c 100644
>>> --- a/net/bridge/br_input.c
>>> +++ b/net/bridge/br_input.c
>>> @@ -132,7 +132,8 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
>>> if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
>>> br_multicast_querier_exists(br, eth_hdr(skb))) {
>>> if ((mdst && mdst->host_joined) ||
>>> - br_multicast_is_router(br)) {
>>> + br_multicast_is_router(br) ||
>>> + BR_INPUT_SKB_CB_LOCAL_RECEIVE(skb)) {
>>> local_rcv = true;
>>> br->dev->stats.multicast++;
>>> }
>>> diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
>>> index 9b379e110129..f03cecf6174e 100644
>>> --- a/net/bridge/br_multicast.c
>>> +++ b/net/bridge/br_multicast.c
>>> @@ -1667,6 +1667,9 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
>>> if (!ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr))
>>> BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
>>>
>>> + if (ipv6_addr_is_ll_mcast(&ipv6_hdr(skb)->daddr))
>>> + BR_INPUT_SKB_CB(skb)->local_receive = 1;
>>> +
>>> if (ipv6_addr_is_all_snoopers(&ipv6_hdr(skb)->daddr)) {
>>> err = br_ip6_multicast_mrd_rcv(br, port, skb);
>>>
>>> diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
>>> index b7a4942ff1b3..d76394ca4059 100644
>>> --- a/net/bridge/br_private.h
>>> +++ b/net/bridge/br_private.h
>>> @@ -426,6 +426,7 @@ struct br_input_skb_cb {
>>> #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
>>> u8 igmp;
>>> u8 mrouters_only:1;
>>> + u8 local_receive:1;
>>> #endif
>>> u8 proxyarp_replied:1;
>>> u8 src_port_isolated:1;
>>> @@ -445,8 +446,10 @@ struct br_input_skb_cb {
>>>
>>> #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
>>> # define BR_INPUT_SKB_CB_MROUTERS_ONLY(__skb) (BR_INPUT_SKB_CB(__skb)->mrouters_only)
>>> +# define BR_INPUT_SKB_CB_LOCAL_RECEIVE(__skb) (BR_INPUT_SKB_CB(__skb)->local_receive)
>>> #else
>>> # define BR_INPUT_SKB_CB_MROUTERS_ONLY(__skb) (0)
>>> +# define BR_INPUT_SKB_CB_LOCAL_RECEIVE(__skb) (0)
>>> #endif
>>>
>>> #define br_printk(level, br, format, args...) \
>>>
>
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox