* [PATCH v2 04/11] VSOCK: extract connect/accept functions from vsock_diag_test.c
From: Stefano Garzarella @ 2019-08-01 15:25 UTC (permalink / raw)
To: netdev
Cc: kvm, Stefan Hajnoczi, Dexuan Cui, virtualization, David S. Miller,
Jorgen Hansen, linux-kernel
In-Reply-To: <20190801152541.245833-1-sgarzare@redhat.com>
From: Stefan Hajnoczi <stefanha@redhat.com>
Many test cases will need to connect to the server or accept incoming
connections. This patch extracts these operations into utility
functions that can be reused.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
---
tools/testing/vsock/util.c | 108 ++++++++++++++++++++++++++
tools/testing/vsock/util.h | 6 ++
tools/testing/vsock/vsock_diag_test.c | 81 ++-----------------
3 files changed, 119 insertions(+), 76 deletions(-)
diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c
index f40f45b36d2f..f838bcee3589 100644
--- a/tools/testing/vsock/util.c
+++ b/tools/testing/vsock/util.c
@@ -11,8 +11,10 @@
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
+#include <unistd.h>
#include "timeout.h"
+#include "control.h"
#include "util.h"
/* Install signal handlers */
@@ -41,6 +43,112 @@ unsigned int parse_cid(const char *str)
return n;
}
+/* Connect to <cid, port> and return the file descriptor. */
+int vsock_stream_connect(unsigned int cid, unsigned int port)
+{
+ union {
+ struct sockaddr sa;
+ struct sockaddr_vm svm;
+ } addr = {
+ .svm = {
+ .svm_family = AF_VSOCK,
+ .svm_port = port,
+ .svm_cid = cid,
+ },
+ };
+ int ret;
+ int fd;
+
+ control_expectln("LISTENING");
+
+ fd = socket(AF_VSOCK, SOCK_STREAM, 0);
+
+ timeout_begin(TIMEOUT);
+ do {
+ ret = connect(fd, &addr.sa, sizeof(addr.svm));
+ timeout_check("connect");
+ } while (ret < 0 && errno == EINTR);
+ timeout_end();
+
+ if (ret < 0) {
+ int old_errno = errno;
+
+ close(fd);
+ fd = -1;
+ errno = old_errno;
+ }
+ return fd;
+}
+
+/* Listen on <cid, port> and return the first incoming connection. The remote
+ * address is stored to clientaddrp. clientaddrp may be NULL.
+ */
+int vsock_stream_accept(unsigned int cid, unsigned int port,
+ struct sockaddr_vm *clientaddrp)
+{
+ union {
+ struct sockaddr sa;
+ struct sockaddr_vm svm;
+ } addr = {
+ .svm = {
+ .svm_family = AF_VSOCK,
+ .svm_port = port,
+ .svm_cid = cid,
+ },
+ };
+ union {
+ struct sockaddr sa;
+ struct sockaddr_vm svm;
+ } clientaddr;
+ socklen_t clientaddr_len = sizeof(clientaddr.svm);
+ int fd;
+ int client_fd;
+ int old_errno;
+
+ fd = socket(AF_VSOCK, SOCK_STREAM, 0);
+
+ if (bind(fd, &addr.sa, sizeof(addr.svm)) < 0) {
+ perror("bind");
+ exit(EXIT_FAILURE);
+ }
+
+ if (listen(fd, 1) < 0) {
+ perror("listen");
+ exit(EXIT_FAILURE);
+ }
+
+ control_writeln("LISTENING");
+
+ timeout_begin(TIMEOUT);
+ do {
+ client_fd = accept(fd, &clientaddr.sa, &clientaddr_len);
+ timeout_check("accept");
+ } while (client_fd < 0 && errno == EINTR);
+ timeout_end();
+
+ old_errno = errno;
+ close(fd);
+ errno = old_errno;
+
+ if (client_fd < 0)
+ return client_fd;
+
+ if (clientaddr_len != sizeof(clientaddr.svm)) {
+ fprintf(stderr, "unexpected addrlen from accept(2), %zu\n",
+ (size_t)clientaddr_len);
+ exit(EXIT_FAILURE);
+ }
+ if (clientaddr.sa.sa_family != AF_VSOCK) {
+ fprintf(stderr, "expected AF_VSOCK from accept(2), got %d\n",
+ clientaddr.sa.sa_family);
+ exit(EXIT_FAILURE);
+ }
+
+ if (clientaddrp)
+ *clientaddrp = clientaddr.svm;
+ return client_fd;
+}
+
/* Run test cases. The program terminates if a failure occurs. */
void run_tests(const struct test_case *test_cases,
const struct test_opts *opts)
diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h
index 033e7d59a42a..1786305cfddd 100644
--- a/tools/testing/vsock/util.h
+++ b/tools/testing/vsock/util.h
@@ -2,6 +2,9 @@
#ifndef UTIL_H
#define UTIL_H
+#include <sys/socket.h>
+#include <linux/vm_sockets.h>
+
/* Tests can either run as the client or the server */
enum test_mode {
TEST_MODE_UNSET,
@@ -30,6 +33,9 @@ struct test_case {
void init_signals(void);
unsigned int parse_cid(const char *str);
+int vsock_stream_connect(unsigned int cid, unsigned int port);
+int vsock_stream_accept(unsigned int cid, unsigned int port,
+ struct sockaddr_vm *clientaddrp);
void run_tests(const struct test_case *test_cases,
const struct test_opts *opts);
diff --git a/tools/testing/vsock/vsock_diag_test.c b/tools/testing/vsock/vsock_diag_test.c
index 944c8a72eed7..abd7dc2a9631 100644
--- a/tools/testing/vsock/vsock_diag_test.c
+++ b/tools/testing/vsock/vsock_diag_test.c
@@ -13,13 +13,11 @@
#include <string.h>
#include <errno.h>
#include <unistd.h>
-#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <linux/list.h>
#include <linux/net.h>
#include <linux/netlink.h>
-#include <linux/vm_sockets.h>
#include <linux/sock_diag.h>
#include <linux/vm_sockets_diag.h>
#include <netinet/tcp.h>
@@ -378,33 +376,12 @@ static void test_listen_socket_server(const struct test_opts *opts)
static void test_connect_client(const struct test_opts *opts)
{
- union {
- struct sockaddr sa;
- struct sockaddr_vm svm;
- } addr = {
- .svm = {
- .svm_family = AF_VSOCK,
- .svm_port = 1234,
- .svm_cid = opts->peer_cid,
- },
- };
int fd;
- int ret;
LIST_HEAD(sockets);
struct vsock_stat *st;
- control_expectln("LISTENING");
-
- fd = socket(AF_VSOCK, SOCK_STREAM, 0);
-
- timeout_begin(TIMEOUT);
- do {
- ret = connect(fd, &addr.sa, sizeof(addr.svm));
- timeout_check("connect");
- } while (ret < 0 && errno == EINTR);
- timeout_end();
-
- if (ret < 0) {
+ fd = vsock_stream_connect(opts->peer_cid, 1234);
+ if (fd < 0) {
perror("connect");
exit(EXIT_FAILURE);
}
@@ -424,66 +401,19 @@ static void test_connect_client(const struct test_opts *opts)
static void test_connect_server(const struct test_opts *opts)
{
- union {
- struct sockaddr sa;
- struct sockaddr_vm svm;
- } addr = {
- .svm = {
- .svm_family = AF_VSOCK,
- .svm_port = 1234,
- .svm_cid = VMADDR_CID_ANY,
- },
- };
- union {
- struct sockaddr sa;
- struct sockaddr_vm svm;
- } clientaddr;
- socklen_t clientaddr_len = sizeof(clientaddr.svm);
- LIST_HEAD(sockets);
struct vsock_stat *st;
- int fd;
+ LIST_HEAD(sockets);
int client_fd;
- fd = socket(AF_VSOCK, SOCK_STREAM, 0);
-
- if (bind(fd, &addr.sa, sizeof(addr.svm)) < 0) {
- perror("bind");
- exit(EXIT_FAILURE);
- }
-
- if (listen(fd, 1) < 0) {
- perror("listen");
- exit(EXIT_FAILURE);
- }
-
- control_writeln("LISTENING");
-
- timeout_begin(TIMEOUT);
- do {
- client_fd = accept(fd, &clientaddr.sa, &clientaddr_len);
- timeout_check("accept");
- } while (client_fd < 0 && errno == EINTR);
- timeout_end();
-
+ client_fd = vsock_stream_accept(VMADDR_CID_ANY, 1234, NULL);
if (client_fd < 0) {
perror("accept");
exit(EXIT_FAILURE);
}
- if (clientaddr.sa.sa_family != AF_VSOCK) {
- fprintf(stderr, "expected AF_VSOCK from accept(2), got %d\n",
- clientaddr.sa.sa_family);
- exit(EXIT_FAILURE);
- }
- if (clientaddr.svm.svm_cid != opts->peer_cid) {
- fprintf(stderr, "expected peer CID %u from accept(2), got %u\n",
- opts->peer_cid, clientaddr.svm.svm_cid);
- exit(EXIT_FAILURE);
- }
read_vsock_stat(&sockets);
- check_num_sockets(&sockets, 2);
- find_vsock_stat(&sockets, fd);
+ check_num_sockets(&sockets, 1);
st = find_vsock_stat(&sockets, client_fd);
check_socket_state(st, TCP_ESTABLISHED);
@@ -491,7 +421,6 @@ static void test_connect_server(const struct test_opts *opts)
control_expectln("DONE");
close(client_fd);
- close(fd);
free_sock_stat(&sockets);
}
--
2.20.1
^ permalink raw reply related
* [PATCH v2 02/11] VSOCK: add SPDX identifiers to vsock tests
From: Stefano Garzarella @ 2019-08-01 15:25 UTC (permalink / raw)
To: netdev
Cc: kvm, Stefan Hajnoczi, Dexuan Cui, virtualization, David S. Miller,
Jorgen Hansen, linux-kernel
In-Reply-To: <20190801152541.245833-1-sgarzare@redhat.com>
From: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
---
v2:
* Aligned with the current SPDX [Stefano]
---
tools/testing/vsock/control.h | 1 +
tools/testing/vsock/timeout.h | 1 +
2 files changed, 2 insertions(+)
diff --git a/tools/testing/vsock/control.h b/tools/testing/vsock/control.h
index 54a07efd267c..dac3964a891d 100644
--- a/tools/testing/vsock/control.h
+++ b/tools/testing/vsock/control.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef CONTROL_H
#define CONTROL_H
diff --git a/tools/testing/vsock/timeout.h b/tools/testing/vsock/timeout.h
index 77db9ce9860a..ecb7c840e65a 100644
--- a/tools/testing/vsock/timeout.h
+++ b/tools/testing/vsock/timeout.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef TIMEOUT_H
#define TIMEOUT_H
--
2.20.1
^ permalink raw reply related
* [PATCH v2 01/11] VSOCK: fix header include in vsock_diag_test
From: Stefano Garzarella @ 2019-08-01 15:25 UTC (permalink / raw)
To: netdev
Cc: kvm, Stefan Hajnoczi, Dexuan Cui, virtualization, David S. Miller,
Jorgen Hansen, linux-kernel
In-Reply-To: <20190801152541.245833-1-sgarzare@redhat.com>
From: Stefan Hajnoczi <stefanha@redhat.com>
The vsock_diag_test program directly included ../../../include/uapi/
headers from the source tree. Tests are supposed to use the
usr/include/linux/ headers that have been prepared with make
headers_install instead.
Suggested-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
---
tools/testing/vsock/Makefile | 2 +-
tools/testing/vsock/README | 2 +-
tools/testing/vsock/vsock_diag_test.c | 5 ++---
3 files changed, 4 insertions(+), 5 deletions(-)
diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile
index 5be687b1e16c..d41a4e13960a 100644
--- a/tools/testing/vsock/Makefile
+++ b/tools/testing/vsock/Makefile
@@ -3,7 +3,7 @@ all: test
test: vsock_diag_test
vsock_diag_test: vsock_diag_test.o timeout.o control.o
-CFLAGS += -g -O2 -Werror -Wall -I. -I../../include/uapi -I../../include -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE -D_GNU_SOURCE
+CFLAGS += -g -O2 -Werror -Wall -I. -I../../include -I../../../usr/include -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE -D_GNU_SOURCE
.PHONY: all test clean
clean:
${RM} *.o *.d vsock_diag_test
diff --git a/tools/testing/vsock/README b/tools/testing/vsock/README
index 2cc6d7302db6..cf7dc64273bf 100644
--- a/tools/testing/vsock/README
+++ b/tools/testing/vsock/README
@@ -10,7 +10,7 @@ The following tests are available:
The following prerequisite steps are not automated and must be performed prior
to running tests:
-1. Build the kernel and these tests.
+1. Build the kernel, make headers_install, and build these tests.
2. Install the kernel and tests on the host.
3. Install the kernel and tests inside the guest.
4. Boot the guest and ensure that the AF_VSOCK transport is enabled.
diff --git a/tools/testing/vsock/vsock_diag_test.c b/tools/testing/vsock/vsock_diag_test.c
index c481101364a4..fc391e041954 100644
--- a/tools/testing/vsock/vsock_diag_test.c
+++ b/tools/testing/vsock/vsock_diag_test.c
@@ -21,12 +21,11 @@
#include <linux/list.h>
#include <linux/net.h>
#include <linux/netlink.h>
+#include <linux/vm_sockets.h>
#include <linux/sock_diag.h>
+#include <linux/vm_sockets_diag.h>
#include <netinet/tcp.h>
-#include "../../../include/uapi/linux/vm_sockets.h"
-#include "../../../include/uapi/linux/vm_sockets_diag.h"
-
#include "timeout.h"
#include "control.h"
--
2.20.1
^ permalink raw reply related
* [PATCH v2 00/11] VSOCK: add vsock_test test suite
From: Stefano Garzarella @ 2019-08-01 15:25 UTC (permalink / raw)
To: netdev
Cc: kvm, Stefan Hajnoczi, Dexuan Cui, virtualization, David S. Miller,
Jorgen Hansen, linux-kernel
The vsock_diag.ko module already has a test suite but the core AF_VSOCK
functionality has no tests. This patch series adds several test cases that
exercise AF_VSOCK SOCK_STREAM socket semantics (send/recv, connect/accept,
half-closed connections, simultaneous connections).
Stefan: Do you think we should have a single application or is better to
split it in single tests (e.g. vsock_test_send_recv, vsock_test_half_close,
vsock_test_multiconnection, etc.)?
Jorgen: Please test the VMCI transport and let me know if the fixes work.
I added the '--transport' parameter to skip the read() on the half-closed
connection test.
Dexuan: Do you think can be useful to test HyperV?
The v1 of this series was originally sent by Stefan. I rebased on master
and tried to fix some issues reported by Jorgen.
v2:
- Patch 1 added by Stefan to fix header include in vsock_diag_test.
- Patch 2 added by Stefan to add SPDX identifiers. I modified it to be
aligned with SPDX currently used on master.
- Patch 3:
* fixed SPDX [Stefano].
- Patch 7:
* Drop unnecessary includes [Stefan]
* Fixed SPDX [Stefano]
* Set MULTICONN_NFDS to 100 [Stefano]
* Changed (i % 1) in (i % 2) in the 'multiconn' test [Stefano]
- Patches 8,9,10 added to skip read after close in test_stream*close tests
on a VMCI host.
- Patch 11 added to wait for the remote to close the connection, before
check if a send returns -EPIPE.
v1: https://patchwork.ozlabs.org/cover/847998/
Stefan Hajnoczi (7):
VSOCK: fix header include in vsock_diag_test
VSOCK: add SPDX identifiers to vsock tests
VSOCK: extract utility functions from vsock_diag_test.c
VSOCK: extract connect/accept functions from vsock_diag_test.c
VSOCK: add full barrier between test cases
VSOCK: add send_byte()/recv_byte() test utilities
VSOCK: add AF_VSOCK test cases
Stefano Garzarella (4):
VSOCK: add vsock_get_local_cid() test utility
vsock_test: add --transport parameter
vsock_test: skip read() in test_stream*close tests on a VMCI host
vsock_test: wait for the remote to close the connection
tools/testing/vsock/.gitignore | 1 +
tools/testing/vsock/Makefile | 9 +-
tools/testing/vsock/README | 3 +-
tools/testing/vsock/control.h | 1 +
tools/testing/vsock/timeout.h | 1 +
tools/testing/vsock/util.c | 342 ++++++++++++++++++++++++
tools/testing/vsock/util.h | 54 ++++
tools/testing/vsock/vsock_diag_test.c | 170 ++----------
tools/testing/vsock/vsock_test.c | 362 ++++++++++++++++++++++++++
9 files changed, 793 insertions(+), 150 deletions(-)
create mode 100644 tools/testing/vsock/util.c
create mode 100644 tools/testing/vsock/util.h
create mode 100644 tools/testing/vsock/vsock_test.c
--
2.20.1
^ permalink raw reply
* Re: [net-next,rfc] net: bridge: mdb: Extend with multicast LLADDR
From: Nikolay Aleksandrov @ 2019-08-01 15:25 UTC (permalink / raw)
To: Horatiu Vultur, idosch, andrew, allan.nielsen
Cc: davem, roopa, petrm, tglx, fw, netdev, linux-kernel, bridge
In-Reply-To: <696c9bcc-f7e3-3d22-69c4-cdf4f37280a9@cumulusnetworks.com>
On 01/08/2019 17:15, Nikolay Aleksandrov wrote:
> On 01/08/2019 17:11, Nikolay Aleksandrov wrote:
>> On 01/08/2019 17:07, Nikolay Aleksandrov wrote:
>>> Hi Horatiu,
>>> Overall I think MDB is the right way, we'd like to contain the multicast code.
>>> A few comments below.
>>>
>>> On 01/08/2019 15:50, Horatiu Vultur wrote:
>> [snip]
>>>>
>>>> Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
>>>> Co-developed-by: Allan W. Nielsen <allan.nielsen@microchip.com>
>>>> Signed-off-by: Allan W. Nielsen <allan.nielsen@microchip.com>
>>>> ---
>>>> include/linux/if_bridge.h | 1 +
>>>> include/uapi/linux/if_bridge.h | 1 +
>>>> net/bridge/br_device.c | 7 +++++--
>>>> net/bridge/br_forward.c | 3 ++-
>>>> net/bridge/br_input.c | 13 ++++++++++--
>>>> net/bridge/br_mdb.c | 47 +++++++++++++++++++++++++++++++++++-------
>>>> net/bridge/br_multicast.c | 4 +++-
>>>> net/bridge/br_private.h | 3 ++-
>>>> 8 files changed, 64 insertions(+), 15 deletions(-)
>>>>
>>>
>>> Overall I don't think we need this BR_PKT_MULTICAST_L2, we could do the below much
>>> easier and without the checks if you use a per-mdb flag that says it's to be treated
>>> as a MULTICAST_L2 entry. Then you remove all of the BR_PKT_MULTICAST_L2 code (see the
>>> attached patch based on this one for example). and continue processing it as it is processed today.
>>> We'll keep the fast-path with minimal number of new conditionals.
>>>
>>> Something like the patch I've attached to this reply, note that it is not complete
>>> just to show the intent, you'll have to re-work br_mdb_notify() to make it proper
>>> and there're most probably other details I've missed. If you find even better/less
>>> complex way to do it then please do.
>>>
>>> Cheers,
>>> Nik
>>
>> Oops, I sent back your original patch. Here's the actually changed version
>> I was talking about.
>>
>> Thanks,
>> Nik
>>
>>
>>
>
> The querier exists change is a hack just to get the point, I'd prefer
> to re-write that portion in a better way which makes more sense, i.e.
> get that check out of there since it doesn't mean that an actual querier
> exists. :)
>
TBH, I'm inclined to just use proto == 0 *internally* as this even though it's reserved,
we're not putting it on the wire or using it to construct packets, it's just internal
use which can change into a flag if some day that value needs to be used. Obviously
to user-space we need it to be a flag, we can't expose or configure it as a proto value
without making it permanent uapi. I haven't looked into detail how feasible this is,
just a thought that might make it simpler.
^ permalink raw reply
* Re: [PATCH iproute2-next] ip tunnel: add json output
From: Stephen Hemminger @ 2019-08-01 15:16 UTC (permalink / raw)
To: Andrea Claudi; +Cc: netdev, dsahern
In-Reply-To: <7090709d3ddace589952a128fb62f6603e2da9e8.1564653511.git.aclaudi@redhat.com>
On Thu, 1 Aug 2019 12:12:58 +0200
Andrea Claudi <aclaudi@redhat.com> wrote:
> Add json support on iptunnel and ip6tunnel.
> The plain text output format should remain the same.
>
> Signed-off-by: Andrea Claudi <aclaudi@redhat.com>
> ---
> ip/ip6tunnel.c | 82 +++++++++++++++++++++++++++++++--------------
> ip/iptunnel.c | 90 +++++++++++++++++++++++++++++++++-----------------
> ip/tunnel.c | 42 +++++++++++++++++------
> 3 files changed, 148 insertions(+), 66 deletions(-)
>
> diff --git a/ip/ip6tunnel.c b/ip/ip6tunnel.c
> index d7684a673fdc4..f2b9710c1320f 100644
> --- a/ip/ip6tunnel.c
> +++ b/ip/ip6tunnel.c
> @@ -71,57 +71,90 @@ static void usage(void)
> static void print_tunnel(const void *t)
> {
> const struct ip6_tnl_parm2 *p = t;
> - char s1[1024];
> - char s2[1024];
> + SPRINT_BUF(b1);
>
> /* Do not use format_host() for local addr,
> * symbolic name will not be useful.
> */
> - printf("%s: %s/ipv6 remote %s local %s",
> - p->name,
> - tnl_strproto(p->proto),
> - format_host_r(AF_INET6, 16, &p->raddr, s1, sizeof(s1)),
> - rt_addr_n2a_r(AF_INET6, 16, &p->laddr, s2, sizeof(s2)));
> + open_json_object(NULL);
> + print_string(PRINT_ANY, "ifname", "%s: ", p->name);
Print this using color for interface name?
> + snprintf(b1, sizeof(b1), "%s/ipv6", tnl_strproto(p->proto));
> + print_string(PRINT_ANY, "mode", "%s ", b1);
> + print_string(PRINT_ANY,
> + "remote",
> + "remote %s ",
> + format_host_r(AF_INET6, 16, &p->raddr, b1, sizeof(b1)));
> + print_string(PRINT_ANY,
> + "local",
> + "local %s",
> + rt_addr_n2a_r(AF_INET6, 16, &p->laddr, b1, sizeof(b1)));
> +
> if (p->link) {
> const char *n = ll_index_to_name(p->link);
>
> if (n)
> - printf(" dev %s", n);
> + print_string(PRINT_ANY, "link", " dev %s", n);
> }
>
> if (p->flags & IP6_TNL_F_IGN_ENCAP_LIMIT)
> - printf(" encaplimit none");
> + print_bool(PRINT_ANY,
> + "ip6_tnl_f_ign_encap_limit",
> + " encaplimit none",
> + true);
For flags like this, print_null is more typical JSON than a boolean
value. Null is better for presence flag. Bool is better if both true and
false are printed.
^ permalink raw reply
* Fw: [Bug 204399] New: error in handling vlan tag by raw ethernet socket
From: Stephen Hemminger @ 2019-08-01 14:51 UTC (permalink / raw)
To: netdev
Begin forwarded message:
Date: Thu, 01 Aug 2019 06:37:39 +0000
From: bugzilla-daemon@bugzilla.kernel.org
To: stephen@networkplumber.org
Subject: [Bug 204399] New: error in handling vlan tag by raw ethernet socket
https://bugzilla.kernel.org/show_bug.cgi?id=204399
Bug ID: 204399
Summary: error in handling vlan tag by raw ethernet socket
Product: Networking
Version: 2.5
Kernel Version: 5.0.0-21-generic #22-Ubuntu x86_64
Hardware: All
OS: Linux
Tree: Mainline
Status: NEW
Severity: high
Priority: P1
Component: IPV4
Assignee: stephen@networkplumber.org
Reporter: Lvenkatakumarchakka@gmail.com
Regression: No
Created attachment 284067
--> https://bugzilla.kernel.org/attachment.cgi?id=284067&action=edit
set/unset the variable "stripping_vlan_tag" to reproduce the issue
I am using raw ethernet socket and sending/receiving network traffic.
problem I am seeing is if read socket is opened before starting sending the
vlan traffic, read socket is able to capture packets along with vlan tag. but
if I open the read socket before starting sending the packets, vlan tag is
being stripped and only those four bytes are missing from the packet. Rest of
the packet is intact.
Please refer to the small code snippet which consistently produces the bug.
set/unset the variable "stripping_vlan_tag" to reproduce the issue.
is this a bug in the kernel or any problem with my coding ?
I am working on a project where I will be opening all the required sockets at
once and will be using as and when required. In that case, I am not able to
capture vlan tags which is blocking me. However, tcpdump is capture along with
vlan tag.
please be noted that, both the cards are in same system and are connected just
back to back using a cat5 cable. eth0 is built-in card and eth1 is usb based
external network card.
Is there a way to get rid of this issue ?
Best Regards,
Lokesh.
--
You are receiving this mail because:
You are the assignee for the bug.
^ permalink raw reply
* Re: [PATCH mlx5-next 1/3] IB/mlx5: Query ODP capabilities for DC
From: Leon Romanovsky @ 2019-08-01 14:51 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: Doug Ledford, RDMA mailing list, Michael Guralnik, Moni Shoua,
Saeed Mahameed, linux-netdev
In-Reply-To: <20190801142511.GE23885@mellanox.com>
On Thu, Aug 01, 2019 at 02:25:16PM +0000, Jason Gunthorpe wrote:
> On Thu, Aug 01, 2019 at 03:21:37PM +0300, Leon Romanovsky wrote:
>
> > diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
> > index ec571fd7fcf8..5eae8d734435 100644
> > +++ b/include/linux/mlx5/mlx5_ifc.h
> > @@ -944,7 +944,9 @@ struct mlx5_ifc_odp_cap_bits {
> >
> > struct mlx5_ifc_odp_per_transport_service_cap_bits xrc_odp_caps;
> >
> > - u8 reserved_at_100[0x700];
> > + struct mlx5_ifc_odp_per_transport_service_cap_bits dc_odp_caps;
> > +
> > + u8 reserved_at_100[0x6E0];
> > };
>
> Not splitting this to mlx5-next?
This whole patch goes to mlx5-next, because it touches
drivers/net/ethernet/mellanox/mlx5/core/main.c too and splitting
to ifc specific patch won't change anything.
Thanks
>
> Jason
^ permalink raw reply
* Re: [PATCH mlx5-next 1/3] IB/mlx5: Query ODP capabilities for DC
From: Jason Gunthorpe @ 2019-08-01 14:25 UTC (permalink / raw)
To: Leon Romanovsky
Cc: Doug Ledford, Leon Romanovsky, RDMA mailing list,
Michael Guralnik, Moni Shoua, Saeed Mahameed, linux-netdev
In-Reply-To: <20190801122139.25224-2-leon@kernel.org>
On Thu, Aug 01, 2019 at 03:21:37PM +0300, Leon Romanovsky wrote:
> diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
> index ec571fd7fcf8..5eae8d734435 100644
> +++ b/include/linux/mlx5/mlx5_ifc.h
> @@ -944,7 +944,9 @@ struct mlx5_ifc_odp_cap_bits {
>
> struct mlx5_ifc_odp_per_transport_service_cap_bits xrc_odp_caps;
>
> - u8 reserved_at_100[0x700];
> + struct mlx5_ifc_odp_per_transport_service_cap_bits dc_odp_caps;
> +
> + u8 reserved_at_100[0x6E0];
> };
Not splitting this to mlx5-next?
Jason
^ permalink raw reply
* Re: [PATCH rdma-next 0/3] ODP support for mlx5 DC QPs
From: Jason Gunthorpe @ 2019-08-01 14:24 UTC (permalink / raw)
To: Leon Romanovsky
Cc: Doug Ledford, Leon Romanovsky, RDMA mailing list,
Michael Guralnik, Moni Shoua, Saeed Mahameed, linux-netdev
In-Reply-To: <20190801122139.25224-1-leon@kernel.org>
On Thu, Aug 01, 2019 at 03:21:36PM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@mellanox.com>
>
> From Michael,
>
> The series adds support for on-demand paging for DC transport.
> Adding handling of DC WQE parsing upon page faults and exposing
> capabilities.
>
> As DC is mlx-only transport, the capabilities are exposed to the user
> using the direct-verbs mechanism. Namely through the mlx5dv_query_device.
The cover letter should like to the RDMA core PR that uses the new
API...
Jason
^ permalink raw reply
* Re: [PATCH] net: bridge: Allow bridge to joing multicast groups
From: Allan W. Nielsen @ 2019-08-01 14:22 UTC (permalink / raw)
To: Nikolay Aleksandrov
Cc: Horatiu Vultur, roopa, davem, bridge, netdev, linux-kernel
In-Reply-To: <b755f613-e6d8-a2e6-16cd-6f13ec0a6ddc@cumulusnetworks.com>
The 07/26/2019 15:31, Nikolay Aleksandrov wrote:
...
> You know that in order to not run in promisc mode you'll have to disable
> port flooding and port learning, right ? Otherwise they're always put in promisc.
Yes, we have spend some time looking at nbp_update_port_count and trying to
understand the reasoning behind it.
Our understanding is that this is to make it work with a pure SW bridge
implementation, and this is actually an optimization to allow disable promisc
mode if all forwarding is static (no flooding and no learning).
We also noticed that the Ocelot and the Rocker drivers avoids this "issue" by
not implementing promisc mode.
But promisc mode is a really nice feature for debugging, and we would actually
like to have it, and when HW that can do learning/flooding it does not seem to
be necessary.
I tried to understand how this is handled in the Mellanox drivers, but gave up.
Too big, and we lack the insight in their design.
Do you know if there are better ways to prevent switchdev-offloaded-slave
interfaces to go to promisc mode?
/Allan
^ permalink raw reply
* Re: [net-next,rfc] net: bridge: mdb: Extend with multicast LLADDR
From: Nikolay Aleksandrov @ 2019-08-01 14:15 UTC (permalink / raw)
To: Horatiu Vultur, idosch, andrew, allan.nielsen
Cc: davem, roopa, petrm, tglx, fw, netdev, linux-kernel, bridge
In-Reply-To: <1db865a6-9deb-fbd2-dee6-83609fcc2d95@cumulusnetworks.com>
On 01/08/2019 17:11, Nikolay Aleksandrov wrote:
> On 01/08/2019 17:07, Nikolay Aleksandrov wrote:
>> Hi Horatiu,
>> Overall I think MDB is the right way, we'd like to contain the multicast code.
>> A few comments below.
>>
>> On 01/08/2019 15:50, Horatiu Vultur wrote:
> [snip]
>>>
>>> Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
>>> Co-developed-by: Allan W. Nielsen <allan.nielsen@microchip.com>
>>> Signed-off-by: Allan W. Nielsen <allan.nielsen@microchip.com>
>>> ---
>>> include/linux/if_bridge.h | 1 +
>>> include/uapi/linux/if_bridge.h | 1 +
>>> net/bridge/br_device.c | 7 +++++--
>>> net/bridge/br_forward.c | 3 ++-
>>> net/bridge/br_input.c | 13 ++++++++++--
>>> net/bridge/br_mdb.c | 47 +++++++++++++++++++++++++++++++++++-------
>>> net/bridge/br_multicast.c | 4 +++-
>>> net/bridge/br_private.h | 3 ++-
>>> 8 files changed, 64 insertions(+), 15 deletions(-)
>>>
>>
>> Overall I don't think we need this BR_PKT_MULTICAST_L2, we could do the below much
>> easier and without the checks if you use a per-mdb flag that says it's to be treated
>> as a MULTICAST_L2 entry. Then you remove all of the BR_PKT_MULTICAST_L2 code (see the
>> attached patch based on this one for example). and continue processing it as it is processed today.
>> We'll keep the fast-path with minimal number of new conditionals.
>>
>> Something like the patch I've attached to this reply, note that it is not complete
>> just to show the intent, you'll have to re-work br_mdb_notify() to make it proper
>> and there're most probably other details I've missed. If you find even better/less
>> complex way to do it then please do.
>>
>> Cheers,
>> Nik
>
> Oops, I sent back your original patch. Here's the actually changed version
> I was talking about.
>
> Thanks,
> Nik
>
>
>
The querier exists change is a hack just to get the point, I'd prefer
to re-write that portion in a better way which makes more sense, i.e.
get that check out of there since it doesn't mean that an actual querier
exists. :)
^ permalink raw reply
* Re: [PATCH V2 7/9] vhost: do not use RCU to synchronize MMU notifier with worker
From: Jason Gunthorpe @ 2019-08-01 14:15 UTC (permalink / raw)
To: Jason Wang; +Cc: mst, kvm, virtualization, netdev, linux-kernel, linux-mm
In-Reply-To: <a3bde826-6329-68e4-2826-8a9de4c5bd1e@redhat.com>
On Thu, Aug 01, 2019 at 01:02:18PM +0800, Jason Wang wrote:
>
> On 2019/8/1 上午3:30, Jason Gunthorpe wrote:
> > On Wed, Jul 31, 2019 at 09:28:20PM +0800, Jason Wang wrote:
> > > On 2019/7/31 下午8:39, Jason Gunthorpe wrote:
> > > > On Wed, Jul 31, 2019 at 04:46:53AM -0400, Jason Wang wrote:
> > > > > We used to use RCU to synchronize MMU notifier with worker. This leads
> > > > > calling synchronize_rcu() in invalidate_range_start(). But on a busy
> > > > > system, there would be many factors that may slow down the
> > > > > synchronize_rcu() which makes it unsuitable to be called in MMU
> > > > > notifier.
> > > > >
> > > > > A solution is SRCU but its overhead is obvious with the expensive full
> > > > > memory barrier. Another choice is to use seqlock, but it doesn't
> > > > > provide a synchronization method between readers and writers. The last
> > > > > choice is to use vq mutex, but it need to deal with the worst case
> > > > > that MMU notifier must be blocked and wait for the finish of swap in.
> > > > >
> > > > > So this patch switches use a counter to track whether or not the map
> > > > > was used. The counter was increased when vq try to start or finish
> > > > > uses the map. This means, when it was even, we're sure there's no
> > > > > readers and MMU notifier is synchronized. When it was odd, it means
> > > > > there's a reader we need to wait it to be even again then we are
> > > > > synchronized.
> > > > You just described a seqlock.
> > >
> > > Kind of, see my explanation below.
> > >
> > >
> > > > We've been talking about providing this as some core service from mmu
> > > > notifiers because nearly every use of this API needs it.
> > >
> > > That would be very helpful.
> > >
> > >
> > > > IMHO this gets the whole thing backwards, the common pattern is to
> > > > protect the 'shadow pte' data with a seqlock (usually open coded),
> > > > such that the mmu notififer side has the write side of that lock and
> > > > the read side is consumed by the thread accessing or updating the SPTE.
> > >
> > > Yes, I've considered something like that. But the problem is, mmu notifier
> > > (writer) need to wait for the vhost worker to finish the read before it can
> > > do things like setting dirty pages and unmapping page. It looks to me
> > > seqlock doesn't provide things like this.
> > The seqlock is usually used to prevent a 2nd thread from accessing the
> > VA while it is being changed by the mm. ie you use something seqlocky
> > instead of the ugly mmu_notifier_unregister/register cycle.
>
>
> Yes, so we have two mappings:
>
> [1] vring address to VA
> [2] VA to PA
>
> And have several readers and writers
>
> 1) set_vring_num_addr(): writer of both [1] and [2]
> 2) MMU notifier: reader of [1] writer of [2]
> 3) GUP: reader of [1] writer of [2]
> 4) memory accessors: reader of [1] and [2]
>
> Fortunately, 1) 3) and 4) have already synchronized through vq->mutex. We
> only need to deal with synchronization between 2) and each of the reset:
> Sync between 1) and 2): For mapping [1], I do
> mmu_notifier_unregister/register. This help to avoid holding any lock to do
> overlap check.
I suspect you could have done this with a RCU technique instead of
register/unregister.
> Sync between 2) and 4): For mapping [1], both are readers, no need any
> synchronization. For mapping [2], synchronize through RCU (or something
> simliar to seqlock).
You can't really use a seqlock, seqlocks are collision-retry locks,
and the semantic here is that invalidate_range_start *MUST* not
continue until thread doing #4 above is guarenteed no longer touching
the memory.
This must be a proper barrier, like a spinlock, mutex, or
synchronize_rcu.
And, again, you can't re-invent a spinlock with open coding and get
something better.
Jason
^ permalink raw reply
* Re: [net-next,rfc] net: bridge: mdb: Extend with multicast LLADDR
From: Nikolay Aleksandrov @ 2019-08-01 14:11 UTC (permalink / raw)
To: Horatiu Vultur, idosch, andrew, allan.nielsen
Cc: davem, roopa, petrm, tglx, fw, netdev, linux-kernel, bridge
In-Reply-To: <f758fdbf-4e0a-57b3-f13d-23e893ba7458@cumulusnetworks.com>
[-- Attachment #1: Type: text/plain, Size: 1750 bytes --]
On 01/08/2019 17:07, Nikolay Aleksandrov wrote:
> Hi Horatiu,
> Overall I think MDB is the right way, we'd like to contain the multicast code.
> A few comments below.
>
> On 01/08/2019 15:50, Horatiu Vultur wrote:
[snip]
>>
>> Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
>> Co-developed-by: Allan W. Nielsen <allan.nielsen@microchip.com>
>> Signed-off-by: Allan W. Nielsen <allan.nielsen@microchip.com>
>> ---
>> include/linux/if_bridge.h | 1 +
>> include/uapi/linux/if_bridge.h | 1 +
>> net/bridge/br_device.c | 7 +++++--
>> net/bridge/br_forward.c | 3 ++-
>> net/bridge/br_input.c | 13 ++++++++++--
>> net/bridge/br_mdb.c | 47 +++++++++++++++++++++++++++++++++++-------
>> net/bridge/br_multicast.c | 4 +++-
>> net/bridge/br_private.h | 3 ++-
>> 8 files changed, 64 insertions(+), 15 deletions(-)
>>
>
> Overall I don't think we need this BR_PKT_MULTICAST_L2, we could do the below much
> easier and without the checks if you use a per-mdb flag that says it's to be treated
> as a MULTICAST_L2 entry. Then you remove all of the BR_PKT_MULTICAST_L2 code (see the
> attached patch based on this one for example). and continue processing it as it is processed today.
> We'll keep the fast-path with minimal number of new conditionals.
>
> Something like the patch I've attached to this reply, note that it is not complete
> just to show the intent, you'll have to re-work br_mdb_notify() to make it proper
> and there're most probably other details I've missed. If you find even better/less
> complex way to do it then please do.
>
> Cheers,
> Nik
Oops, I sent back your original patch. Here's the actually changed version
I was talking about.
Thanks,
Nik
[-- Attachment #2: 0001-net-bridge-mdb-Extend-with-multicast-LLADDR.patch --]
[-- Type: text/x-patch, Size: 12962 bytes --]
From 01dbe0b22da96efcc6fbf46bd3b22353fca32f5d Mon Sep 17 00:00:00 2001
From: Horatiu Vultur <horatiu.vultur@microchip.com>
Date: Thu, 1 Aug 2019 14:50:40 +0200
Subject: [RFC incomplete] net: bridge: mdb: Extend with multicast LLADDR
Based on the discussion on the topic[1], we extend the functionality of
the 'bridge mdb' command to accept link layer multicast address. This
required only few changes and it fits nicely with the current
implementation and also the old implementation was not changed.
In this patch, we have added a MAC address to the union in 'struct br_ip'.
If we want to continue like this we should properly rename the structure as
it is not an IP any more.
To create a group for two of the front ports the following entries can
be added:
bridge mdb add dev br0 port eth0 grp 01:00:00:00:00:04 permanent vid 1
bridge mdb add dev br0 port eth1 grp 01:00:00:00:00:04 permanent vid 1
Now the entries will be display as following:
dev br0 port eth0 grp 01:00:00:00:00:04 permanent offload vid 1
dev br0 port eth1 grp 01:00:00:00:00:04 permanent offload vid 1
This requires changes to iproute2 as well, see the follogin patch for that.
Now if frame with dmac '01:00:00:00:00:04' will arrive at one of the front
ports. If we have HW offload support, then the frame will be forwarded by
the switch, and need not to go to the CPU. In a pure SW world, the frame is
forwarded by the SW bridge, which will flooded it only the ports which are
part of the group.
So far so good. This is an important part of the problem we wanted to solve.
But, there is one drawback of this approach. If you want to add two of the
front ports and br0 to receive the frame then I can't see a way of doing it
with the bridge mdb command. To do that it requireds many more changes to
the existing code.
Example:
bridge mdb add dev br0 port eth0 grp 01:00:00:00:00:04 permanent vid 1
bridge mdb add dev br0 port eth1 grp 01:00:00:00:00:04 permanent vid 1
bridge mdb add dev br0 port br0 grp 01:00:00:00:00:04 permanent vid 1 // This looks wrong.
We believe we come a long way by re-using the facilities in MDB (thanks for
convincing us in doing this), but we are still not completely happy with
the result.
If I only look at the user-interface (iproute2), and completely ignore all
the implementation details, then I still think that the FDB sub command is
more suitable for this. Today, FDB is for unicast, and MDB is for multicast.
We could change this such that MDB is for IP-multicast, and FDB is
forwarding in general (we should prevent FDB in install IP-multicast rules,
but we suggest to allow it to install MAC-Multicast rules).
The example from above would now look like this:
bridge fdb add 01:00:00:00:00:04 dev eth0 static self master
bridge fdb add 01:00:00:00:00:04 dev eth1 static self master
bridge fdb add 01:00:00:00:00:04 dev br0 static self master
It would be very similar to the "bridge vlan" command which also allow to
specify groups with and without br0.
Next observation is on the hashing data structure. In 'net_bridge_mdb_entry'
we only use/need the net_bridge_port_group/ports member (and the MAC
address, which we hacked into the br_ip struct) when we are a L2-multicast
entry. This type allow use to re-use the br_multicast_flood function
which does a lot of the work for us.
Also, the key used to do the lookup in the FDB is already a MAC address
(no need to hack the br_ip).
Regarding the events generated by switchdev: In the current proposal this
is a SWITCHDEV_OBJ_ID_PORT_MDB which refer to the switchdev_obj_port_mdb
type. If we changed to use the SWITCHDEV_FDB_ADD_TO_BRIDGE event, then
the associated data type would be switchdev_notifier_fdb_info - which also
has the information we need.
Using the FDB database, can still reuse the net_bridge_port_group type (and
associated functions), and I other parts from the MDB call graph as well.
If this sounds appealing, then we can do a proposal based on the idea.
If the MDB patch is what we can agree on, then we will continue polish this
and look for a solution to control the inclusion/exclusion of the br0
device (hints will be most appreciated).
[1] https://patchwork.ozlabs.org/patch/1136878/
Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Co-developed-by: Allan W. Nielsen <allan.nielsen@microchip.com>
Signed-off-by: Allan W. Nielsen <allan.nielsen@microchip.com>
---
include/linux/if_bridge.h | 1 +
include/uapi/linux/if_bridge.h | 2 ++
net/bridge/br_device.c | 2 +-
net/bridge/br_input.c | 2 +-
net/bridge/br_mdb.c | 58 ++++++++++++++++++++++++++++------
net/bridge/br_multicast.c | 6 ++--
net/bridge/br_private.h | 10 ++++--
7 files changed, 64 insertions(+), 17 deletions(-)
diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 9e57c4411734..68f2558b1a23 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -16,6 +16,7 @@
struct br_ip {
union {
__be32 ip4;
+ __u8 mac[ETH_ALEN];
#if IS_ENABLED(CONFIG_IPV6)
struct in6_addr ip6;
#endif
diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 1b3c2b643a02..50b4b481fac5 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -238,12 +238,14 @@ struct br_mdb_entry {
__u8 state;
#define MDB_FLAGS_OFFLOAD (1 << 0)
#define MDB_FLAGS_FAST_LEAVE (1 << 1)
+#define MDB_FLAGS_L2MCAST (1 << 2)
__u8 flags;
__u16 vid;
struct {
union {
__be32 ip4;
struct in6_addr ip6;
+ __u8 mac[ETH_ALEN];
} u;
__be16 proto;
} addr;
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 681b72862c16..8ffceaac4c80 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -94,7 +94,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
mdst = br_mdb_get(br, skb, vid);
if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
- br_multicast_querier_exists(br, eth_hdr(skb)))
+ br_multicast_querier_exists(br, eth_hdr(skb), mdst))
br_multicast_flood(mdst, skb, false, true);
else
br_flood(br, skb, BR_PKT_MULTICAST, false, true);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 09b1dd8cd853..331a1ee87c62 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -130,7 +130,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
case BR_PKT_MULTICAST:
mdst = br_mdb_get(br, skb, vid);
if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
- br_multicast_querier_exists(br, eth_hdr(skb))) {
+ br_multicast_querier_exists(br, eth_hdr(skb), mdst)) {
if ((mdst && mdst->host_joined) ||
br_multicast_is_router(br)) {
local_rcv = true;
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 428af1abf8cc..2da17b769760 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -62,6 +62,8 @@ static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags)
e->flags |= MDB_FLAGS_OFFLOAD;
if (flags & MDB_PG_FLAGS_FAST_LEAVE)
e->flags |= MDB_FLAGS_FAST_LEAVE;
+ if (flags & MDB_PG_FLAGS_L2MCAST)
+ e->flags |= MDB_FLAGS_L2MCAST;
}
static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip)
@@ -69,12 +71,19 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip)
memset(ip, 0, sizeof(struct br_ip));
ip->vid = entry->vid;
ip->proto = entry->addr.proto;
- if (ip->proto == htons(ETH_P_IP))
+ switch (ip->proto) {
+ case htons(ETH_P_IP):
ip->u.ip4 = entry->addr.u.ip4;
+ break;
#if IS_ENABLED(CONFIG_IPV6)
- else
+ case htons(ETH_P_IPV6):
ip->u.ip6 = entry->addr.u.ip6;
+ break;
#endif
+ default:
+ ether_addr_copy(ip->u.mac, entry->addr.u.mac);
+ break;
+ }
}
static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
@@ -110,6 +119,7 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
pp = &p->next) {
struct nlattr *nest_ent;
struct br_mdb_entry e;
+ u16 flags = p->flags;
port = p->port;
if (!port)
@@ -118,13 +128,22 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
memset(&e, 0, sizeof(e));
e.ifindex = port->dev->ifindex;
e.vid = p->addr.vid;
- __mdb_entry_fill_flags(&e, p->flags);
- if (p->addr.proto == htons(ETH_P_IP))
+ if (mp->l2_mcast)
+ flags |= MDB_PG_FLAGS_L2MCAST;
+ __mdb_entry_fill_flags(&e, flags);
+ switch (p->addr.proto) {
+ case htons(ETH_P_IP):
e.addr.u.ip4 = p->addr.u.ip4;
+ break;
#if IS_ENABLED(CONFIG_IPV6)
- if (p->addr.proto == htons(ETH_P_IPV6))
+ case htons(ETH_P_IPV6):
e.addr.u.ip6 = p->addr.u.ip6;
+ break;
#endif
+ default:
+ ether_addr_copy(e.addr.u.mac, p->addr.u.mac);
+ break;
+ }
e.addr.proto = p->addr.proto;
nest_ent = nla_nest_start_noflag(skb,
MDBA_MDB_ENTRY_INFO);
@@ -324,12 +343,19 @@ static void br_mdb_switchdev_host_port(struct net_device *dev,
.vid = entry->vid,
};
- if (entry->addr.proto == htons(ETH_P_IP))
+ switch (entry->addr.proto) {
+ case htons(ETH_P_IP):
ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
+ break;
#if IS_ENABLED(CONFIG_IPV6)
- else
+ case htons(ETH_P_IPV6):
ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
+ break;
#endif
+ default:
+ ether_addr_copy(mdb.addr, entry->addr.u.mac);
+ break;
+ }
mdb.obj.orig_dev = dev;
switch (type) {
@@ -369,12 +395,19 @@ static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p,
int err = -ENOBUFS;
port_dev = __dev_get_by_index(net, entry->ifindex);
- if (entry->addr.proto == htons(ETH_P_IP))
+ switch (entry->addr.proto) {
+ case htons(ETH_P_IP):
ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
+ break;
#if IS_ENABLED(CONFIG_IPV6)
- else
+ case htons(ETH_P_IPV6):
ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
+ break;
#endif
+ default:
+ ether_addr_copy(mdb.addr, entry->addr.u.mac);
+ break;
+ }
mdb.obj.orig_dev = port_dev;
if (p && port_dev && type == RTM_NEWMDB) {
@@ -425,6 +458,7 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
#if IS_ENABLED(CONFIG_IPV6)
entry.addr.u.ip6 = group->u.ip6;
#endif
+ ether_addr_copy(group->u.mac, entry.addr.u.mac);
entry.vid = group->vid;
__mdb_entry_fill_flags(&entry, flags);
__br_mdb_notify(dev, port, &entry, type);
@@ -512,8 +546,12 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry)
if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6))
return false;
#endif
- } else
+ } else if ((entry->flags & MDB_FLAGS_L2MCAST) &&
+ !is_multicast_ether_addr(entry->addr.u.mac)) {
return false;
+ } else {
+ return false;
+ }
if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY)
return false;
if (entry->vid >= VLAN_VID_MASK)
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 3d4b2817687f..43cfc8b18765 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -133,7 +133,9 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
break;
#endif
default:
- return NULL;
+ ip.proto = 0;
+ ether_addr_copy(ip.u.mac, eth_hdr(skb)->h_dest);
+ break;
}
return br_mdb_ip_get_rcu(br, &ip);
@@ -2233,7 +2235,7 @@ bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto)
memset(ð, 0, sizeof(eth));
eth.h_proto = htons(proto);
- ret = br_multicast_querier_exists(br, ð);
+ ret = br_multicast_querier_exists(br, ð, NULL);
unlock:
rcu_read_unlock();
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index c4fd307fbfdc..be9e5f327c86 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -200,6 +200,7 @@ struct net_bridge_fdb_entry {
#define MDB_PG_FLAGS_PERMANENT BIT(0)
#define MDB_PG_FLAGS_OFFLOAD BIT(1)
#define MDB_PG_FLAGS_FAST_LEAVE BIT(2)
+#define MDB_PG_FLAGS_L2MCAST BIT(3)
struct net_bridge_port_group {
struct net_bridge_port *port;
@@ -220,6 +221,7 @@ struct net_bridge_mdb_entry {
struct timer_list timer;
struct br_ip addr;
bool host_joined;
+ bool l2_mcast;
struct hlist_node mdb_node;
};
@@ -734,7 +736,8 @@ __br_multicast_querier_exists(struct net_bridge *br,
}
static inline bool br_multicast_querier_exists(struct net_bridge *br,
- struct ethhdr *eth)
+ struct ethhdr *eth,
+ struct net_bridge_mdb_entry *dst)
{
switch (eth->h_proto) {
case (htons(ETH_P_IP)):
@@ -746,7 +749,7 @@ static inline bool br_multicast_querier_exists(struct net_bridge *br,
&br->ip6_other_query, true);
#endif
default:
- return false;
+ return !!(dst && dst->l2_mcast);
}
}
@@ -814,7 +817,8 @@ static inline bool br_multicast_is_router(struct net_bridge *br)
}
static inline bool br_multicast_querier_exists(struct net_bridge *br,
- struct ethhdr *eth)
+ struct ethhdr *eth,
+ struct net_bridge_mdb_entry *dst)
{
return false;
}
--
2.21.0
^ permalink raw reply related
* Re: [PATCH net] ibmveth: use net_err_ratelimited when set_multicast_list
From: Hangbin Liu @ 2019-08-01 14:10 UTC (permalink / raw)
To: Joe Perches; +Cc: netdev, Thomas Falcon, David S . Miller
In-Reply-To: <209f7fe62e2a79cd8c02b104b8e3babdd16bff30.camel@perches.com>
On Thu, Aug 01, 2019 at 03:28:43AM -0700, Joe Perches wrote:
> Perhaps add the netdev_<level>_ratelimited variants and use that instead
>
> Somthing like:
Yes, that looks better. Do you mind if I take your code and add your
Signed-off-by info?
Thanks
Hangbin
>
> ---
> include/linux/netdevice.h | 54 +++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 54 insertions(+)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 88292953aa6f..37116019e14f 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -4737,6 +4737,60 @@ do { \
> #define netdev_info_once(dev, fmt, ...) \
> netdev_level_once(KERN_INFO, dev, fmt, ##__VA_ARGS__)
>
> +#define netdev_level_ratelimited(netdev_level, dev, fmt, ...) \
> +do { \
> + static DEFINE_RATELIMIT_STATE(_rs, \
> + DEFAULT_RATELIMIT_INTERVAL, \
> + DEFAULT_RATELIMIT_BURST); \
> + if (__ratelimit(&_rs)) \
> + netdev_level(dev, fmt, ##__VA_ARGS__); \
> +} while (0)
> +
> +#define netdev_emerg_ratelimited(dev, fmt, ...) \
> + netdev_level_ratelimited(netdev_emerg, dev, fmt, ##__VA_ARGS__)
> +#define netdev_alert_ratelimited(dev, fmt, ...) \
> + netdev_level_ratelimited(netdev_alert, dev, fmt, ##__VA_ARGS__)
> +#define netdev_crit_ratelimited(dev, fmt, ...) \
> + netdev_level_ratelimited(netdev_crit, dev, fmt, ##__VA_ARGS__)
> +#define netdev_err_ratelimited(dev, fmt, ...) \
> + netdev_level_ratelimited(netdev_err, dev, fmt, ##__VA_ARGS__)
> +#define netdev_warn_ratelimited(dev, fmt, ...) \
> + netdev_level_ratelimited(netdev_warn, dev, fmt, ##__VA_ARGS__)
> +#define netdev_notice_ratelimited(dev, fmt, ...) \
> + netdev_level_ratelimited(netdev_notice, dev, fmt, ##__VA_ARGS__)
> +#define netdev_info_ratelimited(dev, fmt, ...) \
> + netdev_level_ratelimited(netdev_info, dev, fmt, ##__VA_ARGS__)
> +
> +#if defined(CONFIG_DYNAMIC_DEBUG)
> +/* descriptor check is first to prevent flooding with "callbacks suppressed" */
> +#define netdev_dbg_ratelimited(dev, fmt, ...) \
> +do { \
> + static DEFINE_RATELIMIT_STATE(_rs, \
> + DEFAULT_RATELIMIT_INTERVAL, \
> + DEFAULT_RATELIMIT_BURST); \
> + DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
> + if (DYNAMIC_DEBUG_BRANCH(descriptor) && \
> + __ratelimit(&_rs)) \
> + __dynamic_netdev_dbg(&descriptor, dev, fmt, \
> + ##__VA_ARGS__); \
> +} while (0)
> +#elif defined(DEBUG)
> +#define netdev_dbg_ratelimited(dev, fmt, ...) \
> +do { \
> + static DEFINE_RATELIMIT_STATE(_rs, \
> + DEFAULT_RATELIMIT_INTERVAL, \
> + DEFAULT_RATELIMIT_BURST); \
> + if (__ratelimit(&_rs)) \
> + netdev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \
> +} while (0)
> +#else
> +#define netdev_dbg_ratelimited(dev, fmt, ...) \
> +do { \
> + if (0) \
> + netdev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \
> +} while (0)
> +#endif
> +
> #define MODULE_ALIAS_NETDEV(device) \
> MODULE_ALIAS("netdev-" device)
>
>
>
^ permalink raw reply
* Re: [net-next,rfc] net: bridge: mdb: Extend with multicast LLADDR
From: Nikolay Aleksandrov @ 2019-08-01 14:07 UTC (permalink / raw)
To: Horatiu Vultur, idosch, andrew, allan.nielsen
Cc: davem, roopa, petrm, tglx, fw, netdev, linux-kernel, bridge
In-Reply-To: <1564663840-27721-1-git-send-email-horatiu.vultur@microchip.com>
[-- Attachment #1: Type: text/plain, Size: 13857 bytes --]
Hi Horatiu,
Overall I think MDB is the right way, we'd like to contain the multicast code.
A few comments below.
On 01/08/2019 15:50, Horatiu Vultur wrote:
> Based on the discussion on the topic[1], we extend the functionality of
> the 'bridge mdb' command to accept link layer multicast address. This
> required only few changes and it fits nicely with the current
> implementation and also the old implementation was not changed.
>
> In this patch, we have added a MAC address to the union in 'struct br_ip'.
> If we want to continue like this we should properly rename the structure as
> it is not an IP any more.
>
> To create a group for two of the front ports the following entries can
> be added:
> bridge mdb add dev br0 port eth0 grp 01:00:00:00:00:04 permanent vid 1
> bridge mdb add dev br0 port eth1 grp 01:00:00:00:00:04 permanent vid 1
>
> Now the entries will be display as following:
> dev br0 port eth0 grp 01:00:00:00:00:04 permanent offload vid 1
> dev br0 port eth1 grp 01:00:00:00:00:04 permanent offload vid 1
>
> This requires changes to iproute2 as well, see the follogin patch for that.
>
> Now if frame with dmac '01:00:00:00:00:04' will arrive at one of the front
> ports. If we have HW offload support, then the frame will be forwarded by
> the switch, and need not to go to the CPU. In a pure SW world, the frame is
> forwarded by the SW bridge, which will flooded it only the ports which are
> part of the group.
>
> So far so good. This is an important part of the problem we wanted to solve.
>
> But, there is one drawback of this approach. If you want to add two of the
> front ports and br0 to receive the frame then I can't see a way of doing it
> with the bridge mdb command. To do that it requireds many more changes to
> the existing code.
>
> Example:
> bridge mdb add dev br0 port eth0 grp 01:00:00:00:00:04 permanent vid 1
> bridge mdb add dev br0 port eth1 grp 01:00:00:00:00:04 permanent vid 1
> bridge mdb add dev br0 port br0 grp 01:00:00:00:00:04 permanent vid 1 // This looks wrong.
>
> We believe we come a long way by re-using the facilities in MDB (thanks for
> convincing us in doing this), but we are still not completely happy with
> the result.
>
Just add self argument for the bridge mdb command, no need to specify it twice.
> If I only look at the user-interface (iproute2), and completely ignore all
> the implementation details, then I still think that the FDB sub command is
> more suitable for this. Today, FDB is for unicast, and MDB is for multicast.
> We could change this such that MDB is for IP-multicast, and FDB is
> forwarding in general (we should prevent FDB in install IP-multicast rules,
> but we suggest to allow it to install MAC-Multicast rules).
>
> The example from above would now look like this:
> bridge fdb add 01:00:00:00:00:04 dev eth0 static self master
> bridge fdb add 01:00:00:00:00:04 dev eth1 static self master
> bridge fdb add 01:00:00:00:00:04 dev br0 static self master
>
> It would be very similar to the "bridge vlan" command which also allow to
> specify groups with and without br0.
>
> Next observation is on the hashing data structure. In 'net_bridge_mdb_entry'
> we only use/need the net_bridge_port_group/ports member (and the MAC
> address, which we hacked into the br_ip struct) when we are a L2-multicast
> entry. This type allow use to re-use the br_multicast_flood function
> which does a lot of the work for us.
>
> Also, the key used to do the lookup in the FDB is already a MAC address
> (no need to hack the br_ip).
>
Look at it as extending br_ip, it's not a hack but a valid mcast use-case.
In fact br_ip is an internal structure which can easily be renamed.
> Regarding the events generated by switchdev: In the current proposal this
> is a SWITCHDEV_OBJ_ID_PORT_MDB which refer to the switchdev_obj_port_mdb
> type. If we changed to use the SWITCHDEV_FDB_ADD_TO_BRIDGE event, then
> the associated data type would be switchdev_notifier_fdb_info - which also
> has the information we need.
>
> Using the FDB database, can still reuse the net_bridge_port_group type (and
> associated functions), and I other parts from the MDB call graph as well.
>
We don't want to mix these.
> If this sounds appealing, then we can do a proposal based on the idea.
>
> If the MDB patch is what we can agree on, then we will continue polish this
> and look for a solution to control the inclusion/exclusion of the br0
> device (hints will be most appreciated).
>
Yes, please. Let's work on this implementation. Some code comments below.
> [1] https://patchwork.ozlabs.org/patch/1136878/
>
> Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
> Co-developed-by: Allan W. Nielsen <allan.nielsen@microchip.com>
> Signed-off-by: Allan W. Nielsen <allan.nielsen@microchip.com>
> ---
> include/linux/if_bridge.h | 1 +
> include/uapi/linux/if_bridge.h | 1 +
> net/bridge/br_device.c | 7 +++++--
> net/bridge/br_forward.c | 3 ++-
> net/bridge/br_input.c | 13 ++++++++++--
> net/bridge/br_mdb.c | 47 +++++++++++++++++++++++++++++++++++-------
> net/bridge/br_multicast.c | 4 +++-
> net/bridge/br_private.h | 3 ++-
> 8 files changed, 64 insertions(+), 15 deletions(-)
>
Overall I don't think we need this BR_PKT_MULTICAST_L2, we could do the below much
easier and without the checks if you use a per-mdb flag that says it's to be treated
as a MULTICAST_L2 entry. Then you remove all of the BR_PKT_MULTICAST_L2 code (see the
attached patch based on this one for example). and continue processing it as it is processed today.
We'll keep the fast-path with minimal number of new conditionals.
Something like the patch I've attached to this reply, note that it is not complete
just to show the intent, you'll have to re-work br_mdb_notify() to make it proper
and there're most probably other details I've missed. If you find even better/less
complex way to do it then please do.
Cheers,
Nik
> diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
> index f3fab5d..07b092a 100644
> --- a/include/linux/if_bridge.h
> +++ b/include/linux/if_bridge.h
> @@ -16,6 +16,7 @@
> struct br_ip {
> union {
> __be32 ip4;
> + __u8 mac[ETH_ALEN];
> #if IS_ENABLED(CONFIG_IPV6)
> struct in6_addr ip6;
> #endif
> diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
> index 773e476..e535a81 100644
> --- a/include/uapi/linux/if_bridge.h
> +++ b/include/uapi/linux/if_bridge.h
> @@ -243,6 +243,7 @@ struct br_mdb_entry {
> union {
> __be32 ip4;
> struct in6_addr ip6;
> + __u8 mac[ETH_ALEN];
> } u;
> __be16 proto;
> } addr;
> diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
> index c05def8..b2d9041 100644
> --- a/net/bridge/br_device.c
> +++ b/net/bridge/br_device.c
> @@ -83,7 +83,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
> br_flood(br, skb, BR_PKT_BROADCAST, false, true);
> } else if (is_multicast_ether_addr(dest)) {
> if (unlikely(netpoll_tx_running(dev))) {
> - br_flood(br, skb, BR_PKT_MULTICAST, false, true);
> + br_flood(br, skb, BR_PKT_MULTICAST_IP, false, true);
> goto out;
> }
> if (br_multicast_rcv(br, NULL, skb, vid)) {
> @@ -95,8 +95,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
> if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
> br_multicast_querier_exists(br, eth_hdr(skb)))
> br_multicast_flood(mdst, skb, false, true);
> + else if (mdst && skb->protocol != htons(ETH_P_IP) &&
> + skb->protocol != htons(ETH_P_IPV6))
> + br_multicast_flood(mdst, skb, false, true);
> else
> - br_flood(br, skb, BR_PKT_MULTICAST, false, true);
> + br_flood(br, skb, BR_PKT_MULTICAST_IP, false, true);
> } else if ((dst = br_fdb_find_rcu(br, dest, vid)) != NULL) {
> br_forward(dst->dst, skb, false, true);
> } else {
> diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
> index 8663700..36b58e8 100644
> --- a/net/bridge/br_forward.c
> +++ b/net/bridge/br_forward.c
> @@ -203,7 +203,8 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
> if (!(p->flags & BR_FLOOD))
> continue;
> break;
> - case BR_PKT_MULTICAST:
> + case BR_PKT_MULTICAST_IP:
> + case BR_PKT_MULTICAST_L2:
> if (!(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev)
> continue;
> break;
> diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
> index 21b74e7..a7db0c5 100644
> --- a/net/bridge/br_input.c
> +++ b/net/bridge/br_input.c
> @@ -99,9 +99,13 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
> pkt_type = BR_PKT_BROADCAST;
> local_rcv = true;
> } else {
> - pkt_type = BR_PKT_MULTICAST;
> + pkt_type = BR_PKT_MULTICAST_IP;
> if (br_multicast_rcv(br, p, skb, vid))
> goto drop;
> +
> + if (skb->protocol != htons(ETH_P_IP) &&
> + skb->protocol != htons(ETH_P_IPV6))
> + pkt_type = BR_PKT_MULTICAST_L2;
> }
> }
>
> @@ -129,7 +133,12 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
> }
>
> switch (pkt_type) {
> - case BR_PKT_MULTICAST:
> + case BR_PKT_MULTICAST_L2:
> + mdst = br_mdb_get(br, skb, vid);
> + if (mdst)
> + mcast_hit = true;
> + break;
> + case BR_PKT_MULTICAST_IP:
> mdst = br_mdb_get(br, skb, vid);
> if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
> br_multicast_querier_exists(br, eth_hdr(skb))) {
> diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
> index bf6acd3..b337a30 100644
> --- a/net/bridge/br_mdb.c
> +++ b/net/bridge/br_mdb.c
> @@ -67,12 +67,19 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip)
> memset(ip, 0, sizeof(struct br_ip));
> ip->vid = entry->vid;
> ip->proto = entry->addr.proto;
> - if (ip->proto == htons(ETH_P_IP))
> + switch (ip->proto) {
> + case htons(ETH_P_IP):
> ip->u.ip4 = entry->addr.u.ip4;
> + break;
> #if IS_ENABLED(CONFIG_IPV6)
> - else
> + case htons(ETH_P_IPV6):
> ip->u.ip6 = entry->addr.u.ip6;
> + break;
> #endif
> + default:
> + ether_addr_copy(ip->u.mac, entry->addr.u.mac);
> + break;
> + }
> }
>
> static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
> @@ -117,12 +124,19 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
> e.ifindex = port->dev->ifindex;
> e.vid = p->addr.vid;
> __mdb_entry_fill_flags(&e, p->flags);
> - if (p->addr.proto == htons(ETH_P_IP))
> + switch (p->addr.proto) {
> + case htons(ETH_P_IP):
> e.addr.u.ip4 = p->addr.u.ip4;
> + break;
> #if IS_ENABLED(CONFIG_IPV6)
> - if (p->addr.proto == htons(ETH_P_IPV6))
> + case htons(ETH_P_IPV6):
> e.addr.u.ip6 = p->addr.u.ip6;
> + break;
> #endif
> + default:
> + ether_addr_copy(e.addr.u.mac, p->addr.u.mac);
> + break;
> + }
> e.addr.proto = p->addr.proto;
> nest_ent = nla_nest_start_noflag(skb,
> MDBA_MDB_ENTRY_INFO);
> @@ -322,12 +336,19 @@ static void br_mdb_switchdev_host_port(struct net_device *dev,
> .vid = entry->vid,
> };
>
> - if (entry->addr.proto == htons(ETH_P_IP))
> + switch (entry->addr.proto) {
> + case htons(ETH_P_IP):
> ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
> + break;
> #if IS_ENABLED(CONFIG_IPV6)
> - else
> + case htons(ETH_P_IPV6):
> ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
> + break;
> #endif
> + default:
> + ether_addr_copy(mdb.addr, entry->addr.u.mac);
> + break;
> + }
>
> mdb.obj.orig_dev = dev;
> switch (type) {
> @@ -367,12 +388,19 @@ static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p,
> int err = -ENOBUFS;
>
> port_dev = __dev_get_by_index(net, entry->ifindex);
> - if (entry->addr.proto == htons(ETH_P_IP))
> + switch (entry->addr.proto) {
> + case htons(ETH_P_IP):
> ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
> + break;
> #if IS_ENABLED(CONFIG_IPV6)
> - else
> + case htons(ETH_P_IPV6):
> ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
> + break;
> #endif
> + default:
> + ether_addr_copy(mdb.addr, entry->addr.u.mac);
> + break;
> + }
>
> mdb.obj.orig_dev = port_dev;
> if (p && port_dev && type == RTM_NEWMDB) {
> @@ -423,6 +451,7 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
> #if IS_ENABLED(CONFIG_IPV6)
> entry.addr.u.ip6 = group->u.ip6;
> #endif
> + ether_addr_copy(group->u.mac, entry.addr.u.mac);
> entry.vid = group->vid;
> __mdb_entry_fill_flags(&entry, flags);
> __br_mdb_notify(dev, port, &entry, type);
> @@ -510,6 +539,8 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry)
> if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6))
> return false;
> #endif
> + } else if (is_multicast_ether_addr(entry->addr.u.mac)) {
> + ;
> } else
> return false;
> if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY)
> diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
> index de22c8f..01250c1 100644
> --- a/net/bridge/br_multicast.c
> +++ b/net/bridge/br_multicast.c
> @@ -133,7 +133,9 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
> break;
> #endif
> default:
> - return NULL;
> + ip.proto = 0;
> + ether_addr_copy(ip.u.mac, eth_hdr(skb)->h_dest);
> + break;
> }
>
> return br_mdb_ip_get_rcu(br, &ip);
> diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
> index 159a0e2..60e2430d 100644
> --- a/net/bridge/br_private.h
> +++ b/net/bridge/br_private.h
> @@ -590,7 +590,8 @@ void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
> /* br_forward.c */
> enum br_pkt_type {
> BR_PKT_UNICAST,
> - BR_PKT_MULTICAST,
> + BR_PKT_MULTICAST_IP,
> + BR_PKT_MULTICAST_L2,
> BR_PKT_BROADCAST
> };
> int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb);
>
[-- Attachment #2: 0001-net-bridge-mdb-Extend-with-multicast-LLADDR.patch --]
[-- Type: text/x-patch, Size: 12289 bytes --]
From 611cb2250c06a22d08819bc2d3d67bb7a2867cc4 Mon Sep 17 00:00:00 2001
From: Horatiu Vultur <horatiu.vultur@microchip.com>
Date: Thu, 1 Aug 2019 14:50:40 +0200
Subject: [RFC incomplete] net: bridge: mdb: Extend with multicast LLADDR
Based on the discussion on the topic[1], we extend the functionality of
the 'bridge mdb' command to accept link layer multicast address. This
required only few changes and it fits nicely with the current
implementation and also the old implementation was not changed.
In this patch, we have added a MAC address to the union in 'struct br_ip'.
If we want to continue like this we should properly rename the structure as
it is not an IP any more.
To create a group for two of the front ports the following entries can
be added:
bridge mdb add dev br0 port eth0 grp 01:00:00:00:00:04 permanent vid 1
bridge mdb add dev br0 port eth1 grp 01:00:00:00:00:04 permanent vid 1
Now the entries will be display as following:
dev br0 port eth0 grp 01:00:00:00:00:04 permanent offload vid 1
dev br0 port eth1 grp 01:00:00:00:00:04 permanent offload vid 1
This requires changes to iproute2 as well, see the follogin patch for that.
Now if frame with dmac '01:00:00:00:00:04' will arrive at one of the front
ports. If we have HW offload support, then the frame will be forwarded by
the switch, and need not to go to the CPU. In a pure SW world, the frame is
forwarded by the SW bridge, which will flooded it only the ports which are
part of the group.
So far so good. This is an important part of the problem we wanted to solve.
But, there is one drawback of this approach. If you want to add two of the
front ports and br0 to receive the frame then I can't see a way of doing it
with the bridge mdb command. To do that it requireds many more changes to
the existing code.
Example:
bridge mdb add dev br0 port eth0 grp 01:00:00:00:00:04 permanent vid 1
bridge mdb add dev br0 port eth1 grp 01:00:00:00:00:04 permanent vid 1
bridge mdb add dev br0 port br0 grp 01:00:00:00:00:04 permanent vid 1 // This looks wrong.
We believe we come a long way by re-using the facilities in MDB (thanks for
convincing us in doing this), but we are still not completely happy with
the result.
If I only look at the user-interface (iproute2), and completely ignore all
the implementation details, then I still think that the FDB sub command is
more suitable for this. Today, FDB is for unicast, and MDB is for multicast.
We could change this such that MDB is for IP-multicast, and FDB is
forwarding in general (we should prevent FDB in install IP-multicast rules,
but we suggest to allow it to install MAC-Multicast rules).
The example from above would now look like this:
bridge fdb add 01:00:00:00:00:04 dev eth0 static self master
bridge fdb add 01:00:00:00:00:04 dev eth1 static self master
bridge fdb add 01:00:00:00:00:04 dev br0 static self master
It would be very similar to the "bridge vlan" command which also allow to
specify groups with and without br0.
Next observation is on the hashing data structure. In 'net_bridge_mdb_entry'
we only use/need the net_bridge_port_group/ports member (and the MAC
address, which we hacked into the br_ip struct) when we are a L2-multicast
entry. This type allow use to re-use the br_multicast_flood function
which does a lot of the work for us.
Also, the key used to do the lookup in the FDB is already a MAC address
(no need to hack the br_ip).
Regarding the events generated by switchdev: In the current proposal this
is a SWITCHDEV_OBJ_ID_PORT_MDB which refer to the switchdev_obj_port_mdb
type. If we changed to use the SWITCHDEV_FDB_ADD_TO_BRIDGE event, then
the associated data type would be switchdev_notifier_fdb_info - which also
has the information we need.
Using the FDB database, can still reuse the net_bridge_port_group type (and
associated functions), and I other parts from the MDB call graph as well.
If this sounds appealing, then we can do a proposal based on the idea.
If the MDB patch is what we can agree on, then we will continue polish this
and look for a solution to control the inclusion/exclusion of the br0
device (hints will be most appreciated).
[1] https://patchwork.ozlabs.org/patch/1136878/
Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Co-developed-by: Allan W. Nielsen <allan.nielsen@microchip.com>
Signed-off-by: Allan W. Nielsen <allan.nielsen@microchip.com>
---
include/linux/if_bridge.h | 1 +
include/uapi/linux/if_bridge.h | 1 +
net/bridge/br_device.c | 7 +++--
net/bridge/br_forward.c | 3 ++-
net/bridge/br_input.c | 13 ++++++++--
net/bridge/br_mdb.c | 47 ++++++++++++++++++++++++++++------
net/bridge/br_multicast.c | 4 ++-
net/bridge/br_private.h | 3 ++-
8 files changed, 64 insertions(+), 15 deletions(-)
diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index 9e57c4411734..68f2558b1a23 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -16,6 +16,7 @@
struct br_ip {
union {
__be32 ip4;
+ __u8 mac[ETH_ALEN];
#if IS_ENABLED(CONFIG_IPV6)
struct in6_addr ip6;
#endif
diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 1b3c2b643a02..49a2de0b7420 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -244,6 +244,7 @@ struct br_mdb_entry {
union {
__be32 ip4;
struct in6_addr ip6;
+ __u8 mac[ETH_ALEN];
} u;
__be16 proto;
} addr;
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 681b72862c16..581d6875cdb2 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -84,7 +84,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
br_flood(br, skb, BR_PKT_BROADCAST, false, true);
} else if (is_multicast_ether_addr(dest)) {
if (unlikely(netpoll_tx_running(dev))) {
- br_flood(br, skb, BR_PKT_MULTICAST, false, true);
+ br_flood(br, skb, BR_PKT_MULTICAST_IP, false, true);
goto out;
}
if (br_multicast_rcv(br, NULL, skb, vid)) {
@@ -96,8 +96,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
br_multicast_querier_exists(br, eth_hdr(skb)))
br_multicast_flood(mdst, skb, false, true);
+ else if (mdst && skb->protocol != htons(ETH_P_IP) &&
+ skb->protocol != htons(ETH_P_IPV6))
+ br_multicast_flood(mdst, skb, false, true);
else
- br_flood(br, skb, BR_PKT_MULTICAST, false, true);
+ br_flood(br, skb, BR_PKT_MULTICAST_IP, false, true);
} else if ((dst = br_fdb_find_rcu(br, dest, vid)) != NULL) {
br_forward(dst->dst, skb, false, true);
} else {
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 86637000f275..36b58e86d719 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -203,7 +203,8 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
if (!(p->flags & BR_FLOOD))
continue;
break;
- case BR_PKT_MULTICAST:
+ case BR_PKT_MULTICAST_IP:
+ case BR_PKT_MULTICAST_L2:
if (!(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev)
continue;
break;
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 09b1dd8cd853..f69e710a7f55 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -97,9 +97,13 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
pkt_type = BR_PKT_BROADCAST;
local_rcv = true;
} else {
- pkt_type = BR_PKT_MULTICAST;
+ pkt_type = BR_PKT_MULTICAST_IP;
if (br_multicast_rcv(br, p, skb, vid))
goto drop;
+
+ if (skb->protocol != htons(ETH_P_IP) &&
+ skb->protocol != htons(ETH_P_IPV6))
+ pkt_type = BR_PKT_MULTICAST_L2;
}
}
@@ -127,7 +131,12 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
}
switch (pkt_type) {
- case BR_PKT_MULTICAST:
+ case BR_PKT_MULTICAST_L2:
+ mdst = br_mdb_get(br, skb, vid);
+ if (mdst)
+ mcast_hit = true;
+ break;
+ case BR_PKT_MULTICAST_IP:
mdst = br_mdb_get(br, skb, vid);
if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
br_multicast_querier_exists(br, eth_hdr(skb))) {
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 428af1abf8cc..25d182f45fde 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -69,12 +69,19 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip)
memset(ip, 0, sizeof(struct br_ip));
ip->vid = entry->vid;
ip->proto = entry->addr.proto;
- if (ip->proto == htons(ETH_P_IP))
+ switch (ip->proto) {
+ case htons(ETH_P_IP):
ip->u.ip4 = entry->addr.u.ip4;
+ break;
#if IS_ENABLED(CONFIG_IPV6)
- else
+ case htons(ETH_P_IPV6):
ip->u.ip6 = entry->addr.u.ip6;
+ break;
#endif
+ default:
+ ether_addr_copy(ip->u.mac, entry->addr.u.mac);
+ break;
+ }
}
static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
@@ -119,12 +126,19 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
e.ifindex = port->dev->ifindex;
e.vid = p->addr.vid;
__mdb_entry_fill_flags(&e, p->flags);
- if (p->addr.proto == htons(ETH_P_IP))
+ switch (p->addr.proto) {
+ case htons(ETH_P_IP):
e.addr.u.ip4 = p->addr.u.ip4;
+ break;
#if IS_ENABLED(CONFIG_IPV6)
- if (p->addr.proto == htons(ETH_P_IPV6))
+ case htons(ETH_P_IPV6):
e.addr.u.ip6 = p->addr.u.ip6;
+ break;
#endif
+ default:
+ ether_addr_copy(e.addr.u.mac, p->addr.u.mac);
+ break;
+ }
e.addr.proto = p->addr.proto;
nest_ent = nla_nest_start_noflag(skb,
MDBA_MDB_ENTRY_INFO);
@@ -324,12 +338,19 @@ static void br_mdb_switchdev_host_port(struct net_device *dev,
.vid = entry->vid,
};
- if (entry->addr.proto == htons(ETH_P_IP))
+ switch (entry->addr.proto) {
+ case htons(ETH_P_IP):
ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
+ break;
#if IS_ENABLED(CONFIG_IPV6)
- else
+ case htons(ETH_P_IPV6):
ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
+ break;
#endif
+ default:
+ ether_addr_copy(mdb.addr, entry->addr.u.mac);
+ break;
+ }
mdb.obj.orig_dev = dev;
switch (type) {
@@ -369,12 +390,19 @@ static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p,
int err = -ENOBUFS;
port_dev = __dev_get_by_index(net, entry->ifindex);
- if (entry->addr.proto == htons(ETH_P_IP))
+ switch (entry->addr.proto) {
+ case htons(ETH_P_IP):
ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
+ break;
#if IS_ENABLED(CONFIG_IPV6)
- else
+ case htons(ETH_P_IPV6):
ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
+ break;
#endif
+ default:
+ ether_addr_copy(mdb.addr, entry->addr.u.mac);
+ break;
+ }
mdb.obj.orig_dev = port_dev;
if (p && port_dev && type == RTM_NEWMDB) {
@@ -425,6 +453,7 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
#if IS_ENABLED(CONFIG_IPV6)
entry.addr.u.ip6 = group->u.ip6;
#endif
+ ether_addr_copy(group->u.mac, entry.addr.u.mac);
entry.vid = group->vid;
__mdb_entry_fill_flags(&entry, flags);
__br_mdb_notify(dev, port, &entry, type);
@@ -512,6 +541,8 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry)
if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6))
return false;
#endif
+ } else if (is_multicast_ether_addr(entry->addr.u.mac)) {
+ ;
} else
return false;
if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY)
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 3d4b2817687f..aa28a322ce9d 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -133,7 +133,9 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
break;
#endif
default:
- return NULL;
+ ip.proto = 0;
+ ether_addr_copy(ip.u.mac, eth_hdr(skb)->h_dest);
+ break;
}
return br_mdb_ip_get_rcu(br, &ip);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index c4fd307fbfdc..1f2a880a9d17 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -592,7 +592,8 @@ void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
/* br_forward.c */
enum br_pkt_type {
BR_PKT_UNICAST,
- BR_PKT_MULTICAST,
+ BR_PKT_MULTICAST_IP,
+ BR_PKT_MULTICAST_L2,
BR_PKT_BROADCAST
};
int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb);
--
2.21.0
^ permalink raw reply related
* Re: [PATCH v2 net-next] be2net: disable bh with spin_lock in be_process_mcc
From: Willem de Bruijn @ 2019-08-01 14:03 UTC (permalink / raw)
To: Denis Kirjanov
Cc: sathya.perla, ajit.khaparde, sriharsha.basavapatna,
Network Development
In-Reply-To: <20190801092420.34502-1-dkirjanov@suse.com>
On Thu, Aug 1, 2019 at 5:24 AM Denis Kirjanov <kda@linux-powerpc.org> wrote:
>
> be_process_mcc() is invoked in 3 different places and
> always with BHs disabled except the be_poll function
> but since it's invoked from softirq with BHs
> disabled it won't hurt.
This describes the current state. What is the benefit of removing the
local_bh_disable/local_bh_enable pair from one caller (be_worker), but
not another (be_mcc_wait_compl) and then convert process_mcc to
disable bh itself with spin_lock_bh?
^ permalink raw reply
* [PATCH v2] net/mlx5e: always initialize frag->last_in_page
From: Qian Cai @ 2019-08-01 13:52 UTC (permalink / raw)
To: davem; +Cc: saeedm, tariqt, netdev, linux-kernel, Qian Cai
The commit 069d11465a80 ("net/mlx5e: RX, Enhance legacy Receive Queue
memory scheme") introduced an undefined behaviour below due to
"frag->last_in_page" is only initialized in mlx5e_init_frags_partition()
when,
if (next_frag.offset + frag_info[f].frag_stride > PAGE_SIZE)
or after bailed out the loop,
for (i = 0; i < mlx5_wq_cyc_get_size(&rq->wqe.wq); i++)
As the result, there could be some "frag" have uninitialized
value of "last_in_page".
Later, get_frag() obtains those "frag" and check "frag->last_in_page" in
mlx5e_put_rx_frag() and triggers the error during boot. Fix it by always
initializing "frag->last_in_page" to "false" in
mlx5e_init_frags_partition().
UBSAN: Undefined behaviour in
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c:325:12
load of value 170 is not a valid value for type 'bool' (aka '_Bool')
Call trace:
dump_backtrace+0x0/0x264
show_stack+0x20/0x2c
dump_stack+0xb0/0x104
__ubsan_handle_load_invalid_value+0x104/0x128
mlx5e_handle_rx_cqe+0x8e8/0x12cc [mlx5_core]
mlx5e_poll_rx_cq+0xca8/0x1a94 [mlx5_core]
mlx5e_napi_poll+0x17c/0xa30 [mlx5_core]
net_rx_action+0x248/0x940
__do_softirq+0x350/0x7b8
irq_exit+0x200/0x26c
__handle_domain_irq+0xc8/0x128
gic_handle_irq+0x138/0x228
el1_irq+0xb8/0x140
arch_cpu_idle+0x1a4/0x348
do_idle+0x114/0x1b0
cpu_startup_entry+0x24/0x28
rest_init+0x1ac/0x1dc
arch_call_rest_init+0x10/0x18
start_kernel+0x4d4/0x57c
Fixes: 069d11465a80 ("net/mlx5e: RX, Enhance legacy Receive Queue memory scheme")
Signed-off-by: Qian Cai <cai@lca.pw>
---
v2: zero-init the whole struct instead per Tariq.
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 47eea6b3a1c3..e1810c03a510 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -331,12 +331,11 @@ static inline u64 mlx5e_get_mpwqe_offset(struct mlx5e_rq *rq, u16 wqe_ix)
static void mlx5e_init_frags_partition(struct mlx5e_rq *rq)
{
- struct mlx5e_wqe_frag_info next_frag, *prev;
+ struct mlx5e_wqe_frag_info next_frag = {};
+ struct mlx5e_wqe_frag_info *prev = NULL;
int i;
next_frag.di = &rq->wqe.di[0];
- next_frag.offset = 0;
- prev = NULL;
for (i = 0; i < mlx5_wq_cyc_get_size(&rq->wqe.wq); i++) {
struct mlx5e_rq_frag_info *frag_info = &rq->wqe.info.arr[0];
--
1.8.3.1
^ permalink raw reply related
* Re: [PATCH] net: sched: use temporary variable for actions indexes
From: Dmytro Linkin @ 2019-08-01 13:39 UTC (permalink / raw)
To: netdev; +Cc: davem, jiri, jhs, xiyou.wangcong, Vlad Buslov
In-Reply-To: <1564664571-31508-1-git-send-email-dmitrolin@mellanox.com>
On Thu, Aug 01, 2019 at 01:02:51PM +0000, dmitrolin@mellanox.com wrote:
> From: Dmytro Linkin <dmitrolin@mellanox.com>
>
> Currently init call of all actions (except ipt) init their 'parm'
> structure as a direct pointer to nla data in skb. This leads to race
> condition when some of the filter actions were initialized successfully
> (and were assigned with idr action index that was written directly
> into nla data), but then were deleted and retried (due to following
> action module missing or classifier-initiated retry), in which case
> action init code tries to insert action to idr with index that was
> assigned on previous iteration. During retry the index can be reused
> by another action that was inserted concurrently, which causes
> unintended action sharing between filters.
> To fix described race condition, save action idr index to temporary
> stack-allocated variable instead on nla data.
>
> Fixes: 0190c1d452a9 ("net: sched: atomically check-allocate action")
> Signed-off-by: Dmytro Linkin <dmitrolin@mellanox.com>
> Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
> ---
Hi,
Forgot to add target branch - net
Sincerely,
Dmytro
^ permalink raw reply
* Re: [PATCH v4 1/5] vsock/virtio: limit the memory used per-socket
From: Stefano Garzarella @ 2019-08-01 13:36 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: netdev, linux-kernel, Stefan Hajnoczi, David S. Miller,
virtualization, Jason Wang, kvm
In-Reply-To: <20190801091106-mutt-send-email-mst@kernel.org>
On Thu, Aug 01, 2019 at 09:21:15AM -0400, Michael S. Tsirkin wrote:
> On Thu, Aug 01, 2019 at 12:47:54PM +0200, Stefano Garzarella wrote:
> > On Tue, Jul 30, 2019 at 04:42:25PM -0400, Michael S. Tsirkin wrote:
> > > On Tue, Jul 30, 2019 at 11:35:39AM +0200, Stefano Garzarella wrote:
> >
> > (...)
> >
> > > >
> > > > The problem here is the compatibility. Before this series virtio-vsock
> > > > and vhost-vsock modules had the RX buffer size hard-coded
> > > > (VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE = 4K). So, if we send a buffer smaller
> > > > of 4K, there might be issues.
> > >
> > > Shouldn't be if they are following the spec. If not let's fix
> > > the broken parts.
> > >
> > > >
> > > > Maybe it is the time to add add 'features' to virtio-vsock device.
> > > >
> > > > Thanks,
> > > > Stefano
> > >
> > > Why would a remote care about buffer sizes?
> > >
> > > Let's first see what the issues are. If they exist
> > > we can either fix the bugs, or code the bug as a feature in spec.
> > >
> >
> > The vhost_transport '.stream_enqueue' callback
> > [virtio_transport_stream_enqueue()] calls the virtio_transport_send_pkt_info(),
> > passing the user message. This function allocates a new packet, copying
> > the user message, but (before this series) it limits the packet size to
> > the VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (4K):
> >
> > static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
> > struct virtio_vsock_pkt_info *info)
> > {
> > ...
> > /* we can send less than pkt_len bytes */
> > if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE)
> > pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
> >
> > /* virtio_transport_get_credit might return less than pkt_len credit */
> > pkt_len = virtio_transport_get_credit(vvs, pkt_len);
> >
> > /* Do not send zero length OP_RW pkt */
> > if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
> > return pkt_len;
> > ...
> > }
> >
> > then it queues the packet for the TX worker calling .send_pkt()
> > [vhost_transport_send_pkt() in the vhost_transport case]
> >
> > The main function executed by the TX worker is
> > vhost_transport_do_send_pkt() that picks up a buffer from the virtqueue
> > and it tries to copy the packet (up to 4K) on it. If the buffer
> > allocated from the guest will be smaller then 4K, I think here it will
> > be discarded with an error:
> >
I'm adding more lines to explain better.
> > static void
> > vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
> > struct vhost_virtqueue *vq)
> > {
...
head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
&out, &in, NULL, NULL);
...
len = iov_length(&vq->iov[out], in);
iov_iter_init(&iov_iter, READ, &vq->iov[out], in, len);
nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
if (nbytes != sizeof(pkt->hdr)) {
virtio_transport_free_pkt(pkt);
vq_err(vq, "Faulted on copying pkt hdr\n");
break;
}
> > ...
> > nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
>
> isn't pck len the actual length though?
>
It is the length of the packet that we are copying in the guest RX
buffers pointed by the iov_iter. The guest allocates an iovec with 2
buffers, one for the header and one for the payload (4KB).
> > if (nbytes != pkt->len) {
> > virtio_transport_free_pkt(pkt);
> > vq_err(vq, "Faulted on copying pkt buf\n");
> > break;
> > }
> > ...
> > }
> >
> >
> > This series changes this behavior since now we will split the packet in
> > vhost_transport_do_send_pkt() depending on the buffer found in the
> > virtqueue.
> >
> > We didn't change the buffer size in this series, so we still backward
> > compatible, but if we will use buffers smaller than 4K, we should
> > encounter the error described above.
> >
> > How do you suggest we proceed if we want to change the buffer size?
> > Maybe adding a feature to "support any buffer size"?
> >
> > Thanks,
> > Stefano
>
>
--
^ permalink raw reply
* Re: [PATCH] tipc: compat: allow tipc commands without arguments
From: Ying Xue @ 2019-08-01 13:12 UTC (permalink / raw)
To: Taras Kondratiuk, Jon Maloy, David S. Miller
Cc: netdev, tipc-discussion, linux-kernel, xe-linux-external, stable
In-Reply-To: <20190729221507.48893-1-takondra@cisco.com>
On 7/30/19 6:15 AM, Taras Kondratiuk wrote:
> Commit 2753ca5d9009 ("tipc: fix uninit-value in tipc_nl_compat_doit")
> broke older tipc tools that use compat interface (e.g. tipc-config from
> tipcutils package):
>
> % tipc-config -p
> operation not supported
>
> The commit started to reject TIPC netlink compat messages that do not
> have attributes. It is too restrictive because some of such messages are
> valid (they don't need any arguments):
>
> % grep 'tx none' include/uapi/linux/tipc_config.h
> #define TIPC_CMD_NOOP 0x0000 /* tx none, rx none */
> #define TIPC_CMD_GET_MEDIA_NAMES 0x0002 /* tx none, rx media_name(s) */
> #define TIPC_CMD_GET_BEARER_NAMES 0x0003 /* tx none, rx bearer_name(s) */
> #define TIPC_CMD_SHOW_PORTS 0x0006 /* tx none, rx ultra_string */
> #define TIPC_CMD_GET_REMOTE_MNG 0x4003 /* tx none, rx unsigned */
> #define TIPC_CMD_GET_MAX_PORTS 0x4004 /* tx none, rx unsigned */
> #define TIPC_CMD_GET_NETID 0x400B /* tx none, rx unsigned */
> #define TIPC_CMD_NOT_NET_ADMIN 0xC001 /* tx none, rx none */
>
> This patch relaxes the original fix and rejects messages without
> arguments only if such arguments are expected by a command (reg_type is
> non zero).
>
> Fixes: 2753ca5d9009 ("tipc: fix uninit-value in tipc_nl_compat_doit")
> Cc: stable@vger.kernel.org
> Signed-off-by: Taras Kondratiuk <takondra@cisco.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
> ---
> The patch is based on v5.3-rc2.
>
> net/tipc/netlink_compat.c | 11 +++++++----
> 1 file changed, 7 insertions(+), 4 deletions(-)
>
> diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
> index d86030ef1232..e135d4e11231 100644
> --- a/net/tipc/netlink_compat.c
> +++ b/net/tipc/netlink_compat.c
> @@ -55,6 +55,7 @@ struct tipc_nl_compat_msg {
> int rep_type;
> int rep_size;
> int req_type;
> + int req_size;
> struct net *net;
> struct sk_buff *rep;
> struct tlv_desc *req;
> @@ -257,7 +258,8 @@ static int tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd,
> int err;
> struct sk_buff *arg;
>
> - if (msg->req_type && !TLV_CHECK_TYPE(msg->req, msg->req_type))
> + if (msg->req_type && (!msg->req_size ||
> + !TLV_CHECK_TYPE(msg->req, msg->req_type)))
> return -EINVAL;
>
> msg->rep = tipc_tlv_alloc(msg->rep_size);
> @@ -354,7 +356,8 @@ static int tipc_nl_compat_doit(struct tipc_nl_compat_cmd_doit *cmd,
> {
> int err;
>
> - if (msg->req_type && !TLV_CHECK_TYPE(msg->req, msg->req_type))
> + if (msg->req_type && (!msg->req_size ||
> + !TLV_CHECK_TYPE(msg->req, msg->req_type)))
> return -EINVAL;
>
> err = __tipc_nl_compat_doit(cmd, msg);
> @@ -1278,8 +1281,8 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info)
> goto send;
> }
>
> - len = nlmsg_attrlen(req_nlh, GENL_HDRLEN + TIPC_GENL_HDRLEN);
> - if (!len || !TLV_OK(msg.req, len)) {
> + msg.req_size = nlmsg_attrlen(req_nlh, GENL_HDRLEN + TIPC_GENL_HDRLEN);
> + if (msg.req_size && !TLV_OK(msg.req, msg.req_size)) {
> msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_SUPPORTED);
> err = -EOPNOTSUPP;
> goto send;
>
^ permalink raw reply
* Re: [PATCH v4 1/5] vsock/virtio: limit the memory used per-socket
From: Michael S. Tsirkin @ 2019-08-01 13:21 UTC (permalink / raw)
To: Stefano Garzarella
Cc: netdev, linux-kernel, Stefan Hajnoczi, David S. Miller,
virtualization, Jason Wang, kvm
In-Reply-To: <20190801104754.lb3ju5xjfmnxioii@steredhat>
On Thu, Aug 01, 2019 at 12:47:54PM +0200, Stefano Garzarella wrote:
> On Tue, Jul 30, 2019 at 04:42:25PM -0400, Michael S. Tsirkin wrote:
> > On Tue, Jul 30, 2019 at 11:35:39AM +0200, Stefano Garzarella wrote:
>
> (...)
>
> > >
> > > The problem here is the compatibility. Before this series virtio-vsock
> > > and vhost-vsock modules had the RX buffer size hard-coded
> > > (VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE = 4K). So, if we send a buffer smaller
> > > of 4K, there might be issues.
> >
> > Shouldn't be if they are following the spec. If not let's fix
> > the broken parts.
> >
> > >
> > > Maybe it is the time to add add 'features' to virtio-vsock device.
> > >
> > > Thanks,
> > > Stefano
> >
> > Why would a remote care about buffer sizes?
> >
> > Let's first see what the issues are. If they exist
> > we can either fix the bugs, or code the bug as a feature in spec.
> >
>
> The vhost_transport '.stream_enqueue' callback
> [virtio_transport_stream_enqueue()] calls the virtio_transport_send_pkt_info(),
> passing the user message. This function allocates a new packet, copying
> the user message, but (before this series) it limits the packet size to
> the VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (4K):
>
> static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
> struct virtio_vsock_pkt_info *info)
> {
> ...
> /* we can send less than pkt_len bytes */
> if (pkt_len > VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE)
> pkt_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
>
> /* virtio_transport_get_credit might return less than pkt_len credit */
> pkt_len = virtio_transport_get_credit(vvs, pkt_len);
>
> /* Do not send zero length OP_RW pkt */
> if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
> return pkt_len;
> ...
> }
>
> then it queues the packet for the TX worker calling .send_pkt()
> [vhost_transport_send_pkt() in the vhost_transport case]
>
> The main function executed by the TX worker is
> vhost_transport_do_send_pkt() that picks up a buffer from the virtqueue
> and it tries to copy the packet (up to 4K) on it. If the buffer
> allocated from the guest will be smaller then 4K, I think here it will
> be discarded with an error:
>
> static void
> vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
> struct vhost_virtqueue *vq)
> {
> ...
> nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
isn't pck len the actual length though?
> if (nbytes != pkt->len) {
> virtio_transport_free_pkt(pkt);
> vq_err(vq, "Faulted on copying pkt buf\n");
> break;
> }
> ...
> }
>
>
> This series changes this behavior since now we will split the packet in
> vhost_transport_do_send_pkt() depending on the buffer found in the
> virtqueue.
>
> We didn't change the buffer size in this series, so we still backward
> compatible, but if we will use buffers smaller than 4K, we should
> encounter the error described above.
>
> How do you suggest we proceed if we want to change the buffer size?
> Maybe adding a feature to "support any buffer size"?
>
> Thanks,
> Stefano
^ permalink raw reply
* [PATCH] net: sched: use temporary variable for actions indexes
From: dmitrolin @ 2019-08-01 13:02 UTC (permalink / raw)
To: netdev; +Cc: davem, jiri, jhs, xiyou.wangcong, Dmytro Linkin, Vlad Buslov
From: Dmytro Linkin <dmitrolin@mellanox.com>
Currently init call of all actions (except ipt) init their 'parm'
structure as a direct pointer to nla data in skb. This leads to race
condition when some of the filter actions were initialized successfully
(and were assigned with idr action index that was written directly
into nla data), but then were deleted and retried (due to following
action module missing or classifier-initiated retry), in which case
action init code tries to insert action to idr with index that was
assigned on previous iteration. During retry the index can be reused
by another action that was inserted concurrently, which causes
unintended action sharing between filters.
To fix described race condition, save action idr index to temporary
stack-allocated variable instead on nla data.
Fixes: 0190c1d452a9 ("net: sched: atomically check-allocate action")
Signed-off-by: Dmytro Linkin <dmitrolin@mellanox.com>
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
---
net/sched/act_bpf.c | 9 +++++----
net/sched/act_connmark.c | 9 +++++----
net/sched/act_csum.c | 9 +++++----
net/sched/act_ct.c | 9 +++++----
net/sched/act_ctinfo.c | 9 +++++----
net/sched/act_gact.c | 8 +++++---
net/sched/act_ife.c | 8 +++++---
net/sched/act_mirred.c | 13 +++++++------
net/sched/act_mpls.c | 8 +++++---
net/sched/act_nat.c | 9 +++++----
net/sched/act_pedit.c | 10 ++++++----
net/sched/act_police.c | 8 +++++---
net/sched/act_sample.c | 10 +++++-----
net/sched/act_simple.c | 10 ++++++----
net/sched/act_skbedit.c | 11 ++++++-----
net/sched/act_skbmod.c | 11 ++++++-----
net/sched/act_tunnel_key.c | 8 +++++---
net/sched/act_vlan.c | 16 +++++++++-------
18 files changed, 100 insertions(+), 75 deletions(-)
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 8126b26..fd1f7e7 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -285,6 +285,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
struct tcf_bpf *prog;
bool is_bpf, is_ebpf;
int ret, res = 0;
+ u32 index;
if (!nla)
return -EINVAL;
@@ -298,13 +299,13 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
return -EINVAL;
parm = nla_data(tb[TCA_ACT_BPF_PARMS]);
-
- ret = tcf_idr_check_alloc(tn, &parm->index, act, bind);
+ index = parm->index;
+ ret = tcf_idr_check_alloc(tn, &index, act, bind);
if (!ret) {
- ret = tcf_idr_create(tn, parm->index, est, act,
+ ret = tcf_idr_create(tn, index, est, act,
&act_bpf_ops, bind, true);
if (ret < 0) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index ce36b0f..32ac04d 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -103,6 +103,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
struct tcf_connmark_info *ci;
struct tc_connmark *parm;
int ret = 0, err;
+ u32 index;
if (!nla)
return -EINVAL;
@@ -116,13 +117,13 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
return -EINVAL;
parm = nla_data(tb[TCA_CONNMARK_PARMS]);
-
- ret = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ ret = tcf_idr_check_alloc(tn, &index, a, bind);
if (!ret) {
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_connmark_ops, bind, false);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 621fb22..9b92882 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -52,6 +52,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
struct tc_csum *parm;
struct tcf_csum *p;
int ret = 0, err;
+ u32 index;
if (nla == NULL)
return -EINVAL;
@@ -64,13 +65,13 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
if (tb[TCA_CSUM_PARMS] == NULL)
return -EINVAL;
parm = nla_data(tb[TCA_CSUM_PARMS]);
-
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_csum_ops, bind, true);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
ret = ACT_P_CREATED;
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index b501ce0..33a1a74 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -666,6 +666,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
struct tc_ct *parm;
struct tcf_ct *c;
int err, res = 0;
+ u32 index;
if (!nla) {
NL_SET_ERR_MSG_MOD(extack, "Ct requires attributes to be passed");
@@ -681,16 +682,16 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
return -EINVAL;
}
parm = nla_data(tb[TCA_CT_PARMS]);
-
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
if (!err) {
- err = tcf_idr_create(tn, parm->index, est, a,
+ err = tcf_idr_create(tn, index, est, a,
&act_ct_ops, bind, true);
if (err) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return err;
}
res = ACT_P_CREATED;
diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
index 10eb2bb..06ef74b 100644
--- a/net/sched/act_ctinfo.c
+++ b/net/sched/act_ctinfo.c
@@ -157,10 +157,10 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
struct netlink_ext_ack *extack)
{
struct tc_action_net *tn = net_generic(net, ctinfo_net_id);
+ u32 dscpmask = 0, dscpstatemask, index;
struct nlattr *tb[TCA_CTINFO_MAX + 1];
struct tcf_ctinfo_params *cp_new;
struct tcf_chain *goto_ch = NULL;
- u32 dscpmask = 0, dscpstatemask;
struct tc_ctinfo *actparm;
struct tcf_ctinfo *ci;
u8 dscpmaskshift;
@@ -206,12 +206,13 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla,
}
/* done the validation:now to the actual action allocation */
- err = tcf_idr_check_alloc(tn, &actparm->index, a, bind);
+ index = actparm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
- ret = tcf_idr_create(tn, actparm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_ctinfo_ops, bind, false);
if (ret) {
- tcf_idr_cleanup(tn, actparm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
ret = ACT_P_CREATED;
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index b2380c5..8f0140c 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -61,6 +61,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
struct tc_gact *parm;
struct tcf_gact *gact;
int ret = 0;
+ u32 index;
int err;
#ifdef CONFIG_GACT_PROB
struct tc_gact_p *p_parm = NULL;
@@ -77,6 +78,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
if (tb[TCA_GACT_PARMS] == NULL)
return -EINVAL;
parm = nla_data(tb[TCA_GACT_PARMS]);
+ index = parm->index;
#ifndef CONFIG_GACT_PROB
if (tb[TCA_GACT_PROB] != NULL)
@@ -94,12 +96,12 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
}
#endif
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_gact_ops, bind, true);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
ret = ACT_P_CREATED;
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 41d5398..bd1a541 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -479,6 +479,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
u8 *saddr = NULL;
bool exists = false;
int ret = 0;
+ u32 index;
int err;
err = nla_parse_nested_deprecated(tb, TCA_IFE_MAX, nla, ife_policy,
@@ -502,7 +503,8 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
if (!p)
return -ENOMEM;
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0) {
kfree(p);
return err;
@@ -514,10 +516,10 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla,
}
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, est, a, &act_ife_ops,
+ ret = tcf_idr_create(tn, index, est, a, &act_ife_ops,
bind, true);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
kfree(p);
return ret;
}
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 055faa2..be3f88d 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -104,6 +104,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
struct net_device *dev;
bool exists = false;
int ret, err;
+ u32 index;
if (!nla) {
NL_SET_ERR_MSG_MOD(extack, "Mirred requires attributes to be passed");
@@ -118,8 +119,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
return -EINVAL;
}
parm = nla_data(tb[TCA_MIRRED_PARMS]);
-
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
@@ -136,21 +137,21 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option");
return -EINVAL;
}
if (!exists) {
if (!parm->ifindex) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist");
return -EINVAL;
}
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_mirred_ops, bind, true);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
ret = ACT_P_CREATED;
diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c
index ca2597c..0f299e3 100644
--- a/net/sched/act_mpls.c
+++ b/net/sched/act_mpls.c
@@ -138,6 +138,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla,
struct tcf_mpls *m;
int ret = 0, err;
u8 mpls_ttl = 0;
+ u32 index;
if (!nla) {
NL_SET_ERR_MSG_MOD(extack, "Missing netlink attributes");
@@ -153,6 +154,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla,
return -EINVAL;
}
parm = nla_data(tb[TCA_MPLS_PARMS]);
+ index = parm->index;
/* Verify parameters against action type. */
switch (parm->m_action) {
@@ -209,7 +211,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla,
return -EINVAL;
}
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
@@ -217,10 +219,10 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla,
return 0;
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_mpls_ops, bind, true);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 45923eb..7b858c1 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -44,6 +44,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
struct tc_nat *parm;
int ret = 0, err;
struct tcf_nat *p;
+ u32 index;
if (nla == NULL)
return -EINVAL;
@@ -56,13 +57,13 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
if (tb[TCA_NAT_PARMS] == NULL)
return -EINVAL;
parm = nla_data(tb[TCA_NAT_PARMS]);
-
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_nat_ops, bind, false);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
ret = ACT_P_CREATED;
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 45e9d6b..17360c6 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -149,6 +149,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
struct tcf_pedit *p;
int ret = 0, err;
int ksize;
+ u32 index;
if (!nla) {
NL_SET_ERR_MSG_MOD(extack, "Pedit requires attributes to be passed");
@@ -179,18 +180,19 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
if (IS_ERR(keys_ex))
return PTR_ERR(keys_ex);
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (!err) {
if (!parm->nkeys) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed");
ret = -EINVAL;
goto out_free;
}
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_pedit_ops, bind, false);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
goto out_free;
}
ret = ACT_P_CREATED;
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index a065f62..49cec3e 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -57,6 +57,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
struct tc_action_net *tn = net_generic(net, police_net_id);
struct tcf_police_params *new;
bool exists = false;
+ u32 index;
if (nla == NULL)
return -EINVAL;
@@ -73,7 +74,8 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
return -EINVAL;
parm = nla_data(tb[TCA_POLICE_TBF]);
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
@@ -81,10 +83,10 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
return 0;
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, NULL, a,
+ ret = tcf_idr_create(tn, index, NULL, a,
&act_police_ops, bind, true);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
ret = ACT_P_CREATED;
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 274d7a0..595308d 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -41,8 +41,8 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
struct tc_action_net *tn = net_generic(net, sample_net_id);
struct nlattr *tb[TCA_SAMPLE_MAX + 1];
struct psample_group *psample_group;
+ u32 psample_group_num, rate, index;
struct tcf_chain *goto_ch = NULL;
- u32 psample_group_num, rate;
struct tc_sample *parm;
struct tcf_sample *s;
bool exists = false;
@@ -59,8 +59,8 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
return -EINVAL;
parm = nla_data(tb[TCA_SAMPLE_PARMS]);
-
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
@@ -68,10 +68,10 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla,
return 0;
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_sample_ops, bind, true);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
ret = ACT_P_CREATED;
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index f28ddba..33aefa2 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -95,6 +95,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
struct tcf_defact *d;
bool exists = false;
int ret = 0, err;
+ u32 index;
if (nla == NULL)
return -EINVAL;
@@ -108,7 +109,8 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
return -EINVAL;
parm = nla_data(tb[TCA_DEF_PARMS]);
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
@@ -119,15 +121,15 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return -EINVAL;
}
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_simp_ops, bind, false);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 215a067..b100870 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -99,6 +99,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
u16 *queue_mapping = NULL, *ptype = NULL;
bool exists = false;
int ret = 0, err;
+ u32 index;
if (nla == NULL)
return -EINVAL;
@@ -146,8 +147,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
}
parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
-
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
@@ -158,15 +159,15 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return -EINVAL;
}
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_skbedit_ops, bind, true);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index 4f07706..7da3518 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -87,12 +87,12 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
struct tcf_skbmod_params *p, *p_old;
struct tcf_chain *goto_ch = NULL;
struct tc_skbmod *parm;
+ u32 lflags = 0, index;
struct tcf_skbmod *d;
bool exists = false;
u8 *daddr = NULL;
u8 *saddr = NULL;
u16 eth_type = 0;
- u32 lflags = 0;
int ret = 0, err;
if (!nla)
@@ -122,10 +122,11 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
}
parm = nla_data(tb[TCA_SKBMOD_PARMS]);
+ index = parm->index;
if (parm->flags & SKBMOD_F_SWAPMAC)
lflags = SKBMOD_F_SWAPMAC;
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
@@ -136,15 +137,15 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return -EINVAL;
}
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_skbmod_ops, bind, true);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 10dffda..6d0debd 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -225,6 +225,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
__be16 flags = 0;
u8 tos, ttl;
int ret = 0;
+ u32 index;
int err;
if (!nla) {
@@ -245,7 +246,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
}
parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]);
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
@@ -345,7 +347,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
}
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_tunnel_key_ops, bind, true);
if (ret) {
NL_SET_ERR_MSG(extack, "Cannot create TC IDR");
@@ -403,7 +405,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 9269d35..984b05a 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -116,6 +116,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
u8 push_prio = 0;
bool exists = false;
int ret = 0, err;
+ u32 index;
if (!nla)
return -EINVAL;
@@ -128,7 +129,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
if (!tb[TCA_VLAN_PARMS])
return -EINVAL;
parm = nla_data(tb[TCA_VLAN_PARMS]);
- err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ index = parm->index;
+ err = tcf_idr_check_alloc(tn, &index, a, bind);
if (err < 0)
return err;
exists = err;
@@ -144,7 +146,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return -EINVAL;
}
push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]);
@@ -152,7 +154,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return -ERANGE;
}
@@ -166,7 +168,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return -EPROTONOSUPPORT;
}
} else {
@@ -180,16 +182,16 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
if (exists)
tcf_idr_release(*a, bind);
else
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return -EINVAL;
}
action = parm->v_action;
if (!exists) {
- ret = tcf_idr_create(tn, parm->index, est, a,
+ ret = tcf_idr_create(tn, index, est, a,
&act_vlan_ops, bind, true);
if (ret) {
- tcf_idr_cleanup(tn, parm->index);
+ tcf_idr_cleanup(tn, index);
return ret;
}
--
1.8.3.1
^ permalink raw reply related
* [iproute2,rfc] bridge: mdb: Extend with with LLADDR
From: Horatiu Vultur @ 2019-08-01 12:51 UTC (permalink / raw)
To: nikolay, idosch, andrew, allan.nielsen
Cc: davem, roopa, petrm, tglx, fw, netdev, linux-kernel, bridge,
Horatiu Vultur
Extend bridge mdb command to accept as group also link layer multicast
addresss. The old behaviour is not change.
To add new mdb entry:
bridge mdb add dev br0 port eth0 grp 11:22:33:44:55:66 permanent vid 1
To display existing entries:
bridge mdb
dev br0 port eth4 grp 01:00:00:00:00:01 permanent offload vid 1
Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
---
bridge/mdb.c | 29 ++++++++++++++++++++++++-----
include/uapi/linux/if_bridge.h | 1 +
2 files changed, 25 insertions(+), 5 deletions(-)
diff --git a/bridge/mdb.c b/bridge/mdb.c
index 928ae56..057c0b6 100644
--- a/bridge/mdb.c
+++ b/bridge/mdb.c
@@ -138,9 +138,21 @@ static void print_mdb_entry(FILE *f, int ifindex, const struct br_mdb_entry *e,
print_string(PRINT_ANY, "port", " port %s",
ll_index_to_name(e->ifindex));
- print_color_string(PRINT_ANY, ifa_family_color(af),
- "grp", " grp %s",
- inet_ntop(af, src, abuf, sizeof(abuf)));
+ if (e->addr.proto == htons(ETH_P_IP) ||
+ e->addr.proto == htons(ETH_P_IPV6)) {
+ print_color_string(PRINT_ANY, ifa_family_color(af),
+ "grp", " grp %s",
+ inet_ntop(af, src, abuf, sizeof(abuf)));
+ } else {
+ const char *lladdr;
+ SPRINT_BUF(b1);
+
+ lladdr = ll_addr_n2a(e->addr.u.mac, ETH_ALEN, 0, b1,
+ sizeof(b1));
+
+ print_color_string(PRINT_ANY, COLOR_MAC, "grp", " grp %s",
+ lladdr);
+ }
print_string(PRINT_ANY, "state", " %s",
(e->state & MDB_PERMANENT) ? "permanent" : "temp");
@@ -380,6 +392,7 @@ static int mdb_modify(int cmd, int flags, int argc, char **argv)
};
struct br_mdb_entry entry = {};
char *d = NULL, *p = NULL, *grp = NULL;
+ char abuf[ETH_ALEN];
short vid = 0;
while (argc > 0) {
@@ -422,8 +435,14 @@ static int mdb_modify(int cmd, int flags, int argc, char **argv)
if (!inet_pton(AF_INET, grp, &entry.addr.u.ip4)) {
if (!inet_pton(AF_INET6, grp, &entry.addr.u.ip6)) {
- fprintf(stderr, "Invalid address \"%s\"\n", grp);
- return -1;
+ if (sscanf(grp, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
+ abuf, abuf+1, abuf+2, abuf+3, abuf+4,
+ abuf+5) != 6) {
+ fprintf(stderr, "Invalid address \"%s\"\n", grp);
+ return -1;
+ }
+ memcpy(entry.addr.u.mac, abuf, ETH_ALEN);
+ entry.addr.proto = 0;
} else
entry.addr.proto = htons(ETH_P_IPV6);
} else
diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 04f763c..88aad9c 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -243,6 +243,7 @@ struct br_mdb_entry {
union {
__be32 ip4;
struct in6_addr ip6;
+ __u8 mac[ETH_ALEN];
} u;
__be16 proto;
} addr;
--
2.7.4
^ permalink raw reply related
* [net-next,rfc] net: bridge: mdb: Extend with multicast LLADDR
From: Horatiu Vultur @ 2019-08-01 12:50 UTC (permalink / raw)
To: nikolay, idosch, andrew, allan.nielsen
Cc: davem, roopa, petrm, tglx, fw, netdev, linux-kernel, bridge,
Horatiu Vultur
Based on the discussion on the topic[1], we extend the functionality of
the 'bridge mdb' command to accept link layer multicast address. This
required only few changes and it fits nicely with the current
implementation and also the old implementation was not changed.
In this patch, we have added a MAC address to the union in 'struct br_ip'.
If we want to continue like this we should properly rename the structure as
it is not an IP any more.
To create a group for two of the front ports the following entries can
be added:
bridge mdb add dev br0 port eth0 grp 01:00:00:00:00:04 permanent vid 1
bridge mdb add dev br0 port eth1 grp 01:00:00:00:00:04 permanent vid 1
Now the entries will be display as following:
dev br0 port eth0 grp 01:00:00:00:00:04 permanent offload vid 1
dev br0 port eth1 grp 01:00:00:00:00:04 permanent offload vid 1
This requires changes to iproute2 as well, see the follogin patch for that.
Now if frame with dmac '01:00:00:00:00:04' will arrive at one of the front
ports. If we have HW offload support, then the frame will be forwarded by
the switch, and need not to go to the CPU. In a pure SW world, the frame is
forwarded by the SW bridge, which will flooded it only the ports which are
part of the group.
So far so good. This is an important part of the problem we wanted to solve.
But, there is one drawback of this approach. If you want to add two of the
front ports and br0 to receive the frame then I can't see a way of doing it
with the bridge mdb command. To do that it requireds many more changes to
the existing code.
Example:
bridge mdb add dev br0 port eth0 grp 01:00:00:00:00:04 permanent vid 1
bridge mdb add dev br0 port eth1 grp 01:00:00:00:00:04 permanent vid 1
bridge mdb add dev br0 port br0 grp 01:00:00:00:00:04 permanent vid 1 // This looks wrong.
We believe we come a long way by re-using the facilities in MDB (thanks for
convincing us in doing this), but we are still not completely happy with
the result.
If I only look at the user-interface (iproute2), and completely ignore all
the implementation details, then I still think that the FDB sub command is
more suitable for this. Today, FDB is for unicast, and MDB is for multicast.
We could change this such that MDB is for IP-multicast, and FDB is
forwarding in general (we should prevent FDB in install IP-multicast rules,
but we suggest to allow it to install MAC-Multicast rules).
The example from above would now look like this:
bridge fdb add 01:00:00:00:00:04 dev eth0 static self master
bridge fdb add 01:00:00:00:00:04 dev eth1 static self master
bridge fdb add 01:00:00:00:00:04 dev br0 static self master
It would be very similar to the "bridge vlan" command which also allow to
specify groups with and without br0.
Next observation is on the hashing data structure. In 'net_bridge_mdb_entry'
we only use/need the net_bridge_port_group/ports member (and the MAC
address, which we hacked into the br_ip struct) when we are a L2-multicast
entry. This type allow use to re-use the br_multicast_flood function
which does a lot of the work for us.
Also, the key used to do the lookup in the FDB is already a MAC address
(no need to hack the br_ip).
Regarding the events generated by switchdev: In the current proposal this
is a SWITCHDEV_OBJ_ID_PORT_MDB which refer to the switchdev_obj_port_mdb
type. If we changed to use the SWITCHDEV_FDB_ADD_TO_BRIDGE event, then
the associated data type would be switchdev_notifier_fdb_info - which also
has the information we need.
Using the FDB database, can still reuse the net_bridge_port_group type (and
associated functions), and I other parts from the MDB call graph as well.
If this sounds appealing, then we can do a proposal based on the idea.
If the MDB patch is what we can agree on, then we will continue polish this
and look for a solution to control the inclusion/exclusion of the br0
device (hints will be most appreciated).
[1] https://patchwork.ozlabs.org/patch/1136878/
Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Co-developed-by: Allan W. Nielsen <allan.nielsen@microchip.com>
Signed-off-by: Allan W. Nielsen <allan.nielsen@microchip.com>
---
include/linux/if_bridge.h | 1 +
include/uapi/linux/if_bridge.h | 1 +
net/bridge/br_device.c | 7 +++++--
net/bridge/br_forward.c | 3 ++-
net/bridge/br_input.c | 13 ++++++++++--
net/bridge/br_mdb.c | 47 +++++++++++++++++++++++++++++++++++-------
net/bridge/br_multicast.c | 4 +++-
net/bridge/br_private.h | 3 ++-
8 files changed, 64 insertions(+), 15 deletions(-)
diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index f3fab5d..07b092a 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -16,6 +16,7 @@
struct br_ip {
union {
__be32 ip4;
+ __u8 mac[ETH_ALEN];
#if IS_ENABLED(CONFIG_IPV6)
struct in6_addr ip6;
#endif
diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 773e476..e535a81 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -243,6 +243,7 @@ struct br_mdb_entry {
union {
__be32 ip4;
struct in6_addr ip6;
+ __u8 mac[ETH_ALEN];
} u;
__be16 proto;
} addr;
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index c05def8..b2d9041 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -83,7 +83,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
br_flood(br, skb, BR_PKT_BROADCAST, false, true);
} else if (is_multicast_ether_addr(dest)) {
if (unlikely(netpoll_tx_running(dev))) {
- br_flood(br, skb, BR_PKT_MULTICAST, false, true);
+ br_flood(br, skb, BR_PKT_MULTICAST_IP, false, true);
goto out;
}
if (br_multicast_rcv(br, NULL, skb, vid)) {
@@ -95,8 +95,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
br_multicast_querier_exists(br, eth_hdr(skb)))
br_multicast_flood(mdst, skb, false, true);
+ else if (mdst && skb->protocol != htons(ETH_P_IP) &&
+ skb->protocol != htons(ETH_P_IPV6))
+ br_multicast_flood(mdst, skb, false, true);
else
- br_flood(br, skb, BR_PKT_MULTICAST, false, true);
+ br_flood(br, skb, BR_PKT_MULTICAST_IP, false, true);
} else if ((dst = br_fdb_find_rcu(br, dest, vid)) != NULL) {
br_forward(dst->dst, skb, false, true);
} else {
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 8663700..36b58e8 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -203,7 +203,8 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
if (!(p->flags & BR_FLOOD))
continue;
break;
- case BR_PKT_MULTICAST:
+ case BR_PKT_MULTICAST_IP:
+ case BR_PKT_MULTICAST_L2:
if (!(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev)
continue;
break;
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 21b74e7..a7db0c5 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -99,9 +99,13 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
pkt_type = BR_PKT_BROADCAST;
local_rcv = true;
} else {
- pkt_type = BR_PKT_MULTICAST;
+ pkt_type = BR_PKT_MULTICAST_IP;
if (br_multicast_rcv(br, p, skb, vid))
goto drop;
+
+ if (skb->protocol != htons(ETH_P_IP) &&
+ skb->protocol != htons(ETH_P_IPV6))
+ pkt_type = BR_PKT_MULTICAST_L2;
}
}
@@ -129,7 +133,12 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
}
switch (pkt_type) {
- case BR_PKT_MULTICAST:
+ case BR_PKT_MULTICAST_L2:
+ mdst = br_mdb_get(br, skb, vid);
+ if (mdst)
+ mcast_hit = true;
+ break;
+ case BR_PKT_MULTICAST_IP:
mdst = br_mdb_get(br, skb, vid);
if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
br_multicast_querier_exists(br, eth_hdr(skb))) {
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index bf6acd3..b337a30 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -67,12 +67,19 @@ static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip)
memset(ip, 0, sizeof(struct br_ip));
ip->vid = entry->vid;
ip->proto = entry->addr.proto;
- if (ip->proto == htons(ETH_P_IP))
+ switch (ip->proto) {
+ case htons(ETH_P_IP):
ip->u.ip4 = entry->addr.u.ip4;
+ break;
#if IS_ENABLED(CONFIG_IPV6)
- else
+ case htons(ETH_P_IPV6):
ip->u.ip6 = entry->addr.u.ip6;
+ break;
#endif
+ default:
+ ether_addr_copy(ip->u.mac, entry->addr.u.mac);
+ break;
+ }
}
static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
@@ -117,12 +124,19 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
e.ifindex = port->dev->ifindex;
e.vid = p->addr.vid;
__mdb_entry_fill_flags(&e, p->flags);
- if (p->addr.proto == htons(ETH_P_IP))
+ switch (p->addr.proto) {
+ case htons(ETH_P_IP):
e.addr.u.ip4 = p->addr.u.ip4;
+ break;
#if IS_ENABLED(CONFIG_IPV6)
- if (p->addr.proto == htons(ETH_P_IPV6))
+ case htons(ETH_P_IPV6):
e.addr.u.ip6 = p->addr.u.ip6;
+ break;
#endif
+ default:
+ ether_addr_copy(e.addr.u.mac, p->addr.u.mac);
+ break;
+ }
e.addr.proto = p->addr.proto;
nest_ent = nla_nest_start_noflag(skb,
MDBA_MDB_ENTRY_INFO);
@@ -322,12 +336,19 @@ static void br_mdb_switchdev_host_port(struct net_device *dev,
.vid = entry->vid,
};
- if (entry->addr.proto == htons(ETH_P_IP))
+ switch (entry->addr.proto) {
+ case htons(ETH_P_IP):
ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
+ break;
#if IS_ENABLED(CONFIG_IPV6)
- else
+ case htons(ETH_P_IPV6):
ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
+ break;
#endif
+ default:
+ ether_addr_copy(mdb.addr, entry->addr.u.mac);
+ break;
+ }
mdb.obj.orig_dev = dev;
switch (type) {
@@ -367,12 +388,19 @@ static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p,
int err = -ENOBUFS;
port_dev = __dev_get_by_index(net, entry->ifindex);
- if (entry->addr.proto == htons(ETH_P_IP))
+ switch (entry->addr.proto) {
+ case htons(ETH_P_IP):
ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
+ break;
#if IS_ENABLED(CONFIG_IPV6)
- else
+ case htons(ETH_P_IPV6):
ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
+ break;
#endif
+ default:
+ ether_addr_copy(mdb.addr, entry->addr.u.mac);
+ break;
+ }
mdb.obj.orig_dev = port_dev;
if (p && port_dev && type == RTM_NEWMDB) {
@@ -423,6 +451,7 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
#if IS_ENABLED(CONFIG_IPV6)
entry.addr.u.ip6 = group->u.ip6;
#endif
+ ether_addr_copy(group->u.mac, entry.addr.u.mac);
entry.vid = group->vid;
__mdb_entry_fill_flags(&entry, flags);
__br_mdb_notify(dev, port, &entry, type);
@@ -510,6 +539,8 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry)
if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6))
return false;
#endif
+ } else if (is_multicast_ether_addr(entry->addr.u.mac)) {
+ ;
} else
return false;
if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY)
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index de22c8f..01250c1 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -133,7 +133,9 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
break;
#endif
default:
- return NULL;
+ ip.proto = 0;
+ ether_addr_copy(ip.u.mac, eth_hdr(skb)->h_dest);
+ break;
}
return br_mdb_ip_get_rcu(br, &ip);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 159a0e2..60e2430d 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -590,7 +590,8 @@ void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
/* br_forward.c */
enum br_pkt_type {
BR_PKT_UNICAST,
- BR_PKT_MULTICAST,
+ BR_PKT_MULTICAST_IP,
+ BR_PKT_MULTICAST_L2,
BR_PKT_BROADCAST
};
int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb);
--
2.7.4
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox