* [bpf-next PATCH 2/2] bpf: test_maps add a test to catch kcm + sockmap
From: John Fastabend @ 2018-10-18 3:37 UTC (permalink / raw)
To: ast, daniel; +Cc: netdev, eric.dumazet
In-Reply-To: <20181018032125.22427.85747.stgit@john-Precision-Tower-5810>
Adding a socket to both sockmap and kcm is not supported due to
collision on sk_user_data usage.
If selftests is run without KCM support we will issue a warning
and continue with the tests.
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
tools/testing/selftests/bpf/Makefile | 2 -
tools/testing/selftests/bpf/sockmap_kcm.c | 14 ++++++
tools/testing/selftests/bpf/test_maps.c | 64 ++++++++++++++++++++++++++++-
3 files changed, 77 insertions(+), 3 deletions(-)
create mode 100644 tools/testing/selftests/bpf/sockmap_kcm.c
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index d99dd6f..f290554 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -28,7 +28,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \
test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \
- sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
+ sockmap_verdict_prog.o sockmap_kcm.o dev_cgroup.o sample_ret0.o test_tracepoint.o \
test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \
sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
diff --git a/tools/testing/selftests/bpf/sockmap_kcm.c b/tools/testing/selftests/bpf/sockmap_kcm.c
new file mode 100644
index 0000000..4377adc
--- /dev/null
+++ b/tools/testing/selftests/bpf/sockmap_kcm.c
@@ -0,0 +1,14 @@
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+#include "bpf_util.h"
+#include "bpf_endian.h"
+
+int _version SEC("version") = 1;
+
+SEC("socket_kcm")
+int bpf_prog1(struct __sk_buff *skb)
+{
+ return skb->len;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index 9b552c0..be20f1d 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -20,6 +20,7 @@
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/bpf.h>
+#include <linux/kcm.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
@@ -479,14 +480,16 @@ static void test_devmap(int task, void *data)
#define SOCKMAP_PARSE_PROG "./sockmap_parse_prog.o"
#define SOCKMAP_VERDICT_PROG "./sockmap_verdict_prog.o"
#define SOCKMAP_TCP_MSG_PROG "./sockmap_tcp_msg_prog.o"
+#define KCM_PROG "./sockmap_kcm.o"
static void test_sockmap(int tasks, void *data)
{
struct bpf_map *bpf_map_rx, *bpf_map_tx, *bpf_map_msg, *bpf_map_break;
- int map_fd_msg = 0, map_fd_rx = 0, map_fd_tx = 0, map_fd_break;
+ int map_fd_msg = 0, map_fd_rx = 0, map_fd_tx = 0, map_fd_break, kcm;
int ports[] = {50200, 50201, 50202, 50204};
int err, i, fd, udp, sfd[6] = {0xdeadbeef};
u8 buf[20] = {0x0, 0x5, 0x3, 0x2, 0x1, 0x0};
- int parse_prog, verdict_prog, msg_prog;
+ int parse_prog, verdict_prog, msg_prog, kcm_prog;
+ struct kcm_attach attach_info;
struct sockaddr_in addr;
int one = 1, s, sc, rc;
struct bpf_object *obj;
@@ -744,6 +747,62 @@ static void test_sockmap(int tasks, void *data)
goto out_sockmap;
}
+ /* Test adding a KCM socket into map */
+#define AF_KCM 41
+ kcm = socket(AF_KCM, SOCK_DGRAM, KCMPROTO_CONNECTED);
+ if (kcm == -1) {
+ printf("Warning, KCM+Sockmap could not be tested.\n");
+ goto skip_kcm;
+ }
+
+ err = bpf_prog_load(KCM_PROG,
+ BPF_PROG_TYPE_SOCKET_FILTER,
+ &obj, &kcm_prog);
+ if (err) {
+ printf("Failed to load SK_SKB parse prog\n");
+ goto out_sockmap;
+ }
+
+ i = 2;
+ memset(&attach_info, 0, sizeof(attach_info));
+ attach_info.fd = sfd[i];
+ attach_info.bpf_fd = kcm_prog;
+ err = ioctl(kcm, SIOCKCMATTACH, &attach_info);
+ if (!err) {
+ perror("Failed KCM attached to sockmap fd: ");
+ goto out_sockmap;
+ }
+
+ err = bpf_map_delete_elem(fd, &i);
+ if (err) {
+ printf("Failed delete sockmap from empty map %i %i\n", err, errno);
+ goto out_sockmap;
+ }
+
+ err = ioctl(kcm, SIOCKCMATTACH, &attach_info);
+ if (err) {
+ perror("Failed KCM attach");
+ goto out_sockmap;
+ }
+
+ err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY);
+ if (!err) {
+ printf("Failed sockmap attached KCM sock!\n");
+ goto out_sockmap;
+ }
+ err = ioctl(kcm, SIOCKCMUNATTACH, &attach_info);
+ if (err) {
+ printf("Failed detach KCM sock!\n");
+ goto out_sockmap;
+ }
+
+ err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY);
+ if (err) {
+ printf("Failed post-kcm update sockmap '%i:%i'\n",
+ i, sfd[i]);
+ goto out_sockmap;
+ }
+
/* Test map update elem afterwards fd lives in fd and map_fd */
for (i = 2; i < 6; i++) {
err = bpf_map_update_elem(map_fd_rx, &i, &sfd[i], BPF_ANY);
@@ -776,6 +835,7 @@ static void test_sockmap(int tasks, void *data)
}
}
+skip_kcm:
/* Put sfd[2] (sending fd below) into msg map to test sendmsg bpf */
i = 0;
err = bpf_map_update_elem(map_fd_msg, &i, &sfd[2], BPF_ANY);
^ permalink raw reply related
* [bpf-next PATCH 1/2] bpf: skmsg, fix psock create on existing kcm/tls port
From: John Fastabend @ 2018-10-18 3:37 UTC (permalink / raw)
To: ast, daniel; +Cc: netdev, eric.dumazet
In-Reply-To: <20181018032125.22427.85747.stgit@john-Precision-Tower-5810>
Before using the psock returned by sk_psock_get() when adding it to a
sockmap we need to ensure it is actually a sockmap based psock.
Previously we were only checking this after incrementing the reference
counter which was an error. This resulted in a slab-out-of-bounds
error when the psock was not actually a sockmap type.
This moves the check up so the reference counter is only used
if it is a sockmap psock.
Eric reported the following KASAN BUG,
BUG: KASAN: slab-out-of-bounds in atomic_read include/asm-generic/atomic-instrumented.h:21 [inline]
BUG: KASAN: slab-out-of-bounds in refcount_inc_not_zero_checked+0x97/0x2f0 lib/refcount.c:120
Read of size 4 at addr ffff88019548be58 by task syz-executor4/22387
CPU: 1 PID: 22387 Comm: syz-executor4 Not tainted 4.19.0-rc7+ #264
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x1c4/0x2b4 lib/dump_stack.c:113
print_address_description.cold.8+0x9/0x1ff mm/kasan/report.c:256
kasan_report_error mm/kasan/report.c:354 [inline]
kasan_report.cold.9+0x242/0x309 mm/kasan/report.c:412
check_memory_region_inline mm/kasan/kasan.c:260 [inline]
check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267
kasan_check_read+0x11/0x20 mm/kasan/kasan.c:272
atomic_read include/asm-generic/atomic-instrumented.h:21 [inline]
refcount_inc_not_zero_checked+0x97/0x2f0 lib/refcount.c:120
sk_psock_get include/linux/skmsg.h:379 [inline]
sock_map_link.isra.6+0x41f/0xe30 net/core/sock_map.c:178
sock_hash_update_common+0x19b/0x11e0 net/core/sock_map.c:669
sock_hash_update_elem+0x306/0x470 net/core/sock_map.c:738
map_update_elem+0x819/0xdf0 kernel/bpf/syscall.c:818
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface")
---
0 files changed
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 677b673..f44ca6b 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -275,11 +275,6 @@ static inline struct sk_psock *sk_psock(const struct sock *sk)
return rcu_dereference_sk_user_data(sk);
}
-static inline bool sk_has_psock(struct sock *sk)
-{
- return sk_psock(sk) != NULL && sk->sk_prot->recvmsg == tcp_bpf_recvmsg;
-}
-
static inline void sk_psock_queue_msg(struct sk_psock *psock,
struct sk_msg *msg)
{
@@ -379,6 +374,26 @@ static inline bool sk_psock_test_state(const struct sk_psock *psock,
return test_bit(bit, &psock->state);
}
+static inline struct sk_psock *sk_psock_get_checked(struct sock *sk)
+{
+ struct sk_psock *psock;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (psock) {
+ if (sk->sk_prot->recvmsg != tcp_bpf_recvmsg) {
+ psock = ERR_PTR(-EBUSY);
+ goto out;
+ }
+
+ if (!refcount_inc_not_zero(&psock->refcnt))
+ psock = NULL;
+ }
+out:
+ rcu_read_unlock();
+ return psock;
+}
+
static inline struct sk_psock *sk_psock_get(struct sock *sk)
{
struct sk_psock *psock;
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 3c0e44c..be6092a 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -175,12 +175,13 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
}
}
- psock = sk_psock_get(sk);
+ psock = sk_psock_get_checked(sk);
+ if (IS_ERR(psock)) {
+ ret = PTR_ERR(psock);
+ goto out_progs;
+ }
+
if (psock) {
- if (!sk_has_psock(sk)) {
- ret = -EBUSY;
- goto out_progs;
- }
if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
(skb_progs && READ_ONCE(psock->progs.skb_parser))) {
sk_psock_put(sk, psock);
^ permalink raw reply related
* [bpf-next PATCH 0/2] Fix kcm + sockmap by checking psock type
From: John Fastabend @ 2018-10-18 3:37 UTC (permalink / raw)
To: ast, daniel; +Cc: netdev, eric.dumazet
We check if the sk_user_data (the psock in skmsg) is in fact a sockmap
type to late, after we read the refcnt which is an error. This
series moves the check up before reading refcnt and also adds a test
to test_maps to test trying to add a KCM socket into a sockmap.
While reviewig this code I also found an issue with KCM and kTLS
where each uses sk_data_ready hooks and associated stream parser
breaking expectations in kcm, ktls or both. But that fix will need
to go to net.
Thanks to Eric for reporting.
---
John Fastabend (2):
bpf: skmsg, fix psock create on existing kcm/tls port
bpf: test_maps add a test to catch kcm + sockmap
tools/testing/selftests/bpf/Makefile | 2 -
tools/testing/selftests/bpf/sockmap_kcm.c | 14 ++++++
tools/testing/selftests/bpf/test_maps.c | 64 ++++++++++++++++++++++++++++-
3 files changed, 77 insertions(+), 3 deletions(-)
create mode 100644 tools/testing/selftests/bpf/sockmap_kcm.c
^ permalink raw reply
* Re: [PATCH bpf-next v3 00/13] bpf: add btf func info support
From: Alexei Starovoitov @ 2018-10-18 3:24 UTC (permalink / raw)
To: Yonghong Song; +Cc: ast, kafai, daniel, netdev, kernel-team
In-Reply-To: <20181017233026.1071519-1-yhs@fb.com>
On Wed, Oct 17, 2018 at 04:30:22PM -0700, Yonghong Song wrote:
>
> This patch added func info support to the kernel so we can
> get better ksym's for bpf function calls. Basically,
> pairs of bpf function calls and their corresponding types
> are passed to the kernel. Extracting function names from
> the types, the kernel is able to construct a ksym for
> each function call with embedded function name.
...
> Below is a demonstration from Patch #13.
> $ bpftool prog dump jited id 1
> int _dummy_tracepoint(struct dummy_tracepoint_args * ):
> bpf_prog_b07ccb89267cf242__dummy_tracepoint:
> 0: push %rbp
> 1: mov %rsp,%rbp
> ......
> 3c: add $0x28,%rbp
> 40: leaveq
> 41: retq
>
> int test_long_fname_1(struct dummy_tracepoint_args * ):
> bpf_prog_2dcecc18072623fc_test_long_fname_1:
> 0: push %rbp
> 1: mov %rsp,%rbp
Considering we only had 16-byte names for main prog and
no names at all for subprogs, this is huge improvement
in BPF introspection.
Cannot wait for the followup patches with line info support.
For the set:
Acked-by: Alexei Starovoitov <ast@kernel.org>
^ permalink raw reply
* Re: [RFC] VSOCK: The performance problem of vhost_vsock.
From: Jason Wang @ 2018-10-18 2:45 UTC (permalink / raw)
To: jiangyiwen, stefanha; +Cc: netdev, kvm, virtualization
In-Reply-To: <5BC7E04E.9000002@huawei.com>
On 2018/10/18 上午9:22, jiangyiwen wrote:
> On 2018/10/17 20:31, Jason Wang wrote:
>> On 2018/10/17 下午7:41, jiangyiwen wrote:
>>> On 2018/10/17 17:51, Jason Wang wrote:
>>>> On 2018/10/17 下午5:39, Jason Wang wrote:
>>>>>> Hi Jason and Stefan,
>>>>>>
>>>>>> Maybe I find the reason of bad performance.
>>>>>>
>>>>>> I found pkt_len is limited to VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE(4K),
>>>>>> it will cause the bandwidth is limited to 500~600MB/s. And once I
>>>>>> increase to 64k, it can improve about 3 times(~1500MB/s).
>>>>> Looks like the value was chosen for a balance between rx buffer size and performance. Allocating 64K always even for small packet is kind of waste and stress for guest memory. Virito-net try to avoid this by inventing the merge able rx buffer which allows big packet to be scattered in into different buffers. We can reuse this idea or revisit the idea of using virtio-net/vhost-net as a transport of vsock.
>>>>>
>>>>> What interesting is the performance is still behind vhost-net.
>>>>>
>>>>> Thanks
>>>>>
>>>>>> By the way, I send to 64K in application once, and I don't use
>>>>>> sg_init_one and rewrite function to packet sg list because pkt_len
>>>>>> include multiple pages.
>>>>>>
>>>>>> Thanks,
>>>>>> Yiwen.
>>>> Btw, if you're using vsock for transferring large files, maybe it's more efficient to implement sendpage() for vsock to allow sendfile()/splice() work.
>>>>
>>>> Thanks
>>>>
>>> I can't agree more.
>>>
>>> why vhost_vsock is still behind vhost_net?
>>> Because I use sendfile() to test performance at first, and then
>>> I found vsock don't implement sendpage() and cause the bandwidth
>>> can't be increased. So I use read() and send() to replace sendfile(),
>>> it will increase some switch between kernel and user mode, and sendfile()
>>> can support zero copy. I think this is main reason.
>>>
>>> Thanks.
>>
>> Want to post patches for this then :) ?
>>
>> Thanks
>>
> I may not post patches at the moment because there are other tasks.
>
> After a period of time, I will consider implement the feature.
>
> Thanks.
That's fine.
Thanks
^ permalink raw reply
* Re: [RFC] virtio_net: add local_bh_disable() around u64_stats_update_begin
From: Toshiaki Makita @ 2018-10-18 9:53 UTC (permalink / raw)
To: Sebastian Andrzej Siewior
Cc: Michael S. Tsirkin, netdev, virtualization, tglx, David S. Miller
In-Reply-To: <20181018093035.i56byogaslsnzcnl@linutronix.de>
On 2018/10/18 18:30, Sebastian Andrzej Siewior wrote:
> On 2018-10-18 18:19:21 [+0900], Toshiaki Makita wrote:
>> On 2018/10/18 18:08, Sebastian Andrzej Siewior wrote:
>>> Again: lockdep saw the lock in softirq context once and in process
>>> context once and this is what triggers the warning. It does not matter
>>> if NAPI is enabled or not during the access in process context. If you
>>> want to allow this you need further lockdep annotation…
>>>
>>> … but: refill_work() disables NAPI for &vi->rq[1] and refills + updates
>>> stats while NAPI is enabled for &vi->rq[0].
>>
>> Do you mean this is false positive? rq[0] and rq[1] never race with each
>> other...
>
> Why? So you can't refill rq[1] and then be interrupted and process NAPI
> for rq[0]?
Probably I don't understand what you are trying to say, but rq[0] and
rq[1] are different objects so they can be processed concurrently but
should not affect each other.
>
> But as I said. If lockdep saw the lock in acquired in softirq (what it
> did) _and_ in process context (what it did as well) _once_ then this is
> enough evidence for the warning.
> If you claim that this can not happen due to NAPI guard [0] then this is
> something lockdep does not know about.
The point is that it is not the problem of stats. try_fill_recv() should
not be called for the same rq concurrently in the first place. This
requirement should be satisfied by NAPI guard, so if the NAPI guard
logic is buggy then we need to fix it.
> [0] which I currently don't understand and therefore sent the patch [1]
> as Jason pointed out that in the ->ndo_open case the work is
> scheduled and then NAPI is enabled (which means the worker could
> disable NAPI and refill but before it finishes, ->ndo_open would
> continue and enable NAPI)).
It may be related but I have not investigated it deeply. I'll check if
it can cause this problem later.
> [1] 20181018084753.wefvsypdevbzoadg@linutronix.de
--
Toshiaki Makita
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH] virtio_net: add local_bh_disable() around u64_stats_update_begin
From: Toshiaki Makita @ 2018-10-18 9:26 UTC (permalink / raw)
To: Sebastian Andrzej Siewior
Cc: Michael S. Tsirkin, netdev, virtualization, tglx, David S. Miller
In-Reply-To: <20181018091114.jqiedzds2bqezxtb@linutronix.de>
On 2018/10/18 18:11, Sebastian Andrzej Siewior wrote:
> On 2018-10-18 18:06:57 [+0900], Toshiaki Makita wrote:
>> NACK. Again, this race should not happen because of NAPI guard.
>> We need to investigate why this warning happened.
>
> I tried to explain this. Please see
> 20181018090812.rry5qgnqxxrjxaii@linutronix.de
Anyway, if this really happens, then it means try_fill_recv() can be
called concurrently for the same rq, which looks like a far more severe
problem to me. If so, we need to fix it instead.
--
Toshiaki Makita
^ permalink raw reply
* Re: [RFC] virtio_net: add local_bh_disable() around u64_stats_update_begin
From: Toshiaki Makita @ 2018-10-18 9:19 UTC (permalink / raw)
To: Sebastian Andrzej Siewior
Cc: Michael S. Tsirkin, netdev, virtualization, tglx, David S. Miller
In-Reply-To: <20181018090812.rry5qgnqxxrjxaii@linutronix.de>
On 2018/10/18 18:08, Sebastian Andrzej Siewior wrote:
> On 2018-10-18 18:00:05 [+0900], Toshiaki Makita wrote:
>> On 2018/10/18 17:47, Sebastian Andrzej Siewior wrote:
>>> On 2018-10-17 14:48:02 [+0800], Jason Wang wrote:
>>>>
>>>> On 2018/10/17 上午9:13, Toshiaki Makita wrote:
>>>>> I'm not sure what condition triggered this warning.
>>>
>>> If the seqlock is acquired once in softirq and then in process context
>>> again it is enough evidence for lockdep to trigger this warning.
>>
>> No. As I said that should not happen because of NAPI guard.
> Again: lockdep saw the lock in softirq context once and in process
> context once and this is what triggers the warning. It does not matter
> if NAPI is enabled or not during the access in process context. If you
> want to allow this you need further lockdep annotation…
>
> … but: refill_work() disables NAPI for &vi->rq[1] and refills + updates
> stats while NAPI is enabled for &vi->rq[0].
Do you mean this is false positive? rq[0] and rq[1] never race with each
other...
--
Toshiaki Makita
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH] virtio_net: add local_bh_disable() around u64_stats_update_begin
From: Toshiaki Makita @ 2018-10-18 9:06 UTC (permalink / raw)
To: Sebastian Andrzej Siewior
Cc: Michael S. Tsirkin, netdev, virtualization, tglx, David S. Miller
In-Reply-To: <20181018084313.oopu34iwfwgkcwwc@linutronix.de>
On 2018/10/18 17:43, Sebastian Andrzej Siewior wrote:
> on 32bit, lockdep notices that virtnet_open() and refill_work() invoke
> try_fill_recv() from process context while virtnet_receive() invokes the
> same function from BH context. The problem that the seqcounter within
> u64_stats_update_begin() may deadlock if it is interrupted by BH and
> then acquired again.
>
> Introduce u64_stats_update_begin_bh() which disables BH on 32bit
> architectures. Since the BH might interrupt the worker, this new
> function should not limited to SMP like the others which are expected
> to be used in softirq.
>
> With this change we might lose increments but this is okay. The
> important part that the two 32bit parts of the 64bit counter are not
> corrupted.
>
> Fixes: 461f03dc99cf6 ("virtio_net: Add kick stats").
> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
NACK. Again, this race should not happen because of NAPI guard.
We need to investigate why this warning happened.
--
Toshiaki Makita
^ permalink raw reply
* Re: [RFC] virtio_net: add local_bh_disable() around u64_stats_update_begin
From: Toshiaki Makita @ 2018-10-18 9:00 UTC (permalink / raw)
To: Sebastian Andrzej Siewior, Jason Wang
Cc: netdev, tglx, Michael S. Tsirkin, David S. Miller, virtualization
In-Reply-To: <20181018084753.wefvsypdevbzoadg@linutronix.de>
On 2018/10/18 17:47, Sebastian Andrzej Siewior wrote:
> On 2018-10-17 14:48:02 [+0800], Jason Wang wrote:
>>
>> On 2018/10/17 上午9:13, Toshiaki Makita wrote:
>>> I'm not sure what condition triggered this warning.
>
> If the seqlock is acquired once in softirq and then in process context
> again it is enough evidence for lockdep to trigger this warning.
No. As I said that should not happen because of NAPI guard.
--
Toshiaki Makita
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* [RFC PATCH] cgroup, netclassid: add a preemption point to write_classid
From: Michal Hocko @ 2018-10-18 8:56 UTC (permalink / raw)
To: Tejun Heo, David S. Miller
Cc: Johannes Weiner, cgroups, netdev, LKML, Michal Hocko
From: Michal Hocko <mhocko@suse.com>
We have seen a customer complaining about soft lockups on !PREEMPT
kernel config with 4.4 based kernel
[1072141.435366] NMI watchdog: BUG: soft lockup - CPU#21 stuck for 22s! [systemd:1]
[1072141.444090] Modules linked in: mpt3sas raid_class binfmt_misc af_packet 8021q garp mrp stp llc xfs libcrc32c bonding iscsi_ibft iscsi_boot_sysfs msr ext4 crc16 jbd2 mbcache cdc_ether usbnet mii joydev hid_generic usbhid intel_rapl x86_pkg_temp_thermal intel_powerclamp coretemp crct10dif_pclmul crc32_pclmul ghash_clmulni_intel ipmi_ssif mgag200 i2c_algo_bit ttm ipmi_devintf drbg ixgbe drm_kms_helper vxlan ansi_cprng ip6_udp_tunnel drm aesni_intel udp_tunnel aes_x86_64 iTCO_wdt syscopyarea ptp xhci_pci lrw iTCO_vendor_support pps_core gf128mul ehci_pci glue_helper sysfillrect mdio pcspkr sb_edac ablk_helper cryptd ehci_hcd sysimgblt xhci_hcd fb_sys_fops edac_core mei_me lpc_ich ses usbcore enclosure dca mfd_core ipmi_si mei i2c_i801 scsi_transport_sas usb_common ipmi_msghandler shpchp f
jes wmi processor button acpi_pad btrfs xor raid6_pq sd_mod crc32c_intel megaraid_sas sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua scsi_mod md_mod autofs4
[1072141.444146] Supported: Yes
[1072141.444149] CPU: 21 PID: 1 Comm: systemd Not tainted 4.4.121-92.80-default #1
[1072141.444150] Hardware name: LENOVO Lenovo System x3650 M5 -[5462P4U]- -[5462P4U]-/01GR451, BIOS -[TCE136H-2.70]- 06/13/2018
[1072141.444151] task: ffff880191bd0040 ti: ffff880191bd4000 task.ti: ffff880191bd4000
[1072141.444153] RIP: 0010:[<ffffffff815229f9>] [<ffffffff815229f9>] update_classid_sock+0x29/0x40
[1072141.444157] RSP: 0018:ffff880191bd7d58 EFLAGS: 00000286
[1072141.444158] RAX: ffff883b177cb7c0 RBX: 0000000000000000 RCX: 0000000000000000
[1072141.444159] RDX: 00000000000009c7 RSI: ffff880191bd7d5c RDI: ffff8822e29bb200
[1072141.444160] RBP: ffff883a72230980 R08: 0000000000000101 R09: 0000000000000000
[1072141.444161] R10: 0000000000000008 R11: f000000000000000 R12: ffffffff815229d0
[1072141.444162] R13: 0000000000000000 R14: ffff881fd0a47ac0 R15: ffff880191bd7f28
[1072141.444163] FS: 00007f3e2f1eb8c0(0000) GS:ffff882000340000(0000) knlGS:0000000000000000
[1072141.444164] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[1072141.444165] CR2: 00007f3e2f200000 CR3: 0000001ffea4e000 CR4: 00000000001606f0
[1072141.444166] Stack:
[1072141.444166] ffffffa800000246 00000000000009c7 ffffffff8121d583 ffff8818312a05c0
[1072141.444168] ffff8818312a1100 ffff880197c3b280 ffff881861422858 ffffffffffffffea
[1072141.444170] ffffffff81522b1c ffffffff81d0ca20 ffff8817fa17b950 ffff883fdd8121e0
[1072141.444171] Call Trace:
[1072141.444179] [<ffffffff8121d583>] iterate_fd+0x53/0x80
[1072141.444182] [<ffffffff81522b1c>] write_classid+0x4c/0x80
[1072141.444187] [<ffffffff8111328b>] cgroup_file_write+0x9b/0x100
[1072141.444193] [<ffffffff81278bcb>] kernfs_fop_write+0x11b/0x150
[1072141.444198] [<ffffffff81201566>] __vfs_write+0x26/0x100
[1072141.444201] [<ffffffff81201bed>] vfs_write+0x9d/0x190
[1072141.444203] [<ffffffff812028c2>] SyS_write+0x42/0xa0
[1072141.444207] [<ffffffff815f58c3>] entry_SYSCALL_64_fastpath+0x1e/0xca
[1072141.445490] DWARF2 unwinder stuck at entry_SYSCALL_64_fastpath+0x1e/0xca
If a cgroup has many tasks with many open file descriptors then we would
end up in a large loop without any rescheduling point throught the
operation. Add cond_resched once per task.
Signed-off-by: Michal Hocko <mhocko@suse.com>
---
Hi,
I am sending this as an RFC because I am not sure this is the best
approach to be honest. The customer who reported this issue was not
entirely thrilled about testing this so it might be possible that the
patch is not sufficient. Details about the workload are not very
detailed either.
Any comment would be highly appreciated.
net/core/netclassid_cgroup.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 5e4f04004a49..7bf833598615 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -106,6 +106,7 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
iterate_fd(p->files, 0, update_classid_sock,
(void *)(unsigned long)cs->classid);
task_unlock(p);
+ cond_resched();
}
css_task_iter_end(&it);
--
2.19.0
^ permalink raw reply related
* [net-next 13/14] net/mlx5e: Use a slow path rule instead if vxlan neighbour isn't available
From: Saeed Mahameed @ 2018-10-18 0:08 UTC (permalink / raw)
To: David S. Miller, netdev; +Cc: Paul Blakey, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>
From: Paul Blakey <paulb@mellanox.com>
When adding a vxlan tc rule, and a neighbour isn't available, we
don't insert any rule to hardware. Once we enable offloading flows
with multiple priorities, a packet that should have matched this rule
will continue in hardware pipeline and might match a wrong one.
This is unlike in tc software path where it will be matched and
forwarded to the vxlan device (which will cause a ARP lookup
eventually) and stop processing further tc filters.
To address that, when when a neighbour isn't available (EAGAIN from
attach_encap), or gets deleted, change the original action to be a
forward to slow path instead. Neighbour update will restore the original
action once the neighbour becomes available. This will be done atomically
so at any given time we will have a the correct match.
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
.../net/ethernet/mellanox/mlx5/core/en_tc.c | 104 ++++++++++++++----
1 file changed, 82 insertions(+), 22 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 1786e25644ac..cb66964aa1ff 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -74,6 +74,7 @@ enum {
MLX5E_TC_FLOW_OFFLOADED = BIT(MLX5E_TC_FLOW_BASE + 2),
MLX5E_TC_FLOW_HAIRPIN = BIT(MLX5E_TC_FLOW_BASE + 3),
MLX5E_TC_FLOW_HAIRPIN_RSS = BIT(MLX5E_TC_FLOW_BASE + 4),
+ MLX5E_TC_FLOW_SLOW = BIT(MLX5E_TC_FLOW_BASE + 5),
};
#define MLX5E_TC_MAX_SPLITS 1
@@ -82,7 +83,7 @@ struct mlx5e_tc_flow {
struct rhash_head node;
struct mlx5e_priv *priv;
u64 cookie;
- u8 flags;
+ u16 flags;
struct mlx5_flow_handle *rule[MLX5E_TC_MAX_SPLITS + 1];
struct list_head encap; /* flows sharing the same encap ID */
struct list_head mod_hdr; /* flows sharing the same mod hdr ID */
@@ -860,6 +861,36 @@ mlx5e_tc_unoffload_fdb_rules(struct mlx5_eswitch *esw,
mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
}
+static struct mlx5_flow_handle *
+mlx5e_tc_offload_to_slow_path(struct mlx5_eswitch *esw,
+ struct mlx5e_tc_flow *flow,
+ struct mlx5_flow_spec *spec,
+ struct mlx5_esw_flow_attr *slow_attr)
+{
+ struct mlx5_flow_handle *rule;
+
+ memcpy(slow_attr, flow->esw_attr, sizeof(*slow_attr));
+ slow_attr->action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+ slow_attr->mirror_count = 0,
+ slow_attr->dest_chain = FDB_SLOW_PATH_CHAIN,
+
+ rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, slow_attr);
+ if (!IS_ERR(rule))
+ flow->flags |= MLX5E_TC_FLOW_SLOW;
+
+ return rule;
+}
+
+static void
+mlx5e_tc_unoffload_from_slow_path(struct mlx5_eswitch *esw,
+ struct mlx5e_tc_flow *flow,
+ struct mlx5_esw_flow_attr *slow_attr)
+{
+ memcpy(slow_attr, flow->esw_attr, sizeof(*slow_attr));
+ mlx5e_tc_unoffload_fdb_rules(esw, flow, slow_attr);
+ flow->flags &= ~MLX5E_TC_FLOW_SLOW;
+}
+
static int
mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
struct mlx5e_tc_flow_parse_attr *parse_attr,
@@ -917,15 +948,21 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
/* we get here if (1) there's no error or when
* (2) there's an encap action and we're on -EAGAIN (no valid neigh)
*/
- if (encap_err != -EAGAIN) {
+ if (encap_err == -EAGAIN) {
+ /* continue with goto slow path rule instead */
+ struct mlx5_esw_flow_attr slow_attr;
+
+ flow->rule[0] = mlx5e_tc_offload_to_slow_path(esw, flow, &parse_attr->spec, &slow_attr);
+ } else {
flow->rule[0] = mlx5e_tc_offload_fdb_rules(esw, flow, &parse_attr->spec, attr);
- if (IS_ERR(flow->rule[0])) {
- err = PTR_ERR(flow->rule[0]);
- goto err_add_rule;
- }
}
- return encap_err;
+ if (IS_ERR(flow->rule[0])) {
+ err = PTR_ERR(flow->rule[0]);
+ goto err_add_rule;
+ }
+
+ return 0;
err_add_rule:
mlx5_fc_destroy(esw->dev, counter);
@@ -946,9 +983,14 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
{
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
struct mlx5_esw_flow_attr *attr = flow->esw_attr;
+ struct mlx5_esw_flow_attr slow_attr;
- if (flow->flags & MLX5E_TC_FLOW_OFFLOADED)
- mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->esw_attr);
+ if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
+ if (flow->flags & MLX5E_TC_FLOW_SLOW)
+ mlx5e_tc_unoffload_from_slow_path(esw, flow, &slow_attr);
+ else
+ mlx5e_tc_unoffload_fdb_rules(esw, flow, attr);
+ }
mlx5_eswitch_del_vlan_action(esw, attr);
@@ -968,7 +1010,7 @@ void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
struct mlx5e_encap_entry *e)
{
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
- struct mlx5_esw_flow_attr *esw_attr;
+ struct mlx5_esw_flow_attr slow_attr, *esw_attr;
struct mlx5_flow_handle *rule;
struct mlx5_flow_spec *spec;
struct mlx5e_tc_flow *flow;
@@ -991,6 +1033,7 @@ void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
esw_attr->encap_id = e->encap_id;
spec = &esw_attr->parse_attr->spec;
+ /* update from slow path rule to encap rule */
rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, esw_attr);
if (IS_ERR(rule)) {
err = PTR_ERR(rule);
@@ -998,6 +1041,9 @@ void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
err);
continue;
}
+
+ mlx5e_tc_unoffload_from_slow_path(esw, flow, &slow_attr);
+ flow->flags |= MLX5E_TC_FLOW_OFFLOADED; /* was unset when slow path rule removed */
flow->rule[0] = rule;
}
}
@@ -1006,11 +1052,28 @@ void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv,
struct mlx5e_encap_entry *e)
{
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+ struct mlx5_esw_flow_attr slow_attr;
+ struct mlx5_flow_handle *rule;
+ struct mlx5_flow_spec *spec;
struct mlx5e_tc_flow *flow;
+ int err;
list_for_each_entry(flow, &e->flows, encap) {
- if (flow->flags & MLX5E_TC_FLOW_OFFLOADED)
- mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->esw_attr);
+ spec = &flow->esw_attr->parse_attr->spec;
+
+ /* update from encap rule to slow path rule */
+ rule = mlx5e_tc_offload_to_slow_path(esw, flow, spec, &slow_attr);
+
+ if (IS_ERR(rule)) {
+ err = PTR_ERR(rule);
+ mlx5_core_warn(priv->mdev, "Failed to update slow path (encap) flow, %d\n",
+ err);
+ continue;
+ }
+
+ mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->esw_attr);
+ flow->flags |= MLX5E_TC_FLOW_OFFLOADED; /* was unset when fast path rule removed */
+ flow->rule[0] = rule;
}
if (e->flags & MLX5_ENCAP_ENTRY_VALID) {
@@ -2888,9 +2951,9 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
return 0;
}
-static void get_flags(int flags, u8 *flow_flags)
+static void get_flags(int flags, u16 *flow_flags)
{
- u8 __flow_flags = 0;
+ u16 __flow_flags = 0;
if (flags & MLX5E_TC_INGRESS)
__flow_flags |= MLX5E_TC_FLOW_INGRESS;
@@ -2921,7 +2984,7 @@ static struct rhashtable *get_tc_ht(struct mlx5e_priv *priv)
static int
mlx5e_alloc_flow(struct mlx5e_priv *priv, int attr_size,
- struct tc_cls_flower_offload *f, u8 flow_flags,
+ struct tc_cls_flower_offload *f, u16 flow_flags,
struct mlx5e_tc_flow_parse_attr **__parse_attr,
struct mlx5e_tc_flow **__flow)
{
@@ -2958,7 +3021,7 @@ mlx5e_alloc_flow(struct mlx5e_priv *priv, int attr_size,
static int
mlx5e_add_fdb_flow(struct mlx5e_priv *priv,
struct tc_cls_flower_offload *f,
- u8 flow_flags,
+ u16 flow_flags,
struct mlx5e_tc_flow **__flow)
{
struct netlink_ext_ack *extack = f->common.extack;
@@ -2978,12 +3041,9 @@ mlx5e_add_fdb_flow(struct mlx5e_priv *priv,
goto err_free;
err = mlx5e_tc_add_fdb_flow(priv, parse_attr, flow, extack);
- if (err && err != -EAGAIN)
+ if (err)
goto err_free;
- if (!err)
- flow->flags |= MLX5E_TC_FLOW_OFFLOADED;
-
if (!(flow->esw_attr->action &
MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT))
kvfree(parse_attr);
@@ -3002,7 +3062,7 @@ mlx5e_add_fdb_flow(struct mlx5e_priv *priv,
static int
mlx5e_add_nic_flow(struct mlx5e_priv *priv,
struct tc_cls_flower_offload *f,
- u8 flow_flags,
+ u16 flow_flags,
struct mlx5e_tc_flow **__flow)
{
struct netlink_ext_ack *extack = f->common.extack;
@@ -3045,7 +3105,7 @@ mlx5e_tc_add_flow(struct mlx5e_priv *priv,
struct mlx5e_tc_flow **flow)
{
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
- u8 flow_flags;
+ u16 flow_flags;
int err;
get_flags(flags, &flow_flags);
--
2.17.2
^ permalink raw reply related
* [net-next 14/14] net/mlx5e: Support offloading tc priorities and chains for eswitch flows
From: Saeed Mahameed @ 2018-10-18 0:08 UTC (permalink / raw)
To: David S. Miller, netdev; +Cc: Paul Blakey, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>
From: Paul Blakey <paulb@mellanox.com>
Currently we fail when user specify a non-zero chain, this patch adds the
support for it and tc priorities. To get to a new chain, use the tc
goto action.
Currently we support a fixed prio range 1-16, and chain range 0-3.
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
.../net/ethernet/mellanox/mlx5/core/en_main.c | 3 --
.../net/ethernet/mellanox/mlx5/core/en_rep.c | 6 ---
.../net/ethernet/mellanox/mlx5/core/en_tc.c | 51 ++++++++++++++++++-
.../mellanox/mlx5/core/eswitch_offloads.c | 2 +-
4 files changed, 50 insertions(+), 12 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index c9848e333450..1243edbedc9e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3392,9 +3392,6 @@ static int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
{
struct mlx5e_priv *priv = cb_priv;
- if (!tc_cls_can_offload_and_chain0(priv->netdev, type_data))
- return -EOPNOTSUPP;
-
switch (type) {
case TC_SETUP_CLSFLOWER:
return mlx5e_setup_tc_cls_flower(priv, type_data, MLX5E_TC_INGRESS);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 64c2b9ea8b1e..c3c657548824 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -853,9 +853,6 @@ static int mlx5e_rep_setup_tc_cb_egdev(enum tc_setup_type type, void *type_data,
{
struct mlx5e_priv *priv = cb_priv;
- if (!tc_cls_can_offload_and_chain0(priv->netdev, type_data))
- return -EOPNOTSUPP;
-
switch (type) {
case TC_SETUP_CLSFLOWER:
return mlx5e_rep_setup_tc_cls_flower(priv, type_data, MLX5E_TC_EGRESS);
@@ -869,9 +866,6 @@ static int mlx5e_rep_setup_tc_cb(enum tc_setup_type type, void *type_data,
{
struct mlx5e_priv *priv = cb_priv;
- if (!tc_cls_can_offload_and_chain0(priv->netdev, type_data))
- return -EOPNOTSUPP;
-
switch (type) {
case TC_SETUP_CLSFLOWER:
return mlx5e_rep_setup_tc_cls_flower(priv, type_data, MLX5E_TC_INGRESS);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index cb66964aa1ff..608025ca5c04 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -898,15 +898,32 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
struct netlink_ext_ack *extack)
{
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+ u32 max_chain = mlx5_eswitch_get_chain_range(esw);
struct mlx5_esw_flow_attr *attr = flow->esw_attr;
+ u16 max_prio = mlx5_eswitch_get_prio_range(esw);
struct net_device *out_dev, *encap_dev = NULL;
struct mlx5_fc *counter = NULL;
struct mlx5e_rep_priv *rpriv;
struct mlx5e_priv *out_priv;
int err = 0, encap_err = 0;
- /* keep the old behaviour, use same prio for all offloaded rules */
- attr->prio = 1;
+ /* if prios are not supported, keep the old behaviour of using same prio
+ * for all offloaded rules.
+ */
+ if (!mlx5_eswitch_prios_supported(esw))
+ attr->prio = 1;
+
+ if (attr->chain > max_chain) {
+ NL_SET_ERR_MSG(extack, "Requested chain is out of supported range");
+ err = -EOPNOTSUPP;
+ goto err_max_prio_chain;
+ }
+
+ if (attr->prio > max_prio) {
+ NL_SET_ERR_MSG(extack, "Requested priority is out of supported range");
+ err = -EOPNOTSUPP;
+ goto err_max_prio_chain;
+ }
if (attr->action & MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT) {
out_dev = __dev_get_by_index(dev_net(priv->netdev),
@@ -975,6 +992,7 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
if (attr->action & MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT)
mlx5e_detach_encap(priv, flow);
err_attach_encap:
+err_max_prio_chain:
return err;
}
@@ -2826,6 +2844,7 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
struct mlx5e_tc_flow *flow,
struct netlink_ext_ack *extack)
{
+ struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
struct mlx5_esw_flow_attr *attr = flow->esw_attr;
struct mlx5e_rep_priv *rpriv = priv->ppriv;
struct ip_tunnel_info *info = NULL;
@@ -2934,6 +2953,25 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
continue;
}
+ if (is_tcf_gact_goto_chain(a)) {
+ u32 dest_chain = tcf_gact_goto_chain_index(a);
+ u32 max_chain = mlx5_eswitch_get_chain_range(esw);
+
+ if (dest_chain <= attr->chain) {
+ NL_SET_ERR_MSG(extack, "Goto earlier chain isn't supported");
+ return -EOPNOTSUPP;
+ }
+ if (dest_chain > max_chain) {
+ NL_SET_ERR_MSG(extack, "Requested destination chain is out of supported range");
+ return -EOPNOTSUPP;
+ }
+ action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST |
+ MLX5_FLOW_CONTEXT_ACTION_COUNT;
+ attr->dest_chain = dest_chain;
+
+ continue;
+ }
+
return -EINVAL;
}
@@ -3036,6 +3074,8 @@ mlx5e_add_fdb_flow(struct mlx5e_priv *priv,
if (err)
goto out;
+ flow->esw_attr->chain = f->common.chain_index;
+ flow->esw_attr->prio = TC_H_MAJ(f->common.prio) >> 16;
err = parse_tc_fdb_actions(priv, f->exts, parse_attr, flow, extack);
if (err)
goto err_free;
@@ -3070,6 +3110,10 @@ mlx5e_add_nic_flow(struct mlx5e_priv *priv,
struct mlx5e_tc_flow *flow;
int attr_size, err;
+ /* multi-chain not supported for NIC rules */
+ if (!tc_cls_can_offload_and_chain0(priv->netdev, &f->common))
+ return -EOPNOTSUPP;
+
flow_flags |= MLX5E_TC_FLOW_NIC;
attr_size = sizeof(struct mlx5_nic_flow_attr);
err = mlx5e_alloc_flow(priv, attr_size, f, flow_flags,
@@ -3110,6 +3154,9 @@ mlx5e_tc_add_flow(struct mlx5e_priv *priv,
get_flags(flags, &flow_flags);
+ if (!tc_can_offload_extack(priv->netdev, f->common.extack))
+ return -EOPNOTSUPP;
+
if (esw && esw->mode == SRIOV_OFFLOADS)
err = mlx5e_add_fdb_flow(priv, f, flow_flags, flow);
else
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 42a130455ef8..9eac137790f5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -71,7 +71,7 @@ u16 mlx5_eswitch_get_prio_range(struct mlx5_eswitch *esw)
if (esw->fdb_table.flags & ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED)
return FDB_MAX_PRIO;
- return U16_MAX;
+ return 1;
}
struct mlx5_flow_handle *
--
2.17.2
^ permalink raw reply related
* [net-next 11/14] net/mlx5e: Avoid duplicated code for tc offloads add/del fdb rule
From: Saeed Mahameed @ 2018-10-18 0:08 UTC (permalink / raw)
To: David S. Miller, netdev; +Cc: Or Gerlitz, Paul Blakey, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>
From: Or Gerlitz <ogerlitz@mellanox.com>
The code for adding/deleting fdb flow is repeated when
user-space does flow add/del and when we add/del from
the neigh update path - unify them to avoid the duplication.
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
.../net/ethernet/mellanox/mlx5/core/en_tc.c | 91 ++++++++++---------
1 file changed, 50 insertions(+), 41 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 8b25850cbf6a..1786e25644ac 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -823,6 +823,43 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
struct mlx5e_tc_flow *flow,
struct netlink_ext_ack *extack);
+static struct mlx5_flow_handle *
+mlx5e_tc_offload_fdb_rules(struct mlx5_eswitch *esw,
+ struct mlx5e_tc_flow *flow,
+ struct mlx5_flow_spec *spec,
+ struct mlx5_esw_flow_attr *attr)
+{
+ struct mlx5_flow_handle *rule;
+
+ rule = mlx5_eswitch_add_offloaded_rule(esw, spec, attr);
+ if (IS_ERR(rule))
+ return rule;
+
+ if (attr->mirror_count) {
+ flow->rule[1] = mlx5_eswitch_add_fwd_rule(esw, spec, attr);
+ if (IS_ERR(flow->rule[1])) {
+ mlx5_eswitch_del_offloaded_rule(esw, rule, attr);
+ return flow->rule[1];
+ }
+ }
+
+ flow->flags |= MLX5E_TC_FLOW_OFFLOADED;
+ return rule;
+}
+
+static void
+mlx5e_tc_unoffload_fdb_rules(struct mlx5_eswitch *esw,
+ struct mlx5e_tc_flow *flow,
+ struct mlx5_esw_flow_attr *attr)
+{
+ flow->flags &= ~MLX5E_TC_FLOW_OFFLOADED;
+
+ if (attr->mirror_count)
+ mlx5_eswitch_del_fwd_rule(esw, flow->rule[1], attr);
+
+ mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
+}
+
static int
mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
struct mlx5e_tc_flow_parse_attr *parse_attr,
@@ -881,25 +918,15 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
* (2) there's an encap action and we're on -EAGAIN (no valid neigh)
*/
if (encap_err != -EAGAIN) {
- flow->rule[0] = mlx5_eswitch_add_offloaded_rule(esw, &parse_attr->spec, attr);
+ flow->rule[0] = mlx5e_tc_offload_fdb_rules(esw, flow, &parse_attr->spec, attr);
if (IS_ERR(flow->rule[0])) {
err = PTR_ERR(flow->rule[0]);
goto err_add_rule;
}
-
- if (attr->mirror_count) {
- flow->rule[1] = mlx5_eswitch_add_fwd_rule(esw, &parse_attr->spec, attr);
- if (IS_ERR(flow->rule[1])) {
- err = PTR_ERR(flow->rule[1]);
- goto err_fwd_rule;
- }
- }
}
return encap_err;
-err_fwd_rule:
- mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
err_add_rule:
mlx5_fc_destroy(esw->dev, counter);
err_create_counter:
@@ -920,12 +947,8 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
struct mlx5_esw_flow_attr *attr = flow->esw_attr;
- if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
- flow->flags &= ~MLX5E_TC_FLOW_OFFLOADED;
- if (attr->mirror_count)
- mlx5_eswitch_del_fwd_rule(esw, flow->rule[1], attr);
- mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
- }
+ if (flow->flags & MLX5E_TC_FLOW_OFFLOADED)
+ mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->esw_attr);
mlx5_eswitch_del_vlan_action(esw, attr);
@@ -946,6 +969,8 @@ void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
{
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
struct mlx5_esw_flow_attr *esw_attr;
+ struct mlx5_flow_handle *rule;
+ struct mlx5_flow_spec *spec;
struct mlx5e_tc_flow *flow;
int err;
@@ -964,26 +989,16 @@ void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
list_for_each_entry(flow, &e->flows, encap) {
esw_attr = flow->esw_attr;
esw_attr->encap_id = e->encap_id;
- flow->rule[0] = mlx5_eswitch_add_offloaded_rule(esw, &esw_attr->parse_attr->spec, esw_attr);
- if (IS_ERR(flow->rule[0])) {
- err = PTR_ERR(flow->rule[0]);
+ spec = &esw_attr->parse_attr->spec;
+
+ rule = mlx5e_tc_offload_fdb_rules(esw, flow, spec, esw_attr);
+ if (IS_ERR(rule)) {
+ err = PTR_ERR(rule);
mlx5_core_warn(priv->mdev, "Failed to update cached encapsulation flow, %d\n",
err);
continue;
}
-
- if (esw_attr->mirror_count) {
- flow->rule[1] = mlx5_eswitch_add_fwd_rule(esw, &esw_attr->parse_attr->spec, esw_attr);
- if (IS_ERR(flow->rule[1])) {
- mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], esw_attr);
- err = PTR_ERR(flow->rule[1]);
- mlx5_core_warn(priv->mdev, "Failed to update cached mirror flow, %d\n",
- err);
- continue;
- }
- }
-
- flow->flags |= MLX5E_TC_FLOW_OFFLOADED;
+ flow->rule[0] = rule;
}
}
@@ -994,14 +1009,8 @@ void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv,
struct mlx5e_tc_flow *flow;
list_for_each_entry(flow, &e->flows, encap) {
- if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
- struct mlx5_esw_flow_attr *attr = flow->esw_attr;
-
- flow->flags &= ~MLX5E_TC_FLOW_OFFLOADED;
- if (attr->mirror_count)
- mlx5_eswitch_del_fwd_rule(esw, flow->rule[1], attr);
- mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
- }
+ if (flow->flags & MLX5E_TC_FLOW_OFFLOADED)
+ mlx5e_tc_unoffload_fdb_rules(esw, flow, flow->esw_attr);
}
if (e->flags & MLX5_ENCAP_ENTRY_VALID) {
--
2.17.2
^ permalink raw reply related
* [net-next 12/14] net/mlx5: E-Switch, Enable setting goto slow path chain action
From: Saeed Mahameed @ 2018-10-18 0:08 UTC (permalink / raw)
To: David S. Miller, netdev; +Cc: Paul Blakey, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>
From: Paul Blakey <paulb@mellanox.com>
A pre-step for the tc offloads code to use this when a neigh is
not available for encap rules.
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 2 ++
drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 6 ++++++
2 files changed, 8 insertions(+)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 54215f4312fa..aaafc9f17115 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -60,6 +60,7 @@
MLX5_CAP_ESW_FLOWTABLE(dev, fdb_multi_path_to_table)
#define FDB_MAX_CHAIN 3
+#define FDB_SLOW_PATH_CHAIN (FDB_MAX_CHAIN + 1)
#define FDB_MAX_PRIO 16
struct vport_ingress {
@@ -356,6 +357,7 @@ static inline int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs,
static inline void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw) {}
#define FDB_MAX_CHAIN 1
+#define FDB_SLOW_PATH_CHAIN (FDB_MAX_CHAIN + 1)
#define FDB_MAX_PRIO 1
#endif /* CONFIG_MLX5_ESWITCH */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 289f1992f624..42a130455ef8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -671,6 +671,9 @@ esw_get_prio_table(struct mlx5_eswitch *esw, u32 chain, u16 prio, int level)
int table_prio, l = 0;
u32 flags = 0;
+ if (chain == FDB_SLOW_PATH_CHAIN)
+ return esw->fdb_table.offloads.slow_fdb;
+
mutex_lock(&esw->fdb_table.offloads.fdb_prio_lock);
fdb = fdb_prio_table(esw, chain, prio, level).fdb;
@@ -730,6 +733,9 @@ esw_put_prio_table(struct mlx5_eswitch *esw, u32 chain, u16 prio, int level)
{
int l;
+ if (chain == FDB_SLOW_PATH_CHAIN)
+ return;
+
mutex_lock(&esw->fdb_table.offloads.fdb_prio_lock);
for (l = level; l >= 0; l--) {
--
2.17.2
^ permalink raw reply related
* [net-next 09/14] net/mlx5: Add a no-append flow insertion mode
From: Saeed Mahameed @ 2018-10-18 0:08 UTC (permalink / raw)
To: David S. Miller, netdev; +Cc: Paul Blakey, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>
From: Paul Blakey <paulb@mellanox.com>
If no-append flag is set, we will add a new FTE, instead of appending
the actions of the inserted rule when the same match already exists.
While here, move the has_flow_tag boolean indicator to be a flag too.
This patch doesn't change any functionality.
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanmox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/infiniband/hw/mlx5/main.c | 6 +++---
drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +-
.../net/ethernet/mellanox/mlx5/core/fpga/ipsec.c | 2 +-
drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 9 ++++++++-
include/linux/mlx5/fs.h | 14 +++++++++++---
5 files changed, 24 insertions(+), 9 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 5ced0cc46ba1..af32899bb72a 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -2793,7 +2793,7 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c,
return -EINVAL;
action->flow_tag = ib_spec->flow_tag.tag_id;
- action->has_flow_tag = true;
+ action->flags |= FLOW_ACT_HAS_TAG;
break;
case IB_FLOW_SPEC_ACTION_DROP:
if (FIELDS_NOT_SUPPORTED(ib_spec->drop,
@@ -2886,7 +2886,7 @@ is_valid_esp_aes_gcm(struct mlx5_core_dev *mdev,
return egress ? VALID_SPEC_INVALID : VALID_SPEC_NA;
return is_crypto && is_ipsec &&
- (!egress || (!is_drop && !flow_act->has_flow_tag)) ?
+ (!egress || (!is_drop && !(flow_act->flags & FLOW_ACT_HAS_TAG))) ?
VALID_SPEC_VALID : VALID_SPEC_INVALID;
}
@@ -3349,7 +3349,7 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
}
- if (flow_act.has_flow_tag &&
+ if ((flow_act.flags & FLOW_ACT_HAS_TAG) &&
(flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
mlx5_ib_warn(dev, "Flow tag %u and attribute type %x isn't allowed in leftovers\n",
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 6c04e11f9a05..a9c68b7859b4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -684,9 +684,9 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
struct mlx5_flow_destination dest[2] = {};
struct mlx5_flow_act flow_act = {
.action = attr->action,
- .has_flow_tag = true,
.flow_tag = attr->flow_tag,
.reformat_id = 0,
+ .flags = FLOW_ACT_HAS_TAG,
};
struct mlx5_fc *counter = NULL;
bool table_created = false;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
index 5645a4facad2..28aa8c968a80 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/ipsec.c
@@ -650,7 +650,7 @@ static bool mlx5_is_fpga_egress_ipsec_rule(struct mlx5_core_dev *dev,
(match_criteria_enable &
~(MLX5_MATCH_OUTER_HEADERS | MLX5_MATCH_MISC_PARAMETERS)) ||
(flow_act->action & ~(MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | MLX5_FLOW_CONTEXT_ACTION_ALLOW)) ||
- flow_act->has_flow_tag)
+ (flow_act->flags & FLOW_ACT_HAS_TAG))
return false;
return true;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 7eb6d58733ac..67ba4c975d81 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1428,7 +1428,7 @@ static int check_conflicting_ftes(struct fs_fte *fte, const struct mlx5_flow_act
return -EEXIST;
}
- if (flow_act->has_flow_tag &&
+ if ((flow_act->flags & FLOW_ACT_HAS_TAG) &&
fte->action.flow_tag != flow_act->flow_tag) {
mlx5_core_warn(get_dev(&fte->node),
"FTE flow tag %u already exists with different flow tag %u\n",
@@ -1628,6 +1628,8 @@ try_add_to_existing_fg(struct mlx5_flow_table *ft,
search_again_locked:
version = matched_fgs_get_version(match_head);
+ if (flow_act->flags & FLOW_ACT_NO_APPEND)
+ goto skip_search;
/* Try to find a fg that already contains a matching fte */
list_for_each_entry(iter, match_head, list) {
struct fs_fte *fte_tmp;
@@ -1644,6 +1646,11 @@ try_add_to_existing_fg(struct mlx5_flow_table *ft,
return rule;
}
+skip_search:
+ /* No group with matching fte found, or we skipped the search.
+ * Try to add a new fte to any matching fg.
+ */
+
/* Check the ft version, for case that new flow group
* was added while the fgs weren't locked
*/
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index f8d00872c7d3..5660f07d3be0 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -158,20 +158,28 @@ struct mlx5_fs_vlan {
#define MLX5_FS_VLAN_DEPTH 2
+enum {
+ FLOW_ACT_HAS_TAG = BIT(0),
+ FLOW_ACT_NO_APPEND = BIT(1),
+};
+
struct mlx5_flow_act {
u32 action;
- bool has_flow_tag;
u32 flow_tag;
u32 reformat_id;
u32 modify_id;
uintptr_t esp_id;
+ u32 flags;
struct mlx5_fs_vlan vlan[MLX5_FS_VLAN_DEPTH];
struct ib_counters *counters;
};
#define MLX5_DECLARE_FLOW_ACT(name) \
- struct mlx5_flow_act name = {MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,\
- MLX5_FS_DEFAULT_FLOW_TAG, 0, 0}
+ struct mlx5_flow_act name = { .action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,\
+ .flow_tag = MLX5_FS_DEFAULT_FLOW_TAG, \
+ .reformat_id = 0, \
+ .modify_id = 0, \
+ .flags = 0, }
/* Single destination per rule.
* Group ID is implied by the match criteria.
--
2.17.2
^ permalink raw reply related
* [net-next 10/14] net/mlx5e: For TC offloads, always add new flow instead of appending the actions
From: Saeed Mahameed @ 2018-10-18 0:08 UTC (permalink / raw)
To: David S. Miller, netdev; +Cc: Paul Blakey, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>
From: Paul Blakey <paulb@mellanox.com>
When replacing a tc flower rule, flower first requests to add the
new rule (new action), then deletes the old one.
But currently when asked to add a new tc flower flow, we append the
actions (and counters to it).
This can result in a fte with two flow counters or conflicting
actions (drop and encap action) which firmware complains/errs
about and isn't achieving what the user aimed for.
Instead, insert the flow using the new no-append flag which will add a
new HW rule, the old flow and rule will be deleted later by flower
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanmox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +-
drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index a9c68b7859b4..8b25850cbf6a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -686,7 +686,7 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
.action = attr->action,
.flow_tag = attr->flow_tag,
.reformat_id = 0,
- .flags = FLOW_ACT_HAS_TAG,
+ .flags = FLOW_ACT_HAS_TAG | FLOW_ACT_NO_APPEND,
};
struct mlx5_fc *counter = NULL;
bool table_created = false;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 8501b6c31c02..289f1992f624 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -80,8 +80,8 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
struct mlx5_esw_flow_attr *attr)
{
struct mlx5_flow_destination dest[MLX5_MAX_FLOW_FWD_VPORTS + 1] = {};
+ struct mlx5_flow_act flow_act = { .flags = FLOW_ACT_NO_APPEND, };
bool mirror = !!(attr->mirror_count);
- struct mlx5_flow_act flow_act = {0};
struct mlx5_flow_handle *rule;
struct mlx5_flow_table *fdb;
int j, i = 0;
@@ -195,7 +195,7 @@ mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw,
struct mlx5_esw_flow_attr *attr)
{
struct mlx5_flow_destination dest[MLX5_MAX_FLOW_FWD_VPORTS + 1] = {};
- struct mlx5_flow_act flow_act = {0};
+ struct mlx5_flow_act flow_act = { .flags = FLOW_ACT_NO_APPEND, };
struct mlx5_flow_table *fast_fdb;
struct mlx5_flow_table *fwd_fdb;
struct mlx5_flow_handle *rule;
--
2.17.2
^ permalink raw reply related
* [net-next 08/14] net/mlx5: E-Switch, Add chains and priorities
From: Saeed Mahameed @ 2018-10-18 0:08 UTC (permalink / raw)
To: David S. Miller, netdev; +Cc: Paul Blakey, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>
From: Paul Blakey <paulb@mellanox.com>
A chain is a group of priorities, so use the fdb parallel
sub namespaces to implement chains, and a flow table for each
priority in them.
Because these namespaces are parallel and in series to the slow path
fdb, the chains aren't connected to one another (but to the slow path),
and one must use a explicit goto action to reach a different chain.
Flow tables for the priorities will be created on demand and destroyed
once not used.
The Firmware has four pools of tables for sizes S/XS/M/L (4k, 64k, 1m, 4m).
We maintain ghost copies of the pools occupancy.
When a new table is to be created, we scan the pools from large to small
and find the 1st table size which can be now created. When a table is
destroyed, we update the relevant pool.
Multi chain/prio isn't enabled yet by this patch, for now all flows
will use the default chain 0, and prio 1.
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
.../net/ethernet/mellanox/mlx5/core/en_tc.c | 3 +
.../net/ethernet/mellanox/mlx5/core/eswitch.h | 32 +-
.../mellanox/mlx5/core/eswitch_offloads.c | 390 ++++++++++++++----
3 files changed, 339 insertions(+), 86 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 7487bdd55f23..6c04e11f9a05 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -837,6 +837,9 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
struct mlx5e_priv *out_priv;
int err = 0, encap_err = 0;
+ /* keep the old behaviour, use same prio for all offloaded rules */
+ attr->prio = 1;
+
if (attr->action & MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT) {
out_dev = __dev_get_by_index(dev_net(priv->netdev),
attr->parse_attr->mirred_ifindex);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 584e735bbad1..54215f4312fa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -123,6 +123,13 @@ struct mlx5_vport {
u16 enabled_events;
};
+enum offloads_fdb_flags {
+ ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED = BIT(0),
+};
+
+extern const unsigned int ESW_POOLS[4];
+
+#define PRIO_LEVELS 2
struct mlx5_eswitch_fdb {
union {
struct legacy_fdb {
@@ -133,16 +140,24 @@ struct mlx5_eswitch_fdb {
} legacy;
struct offloads_fdb {
- struct mlx5_flow_table *fast_fdb;
- struct mlx5_flow_table *fwd_fdb;
struct mlx5_flow_table *slow_fdb;
struct mlx5_flow_group *send_to_vport_grp;
struct mlx5_flow_group *miss_grp;
struct mlx5_flow_handle *miss_rule_uni;
struct mlx5_flow_handle *miss_rule_multi;
int vlan_push_pop_refcount;
+
+ struct {
+ struct mlx5_flow_table *fdb;
+ u32 num_rules;
+ } fdb_prio[FDB_MAX_CHAIN + 1][FDB_MAX_PRIO + 1][PRIO_LEVELS];
+ /* Protects fdb_prio table */
+ struct mutex fdb_prio_lock;
+
+ int fdb_left[ARRAY_SIZE(ESW_POOLS)];
} offloads;
};
+ u32 flags;
};
struct mlx5_esw_offload {
@@ -184,6 +199,7 @@ struct mlx5_eswitch {
struct mlx5_esw_offload offloads;
int mode;
+ int nvports;
};
void esw_offloads_cleanup(struct mlx5_eswitch *esw, int nvports);
@@ -236,6 +252,15 @@ mlx5_eswitch_del_fwd_rule(struct mlx5_eswitch *esw,
struct mlx5_flow_handle *rule,
struct mlx5_esw_flow_attr *attr);
+bool
+mlx5_eswitch_prios_supported(struct mlx5_eswitch *esw);
+
+u16
+mlx5_eswitch_get_prio_range(struct mlx5_eswitch *esw);
+
+u32
+mlx5_eswitch_get_chain_range(struct mlx5_eswitch *esw);
+
struct mlx5_flow_handle *
mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport,
struct mlx5_flow_destination *dest);
@@ -274,6 +299,9 @@ struct mlx5_esw_flow_attr {
u32 mod_hdr_id;
u8 match_level;
struct mlx5_fc *counter;
+ u32 chain;
+ u16 prio;
+ u32 dest_chain;
struct mlx5e_tc_flow_parse_attr *parse_attr;
};
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 983bb8a80f75..8501b6c31c02 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -37,32 +37,59 @@
#include <linux/mlx5/fs.h>
#include "mlx5_core.h"
#include "eswitch.h"
+#include "en.h"
+#include "fs_core.h"
enum {
FDB_FAST_PATH = 0,
FDB_SLOW_PATH
};
+#define fdb_prio_table(esw, chain, prio, level) \
+ (esw)->fdb_table.offloads.fdb_prio[(chain)][(prio)][(level)]
+
+static struct mlx5_flow_table *
+esw_get_prio_table(struct mlx5_eswitch *esw, u32 chain, u16 prio, int level);
+static void
+esw_put_prio_table(struct mlx5_eswitch *esw, u32 chain, u16 prio, int level);
+
+bool mlx5_eswitch_prios_supported(struct mlx5_eswitch *esw)
+{
+ return (!!(esw->fdb_table.flags & ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED));
+}
+
+u32 mlx5_eswitch_get_chain_range(struct mlx5_eswitch *esw)
+{
+ if (esw->fdb_table.flags & ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED)
+ return FDB_MAX_CHAIN;
+
+ return 0;
+}
+
+u16 mlx5_eswitch_get_prio_range(struct mlx5_eswitch *esw)
+{
+ if (esw->fdb_table.flags & ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED)
+ return FDB_MAX_PRIO;
+
+ return U16_MAX;
+}
+
struct mlx5_flow_handle *
mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
struct mlx5_flow_spec *spec,
struct mlx5_esw_flow_attr *attr)
{
struct mlx5_flow_destination dest[MLX5_MAX_FLOW_FWD_VPORTS + 1] = {};
+ bool mirror = !!(attr->mirror_count);
struct mlx5_flow_act flow_act = {0};
- struct mlx5_flow_table *ft = NULL;
struct mlx5_flow_handle *rule;
+ struct mlx5_flow_table *fdb;
int j, i = 0;
void *misc;
if (esw->mode != SRIOV_OFFLOADS)
return ERR_PTR(-EOPNOTSUPP);
- if (attr->mirror_count)
- ft = esw->fdb_table.offloads.fwd_fdb;
- else
- ft = esw->fdb_table.offloads.fast_fdb;
-
flow_act.action = attr->action;
/* if per flow vlan pop/push is emulated, don't set that into the firmware */
if (!mlx5_eswitch_vlan_actions_supported(esw->dev, 1))
@@ -80,13 +107,28 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
}
if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
- for (j = attr->mirror_count; j < attr->out_count; j++) {
- dest[i].type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
- dest[i].vport.num = attr->out_rep[j]->vport;
- dest[i].vport.vhca_id =
- MLX5_CAP_GEN(attr->out_mdev[j], vhca_id);
- dest[i].vport.vhca_id_valid = !!MLX5_CAP_ESW(esw->dev, merged_eswitch);
+ if (attr->dest_chain) {
+ struct mlx5_flow_table *ft;
+
+ ft = esw_get_prio_table(esw, attr->dest_chain, 1, 0);
+ if (IS_ERR(ft)) {
+ rule = ERR_CAST(ft);
+ goto err_create_goto_table;
+ }
+
+ dest[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
+ dest[i].ft = ft;
i++;
+ } else {
+ for (j = attr->mirror_count; j < attr->out_count; j++) {
+ dest[i].type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
+ dest[i].vport.num = attr->out_rep[j]->vport;
+ dest[i].vport.vhca_id =
+ MLX5_CAP_GEN(attr->out_mdev[j], vhca_id);
+ dest[i].vport.vhca_id_valid =
+ !!MLX5_CAP_ESW(esw->dev, merged_eswitch);
+ i++;
+ }
}
}
if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
@@ -124,13 +166,26 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT)
flow_act.reformat_id = attr->encap_id;
- rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, i);
+ fdb = esw_get_prio_table(esw, attr->chain, attr->prio, !!mirror);
+ if (IS_ERR(fdb)) {
+ rule = ERR_CAST(fdb);
+ goto err_esw_get;
+ }
+
+ rule = mlx5_add_flow_rules(fdb, spec, &flow_act, dest, i);
if (IS_ERR(rule))
- goto out;
+ goto err_add_rule;
else
esw->offloads.num_flows++;
-out:
+ return rule;
+
+err_add_rule:
+ esw_put_prio_table(esw, attr->chain, attr->prio, !!mirror);
+err_esw_get:
+ if (attr->dest_chain)
+ esw_put_prio_table(esw, attr->dest_chain, 1, 0);
+err_create_goto_table:
return rule;
}
@@ -141,10 +196,24 @@ mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw,
{
struct mlx5_flow_destination dest[MLX5_MAX_FLOW_FWD_VPORTS + 1] = {};
struct mlx5_flow_act flow_act = {0};
+ struct mlx5_flow_table *fast_fdb;
+ struct mlx5_flow_table *fwd_fdb;
struct mlx5_flow_handle *rule;
void *misc;
int i;
+ fast_fdb = esw_get_prio_table(esw, attr->chain, attr->prio, 0);
+ if (IS_ERR(fast_fdb)) {
+ rule = ERR_CAST(fast_fdb);
+ goto err_get_fast;
+ }
+
+ fwd_fdb = esw_get_prio_table(esw, attr->chain, attr->prio, 1);
+ if (IS_ERR(fwd_fdb)) {
+ rule = ERR_CAST(fwd_fdb);
+ goto err_get_fwd;
+ }
+
flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
for (i = 0; i < attr->mirror_count; i++) {
dest[i].type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
@@ -154,7 +223,7 @@ mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw,
dest[i].vport.vhca_id_valid = !!MLX5_CAP_ESW(esw->dev, merged_eswitch);
}
dest[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
- dest[i].ft = esw->fdb_table.offloads.fwd_fdb,
+ dest[i].ft = fwd_fdb,
i++;
misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters);
@@ -177,21 +246,49 @@ mlx5_eswitch_add_fwd_rule(struct mlx5_eswitch *esw,
spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS |
MLX5_MATCH_MISC_PARAMETERS;
- rule = mlx5_add_flow_rules(esw->fdb_table.offloads.fast_fdb, spec, &flow_act, dest, i);
+ rule = mlx5_add_flow_rules(fast_fdb, spec, &flow_act, dest, i);
- if (!IS_ERR(rule))
- esw->offloads.num_flows++;
+ if (IS_ERR(rule))
+ goto add_err;
+ esw->offloads.num_flows++;
+
+ return rule;
+add_err:
+ esw_put_prio_table(esw, attr->chain, attr->prio, 1);
+err_get_fwd:
+ esw_put_prio_table(esw, attr->chain, attr->prio, 0);
+err_get_fast:
return rule;
}
+static void
+__mlx5_eswitch_del_rule(struct mlx5_eswitch *esw,
+ struct mlx5_flow_handle *rule,
+ struct mlx5_esw_flow_attr *attr,
+ bool fwd_rule)
+{
+ bool mirror = (attr->mirror_count > 0);
+
+ mlx5_del_flow_rules(rule);
+ esw->offloads.num_flows--;
+
+ if (fwd_rule) {
+ esw_put_prio_table(esw, attr->chain, attr->prio, 1);
+ esw_put_prio_table(esw, attr->chain, attr->prio, 0);
+ } else {
+ esw_put_prio_table(esw, attr->chain, attr->prio, !!mirror);
+ if (attr->dest_chain)
+ esw_put_prio_table(esw, attr->dest_chain, 1, 0);
+ }
+}
+
void
mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw,
struct mlx5_flow_handle *rule,
struct mlx5_esw_flow_attr *attr)
{
- mlx5_del_flow_rules(rule);
- esw->offloads.num_flows--;
+ __mlx5_eswitch_del_rule(esw, rule, attr, false);
}
void
@@ -199,7 +296,7 @@ mlx5_eswitch_del_fwd_rule(struct mlx5_eswitch *esw,
struct mlx5_flow_handle *rule,
struct mlx5_esw_flow_attr *attr)
{
- mlx5_eswitch_del_offloaded_rule(esw, rule, attr);
+ __mlx5_eswitch_del_rule(esw, rule, attr, true);
}
static int esw_set_global_vlan_pop(struct mlx5_eswitch *esw, u8 val)
@@ -288,7 +385,8 @@ int mlx5_eswitch_add_vlan_action(struct mlx5_eswitch *esw,
push = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH);
pop = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_VLAN_POP);
- fwd = !!(attr->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST);
+ fwd = !!((attr->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) &&
+ !attr->dest_chain);
err = esw_add_vlan_action_check(attr, push, pop, fwd);
if (err)
@@ -495,75 +593,164 @@ static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw)
#define ESW_OFFLOADS_NUM_GROUPS 4
-static int esw_create_offloads_fast_fdb_table(struct mlx5_eswitch *esw)
+/* Firmware currently has 4 pool of 4 sizes that it supports (ESW_POOLS),
+ * and a virtual memory region of 16M (ESW_SIZE), this region is duplicated
+ * for each flow table pool. We can allocate up to 16M of each pool,
+ * and we keep track of how much we used via put/get_sz_to_pool.
+ * Firmware doesn't report any of this for now.
+ * ESW_POOL is expected to be sorted from large to small
+ */
+#define ESW_SIZE (16 * 1024 * 1024)
+const unsigned int ESW_POOLS[4] = { 4 * 1024 * 1024, 1 * 1024 * 1024,
+ 64 * 1024, 4 * 1024 };
+
+static int
+get_sz_from_pool(struct mlx5_eswitch *esw)
+{
+ int sz = 0, i;
+
+ for (i = 0; i < ARRAY_SIZE(ESW_POOLS); i++) {
+ if (esw->fdb_table.offloads.fdb_left[i]) {
+ --esw->fdb_table.offloads.fdb_left[i];
+ sz = ESW_POOLS[i];
+ break;
+ }
+ }
+
+ return sz;
+}
+
+static void
+put_sz_to_pool(struct mlx5_eswitch *esw, int sz)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ESW_POOLS); i++) {
+ if (sz >= ESW_POOLS[i]) {
+ ++esw->fdb_table.offloads.fdb_left[i];
+ break;
+ }
+ }
+}
+
+static struct mlx5_flow_table *
+create_next_size_table(struct mlx5_eswitch *esw,
+ struct mlx5_flow_namespace *ns,
+ u16 table_prio,
+ int level,
+ u32 flags)
+{
+ struct mlx5_flow_table *fdb;
+ int sz;
+
+ sz = get_sz_from_pool(esw);
+ if (!sz)
+ return ERR_PTR(-ENOSPC);
+
+ fdb = mlx5_create_auto_grouped_flow_table(ns,
+ table_prio,
+ sz,
+ ESW_OFFLOADS_NUM_GROUPS,
+ level,
+ flags);
+ if (IS_ERR(fdb)) {
+ esw_warn(esw->dev, "Failed to create FDB Table err %d (table prio: %d, level: %d, size: %d)\n",
+ (int)PTR_ERR(fdb), table_prio, level, sz);
+ put_sz_to_pool(esw, sz);
+ }
+
+ return fdb;
+}
+
+static struct mlx5_flow_table *
+esw_get_prio_table(struct mlx5_eswitch *esw, u32 chain, u16 prio, int level)
{
struct mlx5_core_dev *dev = esw->dev;
- struct mlx5_flow_namespace *root_ns;
struct mlx5_flow_table *fdb = NULL;
- int esw_size, err = 0;
+ struct mlx5_flow_namespace *ns;
+ int table_prio, l = 0;
u32 flags = 0;
- u32 max_flow_counter = (MLX5_CAP_GEN(dev, max_flow_counter_31_16) << 16) |
- MLX5_CAP_GEN(dev, max_flow_counter_15_0);
- root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB);
- if (!root_ns) {
- esw_warn(dev, "Failed to get FDB flow namespace\n");
- err = -EOPNOTSUPP;
- goto out_namespace;
- }
-
- esw_debug(dev, "Create offloads FDB table, min (max esw size(2^%d), max counters(%d)*groups(%d))\n",
- MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size),
- max_flow_counter, ESW_OFFLOADS_NUM_GROUPS);
+ mutex_lock(&esw->fdb_table.offloads.fdb_prio_lock);
- esw_size = min_t(int, max_flow_counter * ESW_OFFLOADS_NUM_GROUPS,
- 1 << MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size));
+ fdb = fdb_prio_table(esw, chain, prio, level).fdb;
+ if (fdb) {
+ /* take ref on earlier levels as well */
+ while (level >= 0)
+ fdb_prio_table(esw, chain, prio, level--).num_rules++;
+ mutex_unlock(&esw->fdb_table.offloads.fdb_prio_lock);
+ return fdb;
+ }
- if (mlx5_esw_has_fwd_fdb(dev))
- esw_size >>= 1;
+ ns = mlx5_get_fdb_sub_ns(dev, chain);
+ if (!ns) {
+ esw_warn(dev, "Failed to get FDB sub namespace\n");
+ mutex_unlock(&esw->fdb_table.offloads.fdb_prio_lock);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
if (esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE)
flags |= (MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT |
MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
- fdb = mlx5_create_auto_grouped_flow_table(root_ns, FDB_FAST_PATH,
- esw_size,
- ESW_OFFLOADS_NUM_GROUPS, 0,
- flags);
- if (IS_ERR(fdb)) {
- err = PTR_ERR(fdb);
- esw_warn(dev, "Failed to create Fast path FDB Table err %d\n", err);
- goto out_namespace;
- }
- esw->fdb_table.offloads.fast_fdb = fdb;
+ table_prio = (chain * FDB_MAX_PRIO) + prio - 1;
- if (!mlx5_esw_has_fwd_fdb(dev))
- goto out_namespace;
+ /* create earlier levels for correct fs_core lookup when
+ * connecting tables
+ */
+ for (l = 0; l <= level; l++) {
+ if (fdb_prio_table(esw, chain, prio, l).fdb) {
+ fdb_prio_table(esw, chain, prio, l).num_rules++;
+ continue;
+ }
- fdb = mlx5_create_auto_grouped_flow_table(root_ns, FDB_FAST_PATH,
- esw_size,
- ESW_OFFLOADS_NUM_GROUPS, 1,
- flags);
- if (IS_ERR(fdb)) {
- err = PTR_ERR(fdb);
- esw_warn(dev, "Failed to create fwd table err %d\n", err);
- goto out_ft;
+ fdb = create_next_size_table(esw, ns, table_prio, l, flags);
+ if (IS_ERR(fdb)) {
+ l--;
+ goto err_create_fdb;
+ }
+
+ fdb_prio_table(esw, chain, prio, l).fdb = fdb;
+ fdb_prio_table(esw, chain, prio, l).num_rules = 1;
}
- esw->fdb_table.offloads.fwd_fdb = fdb;
- return err;
+ mutex_unlock(&esw->fdb_table.offloads.fdb_prio_lock);
+ return fdb;
-out_ft:
- mlx5_destroy_flow_table(esw->fdb_table.offloads.fast_fdb);
-out_namespace:
- return err;
+err_create_fdb:
+ mutex_unlock(&esw->fdb_table.offloads.fdb_prio_lock);
+ if (l >= 0)
+ esw_put_prio_table(esw, chain, prio, l);
+
+ return fdb;
}
-static void esw_destroy_offloads_fast_fdb_table(struct mlx5_eswitch *esw)
+static void
+esw_put_prio_table(struct mlx5_eswitch *esw, u32 chain, u16 prio, int level)
{
- if (mlx5_esw_has_fwd_fdb(esw->dev))
- mlx5_destroy_flow_table(esw->fdb_table.offloads.fwd_fdb);
- mlx5_destroy_flow_table(esw->fdb_table.offloads.fast_fdb);
+ int l;
+
+ mutex_lock(&esw->fdb_table.offloads.fdb_prio_lock);
+
+ for (l = level; l >= 0; l--) {
+ if (--(fdb_prio_table(esw, chain, prio, l).num_rules) > 0)
+ continue;
+
+ put_sz_to_pool(esw, fdb_prio_table(esw, chain, prio, l).fdb->max_fte);
+ mlx5_destroy_flow_table(fdb_prio_table(esw, chain, prio, l).fdb);
+ fdb_prio_table(esw, chain, prio, l).fdb = NULL;
+ }
+
+ mutex_unlock(&esw->fdb_table.offloads.fdb_prio_lock);
+}
+
+static void esw_destroy_offloads_fast_fdb_tables(struct mlx5_eswitch *esw)
+{
+ /* If lazy creation isn't supported, deref the fast path tables */
+ if (!(esw->fdb_table.flags & ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED)) {
+ esw_put_prio_table(esw, 0, 1, 1);
+ esw_put_prio_table(esw, 0, 1, 0);
+ }
}
#define MAX_PF_SQ 256
@@ -574,12 +761,13 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports)
int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
struct mlx5_flow_table_attr ft_attr = {};
struct mlx5_core_dev *dev = esw->dev;
+ u32 *flow_group_in, max_flow_counter;
struct mlx5_flow_namespace *root_ns;
struct mlx5_flow_table *fdb = NULL;
- int table_size, ix, err = 0;
+ int table_size, ix, err = 0, i;
struct mlx5_flow_group *g;
+ u32 flags = 0, fdb_max;
void *match_criteria;
- u32 *flow_group_in;
u8 *dmac;
esw_debug(esw->dev, "Create offloads FDB Tables\n");
@@ -594,12 +782,29 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports)
goto ns_err;
}
- err = esw_create_offloads_fast_fdb_table(esw);
- if (err)
- goto fast_fdb_err;
+ max_flow_counter = (MLX5_CAP_GEN(dev, max_flow_counter_31_16) << 16) |
+ MLX5_CAP_GEN(dev, max_flow_counter_15_0);
+ fdb_max = 1 << MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size);
+
+ esw_debug(dev, "Create offloads FDB table, min (max esw size(2^%d), max counters(%d), groups(%d), max flow table size(2^%d))\n",
+ MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size),
+ max_flow_counter, ESW_OFFLOADS_NUM_GROUPS,
+ fdb_max);
+
+ for (i = 0; i < ARRAY_SIZE(ESW_POOLS); i++)
+ esw->fdb_table.offloads.fdb_left[i] =
+ ESW_POOLS[i] <= fdb_max ? ESW_SIZE / ESW_POOLS[i] : 0;
table_size = nvports * MAX_SQ_NVPORTS + MAX_PF_SQ + 2;
+ /* create the slow path fdb with encap set, so further table instances
+ * can be created at run time while VFs are probed if the FW allows that.
+ */
+ if (esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE)
+ flags |= (MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT |
+ MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
+
+ ft_attr.flags = flags;
ft_attr.max_fte = table_size;
ft_attr.prio = FDB_SLOW_PATH;
@@ -611,6 +816,18 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports)
}
esw->fdb_table.offloads.slow_fdb = fdb;
+ /* If lazy creation isn't supported, open the fast path tables now */
+ if (!MLX5_CAP_ESW_FLOWTABLE(esw->dev, multi_fdb_encap) &&
+ esw->offloads.encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE) {
+ esw->fdb_table.flags &= ~ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED;
+ esw_warn(dev, "Lazy creation of flow tables isn't supported, ignoring priorities\n");
+ esw_get_prio_table(esw, 0, 1, 0);
+ esw_get_prio_table(esw, 0, 1, 1);
+ } else {
+ esw_debug(dev, "Lazy creation of flow tables supported, deferring table opening\n");
+ esw->fdb_table.flags |= ESW_FDB_CHAINS_AND_PRIOS_SUPPORTED;
+ }
+
/* create send-to-vport group */
memset(flow_group_in, 0, inlen);
MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable,
@@ -658,6 +875,7 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports)
if (err)
goto miss_rule_err;
+ esw->nvports = nvports;
kvfree(flow_group_in);
return 0;
@@ -666,10 +884,9 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports)
miss_err:
mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_grp);
send_vport_err:
+ esw_destroy_offloads_fast_fdb_tables(esw);
mlx5_destroy_flow_table(esw->fdb_table.offloads.slow_fdb);
slow_fdb_err:
- esw_destroy_offloads_fast_fdb_table(esw);
-fast_fdb_err:
ns_err:
kvfree(flow_group_in);
return err;
@@ -677,7 +894,7 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports)
static void esw_destroy_offloads_fdb_tables(struct mlx5_eswitch *esw)
{
- if (!esw->fdb_table.offloads.fast_fdb)
+ if (!esw->fdb_table.offloads.slow_fdb)
return;
esw_debug(esw->dev, "Destroy offloads FDB Tables\n");
@@ -687,7 +904,7 @@ static void esw_destroy_offloads_fdb_tables(struct mlx5_eswitch *esw)
mlx5_destroy_flow_group(esw->fdb_table.offloads.miss_grp);
mlx5_destroy_flow_table(esw->fdb_table.offloads.slow_fdb);
- esw_destroy_offloads_fast_fdb_table(esw);
+ esw_destroy_offloads_fast_fdb_tables(esw);
}
static int esw_create_offloads_table(struct mlx5_eswitch *esw)
@@ -944,6 +1161,8 @@ int esw_offloads_init(struct mlx5_eswitch *esw, int nvports)
{
int err;
+ mutex_init(&esw->fdb_table.offloads.fdb_prio_lock);
+
err = esw_create_offloads_fdb_tables(esw, nvports);
if (err)
return err;
@@ -1272,16 +1491,19 @@ int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, u8 encap,
return -EOPNOTSUPP;
}
- esw_destroy_offloads_fast_fdb_table(esw);
+ esw_destroy_offloads_fdb_tables(esw);
esw->offloads.encap = encap;
- err = esw_create_offloads_fast_fdb_table(esw);
+
+ err = esw_create_offloads_fdb_tables(esw, esw->nvports);
+
if (err) {
NL_SET_ERR_MSG_MOD(extack,
"Failed re-creating fast FDB table");
esw->offloads.encap = !encap;
- (void)esw_create_offloads_fast_fdb_table(esw);
+ (void)esw_create_offloads_fdb_tables(esw, esw->nvports);
}
+
return err;
}
--
2.17.2
^ permalink raw reply related
* [net-next 07/14] net/mlx5: E-Switch, Have explicit API to delete fwd rules
From: Saeed Mahameed @ 2018-10-18 0:08 UTC (permalink / raw)
To: David S. Miller, netdev; +Cc: Or Gerlitz, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>
From: Or Gerlitz <ogerlitz@mellanox.com>
Be symmetric with the e-switch API to add rules which has a
specific function to add fwd rules which are used as part of
vport mirroring.
This patch doesn't change any functionality.
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Reviewed-by: Paul Blakey <paulb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 4 ++--
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 4 ++++
.../net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 8 ++++++++
3 files changed, 14 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index a4a432f02930..7487bdd55f23 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -920,7 +920,7 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
flow->flags &= ~MLX5E_TC_FLOW_OFFLOADED;
if (attr->mirror_count)
- mlx5_eswitch_del_offloaded_rule(esw, flow->rule[1], attr);
+ mlx5_eswitch_del_fwd_rule(esw, flow->rule[1], attr);
mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
}
@@ -996,7 +996,7 @@ void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv,
flow->flags &= ~MLX5E_TC_FLOW_OFFLOADED;
if (attr->mirror_count)
- mlx5_eswitch_del_offloaded_rule(esw, flow->rule[1], attr);
+ mlx5_eswitch_del_fwd_rule(esw, flow->rule[1], attr);
mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
}
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 1698a322a7c4..584e735bbad1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -231,6 +231,10 @@ void
mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw,
struct mlx5_flow_handle *rule,
struct mlx5_esw_flow_attr *attr);
+void
+mlx5_eswitch_del_fwd_rule(struct mlx5_eswitch *esw,
+ struct mlx5_flow_handle *rule,
+ struct mlx5_esw_flow_attr *attr);
struct mlx5_flow_handle *
mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 39932dce15cb..983bb8a80f75 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -194,6 +194,14 @@ mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw,
esw->offloads.num_flows--;
}
+void
+mlx5_eswitch_del_fwd_rule(struct mlx5_eswitch *esw,
+ struct mlx5_flow_handle *rule,
+ struct mlx5_esw_flow_attr *attr)
+{
+ mlx5_eswitch_del_offloaded_rule(esw, rule, attr);
+}
+
static int esw_set_global_vlan_pop(struct mlx5_eswitch *esw, u8 val)
{
struct mlx5_eswitch_rep *rep;
--
2.17.2
^ permalink raw reply related
* [net-next 06/14] net/mlx5: Split FDB fast path prio to multiple namespaces
From: Saeed Mahameed @ 2018-10-18 0:08 UTC (permalink / raw)
To: David S. Miller, netdev; +Cc: Paul Blakey, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>
From: Paul Blakey <paulb@mellanox.com>
Towards supporting multi-chains and priorities, split the FDB fast path
to multiple namespaces (sub namespaces), each with multiple priorities.
This patch adds a new flow steering type, FS_TYPE_PRIO_CHAINS, which is
like current FS_TYPE_PRIO, but may contain only namespaces, and those
will be in parallel to one another in terms of managing of the flow
tables connections inside them. Meaning, while searching for the next
or previous flow table to connect for a new table inside such namespace
we skip the parallel namespaces in the same level under the
FS_TYPE_PRIO_CHAINS prio we originated from.
We use this new type for splitting the fast path prio into multiple
parallel namespaces, each containing normal prios.
The prios inside them (and their tables) will be connected to one
another, but not from one parallel namespace to another, instead the
last prio in each namespace will be connected to the next prio in
the containing FDB namespace, which is the slow path prio.
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Acked-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
.../net/ethernet/mellanox/mlx5/core/eswitch.c | 2 +-
.../net/ethernet/mellanox/mlx5/core/eswitch.h | 7 ++
.../net/ethernet/mellanox/mlx5/core/fs_core.c | 88 ++++++++++++++++---
.../net/ethernet/mellanox/mlx5/core/fs_core.h | 13 +++
include/linux/mlx5/fs.h | 2 +
5 files changed, 101 insertions(+), 11 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 9c893d7d273e..d004957328f9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -263,7 +263,7 @@ static int esw_create_legacy_fdb_table(struct mlx5_eswitch *esw)
esw_debug(dev, "Create FDB log_max_size(%d)\n",
MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size));
- root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB);
+ root_ns = mlx5_get_fdb_sub_ns(dev, 0);
if (!root_ns) {
esw_warn(dev, "Failed to get FDB flow namespace\n");
return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index c1b627577003..1698a322a7c4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -59,6 +59,9 @@
#define mlx5_esw_has_fwd_fdb(dev) \
MLX5_CAP_ESW_FLOWTABLE(dev, fdb_multi_path_to_table)
+#define FDB_MAX_CHAIN 3
+#define FDB_MAX_PRIO 16
+
struct vport_ingress {
struct mlx5_flow_table *acl;
struct mlx5_flow_group *allow_untagged_spoofchk_grp;
@@ -319,6 +322,10 @@ static inline void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) {}
static inline void mlx5_eswitch_vport_event(struct mlx5_eswitch *esw, struct mlx5_eqe *eqe) {}
static inline int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode) { return 0; }
static inline void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw) {}
+
+#define FDB_MAX_CHAIN 1
+#define FDB_MAX_PRIO 1
+
#endif /* CONFIG_MLX5_ESWITCH */
#endif /* __MLX5_ESWITCH_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index cdcbf9d0ae6c..7eb6d58733ac 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -40,6 +40,7 @@
#include "diag/fs_tracepoint.h"
#include "accel/ipsec.h"
#include "fpga/ipsec.h"
+#include "eswitch.h"
#define INIT_TREE_NODE_ARRAY_SIZE(...) (sizeof((struct init_tree_node[]){__VA_ARGS__}) /\
sizeof(struct init_tree_node))
@@ -713,7 +714,7 @@ static struct mlx5_flow_table *find_closest_ft_recursive(struct fs_node *root,
struct fs_node *iter = list_entry(start, struct fs_node, list);
struct mlx5_flow_table *ft = NULL;
- if (!root)
+ if (!root || root->type == FS_TYPE_PRIO_CHAINS)
return NULL;
list_for_each_advance_continue(iter, &root->children, reverse) {
@@ -1973,6 +1974,18 @@ void mlx5_destroy_flow_group(struct mlx5_flow_group *fg)
fg->id);
}
+struct mlx5_flow_namespace *mlx5_get_fdb_sub_ns(struct mlx5_core_dev *dev,
+ int n)
+{
+ struct mlx5_flow_steering *steering = dev->priv.steering;
+
+ if (!steering || !steering->fdb_sub_ns)
+ return NULL;
+
+ return steering->fdb_sub_ns[n];
+}
+EXPORT_SYMBOL(mlx5_get_fdb_sub_ns);
+
struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev,
enum mlx5_flow_namespace_type type)
{
@@ -2051,8 +2064,10 @@ struct mlx5_flow_namespace *mlx5_get_flow_vport_acl_namespace(struct mlx5_core_d
}
}
-static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns,
- unsigned int prio, int num_levels)
+static struct fs_prio *_fs_create_prio(struct mlx5_flow_namespace *ns,
+ unsigned int prio,
+ int num_levels,
+ enum fs_node_type type)
{
struct fs_prio *fs_prio;
@@ -2060,7 +2075,7 @@ static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns,
if (!fs_prio)
return ERR_PTR(-ENOMEM);
- fs_prio->node.type = FS_TYPE_PRIO;
+ fs_prio->node.type = type;
tree_init_node(&fs_prio->node, NULL, del_sw_prio);
tree_add_node(&fs_prio->node, &ns->node);
fs_prio->num_levels = num_levels;
@@ -2070,6 +2085,19 @@ static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns,
return fs_prio;
}
+static struct fs_prio *fs_create_prio_chained(struct mlx5_flow_namespace *ns,
+ unsigned int prio,
+ int num_levels)
+{
+ return _fs_create_prio(ns, prio, num_levels, FS_TYPE_PRIO_CHAINS);
+}
+
+static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns,
+ unsigned int prio, int num_levels)
+{
+ return _fs_create_prio(ns, prio, num_levels, FS_TYPE_PRIO);
+}
+
static struct mlx5_flow_namespace *fs_init_namespace(struct mlx5_flow_namespace
*ns)
{
@@ -2374,6 +2402,9 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev)
cleanup_egress_acls_root_ns(dev);
cleanup_ingress_acls_root_ns(dev);
cleanup_root_ns(steering->fdb_root_ns);
+ steering->fdb_root_ns = NULL;
+ kfree(steering->fdb_sub_ns);
+ steering->fdb_sub_ns = NULL;
cleanup_root_ns(steering->sniffer_rx_root_ns);
cleanup_root_ns(steering->sniffer_tx_root_ns);
cleanup_root_ns(steering->egress_root_ns);
@@ -2419,27 +2450,64 @@ static int init_sniffer_rx_root_ns(struct mlx5_flow_steering *steering)
static int init_fdb_root_ns(struct mlx5_flow_steering *steering)
{
- struct fs_prio *prio;
+ struct mlx5_flow_namespace *ns;
+ struct fs_prio *maj_prio;
+ struct fs_prio *min_prio;
+ int levels;
+ int chain;
+ int prio;
+ int err;
steering->fdb_root_ns = create_root_ns(steering, FS_FT_FDB);
if (!steering->fdb_root_ns)
return -ENOMEM;
- prio = fs_create_prio(&steering->fdb_root_ns->ns, 0, 2);
- if (IS_ERR(prio))
+ steering->fdb_sub_ns = kzalloc(sizeof(steering->fdb_sub_ns) *
+ FDB_MAX_CHAIN + 1, GFP_KERNEL);
+ if (!steering->fdb_sub_ns)
+ return -ENOMEM;
+
+ levels = 2 * FDB_MAX_PRIO * (FDB_MAX_CHAIN + 1);
+ maj_prio = fs_create_prio_chained(&steering->fdb_root_ns->ns, 0,
+ levels);
+ if (IS_ERR(maj_prio)) {
+ err = PTR_ERR(maj_prio);
goto out_err;
+ }
- prio = fs_create_prio(&steering->fdb_root_ns->ns, 1, 1);
- if (IS_ERR(prio))
+ for (chain = 0; chain <= FDB_MAX_CHAIN; chain++) {
+ ns = fs_create_namespace(maj_prio);
+ if (IS_ERR(ns)) {
+ err = PTR_ERR(ns);
+ goto out_err;
+ }
+
+ for (prio = 0; prio < FDB_MAX_PRIO * (chain + 1); prio++) {
+ min_prio = fs_create_prio(ns, prio, 2);
+ if (IS_ERR(min_prio)) {
+ err = PTR_ERR(min_prio);
+ goto out_err;
+ }
+ }
+
+ steering->fdb_sub_ns[chain] = ns;
+ }
+
+ maj_prio = fs_create_prio(&steering->fdb_root_ns->ns, 1, 1);
+ if (IS_ERR(maj_prio)) {
+ err = PTR_ERR(maj_prio);
goto out_err;
+ }
set_prio_attrs(steering->fdb_root_ns);
return 0;
out_err:
cleanup_root_ns(steering->fdb_root_ns);
+ kfree(steering->fdb_sub_ns);
+ steering->fdb_sub_ns = NULL;
steering->fdb_root_ns = NULL;
- return PTR_ERR(prio);
+ return err;
}
static int init_egress_acl_root_ns(struct mlx5_flow_steering *steering, int vport)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index a06f83c0c2b6..b51ad217da32 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -38,9 +38,21 @@
#include <linux/rhashtable.h>
#include <linux/llist.h>
+/* FS_TYPE_PRIO_CHAINS is a PRIO that will have namespaces only,
+ * and those are in parallel to one another when going over them to connect
+ * a new flow table. Meaning the last flow table in a TYPE_PRIO prio in one
+ * parallel namespace will not automatically connect to the first flow table
+ * found in any prio in any next namespace, but skip the entire containing
+ * TYPE_PRIO_CHAINS prio.
+ *
+ * This is used to implement tc chains, each chain of prios is a different
+ * namespace inside a containing TYPE_PRIO_CHAINS prio.
+ */
+
enum fs_node_type {
FS_TYPE_NAMESPACE,
FS_TYPE_PRIO,
+ FS_TYPE_PRIO_CHAINS,
FS_TYPE_FLOW_TABLE,
FS_TYPE_FLOW_GROUP,
FS_TYPE_FLOW_ENTRY,
@@ -73,6 +85,7 @@ struct mlx5_flow_steering {
struct kmem_cache *ftes_cache;
struct mlx5_flow_root_namespace *root_ns;
struct mlx5_flow_root_namespace *fdb_root_ns;
+ struct mlx5_flow_namespace **fdb_sub_ns;
struct mlx5_flow_root_namespace **esw_egress_root_ns;
struct mlx5_flow_root_namespace **esw_ingress_root_ns;
struct mlx5_flow_root_namespace *sniffer_tx_root_ns;
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index a5fc62184195..f8d00872c7d3 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -101,6 +101,8 @@ struct mlx5_flow_destination {
};
};
+struct mlx5_flow_namespace *
+mlx5_get_fdb_sub_ns(struct mlx5_core_dev *dev, int n);
struct mlx5_flow_namespace *
mlx5_get_flow_namespace(struct mlx5_core_dev *dev,
enum mlx5_flow_namespace_type type);
--
2.17.2
^ permalink raw reply related
* [net-next 05/14] net/mlx5: Add cap bits for multi fdb encap
From: Saeed Mahameed @ 2018-10-18 0:08 UTC (permalink / raw)
To: David S. Miller, netdev; +Cc: Paul Blakey, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>
From: Paul Blakey <paulb@mellanox.com>
If set, the firmware supports creating of flow tables with encap
enabled while VFs are configured, if we already created one
(restriction still applies on the first creation).
Signed-off-by: Paul Blakey <paulb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
include/linux/mlx5/mlx5_ifc.h | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 15e36198f85f..963611820006 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -584,7 +584,9 @@ struct mlx5_ifc_flow_table_nic_cap_bits {
struct mlx5_ifc_flow_table_eswitch_cap_bits {
u8 reserved_at_0[0x1c];
u8 fdb_multi_path_to_table[0x1];
- u8 reserved_at_1d[0x1e3];
+ u8 reserved_at_1d[0x1];
+ u8 multi_fdb_encap[0x1];
+ u8 reserved_at_1e[0x1e1];
struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_esw_fdb;
--
2.17.2
^ permalink raw reply related
* [net-next 03/14] net/mlx5e: Change return type of tc add flow functions
From: Saeed Mahameed @ 2018-10-18 0:08 UTC (permalink / raw)
To: David S. Miller, netdev; +Cc: Rabie Loulou, Shahar Klein, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>
From: Rabie Loulou <rabiel@mellanox.com>
Refactor the flow add utility functions to return err code instead of rule
pointers. This will allow for simpler logic when one tc rule is
duplicated to two HW rules in downstream patches.
Signed-off-by: Rabie Loulou <rabiel@mellanox.com>
Signed-off-by: Shahar Klein <shahark@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
.../net/ethernet/mellanox/mlx5/core/en_tc.c | 86 +++++++++----------
1 file changed, 39 insertions(+), 47 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 5ce87f54852d..861986f82844 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -673,7 +673,7 @@ static void mlx5e_hairpin_flow_del(struct mlx5e_priv *priv,
}
}
-static struct mlx5_flow_handle *
+static int
mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
struct mlx5e_tc_flow_parse_attr *parse_attr,
struct mlx5e_tc_flow *flow,
@@ -689,14 +689,12 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
.reformat_id = 0,
};
struct mlx5_fc *counter = NULL;
- struct mlx5_flow_handle *rule;
bool table_created = false;
int err, dest_ix = 0;
if (flow->flags & MLX5E_TC_FLOW_HAIRPIN) {
err = mlx5e_hairpin_flow_add(priv, flow, parse_attr, extack);
if (err) {
- rule = ERR_PTR(err);
goto err_add_hairpin_flow;
}
if (flow->flags & MLX5E_TC_FLOW_HAIRPIN_RSS) {
@@ -716,7 +714,7 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
counter = mlx5_fc_create(dev, true);
if (IS_ERR(counter)) {
- rule = ERR_CAST(counter);
+ err = PTR_ERR(counter);
goto err_fc_create;
}
dest[dest_ix].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
@@ -729,10 +727,8 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
err = mlx5e_attach_mod_hdr(priv, flow, parse_attr);
flow_act.modify_id = attr->mod_hdr_id;
kfree(parse_attr->mod_hdr_actions);
- if (err) {
- rule = ERR_PTR(err);
+ if (err)
goto err_create_mod_hdr_id;
- }
}
if (IS_ERR_OR_NULL(priv->fs.tc.t)) {
@@ -758,7 +754,7 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
"Failed to create tc offload table\n");
netdev_err(priv->netdev,
"Failed to create tc offload table\n");
- rule = ERR_CAST(priv->fs.tc.t);
+ err = PTR_ERR(priv->fs.tc.t);
goto err_create_ft;
}
@@ -768,13 +764,15 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
if (attr->match_level != MLX5_MATCH_NONE)
parse_attr->spec.match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
- rule = mlx5_add_flow_rules(priv->fs.tc.t, &parse_attr->spec,
- &flow_act, dest, dest_ix);
+ flow->rule[0] = mlx5_add_flow_rules(priv->fs.tc.t, &parse_attr->spec,
+ &flow_act, dest, dest_ix);
- if (IS_ERR(rule))
+ if (IS_ERR(flow->rule[0])) {
+ err = PTR_ERR(flow->rule[0]);
goto err_add_rule;
+ }
- return rule;
+ return 0;
err_add_rule:
if (table_created) {
@@ -790,7 +788,7 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
if (flow->flags & MLX5E_TC_FLOW_HAIRPIN)
mlx5e_hairpin_flow_del(priv, flow);
err_add_hairpin_flow:
- return rule;
+ return err;
}
static void mlx5e_tc_del_nic_flow(struct mlx5e_priv *priv,
@@ -825,7 +823,7 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
struct mlx5e_tc_flow *flow,
struct netlink_ext_ack *extack);
-static struct mlx5_flow_handle *
+static int
mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
struct mlx5e_tc_flow_parse_attr *parse_attr,
struct mlx5e_tc_flow *flow,
@@ -834,21 +832,20 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
struct mlx5_esw_flow_attr *attr = flow->esw_attr;
struct net_device *out_dev, *encap_dev = NULL;
- struct mlx5_flow_handle *rule = NULL;
struct mlx5_fc *counter = NULL;
struct mlx5e_rep_priv *rpriv;
struct mlx5e_priv *out_priv;
- int err;
+ int err = 0, encap_err = 0;
if (attr->action & MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT) {
out_dev = __dev_get_by_index(dev_net(priv->netdev),
attr->parse_attr->mirred_ifindex);
- err = mlx5e_attach_encap(priv, &parse_attr->tun_info,
- out_dev, &encap_dev, flow, extack);
- if (err) {
- rule = ERR_PTR(err);
- if (err != -EAGAIN)
- goto err_attach_encap;
+ encap_err = mlx5e_attach_encap(priv, &parse_attr->tun_info,
+ out_dev, &encap_dev, flow,
+ extack);
+ if (encap_err && encap_err != -EAGAIN) {
+ err = encap_err;
+ goto err_attach_encap;
}
out_priv = netdev_priv(encap_dev);
rpriv = out_priv->ppriv;
@@ -857,49 +854,49 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
}
err = mlx5_eswitch_add_vlan_action(esw, attr);
- if (err) {
- rule = ERR_PTR(err);
+ if (err)
goto err_add_vlan;
- }
if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) {
err = mlx5e_attach_mod_hdr(priv, flow, parse_attr);
kfree(parse_attr->mod_hdr_actions);
- if (err) {
- rule = ERR_PTR(err);
+ if (err)
goto err_mod_hdr;
- }
}
if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
counter = mlx5_fc_create(esw->dev, true);
if (IS_ERR(counter)) {
- rule = ERR_CAST(counter);
+ err = PTR_ERR(counter);
goto err_create_counter;
}
attr->counter = counter;
}
- /* we get here if (1) there's no error (rule being null) or when
+ /* we get here if (1) there's no error or when
* (2) there's an encap action and we're on -EAGAIN (no valid neigh)
*/
- if (rule != ERR_PTR(-EAGAIN)) {
- rule = mlx5_eswitch_add_offloaded_rule(esw, &parse_attr->spec, attr);
- if (IS_ERR(rule))
+ if (encap_err != -EAGAIN) {
+ flow->rule[0] = mlx5_eswitch_add_offloaded_rule(esw, &parse_attr->spec, attr);
+ if (IS_ERR(flow->rule[0])) {
+ err = PTR_ERR(flow->rule[0]);
goto err_add_rule;
+ }
if (attr->mirror_count) {
flow->rule[1] = mlx5_eswitch_add_fwd_rule(esw, &parse_attr->spec, attr);
- if (IS_ERR(flow->rule[1]))
+ if (IS_ERR(flow->rule[1])) {
+ err = PTR_ERR(flow->rule[1]);
goto err_fwd_rule;
+ }
}
}
- return rule;
+
+ return encap_err;
err_fwd_rule:
- mlx5_eswitch_del_offloaded_rule(esw, rule, attr);
- rule = flow->rule[1];
+ mlx5_eswitch_del_offloaded_rule(esw, flow->rule[0], attr);
err_add_rule:
mlx5_fc_destroy(esw->dev, counter);
err_create_counter:
@@ -911,7 +908,7 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
if (attr->action & MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT)
mlx5e_detach_encap(priv, flow);
err_attach_encap:
- return rule;
+ return err;
}
static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
@@ -2959,22 +2956,17 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv,
extack);
if (err < 0)
goto err_free;
- flow->rule[0] = mlx5e_tc_add_fdb_flow(priv, parse_attr, flow,
- extack);
+ err = mlx5e_tc_add_fdb_flow(priv, parse_attr, flow, extack);
} else {
err = parse_tc_nic_actions(priv, f->exts, parse_attr, flow,
extack);
if (err < 0)
goto err_free;
- flow->rule[0] = mlx5e_tc_add_nic_flow(priv, parse_attr, flow,
- extack);
+ err = mlx5e_tc_add_nic_flow(priv, parse_attr, flow, extack);
}
- if (IS_ERR(flow->rule[0])) {
- err = PTR_ERR(flow->rule[0]);
- if (err != -EAGAIN)
- goto err_free;
- }
+ if (err && err != -EAGAIN)
+ goto err_free;
if (err != -EAGAIN)
flow->flags |= MLX5E_TC_FLOW_OFFLOADED;
--
2.17.2
^ permalink raw reply related
* [net-next 04/14] net/mlx5e: Split TC add rule path for nic vs e-switch
From: Saeed Mahameed @ 2018-10-18 0:08 UTC (permalink / raw)
To: David S. Miller, netdev; +Cc: Roi Dayan, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>
From: Roi Dayan <roid@mellanox.com>
Move to have clear separation on the code path to add nic vs e-switch
flows. While here we break the code that deals with adding offloaded
TC tool to few smaller stages, each on helper function.
Besides getting us simpler and readable code, these are pre-steps
for being able to have two HW flows serving one SW TC flow for some
e-switch use cases.
Signed-off-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
.../net/ethernet/mellanox/mlx5/core/en_tc.c | 185 +++++++++++++-----
1 file changed, 138 insertions(+), 47 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 861986f82844..a4a432f02930 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -2907,34 +2907,15 @@ static struct rhashtable *get_tc_ht(struct mlx5e_priv *priv)
return &priv->fs.tc.ht;
}
-int mlx5e_configure_flower(struct mlx5e_priv *priv,
- struct tc_cls_flower_offload *f, int flags)
+static int
+mlx5e_alloc_flow(struct mlx5e_priv *priv, int attr_size,
+ struct tc_cls_flower_offload *f, u8 flow_flags,
+ struct mlx5e_tc_flow_parse_attr **__parse_attr,
+ struct mlx5e_tc_flow **__flow)
{
- struct netlink_ext_ack *extack = f->common.extack;
- struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
struct mlx5e_tc_flow_parse_attr *parse_attr;
- struct rhashtable *tc_ht = get_tc_ht(priv);
struct mlx5e_tc_flow *flow;
- int attr_size, err = 0;
- u8 flow_flags = 0;
-
- get_flags(flags, &flow_flags);
-
- flow = rhashtable_lookup_fast(tc_ht, &f->cookie, tc_ht_params);
- if (flow) {
- NL_SET_ERR_MSG_MOD(extack,
- "flow cookie already exists, ignoring");
- netdev_warn_once(priv->netdev, "flow cookie %lx already exists, ignoring\n", f->cookie);
- return 0;
- }
-
- if (esw && esw->mode == SRIOV_OFFLOADS) {
- flow_flags |= MLX5E_TC_FLOW_ESWITCH;
- attr_size = sizeof(struct mlx5_esw_flow_attr);
- } else {
- flow_flags |= MLX5E_TC_FLOW_NIC;
- attr_size = sizeof(struct mlx5_nic_flow_attr);
- }
+ int err;
flow = kzalloc(sizeof(*flow) + attr_size, GFP_KERNEL);
parse_attr = kvzalloc(sizeof(*parse_attr), GFP_KERNEL);
@@ -2948,45 +2929,155 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv,
flow->priv = priv;
err = parse_cls_flower(priv, flow, &parse_attr->spec, f);
- if (err < 0)
+ if (err)
goto err_free;
- if (flow->flags & MLX5E_TC_FLOW_ESWITCH) {
- err = parse_tc_fdb_actions(priv, f->exts, parse_attr, flow,
- extack);
- if (err < 0)
- goto err_free;
- err = mlx5e_tc_add_fdb_flow(priv, parse_attr, flow, extack);
- } else {
- err = parse_tc_nic_actions(priv, f->exts, parse_attr, flow,
- extack);
- if (err < 0)
- goto err_free;
- err = mlx5e_tc_add_nic_flow(priv, parse_attr, flow, extack);
- }
+ *__flow = flow;
+ *__parse_attr = parse_attr;
+
+ return 0;
+err_free:
+ kfree(flow);
+ kvfree(parse_attr);
+ return err;
+}
+
+static int
+mlx5e_add_fdb_flow(struct mlx5e_priv *priv,
+ struct tc_cls_flower_offload *f,
+ u8 flow_flags,
+ struct mlx5e_tc_flow **__flow)
+{
+ struct netlink_ext_ack *extack = f->common.extack;
+ struct mlx5e_tc_flow_parse_attr *parse_attr;
+ struct mlx5e_tc_flow *flow;
+ int attr_size, err;
+
+ flow_flags |= MLX5E_TC_FLOW_ESWITCH;
+ attr_size = sizeof(struct mlx5_esw_flow_attr);
+ err = mlx5e_alloc_flow(priv, attr_size, f, flow_flags,
+ &parse_attr, &flow);
+ if (err)
+ goto out;
+
+ err = parse_tc_fdb_actions(priv, f->exts, parse_attr, flow, extack);
+ if (err)
+ goto err_free;
+
+ err = mlx5e_tc_add_fdb_flow(priv, parse_attr, flow, extack);
if (err && err != -EAGAIN)
goto err_free;
- if (err != -EAGAIN)
+ if (!err)
flow->flags |= MLX5E_TC_FLOW_OFFLOADED;
- if (!(flow->flags & MLX5E_TC_FLOW_ESWITCH) ||
- !(flow->esw_attr->action &
+ if (!(flow->esw_attr->action &
MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT))
kvfree(parse_attr);
- err = rhashtable_insert_fast(tc_ht, &flow->node, tc_ht_params);
- if (err) {
- mlx5e_tc_del_flow(priv, flow);
- kfree(flow);
- }
+ *__flow = flow;
+ return 0;
+
+err_free:
+ kfree(flow);
+ kvfree(parse_attr);
+out:
return err;
+}
+
+static int
+mlx5e_add_nic_flow(struct mlx5e_priv *priv,
+ struct tc_cls_flower_offload *f,
+ u8 flow_flags,
+ struct mlx5e_tc_flow **__flow)
+{
+ struct netlink_ext_ack *extack = f->common.extack;
+ struct mlx5e_tc_flow_parse_attr *parse_attr;
+ struct mlx5e_tc_flow *flow;
+ int attr_size, err;
+
+ flow_flags |= MLX5E_TC_FLOW_NIC;
+ attr_size = sizeof(struct mlx5_nic_flow_attr);
+ err = mlx5e_alloc_flow(priv, attr_size, f, flow_flags,
+ &parse_attr, &flow);
+ if (err)
+ goto out;
+
+ err = parse_tc_nic_actions(priv, f->exts, parse_attr, flow, extack);
+ if (err)
+ goto err_free;
+
+ err = mlx5e_tc_add_nic_flow(priv, parse_attr, flow, extack);
+ if (err)
+ goto err_free;
+
+ flow->flags |= MLX5E_TC_FLOW_OFFLOADED;
+ kvfree(parse_attr);
+ *__flow = flow;
+
+ return 0;
err_free:
+ kfree(flow);
kvfree(parse_attr);
+out:
+ return err;
+}
+
+static int
+mlx5e_tc_add_flow(struct mlx5e_priv *priv,
+ struct tc_cls_flower_offload *f,
+ int flags,
+ struct mlx5e_tc_flow **flow)
+{
+ struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+ u8 flow_flags;
+ int err;
+
+ get_flags(flags, &flow_flags);
+
+ if (esw && esw->mode == SRIOV_OFFLOADS)
+ err = mlx5e_add_fdb_flow(priv, f, flow_flags, flow);
+ else
+ err = mlx5e_add_nic_flow(priv, f, flow_flags, flow);
+
+ return err;
+}
+
+int mlx5e_configure_flower(struct mlx5e_priv *priv,
+ struct tc_cls_flower_offload *f, int flags)
+{
+ struct netlink_ext_ack *extack = f->common.extack;
+ struct rhashtable *tc_ht = get_tc_ht(priv);
+ struct mlx5e_tc_flow *flow;
+ int err = 0;
+
+ flow = rhashtable_lookup_fast(tc_ht, &f->cookie, tc_ht_params);
+ if (flow) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "flow cookie already exists, ignoring");
+ netdev_warn_once(priv->netdev,
+ "flow cookie %lx already exists, ignoring\n",
+ f->cookie);
+ goto out;
+ }
+
+ err = mlx5e_tc_add_flow(priv, f, flags, &flow);
+ if (err)
+ goto out;
+
+ err = rhashtable_insert_fast(tc_ht, &flow->node, tc_ht_params);
+ if (err)
+ goto err_free;
+
+ return 0;
+
+err_free:
+ mlx5e_tc_del_flow(priv, flow);
kfree(flow);
+out:
return err;
}
--
2.17.2
^ permalink raw reply related
* [net-next 02/14] net/mlx5: Use flow counter IDs and not the wrapping cache object
From: Saeed Mahameed @ 2018-10-18 0:08 UTC (permalink / raw)
To: David S. Miller, netdev; +Cc: Mark Bloch, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>
From: Mark Bloch <markb@mellanox.com>
Currently, when a flow rule is created using the FS core layer, the caller
has to pass the entire flow counter object and not just the counter HW
handle (ID). This requires both the FS core and the caller to have
knowledge about the inner implementation of the FS layer flow counters
cache and limits the possible users.
Move to use the counter ID across the place when dealing with flows.
Doing this decoupling, now can we privatize the inner implementation
of the flow counters.
Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/infiniband/hw/mlx5/main.c | 7 +++++--
.../ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h | 6 +++---
drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +-
drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 4 ++--
.../net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 2 +-
drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 2 +-
drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 10 ++--------
drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c | 6 ++++++
include/linux/mlx5/fs.h | 3 ++-
9 files changed, 23 insertions(+), 19 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 5d9b7f62a0ba..5ced0cc46ba1 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -3320,15 +3320,18 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
}
if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+ struct mlx5_ib_mcounters *mcounters;
+
err = flow_counters_set_data(flow_act.counters, ucmd);
if (err)
goto free;
+ mcounters = to_mcounters(flow_act.counters);
handler->ibcounters = flow_act.counters;
dest_arr[dest_num].type =
MLX5_FLOW_DESTINATION_TYPE_COUNTER;
- dest_arr[dest_num].counter =
- to_mcounters(flow_act.counters)->hw_cntrs_hndl;
+ dest_arr[dest_num].counter_id =
+ mlx5_fc_id(mcounters->hw_cntrs_hndl);
dest_num++;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
index e83dda441a81..d027ce00c8ce 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.h
@@ -252,10 +252,10 @@ TRACE_EVENT(mlx5_fs_add_rule,
memcpy(__entry->destination,
&rule->dest_attr,
sizeof(__entry->destination));
- if (rule->dest_attr.type & MLX5_FLOW_DESTINATION_TYPE_COUNTER &&
- rule->dest_attr.counter)
+ if (rule->dest_attr.type &
+ MLX5_FLOW_DESTINATION_TYPE_COUNTER)
__entry->counter_id =
- rule->dest_attr.counter->id;
+ rule->dest_attr.counter_id;
),
TP_printk("rule=%p fte=%p index=%u sw_action=<%s> [dst] %s\n",
__entry->rule, __entry->fte, __entry->index,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 8a27c0813a18..5ce87f54852d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -720,7 +720,7 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
goto err_fc_create;
}
dest[dest_ix].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
- dest[dest_ix].counter = counter;
+ dest[dest_ix].counter_id = mlx5_fc_id(counter);
dest_ix++;
attr->counter = counter;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index e1d47fa5ab83..9c893d7d273e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1198,7 +1198,7 @@ static int esw_vport_ingress_config(struct mlx5_eswitch *esw,
if (counter) {
flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
- drop_ctr_dst.counter = counter;
+ drop_ctr_dst.counter_id = mlx5_fc_id(counter);
dst = &drop_ctr_dst;
dest_num++;
}
@@ -1285,7 +1285,7 @@ static int esw_vport_egress_config(struct mlx5_eswitch *esw,
if (counter) {
flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
drop_ctr_dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
- drop_ctr_dst.counter = counter;
+ drop_ctr_dst.counter_id = mlx5_fc_id(counter);
dst = &drop_ctr_dst;
dest_num++;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index a2f2d726c99b..39932dce15cb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -91,7 +91,7 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
}
if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
dest[i].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
- dest[i].counter = attr->counter;
+ dest[i].counter_id = mlx5_fc_id(attr->counter);
i++;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index dc8d7f6b52c2..08a891f9aade 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -419,7 +419,7 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
continue;
MLX5_SET(flow_counter_list, in_dests, flow_counter_id,
- dst->dest_attr.counter->id);
+ dst->dest_attr.counter_id);
in_dests += MLX5_ST_SZ_BYTES(dest_format_struct);
list_size++;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 9e18e6c0a8b3..cdcbf9d0ae6c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1474,14 +1474,8 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
return handle;
}
-static bool counter_is_valid(struct mlx5_fc *counter, u32 action)
+static bool counter_is_valid(u32 action)
{
- if (!(action & MLX5_FLOW_CONTEXT_ACTION_COUNT))
- return !counter;
-
- if (!counter)
- return false;
-
return (action & (MLX5_FLOW_CONTEXT_ACTION_DROP |
MLX5_FLOW_CONTEXT_ACTION_FWD_DEST));
}
@@ -1491,7 +1485,7 @@ static bool dest_is_valid(struct mlx5_flow_destination *dest,
struct mlx5_flow_table *ft)
{
if (dest && (dest->type == MLX5_FLOW_DESTINATION_TYPE_COUNTER))
- return counter_is_valid(dest->counter, action);
+ return counter_is_valid(action);
if (!(action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST))
return true;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
index 09206c4acd9a..1329bc5b7969 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c
@@ -258,6 +258,12 @@ struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging)
}
EXPORT_SYMBOL(mlx5_fc_create);
+u32 mlx5_fc_id(struct mlx5_fc *counter)
+{
+ return counter->id;
+}
+EXPORT_SYMBOL(mlx5_fc_id);
+
void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter)
{
struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats;
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 74d0ea146c9a..a5fc62184195 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -92,7 +92,7 @@ struct mlx5_flow_destination {
u32 tir_num;
u32 ft_num;
struct mlx5_flow_table *ft;
- struct mlx5_fc *counter;
+ u32 counter_id;
struct {
u16 num;
u16 vhca_id;
@@ -192,6 +192,7 @@ void mlx5_fc_query_cached(struct mlx5_fc *counter,
u64 *bytes, u64 *packets, u64 *lastuse);
int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter,
u64 *packets, u64 *bytes);
+u32 mlx5_fc_id(struct mlx5_fc *counter);
int mlx5_fs_add_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn);
int mlx5_fs_remove_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn);
--
2.17.2
^ permalink raw reply related
* [net-next 01/14] net/mlx5: E-Switch, Get counters for offloaded flows from callers
From: Saeed Mahameed @ 2018-10-18 0:08 UTC (permalink / raw)
To: David S. Miller, netdev; +Cc: Mark Bloch, Saeed Mahameed
In-Reply-To: <20181018000859.16212-1-saeedm@mellanox.com>
From: Mark Bloch <markb@mellanox.com>
There's no real reason for the e-switch logic to manage the creation of
counters for offloaded flows. The API already has the directive for the
caller to denote they want to attach a counter to the created flow.
As such, we go and move the management of flow counters to the mlx5e
tc offload logic. This also lets us remove an inelegant interface where
the FS layer had to provide a way to retrieve a counter from a flow rule.
Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
.../net/ethernet/mellanox/mlx5/core/en_tc.c | 32 +++++++++++++++++--
.../net/ethernet/mellanox/mlx5/core/eswitch.h | 1 +
.../mellanox/mlx5/core/eswitch_offloads.c | 20 ++----------
.../net/ethernet/mellanox/mlx5/core/fs_core.c | 15 ---------
include/linux/mlx5/fs.h | 1 -
5 files changed, 33 insertions(+), 36 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index acf7a847f561..8a27c0813a18 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -61,6 +61,7 @@ struct mlx5_nic_flow_attr {
u32 hairpin_tirn;
u8 match_level;
struct mlx5_flow_table *hairpin_ft;
+ struct mlx5_fc *counter;
};
#define MLX5E_TC_FLOW_BASE (MLX5E_TC_LAST_EXPORTED_BIT + 1)
@@ -721,6 +722,7 @@ mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
dest[dest_ix].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
dest[dest_ix].counter = counter;
dest_ix++;
+ attr->counter = counter;
}
if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) {
@@ -797,7 +799,7 @@ static void mlx5e_tc_del_nic_flow(struct mlx5e_priv *priv,
struct mlx5_nic_flow_attr *attr = flow->nic_attr;
struct mlx5_fc *counter = NULL;
- counter = mlx5_flow_rule_counter(flow->rule[0]);
+ counter = attr->counter;
mlx5_del_flow_rules(flow->rule[0]);
mlx5_fc_destroy(priv->mdev, counter);
@@ -833,6 +835,7 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
struct mlx5_esw_flow_attr *attr = flow->esw_attr;
struct net_device *out_dev, *encap_dev = NULL;
struct mlx5_flow_handle *rule = NULL;
+ struct mlx5_fc *counter = NULL;
struct mlx5e_rep_priv *rpriv;
struct mlx5e_priv *out_priv;
int err;
@@ -868,6 +871,16 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
}
}
+ if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+ counter = mlx5_fc_create(esw->dev, true);
+ if (IS_ERR(counter)) {
+ rule = ERR_CAST(counter);
+ goto err_create_counter;
+ }
+
+ attr->counter = counter;
+ }
+
/* we get here if (1) there's no error (rule being null) or when
* (2) there's an encap action and we're on -EAGAIN (no valid neigh)
*/
@@ -888,6 +901,8 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
mlx5_eswitch_del_offloaded_rule(esw, rule, attr);
rule = flow->rule[1];
err_add_rule:
+ mlx5_fc_destroy(esw->dev, counter);
+err_create_counter:
if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
mlx5e_detach_mod_hdr(priv, flow);
err_mod_hdr:
@@ -921,6 +936,9 @@ static void mlx5e_tc_del_fdb_flow(struct mlx5e_priv *priv,
if (attr->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
mlx5e_detach_mod_hdr(priv, flow);
+
+ if (attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT)
+ mlx5_fc_destroy(esw->dev, attr->counter);
}
void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
@@ -992,6 +1010,14 @@ void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv,
}
}
+static struct mlx5_fc *mlx5e_tc_get_counter(struct mlx5e_tc_flow *flow)
+{
+ if (flow->flags & MLX5E_TC_FLOW_ESWITCH)
+ return flow->esw_attr->counter;
+ else
+ return flow->nic_attr->counter;
+}
+
void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe)
{
struct mlx5e_neigh *m_neigh = &nhe->m_neigh;
@@ -1017,7 +1043,7 @@ void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe)
continue;
list_for_each_entry(flow, &e->flows, encap) {
if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) {
- counter = mlx5_flow_rule_counter(flow->rule[0]);
+ counter = mlx5e_tc_get_counter(flow);
mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse);
if (time_after((unsigned long)lastuse, nhe->reported_lastuse)) {
neigh_used = true;
@@ -3019,7 +3045,7 @@ int mlx5e_stats_flower(struct mlx5e_priv *priv,
if (!(flow->flags & MLX5E_TC_FLOW_OFFLOADED))
return 0;
- counter = mlx5_flow_rule_counter(flow->rule[0]);
+ counter = mlx5e_tc_get_counter(flow);
if (!counter)
return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index dfc642de4e6d..c1b627577003 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -266,6 +266,7 @@ struct mlx5_esw_flow_attr {
u32 encap_id;
u32 mod_hdr_id;
u8 match_level;
+ struct mlx5_fc *counter;
struct mlx5e_tc_flow_parse_attr *parse_attr;
};
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 0741683f7d70..a2f2d726c99b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -51,7 +51,6 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
struct mlx5_flow_destination dest[MLX5_MAX_FLOW_FWD_VPORTS + 1] = {};
struct mlx5_flow_act flow_act = {0};
struct mlx5_flow_table *ft = NULL;
- struct mlx5_fc *counter = NULL;
struct mlx5_flow_handle *rule;
int j, i = 0;
void *misc;
@@ -91,13 +90,8 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
}
}
if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
- counter = mlx5_fc_create(esw->dev, true);
- if (IS_ERR(counter)) {
- rule = ERR_CAST(counter);
- goto err_counter_alloc;
- }
dest[i].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
- dest[i].counter = counter;
+ dest[i].counter = attr->counter;
i++;
}
@@ -132,15 +126,11 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, i);
if (IS_ERR(rule))
- goto err_add_rule;
+ goto out;
else
esw->offloads.num_flows++;
- return rule;
-
-err_add_rule:
- mlx5_fc_destroy(esw->dev, counter);
-err_counter_alloc:
+out:
return rule;
}
@@ -200,11 +190,7 @@ mlx5_eswitch_del_offloaded_rule(struct mlx5_eswitch *esw,
struct mlx5_flow_handle *rule,
struct mlx5_esw_flow_attr *attr)
{
- struct mlx5_fc *counter = NULL;
-
- counter = mlx5_flow_rule_counter(rule);
mlx5_del_flow_rules(rule);
- mlx5_fc_destroy(esw->dev, counter);
esw->offloads.num_flows--;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 8d340e5181f8..9e18e6c0a8b3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1474,21 +1474,6 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
return handle;
}
-struct mlx5_fc *mlx5_flow_rule_counter(struct mlx5_flow_handle *handle)
-{
- struct mlx5_flow_rule *dst;
- struct fs_fte *fte;
-
- fs_get_obj(fte, handle->rule[0]->node.parent);
-
- fs_for_each_dst(dst, fte) {
- if (dst->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_COUNTER)
- return dst->dest_attr.counter;
- }
-
- return NULL;
-}
-
static bool counter_is_valid(struct mlx5_fc *counter, u32 action)
{
if (!(action & MLX5_FLOW_CONTEXT_ACTION_COUNT))
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index b1c026f1c8ba..74d0ea146c9a 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -186,7 +186,6 @@ int mlx5_modify_rule_destination(struct mlx5_flow_handle *handler,
struct mlx5_flow_destination *new_dest,
struct mlx5_flow_destination *old_dest);
-struct mlx5_fc *mlx5_flow_rule_counter(struct mlx5_flow_handle *handler);
struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging);
void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter);
void mlx5_fc_query_cached(struct mlx5_fc *counter,
--
2.17.2
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox