Netdev List

Netdev List
 help / color / mirror / Atom feed

* [bpf-next PATCH 2/3] bpf: sockmap, support for msg_peek in sk_msg with redirect ingress
From: John Fastabend @ 2018-10-16 18:08 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev
In-Reply-To: <20181016180424.13607.7932.stgit@john-Precision-Tower-5810>

This adds support for the MSG_PEEK flag when doing redirect to ingress
and receiving on the sk_msg psock queue. Previously the flag was
being ignored which could confuse applications if they expected the
flag to work as normal.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 include/net/tcp.h  |    2 +-
 net/ipv4/tcp_bpf.c |   42 +++++++++++++++++++++++++++---------------
 net/tls/tls_sw.c   |    3 ++-
 3 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3600ae0..14fdd7c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2089,7 +2089,7 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes,
 int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		    int nonblock, int flags, int *addr_len);
 int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
-		      struct msghdr *msg, int len);
+		      struct msghdr *msg, int len, int flags);
 
 /* Call BPF_SOCK_OPS program that returns an int. If the return value
  * is < 0, then the BPF op failed (for example if the loaded BPF
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index f9d3cf1..b7918d4 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -39,17 +39,19 @@ static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
 }
 
 int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
-		      struct msghdr *msg, int len)
+		      struct msghdr *msg, int len, int flags)
 {
 	struct iov_iter *iter = &msg->msg_iter;
+	int peek = flags & MSG_PEEK;
 	int i, ret, copied = 0;
+	struct sk_msg *msg_rx;
+
+	msg_rx = list_first_entry_or_null(&psock->ingress_msg,
+					  struct sk_msg, list);
 
 	while (copied != len) {
 		struct scatterlist *sge;
-		struct sk_msg *msg_rx;
 
-		msg_rx = list_first_entry_or_null(&psock->ingress_msg,
-						  struct sk_msg, list);
 		if (unlikely(!msg_rx))
 			break;
 
@@ -70,22 +72,30 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 			}
 
 			copied += copy;
-			sge->offset += copy;
-			sge->length -= copy;
-			sk_mem_uncharge(sk, copy);
-			msg_rx->sg.size -= copy;
-			if (!sge->length) {
-				i++;
-				if (i == MAX_SKB_FRAGS)
-					i = 0;
-				if (!msg_rx->skb)
-					put_page(page);
+			if (likely(!peek)) {
+				sge->offset += copy;
+				sge->length -= copy;
+				sk_mem_uncharge(sk, copy);
+				msg_rx->sg.size -= copy;
+
+				if (!sge->length) {
+					sk_msg_iter_var_next(i);
+					if (!msg_rx->skb)
+						put_page(page);
+				}
+			} else {
+				sk_msg_iter_var_next(i);
 			}
 
 			if (copied == len)
 				break;
 		} while (i != msg_rx->sg.end);
 
+		if (unlikely(peek)) {
+			msg_rx = list_next_entry(msg_rx, list);
+			continue;
+		}
+
 		msg_rx->sg.start = i;
 		if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
 			list_del(&msg_rx->list);
@@ -93,6 +103,8 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 				consume_skb(msg_rx->skb);
 			kfree(msg_rx);
 		}
+		msg_rx = list_first_entry_or_null(&psock->ingress_msg,
+						  struct sk_msg, list);
 	}
 
 	return copied;
@@ -115,7 +127,7 @@ int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
 	lock_sock(sk);
 msg_bytes_ready:
-	copied = __tcp_bpf_recvmsg(sk, psock, msg, len);
+	copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags);
 	if (!copied) {
 		int data, err = 0;
 		long timeo;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index a525fc4..5cd88ba 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1478,7 +1478,8 @@ int tls_sw_recvmsg(struct sock *sk,
 		skb = tls_wait_data(sk, psock, flags, timeo, &err);
 		if (!skb) {
 			if (psock) {
-				int ret = __tcp_bpf_recvmsg(sk, psock, msg, len);
+				int ret = __tcp_bpf_recvmsg(sk, psock,
+							    msg, len, flags);
 
 				if (ret > 0) {
 					copied += ret;

^ permalink raw reply related

* [bpf-next PATCH 3/3] bpf: sockmap, add msg_peek tests to test_sockmap
From: John Fastabend @ 2018-10-16 18:08 UTC (permalink / raw)
  To: ast, daniel; +Cc: netdev
In-Reply-To: <20181016180424.13607.7932.stgit@john-Precision-Tower-5810>

Add tests that do a MSG_PEEK recv followed by a regular receive to
test flag support.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 tools/testing/selftests/bpf/test_sockmap.c |  167 +++++++++++++++++++---------
 1 file changed, 115 insertions(+), 52 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index 7cb69ce..cbd1c0b 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -80,6 +80,7 @@
 int txmsg_ingress;
 int txmsg_skb;
 int ktls;
+int peek_flag;
 
 static const struct option long_options[] = {
 	{"help",	no_argument,		NULL, 'h' },
@@ -102,6 +103,7 @@
 	{"txmsg_ingress", no_argument,		&txmsg_ingress, 1 },
 	{"txmsg_skb", no_argument,		&txmsg_skb, 1 },
 	{"ktls", no_argument,			&ktls, 1 },
+	{"peek", no_argument,			&peek_flag, 1 },
 	{0, 0, NULL, 0 }
 };
 
@@ -352,33 +354,40 @@ static int msg_loop_sendpage(int fd, int iov_length, int cnt,
 	return 0;
 }
 
-static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
-		    struct msg_stats *s, bool tx,
-		    struct sockmap_options *opt)
+static void msg_free_iov(struct msghdr *msg)
 {
-	struct msghdr msg = {0};
-	int err, i, flags = MSG_NOSIGNAL;
+	int i;
+
+	for (i = 0; i < msg->msg_iovlen; i++)
+		free(msg->msg_iov[i].iov_base);
+	free(msg->msg_iov);
+	msg->msg_iov = NULL;
+	msg->msg_iovlen = 0;
+}
+
+static int msg_alloc_iov(struct msghdr *msg,
+			 int iov_count, int iov_length,
+			 bool data, bool xmit)
+{
+	unsigned char k = 0;
 	struct iovec *iov;
-	unsigned char k;
-	bool data_test = opt->data_test;
-	bool drop = opt->drop_expected;
+	int i;
 
 	iov = calloc(iov_count, sizeof(struct iovec));
 	if (!iov)
 		return errno;
 
-	k = 0;
 	for (i = 0; i < iov_count; i++) {
 		unsigned char *d = calloc(iov_length, sizeof(char));
 
 		if (!d) {
 			fprintf(stderr, "iov_count %i/%i OOM\n", i, iov_count);
-			goto out_errno;
+			goto unwind_iov;
 		}
 		iov[i].iov_base = d;
 		iov[i].iov_len = iov_length;
 
-		if (data_test && tx) {
+		if (data && xmit) {
 			int j;
 
 			for (j = 0; j < iov_length; j++)
@@ -386,9 +395,60 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 		}
 	}
 
-	msg.msg_iov = iov;
-	msg.msg_iovlen = iov_count;
-	k = 0;
+	msg->msg_iov = iov;
+	msg->msg_iovlen = iov_count;
+
+	return 0;
+unwind_iov:
+	for (i--; i >= 0 ; i--)
+		free(msg->msg_iov[i].iov_base);
+	return -ENOMEM;
+}
+
+static int msg_verify_data(struct msghdr *msg, int size, int chunk_sz)
+{
+	int i, j, bytes_cnt = 0;
+	unsigned char k = 0;
+
+	for (i = 0; i < msg->msg_iovlen; i++) {
+		unsigned char *d = msg->msg_iov[i].iov_base;
+
+		for (j = 0;
+		     j < msg->msg_iov[i].iov_len && size; j++) {
+			if (d[j] != k++) {
+				fprintf(stderr,
+					"detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n",
+					i, j, d[j], k - 1, d[j+1], k);
+				return -EIO;
+			}
+			bytes_cnt++;
+			if (bytes_cnt == chunk_sz) {
+				k = 0;
+				bytes_cnt = 0;
+			}
+			size--;
+		}
+	}
+	return 0;
+}
+
+static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
+		    struct msg_stats *s, bool tx,
+		    struct sockmap_options *opt)
+{
+	struct msghdr msg = {0}, msg_peek = {0};
+	int err, i, flags = MSG_NOSIGNAL;
+	bool drop = opt->drop_expected;
+	bool data = opt->data_test;
+
+	err = msg_alloc_iov(&msg, iov_count, iov_length, data, tx);
+	if (err)
+		goto out_errno;
+	if (peek_flag) {
+		err = msg_alloc_iov(&msg_peek, iov_count, iov_length, data, tx);
+		if (err)
+			goto out_errno;
+	}
 
 	if (tx) {
 		clock_gettime(CLOCK_MONOTONIC, &s->start);
@@ -408,19 +468,12 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 		}
 		clock_gettime(CLOCK_MONOTONIC, &s->end);
 	} else {
-		int slct, recv, max_fd = fd;
+		int slct, recvp = 0, recv, max_fd = fd;
 		int fd_flags = O_NONBLOCK;
 		struct timeval timeout;
 		float total_bytes;
-		int bytes_cnt = 0;
-		int chunk_sz;
 		fd_set w;
 
-		if (opt->sendpage)
-			chunk_sz = iov_length * cnt;
-		else
-			chunk_sz = iov_length * iov_count;
-
 		fcntl(fd, fd_flags);
 		total_bytes = (float)iov_count * (float)iov_length * (float)cnt;
 		err = clock_gettime(CLOCK_MONOTONIC, &s->start);
@@ -452,6 +505,19 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 				goto out_errno;
 			}
 
+			errno = 0;
+			if (peek_flag) {
+				flags |= MSG_PEEK;
+				recvp = recvmsg(fd, &msg_peek, flags);
+				if (recvp < 0) {
+					if (errno != EWOULDBLOCK) {
+						clock_gettime(CLOCK_MONOTONIC, &s->end);
+						goto out_errno;
+					}
+				}
+				flags = 0;
+			}
+
 			recv = recvmsg(fd, &msg, flags);
 			if (recv < 0) {
 				if (errno != EWOULDBLOCK) {
@@ -463,27 +529,23 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 
 			s->bytes_recvd += recv;
 
-			if (data_test) {
-				int j;
-
-				for (i = 0; i < msg.msg_iovlen; i++) {
-					unsigned char *d = iov[i].iov_base;
-
-					for (j = 0;
-					     j < iov[i].iov_len && recv; j++) {
-						if (d[j] != k++) {
-							errno = -EIO;
-							fprintf(stderr,
-								"detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n",
-								i, j, d[j], k - 1, d[j+1], k);
-							goto out_errno;
-						}
-						bytes_cnt++;
-						if (bytes_cnt == chunk_sz) {
-							k = 0;
-							bytes_cnt = 0;
-						}
-						recv--;
+			if (data) {
+				int chunk_sz = opt->sendpage ?
+						iov_length * cnt :
+						iov_length * iov_count;
+
+				errno = msg_verify_data(&msg, recv, chunk_sz);
+				if (errno) {
+					perror("data verify msg failed\n");
+					goto out_errno;
+				}
+				if (recvp) {
+					errno = msg_verify_data(&msg_peek,
+								recvp,
+								chunk_sz);
+					if (errno) {
+						perror("data verify msg_peek failed\n");
+						goto out_errno;
 					}
 				}
 			}
@@ -491,14 +553,12 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 		clock_gettime(CLOCK_MONOTONIC, &s->end);
 	}
 
-	for (i = 0; i < iov_count; i++)
-		free(iov[i].iov_base);
-	free(iov);
-	return 0;
+	msg_free_iov(&msg);
+	msg_free_iov(&msg_peek);
+	return err;
 out_errno:
-	for (i = 0; i < iov_count; i++)
-		free(iov[i].iov_base);
-	free(iov);
+	msg_free_iov(&msg);
+	msg_free_iov(&msg_peek);
 	return errno;
 }
 
@@ -565,9 +625,10 @@ static int sendmsg_test(struct sockmap_options *opt)
 		}
 		if (opt->verbose)
 			fprintf(stdout,
-				"rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s\n",
+				"rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s %s\n",
 				s.bytes_sent, sent_Bps, sent_Bps/giga,
-				s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
+				s.bytes_recvd, recvd_Bps, recvd_Bps/giga,
+				peek_flag ? "(peek_msg)" : "");
 		if (err && txmsg_cork)
 			err = 0;
 		exit(err ? 1 : 0);
@@ -999,6 +1060,8 @@ static void test_options(char *options)
 		strncat(options, "skb,", OPTSTRING);
 	if (ktls)
 		strncat(options, "ktls,", OPTSTRING);
+	if (peek_flag)
+		strncat(options, "peek,", OPTSTRING);
 }
 
 static int __test_exec(int cgrp, int test, struct sockmap_options *opt)

^ permalink raw reply related

* [PATCH net] sctp: fix race on sctp_id2asoc
From: Marcelo Ricardo Leitner @ 2018-10-16 18:18 UTC (permalink / raw)
  To: netdev; +Cc: linux-sctp, Vlad Yasevich, Neil Horman, Dmitry Vyukov

syzbot reported an use-after-free involving sctp_id2asoc.  Dmitry Vyukov
helped to root cause it and it is because of reading the asoc after it
was freed:

        CPU 1                       CPU 2
(working on socket 1)            (working on socket 2)
	                         sctp_association_destroy
sctp_id2asoc
   spin lock
     grab the asoc from idr
   spin unlock
                                   spin lock
				     remove asoc from idr
				   spin unlock
				   free(asoc)
   if asoc->base.sk != sk ... [*]

This can only be hit if trying to fetch asocs from different sockets. As
we have a single IDR for all asocs, in all SCTP sockets, their id is
unique on the system. An application can try to send stuff on an id
that matches on another socket, and the if in [*] will protect from such
usage. But it didn't consider that as that asoc may belong to another
socket, it may be freed in parallel (read: under another socket lock).

We fix it by moving the checks in [*] into the protected region. This
fixes it because the asoc cannot be freed while the lock is held.

Reported-by: syzbot+c7dd55d7aec49d48e49a@syzkaller.appspotmail.com
Acked-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---
 net/sctp/socket.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f73e9d38d5ba734d7ee3347e4015fd30d355bbfa..a7722f43aa69801c31409d4914c99946ee5533f5 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -271,11 +271,10 @@ struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id)

 	spin_lock_bh(&sctp_assocs_id_lock);
 	asoc = (struct sctp_association *)idr_find(&sctp_assocs_id, (int)id);
+	if (asoc && (asoc->base.sk != sk || asoc->base.dead))
+		asoc = NULL;
 	spin_unlock_bh(&sctp_assocs_id_lock);

-	if (!asoc || (asoc->base.sk != sk) || asoc->base.dead)
-		return NULL;
-
 	return asoc;
 }

-- 
2.17.1

^ permalink raw reply related

* [RFC PATCH net-next 0/4] net: ethernet: ti: cpsw: fix vlan mcast
From: Ivan Khoronzhuk @ 2018-10-16 18:20 UTC (permalink / raw)
  To: grygorii.strashko, davem
  Cc: linux-omap, netdev, linux-kernel, alexander.h.duyck,
	Ivan Khoronzhuk

The cpsw holds separate mcast entires for vlan entries. At this moment
driver adds only not vlan mcast addresses, omitting vlan/mcast entries.
As result mcast for vlans doesn't work. It can be fixed by adding same
mcast entries for every created vlan, but this patchseries uses more
sophisticated way and allows to create mcast entries only for vlans
that really require it. Generic functions from this series can be
reused for fixing vlan and macvlan unicast.

Simple example of ALE table before and after this series, having same
mcast entries as for vlan 100 as for real device (reserved vlan 2),
and one mcast address only for vlan 100 - 01:1b:19:00:00:00.

<---- Before this patchset ---->
vlan , vid = 2, untag_force = 0x5, reg_mcast = 0x5, mem_list = 0x5
mcast, vid = 2, addr = ff:ff:ff:ff:ff:ff, port_mask = 0x1
ucast, vid = 2, addr = 74:da:ea:47:7d:9d, persistant, port_num = 0x0
vlan , vid = 0, untag_force = 0x7, reg_mcast = 0x0, mem_list = 0x7
mcast, vid = 2, addr = 33:33:00:00:00:01, port_mask = 0x1
mcast, vid = 2, addr = 01:00:5e:00:00:01, port_mask = 0x1
vlan , vid = 1, untag_force = 0x3, reg_mcast = 0x3, mem_list = 0x3
mcast, vid = 1, addr = ff:ff:ff:ff:ff:ff, port_mask = 0x1
ucast, vid = 1, addr = 74:da:ea:47:7d:9c, persistant, port_num = 0x0
mcast, vid = 1, addr = 33:33:00:00:00:01, port_mask = 0x1
mcast, vid = 1, addr = 01:00:5e:00:00:01, port_mask = 0x1
mcast, vid = 2, addr = 01:80:c2:00:00:00, port_mask = 0x1
mcast, vid = 2, addr = 01:80:c2:00:00:03, port_mask = 0x1
mcast, vid = 2, addr = 01:80:c2:00:00:0e, port_mask = 0x1
mcast, vid = 1, addr = 01:80:c2:00:00:00, port_mask = 0x1
mcast, vid = 1, addr = 01:80:c2:00:00:03, port_mask = 0x1
mcast, vid = 1, addr = 01:80:c2:00:00:0e, port_mask = 0x1
mcast, vid = 2, addr = 33:33:ff:47:7d:9d, port_mask = 0x1
mcast, vid = 2, addr = 33:33:00:00:00:fb, port_mask = 0x1
mcast, vid = 2, addr = 33:33:00:01:00:03, port_mask = 0x1
mcast, vid = 1, addr = 33:33:ff:47:7d:9c, port_mask = 0x1
mcast, vid = 1, addr = 33:33:00:00:00:fb, port_mask = 0x1
mcast, vid = 1, addr = 33:33:00:01:00:03, port_mask = 0x1
mcast, vid = 1, addr = 01:00:5e:00:00:fb, port_mask = 0x1
mcast, vid = 1, addr = 01:00:5e:00:00:fc, port_mask = 0x1
vlan , vid = 100, untag_force = 0x0, reg_mcast = 0x5, mem_list = 0x5
ucast, vid = 100, addr = 74:da:ea:47:7d:9d, persistant, port_num = 0x0
mcast, vid = 100, addr = ff:ff:ff:ff:ff:ff, port_mask = 0x1
mcast, vid = 2, addr = 01:1b:19:00:00:00, port_mask = 0x1
			 ^^^
 Here mcast entry (ptpl2), has to be added only for vlan 100
 but added for reserved vlan 2...that's not enough.

<---- After this patchset ---->
vlan , vid = 2, untag_force = 0x5, reg_mcast = 0x5, mem_list = 0x5
mcast, vid = 2, addr = ff:ff:ff:ff:ff:ff, port_mask = 0x1
ucast, vid = 2, addr = 74:da:ea:47:7d:9d, persistant, port_num = 0x0
vlan , vid = 0, untag_force = 0x7, reg_mcast = 0x0, mem_list = 0x7
mcast, vid = 2, addr = 33:33:00:00:00:01, port_mask = 0x1
mcast, vid = 2, addr = 01:00:5e:00:00:01, port_mask = 0x1
vlan , vid = 1, untag_force = 0x3, reg_mcast = 0x3, mem_list = 0x3
mcast, vid = 1, addr = ff:ff:ff:ff:ff:ff, port_mask = 0x1
ucast, vid = 1, addr = 74:da:ea:47:7d:9c, persistant, port_num = 0x0
mcast, vid = 1, addr = 33:33:00:00:00:01, port_mask = 0x1
mcast, vid = 1, addr = 01:00:5e:00:00:01, port_mask = 0x1
mcast, vid = 2, addr = 01:80:c2:00:00:00, port_mask = 0x1
mcast, vid = 2, addr = 01:80:c2:00:00:03, port_mask = 0x1
mcast, vid = 2, addr = 01:80:c2:00:00:0e, port_mask = 0x1
mcast, vid = 1, addr = 01:80:c2:00:00:00, port_mask = 0x1
mcast, vid = 1, addr = 01:80:c2:00:00:03, port_mask = 0x1
mcast, vid = 1, addr = 01:80:c2:00:00:0e, port_mask = 0x1
mcast, vid = 2, addr = 33:33:ff:47:7d:9d, port_mask = 0x1
mcast, vid = 1, addr = 33:33:ff:47:7d:9c, port_mask = 0x1
mcast, vid = 2, addr = 33:33:00:00:00:fb, port_mask = 0x1
mcast, vid = 2, addr = 33:33:00:01:00:03, port_mask = 0x1
mcast, vid = 1, addr = 33:33:00:00:00:fb, port_mask = 0x1
mcast, vid = 1, addr = 33:33:00:01:00:03, port_mask = 0x1
vlan , vid = 100, untag_force = 0x0, reg_mcast = 0x5, mem_list = 0x5
ucast, vid = 100, addr = 74:da:ea:47:7d:9d, persistant, port_num = 0x0
mcast, vid = 100, addr = ff:ff:ff:ff:ff:ff, port_mask = 0x1
mcast, vid = 100, addr = 33:33:00:00:00:01, port_mask = 0x1
mcast, vid = 100, addr = 01:00:5e:00:00:01, port_mask = 0x1
mcast, vid = 100, addr = 33:33:ff:47:7d:9d, port_mask = 0x1
mcast, vid = 100, addr = 01:80:c2:00:00:00, port_mask = 0x1
mcast, vid = 100, addr = 01:80:c2:00:00:03, port_mask = 0x1
mcast, vid = 100, addr = 01:80:c2:00:00:0e, port_mask = 0x1
mcast, vid = 100, addr = 33:33:00:00:00:fb, port_mask = 0x1
mcast, vid = 100, addr = 33:33:00:01:00:03, port_mask = 0x1
mcast, vid = 100, addr = 01:1b:19:00:00:00, port_mask = 0x1
			 ^^^
    Here mcast entry (ptpl2), is added only for vlan 100
    as it should be.

Based on net-next/master

Ivan Khoronzhuk (4):
  net: core: dev_addr_lists: add auxiliary func to handle reference
    address updates
  net: 8021q: vlan_core: allow use list of vlans for real device
  net: ethernet: ti: cpsw: fix vlan mcast
  net: ethernet: ti: cpsw: fix vlan configuration while down/up

 drivers/net/ethernet/ti/cpsw.c | 189 +++++++++++++++++++++++++++------
 include/linux/if_vlan.h        |  10 ++
 include/linux/netdevice.h      |  10 ++
 net/8021q/vlan_core.c          |  27 +++++
 net/core/dev_addr_lists.c      |  97 +++++++++++++++++
 5 files changed, 303 insertions(+), 30 deletions(-)

-- 
2.17.1

^ permalink raw reply

* [RFC PATCH net-next 3/4] net: ethernet: ti: cpsw: fix vlan mcast
From: Ivan Khoronzhuk @ 2018-10-16 18:20 UTC (permalink / raw)
  To: grygorii.strashko, davem
  Cc: linux-omap, netdev, linux-kernel, alexander.h.duyck,
	Ivan Khoronzhuk
In-Reply-To: <20181016182035.18234-1-ivan.khoronzhuk@linaro.org>

At this moment, mcast addresses are added only for real device only
(reserved vlans for dual-emac mode), even if a mcast address was added
for some vlan only, thus ALE doesn't have corresponding vlan mcast
entries after vlan socket joined multicast group. So ALE drops vlan
frames with mcast addresses intended for vlans and potentially can
receive mcast frames for base ndev. That's not correct. So, fix it by
creating only vlan/mcast entries as requested. Patch doesn't use any
additional lists and is based on device mc address list and cpsw ALE
table entries.

Also, move device to allmulti state if no space for new mcast entries.

Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
---
 drivers/net/ethernet/ti/cpsw.c | 172 +++++++++++++++++++++++++++------
 1 file changed, 142 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 226be2a56c1f..5967484619d4 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -570,21 +570,6 @@ static inline int cpsw_get_slave_port(u32 slave_num)
 	return slave_num + 1;
 }
 
-static void cpsw_add_mcast(struct cpsw_priv *priv, const u8 *addr)
-{
-	struct cpsw_common *cpsw = priv->cpsw;
-
-	if (cpsw->data.dual_emac) {
-		struct cpsw_slave *slave = cpsw->slaves + priv->emac_port;
-
-		cpsw_ale_add_mcast(cpsw->ale, addr, ALE_PORT_HOST,
-				   ALE_VLAN, slave->port_vlan, 0);
-		return;
-	}
-
-	cpsw_ale_add_mcast(cpsw->ale, addr, ALE_ALL_PORTS, 0, 0, 0);
-}
-
 static void cpsw_set_promiscious(struct net_device *ndev, bool enable)
 {
 	struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
@@ -660,29 +645,153 @@ static void cpsw_set_promiscious(struct net_device *ndev, bool enable)
 	}
 }
 
-static int cpsw_add_mc_addr(struct net_device *ndev, const u8 *addr)
+struct addr_sync_ctx {
+	struct net_device *ndev;
+	const u8 *addr;		/* address to be sync or unsync */
+	int keep_num;		/* number of address instances to be kept */
+	int flush;		/* flush flag */
+};
+
+/**
+ * cpsw_set_mc - adds multicast entry to the table if it's not added or deletes
+ * if it's not deleted
+ * @ndev: device to sync
+ * @addr: address to be added or deleted
+ * @vid: vlan id, if vid < 0 set/unset address for real device
+ * @add: add address if the flag is set or remove otherwise
+ */
+static int cpsw_set_mc(struct net_device *ndev, const u8 *addr,
+		       int vid, int add)
 {
 	struct cpsw_priv *priv = netdev_priv(ndev);
+	struct cpsw_common *cpsw = priv->cpsw;
+	int mask, flags, ret;
+
+	if (vid < 0) {
+		if (cpsw->data.dual_emac)
+			vid = cpsw->slaves[priv->emac_port].port_vlan;
+		else
+			vid = 0;
+	}
+
+	mask = cpsw->data.dual_emac ? ALE_PORT_HOST : ALE_ALL_PORTS;
+	flags = vid ? ALE_VLAN : 0;
+
+	if (add)
+		ret = cpsw_ale_add_mcast(cpsw->ale, addr, mask, flags, vid, 0);
+	else
+		ret = cpsw_ale_del_mcast(cpsw->ale, addr, 0, flags, vid);
+
+	return ret;
+}
+
+static int cpsw_update_vlan_mc(struct net_device *vdev, int vid, void *ctx)
+{
+	struct addr_sync_ctx *sync_ctx = ctx;
+	struct netdev_hw_addr *ha;
+	int found = 0, ret = 0;
+
+	if (!vdev || !(vdev->flags & IFF_UP))
+		return ret;
+
+	/* vlan address is relevant if it's sync_cnt != 0 */
+	netdev_for_each_mc_addr(ha, vdev) {
+		if (ether_addr_equal(ha->addr, sync_ctx->addr)) {
+			found = ha->sync_cnt;
+			break;
+		}
+	}
+
+	if (found)
+		sync_ctx->keep_num--;
+
+	if (sync_ctx->flush) {
+		if (!found)
+			cpsw_set_mc(sync_ctx->ndev, sync_ctx->addr, vid, 0);
+		return ret;
+	}
+
+	if (found)
+		ret = cpsw_set_mc(sync_ctx->ndev, sync_ctx->addr, vid, 1);
+
+	return ret;
+}
+
+static int cpsw_add_mc_addr(struct net_device *ndev, const u8 *addr, int num)
+{
+	struct cpsw_common *cpsw = ndev_to_cpsw(ndev);
+	struct addr_sync_ctx sync_ctx;
+	int ret;
+
+	sync_ctx.keep_num = num;
+	sync_ctx.addr = addr;
+	sync_ctx.ndev = ndev;
+	sync_ctx.flush = 0;
+
+	ret = vlan_for_each(ndev, cpsw_update_vlan_mc, &sync_ctx);
+	if (sync_ctx.keep_num && !ret)
+		ret = cpsw_set_mc(ndev, addr, -1, 1);
+
+	/* table is overflowed, set ALLMULTI */
+	if (ret == -ENOMEM)
+		cpsw_ale_set_allmulti(cpsw->ale, IFF_ALLMULTI);
+
+	return ret;
+}
+
+static int cpsw_del_mc_addr(struct net_device *ndev, const u8 *addr, int num)
+{
+	struct addr_sync_ctx sync_ctx;
+
+	sync_ctx.keep_num = num;
+	sync_ctx.addr = addr;
+	sync_ctx.ndev = ndev;
+	sync_ctx.flush = 1;
+
+	vlan_for_each(ndev, cpsw_update_vlan_mc, &sync_ctx);
+	if (!sync_ctx.keep_num)
+		cpsw_set_mc(ndev, addr, -1, 0);
 
-	cpsw_add_mcast(priv, addr);
 	return 0;
 }
 
-static int cpsw_del_mc_addr(struct net_device *ndev, const u8 *addr)
+static int cpsw_purge_vlan_mc(struct net_device *vdev, int vid, void *ctx)
 {
-	struct cpsw_priv *priv = netdev_priv(ndev);
-	struct cpsw_common *cpsw = priv->cpsw;
-	int vid, flags;
+	struct addr_sync_ctx *sync_ctx = ctx;
+	struct netdev_hw_addr *ha;
+	int found = 0;
 
-	if (cpsw->data.dual_emac) {
-		vid = cpsw->slaves[priv->emac_port].port_vlan;
-		flags = ALE_VLAN;
-	} else {
-		vid = 0;
-		flags = 0;
+	if (!vdev || !(vdev->flags & IFF_UP))
+		return 0;
+
+	/* vlan address is relevant if it's sync_cnt != 0 */
+	netdev_for_each_mc_addr(ha, vdev) {
+		if (ether_addr_equal(ha->addr, sync_ctx->addr)) {
+			found = ha->sync_cnt;
+			break;
+		}
 	}
 
-	cpsw_ale_del_mcast(cpsw->ale, addr, 0, flags, vid);
+	if (!found)
+		return 0;
+
+	sync_ctx->keep_num--;
+	cpsw_set_mc(sync_ctx->ndev, sync_ctx->addr, vid, 0);
+	return 0;
+}
+
+static int cpsw_purge_all_mc(struct net_device *ndev, const u8 *addr, int num)
+{
+	struct addr_sync_ctx sync_ctx;
+
+	sync_ctx.addr = addr;
+	sync_ctx.ndev = ndev;
+	sync_ctx.keep_num = num;
+
+	vlan_for_each(ndev, cpsw_purge_vlan_mc, &sync_ctx);
+	if (sync_ctx.keep_num)
+		cpsw_set_mc(ndev, addr, -1, 0);
+
 	return 0;
 }
 
@@ -703,7 +812,9 @@ static void cpsw_ndo_set_rx_mode(struct net_device *ndev)
 	/* Restore allmulti on vlans if necessary */
 	cpsw_ale_set_allmulti(cpsw->ale, ndev->flags & IFF_ALLMULTI);
 
-	__dev_mc_sync(ndev, cpsw_add_mc_addr, cpsw_del_mc_addr);
+	/* add/remove mcast address either for real netdev or for vlan */
+	__hw_addr_ref_sync_dev(&ndev->mc, ndev, cpsw_add_mc_addr,
+			       cpsw_del_mc_addr);
 }
 
 static void cpsw_intr_enable(struct cpsw_common *cpsw)
@@ -1963,7 +2074,7 @@ static int cpsw_ndo_stop(struct net_device *ndev)
 	struct cpsw_common *cpsw = priv->cpsw;
 
 	cpsw_info(priv, ifdown, "shutting down cpsw device\n");
-	__dev_mc_unsync(priv->ndev, cpsw_del_mc_addr);
+	__hw_addr_ref_unsync_dev(&ndev->mc, ndev, cpsw_purge_all_mc);
 	netif_tx_stop_all_queues(priv->ndev);
 	netif_carrier_off(priv->ndev);
 
@@ -2414,6 +2525,7 @@ static int cpsw_ndo_vlan_rx_kill_vid(struct net_device *ndev,
 				  HOST_PORT_NUM, ALE_VLAN, vid);
 	ret |= cpsw_ale_del_mcast(cpsw->ale, priv->ndev->broadcast,
 				  0, ALE_VLAN, vid);
+	ret |= cpsw_ale_flush_multicast(cpsw->ale, 0, vid);
 err:
 	pm_runtime_put(cpsw->dev);
 	return ret;
-- 
2.17.1

^ permalink raw reply related

* [RFC PATCH net-next 4/4] net: ethernet: ti: cpsw: fix vlan configuration while down/up
From: Ivan Khoronzhuk @ 2018-10-16 18:20 UTC (permalink / raw)
  To: grygorii.strashko, davem
  Cc: linux-omap, netdev, linux-kernel, alexander.h.duyck,
	Ivan Khoronzhuk
In-Reply-To: <20181016182035.18234-1-ivan.khoronzhuk@linaro.org>

The vlan configuration is not restored after interface donw/up sequence
(if dual-emac - both interfaces). Tested on am572x EVM.

Steps to check:
~# ip link add link eth1 name eth1.100 type vlan id 100
~# ifconfig eth0 down
~# ifconfig eth1 down

This is TI sdk tool, dumping all ALE entries, see they are present
~# switch-config -d
~# ifconfig eth1 up
See entry for vid 100 is absent
~# switch-config -d

Try to remove vid and observe warning:
~# ip link del eth1.100
[  739.526757] net eth1: removing vlanid 100 from vlan filter
[  739.533322] failed to kill vid 0081/100 for device eth1

This patch fixes it, restoring only vlan ALE entries and all other
unicast/multicast entries are restored by system calling rx_mode ndo.

Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
---
 drivers/net/ethernet/ti/cpsw.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 5967484619d4..0b28a90b62bb 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -565,6 +565,9 @@ static const struct cpsw_stats cpsw_gstrings_ch_stats[] = {
 				(func)(slave++, ##arg);			\
 	} while (0)
 
+static int cpsw_ndo_vlan_rx_add_vid(struct net_device *ndev,
+				    __be16 proto, u16 vid);
+
 static inline int cpsw_get_slave_port(u32 slave_num)
 {
 	return slave_num + 1;
@@ -1955,9 +1958,23 @@ static void cpsw_mqprio_resume(struct cpsw_slave *slave, struct cpsw_priv *priv)
 	slave_write(slave, tx_prio_map, tx_prio_rg);
 }
 
+static int cpsw_restore_vlans(struct net_device *vdev, int vid, void *arg)
+{
+	struct cpsw_priv *priv = arg;
+
+	if (!vdev)
+		return 0;
+
+	cpsw_ndo_vlan_rx_add_vid(priv->ndev, 0, vid);
+	return 0;
+}
+
 /* restore resources after port reset */
 static void cpsw_restore(struct cpsw_priv *priv)
 {
+	/* restore vlan configurations */
+	vlan_for_each(priv->ndev, cpsw_restore_vlans, priv);
+
 	/* restore MQPRIO offload */
 	for_each_slave(priv, cpsw_mqprio_resume, priv);
 
-- 
2.17.1

^ permalink raw reply related

* Re: [PATCH bpf-next 00/13] bpf: add btf func info support
From: Alexei Starovoitov @ 2018-10-16 18:27 UTC (permalink / raw)
  To: Yonghong Song; +Cc: ast, kafai, daniel, netdev, kernel-team
In-Reply-To: <20181012185424.2378502-1-yhs@fb.com>

On Fri, Oct 12, 2018 at 11:54:20AM -0700, Yonghong Song wrote:
> The BTF support was added to kernel by Commit 69b693f0aefa
> ("bpf: btf: Introduce BPF Type Format (BTF)"), which introduced
> .BTF section into ELF file and is primarily
> used for map pretty print.
> pahole is used to convert dwarf to BTF for ELF files.
> 
> The next step would be add func type info and debug line info
> into BTF. For debug line info, it is desirable to encode
> source code directly in the BTF to ease deployment and
> introspection.

it's kinda confusing that cover letter talks about line info next step,
but these kernel side patches are only for full prog name from btf.
It certainly makes sense for llvm to do both at the same time.
Please make the cover letter more clear.

^ permalink raw reply

* Re: [PATCH rdma-next v1 0/4] Scatter to CQE
From: Doug Ledford @ 2018-10-16 18:39 UTC (permalink / raw)
  To: Leon Romanovsky, Jason Gunthorpe
  Cc: Leon Romanovsky, RDMA mailing list, Guy Levi, Yonatan Cohen,
	Saeed Mahameed, linux-netdev
In-Reply-To: <20181009090515.25223-1-leon@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 2049 bytes --]

On Tue, 2018-10-09 at 12:05 +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@mellanox.com>
> 
> Changelog v0->v1:
>  * Changed patch #3 to use check_mask function from rdma-core instead define.
> 
> --------------------------------------------------------------------------
> From Yonatan,
> 
> Scatter to CQE is a HW offload feature that saves PCI writes by
> scattering the payload to the CQE.
> 
> The feature depends on the CQE size and if the CQE size is 64B, it will
> work for payload smaller than 32. If the CQE size is 128B, it will work for
> payload smaller than 64.
> 
> The feature works for responder and requestor:
> 1. For responder, if the payload is small as required above, the data
> will be part of the CQE, and thus we save another PCI transaction the recv buffers.
> 2. For requestor, this can be used to get the RDMA_READ response and
> RDMA_ATOMIC response in the CQE. This feature is already supported in upstream.
> 
> As part of this series, we are adding support for DC transport type and
> ability to enable the feature (force enable) in the requestor when SQ
> is not configured to signal all WRs.
> 
> Thanks
> 
> Yonatan Cohen (4):
>   net/mlx5: Expose DC scatter to CQE capability bit
>   IB/mlx5: Support scatter to CQE for DC transport type
>   IB/mlx5: Verify that driver supports user flags
>   IB/mlx5: Allow scatter to CQE without global signaled WRs
> 
>  drivers/infiniband/hw/mlx5/cq.c      |  2 +-
>  drivers/infiniband/hw/mlx5/mlx5_ib.h |  2 +-
>  drivers/infiniband/hw/mlx5/qp.c      | 93 ++++++++++++++++++++++++++++--------
>  include/linux/mlx5/mlx5_ifc.h        |  3 +-
>  include/uapi/rdma/mlx5-abi.h         |  1 +
>  5 files changed, 79 insertions(+), 22 deletions(-)
> 
> --
> 2.14.4
> 


Hi Leon,

This series looks fine.  Let me know when the net/mlx5 portion has been
committed.

-- 
Doug Ledford <dledford@redhat.com>
    GPG KeyID: B826A3330E572FDD
    Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [bpf-next PATCH 0/3] sockmap support for msg_peek flag
From: Alexei Starovoitov @ 2018-10-16 18:41 UTC (permalink / raw)
  To: John Fastabend; +Cc: ast, daniel, netdev
In-Reply-To: <20181016180424.13607.7932.stgit@john-Precision-Tower-5810>

On Tue, Oct 16, 2018 at 11:07:54AM -0700, John Fastabend wrote:
> This adds support for the MSG_PEEK flag when redirecting into an
> ingress psock sk_msg queue.
> 
> The first patch adds some base support to the helpers, then the
> feature, and finally we add an option for the test suite to do
> a duplicate MSG_PEEK call on every recv to test the feature.
> 
> With duplicate MSG_PEEK call all tests continue to PASS.

for the set
Acked-by: Alexei Starovoitov <ast@kernel.org>

^ permalink raw reply

* Re: [RFC] virtio_net: add local_bh_disable() around u64_stats_update_begin
From: Sebastian Andrzej Siewior @ 2018-10-16 18:42 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: netdev, virtualization, tglx, Toshiaki Makita, Michael S. Tsirkin,
	Jason Wang, David S. Miller
In-Reply-To: <20181016110114.73e2b248@xeon-e3>

On 2018-10-16 11:01:14 [-0700], Stephen Hemminger wrote:
> On Tue, 16 Oct 2018 18:55:45 +0200
> Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> 
> > Also, ptr->var++ is not an atomic operation even on 64bit CPUs. Which
> > means if try_fill_recv() runs on CPU0 (via virtnet_receive()) then the
> > worker might run on CPU1.
> 
> On modern CPU's increment of native types is atomic but not locked.
> u64_stats_update_begin is a no-op on UP and also if BIT_PER_LONG != 32

On ARM64 you have load, inc, store. So if two CPUs increment the counter
simultaneously we might lose one increment. That is why I asked if we
care or not.

Sebastian

^ permalink raw reply

* Re: [RFC] virtio_net: add local_bh_disable() around u64_stats_update_begin
From: Stephen Hemminger @ 2018-10-16 18:44 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: netdev, virtualization, tglx, Toshiaki Makita, Michael S. Tsirkin,
	Jason Wang, David S. Miller
In-Reply-To: <20181016184206.coukhtgmlr32hyl7@linutronix.de>

On Tue, 16 Oct 2018 20:42:07 +0200
Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:

> On 2018-10-16 11:01:14 [-0700], Stephen Hemminger wrote:
> > On Tue, 16 Oct 2018 18:55:45 +0200
> > Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
> >   
> > > Also, ptr->var++ is not an atomic operation even on 64bit CPUs. Which
> > > means if try_fill_recv() runs on CPU0 (via virtnet_receive()) then the
> > > worker might run on CPU1.  
> > 
> > On modern CPU's increment of native types is atomic but not locked.
> > u64_stats_update_begin is a no-op on UP and also if BIT_PER_LONG != 32  
> 
> On ARM64 you have load, inc, store. So if two CPUs increment the counter
> simultaneously we might lose one increment. That is why I asked if we
> care or not.
> 
> Sebastian

The point is that kicks is just a counter, not important as part of the
device operation. The point of the u64_stats is to avoid problems with
high/low 32 bit wrap on increment. So this is ok on ARM64.

^ permalink raw reply

* Re: [PATCH rdma-next v1 0/4] Scatter to CQE
From: Leon Romanovsky @ 2018-10-16 19:00 UTC (permalink / raw)
  To: Doug Ledford
  Cc: Jason Gunthorpe, RDMA mailing list, Guy Levi, Yonatan Cohen,
	Saeed Mahameed, linux-netdev
In-Reply-To: <f59c18d5d4d02d93d3e918f93b8a4a81bdf6e5a6.camel@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 2268 bytes --]

On Tue, Oct 16, 2018 at 02:39:01PM -0400, Doug Ledford wrote:
> On Tue, 2018-10-09 at 12:05 +0300, Leon Romanovsky wrote:
> > From: Leon Romanovsky <leonro@mellanox.com>
> >
> > Changelog v0->v1:
> >  * Changed patch #3 to use check_mask function from rdma-core instead define.
> >
> > --------------------------------------------------------------------------
> > From Yonatan,
> >
> > Scatter to CQE is a HW offload feature that saves PCI writes by
> > scattering the payload to the CQE.
> >
> > The feature depends on the CQE size and if the CQE size is 64B, it will
> > work for payload smaller than 32. If the CQE size is 128B, it will work for
> > payload smaller than 64.
> >
> > The feature works for responder and requestor:
> > 1. For responder, if the payload is small as required above, the data
> > will be part of the CQE, and thus we save another PCI transaction the recv buffers.
> > 2. For requestor, this can be used to get the RDMA_READ response and
> > RDMA_ATOMIC response in the CQE. This feature is already supported in upstream.
> >
> > As part of this series, we are adding support for DC transport type and
> > ability to enable the feature (force enable) in the requestor when SQ
> > is not configured to signal all WRs.
> >
> > Thanks
> >
> > Yonatan Cohen (4):
> >   net/mlx5: Expose DC scatter to CQE capability bit
> >   IB/mlx5: Support scatter to CQE for DC transport type
> >   IB/mlx5: Verify that driver supports user flags
> >   IB/mlx5: Allow scatter to CQE without global signaled WRs
> >
> >  drivers/infiniband/hw/mlx5/cq.c      |  2 +-
> >  drivers/infiniband/hw/mlx5/mlx5_ib.h |  2 +-
> >  drivers/infiniband/hw/mlx5/qp.c      | 93 ++++++++++++++++++++++++++++--------
> >  include/linux/mlx5/mlx5_ifc.h        |  3 +-
> >  include/uapi/rdma/mlx5-abi.h         |  1 +
> >  5 files changed, 79 insertions(+), 22 deletions(-)
> >
> > --
> > 2.14.4
> >
>
>
> Hi Leon,
>
> This series looks fine.  Let me know when the net/mlx5 portion has been
> committed.

Thanks Doug,
I pushed first patch to mlx5-next
94a04d1d3d36 ("net/mlx5: Expose DC scatter to CQE capability bit")

>
> --
> Doug Ledford <dledford@redhat.com>
>     GPG KeyID: B826A3330E572FDD
>     Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD



[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* [PATCH net] sctp: not free the new asoc when sctp_wait_for_connect returns err
From: Xin Long @ 2018-10-16 19:06 UTC (permalink / raw)
  To: network dev, linux-sctp; +Cc: davem, Marcelo Ricardo Leitner, Neil Horman

When sctp_wait_for_connect is called to wait for connect ready
for sp->strm_interleave in sctp_sendmsg_to_asoc, a panic could
be triggered if cpu is scheduled out and the new asoc is freed
elsewhere, as it will return err and later the asoc gets freed
again in sctp_sendmsg.

[  285.840764] list_del corruption, ffff9f0f7b284078->next is LIST_POISON1 (dead000000000100)
[  285.843590] WARNING: CPU: 1 PID: 8861 at lib/list_debug.c:47 __list_del_entry_valid+0x50/0xa0
[  285.846193] Kernel panic - not syncing: panic_on_warn set ...
[  285.846193]
[  285.848206] CPU: 1 PID: 8861 Comm: sctp_ndata Kdump: loaded Not tainted 4.19.0-rc7.label #584
[  285.850559] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
[  285.852164] Call Trace:
...
[  285.872210]  ? __list_del_entry_valid+0x50/0xa0
[  285.872894]  sctp_association_free+0x42/0x2d0 [sctp]
[  285.873612]  sctp_sendmsg+0x5a4/0x6b0 [sctp]
[  285.874236]  sock_sendmsg+0x30/0x40
[  285.874741]  ___sys_sendmsg+0x27a/0x290
[  285.875304]  ? __switch_to_asm+0x34/0x70
[  285.875872]  ? __switch_to_asm+0x40/0x70
[  285.876438]  ? ptep_set_access_flags+0x2a/0x30
[  285.877083]  ? do_wp_page+0x151/0x540
[  285.877614]  __sys_sendmsg+0x58/0xa0
[  285.878138]  do_syscall_64+0x55/0x180
[  285.878669]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

This is a similar issue with the one fixed in Commit ca3af4dd28cf
("sctp: do not free asoc when it is already dead in sctp_sendmsg").
But this one can't be fixed by returning -ESRCH for the dead asoc
in sctp_wait_for_connect, as it will break sctp_connect's return
value to users.

This patch is to simply set err to -ESRCH before it returns to
sctp_sendmsg when any err is returned by sctp_wait_for_connect
for sp->strm_interleave, so that no asoc would be freed due to
this.

When users see this error, they will know the packet hasn't been
sent. And it also makes sense to not free asoc because waiting
connect fails, like the second call for sctp_wait_for_connect in
sctp_sendmsg_to_asoc.

Fixes: 668c9beb9020 ("sctp: implement assign_number for sctp_stream_interleave")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 net/sctp/socket.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index e25a20f..1baa9d9 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1946,8 +1946,10 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
 		if (sp->strm_interleave) {
 			timeo = sock_sndtimeo(sk, 0);
 			err = sctp_wait_for_connect(asoc, &timeo);
-			if (err)
+			if (err) {
+				err = -ESRCH;
 				goto err;
+			}
 		} else {
 			wait_connect = true;
 		}
-- 
2.1.0

^ permalink raw reply related

* [PATCH net-next 0/2] sctp: fix sk_wmem_queued and use it to check for writable space
From: Xin Long @ 2018-10-16 19:07 UTC (permalink / raw)
  To: network dev, linux-sctp; +Cc: Marcelo Ricardo Leitner, Neil Horman, davem

sctp doesn't count and use asoc sndbuf_used, sk sk_wmem_alloc and
sk_wmem_queued properly, which also causes some problem.

This patchset is to improve it.

Xin Long (2):
  sctp: count both sk and asoc sndbuf with skb truesize and sctp_chunk
    size
  sctp: use sk_wmem_queued to check for writable space

 include/net/sctp/constants.h |  5 ----
 net/sctp/outqueue.c          |  8 ++----
 net/sctp/socket.c            | 59 +++++++++++---------------------------------
 3 files changed, 17 insertions(+), 55 deletions(-)

-- 
2.1.0

^ permalink raw reply

* [PATCH net-next 1/2] sctp: count both sk and asoc sndbuf with skb truesize and sctp_chunk size
From: Xin Long @ 2018-10-16 19:07 UTC (permalink / raw)
  To: network dev, linux-sctp; +Cc: Marcelo Ricardo Leitner, Neil Horman, davem
In-Reply-To: <cover.1539716812.git.lucien.xin@gmail.com>

Now it's confusing that asoc sndbuf_used is doing memory accounting with
SCTP_DATA_SNDSIZE(chunk) + sizeof(sk_buff) + sizeof(sctp_chunk) while sk
sk_wmem_alloc is doing that with skb->truesize + sizeof(sctp_chunk).

It also causes sctp_prsctp_prune to count with a wrong freed memory when
sndbuf_policy is not set.

To make this right and also keep consistent between asoc sndbuf_used, sk
sk_wmem_alloc and sk_wmem_queued, use skb->truesize + sizeof(sctp_chunk)
for them.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 include/net/sctp/constants.h |  5 -----
 net/sctp/outqueue.c          |  8 ++------
 net/sctp/socket.c            | 21 ++++++---------------
 3 files changed, 8 insertions(+), 26 deletions(-)

diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 86f034b..8dadc74 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -148,11 +148,6 @@ SCTP_SUBTYPE_CONSTRUCTOR(PRIMITIVE,	enum sctp_event_primitive, primitive)
 #define sctp_chunk_is_data(a) (a->chunk_hdr->type == SCTP_CID_DATA || \
 			       a->chunk_hdr->type == SCTP_CID_I_DATA)
 
-/* Calculate the actual data size in a data chunk */
-#define SCTP_DATA_SNDSIZE(c) ((int)((unsigned long)(c->chunk_end) - \
-				    (unsigned long)(c->chunk_hdr) - \
-				    sctp_datachk_len(&c->asoc->stream)))
-
 /* Internal error codes */
 enum sctp_ierror {
 	SCTP_IERROR_NO_ERROR	        = 0,
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 42191ed..9cb854b 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -385,9 +385,7 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc,
 			asoc->outqueue.outstanding_bytes -= sctp_data_size(chk);
 		}
 
-		msg_len -= SCTP_DATA_SNDSIZE(chk) +
-			   sizeof(struct sk_buff) +
-			   sizeof(struct sctp_chunk);
+		msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk);
 		if (msg_len <= 0)
 			break;
 	}
@@ -421,9 +419,7 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc,
 			streamout->ext->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++;
 		}
 
-		msg_len -= SCTP_DATA_SNDSIZE(chk) +
-			   sizeof(struct sk_buff) +
-			   sizeof(struct sctp_chunk);
+		msg_len -= chk->skb->truesize + sizeof(struct sctp_chunk);
 		sctp_chunk_free(chk);
 		if (msg_len <= 0)
 			break;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f73e9d3..c6f2950 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -166,12 +166,9 @@ static inline void sctp_set_owner_w(struct sctp_chunk *chunk)
 	/* Save the chunk pointer in skb for sctp_wfree to use later.  */
 	skb_shinfo(chunk->skb)->destructor_arg = chunk;
 
-	asoc->sndbuf_used += SCTP_DATA_SNDSIZE(chunk) +
-				sizeof(struct sk_buff) +
-				sizeof(struct sctp_chunk);
-
 	refcount_add(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc);
-	sk->sk_wmem_queued += chunk->skb->truesize;
+	asoc->sndbuf_used += chunk->skb->truesize + sizeof(struct sctp_chunk);
+	sk->sk_wmem_queued += chunk->skb->truesize + sizeof(struct sctp_chunk);
 	sk_mem_charge(sk, chunk->skb->truesize);
 }
 
@@ -8460,17 +8457,11 @@ static void sctp_wfree(struct sk_buff *skb)
 	struct sctp_association *asoc = chunk->asoc;
 	struct sock *sk = asoc->base.sk;
 
-	asoc->sndbuf_used -= SCTP_DATA_SNDSIZE(chunk) +
-				sizeof(struct sk_buff) +
-				sizeof(struct sctp_chunk);
-
-	WARN_ON(refcount_sub_and_test(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc));
-
-	/*
-	 * This undoes what is done via sctp_set_owner_w and sk_mem_charge
-	 */
-	sk->sk_wmem_queued   -= skb->truesize;
 	sk_mem_uncharge(sk, skb->truesize);
+	sk->sk_wmem_queued -= skb->truesize + sizeof(struct sctp_chunk);
+	asoc->sndbuf_used -= skb->truesize + sizeof(struct sctp_chunk);
+	WARN_ON(refcount_sub_and_test(sizeof(struct sctp_chunk),
+				      &sk->sk_wmem_alloc));
 
 	if (chunk->shkey) {
 		struct sctp_shared_key *shkey = chunk->shkey;
-- 
2.1.0

^ permalink raw reply related

* [PATCH net-next 2/2] sctp: use sk_wmem_queued to check for writable space
From: Xin Long @ 2018-10-16 19:07 UTC (permalink / raw)
  To: network dev, linux-sctp; +Cc: Marcelo Ricardo Leitner, Neil Horman, davem
In-Reply-To: <cover.1539716812.git.lucien.xin@gmail.com>

sk->sk_wmem_queued is used to count the size of chunks in out queue
while sk->sk_wmem_alloc is for counting the size of chunks has been
sent. sctp is increasing both of them before enqueuing the chunks,
and using sk->sk_wmem_alloc to check for writable space.

However, sk_wmem_alloc is also increased by 1 for the skb allocked
for sending in sctp_packet_transmit() but it will not wake up the
waiters when sk_wmem_alloc is decreased in this skb's destructor.

If msg size is equal to sk_sndbuf and sendmsg is waiting for sndbuf,
the check 'msg_len <= sctp_wspace(asoc)' in sctp_wait_for_sndbuf()
will keep waiting if there's a skb allocked in sctp_packet_transmit,
and later even if this skb got freed, the waiting thread will never
get waked up.

This issue has been there since very beginning, so we change to use
sk->sk_wmem_queued to check for writable space as sk_wmem_queued is
not increased for the skb allocked for sending, also as TCP does.

SOCK_SNDBUF_LOCK check is also removed here as it's for tx buf auto
tuning which I will add in another patch.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 net/sctp/socket.c | 38 +++++++++-----------------------------
 1 file changed, 9 insertions(+), 29 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index c6f2950..111ebd8 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -83,7 +83,7 @@
 #include <net/sctp/stream_sched.h>
 
 /* Forward declarations for internal helper functions. */
-static int sctp_writeable(struct sock *sk);
+static bool sctp_writeable(struct sock *sk);
 static void sctp_wfree(struct sk_buff *skb);
 static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
 				size_t msg_len);
@@ -119,25 +119,10 @@ static void sctp_enter_memory_pressure(struct sock *sk)
 /* Get the sndbuf space available at the time on the association.  */
 static inline int sctp_wspace(struct sctp_association *asoc)
 {
-	int amt;
+	struct sock *sk = asoc->base.sk;
 
-	if (asoc->ep->sndbuf_policy)
-		amt = asoc->sndbuf_used;
-	else
-		amt = sk_wmem_alloc_get(asoc->base.sk);
-
-	if (amt >= asoc->base.sk->sk_sndbuf) {
-		if (asoc->base.sk->sk_userlocks & SOCK_SNDBUF_LOCK)
-			amt = 0;
-		else {
-			amt = sk_stream_wspace(asoc->base.sk);
-			if (amt < 0)
-				amt = 0;
-		}
-	} else {
-		amt = asoc->base.sk->sk_sndbuf - amt;
-	}
-	return amt;
+	return asoc->ep->sndbuf_policy ? sk->sk_sndbuf - asoc->sndbuf_used
+				       : sk_stream_wspace(sk);
 }
 
 /* Increment the used sndbuf space count of the corresponding association by
@@ -1925,10 +1910,10 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
 		asoc->pmtu_pending = 0;
 	}
 
-	if (sctp_wspace(asoc) < msg_len)
+	if (sctp_wspace(asoc) < (int)msg_len)
 		sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc));
 
-	if (!sctp_wspace(asoc)) {
+	if (sctp_wspace(asoc) <= 0) {
 		timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
 		err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len);
 		if (err)
@@ -8535,7 +8520,7 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
 			goto do_error;
 		if (signal_pending(current))
 			goto do_interrupted;
-		if (msg_len <= sctp_wspace(asoc))
+		if ((int)msg_len <= sctp_wspace(asoc))
 			break;
 
 		/* Let another process have a go.  Since we are going
@@ -8610,14 +8595,9 @@ void sctp_write_space(struct sock *sk)
  * UDP-style sockets or TCP-style sockets, this code should work.
  *  - Daisy
  */
-static int sctp_writeable(struct sock *sk)
+static bool sctp_writeable(struct sock *sk)
 {
-	int amt = 0;
-
-	amt = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
-	if (amt < 0)
-		amt = 0;
-	return amt;
+	return sk->sk_sndbuf > sk->sk_wmem_queued;
 }
 
 /* Wait for an association to go into ESTABLISHED state. If timeout is 0,
-- 
2.1.0

^ permalink raw reply related

* [PATCH net-next v5] net/ncsi: Add NCSI Broadcom OEM command
From: Vijay Khemka @ 2018-10-16 19:13 UTC (permalink / raw)
  To: Samuel Mendoza-Jonas, David S. Miller, netdev, linux-kernel
  Cc: vijaykhemka, openbmc @ lists . ozlabs . org, Justin.Lee1, joel,
	linux-aspeed

This patch adds OEM Broadcom commands and response handling. It also
defines OEM Get MAC Address handler to get and configure the device.

ncsi_oem_gma_handler_bcm: This handler send NCSI broadcom command for
getting mac address.
ncsi_rsp_handler_oem_bcm: This handles response received for all
broadcom OEM commands.
ncsi_rsp_handler_oem_bcm_gma: This handles get mac address response and
set it to device.

Signed-off-by: Vijay Khemka <vijaykhemka@fb.com>
---
 net/ncsi/Kconfig       |  6 ++++
 net/ncsi/internal.h    |  9 +++++
 net/ncsi/ncsi-manage.c | 82 ++++++++++++++++++++++++++++++++++++++++++
 net/ncsi/ncsi-pkt.h    |  8 +++++
 net/ncsi/ncsi-rsp.c    | 44 +++++++++++++++++++++--
 5 files changed, 147 insertions(+), 2 deletions(-)

diff --git a/net/ncsi/Kconfig b/net/ncsi/Kconfig
index 08a8a6031fd7..7f2b46108a24 100644
--- a/net/ncsi/Kconfig
+++ b/net/ncsi/Kconfig
@@ -10,3 +10,9 @@ config NET_NCSI
 	  support. Enable this only if your system connects to a network
 	  device via NCSI and the ethernet driver you're using supports
 	  the protocol explicitly.
+config NCSI_OEM_CMD_GET_MAC
+	bool "Get NCSI OEM MAC Address"
+	depends on NET_NCSI
+	---help---
+	  This allows to get MAC address from NCSI firmware and set them back to
+		controller.
diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index 13c9b5eeb3b7..1dae77c54009 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -71,6 +71,13 @@ enum {
 /* OEM Vendor Manufacture ID */
 #define NCSI_OEM_MFR_MLX_ID             0x8119
 #define NCSI_OEM_MFR_BCM_ID             0x113d
+/* Broadcom specific OEM Command */
+#define NCSI_OEM_BCM_CMD_GMA            0x01   /* CMD ID for Get MAC */
+/* OEM Command payload lengths*/
+#define NCSI_OEM_BCM_CMD_GMA_LEN        12
+/* Mac address offset in OEM response */
+#define BCM_MAC_ADDR_OFFSET             28
+
 
 struct ncsi_channel_version {
 	u32 version;		/* Supported BCD encoded NCSI version */
@@ -246,6 +253,7 @@ enum {
 	ncsi_dev_state_probe_dp,
 	ncsi_dev_state_config_sp	= 0x0301,
 	ncsi_dev_state_config_cis,
+	ncsi_dev_state_config_oem_gma,
 	ncsi_dev_state_config_clear_vids,
 	ncsi_dev_state_config_svf,
 	ncsi_dev_state_config_ev,
@@ -279,6 +287,7 @@ struct ncsi_dev_priv {
 #define NCSI_DEV_PROBED		1            /* Finalized NCSI topology    */
 #define NCSI_DEV_HWA		2            /* Enabled HW arbitration     */
 #define NCSI_DEV_RESHUFFLE	4
+	unsigned int        gma_flag;        /* OEM GMA flag               */
 	spinlock_t          lock;            /* Protect the NCSI device    */
 #if IS_ENABLED(CONFIG_IPV6)
 	unsigned int        inet6_addr_num;  /* Number of IPv6 addresses   */
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index 6aa0614d2d28..bfc43b28c7a6 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -651,6 +651,72 @@ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc,
 	return 0;
 }
 
+#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC)
+
+/* NCSI OEM Command APIs */
+static int ncsi_oem_gma_handler_bcm(struct ncsi_cmd_arg *nca)
+{
+	unsigned char data[NCSI_OEM_BCM_CMD_GMA_LEN];
+	int ret = 0;
+
+	nca->payload = NCSI_OEM_BCM_CMD_GMA_LEN;
+
+	memset(data, 0, NCSI_OEM_BCM_CMD_GMA_LEN);
+	*(unsigned int *)data = ntohl(NCSI_OEM_MFR_BCM_ID);
+	data[5] = NCSI_OEM_BCM_CMD_GMA;
+
+	nca->data = data;
+
+	ret = ncsi_xmit_cmd(nca);
+	if (ret)
+		netdev_err(nca->ndp->ndev.dev,
+			   "NCSI: Failed to transmit cmd 0x%x during configure\n",
+			   nca->type);
+	return ret;
+}
+
+/* OEM Command handlers initialization */
+static struct ncsi_oem_gma_handler {
+	unsigned int	mfr_id;
+	int		(*handler)(struct ncsi_cmd_arg *nca);
+} ncsi_oem_gma_handlers[] = {
+	{ NCSI_OEM_MFR_BCM_ID, ncsi_oem_gma_handler_bcm }
+};
+
+static int ncsi_gma_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id)
+{
+	struct ncsi_oem_gma_handler *nch = NULL;
+	int i;
+
+	/* This function should only be called once, return if flag set */
+	if (nca->ndp->gma_flag == 1)
+		return -1;
+
+	/* Find gma handler for given manufacturer id */
+	for (i = 0; i < ARRAY_SIZE(ncsi_oem_gma_handlers); i++) {
+		if (ncsi_oem_gma_handlers[i].mfr_id == mf_id) {
+			if (ncsi_oem_gma_handlers[i].handler)
+				nch = &ncsi_oem_gma_handlers[i];
+			break;
+			}
+	}
+
+	if (!nch) {
+		netdev_err(nca->ndp->ndev.dev,
+			   "NCSI: No GMA handler available for MFR-ID (0x%x)\n",
+			   mf_id);
+		return -1;
+	}
+
+	/* Set the flag for GMA command which should only be called once */
+	nca->ndp->gma_flag = 1;
+
+	/* Get Mac address from NCSI device */
+	return nch->handler(nca);
+}
+
+#endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */
+
 static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
 {
 	struct ncsi_dev *nd = &ndp->ndev;
@@ -701,7 +767,23 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
 			goto error;
 		}
 
+		nd->state = ncsi_dev_state_config_oem_gma;
+		break;
+	case ncsi_dev_state_config_oem_gma:
 		nd->state = ncsi_dev_state_config_clear_vids;
+		ret = -1;
+
+#if IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC)
+		nca.type = NCSI_PKT_CMD_OEM;
+		nca.package = np->id;
+		nca.channel = nc->id;
+		ndp->pending_req_num = 1;
+		ret = ncsi_gma_handler(&nca, nc->version.mf_id);
+#endif /* CONFIG_NCSI_OEM_CMD_GET_MAC */
+
+		if (ret < 0)
+			schedule_work(&ndp->work);
+
 		break;
 	case ncsi_dev_state_config_clear_vids:
 	case ncsi_dev_state_config_svf:
diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h
index 0f2087c8d42a..4d3f06be38bd 100644
--- a/net/ncsi/ncsi-pkt.h
+++ b/net/ncsi/ncsi-pkt.h
@@ -165,6 +165,14 @@ struct ncsi_rsp_oem_pkt {
 	unsigned char           data[];      /* Payload data      */
 };
 
+/* Broadcom Response Data */
+struct ncsi_rsp_oem_bcm_pkt {
+	unsigned char           ver;         /* Payload Version   */
+	unsigned char           type;        /* OEM Command type  */
+	__be16                  len;         /* Payload Length    */
+	unsigned char           data[];      /* Cmd specific Data */
+};
+
 /* Get Link Status */
 struct ncsi_rsp_gls_pkt {
 	struct ncsi_rsp_pkt_hdr rsp;        /* Response header   */
diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c
index 85fa59afae34..77e07ba3f493 100644
--- a/net/ncsi/ncsi-rsp.c
+++ b/net/ncsi/ncsi-rsp.c
@@ -611,19 +611,59 @@ static int ncsi_rsp_handler_snfc(struct ncsi_request *nr)
 	return 0;
 }
 
+/* Response handler for Broadcom command Get Mac Address */
+static int ncsi_rsp_handler_oem_bcm_gma(struct ncsi_request *nr)
+{
+	struct ncsi_dev_priv *ndp = nr->ndp;
+	struct net_device *ndev = ndp->ndev.dev;
+	const struct net_device_ops *ops = ndev->netdev_ops;
+	struct ncsi_rsp_oem_pkt *rsp;
+	struct sockaddr saddr;
+	int ret = 0;
+
+	/* Get the response header */
+	rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp);
+
+	saddr.sa_family = ndev->type;
+	ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+	memcpy(saddr.sa_data, &rsp->data[BCM_MAC_ADDR_OFFSET], ETH_ALEN);
+	/* Increase mac address by 1 for BMC's address */
+	saddr.sa_data[ETH_ALEN - 1]++;
+	ret = ops->ndo_set_mac_address(ndev, &saddr);
+	if (ret < 0)
+		netdev_warn(ndev, "NCSI: 'Writing mac address to device failed\n");
+
+	return ret;
+}
+
+/* Response handler for Broadcom card */
+static int ncsi_rsp_handler_oem_bcm(struct ncsi_request *nr)
+{
+	struct ncsi_rsp_oem_bcm_pkt *bcm;
+	struct ncsi_rsp_oem_pkt *rsp;
+
+	/* Get the response header */
+	rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp);
+	bcm = (struct ncsi_rsp_oem_bcm_pkt *)(rsp->data);
+
+	if (bcm->type == NCSI_OEM_BCM_CMD_GMA)
+		return ncsi_rsp_handler_oem_bcm_gma(nr);
+	return 0;
+}
+
 static struct ncsi_rsp_oem_handler {
 	unsigned int	mfr_id;
 	int		(*handler)(struct ncsi_request *nr);
 } ncsi_rsp_oem_handlers[] = {
 	{ NCSI_OEM_MFR_MLX_ID, NULL },
-	{ NCSI_OEM_MFR_BCM_ID, NULL }
+	{ NCSI_OEM_MFR_BCM_ID, ncsi_rsp_handler_oem_bcm }
 };
 
 /* Response handler for OEM command */
 static int ncsi_rsp_handler_oem(struct ncsi_request *nr)
 {
-	struct ncsi_rsp_oem_pkt *rsp;
 	struct ncsi_rsp_oem_handler *nrh = NULL;
+	struct ncsi_rsp_oem_pkt *rsp;
 	unsigned int mfr_id, i;
 
 	/* Get the response header */
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next] tcp, ulp: remove socket lock assertion on ULP cleanup
From: Daniel Borkmann @ 2018-10-16 19:31 UTC (permalink / raw)
  To: davem
  Cc: eric.dumazet, john.fastabend, alexei.starovoitov, davejwatson,
	netdev, Daniel Borkmann

Eric reported that syzkaller triggered a splat in tcp_cleanup_ulp()
where assertion sock_owned_by_me() failed. This happened through
inet_csk_prepare_forced_close() first releasing the socket lock,
then calling into tcp_done(newsk) which is called after the
inet_csk_prepare_forced_close() and therefore without the socket
lock held. The sock_owned_by_me() assertion can generally be
removed as the only place where tcp_cleanup_ulp() is called from
now is out of inet_csk_destroy_sock() -> sk->sk_prot->destroy()
where socket is in dead state and unreachable. Therefore, add a
comment why the check is not needed instead.

Fixes: 8b9088f806e1 ("tcp, ulp: enforce sock_owned_by_me upon ulp init and cleanup")
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/ipv4/tcp_ulp.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index a9162aa..95df7f7 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -99,8 +99,10 @@ void tcp_cleanup_ulp(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);

-	sock_owned_by_me(sk);
-
+	/* No sock_owned_by_me() check here as at the time the
+	 * stack calls this function, the socket is dead and
+	 * about to be destroyed.
+	 */
 	if (!icsk->icsk_ulp_ops)
 		return;

-- 
2.9.5

^ permalink raw reply related

* Re: [PATCH net-next] tcp, ulp: remove socket lock assertion on ULP cleanup
From: David Miller @ 2018-10-16 19:39 UTC (permalink / raw)
  To: daniel
  Cc: eric.dumazet, john.fastabend, alexei.starovoitov, davejwatson,
	netdev
In-Reply-To: <20181016193135.13571-1-daniel@iogearbox.net>

From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 16 Oct 2018 21:31:35 +0200

> Eric reported that syzkaller triggered a splat in tcp_cleanup_ulp()
> where assertion sock_owned_by_me() failed. This happened through
> inet_csk_prepare_forced_close() first releasing the socket lock,
> then calling into tcp_done(newsk) which is called after the
> inet_csk_prepare_forced_close() and therefore without the socket
> lock held. The sock_owned_by_me() assertion can generally be
> removed as the only place where tcp_cleanup_ulp() is called from
> now is out of inet_csk_destroy_sock() -> sk->sk_prot->destroy()
> where socket is in dead state and unreachable. Therefore, add a
> comment why the check is not needed instead.
> 
> Fixes: 8b9088f806e1 ("tcp, ulp: enforce sock_owned_by_me upon ulp init and cleanup")
> Reported-by: Eric Dumazet <edumazet@google.com>
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>

Applied.

^ permalink raw reply

* Re: netconsole warning in 4.19.0-rc7
From: Dave Jones @ 2018-10-17  3:40 UTC (permalink / raw)
  To: Cong Wang; +Cc: Meelis Roos, LKML, Linux Kernel Network Developers
In-Reply-To: <CAM_iQpU0+LVwgMea32KQWf8z6Bp1VTmWOGnWNwNSmV0PZzf_ew@mail.gmail.com>

On Wed, Oct 10, 2018 at 10:34:49PM -0700, Cong Wang wrote:
 > (Cc'ing Dave)
 > 
 > On Wed, Oct 10, 2018 at 5:14 AM Meelis Roos <mroos@linux.ee> wrote:
 > >
 > > Thies 4.19-rc7 on a bunch of test machines and got this warning from one.
 > > It is reproducible and I have not noticed it before.
 > >
 > [...]
 > > [    9.914805] WARNING: CPU: 0 PID: 0 at kernel/softirq.c:168 __local_bh_enable_ip+0x2e/0x44
 > > [    9.914806] Modules linked in:
 > > [    9.914808] CPU: 0 PID: 0 Comm: swapper Not tainted 4.19.0-rc7 #210
 > > [    9.914810] Hardware name: MicroLink                       /D850MV                         , BIOS MV85010A.86A.0067.P24.0304081124 04/08/2003
 > > [    9.914811] EIP: __local_bh_enable_ip+0x2e/0x44
 > > [    9.914813] Code: cc 02 5f c8 a9 00 00 0f 00 75 1f 83 ea 01 f7 da 01 15 cc 02 5f c8 a1 cc 02 5f c8 a9 00 ff 1f 00 74 0c ff 0d cc 02 5f c8 5d c3 <0f> 0b eb dd 66 a1 80 cd 5e c8 66 85 c0 74 e9 e8 87 ff ff ff eb e2
 > > [    9.914814] EAX: 80010200 EBX: f602b000 ECX: 36346270 EDX: 00000200
 > > [    9.914815] ESI: f620ecc0 EDI: f620ebac EBP: f600de40 ESP: f600de40
 > > [    9.914816] DS: 007b ES: 007b FS: 0000 GS: 00e0 SS: 0068 EFLAGS: 00010006
 > > [    9.914817] CR0: 80050033 CR2: b7f5f000 CR3: 36389000 CR4: 000006d0
 > > [    9.914818] Call Trace:
 > > [    9.914819]  <IRQ>
 > > [    9.914820]  netpoll_send_skb_on_dev+0xa5/0x1b0
 > 
 > This is exactly what I mentioned in my review here:
 > https://marc.info/?l=linux-netdev&m=153816136624679&w=2
 > 
 > "But irq is disabled here, so not sure if rcu_read_lock_bh()
 > could cause trouble... "

Not sure why this didn't show up for me when I was developing that
patch, but I can now reproduce this.  The patch below fixes it for
me, but I'm not sure if there are still any side-effects.
There's also a missed unlock in the error path.

If this looks legit, I'll submit it formally..

Tested on both bonded and unbonded interfaces.

diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index de1d1ba92f2d..fafb9e2c4e7b 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -312,12 +312,13 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
 	/* It is up to the caller to keep npinfo alive. */
 	struct netpoll_info *npinfo;
 
-	rcu_read_lock_bh();
+	rcu_read_lock();
 	lockdep_assert_irqs_disabled();
 
 	npinfo = rcu_dereference_bh(np->dev->npinfo);
 	if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
 		dev_kfree_skb_irq(skb);
+		rcu_read_unlock();
 		return;
 	}
 
@@ -357,7 +358,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
 		skb_queue_tail(&npinfo->txq, skb);
 		schedule_delayed_work(&npinfo->tx_work,0);
 	}
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL(netpoll_send_skb_on_dev);
 

^ permalink raw reply related

* [PATCH net] r8169: fix NAPI handling under high load
From: Heiner Kallweit @ 2018-10-16 20:37 UTC (permalink / raw)
  To: David Miller, Realtek linux nic maintainers; +Cc: netdev@vger.kernel.org

rtl_rx() and rtl_tx() are called only if the respective bits are set
in the interrupt status register. Under high load NAPI may not be
able to process all data (work_done == budget) and it will schedule
subsequent calls to the poll callback.
rtl_ack_events() however resets the bits in the interrupt status
register, therefore subsequent calls to rtl8169_poll() won't call
rtl_rx() and rtl_tx() - chip interrupts are still disabled.

Fix this by calling rtl_rx() and rtl_tx() independent of the bits
set in the interrupt status register. Both functions will detect
if there's nothing to do for them.

This issue has been there more or less forever (at least it exists in
3.16 already), so I can't provide a "Fixes" tag. 

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
 drivers/net/ethernet/realtek/r8169.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index 8c4f49adc..7caf3b7e9 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -6528,17 +6528,15 @@ static int rtl8169_poll(struct napi_struct *napi, int budget)
 	struct rtl8169_private *tp = container_of(napi, struct rtl8169_private, napi);
 	struct net_device *dev = tp->dev;
 	u16 enable_mask = RTL_EVENT_NAPI | tp->event_slow;
-	int work_done= 0;
+	int work_done;
 	u16 status;

 	status = rtl_get_events(tp);
 	rtl_ack_events(tp, status & ~tp->event_slow);

-	if (status & RTL_EVENT_NAPI_RX)
-		work_done = rtl_rx(dev, tp, (u32) budget);
+	work_done = rtl_rx(dev, tp, (u32) budget);

-	if (status & RTL_EVENT_NAPI_TX)
-		rtl_tx(dev, tp);
+	rtl_tx(dev, tp);

 	if (status & tp->event_slow) {
 		enable_mask &= ~tp->event_slow;
-- 
2.19.1

^ permalink raw reply related

* Re: [PATCH bpf-next v2 3/7] bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall
From: Mauricio Vasquez @ 2018-10-16 21:16 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Song Liu, Alexei Starovoitov, Daniel Borkmann, Networking
In-Reply-To: <20181011235136.mfryrjhur6yoslp5@ast-mbp.dhcp.thefacebook.com>

On 10/11/2018 06:51 PM, Alexei Starovoitov wrote:
> On Wed, Oct 10, 2018 at 05:50:01PM -0500, Mauricio Vasquez wrote:
>>>> Does it make sense to you?
>>> I reread the other patch, and found it does NOT use the following logic for
>>> queue and stack:
>>>
>>>                  rcu_read_lock();
>>>                  ptr = map->ops->map_lookup_and_delete_elem(map, key);
>>>                  if (ptr)
>>>                          memcpy(value, ptr, value_size);
>>>
>>> I guess this part is not used at all? Can we just remove it?
>>>
>>> Thanks,
>>> Song
>> This is the base code for map_lookup_and_delete support, it is not used in
>> queue/stack maps.
>>
>> I think we can leave it there, so when somebody implements lookup_and_delete
>> for other maps doesn't have to care about implementing also this.
> The code looks useful to me, but I also agree with Song. And in the kernel
> we don't typically add 'almost dead code'.
> May be provide an implementation of the lookup_and_delete for hash map
> so it's actually used ?

I haven't written any code but I think there is a potential problem here.
Current lookup_and_delete returns a pointer to the element, hence 
deletion of the element should be done using call_rcu to guarantee this 
is valid after returning.
In the hashtab, the deletion only uses call_rcu when there is not 
prealloc, otherwise the element is pushed on the list of free elements 
immediately.
If we move the logic to push elements into the free list under a 
call_rcu invocation, it could happen that the free list is empty because 
the call_rcu is still pending (a similar issue we had with the 
queue/stack maps when they used a pass by reference API).

There is another way to implement it without this issue, in syscall.c:
l = ops->lookup(key);
memcpy(l, some_buffer)
ops->delete(key)

The point here is that the lookup_and_delete operation is not being used 
at all, so, is lookup + delete = lookup_and_delete?, can it be generalized?
If this is true, then what is the sense of having lookup_and_delete 
syscall?,

> imo it would be a useful feature to have for hash map and will clarify
> the intent of it for all other maps and for stack/queue in particular.
>
>

^ permalink raw reply

* Re: [PATCH net] r8169: fix NAPI handling under high load
From: Holger Hoffstätte @ 2018-10-16 21:17 UTC (permalink / raw)
  To: Heiner Kallweit, David Miller, Realtek linux nic maintainers
  Cc: netdev@vger.kernel.org
In-Reply-To: <8f84fe39-3d8d-396d-3b97-027e0a83f8cb@gmail.com>

On 10/16/18 22:37, Heiner Kallweit wrote:
> rtl_rx() and rtl_tx() are called only if the respective bits are set
> in the interrupt status register. Under high load NAPI may not be
> able to process all data (work_done == budget) and it will schedule
> subsequent calls to the poll callback.
> rtl_ack_events() however resets the bits in the interrupt status
> register, therefore subsequent calls to rtl8169_poll() won't call
> rtl_rx() and rtl_tx() - chip interrupts are still disabled.

Very interesting! Could this be the reason for the mysterious
hangs & resets we experienced when enabling BQL for r8169?
They happened more often with TSO/GSO enabled and several people
attempted to fix those hangs unsuccessfully; it was later reverted
and has been since then (#87cda7cb43).
If this bug has been there "forever" it might be tempting to
re-apply BQL and see what happens. Any chance you could give that
a try? I'll gladly test patches, just like I'll run this one.

cheers
Holger

^ permalink raw reply

* Re: gianfar: Implement MAC reset and reconfig procedure
From: Daniel Walker @ 2018-10-16 21:36 UTC (permalink / raw)
  To: Claudiu Manoil; +Cc: Hemant Ramdasi, netdev

Hi,

I would like to report an issue in the gianfar driver. The issue is as follows. 

We have a P2020 board that uses the gianfar driver, and we have a m88e1101
PHY connect. When the interface is initially brought up traffic flows as
normal. If you take the interface down then bring it back up traffic stops
flowing. If you do this sequence over and over up/down/up we find that the
interface will allow traffic to flow at a low percentage.

In v4.9 interface allows traffic about %10 of the time.

In v4.19-rc8 the allows traffic %30 of the time.

After bisecting I found that in v3.14 the interface was rock solid and never did
we see this issue. However, v3.15 we started to see this issue. After bisecting I
found the following change is the first one which causes the issue,

a328ac9 gianfar: Implement MAC reset and reconfig procedure

I was able to revert this in v3.15 , however with later development a revert
doesn't appear to be possible. We have no fix for this currently.

I can do testing if you have an idea what might cause the issue.

Daniel

^ permalink raw reply

* Re: gianfar: Implement MAC reset and reconfig procedure
From: Florian Fainelli @ 2018-10-16 22:03 UTC (permalink / raw)
  To: Daniel Walker, Claudiu Manoil; +Cc: Hemant Ramdasi, netdev
In-Reply-To: <20181016213629.mjgrkifpt265hvvm@zorba>

On 10/16/2018 02:36 PM, Daniel Walker wrote:
> Hi,
> 
> I would like to report an issue in the gianfar driver. The issue is as follows. 
> 
> We have a P2020 board that uses the gianfar driver, and we have a m88e1101
> PHY connect. When the interface is initially brought up traffic flows as
> normal. If you take the interface down then bring it back up traffic stops
> flowing. If you do this sequence over and over up/down/up we find that the
> interface will allow traffic to flow at a low percentage.
> 
> In v4.9 interface allows traffic about %10 of the time.
> 
> In v4.19-rc8 the allows traffic %30 of the time.
> 
> After bisecting I found that in v3.14 the interface was rock solid and never did
> we see this issue. However, v3.15 we started to see this issue. After bisecting I
> found the following change is the first one which causes the issue,
> 
> a328ac9 gianfar: Implement MAC reset and reconfig procedure
> 
> I was able to revert this in v3.15 , however with later development a revert
> doesn't appear to be possible. We have no fix for this currently.
> 
> I can do testing if you have an idea what might cause the issue.

What we have seen being typically the problem is that when you have a
PHY connection whereby the PHY provides the RX clock to the MAC (e.g:
RGMII), it is very easy to get in a situation where the PHY clock is
stopped, and the MAC is asked to be reset, but the HW design does not
like that at all since it e.g: stops on packet boundaries and need some
clock cycles to do that, and that results in all sorts of issues (in our
case it was some FIFO corruption). We solved that in bcmgenet.c with
looping internally the TX clock to the RX clock to make sure the
Ethernet MAC (UniMAC in our designs) was successfully reset:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=28c2d1a7a0bfdf3617800d2beae1c67983c03d15

Could that somehow be the problem here?
-- 
Florian

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox