Netdev List
 help / color / mirror / Atom feed
* [PATCH 10/12] netfilter: x_tables: avoid stack-out-of-bounds read in xt_copy_counters_from_user
From: Pablo Neira Ayuso @ 2017-10-09 16:25 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1507566346-32553-1-git-send-email-pablo@netfilter.org>

From: Eric Dumazet <edumazet@google.com>

syzkaller reports an out of bound read in strlcpy(), triggered
by xt_copy_counters_from_user()

Fix this by using memcpy(), then forcing a zero byte at the last position
of the destination, as Florian did for the non COMPAT code.

Fixes: d7591f0c41ce ("netfilter: x_tables: introduce and use xt_copy_counters_from_user")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/x_tables.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index c83a3b5e1c6c..d8571f414208 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -892,7 +892,7 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
 		if (copy_from_user(&compat_tmp, user, sizeof(compat_tmp)) != 0)
 			return ERR_PTR(-EFAULT);
 
-		strlcpy(info->name, compat_tmp.name, sizeof(info->name));
+		memcpy(info->name, compat_tmp.name, sizeof(info->name) - 1);
 		info->num_counters = compat_tmp.num_counters;
 		user += sizeof(compat_tmp);
 	} else
@@ -905,9 +905,9 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
 		if (copy_from_user(info, user, sizeof(*info)) != 0)
 			return ERR_PTR(-EFAULT);
 
-		info->name[sizeof(info->name) - 1] = '\0';
 		user += sizeof(*info);
 	}
+	info->name[sizeof(info->name) - 1] = '\0';
 
 	size = sizeof(struct xt_counters);
 	size *= info->num_counters;
-- 
2.1.4


^ permalink raw reply related

* [PATCH 03/12] netfilter: ipset: Fix adding an IPv4 range containing more than 2^31 addresses
From: Pablo Neira Ayuso @ 2017-10-09 16:25 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1507566346-32553-1-git-send-email-pablo@netfilter.org>

From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>

Wrong comparison prevented the hash types to add a range with more than
2^31 addresses but reported as a success.

Fixes Netfilter's bugzilla id #1005, reported by Oleg Serditov and
Oliver Ford.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipset/ip_set_hash_ip.c         | 22 ++++++++++++----------
 net/netfilter/ipset/ip_set_hash_ipmark.c     |  2 +-
 net/netfilter/ipset/ip_set_hash_ipport.c     |  2 +-
 net/netfilter/ipset/ip_set_hash_ipportip.c   |  2 +-
 net/netfilter/ipset/ip_set_hash_ipportnet.c  |  4 ++--
 net/netfilter/ipset/ip_set_hash_net.c        |  2 +-
 net/netfilter/ipset/ip_set_hash_netiface.c   |  2 +-
 net/netfilter/ipset/ip_set_hash_netnet.c     |  4 ++--
 net/netfilter/ipset/ip_set_hash_netport.c    |  2 +-
 net/netfilter/ipset/ip_set_hash_netportnet.c |  4 ++--
 10 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index 20bfbd315f61..613eb212cb48 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -123,13 +123,12 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
 		return ret;
 
 	ip &= ip_set_hostmask(h->netmask);
+	e.ip = htonl(ip);
+	if (e.ip == 0)
+		return -IPSET_ERR_HASH_ELEM;
 
-	if (adt == IPSET_TEST) {
-		e.ip = htonl(ip);
-		if (e.ip == 0)
-			return -IPSET_ERR_HASH_ELEM;
+	if (adt == IPSET_TEST)
 		return adtfn(set, &e, &ext, &ext, flags);
-	}
 
 	ip_to = ip;
 	if (tb[IPSET_ATTR_IP_TO]) {
@@ -148,17 +147,20 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1);
 
-	if (retried)
+	if (retried) {
 		ip = ntohl(h->next.ip);
-	for (; !before(ip_to, ip); ip += hosts) {
 		e.ip = htonl(ip);
-		if (e.ip == 0)
-			return -IPSET_ERR_HASH_ELEM;
+	}
+	for (; ip <= ip_to;) {
 		ret = adtfn(set, &e, &ext, &ext, flags);
-
 		if (ret && !ip_set_eexist(ret, flags))
 			return ret;
 
+		ip += hosts;
+		e.ip = htonl(ip);
+		if (e.ip == 0)
+			return 0;
+
 		ret = 0;
 	}
 	return ret;
diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c
index b64cf14e8352..f3ba8348cf9d 100644
--- a/net/netfilter/ipset/ip_set_hash_ipmark.c
+++ b/net/netfilter/ipset/ip_set_hash_ipmark.c
@@ -149,7 +149,7 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	if (retried)
 		ip = ntohl(h->next.ip);
-	for (; !before(ip_to, ip); ip++) {
+	for (; ip <= ip_to; ip++) {
 		e.ip = htonl(ip);
 		ret = adtfn(set, &e, &ext, &ext, flags);
 
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index f438740e6c6a..ddb8039ec1d2 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -178,7 +178,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	if (retried)
 		ip = ntohl(h->next.ip);
-	for (; !before(ip_to, ip); ip++) {
+	for (; ip <= ip_to; ip++) {
 		p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
 						       : port;
 		for (; p <= port_to; p++) {
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index 6215fb898c50..a7f4d7a85420 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -185,7 +185,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	if (retried)
 		ip = ntohl(h->next.ip);
-	for (; !before(ip_to, ip); ip++) {
+	for (; ip <= ip_to; ip++) {
 		p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
 						       : port;
 		for (; p <= port_to; p++) {
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 5ab1b99a53c2..a2f19b9906e9 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -271,7 +271,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	if (retried)
 		ip = ntohl(h->next.ip);
-	for (; !before(ip_to, ip); ip++) {
+	for (; ip <= ip_to; ip++) {
 		e.ip = htonl(ip);
 		p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
 						       : port;
@@ -281,7 +281,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 			      ip == ntohl(h->next.ip) &&
 			      p == ntohs(h->next.port)
 				? ntohl(h->next.ip2) : ip2_from;
-			while (!after(ip2, ip2_to)) {
+			while (ip2 <= ip2_to) {
 				e.ip2 = htonl(ip2);
 				ip2_last = ip_set_range_to_cidr(ip2, ip2_to,
 								&cidr);
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index 5d9e895452e7..1c67a1761e45 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -193,7 +193,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
 	}
 	if (retried)
 		ip = ntohl(h->next.ip);
-	while (!after(ip, ip_to)) {
+	while (ip <= ip_to) {
 		e.ip = htonl(ip);
 		last = ip_set_range_to_cidr(ip, ip_to, &e.cidr);
 		ret = adtfn(set, &e, &ext, &ext, flags);
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index 44cf11939c91..d417074f1c1a 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -255,7 +255,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	if (retried)
 		ip = ntohl(h->next.ip);
-	while (!after(ip, ip_to)) {
+	while (ip <= ip_to) {
 		e.ip = htonl(ip);
 		last = ip_set_range_to_cidr(ip, ip_to, &e.cidr);
 		ret = adtfn(set, &e, &ext, &ext, flags);
diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c
index db614e13b193..7f9ae2e9645b 100644
--- a/net/netfilter/ipset/ip_set_hash_netnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -250,13 +250,13 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 	if (retried)
 		ip = ntohl(h->next.ip[0]);
 
-	while (!after(ip, ip_to)) {
+	while (ip <= ip_to) {
 		e.ip[0] = htonl(ip);
 		last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]);
 		ip2 = (retried &&
 		       ip == ntohl(h->next.ip[0])) ? ntohl(h->next.ip[1])
 						   : ip2_from;
-		while (!after(ip2, ip2_to)) {
+		while (ip2 <= ip2_to) {
 			e.ip[1] = htonl(ip2);
 			last2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]);
 			ret = adtfn(set, &e, &ext, &ext, flags);
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 54b64b6cd0cd..e6ef382febe4 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -241,7 +241,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	if (retried)
 		ip = ntohl(h->next.ip);
-	while (!after(ip, ip_to)) {
+	while (ip <= ip_to) {
 		e.ip = htonl(ip);
 		last = ip_set_range_to_cidr(ip, ip_to, &cidr);
 		e.cidr = cidr - 1;
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
index aff846960ac4..8602f2595a1a 100644
--- a/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -291,7 +291,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 	if (retried)
 		ip = ntohl(h->next.ip[0]);
 
-	while (!after(ip, ip_to)) {
+	while (ip <= ip_to) {
 		e.ip[0] = htonl(ip);
 		ip_last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]);
 		p = retried && ip == ntohl(h->next.ip[0]) ? ntohs(h->next.port)
@@ -301,7 +301,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 			ip2 = (retried && ip == ntohl(h->next.ip[0]) &&
 			       p == ntohs(h->next.port)) ? ntohl(h->next.ip[1])
 							 : ip2_from;
-			while (!after(ip2, ip2_to)) {
+			while (ip2 <= ip2_to) {
 				e.ip[1] = htonl(ip2);
 				ip2_last = ip_set_range_to_cidr(ip2, ip2_to,
 								&e.cidr[1]);
-- 
2.1.4

^ permalink raw reply related

* [PATCH 02/12] netfilter: xt_socket: Restore mark from full sockets only
From: Pablo Neira Ayuso @ 2017-10-09 16:25 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1507566346-32553-1-git-send-email-pablo@netfilter.org>

From: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>

An out of bounds error was detected on an ARM64 target with
Android based kernel 4.9. This occurs while trying to
restore mark on a skb from an inet request socket.

BUG: KASAN: slab-out-of-bounds in socket_match.isra.2+0xc8/0x1f0 net/netfilter/xt_socket.c:248
Read of size 4 at addr ffffffc06a8d824c by task syz-fuzzer/1532
CPU: 7 PID: 1532 Comm: syz-fuzzer Tainted: G        W  O    4.9.41+ #1
Call trace:
[<ffffff900808d2f8>] dump_backtrace+0x0/0x440 arch/arm64/kernel/traps.c:76
[<ffffff900808d760>] show_stack+0x28/0x38 arch/arm64/kernel/traps.c:226
[<ffffff90085f7dc8>] __dump_stack lib/dump_stack.c:15 [inline]
[<ffffff90085f7dc8>] dump_stack+0xe4/0x134 lib/dump_stack.c:51
[<ffffff900830f358>] print_address_description+0x68/0x258 mm/kasan/report.c:248
[<ffffff900830f770>] kasan_report_error mm/kasan/report.c:347 [inline]
[<ffffff900830f770>] kasan_report.part.2+0x228/0x2f0 mm/kasan/report.c:371
[<ffffff900830fdec>] kasan_report+0x5c/0x70 mm/kasan/report.c:372
[<ffffff900830de98>] check_memory_region_inline mm/kasan/kasan.c:308 [inline]
[<ffffff900830de98>] __asan_load4+0x88/0xa0 mm/kasan/kasan.c:740
[<ffffff90097498f8>] socket_match.isra.2+0xc8/0x1f0 net/netfilter/xt_socket.c:248
[<ffffff9009749a5c>] socket_mt4_v1_v2_v3+0x3c/0x48 net/netfilter/xt_socket.c:272
[<ffffff90097f7e4c>] ipt_do_table+0x54c/0xad8 net/ipv4/netfilter/ip_tables.c:311
[<ffffff90097fcf14>] iptable_mangle_hook+0x6c/0x220 net/ipv4/netfilter/iptable_mangle.c:90
...
Allocated by task 1532:
 save_stack_trace_tsk+0x0/0x2a0 arch/arm64/kernel/stacktrace.c:131
 save_stack_trace+0x28/0x38 arch/arm64/kernel/stacktrace.c:215
 save_stack mm/kasan/kasan.c:495 [inline]
 set_track mm/kasan/kasan.c:507 [inline]
 kasan_kmalloc+0xd8/0x188 mm/kasan/kasan.c:599
 kasan_slab_alloc+0x14/0x20 mm/kasan/kasan.c:537
 slab_post_alloc_hook mm/slab.h:417 [inline]
 slab_alloc_node mm/slub.c:2728 [inline]
 slab_alloc mm/slub.c:2736 [inline]
 kmem_cache_alloc+0x14c/0x2e8 mm/slub.c:2741
 reqsk_alloc include/net/request_sock.h:87 [inline]
 inet_reqsk_alloc+0x4c/0x238 net/ipv4/tcp_input.c:6236
 tcp_conn_request+0x2b0/0xea8 net/ipv4/tcp_input.c:6341
 tcp_v4_conn_request+0xe0/0x100 net/ipv4/tcp_ipv4.c:1256
 tcp_rcv_state_process+0x384/0x18a8 net/ipv4/tcp_input.c:5926
 tcp_v4_do_rcv+0x2f0/0x3e0 net/ipv4/tcp_ipv4.c:1430
 tcp_v4_rcv+0x1278/0x1350 net/ipv4/tcp_ipv4.c:1709
 ip_local_deliver_finish+0x174/0x3e0 net/ipv4/ip_input.c:216

v1->v2: Change socket_mt6_v1_v2_v3() as well as mentioned by Eric
v2->v3: Put the correct fixes tag

Fixes: 01555e74bde5 ("netfilter: xt_socket: add XT_SOCKET_RESTORESKMARK flag")
Signed-off-by: Subash Abhinov Kasiviswanathan <subashab@codeaurora.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_socket.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index e75ef39669c5..575d2153e3b8 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -76,7 +76,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 			transparent = nf_sk_is_transparent(sk);
 
 		if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
-		    transparent)
+		    transparent && sk_fullsock(sk))
 			pskb->mark = sk->sk_mark;
 
 		if (sk != skb->sk)
@@ -133,7 +133,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par)
 			transparent = nf_sk_is_transparent(sk);
 
 		if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard &&
-		    transparent)
+		    transparent && sk_fullsock(sk))
 			pskb->mark = sk->sk_mark;
 
 		if (sk != skb->sk)
-- 
2.1.4

^ permalink raw reply related

* [PATCH 05/12] netfilter: ipset: Fix race between dump and swap
From: Pablo Neira Ayuso @ 2017-10-09 16:25 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1507566346-32553-1-git-send-email-pablo@netfilter.org>

From: Ross Lagerwall <ross.lagerwall@citrix.com>

Fix a race between ip_set_dump_start() and ip_set_swap().
The race is as follows:
* Without holding the ref lock, ip_set_swap() checks ref_netlink of the
  set and it is 0.
* ip_set_dump_start() takes a reference on the set.
* ip_set_swap() does the swap (even though it now has a non-zero
  reference count).
* ip_set_dump_start() gets the set from ip_set_list again which is now a
  different set since it has been swapped.
* ip_set_dump_start() calls __ip_set_put_netlink() and hits a BUG_ON due
  to the reference count being 0.

Fix this race by extending the critical region in which the ref lock is
held to include checking the ref counts.

The race can be reproduced with the following script:
  while :; do
    ipset destroy hash_ip1
    ipset destroy hash_ip2
    ipset create hash_ip1 hash:ip family inet hashsize 1024 \
        maxelem 500000
    ipset create hash_ip2 hash:ip family inet hashsize 300000 \
        maxelem 500000
    ipset create hash_ip3 hash:ip family inet hashsize 1024 \
        maxelem 500000
    ipset save &
    ipset swap hash_ip3 hash_ip2
    ipset destroy hash_ip3
    wait
  done

Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
Acked-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipset/ip_set_core.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index a7f049ff3049..cf84f7b37cd9 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1191,14 +1191,17 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb,
 	      from->family == to->family))
 		return -IPSET_ERR_TYPE_MISMATCH;
 
-	if (from->ref_netlink || to->ref_netlink)
+	write_lock_bh(&ip_set_ref_lock);
+
+	if (from->ref_netlink || to->ref_netlink) {
+		write_unlock_bh(&ip_set_ref_lock);
 		return -EBUSY;
+	}
 
 	strncpy(from_name, from->name, IPSET_MAXNAMELEN);
 	strncpy(from->name, to->name, IPSET_MAXNAMELEN);
 	strncpy(to->name, from_name, IPSET_MAXNAMELEN);
 
-	write_lock_bh(&ip_set_ref_lock);
 	swap(from->ref, to->ref);
 	ip_set(inst, from_id) = to;
 	ip_set(inst, to_id) = from;
-- 
2.1.4

^ permalink raw reply related

* [PATCH 01/12] netfilter: ipvs: full-functionality option for ECN encapsulation in tunnel
From: Pablo Neira Ayuso @ 2017-10-09 16:25 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1507566346-32553-1-git-send-email-pablo@netfilter.org>

From: Vadim Fedorenko <vfedorenko@yandex-team.ru>

IPVS tunnel mode works as simple tunnel (see RFC 3168) copying ECN field
to outer header. That's result in packet drops on egress tunnels in case
the egress tunnel operates as ECN-capable with Full-functionality option
(like ip_tunnel and ip6_tunnel kernel modules), according to RFC 3168
section 9.1.1 recommendation.

This patch implements ECN full-functionality option into ipvs xmit code.

Cc: netdev@vger.kernel.org
Cc: lvs-devel@vger.kernel.org
Signed-off-by: Vadim Fedorenko <vfedorenko@yandex-team.ru>
Reviewed-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipvs/ip_vs_xmit.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 90d396814798..4527921b1c3a 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -921,6 +921,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
 {
 	struct sk_buff *new_skb = NULL;
 	struct iphdr *old_iph = NULL;
+	__u8 old_dsfield;
 #ifdef CONFIG_IP_VS_IPV6
 	struct ipv6hdr *old_ipv6h = NULL;
 #endif
@@ -945,7 +946,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
 			*payload_len =
 				ntohs(old_ipv6h->payload_len) +
 				sizeof(*old_ipv6h);
-		*dsfield = ipv6_get_dsfield(old_ipv6h);
+		old_dsfield = ipv6_get_dsfield(old_ipv6h);
 		*ttl = old_ipv6h->hop_limit;
 		if (df)
 			*df = 0;
@@ -960,12 +961,15 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
 
 		/* fix old IP header checksum */
 		ip_send_check(old_iph);
-		*dsfield = ipv4_get_dsfield(old_iph);
+		old_dsfield = ipv4_get_dsfield(old_iph);
 		*ttl = old_iph->ttl;
 		if (payload_len)
 			*payload_len = ntohs(old_iph->tot_len);
 	}
 
+	/* Implement full-functionality option for ECN encapsulation */
+	*dsfield = INET_ECN_encapsulate(old_dsfield, old_dsfield);
+
 	return skb;
 error:
 	kfree_skb(skb);
-- 
2.1.4

^ permalink raw reply related

* [PATCH 08/12] netfilter: nf_tables: Release memory obtained by kasprintf
From: Pablo Neira Ayuso @ 2017-10-09 16:25 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1507566346-32553-1-git-send-email-pablo@netfilter.org>

From: Arvind Yadav <arvind.yadav.cs@gmail.com>

Free memory region, if nf_tables_set_alloc_name is not successful.

Fixes: 387454901bd6 ("netfilter: nf_tables: Allow set names of up to 255 chars")
Signed-off-by: Arvind Yadav <arvind.yadav.cs@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index f98ca8c6aa59..34adedcb239e 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2741,8 +2741,10 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
 	list_for_each_entry(i, &ctx->table->sets, list) {
 		if (!nft_is_active_next(ctx->net, i))
 			continue;
-		if (!strcmp(set->name, i->name))
+		if (!strcmp(set->name, i->name)) {
+			kfree(set->name);
 			return -ENFILE;
+		}
 	}
 	return 0;
 }
-- 
2.1.4

^ permalink raw reply related

* [PATCH 07/12] netfilter: ebtables: fix race condition in frame_filter_net_init()
From: Pablo Neira Ayuso @ 2017-10-09 16:25 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1507566346-32553-1-git-send-email-pablo@netfilter.org>

From: Artem Savkov <asavkov@redhat.com>

It is possible for ebt_in_hook to be triggered before ebt_table is assigned
resulting in a NULL-pointer dereference. Make sure hooks are
registered as the last step.

Fixes: aee12a0a3727 ("ebtables: remove nf_hook_register usage")
Signed-off-by: Artem Savkov <asavkov@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_bridge/ebtables.h |  7 ++++---
 net/bridge/netfilter/ebtable_broute.c     |  4 ++--
 net/bridge/netfilter/ebtable_filter.c     |  4 ++--
 net/bridge/netfilter/ebtable_nat.c        |  4 ++--
 net/bridge/netfilter/ebtables.c           | 17 +++++++++--------
 5 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h
index 2c2a5514b0df..528b24c78308 100644
--- a/include/linux/netfilter_bridge/ebtables.h
+++ b/include/linux/netfilter_bridge/ebtables.h
@@ -108,9 +108,10 @@ struct ebt_table {
 
 #define EBT_ALIGN(s) (((s) + (__alignof__(struct _xt_align)-1)) & \
 		     ~(__alignof__(struct _xt_align)-1))
-extern struct ebt_table *ebt_register_table(struct net *net,
-					    const struct ebt_table *table,
-					    const struct nf_hook_ops *);
+extern int ebt_register_table(struct net *net,
+			      const struct ebt_table *table,
+			      const struct nf_hook_ops *ops,
+			      struct ebt_table **res);
 extern void ebt_unregister_table(struct net *net, struct ebt_table *table,
 				 const struct nf_hook_ops *);
 extern unsigned int ebt_do_table(struct sk_buff *skb,
diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c
index 2585b100ebbb..276b60262981 100644
--- a/net/bridge/netfilter/ebtable_broute.c
+++ b/net/bridge/netfilter/ebtable_broute.c
@@ -65,8 +65,8 @@ static int ebt_broute(struct sk_buff *skb)
 
 static int __net_init broute_net_init(struct net *net)
 {
-	net->xt.broute_table = ebt_register_table(net, &broute_table, NULL);
-	return PTR_ERR_OR_ZERO(net->xt.broute_table);
+	return ebt_register_table(net, &broute_table, NULL,
+				  &net->xt.broute_table);
 }
 
 static void __net_exit broute_net_exit(struct net *net)
diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c
index 45a00dbdbcad..c41da5fac84f 100644
--- a/net/bridge/netfilter/ebtable_filter.c
+++ b/net/bridge/netfilter/ebtable_filter.c
@@ -93,8 +93,8 @@ static const struct nf_hook_ops ebt_ops_filter[] = {
 
 static int __net_init frame_filter_net_init(struct net *net)
 {
-	net->xt.frame_filter = ebt_register_table(net, &frame_filter, ebt_ops_filter);
-	return PTR_ERR_OR_ZERO(net->xt.frame_filter);
+	return ebt_register_table(net, &frame_filter, ebt_ops_filter,
+				  &net->xt.frame_filter);
 }
 
 static void __net_exit frame_filter_net_exit(struct net *net)
diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c
index 57cd5bb154e7..08df7406ecb3 100644
--- a/net/bridge/netfilter/ebtable_nat.c
+++ b/net/bridge/netfilter/ebtable_nat.c
@@ -93,8 +93,8 @@ static const struct nf_hook_ops ebt_ops_nat[] = {
 
 static int __net_init frame_nat_net_init(struct net *net)
 {
-	net->xt.frame_nat = ebt_register_table(net, &frame_nat, ebt_ops_nat);
-	return PTR_ERR_OR_ZERO(net->xt.frame_nat);
+	return ebt_register_table(net, &frame_nat, ebt_ops_nat,
+				  &net->xt.frame_nat);
 }
 
 static void __net_exit frame_nat_net_exit(struct net *net)
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 83951f978445..3b3dcf719e07 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1169,9 +1169,8 @@ static void __ebt_unregister_table(struct net *net, struct ebt_table *table)
 	kfree(table);
 }
 
-struct ebt_table *
-ebt_register_table(struct net *net, const struct ebt_table *input_table,
-		   const struct nf_hook_ops *ops)
+int ebt_register_table(struct net *net, const struct ebt_table *input_table,
+		       const struct nf_hook_ops *ops, struct ebt_table **res)
 {
 	struct ebt_table_info *newinfo;
 	struct ebt_table *t, *table;
@@ -1183,7 +1182,7 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table,
 	    repl->entries == NULL || repl->entries_size == 0 ||
 	    repl->counters != NULL || input_table->private != NULL) {
 		BUGPRINT("Bad table data for ebt_register_table!!!\n");
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
 	/* Don't add one table to multiple lists. */
@@ -1252,16 +1251,18 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table,
 	list_add(&table->list, &net->xt.tables[NFPROTO_BRIDGE]);
 	mutex_unlock(&ebt_mutex);
 
+	WRITE_ONCE(*res, table);
+
 	if (!ops)
-		return table;
+		return 0;
 
 	ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
 	if (ret) {
 		__ebt_unregister_table(net, table);
-		return ERR_PTR(ret);
+		*res = NULL;
 	}
 
-	return table;
+	return ret;
 free_unlock:
 	mutex_unlock(&ebt_mutex);
 free_chainstack:
@@ -1276,7 +1277,7 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table,
 free_table:
 	kfree(table);
 out:
-	return ERR_PTR(ret);
+	return ret;
 }
 
 void ebt_unregister_table(struct net *net, struct ebt_table *table,
-- 
2.1.4

^ permalink raw reply related

* [PATCH 11/12] netfilter: SYNPROXY: skip non-tcp packet in {ipv4, ipv6}_synproxy_hook
From: Pablo Neira Ayuso @ 2017-10-09 16:25 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1507566346-32553-1-git-send-email-pablo@netfilter.org>

From: Lin Zhang <xiaolou4617@gmail.com>

In function {ipv4,ipv6}_synproxy_hook we expect a normal tcp packet, but
the real server maybe reply an icmp error packet related to the exist
tcp conntrack, so we will access wrong tcp data.

Fix it by checking for the protocol field and only process tcp traffic.

Signed-off-by: Lin Zhang <xiaolou4617@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ipt_SYNPROXY.c  | 3 ++-
 net/ipv6/netfilter/ip6t_SYNPROXY.c | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 811689e523c3..f75fc6b53115 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -330,7 +330,8 @@ static unsigned int ipv4_synproxy_hook(void *priv,
 	if (synproxy == NULL)
 		return NF_ACCEPT;
 
-	if (nf_is_loopback_packet(skb))
+	if (nf_is_loopback_packet(skb) ||
+	    ip_hdr(skb)->protocol != IPPROTO_TCP)
 		return NF_ACCEPT;
 
 	thoff = ip_hdrlen(skb);
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index a5cd43d75393..437af8c95277 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -353,7 +353,7 @@ static unsigned int ipv6_synproxy_hook(void *priv,
 	nexthdr = ipv6_hdr(skb)->nexthdr;
 	thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
 				 &frag_off);
-	if (thoff < 0)
+	if (thoff < 0 || nexthdr != IPPROTO_TCP)
 		return NF_ACCEPT;
 
 	th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
-- 
2.1.4

^ permalink raw reply related

* [PATCH 12/12] netfilter: xt_bpf: Fix XT_BPF_MODE_FD_PINNED mode of 'xt_bpf_info_v1'
From: Pablo Neira Ayuso @ 2017-10-09 16:25 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1507566346-32553-1-git-send-email-pablo@netfilter.org>

From: Shmulik Ladkani <shmulik.ladkani@gmail.com>

Commit 2c16d6033264 ("netfilter: xt_bpf: support ebpf") introduced
support for attaching an eBPF object by an fd, with the
'bpf_mt_check_v1' ABI expecting the '.fd' to be specified upon each
IPT_SO_SET_REPLACE call.

However this breaks subsequent iptables calls:

 # iptables -A INPUT -m bpf --object-pinned /sys/fs/bpf/xxx -j ACCEPT
 # iptables -A INPUT -s 5.6.7.8 -j ACCEPT
 iptables: Invalid argument. Run `dmesg' for more information.

That's because iptables works by loading existing rules using
IPT_SO_GET_ENTRIES to userspace, then issuing IPT_SO_SET_REPLACE with
the replacement set.

However, the loaded 'xt_bpf_info_v1' has an arbitrary '.fd' number
(from the initial "iptables -m bpf" invocation) - so when 2nd invocation
occurs, userspace passes a bogus fd number, which leads to
'bpf_mt_check_v1' to fail.

One suggested solution [1] was to hack iptables userspace, to perform a
"entries fixup" immediatley after IPT_SO_GET_ENTRIES, by opening a new,
process-local fd per every 'xt_bpf_info_v1' entry seen.

However, in [2] both Pablo Neira Ayuso and Willem de Bruijn suggested to
depricate the xt_bpf_info_v1 ABI dealing with pinned ebpf objects.

This fix changes the XT_BPF_MODE_FD_PINNED behavior to ignore the given
'.fd' and instead perform an in-kernel lookup for the bpf object given
the provided '.path'.

It also defines an alias for the XT_BPF_MODE_FD_PINNED mode, named
XT_BPF_MODE_PATH_PINNED, to better reflect the fact that the user is
expected to provide the path of the pinned object.

Existing XT_BPF_MODE_FD_ELF behavior (non-pinned fd mode) is preserved.

References: [1] https://marc.info/?l=netfilter-devel&m=150564724607440&w=2
            [2] https://marc.info/?l=netfilter-devel&m=150575727129880&w=2

Reported-by: Rafael Buchbinder <rafi@rbk.ms>
Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/bpf.h                   |  5 +++++
 include/uapi/linux/netfilter/xt_bpf.h |  1 +
 kernel/bpf/inode.c                    |  1 +
 net/netfilter/xt_bpf.c                | 22 ++++++++++++++++++++--
 4 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8390859e79e7..f1af7d63d678 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -368,6 +368,11 @@ static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
 {
 }
 
+static inline int bpf_obj_get_user(const char __user *pathname)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
 						       u32 key)
 {
diff --git a/include/uapi/linux/netfilter/xt_bpf.h b/include/uapi/linux/netfilter/xt_bpf.h
index b97725af2ac0..da161b56c79e 100644
--- a/include/uapi/linux/netfilter/xt_bpf.h
+++ b/include/uapi/linux/netfilter/xt_bpf.h
@@ -23,6 +23,7 @@ enum xt_bpf_modes {
 	XT_BPF_MODE_FD_PINNED,
 	XT_BPF_MODE_FD_ELF,
 };
+#define XT_BPF_MODE_PATH_PINNED XT_BPF_MODE_FD_PINNED
 
 struct xt_bpf_info_v1 {
 	__u16 mode;
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index e833ed914358..be1dde967208 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -363,6 +363,7 @@ int bpf_obj_get_user(const char __user *pathname)
 	putname(pname);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(bpf_obj_get_user);
 
 static void bpf_evict_inode(struct inode *inode)
 {
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index 38986a95216c..29123934887b 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/syscalls.h>
 #include <linux/skbuff.h>
 #include <linux/filter.h>
 #include <linux/bpf.h>
@@ -49,6 +50,22 @@ static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret)
 	return 0;
 }
 
+static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret)
+{
+	mm_segment_t oldfs = get_fs();
+	int retval, fd;
+
+	set_fs(KERNEL_DS);
+	fd = bpf_obj_get_user(path);
+	set_fs(oldfs);
+	if (fd < 0)
+		return fd;
+
+	retval = __bpf_mt_check_fd(fd, ret);
+	sys_close(fd);
+	return retval;
+}
+
 static int bpf_mt_check(const struct xt_mtchk_param *par)
 {
 	struct xt_bpf_info *info = par->matchinfo;
@@ -66,9 +83,10 @@ static int bpf_mt_check_v1(const struct xt_mtchk_param *par)
 		return __bpf_mt_check_bytecode(info->bpf_program,
 					       info->bpf_program_num_elem,
 					       &info->filter);
-	else if (info->mode == XT_BPF_MODE_FD_PINNED ||
-		 info->mode == XT_BPF_MODE_FD_ELF)
+	else if (info->mode == XT_BPF_MODE_FD_ELF)
 		return __bpf_mt_check_fd(info->fd, &info->filter);
+	else if (info->mode == XT_BPF_MODE_PATH_PINNED)
+		return __bpf_mt_check_path(info->path, &info->filter);
 	else
 		return -EINVAL;
 }
-- 
2.1.4

^ permalink raw reply related

* Re: [PATCH] net/mlx4_core: Convert timers to use timer_setup()
From: Doug Ledford @ 2017-10-09 16:26 UTC (permalink / raw)
  To: Kees Cook, linux-kernel; +Cc: Tariq Toukan, netdev, linux-rdma, Thomas Gleixner
In-Reply-To: <20171005005154.GA23500@beast>

On Wed, 2017-10-04 at 17:51 -0700, Kees Cook wrote:
> In preparation for unconditionally passing the struct timer_list
> pointer to
> all timer callbacks, switch to using the new timer_setup() and
> from_timer()
> to pass the timer pointer explicitly.
> 
> Cc: Tariq Toukan <tariqt@mellanox.com>
> Cc: netdev@vger.kernel.org
> Cc: linux-rdma@vger.kernel.org
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Signed-off-by: Kees Cook <keescook@chromium.org>
> ---
> This requires commit 686fef928bba ("timer: Prepare to change timer
> callback argument type") in v4.14-rc3, but should be otherwise
> stand-alone.

Thanks, applied.

-- 
Doug Ledford <dledford@redhat.com>
    GPG KeyID: B826A3330E572FDD
    Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD

^ permalink raw reply

* RE: [PATCH v3 2/3] net: phy: DP83822 initial driver submission
From: Woojung.Huh @ 2017-10-09 16:27 UTC (permalink / raw)
  To: dmurphy, andrew, f.fainelli; +Cc: netdev, afd
In-Reply-To: <20171009120302.23611-2-dmurphy@ti.com>

> Subject: [PATCH v3 2/3] net: phy: DP83822 initial driver submission
> 
> Add support for the TI  DP83822 10/100Mbit ethernet phy.
> 
> The DP83822 provides flexibility to connect to a MAC through a
> standard MII, RMII or RGMII interface.
> 
> Datasheet:
> http://www.ti.com/product/DP83822I/datasheet
> 
> Signed-off-by: Dan Murphy <dmurphy@ti.com>
> ---

Reviewed-by: Woojung Huh <woojung.huh@microchip.com>

^ permalink raw reply

* Re: [net-next 14/15] i40e: ignore skb->xmit_more when deciding to set RS bit
From: Alexander Duyck @ 2017-10-09 16:28 UTC (permalink / raw)
  To: David Laight
  Cc: Jeff Kirsher, davem@davemloft.net, Jacob Keller,
	netdev@vger.kernel.org, nhorman@redhat.com, sassmann@redhat.com,
	jogreene@redhat.com
In-Reply-To: <063D6719AE5E284EB5DD2968C1650D6DD008D373@AcuExch.aculab.com>

On Mon, Oct 9, 2017 at 2:07 AM, David Laight <David.Laight@aculab.com> wrote:
> From: Jeff Kirsher
>> Sent: 06 October 2017 18:57
>> From: Jacob Keller <jacob.e.keller@intel.com>
>>
>> Since commit 6a7fded776a7 ("i40e: Fix RS bit update in Tx path and
>> disable force WB workaround") we've tried to "optimize" setting the
>> RS bit based around skb->xmit_more. This same logic was refactored
>> in commit 1dc8b538795f ("i40e: Reorder logic for coalescing RS bits"),
>> but ultimately was not functionally changed.
>
> I've tried to understand the above, but without definitions of WB
> and RS and the '4-ness' of something it is quite difficult.
>
> I THINK this is all about telling the hardware there is a packet
> in the ring to transmit?
>
> I don't understand the 4-ness. Linux requires that the hardware
> be notified of a single packet transmit, and that the 'transmit
> complete' also be notified in 'a timely manner' for a single packet.

So to clarify some of this. The RS is short for Report Status. It
tells the hardware that when it has completed the descriptor it should
trigger a write back, aka WB.

The 4-ness is because each descriptor is 16 bytes, so if we write back
once every 4 descriptors that is one 64B cache line which is much
better for most platforms performance wise.

> skb->xmit_more ought to be usable to optimise both of these
> (assuming it isn't a lie).

Actually it leads to issues if we hold off for too long. If we are
busy dequeueing a long list, and the Tx queue has gone empty then we
are stalling Tx without need to do so. We need to be regularly
notifying the device that it has buffers available to transmit which
is what xmit_more is good for. However in addition the hardware needs
to notify us regularly that there are buffers ready to clean which it
isn't necessarily so useful for, especially if  are filling the entire
ring and only providing one notification to the driver that there are
buffers ready since we can defer the Tx interrupt for too long.

What Jake is trying to fix here is to prevent us from generating what
looks like a square wave in terms of throughput. Essentially the
current behavior that is being seen is that all the buffers in the
ring either belong to software, or belong to hardware. It is best for
us to have this split half and half so that the hardware has buffers
to transmit and software has buffers to clean and enqueue new buffers
onto.

> The driver would need to ensure a ring full of messages isn't
> generated without either wakeup - but that might be 128 frames.

That is what is happening here. Basically we are guaranteeing
writebacks are occurring on a regular basis instead of in large
bursts.

> FWIW on the systems I have PCIe writes are relatively cheap
> (reads are slow). So different counts would be appropriate
> for delaying doorbell writes and requesting completion interrupts.
>
>         David

The doorbell writes are still being delayed the same amount with this
patch. The only thing that was split out was the device write backs
are now occurring on a more regular basis instead of being held off
until a much larger set is handled.

- Alex

^ permalink raw reply

* Re: [PATCH net] ipv6: Fix traffic triggered IPsec connections.
From: David Miller @ 2017-10-09 16:41 UTC (permalink / raw)
  To: steffen.klassert; +Cc: tobias, weiwan, netdev
In-Reply-To: <20171009063943.GG3149@secunet.com>

From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Mon, 9 Oct 2017 08:39:43 +0200

> A recent patch removed the dst_free() on the allocated
> dst_entry in ipv6_blackhole_route(). The dst_free() marked
> the dst_entry as dead and added it to the gc list. I.e. it
> was setup for a one time usage. As a result we may now have
> a blackhole route cached at a socket on some IPsec scenarios.
> This makes the connection unusable.
> 
> Fix this by marking the dst_entry directly at allocation time
> as 'dead', so it is used only once.
> 
> Fixes: 587fea741134 ("ipv6: mark DST_NOGC and remove the operation of dst_free()")
> Reported-by: Tobias Brunner <tobias@strongswan.org>
> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>

Applied and queued up for -stable, thanks.

^ permalink raw reply

* Re: [PATCH net] ipv4: Fix traffic triggered IPsec connections.
From: David Miller @ 2017-10-09 16:41 UTC (permalink / raw)
  To: steffen.klassert; +Cc: tobias, weiwan, netdev
In-Reply-To: <20171009064355.GI3149@secunet.com>

From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Mon, 9 Oct 2017 08:43:55 +0200

> A recent patch removed the dst_free() on the allocated
> dst_entry in ipv4_blackhole_route(). The dst_free() marked the
> dst_entry as dead and added it to the gc list. I.e. it was setup
> for a one time usage. As a result we may now have a blackhole
> route cached at a socket on some IPsec scenarios. This makes the
> connection unusable.
> 
> Fix this by marking the dst_entry directly at allocation time
> as 'dead', so it is used only once.
> 
> Fixes: b838d5e1c5b6 ("ipv4: mark DST_NOGC and remove the operation of dst_free()")
> Reported-by: Tobias Brunner <tobias@strongswan.org>
> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>

Applied and queued up for -stable.

^ permalink raw reply

* Re: [PATCH net-next] ipv6: avoid zeroing per cpu data again
From: Martin KaFai Lau @ 2017-10-09 16:41 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev, Tejun Heo
In-Reply-To: <1507554097.31614.11.camel@edumazet-glaptop3.roam.corp.google.com>

On Mon, Oct 09, 2017 at 01:01:37PM +0000, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
> 
> per cpu allocations are already zeroed, no need to clear them again.
> 
> Fixes: d52d3997f843f ("ipv6: Create percpu rt6_info")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Martin KaFai Lau <kafai@fb.com>
> Cc: Tejun Heo <tj@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>

^ permalink raw reply

* Re: [PATCH v3 1/3] net: phy: Remove TI DP83822 from DP83848 driver
From: Florian Fainelli @ 2017-10-09 16:44 UTC (permalink / raw)
  To: Dan Murphy, andrew; +Cc: netdev, Woojung.Huh, afd
In-Reply-To: <20171009120302.23611-1-dmurphy@ti.com>

On 10/09/2017 05:03 AM, Dan Murphy wrote:
> Removing the DP83822 device from the DP83848 to
> support the TI DP83822 dedicated driver that will
> initially support WoL settings.
Hi Dan,

The ordering of patch 1 and 2 may have to be reversed, otherwise you are
leaving people with the Generic PHY driver matching the DP83822 PHY
after applying patch 1, and without proper interrupt management again
unless they apply patch 2.

It may even be better to combine patch 1 and 2 actually to have a
cleaner transition.

Sorry for noticing this so late in the series...

> 
> Signed-off-by: Dan Murphy <dmurphy@ti.com>
> ---
> 
> v3 - No changes made
> v2 - There was no v1 on this patch this is new.
> 
>  drivers/net/phy/dp83848.c | 3 ---
>  1 file changed, 3 deletions(-)
> 
> diff --git a/drivers/net/phy/dp83848.c b/drivers/net/phy/dp83848.c
> index 3de4fe4dda77..3966d43c5146 100644
> --- a/drivers/net/phy/dp83848.c
> +++ b/drivers/net/phy/dp83848.c
> @@ -20,7 +20,6 @@
>  #define TI_DP83620_PHY_ID		0x20005ce0
>  #define NS_DP83848C_PHY_ID		0x20005c90
>  #define TLK10X_PHY_ID			0x2000a210
> -#define TI_DP83822_PHY_ID		0x2000a240
>  
>  /* Registers */
>  #define DP83848_MICR			0x11 /* MII Interrupt Control Register */
> @@ -80,7 +79,6 @@ static struct mdio_device_id __maybe_unused dp83848_tbl[] = {
>  	{ NS_DP83848C_PHY_ID, 0xfffffff0 },
>  	{ TI_DP83620_PHY_ID, 0xfffffff0 },
>  	{ TLK10X_PHY_ID, 0xfffffff0 },
> -	{ TI_DP83822_PHY_ID, 0xfffffff0 },
>  	{ }
>  };
>  MODULE_DEVICE_TABLE(mdio, dp83848_tbl);
> @@ -110,7 +108,6 @@ static struct phy_driver dp83848_driver[] = {
>  	DP83848_PHY_DRIVER(NS_DP83848C_PHY_ID, "NS DP83848C 10/100 Mbps PHY"),
>  	DP83848_PHY_DRIVER(TI_DP83620_PHY_ID, "TI DP83620 10/100 Mbps PHY"),
>  	DP83848_PHY_DRIVER(TLK10X_PHY_ID, "TI TLK10X 10/100 Mbps PHY"),
> -	DP83848_PHY_DRIVER(TI_DP83822_PHY_ID, "TI DP83822 10/100 Mbps PHY"),
>  };
>  module_phy_driver(dp83848_driver);
>  
> 


-- 
Florian

^ permalink raw reply

* Re: pull request (net): ipsec 2017-10-09
From: David Miller @ 2017-10-09 16:44 UTC (permalink / raw)
  To: steffen.klassert; +Cc: herbert, netdev
In-Reply-To: <1507533399-29399-1-git-send-email-steffen.klassert@secunet.com>

From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Mon, 9 Oct 2017 09:16:35 +0200

> 1) Fix some error paths of the IPsec offloading API.
> 
> 2) Fix a NULL pointer dereference when IPsec is used
>    with vti. From Alexey Kodanev.
> 
> 3) Don't call xfrm_policy_cache_flush under xfrm_state_lock,
>    it triggers several locking warnings. From Artem Savkov.
> 
> Please pull or let me know if there are problems.

Pulled, thank you.

^ permalink raw reply

* [PATCH] net: thunderx: mark expected switch fall-throughs in nicvf_main()
From: Gustavo A. R. Silva @ 2017-10-09 16:44 UTC (permalink / raw)
  To: Sunil Goutham, Robert Richter
  Cc: linux-arm-kernel, netdev, linux-kernel, Gustavo A. R. Silva

In preparation to enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.

Cc: Sunil Goutham <sgoutham@cavium.com>
Cc: Robert Richter <rric@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Cc: netdev@vger.kernel.org
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
 drivers/net/ethernet/cavium/thunder/nicvf_main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index d68478a..71989e1 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -566,8 +566,10 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
 		return true;
 	default:
 		bpf_warn_invalid_xdp_action(action);
+		/* fall through */
 	case XDP_ABORTED:
 		trace_xdp_exception(nic->netdev, prog, action);
+		/* fall through */
 	case XDP_DROP:
 		/* Check if it's a recycled page, if not
 		 * unmap the DMA mapping.
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH v3 1/3] net: phy: Remove TI DP83822 from DP83848 driver
From: Dan Murphy @ 2017-10-09 16:45 UTC (permalink / raw)
  To: Florian Fainelli, andrew; +Cc: netdev, Woojung.Huh, afd
In-Reply-To: <b1dc0e7e-7024-7111-1a15-59a114207555@gmail.com>

Florian

On 10/09/2017 11:44 AM, Florian Fainelli wrote:
> On 10/09/2017 05:03 AM, Dan Murphy wrote:
>> Removing the DP83822 device from the DP83848 to
>> support the TI DP83822 dedicated driver that will
>> initially support WoL settings.
> Hi Dan,
> 
> The ordering of patch 1 and 2 may have to be reversed, otherwise you are
> leaving people with the Generic PHY driver matching the DP83822 PHY
> after applying patch 1, and without proper interrupt management again
> unless they apply patch 2.
> 
> It may even be better to combine patch 1 and 2 actually to have a
> cleaner transition.
> 

No worries I will squash patch 1 and 2.

Dan

> Sorry for noticing this so late in the series...
> 
>>
>> Signed-off-by: Dan Murphy <dmurphy@ti.com>
>> ---
>>
>> v3 - No changes made
>> v2 - There was no v1 on this patch this is new.
>>
>>  drivers/net/phy/dp83848.c | 3 ---
>>  1 file changed, 3 deletions(-)
>>
>> diff --git a/drivers/net/phy/dp83848.c b/drivers/net/phy/dp83848.c
>> index 3de4fe4dda77..3966d43c5146 100644
>> --- a/drivers/net/phy/dp83848.c
>> +++ b/drivers/net/phy/dp83848.c
>> @@ -20,7 +20,6 @@
>>  #define TI_DP83620_PHY_ID		0x20005ce0
>>  #define NS_DP83848C_PHY_ID		0x20005c90
>>  #define TLK10X_PHY_ID			0x2000a210
>> -#define TI_DP83822_PHY_ID		0x2000a240
>>  
>>  /* Registers */
>>  #define DP83848_MICR			0x11 /* MII Interrupt Control Register */
>> @@ -80,7 +79,6 @@ static struct mdio_device_id __maybe_unused dp83848_tbl[] = {
>>  	{ NS_DP83848C_PHY_ID, 0xfffffff0 },
>>  	{ TI_DP83620_PHY_ID, 0xfffffff0 },
>>  	{ TLK10X_PHY_ID, 0xfffffff0 },
>> -	{ TI_DP83822_PHY_ID, 0xfffffff0 },
>>  	{ }
>>  };
>>  MODULE_DEVICE_TABLE(mdio, dp83848_tbl);
>> @@ -110,7 +108,6 @@ static struct phy_driver dp83848_driver[] = {
>>  	DP83848_PHY_DRIVER(NS_DP83848C_PHY_ID, "NS DP83848C 10/100 Mbps PHY"),
>>  	DP83848_PHY_DRIVER(TI_DP83620_PHY_ID, "TI DP83620 10/100 Mbps PHY"),
>>  	DP83848_PHY_DRIVER(TLK10X_PHY_ID, "TI TLK10X 10/100 Mbps PHY"),
>> -	DP83848_PHY_DRIVER(TI_DP83822_PHY_ID, "TI DP83822 10/100 Mbps PHY"),
>>  };
>>  module_phy_driver(dp83848_driver);
>>  
>>
> 
> 


-- 
------------------
Dan Murphy

^ permalink raw reply

* Re: [PATCH v3 3/3] net: phy: Change error to EINVAL for invalid MAC
From: Florian Fainelli @ 2017-10-09 16:47 UTC (permalink / raw)
  To: Dan Murphy, andrew; +Cc: netdev, Woojung.Huh, afd
In-Reply-To: <20171009120302.23611-3-dmurphy@ti.com>

On 10/09/2017 05:03 AM, Dan Murphy wrote:
> Change the return error code to EINVAL if the MAC
> address is not valid in the set_wol function.

Looks fine to me, since you are respining, do you mind using "net: phy:
at803x:  Change error to EINVAL for invalid MAC" as a subject to further
specify what this is about?

Thanks!

> 
> Signed-off-by: Dan Murphy <dmurphy@ti.com>
> ---
> 
> v3 - No changes made
> v2 - There was no v1 on this patch this is new.
> 
>  drivers/net/phy/at803x.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c
> index c1e52b9dc58d..5f93e6add563 100644
> --- a/drivers/net/phy/at803x.c
> +++ b/drivers/net/phy/at803x.c
> @@ -167,7 +167,7 @@ static int at803x_set_wol(struct phy_device *phydev,
>  		mac = (const u8 *) ndev->dev_addr;
>  
>  		if (!is_valid_ether_addr(mac))
> -			return -EFAULT;
> +			return -EINVAL;
>  
>  		for (i = 0; i < 3; i++) {
>  			phy_write(phydev, AT803X_MMD_ACCESS_CONTROL,
> 


-- 
Florian

^ permalink raw reply

* Re: [PATCH net-next 0/7] A few cleanup for hns3 ethernet driver
From: David Miller @ 2017-10-09 16:50 UTC (permalink / raw)
  To: linyunsheng
  Cc: huangdaode, xuwei5, liguozhu, Yisen.Zhuang, gabriele.paoloni,
	john.garry, linuxarm, salil.mehta, lipeng321, netdev,
	linux-kernel
In-Reply-To: <1507535041-204956-1-git-send-email-linyunsheng@huawei.com>

From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Mon, 9 Oct 2017 15:43:54 +0800

> This patchset contains a few cleanup for hns3 ethernet driver.
> No functional change intended.

Series applied, thank you.

^ permalink raw reply

* Re: pull request: linux-firmware: update cxgb4 firmware
From: Ben Hutchings @ 2017-10-09 16:50 UTC (permalink / raw)
  To: Ganesh Goudar, linux-firmware
  Cc: linux-kernel, nirranjan, indranil, venkatesh, netdev
In-Reply-To: <1506525877-6675-1-git-send-email-ganeshgr@chelsio.com>

[-- Attachment #1: Type: text/plain, Size: 819 bytes --]

On Wed, 2017-09-27 at 20:54 +0530, Ganesh Goudar wrote:
> Hi,
> 
> Kindly pull the new firmware from the following URL.
> git://git.chelsio.net/pub/git/linux-firmware.git for-upstream

Pulled, thanks.

Ben.

> Thanks
> Ganesh
> 
> The following changes since commit 59bf7e2cad88269ea704eb270d7ac6e69e8a544c:
> 
>   cxgb4: update firmware to revision 1.16.63.0 (2017-09-27 08:14:29 -0700)
> 
> are available in the git repository at:
> 
>   git://git.chelsio.net/pub/git/linux-firmware.git for-upstream
> 
> for you to fetch changes up to 59bf7e2cad88269ea704eb270d7ac6e69e8a544c:
> 
>   cxgb4: update firmware to revision 1.16.63.0 (2017-09-27 08:14:29 -0700)
> 
> ----------------------------------------------------------------
-- 
Ben Hutchings
Humour is the best antidote to reality.


[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH net-next 00/15] nfp: bpf ABIv2 and multi port
From: David Miller @ 2017-10-09 16:52 UTC (permalink / raw)
  To: jakub.kicinski; +Cc: netdev, oss-drivers
In-Reply-To: <20171009040417.22172-1-jakub.kicinski@netronome.com>

From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Sun,  8 Oct 2017 21:04:02 -0700

> This series migrates our eBPF offload from old PoC firmware to 
> a redesigned, faster and more feature rich FW.  Marking support
> is dropped for now.  We have to teach the JIT about encoding
> local memory accesses (one of NFP memory types).  There is also
> code to populate the ECC of instructions (PoC had ECC protection
> on instruction store disabled).  There is also a minor ld_field
> fix and all 64 bit shifts can now be encoded.

Thank you for structuring this series the way you did, as this made it
very easy to review.

Series applied.

^ permalink raw reply

* [PATCH net-next 0/2] ipv6: addrlabel: avoid dirtying ip6addrlbl_entry
From: Eric Dumazet @ 2017-10-09 16:52 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet, Hideaki YOSHIFUJI

The refcount on ip6addrlbl_entry is only used to make sure ip6addrlbl_entry
does not disappear while ip6addrlbl_get() is allocating an skb.

We can instead allocate skb first, then use RCU, so that we no longer need
to refcount these structures.

Eric Dumazet (2):
  ipv6: addrlabel: rework ip6addrlbl_get()
  ipv6: addrlabel: remove refcounting

 net/ipv6/addrlabel.c | 69 +++++++++++++---------------------------------------
 1 file changed, 17 insertions(+), 52 deletions(-)

-- 
2.14.2.920.gcf0c67979c-goog

^ permalink raw reply

* [PATCH net-next 1/2] ipv6: addrlabel: rework ip6addrlbl_get()
From: Eric Dumazet @ 2017-10-09 16:52 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet, Hideaki YOSHIFUJI
In-Reply-To: <20171009165225.7008-1-edumazet@google.com>

If we allocate skb before the lookup, we can use RCU
without the need of ip6addrlbl_hold()

This means that the following patch can get rid of refcounting.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv6/addrlabel.c | 36 +++++++++++++-----------------------
 1 file changed, 13 insertions(+), 23 deletions(-)

diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index c6311d7108f651c7385cd6316752ba4a86667dcc..f31489652c4ae1fc0c93d1756e13717f0689b0ab 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -546,38 +546,28 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		return -EINVAL;
 	addr = nla_data(tb[IFAL_ADDRESS]);
 
-	rcu_read_lock();
-	p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index);
-	if (p && !ip6addrlbl_hold(p))
-		p = NULL;
-	lseq = net->ipv6.ip6addrlbl_table.seq;
-	rcu_read_unlock();
-
-	if (!p) {
-		err = -ESRCH;
-		goto out;
-	}
-
 	skb = nlmsg_new(ip6addrlbl_msgsize(), GFP_KERNEL);
-	if (!skb) {
-		ip6addrlbl_put(p);
+	if (!skb)
 		return -ENOBUFS;
-	}
 
-	err = ip6addrlbl_fill(skb, p, lseq,
-			      NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
-			      RTM_NEWADDRLABEL, 0);
+	err = -ESRCH;
 
-	ip6addrlbl_put(p);
+	rcu_read_lock();
+	p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index);
+	lseq = net->ipv6.ip6addrlbl_table.seq;
+	if (p)
+		err = ip6addrlbl_fill(skb, p, lseq,
+				      NETLINK_CB(in_skb).portid,
+				      nlh->nlmsg_seq,
+				      RTM_NEWADDRLABEL, 0);
+	rcu_read_unlock();
 
 	if (err < 0) {
 		WARN_ON(err == -EMSGSIZE);
 		kfree_skb(skb);
-		goto out;
+	} else {
+		err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
 	}
-
-	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
-out:
 	return err;
 }
 
-- 
2.14.2.920.gcf0c67979c-goog

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox