Netdev List
 help / color / mirror / Atom feed
* Re: localed stuck in recent 3.18 git in copy_net_ns?
From: Dave Jones @ 2014-10-20 20:43 UTC (permalink / raw)
  To: Kevin Fenzi; +Cc: netdev, linux-kernel
In-Reply-To: <20141020141515.0688bf33@voldemort.scrye.com>

On Mon, Oct 20, 2014 at 02:15:15PM -0600, Kevin Fenzi wrote:
 
 > I'm seeing suspend/resume failures with recent 3.18 git kernels. 
 > 
 > Full dmesg at: http://paste.fedoraproject.org/143615/83287914/
 > 
 > The possibly interesting parts: 
 > 
 > [   78.373144] PM: Syncing filesystems ... done.
 > [   78.411180] PM: Preparing system for mem sleep
 > [   78.411995] Freezing user space processes ... 
 > [   98.429955] Freezing of tasks failed after 20.001 seconds (1 tasks refusing to freeze, wq_busy=0):
 > [   98.429971] (-localed)      D ffff88025f214c80     0  1866      1 0x00000084
 > [   98.429975]  ffff88024e777df8 0000000000000086 ffff88009b4444b0 0000000000014c80
 > [   98.429978]  ffff88024e777fd8 0000000000014c80 ffff880250ffb110 ffff88009b4444b0
 > [   98.429981]  0000000000000000 ffffffff81cec1a0 ffffffff81cec1a4 ffff88009b4444b0
 > [   98.429983] Call Trace:
 > [   98.429991]  [<ffffffff8175d619>] schedule_preempt_disabled+0x29/0x70
 > [   98.429994]  [<ffffffff8175f433>] __mutex_lock_slowpath+0xb3/0x120
 > [   98.429997]  [<ffffffff8175f4c3>] mutex_lock+0x23/0x40
 > [   98.430001]  [<ffffffff8163e325>] copy_net_ns+0x75/0x140
 > [   98.430005]  [<ffffffff810b8c2d>] create_new_namespaces+0xfd/0x1a0
 > [   98.430008]  [<ffffffff810b8e5a>] unshare_nsproxy_namespaces+0x5a/0xc0
 > [   98.430012]  [<ffffffff81098813>] SyS_unshare+0x193/0x340
 > [   98.430015]  [<ffffffff817617a9>] system_call_fastpath+0x12/0x17

I've seen similar soft lockup traces from the sys_unshare path when running my
fuzz tester.  It seems that if you create enough network namespaces,
it can take a huge amount of time for them to be iterated.
(Running trinity with '-c unshare' you can see the slow down happen. In
 some cases, it takes so long that the watchdog process kills it --
 though the SIGKILL won't get delivered until the unshare() completes)

Any idea what this machine had been doing prior to this that may have
involved creating lots of namespaces ?

	Dave

^ permalink raw reply

* [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route()
From: Martin KaFai Lau @ 2014-10-20 20:42 UTC (permalink / raw)
  To: netdev

Hi,

This patch set is trying to reduce the number of fib6_lookup()
calls from ip6_pol_route().

I have adapted davem's udpflooda and kbench_mod test
(https://git.kernel.org/pub/scm/linux/kernel/git/davem/net_test_tools.git) to
support IPv6 and here is the result:


Before:
[root]# for i in $(seq 1 3); do time ./udpflood -l 20000000 -c 250 2401:face:face:face::2; done

real    0m34.190s
user    0m3.047s
sys     0m31.108s

real    0m34.635s
user    0m3.125s
sys     0m31.475s

real    0m34.517s
user    0m3.034s
sys     0m31.449s

[root]# insmod ip6_route_kbench.ko oif=2 src=2401:face:face:face::1 dst=2401:face:face:face::2
[  660.160976] ip6_route_kbench: ip6_route_output tdiff: 933
[  660.207261] ip6_route_kbench: ip6_route_output tdiff: 988
[  660.253492] ip6_route_kbench: ip6_route_output tdiff: 896
[  660.298862] ip6_route_kbench: ip6_route_output tdiff: 898

After:
[root]# for i in $(seq 1 3); do time ./udpflood -l 20000000 -c 250 2401:face:face:face::2; done

real    0m32.695s
user    0m2.925s
sys     0m29.737s

real    0m32.636s
user    0m3.007s
sys     0m29.596s

real    0m32.797s
user    0m2.866s
sys     0m29.898s

[root]# insmod ip6_route_kbench.ko oif=2 src=2401:face:face:face::1 dst=2401:face:face:face::2
[  881.220793] ip6_route_kbench: ip6_route_output tdiff: 684
[  881.253477] ip6_route_kbench: ip6_route_output tdiff: 640
[  881.286867] ip6_route_kbench: ip6_route_output tdiff: 630
[  881.320749] ip6_route_kbench: ip6_route_output tdiff: 653


/****************************** udpflood.c ******************************/
/* It is an adaptation of the Eric Dumazet's and David Miller's
 * udpflood tool, by adding IPv6 support.
 */

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <malloc.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <stdint.h>
#include <assert.h>

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#define _GNU_SOURCE
#include <getopt.h>

typedef uint32_t u32;

static int debug =3D 0;

/* Allow -fstrict-aliasing */
typedef union sa_u {
	struct sockaddr_storage a46;
	struct sockaddr_in a4;
	struct sockaddr_in6 a6;
} sa_u;

static int usage(void)
{
	printf("usage: udpflood [ -l count ] [ -m message_size ] [ -c num_ip_addrs=
 ] IP_ADDRESS\n");
	return -1;
}

static u32 get_last32h(const sa_u *sa)
{
	if (sa->a46.ss_family =3D=3D PF_INET)
		return ntohl(sa->a4.sin_addr.s_addr);
	else
		return ntohl(sa->a6.sin6_addr.s6_addr32[3]);
}

static void set_last32h(sa_u *sa, u32 last32h)
{
	if (sa->a46.ss_family =3D=3D PF_INET)
		sa->a4.sin_addr.s_addr =3D htonl(last32h);
	else
		sa->a6.sin6_addr.s6_addr32[3] =3D htonl(last32h);
}

static void print_saddr(const sa_u *sa, const char *msg)
{
	char buf[64];

	if (!debug)
		return;

	switch (sa->a46.ss_family) {
	case PF_INET:
		inet_ntop(PF_INET, &(sa->a4.sin_addr.s_addr), buf,
			  sizeof(buf));
		break;
	case PF_INET6:
		inet_ntop(PF_INET6, &(sa->a6.sin6_addr), buf, sizeof(buf));
		break;
	}

	printf("%s: %s\n", msg, buf);
}

static int send_packets(const sa_u *sa, size_t num_addrs, int count, int ms=
g_sz)
{
	char *msg =3D malloc(msg_sz);
	sa_u saddr;
	u32 start_addr32h, end_addr32h, cur_addr32h;
	int fd, i, err;

	if (!msg)
		return -ENOMEM;

	memset(msg, 0, msg_sz);

	memcpy(&saddr, sa, sizeof(saddr));
	cur_addr32h =3D start_addr32h =3D get_last32h(&saddr);
	end_addr32h =3D start_addr32h + num_addrs;

	fd =3D socket(saddr.a46.ss_family, SOCK_DGRAM, 0);
	if (fd < 0) {
		perror("socket");
		err =3D fd;
		goto out_nofd;
	}

	/* connect to avoid the kernel spending time in figuring
	 * out the source address (i.e pin the src address)
	 */
	err =3D connect(fd, (struct sockaddr *) &saddr, sizeof(saddr));
	if (err < 0) {
		perror("connect");
		goto out;
	}

	print_saddr(&saddr, "start_addr");
	for (i =3D 0; i < count; i++) {
		print_saddr(&saddr, "sendto");
		err =3D sendto(fd, msg, msg_sz, 0, (struct sockaddr *)&saddr,
			     sizeof(saddr));
		if (err < 0) {
			perror("sendto");
			goto out;
		}

		if (++cur_addr32h >=3D end_addr32h)
			cur_addr32h =3D start_addr32h;
		set_last32h(&saddr, cur_addr32h);
	}

	err =3D 0;
out:
	close(fd);
out_nofd:
	free(msg);
	return err;
}

int main(int argc, char **argv, char **envp)
{
	int port, msg_sz, count, num_addrs, ret;

	sa_u start_addr;

	port =3D 6000;
	msg_sz =3D 32;
	count =3D 10000000;
	num_addrs =3D 1;

	while ((ret =3D getopt(argc, argv, "dl:s:p:c:")) >=3D 0) {
		switch (ret) {
		case 'l':
			sscanf(optarg, "%d", &count);
			break;
		case 's':
			sscanf(optarg, "%d", &msg_sz);
			break;
		case 'p':
			sscanf(optarg, "%d", &port);
			break;
		case 'c':
			sscanf(optarg, "%d", &num_addrs);
			break;
		case 'd':
			debug =3D 1;
			break;
		case '?':
			return usage();
		}
	}

	if (num_addrs < 1)
		return usage();

	if (!argv[optind])
		return usage();

	start_addr.a4.sin_port =3D htons(port);
	if (inet_pton(PF_INET, argv[optind], &start_addr.a4.sin_addr))
		start_addr.a46.ss_family =3D PF_INET;
	else if (inet_pton(PF_INET6, argv[optind], &start_addr.a6.sin6_addr.s6_add=
r))
		start_addr.a46.ss_family =3D PF_INET6;
	else
		return usage();

	return send_packets(&start_addr, num_addrs, count, msg_sz);
}

/****************** ip6_route_kbench_mod.c ******************/
#define pr_fmt(fmt) "ip6_route_kbench: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/inet.h>
#include <linux/in6.h>

#include <net/route.h>
#include <net/ip6_route.h>

#include <linux/timex.h>
#include <uapi/linux/icmpv6.h>

/* We can't just use "get_cycles()" as on some platforms, such
 * as sparc64, that gives system cycles rather than cpu clock
 * cycles.
 */

#ifdef CONFIG_SPARC64
static inline unsigned long long get_tick(void)
{
	unsigned long long t;

	__asm__ __volatile__("rd %%tick, %0" : "=r" (t));
	return t;
}
#elif defined(CONFIG_X86)
static inline unsigned long long get_tick(void)
{
	unsigned long long t;

	rdtscll(t);

	return t;
}
#elif defined(CONFIG_POWERPC)
static inline unsigned long long get_tick(void)
{
	return get_cycles();
}
#else
#error Unsupported architecture, please implement get_tick()
#endif

#define DEFAULT_WARMUP_COUNT 100000

#define DEFAULT_DST_IP_ADDR	0x4a800001
#define DEFAULT_SRC_IP_ADDR	0x00000000
#define DEFAULT_OIF		0
#define DEFAULT_IIF		0
#define DEFAULT_MARK		0x00000000
#define DEFAULT_TOS		0x00

static int flow_oif = DEFAULT_OIF;
static int flow_iif = DEFAULT_IIF;
static u32 flow_mark = DEFAULT_MARK;
static struct in6_addr flow_dst_ip_addr;
static struct in6_addr flow_src_ip_addr;
static int flow_tos = DEFAULT_TOS;

static char dst_string[64];
static char src_string[64];

module_param_string(dst, dst_string, sizeof(dst_string), 0);
module_param_string(src, src_string, sizeof(src_string), 0);

static int __init flow_setup(void)
{
	if (dst_string[0] &&
	    !in6_pton(dst_string, -1, &flow_dst_ip_addr.s6_addr[0], -1, NULL)) {
		pr_info("cannot parse \"%s\"\n", dst_string);
		return -1;
	}

	if (src_string[0] &&
	    !in6_pton(src_string, -1, &flow_src_ip_addr.s6_addr[0], -1, NULL)) {
		pr_info("cannot parse \"%s\"\n", dst_string);
		return -1;
	}

	return 0;
}

module_param_named(oif, flow_oif, int, 0);
module_param_named(iif, flow_iif, int, 0);
module_param_named(mark, flow_mark, uint, 0);
module_param_named(tos, flow_tos, int, 0);

static int warmup_count = DEFAULT_WARMUP_COUNT;
module_param_named(count, warmup_count, int, 0);

static void flow_init(struct flowi6 *fl6)
{
	memset(fl6, 0, sizeof(*fl6));
	fl6->flowi6_proto = IPPROTO_ICMPV6;
	fl6->flowi6_oif = flow_oif;
	fl6->flowi6_iif = flow_iif;
	fl6->flowi6_mark = flow_mark;
	fl6->flowi6_tos = flow_tos;
	fl6->daddr = flow_dst_ip_addr;
	fl6->saddr = flow_src_ip_addr;
}

static struct sk_buff * fake_skb_get(void)
{
	struct ipv6hdr *hdr;
	struct sk_buff *skb;

	skb = alloc_skb(4096, GFP_KERNEL);
	if (!skb) {
		pr_info("Cannot alloc SKB for test\n");
		return NULL;
	}
	skb->dev = __dev_get_by_index(&init_net, flow_iif);
	if (skb->dev == NULL) {
		pr_info("Input device (%d) does not exist\n", flow_iif);
		goto err;
	}

	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);
	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	hdr->priority = 0;
	hdr->version = 6;
	memset(hdr->flow_lbl, 0, sizeof(hdr->flow_lbl));
	hdr->payload_len = htons(sizeof(struct icmp6hdr));
	hdr->nexthdr = IPPROTO_ICMPV6;
	hdr->saddr = flow_src_ip_addr;
	hdr->daddr = flow_dst_ip_addr;
	skb->protocol = htons(ETH_P_IPV6);
	skb->mark = flow_mark;

	return skb;
err:
	kfree_skb(skb);
	return NULL;
}

static void do_full_output_lookup_bench(void)
{
	unsigned long long t1, t2, tdiff;
	struct rt6_info *rt;
	struct flowi6 fl6;
	int i;

	rt = NULL;

	for (i = 0; i < warmup_count; i++) {
		flow_init(&fl6);

		rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl6);
		if (IS_ERR(rt))
			break;
		ip6_rt_put(rt);
	}
	if (IS_ERR(rt)) {
		pr_info("ip_route_output_key: err=%ld\n", PTR_ERR(rt));
		return;
	}

	flow_init(&fl6);

	t1 = get_tick();
	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl6);
	t2 = get_tick();
	if (!IS_ERR(rt))
		ip6_rt_put(rt);

	tdiff = t2 - t1;
	pr_info("ip6_route_output tdiff: %llu\n", tdiff);
}

static void do_full_input_lookup_bench(void)
{
	unsigned long long t1, t2, tdiff;
	struct sk_buff *skb;
	struct rt6_info *rt;
	int err, i;

	skb = fake_skb_get();
	if (skb == NULL)
		goto out_free;

	err = 0;
	local_bh_disable();
	for (i = 0; i < warmup_count; i++) {
		ip6_route_input(skb);
		rt = (struct rt6_info *)skb_dst(skb);
		err = (!rt || rt == init_net.ipv6.ip6_null_entry);
		skb_dst_drop(skb);
		if (err)
			break;
	}
	local_bh_enable();

	if (err) {
		pr_info("Input route lookup fails\n");
		goto out_free;
	}

	local_bh_disable();
	t1 = get_tick();
	ip6_route_input(skb);
	t2 = get_tick();
	local_bh_enable();

	rt = (struct rt6_info *)skb_dst(skb);
	err = (!rt || rt == init_net.ipv6.ip6_null_entry);
	skb_dst_drop(skb);
	if (err) {
		pr_info("Input route lookup fails\n");
		goto out_free;
	}

	tdiff = t2 - t1;
	pr_info("ip6_route_input tdiff: %llu\n", tdiff);

out_free:
	kfree_skb(skb);
}

static void do_full_lookup_bench(void)
{
	if (!flow_iif)
		do_full_output_lookup_bench();
	else
		do_full_input_lookup_bench();
}

static void do_bench(void)
{
	do_full_lookup_bench();
	do_full_lookup_bench();
	do_full_lookup_bench();
	do_full_lookup_bench();
}

static int __init kbench_init(void)
{
	if (flow_setup())
		return -EINVAL;

	pr_info("flow [IIF(%d),OIF(%d),MARK(0x%08x),D("IP6_FMT"),"
		"S("IP6_FMT"),TOS(0x%02x)]\n",
		flow_iif, flow_oif, flow_mark,
		IP6_PRT(flow_dst_ip_addr),
		IP6_PRT(flow_src_ip_addr),
		flow_tos);

#if defined(CONFIG_X86)
	if (!cpu_has_tsc) {
		pr_err("X86 TSC is required, but is unavailable.\n");
		return -EINVAL;
	}
#endif

	pr_info("sizeof(struct rt6_info)==%zu\n", sizeof(struct rt6_info));

	do_bench();

	return -ENODEV;
}

static void __exit kbench_exit(void)
{
}

module_init(kbench_init);
module_exit(kbench_exit);
MODULE_LICENSE("GPL");

^ permalink raw reply

* [PATCH RFC v5 net 2/3] ipv6: Avoid redoing fib6_lookup() for RTF_CACHE hit case
From: Martin KaFai Lau @ 2014-10-20 20:42 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Hannes Frederic Sowa
In-Reply-To: <1413837765-5446-1-git-send-email-kafai@fb.com>

When there is a RTF_CACHE hit, no need to redo fib6_lookup()
with reachable=0.

Cc: David Miller <davem@davemloft.net>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 net/ipv6/route.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f1ab2f4..98c523f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -944,12 +944,12 @@ redo_rt6_select:
 			goto out;
 	}
 
-	if (rt->rt6i_flags & RTF_CACHE)
-		goto out;
-
 	dst_hold(&rt->dst);
 	read_unlock_bh(&table->tb6_lock);
 
+	if (rt->rt6i_flags & RTF_CACHE)
+		goto out2;
+
 	if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)))
 		nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
 	else if (!(rt->dst.flags & DST_HOST))
-- 
1.8.1

^ permalink raw reply related

* [PATCH RFC v5 net 1/3] ipv6: Remove BACKTRACK macro
From: Martin KaFai Lau @ 2014-10-20 20:42 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Hannes Frederic Sowa
In-Reply-To: <1413837765-5446-1-git-send-email-kafai@fb.com>

It is the prep work to reduce the number of calls to fib6_lookup().

The BACKTRACK macro could be hard-to-read and error-prone due to
its side effects (mainly goto).

This patch is to:
1. Replace BACKTRACK macro with a function (fib6_backtrack) with the following
   return values:
   * If it is backtrack-able, returns next fn for retry.
   * If it reaches the root, returns NULL.
2. The caller needs to decide if a backtrack is needed (by testing
   rt == net->ipv6.ip6_null_entry).
3. Rename the goto labels in ip6_pol_route() to make the next few
   patches easier to read.

Cc: David Miller <davem@davemloft.net>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 net/ipv6/route.c | 70 ++++++++++++++++++++++++++++++++------------------------
 1 file changed, 40 insertions(+), 30 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a318dd89..f1ab2f4 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -772,23 +772,22 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 }
 #endif
 
-#define BACKTRACK(__net, saddr)			\
-do { \
-	if (rt == __net->ipv6.ip6_null_entry) {	\
-		struct fib6_node *pn; \
-		while (1) { \
-			if (fn->fn_flags & RTN_TL_ROOT) \
-				goto out; \
-			pn = fn->parent; \
-			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
-				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
-			else \
-				fn = pn; \
-			if (fn->fn_flags & RTN_RTINFO) \
-				goto restart; \
-		} \
-	} \
-} while (0)
+static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
+					struct in6_addr *saddr)
+{
+	struct fib6_node *pn;
+	while (1) {
+		if (fn->fn_flags & RTN_TL_ROOT)
+			return NULL;
+		pn = fn->parent;
+		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
+			fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
+		else
+			fn = pn;
+		if (fn->fn_flags & RTN_RTINFO)
+			return fn;
+	}
+}
 
 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 					     struct fib6_table *table,
@@ -804,8 +803,11 @@ restart:
 	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
 	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
 		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
-	BACKTRACK(net, &fl6->saddr);
-out:
+	if (rt == net->ipv6.ip6_null_entry) {
+		fn = fib6_backtrack(fn, &fl6->saddr);
+		if (fn)
+			goto restart;
+	}
 	dst_use(&rt->dst, jiffies);
 	read_unlock_bh(&table->tb6_lock);
 	return rt;
@@ -924,19 +926,25 @@ static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 
 	strict |= flags & RT6_LOOKUP_F_IFACE;
 
-relookup:
+redo_fib6_lookup_lock:
 	read_lock_bh(&table->tb6_lock);
 
-restart_2:
+redo_fib6_lookup:
 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 
-restart:
+redo_rt6_select:
 	rt = rt6_select(fn, oif, strict | reachable);
 	if (rt->rt6i_nsiblings)
 		rt = rt6_multipath_select(rt, fl6, oif, strict | reachable);
-	BACKTRACK(net, &fl6->saddr);
-	if (rt == net->ipv6.ip6_null_entry ||
-	    rt->rt6i_flags & RTF_CACHE)
+	if (rt == net->ipv6.ip6_null_entry) {
+		fn = fib6_backtrack(fn, &fl6->saddr);
+		if (fn)
+			goto redo_rt6_select;
+		else
+			goto out;
+	}
+
+	if (rt->rt6i_flags & RTF_CACHE)
 		goto out;
 
 	dst_hold(&rt->dst);
@@ -967,12 +975,12 @@ restart:
 	 * released someone could insert this route.  Relookup.
 	 */
 	ip6_rt_put(rt);
-	goto relookup;
+	goto redo_fib6_lookup_lock;
 
 out:
 	if (reachable) {
 		reachable = 0;
-		goto restart_2;
+		goto redo_fib6_lookup;
 	}
 	dst_hold(&rt->dst);
 	read_unlock_bh(&table->tb6_lock);
@@ -1235,10 +1243,12 @@ restart:
 		rt = net->ipv6.ip6_null_entry;
 	else if (rt->dst.error) {
 		rt = net->ipv6.ip6_null_entry;
-		goto out;
+	} else if (rt == net->ipv6.ip6_null_entry) {
+		fn = fib6_backtrack(fn, &fl6->saddr);
+		if (fn)
+			goto restart;
 	}
-	BACKTRACK(net, &fl6->saddr);
-out:
+
 	dst_hold(&rt->dst);
 
 	read_unlock_bh(&table->tb6_lock);
-- 
1.8.1

^ permalink raw reply related

* [PATCH RFC v5 net 3/3] ipv6: Avoid redoing fib6_lookup() with reachable = 0 by saving fn
From: Martin KaFai Lau @ 2014-10-20 20:42 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Hannes Frederic Sowa
In-Reply-To: <1413837765-5446-1-git-send-email-kafai@fb.com>

This patch save the fn before doing rt6_backtrack.
Hence, without redo-ing the fib6_lookup(), saved_fn can be used
to redo rt6_select() with RT6_LOOKUP_F_REACHABLE off.

Some minor changes I think make sense to review as a single patch:
* Remove the 'out:' goto label.
* Remove the 'reachable' variable. Only use the 'strict' variable instead.

After this patch, "failing ip6_ins_rt()" should be the only case that
requires a redo of fib6_lookup().

Cc: David Miller <davem@davemloft.net>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 net/ipv6/route.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 98c523f..c910831 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -917,31 +917,40 @@ static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
 				      struct flowi6 *fl6, int flags)
 {
-	struct fib6_node *fn;
+	struct fib6_node *fn, *saved_fn;
 	struct rt6_info *rt, *nrt;
 	int strict = 0;
 	int attempts = 3;
 	int err;
-	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
 
 	strict |= flags & RT6_LOOKUP_F_IFACE;
+	if (net->ipv6.devconf_all->forwarding == 0)
+		strict |= RT6_LOOKUP_F_REACHABLE;
 
 redo_fib6_lookup_lock:
 	read_lock_bh(&table->tb6_lock);
 
-redo_fib6_lookup:
 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+	saved_fn = fn;
 
 redo_rt6_select:
-	rt = rt6_select(fn, oif, strict | reachable);
+	rt = rt6_select(fn, oif, strict);
 	if (rt->rt6i_nsiblings)
-		rt = rt6_multipath_select(rt, fl6, oif, strict | reachable);
+		rt = rt6_multipath_select(rt, fl6, oif, strict);
 	if (rt == net->ipv6.ip6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
 		if (fn)
 			goto redo_rt6_select;
-		else
-			goto out;
+		else if (strict & RT6_LOOKUP_F_REACHABLE) {
+			/* also consider unreachable route */
+			strict &= ~RT6_LOOKUP_F_REACHABLE;
+			fn = saved_fn;
+			goto redo_rt6_select;
+		} else {
+			dst_hold(&rt->dst);
+			read_unlock_bh(&table->tb6_lock);
+			goto out2;
+		}
 	}
 
 	dst_hold(&rt->dst);
@@ -977,13 +986,6 @@ redo_rt6_select:
 	ip6_rt_put(rt);
 	goto redo_fib6_lookup_lock;
 
-out:
-	if (reachable) {
-		reachable = 0;
-		goto redo_fib6_lookup;
-	}
-	dst_hold(&rt->dst);
-	read_unlock_bh(&table->tb6_lock);
 out2:
 	rt->dst.lastuse = jiffies;
 	rt->dst.__use++;
-- 
1.8.1

^ permalink raw reply related

* Re: localed stuck in recent 3.18 git in copy_net_ns?
From: Kevin Fenzi @ 2014-10-20 20:53 UTC (permalink / raw)
  To: Dave Jones; +Cc: netdev, linux-kernel
In-Reply-To: <20141020204326.GA25668@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 752 bytes --]

On Mon, 20 Oct 2014 16:43:26 -0400
Dave Jones <davej@redhat.com> wrote:

> I've seen similar soft lockup traces from the sys_unshare path when
> running my fuzz tester.  It seems that if you create enough network
> namespaces, it can take a huge amount of time for them to be iterated.
> (Running trinity with '-c unshare' you can see the slow down happen.
> In some cases, it takes so long that the watchdog process kills it --
>  though the SIGKILL won't get delivered until the unshare() completes)
> 
> Any idea what this machine had been doing prior to this that may have
> involved creating lots of namespaces ?

That was right after boot. ;) 

This is my main rawhide running laptop.

A 'ip netns list' shows nothing.

kevin

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* [PATCH net] bpf: fix bug in eBPF verifier
From: Alexei Starovoitov @ 2014-10-20 21:54 UTC (permalink / raw)
  To: David S. Miller
  Cc: Hannes Frederic Sowa, Daniel Borkmann, netdev, linux-kernel

while comparing for verifier state equivalency the comparison
was missing a check for uninitialized register.
Make sure it does so and add a testcase.

Fixes: f1bca824dabb ("bpf: add search pruning optimization to verifier")
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---

while we were staring at the verifier code with Hannes during LPC
something felt odd in this spot. Yes. It was a bug. Fix it.

 kernel/bpf/verifier.c       |    3 ++-
 samples/bpf/test_verifier.c |   11 +++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 801f5f3..9f81818 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1409,7 +1409,8 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
 		if (memcmp(&old->regs[i], &cur->regs[i],
 			   sizeof(old->regs[0])) != 0) {
 			if (old->regs[i].type == NOT_INIT ||
-			    old->regs[i].type == UNKNOWN_VALUE)
+			    (old->regs[i].type == UNKNOWN_VALUE &&
+			     cur->regs[i].type != NOT_INIT))
 				continue;
 			return false;
 		}
diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c
index f44ef11..eb4bec0 100644
--- a/samples/bpf/test_verifier.c
+++ b/samples/bpf/test_verifier.c
@@ -209,6 +209,17 @@ static struct bpf_test tests[] = {
 		.result = REJECT,
 	},
 	{
+		"program doesn't init R0 before exit in all branches",
+		.insns = {
+			BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 2),
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 2),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 !read_ok",
+		.result = REJECT,
+	},
+	{
 		"stack out of bounds",
 		.insns = {
 			BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0),
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH] rtlwifi: prevent format string usage from leaking
From: Kees Cook @ 2014-10-20 21:57 UTC (permalink / raw)
  To: linux-kernel
  Cc: Larry Finger, Chaoming Li, John W. Linville, linux-wireless,
	netdev

Use "%s" in the workqueue allocation to make sure the rtl_hal_cfg name
can never accidentally leak information via a format string.

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 drivers/net/wireless/rtlwifi/base.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/rtlwifi/base.c b/drivers/net/wireless/rtlwifi/base.c
index 58ba71830886..40b6d1d006d7 100644
--- a/drivers/net/wireless/rtlwifi/base.c
+++ b/drivers/net/wireless/rtlwifi/base.c
@@ -467,7 +467,7 @@ static void _rtl_init_deferred_work(struct ieee80211_hw *hw)
 		    rtl_easy_concurrent_retrytimer_callback, (unsigned long)hw);
 	/* <2> work queue */
 	rtlpriv->works.hw = hw;
-	rtlpriv->works.rtl_wq = alloc_workqueue(rtlpriv->cfg->name, 0, 0);
+	rtlpriv->works.rtl_wq = alloc_workqueue("%s", 0, 0, rtlpriv->cfg->name);
 	INIT_DELAYED_WORK(&rtlpriv->works.watchdog_wq,
 			  (void *)rtl_watchdog_wq_callback);
 	INIT_DELAYED_WORK(&rtlpriv->works.ips_nic_off_wq,
-- 
1.9.1


-- 
Kees Cook
Chrome OS Security

^ permalink raw reply related

* [PATCH] netfilter: log: protect nf_log_register against double registering
From: Marcelo Ricardo Leitner @ 2014-10-20 21:58 UTC (permalink / raw)
  To: pablo; +Cc: netfilter-devel, netdev

Currently, despite the comment right before the function,
nf_log_register allows registering two loggers on with the same type and
end up overwriting the previous register.

Not a real issue today as current tree doesn't have two loggers for the
same type but it's better to get this protected.

Also make sure that all of its callers do error checking.

Signed-off-by: Marcelo Ricardo Leitner <mleitner@redhat.com>
---

Notes:
    Please let me know if you have any issues with the identation on
    nf_log_register. I just couldn't find a better one.
    
    Thanks

 net/ipv4/netfilter/nf_log_arp.c  |  8 +++++++-
 net/ipv4/netfilter/nf_log_ipv4.c |  8 +++++++-
 net/ipv6/netfilter/nf_log_ipv6.c |  8 +++++++-
 net/netfilter/nf_log.c           | 13 ++++++++++++-
 4 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
index ccfc78db12ee8acae68faf451f2cf6bc5597f2c1..8b39174b7be390397a110ec9d3ed497bf8ce6d26 100644
--- a/net/ipv4/netfilter/nf_log_arp.c
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -130,7 +130,13 @@ static int __init nf_log_arp_init(void)
 	if (ret < 0)
 		return ret;
 
-	nf_log_register(NFPROTO_ARP, &nf_arp_logger);
+	ret = nf_log_register(NFPROTO_ARP, &nf_arp_logger);
+	if (ret < 0) {
+		pr_err("log: failed to register logger\n");
+		unregister_pernet_subsys(&nf_log_arp_net_ops);
+		return ret;
+	}
+
 	return 0;
 }
 
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 078bdca1b607a167e05e7cf1bdfedccdd5aca92a..b3cb2ff6580343a9f7537aa2f48fd23858872b4d 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -366,7 +366,13 @@ static int __init nf_log_ipv4_init(void)
 	if (ret < 0)
 		return ret;
 
-	nf_log_register(NFPROTO_IPV4, &nf_ip_logger);
+	ret = nf_log_register(NFPROTO_IPV4, &nf_ip_logger);
+	if (ret < 0) {
+		pr_err("log: failed to register logger\n");
+		unregister_pernet_subsys(&nf_log_ipv4_net_ops);
+		return ret;
+	}
+
 	return 0;
 }
 
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index 7b17a0be93e7eccb2a26cd3294713d0f1112158d..b89576a5ca3e2b3964b8ce4aec09c8965496c2f2 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -398,7 +398,13 @@ static int __init nf_log_ipv6_init(void)
 	if (ret < 0)
 		return ret;
 
-	nf_log_register(NFPROTO_IPV6, &nf_ip6_logger);
+	ret = nf_log_register(NFPROTO_IPV6, &nf_ip6_logger);
+	if (ret < 0) {
+		pr_err("log: failed to register logger\n");
+		unregister_pernet_subsys(&nf_log_ipv6_net_ops);
+		return ret;
+	}
+
 	return 0;
 }
 
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index daad6022c689c47a66a47e7a89a83c0c848c53d6..04495d9debe784827fd0cbcf5e541f10fa06839d 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -82,10 +82,21 @@ int nf_log_register(u_int8_t pf, struct nf_logger *logger)
 	mutex_lock(&nf_log_mutex);
 
 	if (pf == NFPROTO_UNSPEC) {
+		for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) {
+			if (rcu_dereference_protected(loggers[i][logger->type],
+					lockdep_is_held(&nf_log_mutex))) {
+				mutex_unlock(&nf_log_mutex);
+				return -EEXIST;
+			}
+		}
 		for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++)
 			rcu_assign_pointer(loggers[i][logger->type], logger);
 	} else {
-		/* register at end of list to honor first register win */
+		if (rcu_dereference_protected(loggers[pf][logger->type],
+				lockdep_is_held(&nf_log_mutex))) {
+			mutex_unlock(&nf_log_mutex);
+			return -EEXIST;
+		}
 		rcu_assign_pointer(loggers[pf][logger->type], logger);
 	}
 
-- 
1.9.3


^ permalink raw reply related

* [PATCH 1/1] e1000: unset IFF_UNICAST_FLT on WMware 82545EM
From: Francesco Ruggeri @ 2014-10-20 22:11 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel, davem, pv-drivers, linux.nics, fruggeri,
	e1000-devel

VMWare's e1000 implementation does not seem to support unicast filtering.
This can be observed by configuring a macvlan interface on eth0 in a VM in
VMWare Fusion 5.0.5, and trying to use that interface instead of eth0.
Tested on 3.16.

Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
---
 drivers/net/ethernet/intel/e1000/e1000_main.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
index 5f6aded..24f3986 100644
--- a/drivers/net/ethernet/intel/e1000/e1000_main.c
+++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
@@ -1075,7 +1075,10 @@ static int e1000_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 				  NETIF_F_HW_CSUM |
 				  NETIF_F_SG);
 
-	netdev->priv_flags |= IFF_UNICAST_FLT;
+	/* Do not set IFF_UNICAST_FLT for VMWare's 82545EM */
+	if (hw->device_id != E1000_DEV_ID_82545EM_COPPER ||
+	    hw->subsystem_vendor_id != PCI_VENDOR_ID_VMWARE)
+		netdev->priv_flags |= IFF_UNICAST_FLT;
 
 	adapter->en_mng_pt = e1000_enable_mng_pass_thru(hw);
 
-- 
1.8.1.4

^ permalink raw reply related

* Re: qdisc running
From: Jamal Hadi Salim @ 2014-10-20 22:17 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: john Fastabend, Herbert Xu, netdev@vger.kernel.org, eric Dumazet,
	Mathieu Desnoyers
In-Reply-To: <20141020181756.2c8f33b9@redhat.com>

On 10/20/14 12:17, Jesper Dangaard Brouer wrote:
>
> On Sun, 19 Oct 2014 15:24:42 -0400 Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>

> I guess it is good for our recent dequeue batching.

It is i think ;->

> But I think/hope we
> can come up with a scheme that does not requires 6 lock/unlock
> operations (as illustrated on slide 9).
>

To be clear:
2 locks + 2 unlock and 2 atomic ops.


> John and I have talked about doing a lockless qdisc, but maintaining
> this __QDISC___STATE_RUNNING in a lockless scenario, would cost us
> extra atomic ops...
>

In the animation this __QDISC___STATE_RUNNING is shown as "occupied"
flag. It is like someone is in the toilet and you cant come in;->
They have to finish dropping the packages into the toilet^Whardware ;->
If it is occupied, you put your package outside and go.

> Are we still sure, that this model of only allowing a single CPU in the
> dequeue path, is still the best solution?

For sure it is the best if you want to batch. Look at that last orange
guy picking all the packages (busylock.swf). This is where all the
batching would  happen.

>(The TXQ lock should already
> protect several CPUs in this code path).


Note:
Maybe for the orange guy (the dequeur) the tx lock could
be avoided? Double check the code. Important to note under
busy period contention is reduced to :
1 lock + 1 unlock + 2 atomic ops for N-1 CPUs.
The orange guy on the other hand is doing 2 lock/unlock.


> I can see that you really needed the budget/fairness in the dequeue
> loop, that we recently mangled with.
>

Yes, fairness is needed so the orange guy doesnt spend all his cycles
doing all the work (that was the basis of my presentation); unless
that is not an issue and the scheduler would move things away from
that cpu.


> What tool do I use to play these SWF files? (I tried VLC but no luck).
>

Firefox should work fine.

cheers,
jamal

^ permalink raw reply

* Re: [linux-nics] [PATCH 1/1] e1000: unset IFF_UNICAST_FLT on WMware 82545EM
From: Jeff Kirsher @ 2014-10-20 22:25 UTC (permalink / raw)
  To: Francesco Ruggeri
  Cc: netdev, linux.nics, pv-drivers, fruggeri, linux-kernel,
	e1000-devel, davem
In-Reply-To: <20141020221115.1FC9B480090@fruggeri-Arora18.sjc.aristanetworks.com>

[-- Attachment #1: Type: text/plain, Size: 551 bytes --]

On Mon, 2014-10-20 at 15:11 -0700, Francesco Ruggeri wrote:
> VMWare's e1000 implementation does not seem to support unicast
> filtering.
> This can be observed by configuring a macvlan interface on eth0 in a
> VM in
> VMWare Fusion 5.0.5, and trying to use that interface instead of eth0.
> Tested on 3.16.
> 
> Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
> ---
>  drivers/net/ethernet/intel/e1000/e1000_main.c | 5 ++++-
>  1 file changed, 4 insertions(+), 1 deletion(-)

Thanks Francesco, I will add your patch to my queue.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* Re: [RFC PATCH net-next] net: ipv6: Add a sysctl to make optimistic addresses useful candidates
From: Ben Hutchings @ 2014-10-20 22:40 UTC (permalink / raw)
  To: Erik Kline; +Cc: netdev, hannes, lorenzo, edumazet
In-Reply-To: <1413520309-13814-1-git-send-email-ek@google.com>

[-- Attachment #1: Type: text/plain, Size: 588 bytes --]

On Fri, 2014-10-17 at 13:31 +0900, Erik Kline wrote:
[...]
> --- a/include/uapi/linux/ipv6.h
> +++ b/include/uapi/linux/ipv6.h
> @@ -154,6 +154,7 @@ enum {
>  	DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN,
>  	DEVCONF_PROXY_NDP,
>  	DEVCONF_OPTIMISTIC_DAD,
> +	DEVCONF_USE_OPTIMISTIC,
>  	DEVCONF_ACCEPT_SOURCE_ROUTE,
>  	DEVCONF_MC_FORWARDING,
>  	DEVCONF_DISABLE_IPV6,
[...]

As this is UAPI, the new entry must be added at the end of the
enumeration (well, before DEVCONF_MAX).

Ben.

-- 
Ben Hutchings
Reality is just a crutch for people who can't handle science fiction.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 811 bytes --]

^ permalink raw reply

* Re: [RFC v2 2/6] driver-core: add driver async_probe support
From: Luis R. Rodriguez @ 2014-10-20 23:43 UTC (permalink / raw)
  To: Dmitry Torokhov
  Cc: Greg Kroah-Hartman, Wu Zhangjin, Takashi Iwai, Tejun Heo,
	Arjan van de Ven, linux-kernel@vger.kernel.org, Oleg Nesterov,
	hare, Andrew Morton, Tetsuo Handa, Joseph Salisbury,
	Benjamin Poirier, Santosh Rastapur, Kay Sievers,
	One Thousand Gnomes, Tim Gardner, Pierre Fersing,
	Nagalakshmi Nandigama, Praveen Krishnamoorthy, Sreekanth Reddy,
	Abhijit Mahajan <abhi
In-Reply-To: <20140905221029.GA35667@core.coreip.homeip.net>

> diff --git a/drivers/base/bus.c b/drivers/base/bus.c
> index 83e910a..49fe573 100644
> --- a/drivers/base/bus.c
> +++ b/drivers/base/bus.c
> @@ -10,6 +10,7 @@
>   *
>   */
>
> +#include <linux/async.h>
>  #include <linux/device.h>
>  #include <linux/module.h>
>  #include <linux/errno.h>
> @@ -547,15 +548,12 @@ void bus_probe_device(struct device *dev)
>  {
>         struct bus_type *bus = dev->bus;
>         struct subsys_interface *sif;
> -       int ret;
>
>         if (!bus)
>                 return;
>
> -       if (bus->p->drivers_autoprobe) {
> -               ret = device_attach(dev);
> -               WARN_ON(ret < 0);
> -       }
> +       if (bus->p->drivers_autoprobe)
> +               device_initial_probe(dev);
>
>         mutex_lock(&bus->p->mutex);
>         list_for_each_entry(sif, &bus->p->interfaces, node)
> @@ -657,6 +655,17 @@ static ssize_t uevent_store(struct device_driver *drv, const char *buf,
>  }
>  static DRIVER_ATTR_WO(uevent);

Based on my review with my latest changes this is what I was missing,
I'll be sure to address this.

 Luis

^ permalink raw reply

* Re: Queue with wait-free enqueue, blocking dequeue, splice
From: Mathieu Desnoyers @ 2014-10-21  0:04 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: Paul E. McKenney, netdev, Jamal Hadi Salim
In-Reply-To: <20141020160237.302aa17c@redhat.com>

----- Original Message -----
> From: "Jesper Dangaard Brouer" <brouer@redhat.com>
> To: "Mathieu Desnoyers" <mathieu.desnoyers@efficios.com>
> Cc: brouer@redhat.com, "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>, netdev@vger.kernel.org, "Jamal Hadi Salim"
> <jhs@mojatatu.com>
> Sent: Monday, October 20, 2014 10:02:37 AM
> Subject: Re: Queue with wait-free enqueue, blocking dequeue, splice
> 
> 
> On Sat, 18 Oct 2014 11:48:12 +0000 (UTC) Mathieu Desnoyers
> <mathieu.desnoyers@efficios.com> wrote:
> 
> > Following our LPC discussion on lock-free queue algorithms
> > for qdisc, here is some info on the wfcqueue implementation
> > found in Userspace RCU. See http://urcu.so for info and
> > git repository.
> 
> Thank for following up on our very interesting discussions.
> 
> I've started with the more simple variant "urcu/static/wfqueue.h" to
> understand the concepts.  And I'm now reading wfcqueue.h, which I guess
> it replacing wfqueue.h.

Yep, this is correct.

> 
>  
> > Here is the wfcqueue ported to the Linux kernel I sent last
> > year as RFC:
> > https://lkml.org/lkml/2013/3/14/289
> > 
> > I'm very interested to learn if it fits well for your
> > use-case,
> 
> Does this wfcqueue API support bulk dequeue?  (A feature needed for the
> lock-less qdisc implementation, else it cannot compete with our new
> bulk dequeue strategy).

Yes, it does. See the "__wfcq_splice" API.

> 
> AFAIK your queue implementation is a CAS-based, Wait-Free on enqueue,
> but Lock-Free on dequeue with the potential for waiting/blocking on
> a enqueue processes.
>  I'm not 100% sure, that we want this behavior for the qdisc system.

It's actually xchg-based (not CAS-based). It is indeed wait-free
in the strictest sense of the term on enqueue (at least on x86,
some other arch implement xchg using ll/sc, which is not strictly
wait-free).

On dequeue, it can busy-wait for a short while that the enqueue
completes. Note that in kernel, since we disable preemption during
enqueue, the dequeue does not have to ever block, just busy-looping
is OK, since the longest thing that could nest over the enqueue
is possibly an interrupt and softirq. So yes, I guess the dequeue
would qualify as lock-free.

> 
> I can certainly use the wfcq_empty() check,

Not sure why you would want to use it, considering that the dequeue
operation implies it. If there is nothing to dequeue, we just return
immediately. Dequeue operation does not block on empty queue. It
just busy waits if it happen to see the queue in an intermediate
state of the enqueue operation (which is very short, few instructions
at most, with preemption disabled).

> but I guess I need to
> maintain a separate counter to maintain the qdisc limit, right?
> (I would use the approximate/split counter API percpu_counter to keep
> this scalable, and wfcq_empty() would provide an accurate empty check)

Yes for split counters, not sure why you need the empty check explicitly
in your use-case though.

> 
> 
> I think, we/I should start micro benchmarking the different approaches.
> As our time budget is only 67.2ns
>  http://netoptimizer.blogspot.dk/2014/05/the-calculations-10gbits-wirespeed.html
> (or bulking tricks artificially "increase" this budget)
> 
> 
> The motivation behind this lockless qdisc is, the current qdisc locking
> cost 48ns, see slide 9/16 titled "Qdisc locking is nasty":
>  http://people.netfilter.org/hawk/presentations/LinuxPlumbers2014/performance_tx_qdisc_bulk_LPC2014.pdf
> 

Chances are that the scheme:

__wfcq_enqueue() on enqueue

__wfcq_splice() for bulk dequeue

__wfcq_first()
__wfcq_next()  for iteration on the splice dest queue

Would be must faster than the ton of locks you use currently.
The nice part about using just enqueue and splice is that you
don't need locking on the queue at all. Iteration on the
destination queue can be done by a single thread, so no need
for explicit locking there neither.

By the way, you should look at the kernel wfcq implementation
I posted earlier. It's more compact that the one in userspace RCU
because we don't need to support blocking/nonblocking modes,
because we have the luxury of disabling preemption in the kernel.

I look forward to see the numbers,

Thanks!

Mathieu

> --
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Sr. Network Kernel Developer at Red Hat
>   Author of http://www.iptv-analyzer.org
>   LinkedIn: http://www.linkedin.com/in/brouer
> 

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply

* [PATCH] netfilter: Fix wastful cleanup check for unconfirmed conn in get_next_corpse
From: Feng Gao @ 2014-10-21  0:31 UTC (permalink / raw)
  To: Pablo Neira Ayuso, Patrick McHardy, kadlec, davem
  Cc: Netfilter Developer Mailing List, coreteam, netdev, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 788 bytes --]

Hi all,

I am sorry to send the patch commit again because the last email is
not plain text and is rejected by some servers.

This is the patch to branch master of kernel.

The function get_next_corpse is only invoked by nf_ct_iterate_cleanup
in one while loop, and it will check the per cpu unconfirmed conntrack
list every time.

I think the whole list of unconfirmed conntracks could be accessed by
one call, so the others are not necessary.

So I move the checks outside the get_next_corpse, and create one new
function clean_up_unconfirmed_conntracks.
Let the nf_ct_iterate_cleanup invokes the
clean_up_unconfirmed_conntracks after the while loop.

These codes have already exist for a long time. Firstly I think maybe
there is some reason, but I fail to get it.


Best Regards
Feng

[-- Attachment #2: 0001-netfilter-Fix-wastful-cleanup-check-for-unconfirmed-.patch --]
[-- Type: application/octet-stream, Size: 2964 bytes --]

From 52de457b3cfd1e94a52df1c3dfcd9dbf3511fa0d Mon Sep 17 00:00:00 2001
From: fgao <gfree.wind@gmail.com>
Date: Mon, 20 Oct 2014 07:18:05 -0700
Subject: [PATCH 1/1] netfilter: Fix wastful cleanup check for unconfirmed conn in get_next_corpse

The function get_next_corpse is used to iterate the conntracks.
It will check the per cpu unconfirmed list of every cpu too.
Now it is only invoked by nf_ct_iterate_cleanup in one while loop.
Actually the unconfirmed list could be accessed completely by one call, then the others are wastful.

So move the unconfirmed list check outside the function get_next_corpse and create one new function
Let the nf_ct_iterate_cleanup invokes the new function clean_up_unconfirmed_conntracks once after the loops.

Signed-off-by: fgao <gfree.wind@gmail.com>
---
 net/netfilter/nf_conntrack_core.c | 36 ++++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 5016a69..ace7c2c2 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1348,6 +1348,28 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
 	nf_conntrack_get(nskb->nfct);
 }
 
+static void clean_up_unconfirmed_conntracks(struct net *net,
+		int (*iter)(struct nf_conn *i, void *data),
+		void *data)
+{
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conn *ct;
+	struct hlist_nulls_node *n;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+		spin_lock_bh(&pcpu->lock);
+		hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
+			ct = nf_ct_tuplehash_to_ctrack(h);
+			if (iter(ct, data))
+				set_bit(IPS_DYING_BIT, &ct->status);
+		}
+		spin_unlock_bh(&pcpu->lock);
+	}
+}
+
 /* Bring out ya dead! */
 static struct nf_conn *
 get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
@@ -1356,7 +1378,6 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct;
 	struct hlist_nulls_node *n;
-	int cpu;
 	spinlock_t *lockp;
 
 	for (; *bucket < net->ct.htable_size; (*bucket)++) {
@@ -1376,17 +1397,6 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
 		local_bh_enable();
 	}
 
-	for_each_possible_cpu(cpu) {
-		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
-
-		spin_lock_bh(&pcpu->lock);
-		hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
-			ct = nf_ct_tuplehash_to_ctrack(h);
-			if (iter(ct, data))
-				set_bit(IPS_DYING_BIT, &ct->status);
-		}
-		spin_unlock_bh(&pcpu->lock);
-	}
 	return NULL;
 found:
 	atomic_inc(&ct->ct_general.use);
@@ -1411,6 +1421,8 @@ void nf_ct_iterate_cleanup(struct net *net,
 
 		nf_ct_put(ct);
 	}
+
+	clean_up_unconfirmed_conntracks(net, iter, data);
 }
 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
 
-- 
1.9.1


^ permalink raw reply related

* [PATCH 2/2][V2] xfrm6: fix a potential use after free in xfrm6_policy.c
From: roy.qing.li @ 2014-10-21  0:34 UTC (permalink / raw)
  To: netdev

From: Li RongQing <roy.qing.li@gmail.com>

pskb_may_pull() maybe change skb->data and make nh and exthdr pointer
oboslete, so recompute the nd and exthdr

V2: insert a space between date type(like __be16) and * as suggested by
Sergei Shtylyov

Signed-off-by: Li RongQing <roy.qing.li@gmail.com>
---
 net/ipv6/xfrm6_policy.c |   11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index ac49f84..115fd3b 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -170,8 +170,10 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
 		case IPPROTO_DCCP:
 			if (!onlyproto && (nh + offset + 4 < skb->data ||
 			     pskb_may_pull(skb, nh + offset + 4 - skb->data))) {
-				__be16 *ports = (__be16 *)exthdr;
+				__be16 *ports;
 
+				nh = skb_network_header(skb);
+				ports = (__be16 *)(nh + offset);
 				fl6->fl6_sport = ports[!!reverse];
 				fl6->fl6_dport = ports[!reverse];
 			}
@@ -180,8 +182,10 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
 
 		case IPPROTO_ICMPV6:
 			if (!onlyproto && pskb_may_pull(skb, nh + offset + 2 - skb->data)) {
-				u8 *icmp = (u8 *)exthdr;
+				u8 *icmp;
 
+				nh = skb_network_header(skb);
+				icmp = (u8 *)(nh + offset);
 				fl6->fl6_icmp_type = icmp[0];
 				fl6->fl6_icmp_code = icmp[1];
 			}
@@ -192,8 +196,9 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
 		case IPPROTO_MH:
 			if (!onlyproto && pskb_may_pull(skb, nh + offset + 3 - skb->data)) {
 				struct ip6_mh *mh;
-				mh = (struct ip6_mh *)exthdr;
 
+				nh = skb_network_header(skb);
+				mh = (struct ip6_mh *)(nh + offset);
 				fl6->fl6_mh_type = mh->ip6mh_type;
 			}
 			fl6->flowi6_proto = nexthdr;
-- 
1.7.10.4

^ permalink raw reply related

* [net-next] iptunnel: Fix iptunnel_xmit return code for stats maintenance
From: Andy Zhou @ 2014-10-21  2:22 UTC (permalink / raw)
  To: davem; +Cc: netdev, Andy Zhou

iptunnel_xmit() currently always return >= 0 instead of proper error
code, that is used to maintain stats. For example, current return code
conflicts with how iptunnel_xmit_stats() maintains stats.

Unfortunately, the return code can not be changed without readjusting
how SKB memory is managed through the call chain.  The following two
rules are adopted for this patch:

1) Proper error code are always propagate back through the call chain
   so that the caller can maintain stats.

2) Tunnel xmit functions always free resources, e.g. skb and route
   entry.

Signed-off-by: Andy Zhou <azhou@nicira.com>
---
 drivers/net/vxlan.c            |   21 +++++++++++++--------
 include/net/ip_tunnels.h       |    7 +++++++
 net/ipv4/geneve.c              |    8 ++++++--
 net/ipv4/ip_tunnel_core.c      |   14 +++++++++++---
 net/openvswitch/vport-geneve.c |    5 ++---
 net/openvswitch/vport-gre.c    |    1 +
 net/openvswitch/vport-vxlan.c  |    6 +++---
 net/openvswitch/vport.c        |    8 ++++----
 8 files changed, 47 insertions(+), 23 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index ca30982..93348cb 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1626,8 +1626,10 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
 	bool udp_sum = !vs->sock->sk->sk_no_check_tx;
 
 	skb = udp_tunnel_handle_offloads(skb, udp_sum);
-	if (IS_ERR(skb))
-		return -EINVAL;
+	if (IS_ERR(skb)) {
+		err = -EINVAL;
+		goto error;
+	}
 
 	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
 			+ VXLAN_HLEN + sizeof(struct iphdr)
@@ -1636,13 +1638,15 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
 	/* Need space for new headers (invalidates iph ptr) */
 	err = skb_cow_head(skb, min_headroom);
 	if (unlikely(err))
-		return err;
+		goto error;
 
 	if (vlan_tx_tag_present(skb)) {
 		if (WARN_ON(!__vlan_put_tag(skb,
 					    skb->vlan_proto,
-					    vlan_tx_tag_get(skb))))
-			return -ENOMEM;
+					    vlan_tx_tag_get(skb)))) {
+			err = -ENOMEM;
+			goto error;
+		}
 
 		skb->vlan_tci = 0;
 	}
@@ -1655,6 +1659,10 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
 
 	return udp_tunnel_xmit_skb(vs->sock, rt, skb, src, dst, tos,
 				   ttl, df, src_port, dst_port, xnet);
+error:
+	kfree_skb(skb);
+	ip_rt_put(rt);
+	return err;
 }
 EXPORT_SYMBOL_GPL(vxlan_xmit_skb);
 
@@ -1786,9 +1794,6 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 				     tos, ttl, df, src_port, dst_port,
 				     htonl(vni << 8),
 				     !net_eq(vxlan->net, dev_net(vxlan->dev)));
-
-		if (err < 0)
-			goto rt_tx_error;
 		iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
 #if IS_ENABLED(CONFIG_IPV6)
 	} else {
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 5bc6ede..80bcf2e 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -174,6 +174,13 @@ static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph,
 }
 
 int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto);
+
+/* Transmit a packet over IP tunnel
+ * Returns:
+ *	0 Congestion notification received
+ *	>0  Number of bytes in the packet successfully sent
+ *	<0 packet dropped due to error
+ */
 int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
 		  __be32 src, __be32 dst, __u8 proto,
 		  __u8 tos, __u8 ttl, __be16 df, bool xnet);
diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c
index 065cd94..90fea48 100644
--- a/net/ipv4/geneve.c
+++ b/net/ipv4/geneve.c
@@ -129,14 +129,14 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
 
 	err = skb_cow_head(skb, min_headroom);
 	if (unlikely(err))
-		return err;
+		goto error;
 
 	if (vlan_tx_tag_present(skb)) {
 		if (unlikely(!__vlan_put_tag(skb,
 					     skb->vlan_proto,
 					     vlan_tx_tag_get(skb)))) {
 			err = -ENOMEM;
-			return err;
+			goto error;
 		}
 		skb->vlan_tci = 0;
 	}
@@ -146,6 +146,10 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
 
 	return udp_tunnel_xmit_skb(gs->sock, rt, skb, src, dst,
 				   tos, ttl, df, src_port, dst_port, xnet);
+error:
+	kfree_skb(skb);
+	ip_rt_put(rt);
+	return err;
 }
 EXPORT_SYMBOL_GPL(geneve_xmit_skb);
 
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 88c386c..b3ba4a3 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -77,9 +77,17 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
 	__ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1);
 
 	err = ip_local_out_sk(sk, skb);
-	if (unlikely(net_xmit_eval(err)))
-		pkt_len = 0;
-	return pkt_len;
+
+	/* Deal with positive error numbers. Filter out NET_XMIT_CN */
+	if (err > 0)
+		return net_xmit_errno(err);
+
+	/* Success, return number of bytes transmitted */
+	if (err == 0)
+		err = pkt_len;
+
+	/* Return pkt_len or an error code */
+	return err;
 }
 EXPORT_SYMBOL_GPL(iptunnel_xmit);
 
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index 106a9d8..34276fb 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -206,15 +206,14 @@ static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb)
 	tunnel_id_to_vni(tun_key->tun_id, vni);
 	skb->ignore_df = 1;
 
-	err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr,
+	return  geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr,
 			      tun_key->ipv4_dst, tun_key->ipv4_tos,
 			      tun_key->ipv4_ttl, df, sport, dport,
 			      tun_key->tun_flags, vni,
 			      tun_info->options_len, (u8 *)tun_info->options,
 			      false);
-	if (err < 0)
-		ip_rt_put(rt);
 error:
+	kfree_skb(skb);
 	return err;
 }
 
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index 108b82d..8721b30 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -200,6 +200,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
 err_free_rt:
 	ip_rt_put(rt);
 error:
+	kfree_skb(skb);
 	return err;
 }
 
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index 2735e01..ace849a 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -174,15 +174,15 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
 
 	src_port = udp_flow_src_port(net, skb, 0, 0, true);
 
-	err = vxlan_xmit_skb(vxlan_port->vs, rt, skb,
+	return vxlan_xmit_skb(vxlan_port->vs, rt, skb,
 			     fl.saddr, tun_key->ipv4_dst,
 			     tun_key->ipv4_tos, tun_key->ipv4_ttl, df,
 			     src_port, dst_port,
 			     htonl(be64_to_cpu(tun_key->tun_id) << 8),
 			     false);
-	if (err < 0)
-		ip_rt_put(rt);
+
 error:
+	kfree_skb(skb);
 	return err;
 }
 
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 6015802..eb0a72f 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -480,11 +480,11 @@ int ovs_vport_send(struct vport *vport, struct sk_buff *skb)
 		stats->tx_packets++;
 		stats->tx_bytes += sent;
 		u64_stats_update_end(&stats->syncp);
-	} else if (sent < 0) {
-		ovs_vport_record_error(vport, VPORT_E_TX_ERROR);
-		kfree_skb(skb);
-	} else
+	} else if (sent == -ENOBUFS || sent == -ENOMEM || sent == 0) {
 		ovs_vport_record_error(vport, VPORT_E_TX_DROPPED);
+	} else {
+		ovs_vport_record_error(vport, VPORT_E_TX_ERROR);
+	}
 
 	return sent;
 }
-- 
1.7.9.5

^ permalink raw reply related

* Re: [RFC PATCH net-next] net: ipv6: Add a sysctl to make optimistic addresses useful candidates
From: Erik Kline @ 2014-10-21  4:02 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: netdev, hannes, Lorenzo Colitti, Eric Dumazet
In-Reply-To: <1413844815.31953.32.camel@decadent.org.uk>

Thanks.  Patch v2 coming shortly.

On Tue, Oct 21, 2014 at 7:40 AM, Ben Hutchings <ben@decadent.org.uk> wrote:
> On Fri, 2014-10-17 at 13:31 +0900, Erik Kline wrote:
> [...]
>> --- a/include/uapi/linux/ipv6.h
>> +++ b/include/uapi/linux/ipv6.h
>> @@ -154,6 +154,7 @@ enum {
>>       DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN,
>>       DEVCONF_PROXY_NDP,
>>       DEVCONF_OPTIMISTIC_DAD,
>> +     DEVCONF_USE_OPTIMISTIC,
>>       DEVCONF_ACCEPT_SOURCE_ROUTE,
>>       DEVCONF_MC_FORWARDING,
>>       DEVCONF_DISABLE_IPV6,
> [...]
>
> As this is UAPI, the new entry must be added at the end of the
> enumeration (well, before DEVCONF_MAX).
>
> Ben.
>
> --
> Ben Hutchings
> Reality is just a crutch for people who can't handle science fiction.

^ permalink raw reply

* [RFC PATCH v2 net-next] net: ipv6: Add a sysctl to make optimistic addresses useful candidates
From: Erik Kline @ 2014-10-21  4:05 UTC (permalink / raw)
  To: netdev; +Cc: ben, lorenzo, hannes, Erik Kline

Add a sysctl that causes an interface's optimistic addresses
to be considered equivalent to other non-deprecated addresses
for source address selection purposes.  Preferred addresses
will still take precedence over optimistic addresses, subject
to other ranking in the source address selection algorithm.

This is useful where different interfaces are connected to
different networks from different ISPs (e.g., a cell network
and a home wifi network).

The current behaviour complies with RFC 3484/6724, and it
makes sense if the host has only one interface, or has
multiple interfaces on the same network (same or cooperating
administrative domain(s), but not in the multiple distinct
networks case.

For example, if a mobile device has an IPv6 address on an LTE
network and then connects to IPv6-enabled wifi, while the wifi
IPv6 address is undergoing DAD, IPv6 connections will try use
the wifi default route with the LTE IPv6 address, and will get
stuck until they time out.

Also, because optimistic addresses can actually be used, issue
an RTM_NEWADDR as soon as DAD starts.  If DAD fails an separate
RTM_DELADDR will be sent.

Also: add an entry in ip-sysctl.txt for optimistic_dad.

Signed-off-by: Erik Kline <ek@google.com>
---
 Documentation/networking/ip-sysctl.txt | 13 ++++++++++++
 include/linux/ipv6.h                   |  1 +
 include/uapi/linux/ipv6.h              |  1 +
 net/ipv6/addrconf.c                    | 36 ++++++++++++++++++++++++++++++++--
 4 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 0307e28..e03cf49 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1452,6 +1452,19 @@ suppress_frag_ndisc - INTEGER
 	1 - (default) discard fragmented neighbor discovery packets
 	0 - allow fragmented neighbor discovery packets
 
+optimistic_dad - BOOLEAN
+	Whether to perform Optimistic Duplicate Address Detection (RFC 4429).
+		0: disabled (default)
+		1: enabled
+
+use_optimistic - BOOLEAN
+	If enabled, do not classify optimistic addresses as deprecated during
+	source address selection.  Preferred addresses will still be chosen
+	before optimistic addresses, subject to other ranking in the source
+	address selection algorithm.
+		0: disabled (default)
+		1: enabled
+
 icmp/*:
 ratelimit - INTEGER
 	Limit the maximal rates for sending ICMPv6 packets.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index ff56053..7121a2e 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -42,6 +42,7 @@ struct ipv6_devconf {
 	__s32		accept_ra_from_local;
 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 	__s32		optimistic_dad;
+	__s32		use_optimistic;
 #endif
 #ifdef CONFIG_IPV6_MROUTE
 	__s32		mc_forwarding;
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index efa2666..e863d08 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -164,6 +164,7 @@ enum {
 	DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL,
 	DEVCONF_SUPPRESS_FRAG_NDISC,
 	DEVCONF_ACCEPT_RA_FROM_LOCAL,
+	DEVCONF_USE_OPTIMISTIC,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 725c763..c2fddb7 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1169,6 +1169,9 @@ enum {
 	IPV6_SADDR_RULE_LABEL,
 	IPV6_SADDR_RULE_PRIVACY,
 	IPV6_SADDR_RULE_ORCHID,
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+	IPV6_SADDR_RULE_NOT_OPTIMISTIC,
+#endif
 	IPV6_SADDR_RULE_PREFIX,
 	IPV6_SADDR_RULE_MAX
 };
@@ -1257,10 +1260,17 @@ static int ipv6_get_saddr_eval(struct net *net,
 		score->scopedist = ret;
 		break;
 	case IPV6_SADDR_RULE_PREFERRED:
+	    {
 		/* Rule 3: Avoid deprecated and optimistic addresses */
+		u8 avoid = IFA_F_DEPRECATED;
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+		if (!score->ifa->idev->cnf.use_optimistic)
+			avoid |= IFA_F_OPTIMISTIC;
+#endif
 		ret = ipv6_saddr_preferred(score->addr_type) ||
-		      !(score->ifa->flags & (IFA_F_DEPRECATED|IFA_F_OPTIMISTIC));
+		      !(score->ifa->flags & avoid);
 		break;
+	    }
 #ifdef CONFIG_IPV6_MIP6
 	case IPV6_SADDR_RULE_HOA:
 	    {
@@ -1299,6 +1309,14 @@ static int ipv6_get_saddr_eval(struct net *net,
 		ret = !(ipv6_addr_orchid(&score->ifa->addr) ^
 			ipv6_addr_orchid(dst->addr));
 		break;
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+	case IPV6_SADDR_RULE_NOT_OPTIMISTIC:
+		/* Optimistic addresses still have lower precedence than other
+		 * preferred addresses.
+		 */
+		ret = !(score->ifa->flags & IFA_F_OPTIMISTIC);
+		break;
+#endif
 	case IPV6_SADDR_RULE_PREFIX:
 		/* Rule 8: Use longest matching prefix */
 		ret = ipv6_addr_diff(&score->ifa->addr, dst->addr);
@@ -3222,8 +3240,13 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
 	 * Optimistic nodes can start receiving
 	 * Frames right away
 	 */
-	if (ifp->flags & IFA_F_OPTIMISTIC)
+	if (ifp->flags & IFA_F_OPTIMISTIC) {
 		ip6_ins_rt(ifp->rt);
+		/* Because optimistic nodes can receive frames, notify
+		 * listeners. If DAD fails, RTM_DELADDR is sent.
+		 */
+		ipv6_ifa_notify(RTM_NEWADDR, ifp);
+	}
 
 	addrconf_dad_kick(ifp);
 out:
@@ -4330,6 +4353,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_ACCEPT_SOURCE_ROUTE] = cnf->accept_source_route;
 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 	array[DEVCONF_OPTIMISTIC_DAD] = cnf->optimistic_dad;
+	array[DEVCONF_USE_OPTIMISTIC] = cnf->use_optimistic;
 #endif
 #ifdef CONFIG_IPV6_MROUTE
 	array[DEVCONF_MC_FORWARDING] = cnf->mc_forwarding;
@@ -5155,6 +5179,14 @@ static struct addrconf_sysctl_table
 			.proc_handler   = proc_dointvec,
 
 		},
+		{
+			.procname       = "use_optimistic",
+			.data           = &ipv6_devconf.use_optimistic,
+			.maxlen         = sizeof(int),
+			.mode           = 0644,
+			.proc_handler   = proc_dointvec,
+
+		},
 #endif
 #ifdef CONFIG_IPV6_MROUTE
 		{
-- 
2.1.0.rc2.206.gedb03e5

^ permalink raw reply related

* Re: [PATCH] netlink: Re-add locking to netlink_lookup() and seq walker
From: Heiko Carstens @ 2014-10-21  7:04 UTC (permalink / raw)
  To: Thomas Graf
  Cc: Eric Dumazet, Sasha Levin, paulmck, Nikolay Aleksandrov,
	David S. Miller, netdev, linux-kernel, Ursula Braun
In-Reply-To: <20141020195355.GA2299@casper.infradead.org>

On Mon, Oct 20, 2014 at 08:53:55PM +0100, Thomas Graf wrote:
> Heiko,
> 
> Can you test the following patch:
> 
> The synchronize_rcu() in netlink_release() introduces unacceptable
> latency. Reintroduce minimal lookup so we can drop the
> synchronize_rcu() until socket destruction has been RCUfied.
> 
> Signed-off-by: Thomas Graf <tgraf@suug.ch>
> ---
>  net/netlink/af_netlink.c | 37 +++++++++++++++++++++++++------------
>  1 file changed, 25 insertions(+), 12 deletions(-)

Thanks a lot! Your patch fixes the issue for me.

Reported-and-tested-by: Heiko Carstens <heiko.carstens@de.ibm.com>

^ permalink raw reply

* RCU stall in af_unix.c, should use spin_lock_irqsave?
From: Thomas Petazzoni @ 2014-10-21  8:03 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet
  Cc: netdev, linux-kernel, Alexandre FOURNIER, Ezequiel Garcia,
	Marcin Wojtas, Gregory Clément

Hello,

I stumbled across a reproducible RCU stall related to the AF_UNIX code
(on 3.17, on an ARM SMP system), and I'm not sure whether the problem
is caused by:

 * The af_unix.c code using spin_lock() on sk->sk_receive_queue.lock
   while it should be using spin_lock_irqsave().

OR

 * The mvpp2 Ethernet driver using on_each_cpu() in a softirq context.

At least, switching to use spin_lock_irqsave() instead of spin_lock()
has proven to fix the issue (see patch below). The spinlock validator
complains that a lockup has been detected, which might indicate that
something is indeed wrong with the spinlock handling.

Now, to the gory details. When stress-testing a lighttpd web server
that does a lot of CGI calls and therefore a lot of Unix socket
traffic, I get typically after couple of minutes the following RCU
stall:

INFO: rcu_sched self-detected stall on CPU { 0}  (t=2100 jiffies g=14665 c=14664 q=1352)
Task dump for CPU 0:
lighttpd        R running      0  1549      1 0x00000002
[<c0014f94>] (unwind_backtrace) from [<c001130c>] (show_stack+0x10/0x14)
[<c001130c>] (show_stack) from [<c0059688>] (rcu_dump_cpu_stacks+0x98/0xd4)
[<c0059688>] (rcu_dump_cpu_stacks) from [<c005c3ec>] (rcu_check_callbacks+0x424/0x740)
[<c005c3ec>] (rcu_check_callbacks) from [<c005e7c8>] (update_process_times+0x40/0x60)
[<c005e7c8>] (update_process_times) from [<c006ce70>] (tick_sched_timer+0x70/0x210)
[<c006ce70>] (tick_sched_timer) from [<c005efc4>] (__run_hrtimer.isra.35+0x6c/0x128)
[<c005efc4>] (__run_hrtimer.isra.35) from [<c005f600>] (hrtimer_interrupt+0x11c/0x2d0)
[<c005f600>] (hrtimer_interrupt) from [<c00148f8>] (twd_handler+0x34/0x44)
[<c00148f8>] (twd_handler) from [<c00557ec>] (handle_percpu_devid_irq+0x6c/0x84)
[<c00557ec>] (handle_percpu_devid_irq) from [<c0051c80>] (generic_handle_irq+0x2c/0x3c)
[<c0051c80>] (generic_handle_irq) from [<c000eafc>] (handle_IRQ+0x40/0x90)
[<c000eafc>] (handle_IRQ) from [<c00086d0>] (gic_handle_irq+0x2c/0x5c)
[<c00086d0>] (gic_handle_irq) from [<c0011e40>] (__irq_svc+0x40/0x54)
Exception stack(0xde0c1ce8 to 0xde0c1d30)
1ce0:                   c073a684 20010113 c06e30fc 00000003 de0c1d30 00000001
1d00: 00000001 0000012c dfbdcc40 ffffe70c dfbdcc48 df5bd800 00000002 de0c1d30
1d20: c00712d4 c00712bc 20010113 ffffffff
[<c0011e40>] (__irq_svc) from [<c00712bc>] (generic_exec_single+0x10c/0x180)
[<c00712bc>] (generic_exec_single) from [<c00713d0>] (smp_call_function_single+0xa0/0xcc)
[<c00713d0>] (smp_call_function_single) from [<c0071818>] (on_each_cpu+0x2c/0x48)
[<c0071818>] (on_each_cpu) from [<c03312dc>] (mvpp2_poll+0x30/0x594)
[<c03312dc>] (mvpp2_poll) from [<c041d024>] (net_rx_action+0xb0/0x170)
[<c041d024>] (net_rx_action) from [<c00220c4>] (__do_softirq+0x120/0x274)
[<c00220c4>] (__do_softirq) from [<c0022468>] (irq_exit+0x78/0xb0)
[<c0022468>] (irq_exit) from [<c000eb00>] (handle_IRQ+0x44/0x90)
[<c000eb00>] (handle_IRQ) from [<c00086d0>] (gic_handle_irq+0x2c/0x5c)
[<c00086d0>] (gic_handle_irq) from [<c0011e40>] (__irq_svc+0x40/0x54)
Exception stack(0xde0c1eb8 to 0xde0c1f00)
1ea0:                                                       de1b986c 00000000
1ec0: 00000420 de1b986c de1b9800 d761c080 be913a34 be913a34 00000007 de0c0000
1ee0: d761c0a0 be913a34 00000010 de0c1f00 c0491898 c0491918 60010013 ffffffff
[<c0011e40>] (__irq_svc) from [<c0491918>] (unix_inq_len+0x9c/0xa8)
[<c0491918>] (unix_inq_len) from [<c049194c>] (unix_ioctl+0x28/0x88)
[<c049194c>] (unix_ioctl) from [<c0407ccc>] (sock_ioctl+0x124/0x280)
[<c0407ccc>] (sock_ioctl) from [<c00c11bc>] (do_vfs_ioctl+0x3fc/0x5c0)
[<c00c11bc>] (do_vfs_ioctl) from [<c00c13b4>] (SyS_ioctl+0x34/0x5c)
[<c00c13b4>] (SyS_ioctl) from [<c000e220>] (ret_fast_syscall+0x0/0x30)
Task dump for CPU 1:
kiplink_admin.f R running      0  1932   1549 0x00000000
[<c0513a04>] (__schedule) from [<00000007>] (0x7)

If my analysis is correct, what happens on CPU0 is that:

 * lighttpd does an ioctl() on a socket, which ends up calling
   unix_inq_len(), which tries to get a spinlock using spin_lock(). The
   lock is probably taken.

 * while waiting for this lock, we get a network RX interrupt, which
   schedules the network RX softirq, which ends up calling the ->poll()
   function of the network driver, in our case mvpp2_poll().

 * since the network hardware has some per-CPU registers that we need
   to read on all CPUs, the network driver does a on_each_cpu() call.
   This apparently leads nowhere, as after a while, the timer interrupt
   kicks in and decides we're not making progress anymore.

After enabling spinlock debugging, I get the following right before the
RCU stall (note how the RCU stall happens on CPU0, while the spinlock
lockup suspected happens on CPU1) :

BUG: spinlock lockup suspected on CPU#1, kiplink_admin.f/1938
 lock: 0xde4998c0, .magic: dead4ead, .owner: lighttpd/1910, .owner_cpu: 0
CPU: 1 PID: 1938 Comm: kiplink_admin.f Tainted: G        W  O   3.17.0-00017-g53fa061 #2
[<c00154d8>] (unwind_backtrace) from [<c001183c>] (show_stack+0x10/0x14)
[<c001183c>] (show_stack) from [<c053f560>] (dump_stack+0x9c/0xbc)
[<c053f560>] (dump_stack) from [<c0057338>] (do_raw_spin_lock+0x118/0x18c)
[<c0057338>] (do_raw_spin_lock) from [<c05466fc>] (_raw_spin_lock_irqsave+0x60/0x6c)
[<c05466fc>] (_raw_spin_lock_irqsave) from [<c042a7d4>] (skb_queue_tail+0x18/0x48)
[<c042a7d4>] (skb_queue_tail) from [<c04b9f58>] (unix_stream_sendmsg+0x1c8/0x36c)
[<c04b9f58>] (unix_stream_sendmsg) from [<c0422eb8>] (sock_aio_write+0xcc/0xec)
[<c0422eb8>] (sock_aio_write) from [<c00bf414>] (do_sync_write+0x80/0xa8)
[<c00bf414>] (do_sync_write) from [<c00bfe60>] (vfs_write+0x108/0x1b0)
[<c00bfe60>] (vfs_write) from [<c00c0418>] (SyS_write+0x40/0x94)
[<c00c0418>] (SyS_write) from [<c000e3a0>] (ret_fast_syscall+0x0/0x48)

And interestingly, skb_queue_tail() is also taking the same spinlock as
unix_inq_len(), except that it does so with spin_lock_irqsave(). And
this is causing the issue: since this spin_lock_irqsave() takes place
on CPU1, the interupts are disabled, and therefore we're not getting
the IPI that allows the on_each_cpu() call coming from CPU0 to make
progress, causing the lockup.

The patch below has proven to fix the issue: I was able to reproduce
the issue in maximum 5 to 10 minutes, and with the patch the system has
survived an entire night of testing.

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e968843..c60205a 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2124,11 +2124,12 @@ long unix_inq_len(struct sock *sk)
 {
        struct sk_buff *skb;
        long amount = 0;
+       unsigned long flags;
 
        if (sk->sk_state == TCP_LISTEN)
                return -EINVAL;
 
-       spin_lock(&sk->sk_receive_queue.lock);
+       spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
        if (sk->sk_type == SOCK_STREAM ||
            sk->sk_type == SOCK_SEQPACKET) {
                skb_queue_walk(&sk->sk_receive_queue, skb)
@@ -2138,7 +2139,7 @@ long unix_inq_len(struct sock *sk)
                if (skb)
                        amount = skb->len;
        }
-       spin_unlock(&sk->sk_receive_queue.lock);
+       spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
 
        return amount;
 }

So, the question is: is this patch the correct solution (but then other
usage of spin_lock in af_unix.c might also need fixing) ? Or is the
network driver at fault?

Thanks for your input,

Thomas
-- 
Thomas Petazzoni, CTO, Free Electrons
Embedded Linux, Kernel and Android engineering
http://free-electrons.com

^ permalink raw reply related

* [net-next 1/2] ip6_tunnel: put ip6tnl0 FB device into 'any' mode
From: Alexey Andriyanov @ 2014-10-21  8:19 UTC (permalink / raw)
  To: Roman Gushchin, netdev; +Cc: Alexey Andriyanov, David Miller, Eric Dumazet

The fallback device is in ipv6 mode by default.
The mode can not be changed in runtime, so there
is no way to decapsulate ip4in6 packets coming from
various sources without creating the specific tunnel
ifaces for each peer.

Cc: David Miller <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Alexey Andriyanov <alan@al-an.info>
---
 net/ipv6/ip6_tunnel.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 9409887..a48f212 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1551,7 +1551,8 @@ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev)
 	if (err)
 		return err;
 
-	t->parms.proto = IPPROTO_IPV6;
+	/* allow any registered unrelying proto for the FB device */
+	t->parms.proto = 0;
 	dev_hold(dev);
 
 	ip6_tnl_link_config(t);
-- 
1.9.1

^ permalink raw reply related

* [net-next 1/2] ip6_tunnel: put ip6tnl0 FB device into 'any' mode
From: Alexey Andriyanov @ 2014-10-21  8:11 UTC (permalink / raw)
  To: netdev; +Cc: Alexey Andriyanov, David Miller, Eric Dumazet

The fallback device is in ipv6 mode by default.
The mode can not be changed in runtime, so there
is no way to decapsulate ip4in6 packets coming from
various sources without creating the specific tunnel
ifaces for each peer.

Cc: David Miller <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Alexey Andriyanov <alan@al-an.info>
---
 net/ipv6/ip6_tunnel.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 9409887..a48f212 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1551,7 +1551,8 @@ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev)
 	if (err)
 		return err;
 
-	t->parms.proto = IPPROTO_IPV6;
+	/* allow any registered unrelying proto for the FB device */
+	t->parms.proto = 0;
 	dev_hold(dev);
 
 	ip6_tnl_link_config(t);
-- 
1.9.1

^ permalink raw reply related

* [net-next 2/2] ip6_tunnel: allow to change mode for the ip6tnl0
From: Alexey Andriyanov @ 2014-10-21  8:19 UTC (permalink / raw)
  To: Roman Gushchin, netdev; +Cc: Alexey Andriyanov, David Miller, Eric Dumazet
In-Reply-To: <1413879596-13726-1-git-send-email-alan@al-an.info>

The fallback device is in ipv6 mode by default.
The mode can not be changed in runtime, so there
is no way to decapsulate ip4in6 packets coming from
various sources without creating the specific tunnel
ifaces for each peer.

This allows to update the fallback tunnel device, but only
the mode could be changed.

Cc: David Miller <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Alexey Andriyanov <alan@al-an.info>
---
 net/ipv6/ip6_tunnel.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index a48f212..0e45ec9 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1285,6 +1285,14 @@ static int ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p)
 	return err;
 }
 
+static int ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p)
+{
+	/* for default tnl0 device allow to change only the proto */
+	t->parms.proto = p->proto;
+	netdev_state_change(t->dev);
+	return 0;
+}
+
 static void
 ip6_tnl_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm *u)
 {
@@ -1384,7 +1392,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 			break;
 		ip6_tnl_parm_from_user(&p1, &p);
 		t = ip6_tnl_locate(net, &p1, cmd == SIOCADDTUNNEL);
-		if (dev != ip6n->fb_tnl_dev && cmd == SIOCCHGTUNNEL) {
+		if (cmd == SIOCCHGTUNNEL) {
 			if (t != NULL) {
 				if (t->dev != dev) {
 					err = -EEXIST;
@@ -1392,8 +1400,10 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 				}
 			} else
 				t = netdev_priv(dev);
-
-			err = ip6_tnl_update(t, &p1);
+			if (dev == ip6n->fb_tnl_dev)
+				err = ip6_tnl0_update(t, &p1);
+			else
+				err = ip6_tnl_update(t, &p1);
 		}
 		if (t) {
 			err = 0;
-- 
1.9.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox