Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH RFC v5 net 3/3] ipv6: Avoid redoing fib6_lookup() with reachable = 0 by saving fn
From: Martin KaFai Lau @ 2014-10-20 20:42 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Hannes Frederic Sowa
In-Reply-To: <1413837765-5446-1-git-send-email-kafai@fb.com>

This patch saves the fn before doing fib6_backtrack().
Hence, without redo-ing the fib6_lookup(), saved_fn can be used
to redo rt6_select() with RT6_LOOKUP_F_REACHABLE off.

Some minor changes I think make sense to review as a single patch:
* Remove the 'out:' goto label.
* Remove the 'reachable' variable. Only use the 'strict' variable instead.

After this patch, "failing ip6_ins_rt()" should be the only case that
requires a redo of fib6_lookup().

Cc: David Miller <davem@davemloft.net>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 net/ipv6/route.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 98c523f..c910831 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -917,31 +917,40 @@ static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
 				      struct flowi6 *fl6, int flags)
 {
-	struct fib6_node *fn;
+	struct fib6_node *fn, *saved_fn;
 	struct rt6_info *rt, *nrt;
 	int strict = 0;
 	int attempts = 3;
 	int err;
-	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
 
 	strict |= flags & RT6_LOOKUP_F_IFACE;
+	if (net->ipv6.devconf_all->forwarding == 0)
+		strict |= RT6_LOOKUP_F_REACHABLE;
 
 redo_fib6_lookup_lock:
 	read_lock_bh(&table->tb6_lock);
 
-redo_fib6_lookup:
 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+	saved_fn = fn;
 
 redo_rt6_select:
-	rt = rt6_select(fn, oif, strict | reachable);
+	rt = rt6_select(fn, oif, strict);
 	if (rt->rt6i_nsiblings)
-		rt = rt6_multipath_select(rt, fl6, oif, strict | reachable);
+		rt = rt6_multipath_select(rt, fl6, oif, strict);
 	if (rt == net->ipv6.ip6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
 		if (fn)
 			goto redo_rt6_select;
-		else
-			goto out;
+		else if (strict & RT6_LOOKUP_F_REACHABLE) {
+			/* also consider unreachable route */
+			strict &= ~RT6_LOOKUP_F_REACHABLE;
+			fn = saved_fn;
+			goto redo_rt6_select;
+		} else {
+			dst_hold(&rt->dst);
+			read_unlock_bh(&table->tb6_lock);
+			goto out2;
+		}
 	}
 
 	dst_hold(&rt->dst);
@@ -977,13 +986,6 @@ redo_rt6_select:
 	ip6_rt_put(rt);
 	goto redo_fib6_lookup_lock;
 
-out:
-	if (reachable) {
-		reachable = 0;
-		goto redo_fib6_lookup;
-	}
-	dst_hold(&rt->dst);
-	read_unlock_bh(&table->tb6_lock);
 out2:
 	rt->dst.lastuse = jiffies;
 	rt->dst.__use++;
-- 
1.8.1

^ permalink raw reply related

* [PATCH RFC v5 net 1/3] ipv6: Remove BACKTRACK macro
From: Martin KaFai Lau @ 2014-10-20 20:42 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Hannes Frederic Sowa
In-Reply-To: <1413837765-5446-1-git-send-email-kafai@fb.com>

It is the prep work to reduce the number of calls to fib6_lookup().

The BACKTRACK macro could be hard-to-read and error-prone due to
its side effects (mainly goto).

This patch is to:
1. Replace BACKTRACK macro with a function (fib6_backtrack) with the following
   return values:
   * If it is backtrack-able, returns next fn for retry.
   * If it reaches the root, returns NULL.
2. The caller needs to decide if a backtrack is needed (by testing
   rt == net->ipv6.ip6_null_entry).
3. Rename the goto labels in ip6_pol_route() to make the next few
   patches easier to read.

Cc: David Miller <davem@davemloft.net>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 net/ipv6/route.c | 70 ++++++++++++++++++++++++++++++++------------------------
 1 file changed, 40 insertions(+), 30 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a318dd89..f1ab2f4 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -772,23 +772,22 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 }
 #endif
 
-#define BACKTRACK(__net, saddr)			\
-do { \
-	if (rt == __net->ipv6.ip6_null_entry) {	\
-		struct fib6_node *pn; \
-		while (1) { \
-			if (fn->fn_flags & RTN_TL_ROOT) \
-				goto out; \
-			pn = fn->parent; \
-			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
-				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
-			else \
-				fn = pn; \
-			if (fn->fn_flags & RTN_RTINFO) \
-				goto restart; \
-		} \
-	} \
-} while (0)
+static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
+					struct in6_addr *saddr)
+{
+	struct fib6_node *pn;
+	while (1) {
+		if (fn->fn_flags & RTN_TL_ROOT)
+			return NULL;
+		pn = fn->parent;
+		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
+			fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
+		else
+			fn = pn;
+		if (fn->fn_flags & RTN_RTINFO)
+			return fn;
+	}
+}
 
 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 					     struct fib6_table *table,
@@ -804,8 +803,11 @@ restart:
 	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
 	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
 		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
-	BACKTRACK(net, &fl6->saddr);
-out:
+	if (rt == net->ipv6.ip6_null_entry) {
+		fn = fib6_backtrack(fn, &fl6->saddr);
+		if (fn)
+			goto restart;
+	}
 	dst_use(&rt->dst, jiffies);
 	read_unlock_bh(&table->tb6_lock);
 	return rt;
@@ -924,19 +926,25 @@ static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 
 	strict |= flags & RT6_LOOKUP_F_IFACE;
 
-relookup:
+redo_fib6_lookup_lock:
 	read_lock_bh(&table->tb6_lock);
 
-restart_2:
+redo_fib6_lookup:
 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 
-restart:
+redo_rt6_select:
 	rt = rt6_select(fn, oif, strict | reachable);
 	if (rt->rt6i_nsiblings)
 		rt = rt6_multipath_select(rt, fl6, oif, strict | reachable);
-	BACKTRACK(net, &fl6->saddr);
-	if (rt == net->ipv6.ip6_null_entry ||
-	    rt->rt6i_flags & RTF_CACHE)
+	if (rt == net->ipv6.ip6_null_entry) {
+		fn = fib6_backtrack(fn, &fl6->saddr);
+		if (fn)
+			goto redo_rt6_select;
+		else
+			goto out;
+	}
+
+	if (rt->rt6i_flags & RTF_CACHE)
 		goto out;
 
 	dst_hold(&rt->dst);
@@ -967,12 +975,12 @@ restart:
 	 * released someone could insert this route.  Relookup.
 	 */
 	ip6_rt_put(rt);
-	goto relookup;
+	goto redo_fib6_lookup_lock;
 
 out:
 	if (reachable) {
 		reachable = 0;
-		goto restart_2;
+		goto redo_fib6_lookup;
 	}
 	dst_hold(&rt->dst);
 	read_unlock_bh(&table->tb6_lock);
@@ -1235,10 +1243,12 @@ restart:
 		rt = net->ipv6.ip6_null_entry;
 	else if (rt->dst.error) {
 		rt = net->ipv6.ip6_null_entry;
-		goto out;
+	} else if (rt == net->ipv6.ip6_null_entry) {
+		fn = fib6_backtrack(fn, &fl6->saddr);
+		if (fn)
+			goto restart;
 	}
-	BACKTRACK(net, &fl6->saddr);
-out:
+
 	dst_hold(&rt->dst);
 
 	read_unlock_bh(&table->tb6_lock);
-- 
1.8.1

^ permalink raw reply related

* [PATCH RFC v5 net 2/3] ipv6: Avoid redoing fib6_lookup() for RTF_CACHE hit case
From: Martin KaFai Lau @ 2014-10-20 20:42 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Hannes Frederic Sowa
In-Reply-To: <1413837765-5446-1-git-send-email-kafai@fb.com>

When there is a RTF_CACHE hit, no need to redo fib6_lookup()
with reachable=0.

Cc: David Miller <davem@davemloft.net>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 net/ipv6/route.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f1ab2f4..98c523f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -944,12 +944,12 @@ redo_rt6_select:
 			goto out;
 	}
 
-	if (rt->rt6i_flags & RTF_CACHE)
-		goto out;
-
 	dst_hold(&rt->dst);
 	read_unlock_bh(&table->tb6_lock);
 
+	if (rt->rt6i_flags & RTF_CACHE)
+		goto out2;
+
 	if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)))
 		nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
 	else if (!(rt->dst.flags & DST_HOST))
-- 
1.8.1

^ permalink raw reply related

* [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route()
From: Martin KaFai Lau @ 2014-10-20 20:42 UTC (permalink / raw)
  To: netdev

Hi,

This patch set is trying to reduce the number of fib6_lookup()
calls from ip6_pol_route().

I have adapted davem's udpflooda and kbench_mod test
(https://git.kernel.org/pub/scm/linux/kernel/git/davem/net_test_tools.git) to
support IPv6 and here is the result:


Before:
[root]# for i in $(seq 1 3); do time ./udpflood -l 20000000 -c 250 2401:face:face:face::2; done

real    0m34.190s
user    0m3.047s
sys     0m31.108s

real    0m34.635s
user    0m3.125s
sys     0m31.475s

real    0m34.517s
user    0m3.034s
sys     0m31.449s

[root]# insmod ip6_route_kbench.ko oif=2 src=2401:face:face:face::1 dst=2401:face:face:face::2
[  660.160976] ip6_route_kbench: ip6_route_output tdiff: 933
[  660.207261] ip6_route_kbench: ip6_route_output tdiff: 988
[  660.253492] ip6_route_kbench: ip6_route_output tdiff: 896
[  660.298862] ip6_route_kbench: ip6_route_output tdiff: 898

After:
[root]# for i in $(seq 1 3); do time ./udpflood -l 20000000 -c 250 2401:face:face:face::2; done

real    0m32.695s
user    0m2.925s
sys     0m29.737s

real    0m32.636s
user    0m3.007s
sys     0m29.596s

real    0m32.797s
user    0m2.866s
sys     0m29.898s

[root]# insmod ip6_route_kbench.ko oif=2 src=2401:face:face:face::1 dst=2401:face:face:face::2
[  881.220793] ip6_route_kbench: ip6_route_output tdiff: 684
[  881.253477] ip6_route_kbench: ip6_route_output tdiff: 640
[  881.286867] ip6_route_kbench: ip6_route_output tdiff: 630
[  881.320749] ip6_route_kbench: ip6_route_output tdiff: 653


/****************************** udpflood.c ******************************/
/* It is an adaptation of the Eric Dumazet's and David Miller's
 * udpflood tool, by adding IPv6 support.
 */

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <malloc.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <stdint.h>
#include <assert.h>

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#define _GNU_SOURCE
#include <getopt.h>

typedef uint32_t u32;

/* Set to 1 by the -d option; print_saddr() prints nothing while it is 0. */
static int debug = 0;

/* Allow -fstrict-aliasing: a single union exposing the generic, IPv4 and
 * IPv6 views of one socket address, so each field is accessed through the
 * member of the matching type.
 */
typedef union sa_u {
	struct sockaddr_storage a46;	/* generic view; ss_family is read through it */
	struct sockaddr_in a4;		/* IPv4 view */
	struct sockaddr_in6 a6;		/* IPv6 view */
} sa_u;

/* Print the command-line synopsis and return -1, so callers can write
 * "return usage();" directly.
 *
 * The option letters listed here mirror the getopt() string used by main()
 * ("dl:s:p:c:"): the message size is -s, not -m.
 */
static int usage(void)
{
	printf("usage: udpflood [ -d ] [ -l count ] [ -s message_size ] "
	       "[ -p port ] [ -c num_ip_addrs ] IP_ADDRESS\n");
	return -1;
}

/* Return the last 32 bits of the address held in @sa, in host byte order.
 *
 * For PF_INET that is the whole IPv4 address; any other family is read as
 * IPv6 and the final 32-bit word of the address is returned.
 */
static u32 get_last32h(const sa_u *sa)
{
	if (sa->a46.ss_family == PF_INET)
		return ntohl(sa->a4.sin_addr.s_addr);
	else
		return ntohl(sa->a6.sin6_addr.s6_addr32[3]);
}

/* Store @last32h (host byte order) into the last 32 bits of the address
 * held in @sa, converting it to network byte order.
 *
 * For PF_INET this overwrites the whole IPv4 address; any other family is
 * handled as IPv6 and only the final 32-bit word is replaced.
 */
static void set_last32h(sa_u *sa, u32 last32h)
{
	if (sa->a46.ss_family == PF_INET)
		sa->a4.sin_addr.s_addr = htonl(last32h);
	else
		sa->a6.sin6_addr.s6_addr32[3] = htonl(last32h);
}

/* Debug helper: print "@msg: <address>" for the address held in @sa.
 *
 * Does nothing unless the global debug flag is set.  An address family
 * other than PF_INET/PF_INET6, or a failing inet_ntop(), is reported
 * instead of printing the (then uninitialised) text buffer.
 */
static void print_saddr(const sa_u *sa, const char *msg)
{
	char buf[64];
	const void *addr;

	if (!debug)
		return;

	switch (sa->a46.ss_family) {
	case PF_INET:
		addr = &sa->a4.sin_addr;
		break;
	case PF_INET6:
		addr = &sa->a6.sin6_addr;
		break;
	default:
		printf("%s: <unknown address family>\n", msg);
		return;
	}

	if (!inet_ntop(sa->a46.ss_family, addr, buf, sizeof(buf))) {
		perror("inet_ntop");
		return;
	}

	printf("%s: %s\n", msg, buf);
}

static int send_packets(const sa_u *sa, size_t num_addrs, int count, int ms=
g_sz)
{
	char *msg =3D malloc(msg_sz);
	sa_u saddr;
	u32 start_addr32h, end_addr32h, cur_addr32h;
	int fd, i, err;

	if (!msg)
		return -ENOMEM;

	memset(msg, 0, msg_sz);

	memcpy(&saddr, sa, sizeof(saddr));
	cur_addr32h =3D start_addr32h =3D get_last32h(&saddr);
	end_addr32h =3D start_addr32h + num_addrs;

	fd =3D socket(saddr.a46.ss_family, SOCK_DGRAM, 0);
	if (fd < 0) {
		perror("socket");
		err =3D fd;
		goto out_nofd;
	}

	/* connect to avoid the kernel spending time in figuring
	 * out the source address (i.e pin the src address)
	 */
	err =3D connect(fd, (struct sockaddr *) &saddr, sizeof(saddr));
	if (err < 0) {
		perror("connect");
		goto out;
	}

	print_saddr(&saddr, "start_addr");
	for (i =3D 0; i < count; i++) {
		print_saddr(&saddr, "sendto");
		err =3D sendto(fd, msg, msg_sz, 0, (struct sockaddr *)&saddr,
			     sizeof(saddr));
		if (err < 0) {
			perror("sendto");
			goto out;
		}

		if (++cur_addr32h >=3D end_addr32h)
			cur_addr32h =3D start_addr32h;
		set_last32h(&saddr, cur_addr32h);
	}

	err =3D 0;
out:
	close(fd);
out_nofd:
	free(msg);
	return err;
}

/* Parse the command line and start the flood.
 *
 * Options: -d enable debug output, -l datagram count, -s message size,
 * -p destination port, -c number of consecutive destination addresses.
 * The single positional argument is the first destination address and is
 * tried as IPv4 first, then as IPv6.
 *
 * Returns the result of send_packets(), or -1 (via usage()) on a bad
 * command line.
 */
int main(int argc, char **argv, char **envp)
{
	int port = 6000;
	int msg_sz = 32;
	int count = 10000000;
	int num_addrs = 1;
	int opt;
	sa_u start_addr;

	(void)envp;

	while ((opt = getopt(argc, argv, "dl:s:p:c:")) != -1) {
		switch (opt) {
		case 'l':
			if (sscanf(optarg, "%d", &count) != 1)
				return usage();
			break;
		case 's':
			if (sscanf(optarg, "%d", &msg_sz) != 1)
				return usage();
			break;
		case 'p':
			if (sscanf(optarg, "%d", &port) != 1)
				return usage();
			break;
		case 'c':
			if (sscanf(optarg, "%d", &num_addrs) != 1)
				return usage();
			break;
		case 'd':
			debug = 1;
			break;
		case '?':
		default:
			return usage();
		}
	}

	if (num_addrs < 1)
		return usage();

	if (!argv[optind])
		return usage();

	/* Zero the whole address first: for IPv6 the flow info and scope id
	 * are never assigned below and must not be left as stack garbage.
	 */
	memset(&start_addr, 0, sizeof(start_addr));

	/* inet_pton() returns 1 only on a successful parse; 0 and -1 both
	 * mean the text was not converted.
	 */
	if (inet_pton(PF_INET, argv[optind], &start_addr.a4.sin_addr) == 1) {
		start_addr.a46.ss_family = PF_INET;
		start_addr.a4.sin_port = htons(port);
	} else if (inet_pton(PF_INET6, argv[optind],
			     &start_addr.a6.sin6_addr) == 1) {
		start_addr.a46.ss_family = PF_INET6;
		start_addr.a6.sin6_port = htons(port);
	} else {
		return usage();
	}

	return send_packets(&start_addr, num_addrs, count, msg_sz);
}

/****************** ip6_route_kbench_mod.c ******************/
#define pr_fmt(fmt) "ip6_route_kbench: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/inet.h>
#include <linux/in6.h>

#include <net/route.h>
#include <net/ip6_route.h>

#include <linux/timex.h>
#include <uapi/linux/icmpv6.h>

/* We can't just use "get_cycles()" as on some platforms, such
 * as sparc64, that gives system cycles rather than cpu clock
 * cycles.
 */

#if defined(CONFIG_SPARC64)
/* sparc64: read the %tick register directly. */
static inline unsigned long long get_tick(void)
{
	unsigned long long ticks;

	__asm__ __volatile__("rd %%tick, %0" : "=r" (ticks));
	return ticks;
}
#elif defined(CONFIG_X86)
/* x86: take the counter value through rdtscll(). */
static inline unsigned long long get_tick(void)
{
	unsigned long long tsc;

	rdtscll(tsc);
	return tsc;
}
#elif defined(CONFIG_POWERPC)
/* powerpc: get_cycles() is used as-is. */
static inline unsigned long long get_tick(void)
{
	return get_cycles();
}
#else
#error Unsupported architecture, please implement get_tick()
#endif

/* Number of warm-up lookups run before the single timed lookup. */
#define DEFAULT_WARMUP_COUNT 100000

/* Built-in defaults for the flow description.
 * NOTE(review): DEFAULT_DST_IP_ADDR and DEFAULT_SRC_IP_ADDR are 32-bit
 * values and are not referenced anywhere in the code shown here; the IPv6
 * addresses below simply start out zeroed.
 */
#define DEFAULT_DST_IP_ADDR	0x4a800001
#define DEFAULT_SRC_IP_ADDR	0x00000000
#define DEFAULT_OIF		0
#define DEFAULT_IIF		0
#define DEFAULT_MARK		0x00000000
#define DEFAULT_TOS		0x00

/* Flow description used for every lookup; see flow_init()/fake_skb_get(). */
static int flow_oif = DEFAULT_OIF;
static int flow_iif = DEFAULT_IIF;
static u32 flow_mark = DEFAULT_MARK;
static struct in6_addr flow_dst_ip_addr;
static struct in6_addr flow_src_ip_addr;
static int flow_tos = DEFAULT_TOS;

/* Raw text of the "dst"/"src" module parameters, parsed by flow_setup(). */
static char dst_string[64];
static char src_string[64];

module_param_string(dst, dst_string, sizeof(dst_string), 0);
module_param_string(src, src_string, sizeof(src_string), 0);

/* Parse the optional "dst" and "src" module parameters into
 * flow_dst_ip_addr and flow_src_ip_addr.  An empty string is skipped and
 * leaves the corresponding address as it was.
 *
 * Returns 0 on success, or -1 if a supplied string is rejected by
 * in6_pton() (the offending string is logged).
 */
static int __init flow_setup(void)
{
	if (dst_string[0] &&
	    !in6_pton(dst_string, -1, &flow_dst_ip_addr.s6_addr[0], -1, NULL)) {
		pr_info("cannot parse \"%s\"\n", dst_string);
		return -1;
	}

	if (src_string[0] &&
	    !in6_pton(src_string, -1, &flow_src_ip_addr.s6_addr[0], -1, NULL)) {
		/* log the source string: that is the one that failed here */
		pr_info("cannot parse \"%s\"\n", src_string);
		return -1;
	}

	return 0;
}

/* Scalar flow parameters: each module parameter name on the left sets the
 * flow_* variable on the right.
 */
module_param_named(oif, flow_oif, int, 0);
module_param_named(iif, flow_iif, int, 0);
module_param_named(mark, flow_mark, uint, 0);
module_param_named(tos, flow_tos, int, 0);

/* "count": how many warm-up lookups precede each timed lookup. */
static int warmup_count = DEFAULT_WARMUP_COUNT;
module_param_named(count, warmup_count, int, 0);

/* Reset @fl6 to all zeroes, then fill it from the module-level flow_*
 * settings; the protocol is always ICMPv6.
 */
static void flow_init(struct flowi6 *fl6)
{
	memset(fl6, 0, sizeof(*fl6));

	/* endpoints */
	fl6->saddr = flow_src_ip_addr;
	fl6->daddr = flow_dst_ip_addr;

	/* interfaces */
	fl6->flowi6_iif = flow_iif;
	fl6->flowi6_oif = flow_oif;

	/* classification */
	fl6->flowi6_proto = IPPROTO_ICMPV6;
	fl6->flowi6_mark = flow_mark;
	fl6->flowi6_tos = flow_tos;
}

/* Build the fake ICMPv6 packet used by the input-route benchmark.
 *
 * The skb is bound to the device with index flow_iif and carries an IPv6
 * header filled from the flow_* settings.
 *
 * Returns the skb, or NULL (after logging) if the allocation fails or the
 * input device does not exist.
 */
static struct sk_buff *fake_skb_get(void)
{
	struct sk_buff *skb = alloc_skb(4096, GFP_KERNEL);
	struct ipv6hdr *ip6h;

	if (!skb) {
		pr_info("Cannot alloc SKB for test\n");
		return NULL;
	}

	skb->dev = __dev_get_by_index(&init_net, flow_iif);
	if (!skb->dev) {
		pr_info("Input device (%d) does not exist\n", flow_iif);
		kfree_skb(skb);
		return NULL;
	}

	/* header offsets are recorded before the reserve, as in the original */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);
	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));

	ip6h = ipv6_hdr(skb);
	ip6h->version = 6;
	ip6h->priority = 0;
	memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl));
	ip6h->payload_len = htons(sizeof(struct icmp6hdr));
	ip6h->nexthdr = IPPROTO_ICMPV6;
	ip6h->saddr = flow_src_ip_addr;
	ip6h->daddr = flow_dst_ip_addr;

	skb->protocol = htons(ETH_P_IPV6);
	skb->mark = flow_mark;

	return skb;
}

/* Time one ip6_route_output() call.
 *
 * Runs warmup_count untimed lookups first, then a single lookup bracketed
 * by get_tick() and logs the tick difference.  If a warm-up lookup returns
 * an error pointer the error is logged and nothing is timed.
 */
static void do_full_output_lookup_bench(void)
{
	unsigned long long t1, t2;
	struct rt6_info *rt;
	struct flowi6 fl6;
	int i;

	for (i = 0; i < warmup_count; i++) {
		flow_init(&fl6);

		rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl6);
		if (IS_ERR(rt)) {
			/* name the function that was actually called */
			pr_info("ip6_route_output: err=%ld\n", PTR_ERR(rt));
			return;
		}
		ip6_rt_put(rt);
	}

	flow_init(&fl6);

	t1 = get_tick();
	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl6);
	t2 = get_tick();
	if (!IS_ERR(rt))
		ip6_rt_put(rt);

	pr_info("ip6_route_output tdiff: %llu\n", t2 - t1);
}

/* Inspect the route that ip6_route_input() attached to @skb, then drop it.
 * Returns non-zero when no route was attached or it is the null entry.
 */
static int input_lookup_failed(struct sk_buff *skb)
{
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int failed = (!rt || rt == init_net.ipv6.ip6_null_entry);

	skb_dst_drop(skb);
	return failed;
}

/* Time one ip6_route_input() call on a fake packet.
 *
 * Runs warmup_count untimed lookups first, then a single lookup bracketed
 * by get_tick(), all with bottom halves disabled.  Any failed lookup is
 * logged and ends the run without a timing line.
 */
static void do_full_input_lookup_bench(void)
{
	unsigned long long start, end;
	struct sk_buff *skb = fake_skb_get();
	int failed = 0;
	int i;

	if (!skb)
		return;

	local_bh_disable();
	for (i = 0; i < warmup_count; i++) {
		ip6_route_input(skb);
		failed = input_lookup_failed(skb);
		if (failed)
			break;
	}
	local_bh_enable();

	if (failed) {
		pr_info("Input route lookup fails\n");
		goto out_free;
	}

	local_bh_disable();
	start = get_tick();
	ip6_route_input(skb);
	end = get_tick();
	local_bh_enable();

	if (input_lookup_failed(skb)) {
		pr_info("Input route lookup fails\n");
		goto out_free;
	}

	pr_info("ip6_route_input tdiff: %llu\n", end - start);

out_free:
	kfree_skb(skb);
}

/* Run the input-path benchmark when an input interface (flow_iif) was
 * given, otherwise the output-path benchmark.
 */
static void do_full_lookup_bench(void)
{
	if (flow_iif) {
		do_full_input_lookup_bench();
		return;
	}

	do_full_output_lookup_bench();
}

/* Run the lookup benchmark four times back to back. */
static void do_bench(void)
{
	int run;

	for (run = 0; run < 4; run++)
		do_full_lookup_bench();
}

/* Module entry point: parse the parameters, log the flow being tested,
 * run the benchmark and return.
 *
 * Returns -EINVAL if the address parameters cannot be parsed or (on x86)
 * no TSC is available; otherwise -ENODEV once the benchmark has run.
 */
static int __init kbench_init(void)
{
	if (flow_setup())
		return -EINVAL;

	/* NOTE(review): IP6_FMT and IP6_PRT are not defined anywhere in the
	 * code shown here -- confirm where they come from.
	 */
	pr_info("flow [IIF(%d),OIF(%d),MARK(0x%08x),D("IP6_FMT"),"
		"S("IP6_FMT"),TOS(0x%02x)]\n",
		flow_iif, flow_oif, flow_mark,
		IP6_PRT(flow_dst_ip_addr),
		IP6_PRT(flow_src_ip_addr),
		flow_tos);

#if defined(CONFIG_X86)
	/* the x86 get_tick() relies on rdtscll() */
	if (!cpu_has_tsc) {
		pr_err("X86 TSC is required, but is unavailable.\n");
		return -EINVAL;
	}
#endif

	pr_info("sizeof(struct rt6_info)==%zu\n", sizeof(struct rt6_info));

	do_bench();

	/* Always an error return, even after a successful run -- presumably
	 * so the module never stays loaded and can be insmod'ed again.
	 */
	return -ENODEV;
}

/* Module exit hook: intentionally empty, there is nothing to tear down
 * here (kbench_init() always returns an error).
 */
static void __exit kbench_exit(void)
{
}

/* Register the entry/exit hooks and declare the module licence. */
module_init(kbench_init);
module_exit(kbench_exit);
MODULE_LICENSE("GPL");

^ permalink raw reply

* Re: localed stuck in recent 3.18 git in copy_net_ns?
From: Dave Jones @ 2014-10-20 20:43 UTC (permalink / raw)
  To: Kevin Fenzi; +Cc: netdev, linux-kernel
In-Reply-To: <20141020141515.0688bf33@voldemort.scrye.com>

On Mon, Oct 20, 2014 at 02:15:15PM -0600, Kevin Fenzi wrote:

 > I'm seeing suspend/resume failures with recent 3.18 git kernels. 
 > 
 > Full dmesg at: http://paste.fedoraproject.org/143615/83287914/
 > 
 > The possibly interesting parts: 
 > 
 > [   78.373144] PM: Syncing filesystems ... done.
 > [   78.411180] PM: Preparing system for mem sleep
 > [   78.411995] Freezing user space processes ... 
 > [   98.429955] Freezing of tasks failed after 20.001 seconds (1 tasks refusing to freeze, wq_busy=0):
 > [   98.429971] (-localed)      D ffff88025f214c80     0  1866      1 0x00000084
 > [   98.429975]  ffff88024e777df8 0000000000000086 ffff88009b4444b0 0000000000014c80
 > [   98.429978]  ffff88024e777fd8 0000000000014c80 ffff880250ffb110 ffff88009b4444b0
 > [   98.429981]  0000000000000000 ffffffff81cec1a0 ffffffff81cec1a4 ffff88009b4444b0
 > [   98.429983] Call Trace:
 > [   98.429991]  [<ffffffff8175d619>] schedule_preempt_disabled+0x29/0x70
 > [   98.429994]  [<ffffffff8175f433>] __mutex_lock_slowpath+0xb3/0x120
 > [   98.429997]  [<ffffffff8175f4c3>] mutex_lock+0x23/0x40
 > [   98.430001]  [<ffffffff8163e325>] copy_net_ns+0x75/0x140
 > [   98.430005]  [<ffffffff810b8c2d>] create_new_namespaces+0xfd/0x1a0
 > [   98.430008]  [<ffffffff810b8e5a>] unshare_nsproxy_namespaces+0x5a/0xc0
 > [   98.430012]  [<ffffffff81098813>] SyS_unshare+0x193/0x340
 > [   98.430015]  [<ffffffff817617a9>] system_call_fastpath+0x12/0x17

I've seen similar soft lockup traces from the sys_unshare path when running my
fuzz tester.  It seems that if you create enough network namespaces,
it can take a huge amount of time for them to be iterated.
(Running trinity with '-c unshare' you can see the slow down happen. In
 some cases, it takes so long that the watchdog process kills it --
 though the SIGKILL won't get delivered until the unshare() completes)

Any idea what this machine had been doing prior to this that may have
involved creating lots of namespaces ?

	Dave

^ permalink raw reply

* Re: Routing BUG with ppp over l2tp
From: James Carlson @ 2014-10-20 20:22 UTC (permalink / raw)
  To: Alan Stern; +Cc: James Chapman, linux-ppp, netdev
In-Reply-To: <Pine.LNX.4.44L0.1410201414500.2403-100000@iolanthe.rowland.org>

On 10/20/14 15:45, Alan Stern wrote:
> On Mon, 20 Oct 2014, James Carlson wrote:
>> Indeed!  That's pretty darned lame behavior by that peer.  It would
>> probably be workable if you had a virtual router instance and were able
>> to put the L2TP connection in one routing instance and the PPP
>> connection in another routing instance, but that's likely not at all
>> simple to achieve.
> 
> I'd like to find the simplest solution.  Ideally it should "just work", 
> like the Windows and OS-X clients do.

I'm not an expert on Windows networking internals.  I assume OS X is BSD
+ whatever the folks in Cupertino have done to it.  :-/

At a guess, it's living on the edge.  It works because the L2TP
connection establishment caches a pointer to the output forwarding table
entry ("route") and just keeps living with it no matter what actually
happens down the line.

On Linux (and likely many other systems), the output computation is a
bit more dynamic, and the establishment of a direct point-to-point link
to a given IP address (as the PPP link represents) causes existing
cached pointers to get flushed away.  Future packets to that destination
(IP _always_ forwards based on destination, not source) go down the most
direct path.  Point-to-point is as direct as you can get.

It may be possible to modify the L2TP code to use flags to avoid the PPP
link (MSG_DONTROUTE?), but I suspect that's probably a bad rather than a
good thing to do.

>>> Unfortunately, I can't work around this problem by reconfiguring the
>>> VPN server -- there's no way to tell it to use a different IP address
>>> for its end of the VPN tunnel.  Furthermore, the server works just fine
>>> with clients running Windows or OS-X.
>>
>> Really?  That seems ... improbable.
> 
> I guess that depends on how you judge probabilities.  :-)

:-/

> Internet:
> Destination        Gateway            Flags    Refs      Use  Netif Expire
> default            140.247.233.37     UGSc        2        4   ppp0
> 10                 ppp0               USc         1        0   ppp0

That's *quite* interesting!  The PPP link doesn't have an interface
route as you'd find on most other systems.  Instead, it has what appears
to be an effectively unnumbered link.  Note the "ppp0" there instead of
an actual output address and the happy use of "10" for the local address
+ mask.

For what it's worth, the forced IP address option I've suggested is
morally equivalent to what's being done here on OS X, so that's a fair
reason to recommend it.

I checked out the pppd on Mac OS X (Darwin 13.4.0; Mavericks), and it
looks to be a variant of the SAMBA/ANU/CMU pppd, but I'm not sure what's
different with it, and I know of no contributions from them.  And the
BSD support is long gone from the main source base ...

> I don't understand (and can't be bothered to look up) those arcane
> symbols in the netstat output.  The IP address used for the ping test
> (10.160.0.2) is a system on the VPN's private network.

The flags aren't all that interesting.  "Up" "Gateway" "Static"
"cloning" are all expected in this context.

>> As long as you don't need to contact that specific remote server using
>> the badly-assigned "internal" VPN address and can live with the fact
>> that you'll either go through the regular Internet to that address or be
>> forced to use some other address configured on that server, you should
>> be good.
>>
>> (The address I used above is 10.160.0.2.  That was one of the internal
>> DNS server addresses provided in the log you posted.  It's not necessary
>> that the address used here is exactly that, but it may well be helpful.)
> 
> That might work.  But using a nonstandard version of pppd would be
> awkward, and I would prefer to avoid it.

What's "non-standard?"

Having the ability to force a given remote IP address looks to me like a
perfectly reasonable thing to do.  We allow the remote IP address to be
set arbitrarily when the peer (for whatever reason) refuses to divulge
its address, and this is just an extension of that idea.

>> If you can't do that for some reason, then I suppose it would be
>> possible to use IP Chains (or whatever the packet-modification tool du
>> jure is used in your Linux distribution) to nail up an exception so that
>> the outside packets go to the outside interface and the inside ones go
>> to the PPP interface.  Doing that likely requires selecting on (at
>> least!) source address, so it's messy and ugly and possibly error-prone,
>> but it might be doable.
> 
> That sounds like a fairly easy thing to try.  But it would still 
> require manual intervention instead of just working.  Fixing the kernel 
> would be preferable, IMO.

I don't quite agree that it's necessarily "broken."

I do agree that it's bad to crash due to this misconfiguration.  That's
certainly a bug of some sort.  But making the kernel "naturally" accept
that the same unicast remote IP address refers to different outputs
depending on phase-of-moon in order to make this weird server happy
sounds like adding a bug rather than fixing one.

Routing based on destination is a good thing.

>> Otherwise, contact the maintainer of that VPN server.  It's just plain
>> old broken, and life's too short for broken software.
> 
> It is an old Cisco security appliance, no doubt well past End-Of-Life.  
> I'm starting to think it might be preferable to throw the thing away 
> and start up a VPN server on the department's firewall (which is a 
> Linux box) instead.

That sounds like a good (and easier to support) solution.

-- 
James Carlson         42.703N 71.076W         <carlsonj@workingcode.com>

^ permalink raw reply

* localed stuck in recent 3.18 git in copy_net_ns?
From: Kevin Fenzi @ 2014-10-20 20:15 UTC (permalink / raw)
  To: netdev, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 2849 bytes --]

Greetings. 

I'm seeing suspend/resume failures with recent 3.18 git kernels. 

Full dmesg at: http://paste.fedoraproject.org/143615/83287914/

The possibly interesting parts: 

[   78.373144] PM: Syncing filesystems ... done.
[   78.411180] PM: Preparing system for mem sleep
[   78.411995] Freezing user space processes ... 
[   98.429955] Freezing of tasks failed after 20.001 seconds (1 tasks refusing to freeze, wq_busy=0):
[   98.429971] (-localed)      D ffff88025f214c80     0  1866      1 0x00000084
[   98.429975]  ffff88024e777df8 0000000000000086 ffff88009b4444b0 0000000000014c80
[   98.429978]  ffff88024e777fd8 0000000000014c80 ffff880250ffb110 ffff88009b4444b0
[   98.429981]  0000000000000000 ffffffff81cec1a0 ffffffff81cec1a4 ffff88009b4444b0
[   98.429983] Call Trace:
[   98.429991]  [<ffffffff8175d619>] schedule_preempt_disabled+0x29/0x70
[   98.429994]  [<ffffffff8175f433>] __mutex_lock_slowpath+0xb3/0x120
[   98.429997]  [<ffffffff8175f4c3>] mutex_lock+0x23/0x40
[   98.430001]  [<ffffffff8163e325>] copy_net_ns+0x75/0x140
[   98.430005]  [<ffffffff810b8c2d>] create_new_namespaces+0xfd/0x1a0
[   98.430008]  [<ffffffff810b8e5a>] unshare_nsproxy_namespaces+0x5a/0xc0
[   98.430012]  [<ffffffff81098813>] SyS_unshare+0x193/0x340
[   98.430015]  [<ffffffff817617a9>] system_call_fastpath+0x12/0x17

[   98.430032] Restarting tasks ... done.
[   98.480361] PM: Syncing filesystems ... done.
[   98.571645] PM: Preparing system for freeze sleep
[   98.571779] Freezing user space processes ... 
[  118.592086] Freezing of tasks failed after 20.003 seconds (1 tasks refusing to freeze, wq_busy=0):
[  118.592102] (-localed)      D ffff88025f214c80     0  1866      1 0x00000084
[  118.592106]  ffff88024e777df8 0000000000000086 ffff88009b4444b0 0000000000014c80
[  118.592109]  ffff88024e777fd8 0000000000014c80 ffff880250ffb110 ffff88009b4444b0
[  118.592111]  0000000000000000 ffffffff81cec1a0 ffffffff81cec1a4 ffff88009b4444b0
[  118.592114] Call Trace:
[  118.592121]  [<ffffffff8175d619>] schedule_preempt_disabled+0x29/0x70
[  118.592125]  [<ffffffff8175f433>] __mutex_lock_slowpath+0xb3/0x120
[  118.592127]  [<ffffffff8175f4c3>] mutex_lock+0x23/0x40
[  118.592132]  [<ffffffff8163e325>] copy_net_ns+0x75/0x140
[  118.592136]  [<ffffffff810b8c2d>] create_new_namespaces+0xfd/0x1a0
[  118.592139]  [<ffffffff810b8e5a>] unshare_nsproxy_namespaces+0x5a/0xc0
[  118.592143]  [<ffffffff81098813>] SyS_unshare+0x193/0x340
[  118.592146]  [<ffffffff817617a9>] system_call_fastpath+0x12/0x17

[  118.592163] Restarting tasks ... done.

root         6  0.0  0.0      0     0 ?        D    13:49   0:00 [kworker/u16:0]
root      1876  0.0  0.0  41460  5784 ?        Ds   13:49   0:00 (-localed)

I'll try and bisect this, but perhaps it rings bells already for folks. 

kevin


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* [PATCH] netlink: Re-add locking to netlink_lookup() and seq walker
From: Thomas Graf @ 2014-10-20 19:53 UTC (permalink / raw)
  To: Heiko Carstens
  Cc: Eric Dumazet, Sasha Levin, paulmck, Nikolay Aleksandrov,
	David S. Miller, netdev, linux-kernel, Ursula Braun
In-Reply-To: <20141020082107.GB4268@osiris>

Heiko,

Can you test the following patch:

The synchronize_rcu() in netlink_release() introduces unacceptable
latency. Reintroduce minimal lookup so we can drop the
synchronize_rcu() until socket destruction has been RCUfied.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
 net/netlink/af_netlink.c | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 7a186e7..f1de72d 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -96,6 +96,14 @@ static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);
 static int netlink_dump(struct sock *sk);
 static void netlink_skb_destructor(struct sk_buff *skb);
 
+/* nl_table locking explained:
+ * Lookup and traversal are protected with nl_sk_hash_lock or nl_table_lock
+ * combined with an RCU read-side lock. Insertion and removal are protected
+ * with nl_sk_hash_lock while using RCU list modification primitives and may
+ * run in parallel to nl_table_lock protected lookups. Destruction of the
+ * Netlink socket may only occur *after* nl_table_lock has been acquired
+ * either during or after the socket has been removed from the list.
+ */
 DEFINE_RWLOCK(nl_table_lock);
 EXPORT_SYMBOL_GPL(nl_table_lock);
 static atomic_t nl_table_users = ATOMIC_INIT(0);
@@ -109,10 +117,10 @@ EXPORT_SYMBOL_GPL(nl_sk_hash_lock);
 static int lockdep_nl_sk_hash_is_held(void)
 {
 #ifdef CONFIG_LOCKDEP
-	return (debug_locks) ? lockdep_is_held(&nl_sk_hash_lock) : 1;
-#else
-	return 1;
+	if (debug_locks)
+		return lockdep_is_held(&nl_sk_hash_lock) || lockdep_is_held(&nl_table_lock);
 #endif
+	return 1;
 }
 
 static ATOMIC_NOTIFIER_HEAD(netlink_chain);
@@ -1028,11 +1036,13 @@ static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid)
 	struct netlink_table *table = &nl_table[protocol];
 	struct sock *sk;
 
+	read_lock(&nl_table_lock);
 	rcu_read_lock();
 	sk = __netlink_lookup(table, portid, net);
 	if (sk)
 		sock_hold(sk);
 	rcu_read_unlock();
+	read_unlock(&nl_table_lock);
 
 	return sk;
 }
@@ -1257,9 +1267,6 @@ static int netlink_release(struct socket *sock)
 	}
 	netlink_table_ungrab();
 
-	/* Wait for readers to complete */
-	synchronize_net();
-
 	kfree(nlk->groups);
 	nlk->groups = NULL;
 
@@ -1281,6 +1288,7 @@ static int netlink_autobind(struct socket *sock)
 
 retry:
 	cond_resched();
+	netlink_table_grab();
 	rcu_read_lock();
 	if (__netlink_lookup(table, portid, net)) {
 		/* Bind collision, search negative portid values. */
@@ -1288,9 +1296,11 @@ retry:
 		if (rover > -4097)
 			rover = -4097;
 		rcu_read_unlock();
+		netlink_table_ungrab();
 		goto retry;
 	}
 	rcu_read_unlock();
+	netlink_table_ungrab();
 
 	err = netlink_insert(sk, net, portid);
 	if (err == -EADDRINUSE)
@@ -2921,14 +2931,16 @@ static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos)
 }
 
 static void *netlink_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(RCU)
+	__acquires(nl_table_lock) __acquires(RCU)
 {
+	read_lock(&nl_table_lock);
 	rcu_read_lock();
 	return *pos ? netlink_seq_socket_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 }
 
 static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
+	struct rhashtable *ht;
 	struct netlink_sock *nlk;
 	struct nl_seq_iter *iter;
 	struct net *net;
@@ -2943,19 +2955,19 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	iter = seq->private;
 	nlk = v;
 
-	rht_for_each_entry_rcu(nlk, nlk->node.next, node)
+	i = iter->link;
+	ht = &nl_table[i].hash;
+	rht_for_each_entry(nlk, nlk->node.next, ht, node)
 		if (net_eq(sock_net((struct sock *)nlk), net))
 			return nlk;
 
-	i = iter->link;
 	j = iter->hash_idx + 1;
 
 	do {
-		struct rhashtable *ht = &nl_table[i].hash;
 		const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
 
 		for (; j < tbl->size; j++) {
-			rht_for_each_entry_rcu(nlk, tbl->buckets[j], node) {
+			rht_for_each_entry(nlk, tbl->buckets[j], ht, node) {
 				if (net_eq(sock_net((struct sock *)nlk), net)) {
 					iter->link = i;
 					iter->hash_idx = j;
@@ -2971,9 +2983,10 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 }
 
 static void netlink_seq_stop(struct seq_file *seq, void *v)
-	__releases(RCU)
+	__releases(RCU) __releases(nl_table_lock)
 {
 	rcu_read_unlock();
+	read_unlock(&nl_table_lock);
 }
 
 
-- 
1.9.3

^ permalink raw reply related

* Re: Routing BUG with ppp over l2tp
From: Alan Stern @ 2014-10-20 19:45 UTC (permalink / raw)
  To: James Carlson; +Cc: James Chapman, linux-ppp, netdev
In-Reply-To: <5445442C.8080508@workingcode.com>

On Mon, 20 Oct 2014, James Carlson wrote:

> On 10/20/14 12:39, Alan Stern wrote:
> > As far as I can tell, the problem is caused by bad routing.  The kernel
> > gets confused because the IP address assigned by the VPN server to the
> > server's end of the ppp tunnel is the _same_ as the server's actual IP
> > address.
> 
> Indeed!  That's pretty darned lame behavior by that peer.  It would
> probably be workable if you had a virtual router instance and were able
> to put the L2TP connection in one routing instance and the PPP
> connection in another routing instance, but that's likely not at all
> simple to achieve.

I'd like to find the simplest solution.  Ideally it should "just work", 
like the Windows and OS-X clients do.

> > Unfortunately, I can't work around this problem by reconfiguring the
> > VPN server -- there's no way to tell it to use a different IP address
> > for its end of the VPN tunnel.  Furthermore, the server works just fine
> > with clients running Windows or OS-X.
> 
> Really?  That seems ... improbable.

I guess that depends on how you judge probabilities.  :-)

As evidence to convince you, here's a log of a session on a rather old
Mac Powerbook G4 running OS 10.4.11.  The situation isn't exactly the
same as with my Linux system, because for this test the client and the
VPN server are on the same subnet -- I don't think that should make any
difference.  The client's IP address is 140.247.233.41, the server's is
.37, and the router to the outside world is .33.  The client's ppp IP
address (assigned by the server) is 10.170.30.1.

The following commands were carried out while the VPN was connected:


------------------------------------------------------------------------
michael-burns-powerbook-g4:~ stern$ netstat -rn -f inet
Routing tables

Internet:
Destination        Gateway            Flags    Refs      Use  Netif Expire
default            140.247.233.37     UGSc        2        4   ppp0
10                 ppp0               USc         1        0   ppp0
127                127.0.0.1          UCS         0        0    lo0
127.0.0.1          127.0.0.1          UH         12     2278    lo0
140.247.233.32/27  link#4             UCS         2        0    en0
140.247.233.33     0:8:e3:ff:fc:b8    UHLW        0        0    en0   1198
140.247.233.37     0:1e:f7:15:53:a8   UHLW        3       10    en0   1153
140.247.233.37/32  link#4             UCS         1        0    en0
140.247.233.41     127.0.0.1          UHS         0        0    lo0
169.254            link#4             UCS         0        0    en0

michael-burns-powerbook-g4:~ stern$ ping -c1 -n 10.160.0.2
PING 10.160.0.2 (10.160.0.2): 56 data bytes
64 bytes from 10.160.0.2: icmp_seq=0 ttl=64 time=1.368 ms

--- 10.160.0.2 ping statistics ---
1 packets transmitted, 1 packets received, 0% packet loss
round-trip min/avg/max/stddev = 1.368/1.368/1.368/nan ms

michael-burns-powerbook-g4:~ stern$ ifconfig en0
en0: flags=8863<UP,BROADCAST,SMART,RUNNING,SIMPLEX,MULTICAST> mtu 1500
        inet 140.247.233.41 netmask 0xffffffe0 broadcast 140.247.233.63
        ether 00:03:93:12:da:48 
        media: autoselect (100baseTX <full-duplex>) status: active
        supported media: none autoselect 10baseT/UTP <half-duplex> 10baseT/UTP <full-duplex> 10baseT/UTP <full-duplex,hw-loopback> 100baseTX <half-duplex> 100baseTX <full-duplex> 100baseTX <full-duplex,hw-loopback>

michael-burns-powerbook-g4:~ stern$ ifconfig ppp0
ppp0: flags=8051<UP,POINTOPOINT,RUNNING,MULTICAST> mtu 1280
        inet 10.170.30.1 --> 140.247.233.37 netmask 0xff000000 
------------------------------------------------------------------------


I don't understand (and can't be bothered to look up) those arcane
symbols in the netstat output.  The IP address used for the ping test
(10.160.0.2) is a system on the VPN's private network.

Here's comparable output for a connection from a computer running 
Windows 7 (same IP addresses as before):


------------------------------------------------------------------------
C:\Users\stern>netstat -rn
===========================================================================
Interface List
 20...........................Rowland VPN
 10...00 1a 6b 57 30 02 ......Intel(R) 82566DM Gigabit Network Connection
  1...........................Software Loopback Interface 1
 11...00 00 00 00 00 00 00 e0 Microsoft ISATAP Adapter
 12...00 00 00 00 00 00 00 e0 Teredo Tunneling Pseudo-Interface
 19...00 00 00 00 00 00 00 e0 Microsoft 6to4 Adapter
 21...00 00 00 00 00 00 00 e0 Microsoft ISATAP Adapter #2
===========================================================================

IPv4 Route Table
===========================================================================
Active Routes:
Network Destination        Netmask          Gateway       Interface  Metric
          0.0.0.0          0.0.0.0   140.247.233.33   140.247.233.41   4491
          0.0.0.0          0.0.0.0         On-link       10.170.30.1     11
      10.170.30.1  255.255.255.255         On-link       10.170.30.1    266
        127.0.0.0        255.0.0.0         On-link         127.0.0.1   4531
        127.0.0.1  255.255.255.255         On-link         127.0.0.1   4531
  127.255.255.255  255.255.255.255         On-link         127.0.0.1   4531
   140.247.233.32  255.255.255.224         On-link    140.247.233.41   4491
   140.247.233.37  255.255.255.255         On-link    140.247.233.41   4236
   140.247.233.41  255.255.255.255         On-link    140.247.233.41   4491
   140.247.233.63  255.255.255.255         On-link    140.247.233.41   4491
        224.0.0.0        240.0.0.0         On-link         127.0.0.1   4531
        224.0.0.0        240.0.0.0         On-link    140.247.233.41   4492
        224.0.0.0        240.0.0.0         On-link       10.170.30.1     11
  255.255.255.255  255.255.255.255         On-link         127.0.0.1   4531
  255.255.255.255  255.255.255.255         On-link    140.247.233.41   4491
  255.255.255.255  255.255.255.255         On-link       10.170.30.1    266
===========================================================================
Persistent Routes:
  Network Address          Netmask  Gateway Address  Metric
          0.0.0.0          0.0.0.0   140.247.233.33  Default
===========================================================================

C:\Users\stern>ping -n 1 10.160.0.2

Pinging 10.160.0.2 with 32 bytes of data:
Reply from 10.160.0.2: bytes=32 time<1ms TTL=64

Ping statistics for 10.160.0.2:
    Packets: Sent = 1, Received = 1, Lost = 0 (0% loss),
Approximate round trip times in milli-seconds:
    Minimum = 0ms, Maximum = 0ms, Average = 0ms

C:\Users\stern>ipconfig /all

Windows IP Configuration

   Host Name . . . . . . . . . . . . : Windows-test
   Primary Dns Suffix  . . . . . . . : rowland.org
   Node Type . . . . . . . . . . . . : Hybrid
   IP Routing Enabled. . . . . . . . : No
   WINS Proxy Enabled. . . . . . . . : No
   DNS Suffix Search List. . . . . . : rowland.org

PPP adapter Rowland VPN:

   Connection-specific DNS Suffix  . :
   Description . . . . . . . . . . . : Rowland VPN
   Physical Address. . . . . . . . . :
   DHCP Enabled. . . . . . . . . . . : No
   Autoconfiguration Enabled . . . . : Yes
   IPv4 Address. . . . . . . . . . . : 10.170.30.1(Preferred)
   Subnet Mask . . . . . . . . . . . : 255.255.255.255
   Default Gateway . . . . . . . . . : 0.0.0.0
   DNS Servers . . . . . . . . . . . : 10.160.0.2
                                       10.160.0.3
   Primary WINS Server . . . . . . . : 10.160.0.2
   NetBIOS over Tcpip. . . . . . . . : Disabled

Ethernet adapter Local Area Connection:

   Connection-specific DNS Suffix  . :
   Description . . . . . . . . . . . : Intel(R) 82566DM Gigabit Network Connection
   Physical Address. . . . . . . . . : 00-1A-6B-57-30-02
   DHCP Enabled. . . . . . . . . . . : No
   Autoconfiguration Enabled . . . . : Yes
   Link-local IPv6 Address . . . . . : fe80::1426:1891:bf83:3982%10(Preferred)
   IPv4 Address. . . . . . . . . . . : 140.247.233.41(Preferred)
   Subnet Mask . . . . . . . . . . . : 255.255.255.224
   Default Gateway . . . . . . . . . : 140.247.233.33
   DHCPv6 IAID . . . . . . . . . . . : 234887787
   DHCPv6 Client DUID. . . . . . . . : 00-01-00-01-1B-A2-3E-B1-00-1A-6B-57-30-02

   DNS Servers . . . . . . . . . . . : 8.8.8.8
   NetBIOS over Tcpip. . . . . . . . : Enabled
------------------------------------------------------------------------


Although the Windows ipconfig output doesn't show the IP address of the
server side of the ppp tunnel, it does show up in a Details window
under the Network control panel, and it is indeed set to
140.247.233.37.

> > So it looks like the problem has to be fixed either in the kernel or in 
> > the way pppd sets up its routing entry.  Can you guys help?
> 
> I think the easiest solution is to configure pppd to lie to the kernel
> about the remote address.  Who cares what the remote address is on a
> point-to-point link anyway?
> 
> There's currently no option to do this, but the code change in ipcp_up()
> in pppd/ipcp.c would be rather simple.  Just make the "noremoteip" code
> run all the time:
> 
> /* Deliberately falsify the remote address.  We don't care. */
> ho->hisaddr = htonl(0x0aa00002);
> 
> As long as you don't need to contact that specific remote server using
> the badly-assigned "internal" VPN address and can live with the fact
> that you'll either go through the regular Internet to that address or be
> forced to use some other address configured on that server, you should
> be good.
> 
> (The address I used above is 10.160.0.2.  That was one of the internal
> DNS server addresses provided in the log you posted.  It's not necessary
> that the address used here is exactly that, but it may well be helpful.)

That might work.  But using a nonstandard version of pppd would be
awkward, and I would prefer to avoid it.

> If you can't do that for some reason, then I suppose it would be
> possible to use IP Chains (or whatever packet-modification tool du
> jour is used in your Linux distribution) to nail up an exception so that
> the outside packets go to the outside interface and the inside ones go
> to the PPP interface.  Doing that likely requires selecting on (at
> least!) source address, so it's messy and ugly and possibly error-prone,
> but it might be doable.

That sounds like a fairly easy thing to try.  But it would still 
require manual intervention instead of just working.  Fixing the kernel 
would be preferable, IMO.

> Otherwise, contact the maintainer of that VPN server.  It's just plain
> old broken, and life's too short for broken software.

It is an old Cisco security appliance, no doubt well past End-Of-Life.  
I'm starting to think it might be preferable to throw the thing away 
and start up a VPN server on the department's firewall (which is a 
Linux box) instead.

Alan Stern


^ permalink raw reply

* Re: [PATCH] drivers: net: xgene: Add missing initialization in xgene_enet_ecc_init()
From: Iyappan Subramanian @ 2014-10-20 18:20 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: David S. Miller, Keyur Chudgar, netdev,
	linux-kernel@vger.kernel.org
In-Reply-To: <1413792496-8558-1-git-send-email-geert@linux-m68k.org>

On Mon, Oct 20, 2014 at 1:08 AM, Geert Uytterhoeven
<geert@linux-m68k.org> wrote:
> drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.c: In function ‘xgene_enet_ecc_init’:
> drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.c:126: warning: ‘data’ may be used uninitialized in this function
>
> Depending on the arbitrary value on the stack, the loop may terminate
> too early, and cause a bogus -ENODEV failure.
>
> Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
> ---
>  drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.c b/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.c
> index e6d24c2101982444..19e13583b4259cd4 100644
> --- a/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.c
> +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_sgmac.c
> @@ -123,7 +123,7 @@ static u32 xgene_enet_rd_mac(struct xgene_enet_pdata *p, u32 rd_addr)
>  static int xgene_enet_ecc_init(struct xgene_enet_pdata *p)
>  {
>         struct net_device *ndev = p->ndev;
> -       u32 data;
> +       u32 data = 0;
>         int i;
>
>         xgene_enet_wr_diag_csr(p, ENET_CFG_MEM_RAM_SHUTDOWN_ADDR, 0);
> --
> 1.9.1
>

Acked-by: Iyappan Subramanian <isubramanian@apm.com>

^ permalink raw reply

* Re: [PATCH v2 0/3] net: minor gso encapsulation fixes
From: David Miller @ 2014-10-20 18:04 UTC (permalink / raw)
  To: fw; +Cc: netdev, edumazet, therbert
In-Reply-To: <1413805758-30026-1-git-send-email-fw@strlen.de>

From: Florian Westphal <fw@strlen.de>
Date: Mon, 20 Oct 2014 13:49:15 +0200

> The following series fixes a minor bug in the gso segmentation handlers
> when encapsulation offload is used.
> 
> Theoretically this could cause kernel panic when the stack tries
> to software-segment such a GRE offload packet, but it looks like there
> is only one affected call site (tbf scheduler) and it handles NULL
> return value.
> 
> I've included a followup patch to add IS_ERR_OR_NULL checks where needed.
> 
> While looking into this, I also found that size computation of the individual
> segments is incorrect if skb->encapsulation is set.
> 
> Please see individual patches for delta vs. v1.

Series applied, thanks Florian.

Longer term I'd really like to see the ops->gso_segment() implementations not
return NULL, but rather locally determined pointer error codes instead.

^ permalink raw reply

* Re: Adding new packet scheduler
From: Cong Wang @ 2014-10-20 17:59 UTC (permalink / raw)
  To: Josh Clark; +Cc: netdev
In-Reply-To: <CAHmvzZS3bN1qErryz7Mps-dOveBk1_d7MJo-sqbVDMaoeu7_HA@mail.gmail.com>

On Mon, Oct 20, 2014 at 10:51 AM, Josh Clark <jcinma@gmail.com> wrote:
>
> That all makes a lot of sense. What do I need to do to apply these
> patches on a live system? I have access to a network made of virtual
> machines with standard Ubuntu 14.04, to which I have SSH access.
> There's no way for me to upload a different image to use.

https://www.kernel.org/doc/Documentation/networking/netdev-FAQ.txt

Q: How do the changes posted to netdev make their way into Linux?

A: There are always two trees (git repositories) in play.  Both are driven
   by David Miller, the main network maintainer.  There is the "net" tree,
   and the "net-next" tree.  As you can probably guess from the names, the
   net tree is for fixes to existing code already in the mainline tree from
   Linus, and net-next is where the new code goes for the future release.
   You can find the trees here:

http://git.kernel.org/?p=linux/kernel/git/davem/net.git
http://git.kernel.org/?p=linux/kernel/git/davem/net-next.git

^ permalink raw reply

* Re: Adding new packet scheduler
From: Josh Clark @ 2014-10-20 17:51 UTC (permalink / raw)
  To: Cong Wang; +Cc: netdev
In-Reply-To: <CAHA+R7PScLKyhF7V+RWahL1Ovfjb7eCO+Hh-aDYchPrfvAUDHg@mail.gmail.com>

On Mon, Oct 20, 2014 at 1:37 PM, Cong Wang <cwang@twopensource.com> wrote:
> On Mon, Oct 20, 2014 at 10:32 AM, Josh Clark <jcinma@gmail.com> wrote:
>> Hi everyone,
>>
>> I'm a student at NC State University, and I'm working on a project to
>> implement some new classful AQM algorithms and test their
>> effectiveness. However, I'm getting hung up on how to get the new
>> algorithm set up in the kernel.
>>
>> From what I've looked at, it looks like I need to add my code to
>> /net/sched/, and edit both the Kconfig and the Makefile to be able to
>> add my code as another module.
>
> Basically yes, take a look at:
> http://lwn.net/Articles/577208/
>
>>
>> Finally, in order to use the new scheduler, I need to select it using
>> the tc command. What do I need to do to add my algorithm to the tc
>> command options?
>
> Read the existing qdisc's in tc, for example:
> http://git.kernel.org/cgit/linux/kernel/git/shemminger/iproute2.git/tree/tc/q_fq.c

That all makes a lot of sense. What do I need to do to apply these
patches on a live system? I have access to a network made of virtual
machines with standard Ubuntu 14.04, to which I have SSH access.
There's no way for me to upload a different image to use.

^ permalink raw reply

* Re: Adding new packet scheduler
From: Dave Taht @ 2014-10-20 17:46 UTC (permalink / raw)
  To: Josh Clark; +Cc: netdev@vger.kernel.org
In-Reply-To: <CAHmvzZQALDuwMPZC_ZncEP=BerniYis9Mx=oObUGNAY4wUGTZg@mail.gmail.com>

On Mon, Oct 20, 2014 at 10:32 AM, Josh Clark <jcinma@gmail.com> wrote:
> Hi everyone,
>
> I'm a student at NC State University, and I'm working on a project to
> implement some new classful AQM algorithms and test their
> effectiveness. However, I'm getting hung up on how to get the new
> algorithm set up in the kernel.
>
> From what I've looked at, it looks like I need to add my code to
> /net/sched/, and edit both the Kconfig and the Makefile to be able to
> add my code as another module.
>
> Finally, in order to use the new scheduler, I need to select it using
> the tc command. What do I need to do to add my algorithm to the tc
> command options?
>
> Any insight, articles, READMEs, or criticism you have for me is welcome.
>
> Thank you for all your help!

To give you an idea here are some currently out of tree, out of date,
and highly experimental codel patches of mine
that add in some new qdiscs, and patch the right places, with the
exception of patching include/uapi/linux/pkt_sched.h (which exports
the stuff to userspace)

https://github.com/dtaht/cerowrt-3.10/blob/master/target/linux/generic/patches-3.10/680-codel-add-experimental-codel-and-fq_codel-versions.patch

to add stuff to tc, you patch the iproute2 utility to match what is
exported from pkt_sched.h.




>
>
>
> -Josh Clark
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



-- 
Dave Täht

http://www.bufferbloat.net/projects/bloat/wiki/Upcoming_Talks

^ permalink raw reply

* Re: Adding new packet scheduler
From: Cong Wang @ 2014-10-20 17:37 UTC (permalink / raw)
  To: Josh Clark; +Cc: netdev
In-Reply-To: <CAHmvzZQALDuwMPZC_ZncEP=BerniYis9Mx=oObUGNAY4wUGTZg@mail.gmail.com>

On Mon, Oct 20, 2014 at 10:32 AM, Josh Clark <jcinma@gmail.com> wrote:
> Hi everyone,
>
> I'm a student at NC State University, and I'm working on a project to
> implement some new classful AQM algorithms and test their
> effectiveness. However, I'm getting hung up on how to get the new
> algorithm set up in the kernel.
>
> From what I've looked at, it looks like I need to add my code to
> /net/sched/, and edit both the Kconfig and the Makefile to be able to
> add my code as another module.

Basically yes, take a look at:
http://lwn.net/Articles/577208/

>
> Finally, in order to use the new scheduler, I need to select it using
> the tc command. What do I need to do to add my algorithm to the tc
> command options?

Read the existing qdisc's in tc, for example:
http://git.kernel.org/cgit/linux/kernel/git/shemminger/iproute2.git/tree/tc/q_fq.c

^ permalink raw reply

* Adding new packet scheduler
From: Josh Clark @ 2014-10-20 17:32 UTC (permalink / raw)
  To: netdev

Hi everyone,

I'm a student at NC State University, and I'm working on a project to
implement some new classful AQM algorithms and test their
effectiveness. However, I'm getting hung up on how to get the new
algorithm set up in the kernel.

From what I've looked at, it looks like I need to add my code to
/net/sched/, and edit both the Kconfig and the Makefile to be able to
add my code as another module.

Finally, in order to use the new scheduler, I need to select it using
the tc command. What do I need to do to add my algorithm to the tc
command options?

Any insight, articles, READMEs, or criticism you have for me is welcome.

Thank you for all your help!

-Josh Clark

^ permalink raw reply

* Re: Routing BUG with ppp over l2tp
From: James Carlson @ 2014-10-20 17:19 UTC (permalink / raw)
  To: Alan Stern, James Chapman, Michal Ostrowski; +Cc: linux-ppp, netdev
In-Reply-To: <Pine.LNX.4.44L0.1410201152580.2403-100000@iolanthe.rowland.org>

On 10/20/14 12:39, Alan Stern wrote:
> As far as I can tell, the problem is caused by bad routing.  The kernel
> gets confused because the IP address assigned by the VPN server to the
> server's end of the ppp tunnel is the _same_ as the server's actual IP
> address.

Indeed!  That's pretty darned lame behavior by that peer.  It would
probably be workable if you had a virtual router instance and were able
to put the L2TP connection in one routing instance and the PPP
connection in another routing instance, but that's likely not at all
simple to achieve.

> Unfortunately, I can't work around this problem by reconfiguring the
> VPN server -- there's no way to tell it to use a different IP address
> for its end of the VPN tunnel.  Furthermore, the server works just fine
> with clients running Windows or OS-X.

Really?  That seems ... improbable.

> So it looks like the problem has to be fixed either in the kernel or in 
> the way pppd sets up its routing entry.  Can you guys help?

I think the easiest solution is to configure pppd to lie to the kernel
about the remote address.  Who cares what the remote address is on a
point-to-point link anyway?

There's currently no option to do this, but the code change in ipcp_up()
in pppd/ipcp.c would be rather simple.  Just make the "noremoteip" code
run all the time:

/* Deliberately falsify the remote address.  We don't care. */
ho->hisaddr = htonl(0x0aa00002);

As long as you don't need to contact that specific remote server using
the badly-assigned "internal" VPN address and can live with the fact
that you'll either go through the regular Internet to that address or be
forced to use some other address configured on that server, you should
be good.

(The address I used above is 10.160.0.2.  That was one of the internal
DNS server addresses provided in the log you posted.  It's not necessary
that the address used here is exactly that, but it may well be helpful.)

If you can't do that for some reason, then I suppose it would be
possible to use IP Chains (or whatever packet-modification tool du
jour is used in your Linux distribution) to nail up an exception so that
the outside packets go to the outside interface and the inside ones go
to the PPP interface.  Doing that likely requires selecting on (at
least!) source address, so it's messy and ugly and possibly error-prone,
but it might be doable.

Otherwise, contact the maintainer of that VPN server.  It's just plain
old broken, and life's too short for broken software.

-- 
James Carlson         42.703N 71.076W         <carlsonj@workingcode.com>

^ permalink raw reply

* Routing BUG with ppp over l2tp
From: Alan Stern @ 2014-10-20 16:39 UTC (permalink / raw)
  To: James Chapman, Michal Ostrowski; +Cc: linux-ppp, netdev

James and Michal:

I'm having a problem setting up a VPN connection that uses ppp over l2tp
over ipsec (this shows up under both 3.16 and 3.17-rc7).  The ipsec
part is working fine, and xl2tpd sets up its connection okay.  The
problem arises when ppp starts up.

As far as I can tell, the problem is caused by bad routing.  The kernel
gets confused because the IP address assigned by the VPN server to the
server's end of the ppp tunnel is the _same_ as the server's actual IP
address.

Here are some details.  My local address is 192.168.0.203 (behind a 
NAT-ing wireless router).  The VPN server is 140.247.233.37, as you can 
see from this entry in the system log:

Oct 13 17:10:27 saphir NetworkManager: xl2tpd[2616]: Connecting to host 140.247.233.37, port 1701

The addresses of the ppp tunnel endpoints are given later in the log:

Oct 13 17:10:30 saphir pppd[2618]: local  IP address 10.170.30.1
Oct 13 17:10:30 saphir pppd[2618]: remote IP address 140.247.233.37

The overall status from NetworkManager shows up in the log like this:

Oct 13 17:10:30 saphir NetworkManager: ** Message: L2TP service (IP Config Get) reply received.
Oct 13 17:10:30 saphir NetworkManager[439]: <info> VPN connection 'Rowland' (IP4 Config Get) reply received from old-style plugin.
Oct 13 17:10:30 saphir NetworkManager[439]: <info> VPN Gateway: 140.247.233.37
Oct 13 17:10:30 saphir NetworkManager[439]: <info> Tunnel Device: ppp0
Oct 13 17:10:30 saphir NetworkManager[439]: <info> IPv4 configuration:
Oct 13 17:10:30 saphir NetworkManager[439]: <info>   Internal Address: 10.170.30.1
Oct 13 17:10:30 saphir NetworkManager[439]: <info>   Internal Prefix: 32
Oct 13 17:10:30 saphir NetworkManager[439]: <info>   Internal Point-to-Point Address: 140.247.233.37
Oct 13 17:10:30 saphir NetworkManager[439]: <info>   Maximum Segment Size (MSS): 0
Oct 13 17:10:30 saphir NetworkManager[439]: <info>   Forbid Default Route: no
Oct 13 17:10:30 saphir NetworkManager[439]: <info>   Internal DNS: 10.160.0.2
Oct 13 17:10:30 saphir NetworkManager[439]: <info>   Internal DNS: 10.160.0.3
Oct 13 17:10:30 saphir NetworkManager[439]: <info>   DNS Domain: '(none)'
Oct 13 17:10:30 saphir NetworkManager[439]: <info> No IPv6 configuration
Oct 13 17:10:30 saphir NetworkManager[439]: <info> VPN connection 'Rowland' (IP Config Get) complete.

Once the ppp tunnel was set up, xl2tpd started getting errors:

Oct 13 17:11:31 saphir NetworkManager: xl2tpd[2616]: network_thread: select timeout
Oct 13 17:11:32 saphir NetworkManager: xl2tpd[2616]: network_thread: select timeout
Oct 13 17:11:32 saphir NetworkManager: xl2tpd[2616]: Maximum retries exceeded for tunnel 33716.  Closing.
Oct 13 17:11:32 saphir NetworkManager: xl2tpd[2616]: Connection 147 closed to 140.247.233.37, port 1701 (Timeout)

Packet-level debugging showed that once I reached this stage, the
control messages sent by xl2tpd were not received by the server.  I 
believe this is because they were not routed correctly.

Unfortunately, at the moment I don't have a copy of the routing table.  
Nevertheless, it definitely appears that the packets xl2tpd wanted to
send directly to the VPN server were instead routed back through the
ppp tunnel!  Presumably this was because the routing table contained
two entries with their destinations both set to 140.247.233.37/32 (one
for the l2tp connection and one for the ppp tunnel), and the kernel
used the wrong entry.

In fact, on several occasions during testing, the system deadlocked.  I 
was able to get a stack dump:

[ 2214.970639] BUG: soft lockup - CPU#1 stuck for 22s! [pppd:9423]
[ 2214.970648] Modules linked in: l2tp_ppp l2tp_netlink l2tp_core pppoe pppox ppp_generic slhc authenc cmac rmd160 crypto_null ip_vti ip_tunnel af_key ah6 ah4 esp6 esp4 xfrm4_mode_beet xfrm4_tunnel tunnel4 xfrm4_mode_tunnel xfrm4_mode_transport xfrm6_mode_transport xfrm6_mode_ro xfrm6_mode_beet xfrm6_mode_tunnel ipcomp ipcomp6 xfrm6_tunnel tunnel6 xfrm_ipcomp salsa20_i586 camellia_generic cast6_generic cast5_generic cast_common deflate cts gcm ccm serpent_sse2_i586 serpent_generic glue_helper blowfish_generic blowfish_common twofish_generic twofish_i586 twofish_common xcbc sha512_generic des_generic geode_aes tpm_rng tpm timeriomem_rng virtio_rng uas usb_storage fuse ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 nf_conntrack_ipv4 nf_defrag_ipv4 ip6table_filter xt_conntrack ip6_tables nf_con
 ntrack vfat
[ 2214.970769]  fat snd_hda_codec_realtek snd_hda_codec_generic snd_hda_intel arc4 iwldvm snd_hda_controller snd_hda_codec uvcvideo videobuf2_vmalloc videobuf2_memops videobuf2_core v4l2_common videodev mac80211 snd_hwdep coretemp kvm_intel kvm media snd_seq snd_seq_device iTCO_wdt iTCO_vendor_support snd_pcm snd_timer snd joydev iwlwifi microcode serio_raw cfg80211 asus_laptop lpc_ich atl1c soundcore sparse_keymap rfkill input_polldev acpi_cpufreq binfmt_misc i915 i2c_algo_bit drm_kms_helper drm i2c_core video
[ 2214.970854] CPU: 1 PID: 9423 Comm: pppd Tainted: G        W     3.16.3-200.fc20.i686 #1
[ 2214.970860] Hardware name: ASUSTeK Computer Inc.         UL20A               /UL20A     , BIOS 207     11/02/2009
[ 2214.970866] task: f0706a00 ti: e359c000 task.ti: e359c000
[ 2214.970873] EIP: 0060:[<c0a077b8>] EFLAGS: 00200287 CPU: 1
[ 2214.970885] EIP is at _raw_spin_lock_bh+0x28/0x40
[ 2214.970890] EAX: e5ff02a4 EBX: e5ff02a4 ECX: 00000060 EDX: 0000005f
[ 2214.970895] ESI: e5ff02b0 EDI: e3470d40 EBP: e359dc34 ESP: e359dc34
[ 2214.970900]  DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
[ 2214.970906] CR0: 8005003b CR2: b72eb000 CR3: 25f28000 CR4: 000407d0
[ 2214.970910] Stack:
[ 2214.970914]  e359dc9c f94efe2a f5140000 e359dc50 c045aba6 f5140000 00200286 e359dc78
[ 2214.970929]  c045c553 00000001 f5140000 001f9076 00200286 628a17c6 f075acc0 f075acc0
[ 2214.970942]  00000000 00200246 f4464848 00200246 00200246 e359dc9c e3470d84 f075acc0
[ 2214.970957] Call Trace:
[ 2214.970973]  [<f94efe2a>] ppp_push+0x32a/0x550 [ppp_generic]
[ 2214.970986]  [<c045aba6>] ? internal_add_timer+0x26/0x60
[ 2214.970994]  [<c045c553>] ? mod_timer_pending+0x63/0x130
[ 2214.971005]  [<f94f288d>] ppp_xmit_process+0x3cd/0x5e0 [ppp_generic]
[ 2214.971007]  [<c0914ae1>] ? harmonize_features+0x31/0x1d0
[ 2214.971007]  [<f94f2c78>] ppp_start_xmit+0x108/0x180 [ppp_generic]
[ 2214.971007]  [<c0915024>] dev_hard_start_xmit+0x2c4/0x540
[ 2214.971007]  [<c093244f>] sch_direct_xmit+0x9f/0x170
[ 2214.971007]  [<c091546a>] __dev_queue_xmit+0x1ca/0x430
[ 2214.971007]  [<c094c9b0>] ? ip_fragment+0x930/0x930
[ 2214.971007]  [<c09156df>] dev_queue_xmit+0xf/0x20
[ 2214.971007]  [<c091bacf>] neigh_direct_output+0xf/0x20
[ 2214.971007]  [<c094cb5a>] ip_finish_output+0x1aa/0x850
[ 2214.971007]  [<c094c9b0>] ? ip_fragment+0x930/0x930
[ 2214.971007]  [<c094dbbf>] ip_output+0x8f/0xe0
[ 2214.971007]  [<c094c9b0>] ? ip_fragment+0x930/0x930
[ 2214.971007]  [<c09a4f52>] xfrm_output_resume+0x342/0x3a0
[ 2214.971007]  [<c09a5013>] xfrm_output+0x43/0xf0
[ 2214.971007]  [<c0998f4d>] xfrm4_output_finish+0x3d/0x40
[ 2214.971007]  [<c0998e25>] __xfrm4_output+0x25/0x40
[ 2214.971007]  [<c0998f7f>] xfrm4_output+0x2f/0x70
[ 2214.971007]  [<c0998e00>] ? xfrm4_udp_encap_rcv+0x1b0/0x1b0
[ 2214.971007]  [<c094d2e7>] ip_local_out_sk+0x27/0x30
[ 2214.971007]  [<c094d5f4>] ip_queue_xmit+0x124/0x3f0
[ 2214.971007]  [<c0999f04>] ? xfrm_bundle_ok+0x64/0x170
[ 2214.971007]  [<c099a0ab>] ? xfrm_dst_check+0x1b/0x30
[ 2214.971007]  [<f94fd618>] l2tp_xmit_skb+0x298/0x4b0 [l2tp_core]
[ 2214.971007]  [<f950cd04>] pppol2tp_xmit+0x124/0x1d0 [l2tp_ppp]
[ 2214.971007]  [<f94f2adb>] ppp_channel_push+0x3b/0xb0 [ppp_generic]
[ 2214.971007]  [<f94f2d77>] ppp_write+0x87/0xc8 [ppp_generic]
[ 2214.971007]  [<f94f2cf0>] ? ppp_start_xmit+0x180/0x180 [ppp_generic]
[ 2214.971007]  [<c057723d>] vfs_write+0x9d/0x1d0
[ 2214.971007]  [<c0577951>] SyS_write+0x51/0xb0
[ 2214.971007]  [<c0a07b9f>] sysenter_do_call+0x12/0x12
[ 2214.971007] Code: 00 00 00 55 89 e5 66 66 66 66 90 64 81 05 90 b6 dc c0 00 02 00 00 ba 00 01 00 00 f0 66 0f c1 10 0f b6 ce 38 d1 75 04 5d c3 f3 90 <0f> b6 10 38 ca 75 f7 5d c3 90 90 90 90 90 90 90 90 90 90 90 90

The deadlock occurs because ppp_channel_push() (near the end of the
stack listing) holds the pch->downl spinlock while calling
pch->chan->ops->start_xmit().  The dump shows this call filtering down
through the routing layer and into ppp_push() (near the top of the
listing), which tries to acquire the same spinlock.

It sure looks like a ppp data packet was put into an l2tp wrapper 
and then sent back to the ppp layer for transmission, rather than 
getting sent out through the wlan0 interface.

Unfortunately, I can't work around this problem by reconfiguring the
VPN server -- there's no way to tell it to use a different IP address
for its end of the VPN tunnel.  Furthermore, the server works just fine
with clients running Windows or OS-X.

So it looks like the problem has to be fixed either in the kernel or in 
the way pppd sets up its routing entry.  Can you guys help?

Thanks,

Alan Stern

^ permalink raw reply

* Re: qdisc running
From: Jesper Dangaard Brouer @ 2014-10-20 16:17 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: john Fastabend, Herbert Xu, netdev@vger.kernel.org, eric Dumazet,
	brouer, Mathieu Desnoyers
In-Reply-To: <54440FFA.40307@mojatatu.com>


On Sun, 19 Oct 2014 15:24:42 -0400 Jamal Hadi Salim <jhs@mojatatu.com> wrote:

> Jesper,
> 
> You asked at the meeting about the point of qdisc running.

Talking about __QDISC___STATE_RUNNING see slide 9/16:
 http://people.netfilter.org/hawk/presentations/LinuxPlumbers2014/performance_tx_qdisc_bulk_LPC2014.pdf

> Original intent is to allow only one cpu to enter the lower half of the
> qdisc path. IOW, if one cpu was already in the qdisc then that guy
> could be used to dequeue packets. i.e this is good for batching.
> Original idea was Herbert's with major improvement from Eric
> and a small one from me.

I guess it is good for our recent dequeue batching. But I think/hope we
can come up with a scheme that does not require 6 lock/unlock
operations (as illustrated on slide 9).

John and I have talked about doing a lockless qdisc, but maintaining
this __QDISC___STATE_RUNNING in a lockless scenario, would cost us
extra atomic ops...

Are we still sure, that this model of only allowing a single CPU in the
dequeue path, is still the best solution?  (The TXQ lock should already
protect several CPUs in this code path).


> For history of different tried approaches look at:
> Look at slide 2: 
> http://vger.kernel.org/netconf2011_slides/jamal_netconf2011.pdf

I can see that you really needed the budget/fairness in the dequeue
loop, that we recently mangled with.

> then download the **amazing** flash animations which describe
> that history.
> http://vger.kernel.org/netconf2011_slides/netconf-2011-flash.tgz
> 
> Follow the bullets in slide2 and map to the flash animations.

What tool do I use to play these SWF files? (I tried VLC but no luck).

> If you go over them, you'll see it is still needed.

Too bad, I would like to avoid the second 
 
> I think someone oughta put those **amazing** animations on some
> website;->

I hope someone else can pick that up ;-)

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Sr. Network Kernel Developer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH net] net/mlx4_en: mlx4_en_netpoll shouldn't call napi_schedule when port is down
From: Ido Shamay @ 2014-10-20 15:59 UTC (permalink / raw)
  To: David Miller, amirv; +Cc: idos, netdev, yevgenyp, ogerlitz
In-Reply-To: <20140929.134704.646626597842088240.davem@davemloft.net>

On 9/29/2014 8:47 PM, David Miller wrote:
> From: Amir Vadai <amirv@mellanox.com>
> Date: Mon, 29 Sep 2014 14:04:55 +0300
>
>> From: Ido Shamay <idos@mellanox.com>
>>
>> mlx4_en_netpoll, which is mlx4_en ndo_poll_controller callback,
>> might be called when port is down, causing a napi_schedule when
>> napi->poll callback is NULL. mutex_trylock is needed to acquire
>> the port_state lock, since other threads may grab it and stop
>> the port while we are in napi scheduling. Using trylock since in atomic
>> context.
>>
>> Signed-off-by: Ido Shamay <idos@mellanox.com>
>> Signed-off-by: Amir Vadai <amirv@mellanox.com>
>
> I don't think netpoll should 'variably succeed' on poll controller's
> ability to immediately take a trylock.
>
> You need to find a way to make this code always process the queues
> when it is called, the approach you are taking here is not correct.
>
> If you've designed things in such a way that the locking here is
> difficult, that's unfortunate but you still have to make this
> function behave properly.

Sorry about that, aborting this commit since the problem was already
fixed in commit 3484aac1, which added netif_device_detach to the driver port
stop flow, so netconsole no longer calls the ndo_poll_controller callback.

Thanks.
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply

* Re: [PATCH 0/7] netfilter fixes for net
From: David Miller @ 2014-10-20 15:58 UTC (permalink / raw)
  To: pablo; +Cc: netfilter-devel, netdev
In-Reply-To: <1413792639-3954-1-git-send-email-pablo@netfilter.org>

From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 20 Oct 2014 10:10:32 +0200

> The following patchset contains netfilter fixes for your net tree,
> they are:
 ...
> You can pull these changes from:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git

Pulled, thanks a lot Pablo.

^ permalink raw reply

* Re: v3.18-rc1 bloat-o-meter
From: Alexei Starovoitov @ 2014-10-20 15:57 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: linux-embedded, Josh Triplett, Linux Kernel Development,
	netdev@vger.kernel.org
In-Reply-To: <alpine.DEB.2.02.1410201018230.24954@ayla.of.borg>

On Mon, Oct 20, 2014 at 10:37 AM, Geert Uytterhoeven
<geert@linux-m68k.org> wrote:
>         Hi all,
>
> Below is the bloat-o-meter output when comparing an m68k/atari_defconfig
> kernel for v3.17 and v3.18-rc1.
>
> Major culprit seems to be bpf. Can this become modular or optional?
> Currently it's always included if CONFIG_NET=y.

yes, the plan is to get it its own CONFIG_BPF so that tracing wouldn't
need to depend on whole NET and so it can be stubbed out.

^ permalink raw reply

* Re: [PATCH stable v3.2 v3.4] ipv4: disable bh while doing route gc
From: David Miller @ 2014-10-20 15:39 UTC (permalink / raw)
  To: ben; +Cc: mleitner, stable, netdev, hannes
In-Reply-To: <1413807808.31953.24.camel@decadent.org.uk>

From: Ben Hutchings <ben@decadent.org.uk>
Date: Mon, 20 Oct 2014 13:23:28 +0100

> On Mon, 2014-10-20 at 00:23 -0400, David Miller wrote:
>> From: Ben Hutchings <ben@decadent.org.uk>
>> Date: Mon, 20 Oct 2014 04:09:41 +0100
>> 
>> > I've applied this and the previous two patches mentioned ('ipv4: move
>> > route garbage collector to work queue' and 'ipv4: avoid parallel route
>> > cache gc executions').  But I didn't get the other two from you.  The
>> > last batch of networking fixes I received and applied was dated
>> > 2014-08-07, and the next one I've seen is dated 2014-10-11 and has
>> > nothing for 3.2 or 3.4.  Did I miss one between these?
>> 
>> I'm at the point where I'm personally not going to go back more than
>> four releases, anything more than that is ridiculous.
>> 
>> And this time that was 3.17, 3.16, 3.14, and 3.10
> 
> OK.  I appreciate all the work you've done to backport to 3.2, but would
> also have appreciated an explicit note to say you were dropping the
> earlier versions.

My apologies, but as time goes on and more -stable releases go active
I will certainly be tail dropping to keep it within 4 releases.

^ permalink raw reply

* Re: [PATCH] net: ethernet : stmicro: fixed power suspend and resume failure in stmmac driver
From: David Miller @ 2014-10-20 15:36 UTC (permalink / raw)
  To: hliang1025; +Cc: peppe.cavallaro, netdev, linux-kernel, adi-linux-docs
In-Reply-To: <CAPig_t=Dguhw8jmAX_KvMrB9ZjL1POB+7jrdBmwpvTzjasKD1w@mail.gmail.com>

From: Hao Liang <hliang1025@gmail.com>
Date: Mon, 20 Oct 2014 16:55:08 +0800

> I can't quite grasp David's meaning: whether the ->mac->xxx calls should
> be inside the lock or outside the lock.  In my opinion, the ->mac->xxx
> calls do not operate on any shared data or struct. The calls just
> control the hardware by writing data to registers.  So the ->mac->xxx
> calls can be removed from the lock.

If you make a decision to program the hardware in a certain way, you
must make sure that hardware programming operation occurs atomically.

This could be because it takes multiple register writes to perform
the operation and they are dependent upon each other, or you are
making multiple modifications which must occur in a certain
sequence.

If you allow these ->mac->xxx calls to run in parallel with each other
I guarantee it will cause problems.

Stop brushing it off as a non-issue, unless you want your driver
to mysteriously not work when the race is triggered.

^ permalink raw reply

* Re: [PATCH RFT 0/8] Marvell PXA168 libphy handling and Berlin Ethernet
From: Antoine Tenart @ 2014-10-20 15:10 UTC (permalink / raw)
  To: Sebastian Hesselbarth
  Cc: Antoine Tenart, David S. Miller, Florian Fainelli, Eric Miao,
	Haojian Zhuang, linux-arm-kernel, netdev, devicetree,
	linux-kernel
In-Reply-To: <54451E3F.1030700@gmail.com>

Sebastian,

On Mon, Oct 20, 2014 at 04:37:51PM +0200, Sebastian Hesselbarth wrote:
> On 16.10.2014 11:53, Antoine Tenart wrote:
> >On Thu, Oct 09, 2014 at 02:38:58PM +0200, Sebastian Hesselbarth wrote:
> >>This patch series deals with removing an IP feature that can be found
> >>on all currently supported Marvell Ethernet IP (pxa168_eth, mv643xx_eth,
> >>mvneta). The MAC IP allows to automatically perform PHY auto-negotiation
> >>without software interaction.
> >>
> >>However, this feature (a) fundamentally clashes with the way libphy works
> >>and (b) is unable to deal with quirky PHYs that require special treatment.
> >>In this series, pxa168_eth driver is rewritten to completely disable that
> >>feature and properly deal with libphy provided PHYs. The other two drivers
> >>are subject to future patch sets, also removing the code related with it.
> >>
> >>Currently, the patches are based on next-20141009 and will be resent once
> >>v3.18-rc1 drops. This is a Request-For-Test on both BG2Q and MMP/gplug as
> >
> >I tested the series on a BG2Q, it worked well.
> 
> Thanks for testing! I assume you have added a phy-connection-type
> property to BG2Q's ethernet node?

Yes, I added the following property to the ethernet-phy node:

	phy-connection-type = "mii";

Feel free to add this to your series, or I can also send a patch.


Antoine

-- 
Antoine Ténart, Free Electrons
Embedded Linux, Kernel and Android engineering
http://free-electrons.com

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox