Netdev List
 help / color / mirror / Atom feed
* [RFC PATCH 6/9] ipvs network name space aware
From: Hans Schillstrom @ 2010-10-08 11:17 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter-devel; +Cc: horms, ja, wensong, daniel.lezcano

This patch contains only ip_vs_est.c.

There is a single estimator timer shared by all network namespaces, i.e. not one per netns.
When the estimator runs, it loops over every netns:
  for_each_net(net) { ... }


Signed-off-by:Hans Schillstrom <hans.schillstrom@ericsson.com>

diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index ff28801..e8c185d 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -8,8 +8,13 @@
  *              as published by the Free Software Foundation; either version
  *              2 of the License, or (at your option) any later version.
  *
- * Changes:
+ * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
  *
+ *              Network name space (netns) aware.
+ *              Global data moved to netns i.e struct netns_ipvs
+ *		Affected data: est_list and est_lock.
+ *		estimation_timer() runs with a common timer, but
+ *		updates every netns on each timeout.
  */

 #define KMSG_COMPONENT "IPVS"
@@ -45,13 +50,13 @@
     rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10.

   * A lot code is taken from net/sched/estimator.c
+
+  * netns: estimation_timer runs over every netns
  */


 static void estimation_timer(unsigned long arg);

-static LIST_HEAD(est_list);
-static DEFINE_SPINLOCK(est_lock);
 static DEFINE_TIMER(est_timer, estimation_timer, 0, 0);

 static void estimation_timer(unsigned long arg)
@@ -62,50 +67,55 @@ static void estimation_timer(unsigned long arg)
 	u32 n_inpkts, n_outpkts;
 	u64 n_inbytes, n_outbytes;
 	u32 rate;
-
-	spin_lock(&est_lock);
-	list_for_each_entry(e, &est_list, list) {
-		s = container_of(e, struct ip_vs_stats, est);
-
-		spin_lock(&s->lock);
-		n_conns = s->ustats.conns;
-		n_inpkts = s->ustats.inpkts;
-		n_outpkts = s->ustats.outpkts;
-		n_inbytes = s->ustats.inbytes;
-		n_outbytes = s->ustats.outbytes;
-
-		/* scaled by 2^10, but divided 2 seconds */
-		rate = (n_conns - e->last_conns)<<9;
-		e->last_conns = n_conns;
-		e->cps += ((long)rate - (long)e->cps)>>2;
-		s->ustats.cps = (e->cps+0x1FF)>>10;
-
-		rate = (n_inpkts - e->last_inpkts)<<9;
-		e->last_inpkts = n_inpkts;
-		e->inpps += ((long)rate - (long)e->inpps)>>2;
-		s->ustats.inpps = (e->inpps+0x1FF)>>10;
-
-		rate = (n_outpkts - e->last_outpkts)<<9;
-		e->last_outpkts = n_outpkts;
-		e->outpps += ((long)rate - (long)e->outpps)>>2;
-		s->ustats.outpps = (e->outpps+0x1FF)>>10;
-
-		rate = (n_inbytes - e->last_inbytes)<<4;
-		e->last_inbytes = n_inbytes;
-		e->inbps += ((long)rate - (long)e->inbps)>>2;
-		s->ustats.inbps = (e->inbps+0xF)>>5;
-
-		rate = (n_outbytes - e->last_outbytes)<<4;
-		e->last_outbytes = n_outbytes;
-		e->outbps += ((long)rate - (long)e->outbps)>>2;
-		s->ustats.outbps = (e->outbps+0xF)>>5;
-		spin_unlock(&s->lock);
+	struct net *net;
+	struct netns_ipvs *ipvs;
+
+	for_each_net(net) {
+		ipvs = net->ipvs;
+		spin_lock(&ipvs->est_lock);
+		list_for_each_entry(e, &ipvs->est_list, list) {
+			s = container_of(e, struct ip_vs_stats, est);
+
+			spin_lock(&s->lock);
+			n_conns = s->ustats.conns;
+			n_inpkts = s->ustats.inpkts;
+			n_outpkts = s->ustats.outpkts;
+			n_inbytes = s->ustats.inbytes;
+			n_outbytes = s->ustats.outbytes;
+
+			/* scaled by 2^10, but divided 2 seconds */
+			rate = (n_conns - e->last_conns)<<9;
+			e->last_conns = n_conns;
+			e->cps += ((long)rate - (long)e->cps)>>2;
+			s->ustats.cps = (e->cps+0x1FF)>>10;
+
+			rate = (n_inpkts - e->last_inpkts)<<9;
+			e->last_inpkts = n_inpkts;
+			e->inpps += ((long)rate - (long)e->inpps)>>2;
+			s->ustats.inpps = (e->inpps+0x1FF)>>10;
+
+			rate = (n_outpkts - e->last_outpkts)<<9;
+			e->last_outpkts = n_outpkts;
+			e->outpps += ((long)rate - (long)e->outpps)>>2;
+			s->ustats.outpps = (e->outpps+0x1FF)>>10;
+
+			rate = (n_inbytes - e->last_inbytes)<<4;
+			e->last_inbytes = n_inbytes;
+			e->inbps += ((long)rate - (long)e->inbps)>>2;
+			s->ustats.inbps = (e->inbps+0xF)>>5;
+
+			rate = (n_outbytes - e->last_outbytes)<<4;
+			e->last_outbytes = n_outbytes;
+			e->outbps += ((long)rate - (long)e->outbps)>>2;
+			s->ustats.outbps = (e->outbps+0xF)>>5;
+			spin_unlock(&s->lock);
+		}
+		spin_unlock(&ipvs->est_lock);
 	}
-	spin_unlock(&est_lock);
 	mod_timer(&est_timer, jiffies + 2*HZ);
 }

-void ip_vs_new_estimator(struct ip_vs_stats *stats)
+void ip_vs_new_estimator(struct net *net, struct ip_vs_stats *stats)
 {
 	struct ip_vs_estimator *est = &stats->est;

@@ -126,18 +136,18 @@ void ip_vs_new_estimator(struct ip_vs_stats *stats)
 	est->last_outbytes = stats->ustats.outbytes;
 	est->outbps = stats->ustats.outbps<<5;

-	spin_lock_bh(&est_lock);
-	list_add(&est->list, &est_list);
-	spin_unlock_bh(&est_lock);
+	spin_lock_bh(&net->ipvs->est_lock);
+	list_add(&est->list, &net->ipvs->est_list);
+	spin_unlock_bh(&net->ipvs->est_lock);
 }

-void ip_vs_kill_estimator(struct ip_vs_stats *stats)
+void ip_vs_kill_estimator(struct net *net, struct ip_vs_stats *stats)
 {
 	struct ip_vs_estimator *est = &stats->est;

-	spin_lock_bh(&est_lock);
+	spin_lock_bh(&net->ipvs->est_lock);
 	list_del(&est->list);
-	spin_unlock_bh(&est_lock);
+	spin_unlock_bh(&net->ipvs->est_lock);
 }

 void ip_vs_zero_estimator(struct ip_vs_stats *stats)
@@ -156,14 +166,31 @@ void ip_vs_zero_estimator(struct ip_vs_stats *stats)
 	est->inbps = 0;
 	est->outbps = 0;
 }
+static int __net_init __ip_vs_estimator_init(struct net *net)
+{
+	INIT_LIST_HEAD(&net->ipvs->est_list);
+	spin_lock_init(&net->ipvs->est_lock);
+	return 0;
+}
+
+static struct pernet_operations ip_vs_app_ops = {
+	.init = __ip_vs_estimator_init,
+//	.exit = __ip_vs_estimator_cleanup,
+};

 int __init ip_vs_estimator_init(void)
 {
+	int rv;
+
+	rv = register_pernet_subsys(&ip_vs_app_ops);
+	if(rv < 0)
+		return rv;
 	mod_timer(&est_timer, jiffies + 2 * HZ);
-	return 0;
+	return rv;
 }

 void ip_vs_estimator_cleanup(void)
 {
 	del_timer_sync(&est_timer);
+	unregister_pernet_subsys(&ip_vs_app_ops);
 }

-- 
Regards
Hans Schillstrom <hans.schillstrom@ericsson.com>

^ permalink raw reply related

* [RFC PATCH 7/9] ipvs network name space aware
From: Hans Schillstrom @ 2010-10-08 11:17 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter-devel; +Cc: horms, ja, wensong, daniel.lezcano

This patch contains only ip_vs_ftp.c.

Minor changes: the netns is passed on to the connection helpers, and the ftp app is now registered per netns.

Signed-off-by:Hans Schillstrom <hans.schillstrom@ericsson.com>

diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 7e9af5b..9d54eb0 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -157,7 +157,8 @@ ip_vs_expect_callback(struct nf_conn *ct,
 {
 	struct nf_conntrack_tuple *orig, new_reply;
 	struct ip_vs_conn *cp;
-
+	struct net *net = nf_ct_net(ct);
+
 	if (exp->tuple.src.l3num != PF_INET)
 		return;

@@ -168,10 +169,10 @@ ip_vs_expect_callback(struct nf_conn *ct,
 	 * actual values from the newly created original conntrack direction.
 	 * The conntrack is confirmed when packet reaches IPVS hooks.
 	 */
-
+	BUG_ON(!net);
 	/* RS->CLIENT */
 	orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
-	cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum,
+	cp = ip_vs_conn_out_get(net, exp->tuple.src.l3num, orig->dst.protonum,
 				&orig->src.u3, orig->src.u.tcp.port,
 				&orig->dst.u3, orig->dst.u.tcp.port);
 	if (cp) {
@@ -193,7 +194,7 @@ ip_vs_expect_callback(struct nf_conn *ct,
 	}

 	/* CLIENT->VS */
-	cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum,
+	cp = ip_vs_conn_in_get(net, exp->tuple.src.l3num, orig->dst.protonum,
 			       &orig->src.u3, orig->src.u.tcp.port,
 			       &orig->dst.u3, orig->dst.u.tcp.port);
 	if (cp) {
@@ -290,7 +291,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	int ret = 0;
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct;
-
+	struct net *net = dev_net(skb->dev);
+
 #ifdef CONFIG_IP_VS_IPV6
 	/* This application helper doesn't work with IPv6 yet,
 	 * so turn this into a no-op for IPv6 packets
@@ -328,10 +330,10 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 		/*
 		 * Now update or create an connection entry for it
 		 */
-		n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port,
-					  &cp->caddr, 0);
+		n_cp = ip_vs_conn_out_get(net, AF_INET, iph->protocol,
+				          &from, port, &cp->caddr, 0);
 		if (!n_cp) {
-			n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
+			n_cp = ip_vs_conn_new(net, AF_INET, IPPROTO_TCP,
 					      &cp->caddr, 0,
 					      &cp->vaddr, port,
 					      &from, port,
@@ -381,7 +383,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 		 */

 		cp->app_data = NULL;
-		ip_vs_tcp_conn_listen(n_cp);
+		ip_vs_tcp_conn_listen(net, n_cp);
 		ip_vs_conn_put(n_cp);
 		return ret;
 	}
@@ -410,6 +412,8 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	union nf_inet_addr to;
 	__be16 port;
 	struct ip_vs_conn *n_cp;
+	struct nf_conn *ct;
+	struct net *net = dev_net(skb->dev);

 #ifdef CONFIG_IP_VS_IPV6
 	/* This application helper doesn't work with IPv6 yet,
@@ -479,11 +483,11 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 		  ip_vs_proto_name(iph->protocol),
 		  &to.ip, ntohs(port), &cp->vaddr.ip, 0);

-	n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol,
+	n_cp = ip_vs_conn_in_get(net, AF_INET, iph->protocol,
 				 &to, port,
 				 &cp->vaddr, htons(ntohs(cp->vport)-1));
 	if (!n_cp) {
-		n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP,
+		n_cp = ip_vs_conn_new(net, AF_INET, IPPROTO_TCP,
 				      &to, port,
 				      &cp->vaddr, htons(ntohs(cp->vport)-1),
 				      &cp->daddr, htons(ntohs(cp->dport)-1),
@@ -499,7 +503,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 	/*
 	 *	Move tunnel to listen state
 	 */
-	ip_vs_tcp_conn_listen(n_cp);
+	ip_vs_tcp_conn_listen(net, n_cp);
 	ip_vs_conn_put(n_cp);

 	return 1;
@@ -520,23 +524,22 @@ static struct ip_vs_app ip_vs_ftp = {
 	.pkt_in =	ip_vs_ftp_in,
 };

-
 /*
- *	ip_vs_ftp initialization
+ *	per netns ip_vs_ftp initialization
  */
-static int __init ip_vs_ftp_init(void)
+static int __net_init __ip_vs_ftp_init(struct net *net)
 {
 	int i, ret;
 	struct ip_vs_app *app = &ip_vs_ftp;
-
-	ret = register_ip_vs_app(app);
+
+	ret = register_ip_vs_app(net, app);
 	if (ret)
 		return ret;

 	for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
 		if (!ports[i])
 			continue;
-		ret = register_ip_vs_app_inc(app, app->protocol, ports[i]);
+		ret = register_ip_vs_app_inc(net, app, app->protocol, ports[i]);
 		if (ret)
 			break;
 		pr_info("%s: loaded support on port[%d] = %d\n",
@@ -544,18 +547,39 @@ static int __init ip_vs_ftp_init(void)
 	}

 	if (ret)
-		unregister_ip_vs_app(app);
+		unregister_ip_vs_app(net, app);

 	return ret;
 }
+/*
+ * 	netns exit
+ */
+static void __ip_vs_ftp_exit(struct net *net)
+{
+	struct ip_vs_app *app = &ip_vs_ftp;
+
+	unregister_ip_vs_app(net, app);
+}
+
+static struct pernet_operations ip_vs_ftp_ops = {
+	.init = __ip_vs_ftp_init,
+	.exit = __ip_vs_ftp_exit,
+};
+
+int __init ip_vs_ftp_init(void)
+{
+	int rv;

+	rv = register_pernet_subsys(&ip_vs_ftp_ops);
+	return rv;
+}

 /*
  *	ip_vs_ftp finish.
  */
 static void __exit ip_vs_ftp_exit(void)
 {
-	unregister_ip_vs_app(&ip_vs_ftp);
+	unregister_pernet_subsys(&ip_vs_ftp_ops);
 }



-- 
Regards
Hans Schillstrom <hans.schillstrom@ericsson.com>

^ permalink raw reply related

* [RFC PATCH 8/9] ipvs network name space aware
From: Hans Schillstrom @ 2010-10-08 11:17 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter-devel; +Cc: horms, ja, wensong, daniel.lezcano

This patch contains all the protocol files.

All timeouts are moved to the per-netns ipvs struct (struct netns_ipvs).
The global "timeout tables" are used as default values only.

Signed-off-by:Hans Schillstrom <hans.schillstrom@ericsson.com>

diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index 027f654..c17e02c 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -38,7 +38,6 @@
  * ipvs protocol table.
  */

-#define IP_VS_PROTO_TAB_SIZE		32	/* must be power of 2 */
 #define IP_VS_PROTO_HASH(proto)		((proto) & (IP_VS_PROTO_TAB_SIZE-1))

 static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
@@ -60,6 +59,30 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
 	return 0;
 }

+/*
+ *	register an ipvs protocol's netns-related data
+ */
+static int
+register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp )
+{
+	unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
+	struct ip_vs_proto_data *pd =
+			kzalloc(sizeof(struct ip_vs_proto_data), GFP_ATOMIC);
+
+	if (!pd) {
+		pr_err("%s(): no memory.\n", __func__);
+		return -ENOMEM;
+	}
+	pd->pp=pp;	/* For speed issues */
+	pd->next = net->ipvs->proto_data_table[hash];
+	net->ipvs->proto_data_table[hash] = pd;
+	atomic_set(&pd->appcnt,0);	/* Init app counter */
+
+	if (pp->init_netns != NULL)
+		pp->init_netns(net, pd);
+
+	return 0;
+}

 /*
  *	unregister an ipvs protocol
@@ -81,6 +104,28 @@ static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)

 	return -ESRCH;
 }
+/*
+ *	unregister an ipvs protocol's netns data
+ */
+static int
+unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd)
+{
+	struct ip_vs_proto_data **pd_p;
+	unsigned hash = IP_VS_PROTO_HASH(pd->pp->protocol);
+
+	pd_p = &net->ipvs->proto_data_table[hash];
+	for (; *pd_p; pd_p = &(*pd_p)->next) {
+		if (*pd_p == pd) {
+			*pd_p = pd->next;
+			if (pd->pp->exit_netns != NULL)
+				pd->pp->exit_netns(net, pd);
+			kfree(pd);
+			return 0;
+		}
+	}
+
+	return -ESRCH;
+}


 /*
@@ -100,6 +145,24 @@ struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
 }
 EXPORT_SYMBOL(ip_vs_proto_get);

+/*
+ *	get ip_vs_protocol object data by netns and proto
+ */
+struct ip_vs_proto_data *
+ip_vs_proto_data_get(struct net *net, unsigned short proto)
+{
+	struct ip_vs_proto_data *pd;
+	unsigned hash = IP_VS_PROTO_HASH(proto);
+	struct netns_ipvs *ipvs = net->ipvs;
+
+	for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) {
+		if (pd->pp->protocol == proto)
+			return pd;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(ip_vs_proto_data_get);

 /*
  *	Propagate event for state change to all protocols
@@ -118,8 +181,7 @@ void ip_vs_protocol_timeout_change(int flags)
 }


-int *
-ip_vs_create_timeout_table(int *table, int size)
+int *ip_vs_create_timeout_table(const int *table, int size)
 {
 	return kmemdup(table, size, GFP_ATOMIC);
 }
@@ -235,7 +297,44 @@ ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
 #endif
 		ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg);
 }
+static int  __net_init  __ip_vs_protocol_init(struct net *net)
+{

+#ifdef CONFIG_IP_VS_PROTO_TCP
+	register_ip_vs_proto_netns(net, &ip_vs_protocol_tcp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+	register_ip_vs_proto_netns(net, &ip_vs_protocol_udp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_SCTP
+	register_ip_vs_proto_netns(net, &ip_vs_protocol_sctp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_AH
+	register_ip_vs_proto_netns(net, &ip_vs_protocol_ah);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_ESP
+	register_ip_vs_proto_netns(net, &ip_vs_protocol_esp);
+#endif
+	return 0;
+}
+
+static void __net_exit __ip_vs_protocol_cleanup(struct net *net)
+{
+	struct ip_vs_proto_data *pd;
+	int i;
+	struct netns_ipvs *ipvs = net->ipvs;
+
+	/* unregister all the ipvs proto data for this netns */
+	for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
+		while ((pd = ipvs->proto_data_table[i]) != NULL)
+			unregister_ip_vs_proto_netns(net, pd);
+	}
+}
+
+static struct pernet_operations ipvs_proto_ops = {
+	.init = __ip_vs_protocol_init,
+	.exit = __ip_vs_protocol_cleanup,
+};

 int __init ip_vs_protocol_init(void)
 {
@@ -266,7 +365,7 @@ int __init ip_vs_protocol_init(void)
 #endif
 	pr_info("Registered protocols (%s)\n", &protocols[2]);

-	return 0;
+	return register_pernet_subsys(&ipvs_proto_ops);
 }


@@ -275,6 +374,7 @@ void ip_vs_protocol_cleanup(void)
 	struct ip_vs_protocol *pp;
 	int i;

+	unregister_pernet_subsys(&ipvs_proto_ops);
 	/* unregister all the ipvs protocols */
 	for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
 		while ((pp = ip_vs_proto_table[i]) != NULL)
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 1892dfc..1b77ef1 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -47,15 +47,17 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
 		   int inverse)
 {
 	struct ip_vs_conn *cp;
+	struct net *net = dev_net(skb->dev);

+	BUG_ON(!net);
 	if (likely(!inverse)) {
-		cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
+		cp = ip_vs_conn_in_get(net, af, IPPROTO_UDP,
 				       &iph->saddr,
 				       htons(PORT_ISAKMP),
 				       &iph->daddr,
 				       htons(PORT_ISAKMP));
 	} else {
-		cp = ip_vs_conn_in_get(af, IPPROTO_UDP,
+		cp = ip_vs_conn_in_get(net, af, IPPROTO_UDP,
 				       &iph->daddr,
 				       htons(PORT_ISAKMP),
 				       &iph->saddr,
@@ -87,15 +89,17 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb,
 		    int inverse)
 {
 	struct ip_vs_conn *cp;
+	struct net *net = dev_net(skb->dev);

+	BUG_ON(!net);
 	if (likely(!inverse)) {
-		cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
+		cp = ip_vs_conn_out_get(net, af, IPPROTO_UDP,
 					&iph->saddr,
 					htons(PORT_ISAKMP),
 					&iph->daddr,
 					htons(PORT_ISAKMP));
 	} else {
-		cp = ip_vs_conn_out_get(af, IPPROTO_UDP,
+		cp = ip_vs_conn_out_get(net, af, IPPROTO_UDP,
 					&iph->daddr,
 					htons(PORT_ISAKMP),
 					&iph->saddr,
@@ -173,27 +177,14 @@ ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
 		ah_esp_debug_packet_v4(pp, skb, offset, msg);
 }

-
-static void ah_esp_init(struct ip_vs_protocol *pp)
-{
-	/* nothing to do now */
-}
-
-
-static void ah_esp_exit(struct ip_vs_protocol *pp)
-{
-	/* nothing to do now */
-}
-
-
 #ifdef CONFIG_IP_VS_PROTO_AH
 struct ip_vs_protocol ip_vs_protocol_ah = {
 	.name =			"AH",
 	.protocol =		IPPROTO_AH,
 	.num_states =		1,
 	.dont_defrag =		1,
-	.init =			ah_esp_init,
-	.exit =			ah_esp_exit,
+	.init =			NULL,
+	.exit =			NULL,
 	.conn_schedule =	ah_esp_conn_schedule,
 	.conn_in_get =		ah_esp_conn_in_get,
 	.conn_out_get =		ah_esp_conn_out_get,
@@ -206,7 +197,6 @@ struct ip_vs_protocol ip_vs_protocol_ah = {
 	.app_conn_bind =	NULL,
 	.debug_packet =		ah_esp_debug_packet,
 	.timeout_change =	NULL,		/* ISAKMP */
-	.set_state_timeout =	NULL,
 };
 #endif

@@ -216,8 +206,8 @@ struct ip_vs_protocol ip_vs_protocol_esp = {
 	.protocol =		IPPROTO_ESP,
 	.num_states =		1,
 	.dont_defrag =		1,
-	.init =			ah_esp_init,
-	.exit =			ah_esp_exit,
+	.init =			NULL,
+	.exit =			NULL,
 	.conn_schedule =	ah_esp_conn_schedule,
 	.conn_in_get =		ah_esp_conn_in_get,
 	.conn_out_get =		ah_esp_conn_out_get,
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 4c0855c..0e7eb5d 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -16,7 +16,9 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 	sctp_chunkhdr_t _schunkh, *sch;
 	sctp_sctphdr_t *sh, _sctph;
 	struct ip_vs_iphdr iph;
-
+	struct net *net = dev_net(skb->dev);
+
+	BUG_ON(!net);
 	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);

 	sh = skb_header_pointer(skb, iph.len, sizeof(_sctph), &_sctph);
@@ -29,7 +31,7 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		return 0;

 	if ((sch->type == SCTP_CID_INIT) &&
-	    (svc = ip_vs_service_get(af, skb->mark, iph.protocol,
+	    (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
 				     &iph.daddr, sh->dest))) {
 		if (ip_vs_todrop()) {
 			/*
@@ -224,7 +226,7 @@ static enum ipvs_sctp_event_t sctp_events[255] = {
 	IP_VS_SCTP_EVE_SHUT_COM_CLI,
 };

-static struct ipvs_sctp_nextstate
+static const struct ipvs_sctp_nextstate
  sctp_states_table[IP_VS_SCTP_S_LAST][IP_VS_SCTP_EVE_LAST] = {
 	/*
 	 * STATE : IP_VS_SCTP_S_NONE
@@ -853,7 +855,7 @@ static struct ipvs_sctp_nextstate
 /*
  *      Timeout table[state]
  */
-static int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = {
+static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = {
 	[IP_VS_SCTP_S_NONE]         =     2 * HZ,
 	[IP_VS_SCTP_S_INIT_CLI]     =     1 * 60 * HZ,
 	[IP_VS_SCTP_S_INIT_SER]     =     1 * 60 * HZ,
@@ -901,6 +903,7 @@ static void sctp_timeout_change(struct ip_vs_protocol *pp, int flags)
 {
 }

+/*
 static int
 sctp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
 {
@@ -908,7 +911,7 @@ sctp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
 return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_SCTP_S_LAST,
 				sctp_state_name_table, sname, to);
 }
-
+*/
 static inline int
 set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 		int direction, const struct sk_buff *skb)
@@ -917,7 +920,10 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 	unsigned char chunk_type;
 	int event, next_state;
 	int ihl;
+	struct net *net = dev_net(skb->dev);
+	struct ip_vs_proto_data *pd;

+	BUG_ON(!net);
 #ifdef CONFIG_IP_VS_IPV6
 	ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
 #else
@@ -992,10 +998,13 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 			}
 		}
 	}
+	pd = ip_vs_proto_data_get(net, pp->protocol);
+	if(likely(pd))
+		cp->timeout = pd->timeout_table[cp->state = next_state];
+	else	/* What to do ? */
+		cp->timeout = sctp_timeouts[cp->state = next_state];

-	 cp->timeout = pp->timeout_table[cp->state = next_state];
-
-	 return 1;
+	return 1;
 }

 static int
@@ -1011,59 +1020,55 @@ sctp_state_transition(struct ip_vs_conn *cp, int direction,
 	return ret;
 }

-/*
- *      Hash table for SCTP application incarnations
- */
-#define SCTP_APP_TAB_BITS        4
-#define SCTP_APP_TAB_SIZE        (1 << SCTP_APP_TAB_BITS)
-#define SCTP_APP_TAB_MASK        (SCTP_APP_TAB_SIZE - 1)
-
-static struct list_head sctp_apps[SCTP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(sctp_app_lock);
-
 static inline __u16 sctp_app_hashkey(__be16 port)
 {
 	return (((__force u16)port >> SCTP_APP_TAB_BITS) ^ (__force u16)port)
 		& SCTP_APP_TAB_MASK;
 }

-static int sctp_register_app(struct ip_vs_app *inc)
+static int sctp_register_app(struct net *net, struct ip_vs_app *inc)
 {
 	struct ip_vs_app *i;
 	__u16 hash;
 	__be16 port = inc->port;
 	int ret = 0;
+	struct netns_ipvs *ipvs = net->ipvs;
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);

 	hash = sctp_app_hashkey(port);

-	spin_lock_bh(&sctp_app_lock);
-	list_for_each_entry(i, &sctp_apps[hash], p_list) {
+	spin_lock_bh(&ipvs->sctp_app_lock);
+	list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) {
 		if (i->port == port) {
 			ret = -EEXIST;
 			goto out;
 		}
 	}
-	list_add(&inc->p_list, &sctp_apps[hash]);
-	atomic_inc(&ip_vs_protocol_sctp.appcnt);
+	list_add(&inc->p_list, &ipvs->sctp_apps[hash]);
+	atomic_inc(&pd->appcnt);
 out:
-	spin_unlock_bh(&sctp_app_lock);
+	spin_unlock_bh(&ipvs->sctp_app_lock);

 	return ret;
 }

-static void sctp_unregister_app(struct ip_vs_app *inc)
+static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc)
 {
-	spin_lock_bh(&sctp_app_lock);
-	atomic_dec(&ip_vs_protocol_sctp.appcnt);
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
+
+	BUG_ON(!pd);
+	spin_lock_bh(&net->ipvs->sctp_app_lock);
+	atomic_dec(&pd->appcnt);
 	list_del(&inc->p_list);
-	spin_unlock_bh(&sctp_app_lock);
+	spin_unlock_bh(&net->ipvs->sctp_app_lock);
 }

-static int sctp_app_conn_bind(struct ip_vs_conn *cp)
+static int sctp_app_conn_bind(struct net *net, struct ip_vs_conn *cp)
 {
 	int hash;
 	struct ip_vs_app *inc;
 	int result = 0;
+	struct netns_ipvs *ipvs = net->ipvs;

 	/* Default binding: bind app only for NAT */
 	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
@@ -1071,12 +1076,12 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
 	/* Lookup application incarnations and bind the right one */
 	hash = sctp_app_hashkey(cp->vport);

-	spin_lock(&sctp_app_lock);
-	list_for_each_entry(inc, &sctp_apps[hash], p_list) {
+	spin_lock(&ipvs->sctp_app_lock);
+	list_for_each_entry(inc, &ipvs->sctp_apps[hash], p_list) {
 		if (inc->port == cp->vport) {
 			if (unlikely(!ip_vs_app_inc_get(inc)))
 				break;
-			spin_unlock(&sctp_app_lock);
+			spin_unlock(&ipvs->sctp_app_lock);

 			IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
 					"%s:%u to app %s on port %u\n",
@@ -1092,43 +1097,50 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
 			goto out;
 		}
 	}
-	spin_unlock(&sctp_app_lock);
+	spin_unlock(&ipvs->sctp_app_lock);
 out:
 	return result;
 }

-static void ip_vs_sctp_init(struct ip_vs_protocol *pp)
+/* ---------------------------------------------
+ *   timeouts are netns related now.
+ * ---------------------------------------------
+ */
+static void __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
 {
-	IP_VS_INIT_HASH_TABLE(sctp_apps);
-	pp->timeout_table = sctp_timeouts;
+	ip_vs_init_hash_table(net->ipvs->sctp_apps, SCTP_APP_TAB_SIZE);
+	spin_lock_init(&net->ipvs->tcp_app_lock);
+	pd->timeout_table = ip_vs_create_timeout_table(sctp_timeouts,
+							sizeof(sctp_timeouts));
 }

-
-static void ip_vs_sctp_exit(struct ip_vs_protocol *pp)
+static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd)
 {
-
+	kfree(pd->timeout_table);
 }

+
 struct ip_vs_protocol ip_vs_protocol_sctp = {
-	.name = "SCTP",
-	.protocol = IPPROTO_SCTP,
-	.num_states = IP_VS_SCTP_S_LAST,
-	.dont_defrag = 0,
-	.appcnt = ATOMIC_INIT(0),
-	.init = ip_vs_sctp_init,
-	.exit = ip_vs_sctp_exit,
-	.register_app = sctp_register_app,
+	.name 		= "SCTP",
+	.protocol 	= IPPROTO_SCTP,
+	.num_states 	= IP_VS_SCTP_S_LAST,
+	.dont_defrag 	= 0,
+	.init 		= NULL,
+	.exit 		= NULL,
+	.init_netns 	= __ip_vs_sctp_init,
+	.exit_netns 	= __ip_vs_sctp_exit,
+	.register_app 	= sctp_register_app,
 	.unregister_app = sctp_unregister_app,
-	.conn_schedule = sctp_conn_schedule,
-	.conn_in_get = ip_vs_conn_in_get_proto,
-	.conn_out_get = ip_vs_conn_out_get_proto,
-	.snat_handler = sctp_snat_handler,
-	.dnat_handler = sctp_dnat_handler,
-	.csum_check = sctp_csum_check,
-	.state_name = sctp_state_name,
+	.conn_schedule 	= sctp_conn_schedule,
+	.conn_in_get 	= ip_vs_conn_in_get_proto,
+	.conn_out_get 	= ip_vs_conn_out_get_proto,
+	.snat_handler 	= sctp_snat_handler,
+	.dnat_handler 	= sctp_dnat_handler,
+	.csum_check 	= sctp_csum_check,
+	.state_name 	= sctp_state_name,
 	.state_transition = sctp_state_transition,
-	.app_conn_bind = sctp_app_conn_bind,
-	.debug_packet = ip_vs_tcpudp_debug_packet,
+	.app_conn_bind 	= sctp_app_conn_bind,
+	.debug_packet 	= ip_vs_tcpudp_debug_packet,
 	.timeout_change = sctp_timeout_change,
-	.set_state_timeout = sctp_set_state_timeout,
+/*	.set_state_timeout = sctp_set_state_timeout, */
 };
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 282d24d..bd40721 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -9,7 +9,12 @@
  *              as published by the Free Software Foundation; either version
  *              2 of the License, or (at your option) any later version.
  *
- * Changes:
+ * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
+ *
+ *              Network name space (netns) aware.
+ *              Global data moved to netns i.e struct netns_ipvs
+ *              Each netns holds its own copy of the tcp_timeouts table in
+ *              its per-protocol ip_vs_proto_data, kept in a per-netns hash table
  *
  */

@@ -34,7 +39,9 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 	struct ip_vs_service *svc;
 	struct tcphdr _tcph, *th;
 	struct ip_vs_iphdr iph;
-
+	struct net *net = dev_net(skb->dev);
+
+	BUG_ON(!net);
 	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);

 	th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
@@ -44,8 +51,8 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 	}

 	if (th->syn &&
-	    (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr,
-				     th->dest))) {
+	    (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
+			    	     &iph.daddr, th->dest))) {
 		if (ip_vs_todrop()) {
 			/*
 			 * It seems that we are very loaded.
@@ -316,7 +323,7 @@ static const int tcp_state_off[IP_VS_DIR_LAST] = {
 /*
  *	Timeout table[state]
  */
-static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
+static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
 	[IP_VS_TCP_S_NONE]		=	2*HZ,
 	[IP_VS_TCP_S_ESTABLISHED]	=	15*60*HZ,
 	[IP_VS_TCP_S_SYN_SENT]		=	2*60*HZ,
@@ -430,13 +437,13 @@ static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
 	*/
 	tcp_state_table = (on? tcp_states_dos : tcp_states);
 }
-
+/* Removed not used
 static int
 tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
 {
 	return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
 				       tcp_state_name_table, sname, to);
-}
+} */

 static inline int tcp_state_idx(struct tcphdr *th)
 {
@@ -452,12 +459,13 @@ static inline int tcp_state_idx(struct tcphdr *th)
 }

 static inline void
-set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
+set_tcp_state(struct net *net, struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 	      int direction, struct tcphdr *th)
 {
 	int state_idx;
 	int new_state = IP_VS_TCP_S_CLOSE;
 	int state_off = tcp_state_off[direction];
+	struct ip_vs_proto_data *pd;

 	/*
 	 *    Update state offset to INPUT_ONLY if necessary
@@ -512,8 +520,12 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
 			}
 		}
 	}
-
-	cp->timeout = pp->timeout_table[cp->state = new_state];
+	pd = ip_vs_proto_data_get(net, pp->protocol);
+	if(likely(pd))
+		cp->timeout = pd->timeout_table[cp->state = new_state];
+	else	/* What to do ? */
+		cp->timeout = tcp_timeouts[cp->state = new_state];
+	IP_VS_DBG(8, "%s() timeout=%lu, pd=%p def=%d\n", __func__, cp->timeout, pd->timeout_table, tcp_timeouts[new_state]);
 }


@@ -525,6 +537,7 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction,
 		     const struct sk_buff *skb,
 		     struct ip_vs_protocol *pp)
 {
+	struct net *net = dev_net(skb->dev);
 	struct tcphdr _tcph, *th;

 #ifdef CONFIG_IP_VS_IPV6
@@ -538,7 +551,7 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction,
 		return 0;

 	spin_lock(&cp->lock);
-	set_tcp_state(pp, cp, direction, th);
+	set_tcp_state(net, pp, cp, direction, th);
 	spin_unlock(&cp->lock);

 	return 1;
@@ -548,12 +561,6 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction,
 /*
  *	Hash table for TCP application incarnations
  */
-#define	TCP_APP_TAB_BITS	4
-#define	TCP_APP_TAB_SIZE	(1 << TCP_APP_TAB_BITS)
-#define	TCP_APP_TAB_MASK	(TCP_APP_TAB_SIZE - 1)
-
-static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(tcp_app_lock);

 static inline __u16 tcp_app_hashkey(__be16 port)
 {
@@ -562,47 +569,51 @@ static inline __u16 tcp_app_hashkey(__be16 port)
 }


-static int tcp_register_app(struct ip_vs_app *inc)
+static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
 {
 	struct ip_vs_app *i;
 	__u16 hash;
 	__be16 port = inc->port;
 	int ret = 0;
+	struct netns_ipvs *ipvs = net->ipvs;
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);

 	hash = tcp_app_hashkey(port);

-	spin_lock_bh(&tcp_app_lock);
-	list_for_each_entry(i, &tcp_apps[hash], p_list) {
+	spin_lock_bh(&ipvs->tcp_app_lock);
+	list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
 		if (i->port == port) {
 			ret = -EEXIST;
 			goto out;
 		}
 	}
-	list_add(&inc->p_list, &tcp_apps[hash]);
-	atomic_inc(&ip_vs_protocol_tcp.appcnt);
+	list_add(&inc->p_list, &ipvs->tcp_apps[hash]);
+	atomic_inc(&pd->appcnt);

   out:
-	spin_unlock_bh(&tcp_app_lock);
+	spin_unlock_bh(&ipvs->tcp_app_lock);
 	return ret;
 }


-static void
-tcp_unregister_app(struct ip_vs_app *inc)
+static void tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
 {
-	spin_lock_bh(&tcp_app_lock);
-	atomic_dec(&ip_vs_protocol_tcp.appcnt);
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
+	BUG_ON(!pd);
+	spin_lock_bh(&net->ipvs->tcp_app_lock);
+	atomic_dec(&pd->appcnt);
 	list_del(&inc->p_list);
-	spin_unlock_bh(&tcp_app_lock);
+	spin_unlock_bh(&net->ipvs->tcp_app_lock);
 }


-static int
-tcp_app_conn_bind(struct ip_vs_conn *cp)
+static int tcp_app_conn_bind(struct net *net, struct ip_vs_conn *cp)
 {
 	int hash;
 	struct ip_vs_app *inc;
 	int result = 0;
+	struct netns_ipvs *ipvs = net->ipvs;

 	/* Default binding: bind app only for NAT */
 	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
@@ -611,12 +622,12 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
 	/* Lookup application incarnations and bind the right one */
 	hash = tcp_app_hashkey(cp->vport);

-	spin_lock(&tcp_app_lock);
-	list_for_each_entry(inc, &tcp_apps[hash], p_list) {
+	spin_lock(&ipvs->tcp_app_lock);
+	list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) {
 		if (inc->port == cp->vport) {
 			if (unlikely(!ip_vs_app_inc_get(inc)))
 				break;
-			spin_unlock(&tcp_app_lock);
+			spin_unlock(&ipvs->tcp_app_lock);

 			IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
 				      "%s:%u to app %s on port %u\n",
@@ -633,7 +644,7 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
 			goto out;
 		}
 	}
-	spin_unlock(&tcp_app_lock);
+	spin_unlock(&ipvs->tcp_app_lock);

   out:
 	return result;
@@ -643,24 +654,32 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
 /*
  *	Set LISTEN timeout. (ip_vs_conn_put will setup timer)
  */
-void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
+void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
 {
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
 	spin_lock(&cp->lock);
 	cp->state = IP_VS_TCP_S_LISTEN;
-	cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
+	cp->timeout = ( pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
+	                   : tcp_timeouts[IP_VS_TCP_S_LISTEN] );
 	spin_unlock(&cp->lock);
 }

-
-static void ip_vs_tcp_init(struct ip_vs_protocol *pp)
+/* ---------------------------------------------
+ *   Timeouts are per-netns now.
+ * ---------------------------------------------
+ */
+static void __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
 {
-	IP_VS_INIT_HASH_TABLE(tcp_apps);
-	pp->timeout_table = tcp_timeouts;
+	ip_vs_init_hash_table(net->ipvs->tcp_apps, TCP_APP_TAB_SIZE);
+	spin_lock_init(&net->ipvs->tcp_app_lock);
+	pd->timeout_table = ip_vs_create_timeout_table(tcp_timeouts,
+							sizeof(tcp_timeouts));
 }

-
-static void ip_vs_tcp_exit(struct ip_vs_protocol *pp)
+static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
 {
+	kfree(pd->timeout_table);
 }


@@ -669,9 +688,10 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
 	.protocol =		IPPROTO_TCP,
 	.num_states =		IP_VS_TCP_S_LAST,
 	.dont_defrag =		0,
-	.appcnt =		ATOMIC_INIT(0),
-	.init =			ip_vs_tcp_init,
-	.exit =			ip_vs_tcp_exit,
+	.init =			NULL,
+	.exit =			NULL,
+	.init_netns =		__ip_vs_tcp_init,
+	.exit_netns =		__ip_vs_tcp_exit,
 	.register_app =		tcp_register_app,
 	.unregister_app =	tcp_unregister_app,
 	.conn_schedule =	tcp_conn_schedule,
@@ -685,5 +705,5 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
 	.app_conn_bind =	tcp_app_conn_bind,
 	.debug_packet =		ip_vs_tcpudp_debug_packet,
 	.timeout_change =	tcp_timeout_change,
-	.set_state_timeout =	tcp_set_state_timeout,
+/*	.set_state_timeout =	tcp_set_state_timeout, */
 };
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 8553231..d067843 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -9,7 +9,10 @@
  *              as published by the Free Software Foundation; either version
  *              2 of the License, or (at your option) any later version.
  *
- * Changes:
+ * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
+ *
+ *              Network name space (netns) aware.
+ *              Global data moved to netns i.e struct netns_ipvs
  *
  */

@@ -34,7 +37,9 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 	struct ip_vs_service *svc;
 	struct udphdr _udph, *uh;
 	struct ip_vs_iphdr iph;
-
+	struct net *net = dev_net(skb->dev);
+
+	BUG_ON(!net);
 	ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);

 	uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph);
@@ -43,7 +48,7 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		return 0;
 	}

-	svc = ip_vs_service_get(af, skb->mark, iph.protocol,
+	svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
 				&iph.daddr, uh->dest);
 	if (svc) {
 		if (ip_vs_todrop()) {
@@ -323,13 +328,6 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
  *	unregister_app or app_conn_bind is called each time.
  */

-#define	UDP_APP_TAB_BITS	4
-#define	UDP_APP_TAB_SIZE	(1 << UDP_APP_TAB_BITS)
-#define	UDP_APP_TAB_MASK	(UDP_APP_TAB_SIZE - 1)
-
-static struct list_head udp_apps[UDP_APP_TAB_SIZE];
-static DEFINE_SPINLOCK(udp_app_lock);
-
 static inline __u16 udp_app_hashkey(__be16 port)
 {
 	return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port)
@@ -337,47 +335,52 @@ static inline __u16 udp_app_hashkey(__be16 port)
 }


-static int udp_register_app(struct ip_vs_app *inc)
+static int udp_register_app(struct net *net, struct ip_vs_app *inc)
 {
 	struct ip_vs_app *i;
 	__u16 hash;
 	__be16 port = inc->port;
 	int ret = 0;
+	struct netns_ipvs *ipvs = net->ipvs;
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);

+	BUG_ON(!pd);
 	hash = udp_app_hashkey(port);

-
-	spin_lock_bh(&udp_app_lock);
-	list_for_each_entry(i, &udp_apps[hash], p_list) {
+	spin_lock_bh(&ipvs->udp_app_lock);
+	list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) {
 		if (i->port == port) {
 			ret = -EEXIST;
 			goto out;
 		}
 	}
-	list_add(&inc->p_list, &udp_apps[hash]);
-	atomic_inc(&ip_vs_protocol_udp.appcnt);
+	list_add(&inc->p_list, &ipvs->udp_apps[hash]);
+	atomic_inc(&pd->appcnt);

   out:
-	spin_unlock_bh(&udp_app_lock);
+	spin_unlock_bh(&ipvs->udp_app_lock);
 	return ret;
 }


-static void
-udp_unregister_app(struct ip_vs_app *inc)
+static void udp_unregister_app(struct net *net, struct ip_vs_app *inc)
 {
-	spin_lock_bh(&udp_app_lock);
-	atomic_dec(&ip_vs_protocol_udp.appcnt);
+	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+
+	BUG_ON(!pd);
+	spin_lock_bh(&net->ipvs->udp_app_lock);
+	atomic_dec(&pd->appcnt);
 	list_del(&inc->p_list);
-	spin_unlock_bh(&udp_app_lock);
+	spin_unlock_bh(&net->ipvs->udp_app_lock);
 }


-static int udp_app_conn_bind(struct ip_vs_conn *cp)
+static int udp_app_conn_bind(struct net *net, struct ip_vs_conn *cp)
 {
 	int hash;
 	struct ip_vs_app *inc;
 	int result = 0;
+	struct netns_ipvs *ipvs = net->ipvs;

 	/* Default binding: bind app only for NAT */
 	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
@@ -386,12 +389,12 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
 	/* Lookup application incarnations and bind the right one */
 	hash = udp_app_hashkey(cp->vport);

-	spin_lock(&udp_app_lock);
-	list_for_each_entry(inc, &udp_apps[hash], p_list) {
+	spin_lock(&ipvs->udp_app_lock);
+	list_for_each_entry(inc, &ipvs->udp_apps[hash], p_list) {
 		if (inc->port == cp->vport) {
 			if (unlikely(!ip_vs_app_inc_get(inc)))
 				break;
-			spin_unlock(&udp_app_lock);
+			spin_unlock(&ipvs->udp_app_lock);

 			IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
 				      "%s:%u to app %s on port %u\n",
@@ -408,14 +411,14 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
 			goto out;
 		}
 	}
-	spin_unlock(&udp_app_lock);
+	spin_unlock(&ipvs->udp_app_lock);

   out:
 	return result;
 }


-static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
+static const int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
 	[IP_VS_UDP_S_NORMAL]		=	5*60*HZ,
 	[IP_VS_UDP_S_LAST]		=	2*HZ,
 };
@@ -425,14 +428,20 @@ static const char *const udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
 	[IP_VS_UDP_S_LAST]		=	"BUG!",
 };

-
+/*
 static int
-udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
+udp_set_state_timeout(struct net *net, struct ip_vs_protocol *pp, char *sname,
+                      int to)
 {
-	return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
-				       udp_state_name_table, sname, to);
+	struct ip_vs_proto_data *pd=ip_vs_proto_data_get(net, IPPROTO_UDP);
+	if (pd)
+		return ip_vs_set_state_timeout(pd->timeout_table,
+					       IP_VS_UDP_S_LAST,
+					       udp_state_name_table, sname, to);
+	else
+		return -ENOENT;
 }
-
+*/
 static const char * udp_state_name(int state)
 {
 	if (state >= IP_VS_UDP_S_LAST)
@@ -445,28 +454,40 @@ udp_state_transition(struct ip_vs_conn *cp, int direction,
 		     const struct sk_buff *skb,
 		     struct ip_vs_protocol *pp)
 {
-	cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
+	struct net *net = dev_net(skb->dev);
+	struct ip_vs_proto_data *pd=ip_vs_proto_data_get(net, IPPROTO_UDP);
+	if(unlikely(pd))
+		return 0;
+
+	cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL];
 	return 1;
 }
-
-static void udp_init(struct ip_vs_protocol *pp)
+/* ---------------------------------------------
+ *   Timeouts are per-netns now.
+ * ---------------------------------------------
+ */
+static void __udp_init(struct net *net, struct ip_vs_proto_data *pd)
 {
-	IP_VS_INIT_HASH_TABLE(udp_apps);
-	pp->timeout_table = udp_timeouts;
+	ip_vs_init_hash_table(net->ipvs->udp_apps, UDP_APP_TAB_SIZE);
+	spin_lock_init(&net->ipvs->udp_app_lock);
+	pd->timeout_table = ip_vs_create_timeout_table(udp_timeouts,
+							sizeof(udp_timeouts));
 }

-static void udp_exit(struct ip_vs_protocol *pp)
+static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd)
 {
+	kfree(pd->timeout_table);
 }

-
 struct ip_vs_protocol ip_vs_protocol_udp = {
 	.name =			"UDP",
 	.protocol =		IPPROTO_UDP,
 	.num_states =		IP_VS_UDP_S_LAST,
 	.dont_defrag =		0,
-	.init =			udp_init,
-	.exit =			udp_exit,
+	.init =			NULL,
+	.exit =			NULL,
+	.init_netns =		__udp_init,
+	.exit_netns =		__udp_exit,
 	.conn_schedule =	udp_conn_schedule,
 	.conn_in_get =		ip_vs_conn_in_get_proto,
 	.conn_out_get =		ip_vs_conn_out_get_proto,
@@ -480,5 +501,5 @@ struct ip_vs_protocol ip_vs_protocol_udp = {
 	.app_conn_bind =	udp_app_conn_bind,
 	.debug_packet =		ip_vs_tcpudp_debug_packet,
 	.timeout_change =	NULL,
-	.set_state_timeout =	udp_set_state_timeout,
+/*	.set_state_timeout =	udp_set_state_timeout, */
 };

-- 
Regards
Hans Schillstrom <hans.schillstrom@ericsson.com>

^ permalink raw reply related

* [RFC PATCH 9/9] ipvs network name space aware
From: Hans Schillstrom @ 2010-10-08 11:17 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter-devel; +Cc: horms, ja, wensong, daniel.lezcano

This patch contains the changes to ip_vs_sync.c and ip_vs_xmit.c.

There is one sync daemon per netns, and a number is appended to its thread
name (a kind of incarnation counter, e.g. "ipvs_master:<inc>").

Part of the netns migration in ip_vs_xmit.c was done in the IPv6 tunnel patch,
so make sure that "[patch v4] ipvs: IPv6 tunnel mode" is applied first.

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>

diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 7ba0693..98575da 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -74,6 +74,7 @@ struct ip_vs_sync_conn_options {
 struct ip_vs_sync_thread_data {
 	struct socket *sock;
 	char *buf;
+	struct net *net;
 };

 #define SIMPLE_CONN_SIZE  (sizeof(struct ip_vs_sync_conn))
@@ -113,9 +114,6 @@ struct ip_vs_sync_mesg {
 	/* ip_vs_sync_conn entries start here */
 };

-/* the maximum length of sync (sending/receiving) message */
-static int sync_send_mesg_maxlen;
-static int sync_recv_mesg_maxlen;

 struct ip_vs_sync_buff {
 	struct list_head        list;
@@ -127,70 +125,41 @@ struct ip_vs_sync_buff {
 	unsigned char           *end;
 };

-
-/* the sync_buff list head and the lock */
-static LIST_HEAD(ip_vs_sync_queue);
-static DEFINE_SPINLOCK(ip_vs_sync_lock);
-
-/* current sync_buff for accepting new conn entries */
-static struct ip_vs_sync_buff   *curr_sb = NULL;
-static DEFINE_SPINLOCK(curr_sb_lock);
-
-/* ipvs sync daemon state */
-volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
-volatile int ip_vs_master_syncid = 0;
-volatile int ip_vs_backup_syncid = 0;
-
-/* multicast interface name */
-char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
-
-/* sync daemon tasks */
-static struct task_struct *sync_master_thread;
-static struct task_struct *sync_backup_thread;
-
-/* multicast addr */
-static struct sockaddr_in mcast_addr = {
-	.sin_family		= AF_INET,
-	.sin_port		= cpu_to_be16(IP_VS_SYNC_PORT),
-	.sin_addr.s_addr	= cpu_to_be32(IP_VS_SYNC_GROUP),
-};
-
-
-static inline struct ip_vs_sync_buff *sb_dequeue(void)
+static inline struct ip_vs_sync_buff *sb_dequeue(struct net *net)
 {
 	struct ip_vs_sync_buff *sb;
+	struct netns_ipvs *ipvs = net->ipvs;

-	spin_lock_bh(&ip_vs_sync_lock);
-	if (list_empty(&ip_vs_sync_queue)) {
+	spin_lock_bh(&ipvs->sync_lock);
+	if (list_empty(&ipvs->sync_queue)) {
 		sb = NULL;
 	} else {
-		sb = list_entry(ip_vs_sync_queue.next,
+		sb = list_entry(ipvs->sync_queue.next,
 				struct ip_vs_sync_buff,
 				list);
 		list_del(&sb->list);
 	}
-	spin_unlock_bh(&ip_vs_sync_lock);
+	spin_unlock_bh(&ipvs->sync_lock);

 	return sb;
 }

-static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
+static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(struct net *net)
 {
 	struct ip_vs_sync_buff *sb;

 	if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
 		return NULL;

-	if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
+	if (!(sb->mesg=kmalloc(net->ipvs->sync_send_mesg_maxlen, GFP_ATOMIC))) {
 		kfree(sb);
 		return NULL;
 	}
 	sb->mesg->nr_conns = 0;
-	sb->mesg->syncid = ip_vs_master_syncid;
+	sb->mesg->syncid = net->ipvs->master_syncid;
 	sb->mesg->size = 4;
 	sb->head = (unsigned char *)sb->mesg + 4;
-	sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
+	sb->end = (unsigned char *)sb->mesg + net->ipvs->sync_send_mesg_maxlen;
 	sb->firstuse = jiffies;
 	return sb;
 }
@@ -201,14 +170,16 @@ static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
 	kfree(sb);
 }

-static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
+static inline void sb_queue_tail(struct net *net, struct ip_vs_sync_buff *sb)
 {
-	spin_lock(&ip_vs_sync_lock);
-	if (ip_vs_sync_state & IP_VS_STATE_MASTER)
-		list_add_tail(&sb->list, &ip_vs_sync_queue);
+	struct netns_ipvs *ipvs = net->ipvs;
+
+	spin_lock(&ipvs->sync_lock);
+	if (ipvs->sync_state & IP_VS_STATE_MASTER)
+		list_add_tail(&sb->list, &ipvs->sync_queue);
 	else
 		ip_vs_sync_buff_release(sb);
-	spin_unlock(&ip_vs_sync_lock);
+	spin_unlock(&ipvs->sync_lock);
 }

 /*
@@ -216,18 +187,19 @@ static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
  *	than the specified time or the specified time is zero.
  */
 static inline struct ip_vs_sync_buff *
-get_curr_sync_buff(unsigned long time)
+get_curr_sync_buff(struct net *net, unsigned long time)
 {
 	struct ip_vs_sync_buff *sb;
+	struct netns_ipvs *ipvs = net->ipvs;

-	spin_lock_bh(&curr_sb_lock);
-	if (curr_sb && (time == 0 ||
-			time_before(jiffies - curr_sb->firstuse, time))) {
-		sb = curr_sb;
-		curr_sb = NULL;
+	spin_lock_bh(&ipvs->sync_buff_lock);
+	if (ipvs->sync_buff && (time == 0 ||
+			time_before(jiffies - ipvs->sync_buff->firstuse, time))) {
+		sb = ipvs->sync_buff;
+		ipvs->sync_buff = NULL;
 	} else
 		sb = NULL;
-	spin_unlock_bh(&curr_sb_lock);
+	spin_unlock_bh(&ipvs->sync_buff_lock);
 	return sb;
 }

@@ -236,16 +208,17 @@ get_curr_sync_buff(unsigned long time)
  *      Add an ip_vs_conn information into the current sync_buff.
  *      Called by ip_vs_in.
  */
-void ip_vs_sync_conn(struct ip_vs_conn *cp)
+void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp)
 {
 	struct ip_vs_sync_mesg *m;
 	struct ip_vs_sync_conn *s;
 	int len;
+	struct netns_ipvs *ipvs = net->ipvs;

-	spin_lock(&curr_sb_lock);
-	if (!curr_sb) {
-		if (!(curr_sb=ip_vs_sync_buff_create())) {
-			spin_unlock(&curr_sb_lock);
+	spin_lock(&ipvs->sync_buff_lock);
+	if (!ipvs->sync_buff) {
+		if (!(ipvs->sync_buff=ip_vs_sync_buff_create(net))) {
+			spin_unlock(&ipvs->sync_buff_lock);
 			pr_err("ip_vs_sync_buff_create failed.\n");
 			return;
 		}
@@ -253,8 +226,8 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)

 	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
 		SIMPLE_CONN_SIZE;
-	m = curr_sb->mesg;
-	s = (struct ip_vs_sync_conn *)curr_sb->head;
+	m = ipvs->sync_buff->mesg;
+	s = (struct ip_vs_sync_conn *)ipvs->sync_buff->head;

 	/* copy members */
 	s->protocol = cp->protocol;
@@ -274,18 +247,18 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)

 	m->nr_conns++;
 	m->size += len;
-	curr_sb->head += len;
+	ipvs->sync_buff->head += len;

 	/* check if there is a space for next one */
-	if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
-		sb_queue_tail(curr_sb);
-		curr_sb = NULL;
+	if (ipvs->sync_buff->head+FULL_CONN_SIZE > ipvs->sync_buff->end) {
+		sb_queue_tail(net, ipvs->sync_buff);
+		ipvs->sync_buff = NULL;
 	}
-	spin_unlock(&curr_sb_lock);
+	spin_unlock(&ipvs->sync_buff_lock);

 	/* synchronize its controller if it has */
 	if (cp->control)
-		ip_vs_sync_conn(cp->control);
+		ip_vs_sync_conn(net, cp->control);
 }


@@ -293,13 +266,15 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)
  *      Process received multicast message and create the corresponding
  *      ip_vs_conn entries.
  */
-static void ip_vs_process_message(const char *buffer, const size_t buflen)
+static void
+ip_vs_process_message(struct net *net, const char *buffer, const size_t buflen)
 {
 	struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
 	struct ip_vs_sync_conn *s;
 	struct ip_vs_sync_conn_options *opt;
 	struct ip_vs_conn *cp;
 	struct ip_vs_protocol *pp;
+	struct ip_vs_proto_data *pd;
 	struct ip_vs_dest *dest;
 	char *p;
 	int i;
@@ -318,7 +293,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
 	}

 	/* SyncID sanity check */
-	if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
+	if (net->ipvs->backup_syncid != 0 && m->syncid != net->ipvs->backup_syncid) {
 		IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
 			  m->syncid);
 		return;
@@ -371,13 +346,13 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
 		}

 		if (!(flags & IP_VS_CONN_F_TEMPLATE))
-			cp = ip_vs_conn_in_get(AF_INET, s->protocol,
+			cp = ip_vs_conn_in_get(net, AF_INET, s->protocol,
 					       (union nf_inet_addr *)&s->caddr,
 					       s->cport,
 					       (union nf_inet_addr *)&s->vaddr,
 					       s->vport);
 		else
-			cp = ip_vs_ct_in_get(AF_INET, s->protocol,
+			cp = ip_vs_ct_in_get(net, AF_INET, s->protocol,
 					     (union nf_inet_addr *)&s->caddr,
 					     s->cport,
 					     (union nf_inet_addr *)&s->vaddr,
@@ -388,7 +363,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
 			 * If it is not found the connection will remain unbound
 			 * but still handled.
 			 */
-			dest = ip_vs_find_dest(AF_INET,
+			dest = ip_vs_find_dest(net, AF_INET,
 					       (union nf_inet_addr *)&s->daddr,
 					       s->dport,
 					       (union nf_inet_addr *)&s->vaddr,
@@ -406,7 +381,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
 				else
 					flags &= ~IP_VS_CONN_F_INACTIVE;
 			}
-			cp = ip_vs_conn_new(AF_INET, s->protocol,
+			cp = ip_vs_conn_new(net, AF_INET, s->protocol,
 					    (union nf_inet_addr *)&s->caddr,
 					    s->cport,
 					    (union nf_inet_addr *)&s->vaddr,
@@ -421,7 +396,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
 				return;
 			}
 		} else if (!cp->dest) {
-			dest = ip_vs_try_bind_dest(cp);
+			dest = ip_vs_try_bind_dest(net, cp);
 			if (dest)
 				atomic_dec(&dest->refcnt);
 		} else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
@@ -452,7 +427,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)

 		if (opt)
 			memcpy(&cp->in_seq, opt, sizeof(*opt));
-		atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
+		atomic_set(&cp->in_pkts, net->ipvs->sysctl_sync_threshold[0]);
 		cp->state = state;
 		cp->old_state = cp->state;
 		/*
@@ -461,8 +436,9 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
 		 * virtual service. If needed, we can do it for
 		 * non-fwmark persistent services.
 		 */
-		if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table)
-			cp->timeout = pp->timeout_table[state];
+		pd = ip_vs_proto_data_get(net,cp->protocol);
+		if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table )
+			cp->timeout = pd->timeout_table[state];
 		else
 			cp->timeout = (3*60*HZ);
 		ip_vs_conn_put(cp);
@@ -503,8 +479,10 @@ static int set_mcast_if(struct sock *sk, char *ifname)
 {
 	struct net_device *dev;
 	struct inet_sock *inet = inet_sk(sk);
+	struct net *net = sock_net(sk);

-	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+	BUG_ON(!net);
+	if ((dev = __dev_get_by_name(net, ifname)) == NULL)
 		return -ENODEV;

 	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
@@ -523,30 +501,31 @@ static int set_mcast_if(struct sock *sk, char *ifname)
  *	Set the maximum length of sync message according to the
  *	specified interface's MTU.
  */
-static int set_sync_mesg_maxlen(int sync_state)
+static int set_sync_mesg_maxlen(struct net *net, int sync_state)
 {
 	struct net_device *dev;
 	int num;
+	struct netns_ipvs *ipvs = net->ipvs;

 	if (sync_state == IP_VS_STATE_MASTER) {
-		if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL)
+		if ((dev = __dev_get_by_name(net, ipvs->master_mcast_ifn)) == NULL)
 			return -ENODEV;

 		num = (dev->mtu - sizeof(struct iphdr) -
 		       sizeof(struct udphdr) -
 		       SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
-		sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
+		ipvs->sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
 			SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
 		IP_VS_DBG(7, "setting the maximum length of sync sending "
-			  "message %d.\n", sync_send_mesg_maxlen);
+			  "message %d.\n", ipvs->sync_send_mesg_maxlen);
 	} else if (sync_state == IP_VS_STATE_BACKUP) {
-		if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL)
+		if ((dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn)) == NULL)
 			return -ENODEV;

-		sync_recv_mesg_maxlen = dev->mtu -
+		ipvs->sync_recv_mesg_maxlen = dev->mtu -
 			sizeof(struct iphdr) - sizeof(struct udphdr);
 		IP_VS_DBG(7, "setting the maximum length of sync receiving "
-			  "message %d.\n", sync_recv_mesg_maxlen);
+			  "message %d.\n", ipvs->sync_recv_mesg_maxlen);
 	}

 	return 0;
@@ -564,11 +543,13 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
 	struct ip_mreqn mreq;
 	struct net_device *dev;
 	int ret;
+	struct net *net = sock_net(sk);

+	BUG_ON(!net);
 	memset(&mreq, 0, sizeof(mreq));
 	memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));

-	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+	if ((dev = __dev_get_by_name(net, ifname)) == NULL)
 		return -ENODEV;
 	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
 		return -EINVAL;
@@ -588,8 +569,10 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname)
 	struct net_device *dev;
 	__be32 addr;
 	struct sockaddr_in sin;
+	struct net *net = sock_net(sock->sk);

-	if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
+	BUG_ON(!net);
+	if ((dev = __dev_get_by_name(net, ifname)) == NULL)
 		return -ENODEV;

 	addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
@@ -611,19 +594,19 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname)
 /*
  *      Set up sending multicast socket over UDP
  */
-static struct socket * make_send_sock(void)
+static struct socket * make_send_sock(struct net *net)
 {
 	struct socket *sock;
 	int result;

-	/* First create a socket */
-	result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+	/* First create a socket in current netns  */
+	result = sock_create(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
 	if (result < 0) {
 		pr_err("Error during creation of socket; terminating\n");
 		return ERR_PTR(result);
 	}

-	result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn);
+	result = set_mcast_if(sock->sk, net->ipvs->master_mcast_ifn);
 	if (result < 0) {
 		pr_err("Error setting outbound mcast interface\n");
 		goto error;
@@ -632,13 +615,14 @@ static struct socket * make_send_sock(void)
 	set_mcast_loop(sock->sk, 0);
 	set_mcast_ttl(sock->sk, 1);

-	result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn);
+	result = bind_mcastif_addr(sock, net->ipvs->master_mcast_ifn);
 	if (result < 0) {
 		pr_err("Error binding address of the mcast interface\n");
 		goto error;
 	}

-	result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
+	result = sock->ops->connect(sock,
+			(struct sockaddr *) &net->ipvs->sync_mcast_addr,
 			sizeof(struct sockaddr), 0);
 	if (result < 0) {
 		pr_err("Error connecting to the multicast addr\n");
@@ -656,13 +640,13 @@ static struct socket * make_send_sock(void)
 /*
  *      Set up receiving multicast socket over UDP
  */
-static struct socket * make_receive_sock(void)
+static struct socket * make_receive_sock(struct net *net)
 {
 	struct socket *sock;
 	int result;

-	/* First create a socket */
-	result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+	/* First create a socket in current netns */
+	result = sock_create(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
 	if (result < 0) {
 		pr_err("Error during creation of socket; terminating\n");
 		return ERR_PTR(result);
@@ -671,7 +655,8 @@ static struct socket * make_receive_sock(void)
 	/* it is equivalent to the REUSEADDR option in user-space */
 	sock->sk->sk_reuse = 1;

-	result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
+	result = sock->ops->bind(sock,
+			(struct sockaddr *) &net->ipvs->sync_mcast_addr,
 			sizeof(struct sockaddr));
 	if (result < 0) {
 		pr_err("Error binding to the multicast addr\n");
@@ -680,8 +665,8 @@ static struct socket * make_receive_sock(void)

 	/* join the multicast group */
 	result = join_mcast_group(sock->sk,
-			(struct in_addr *) &mcast_addr.sin_addr,
-			ip_vs_backup_mcast_ifn);
+			(struct in_addr *) &net->ipvs->sync_mcast_addr.sin_addr,
+			net->ipvs->backup_mcast_ifn);
 	if (result < 0) {
 		pr_err("Error joining to the multicast group\n");
 		goto error;
@@ -756,16 +741,17 @@ static int sync_thread_master(void *data)

 	pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
 		"syncid = %d\n",
-		ip_vs_master_mcast_ifn, ip_vs_master_syncid);
+		tinfo->net->ipvs->master_mcast_ifn,
+		tinfo->net->ipvs->master_syncid);

 	while (!kthread_should_stop()) {
-		while ((sb = sb_dequeue())) {
+		while ((sb = sb_dequeue(tinfo->net))) {
 			ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
 			ip_vs_sync_buff_release(sb);
 		}

 		/* check if entries stay in curr_sb for 2 seconds */
-		sb = get_curr_sync_buff(2 * HZ);
+		sb = get_curr_sync_buff(tinfo->net, 2 * HZ);
 		if (sb) {
 			ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
 			ip_vs_sync_buff_release(sb);
@@ -775,12 +761,12 @@ static int sync_thread_master(void *data)
 	}

 	/* clean up the sync_buff queue */
-	while ((sb=sb_dequeue())) {
+	while ((sb=sb_dequeue(tinfo->net))) {
 		ip_vs_sync_buff_release(sb);
 	}

 	/* clean up the current sync_buff */
-	if ((sb = get_curr_sync_buff(0))) {
+	if ((sb = get_curr_sync_buff(tinfo->net, 0))) {
 		ip_vs_sync_buff_release(sb);
 	}

@@ -796,10 +782,11 @@ static int sync_thread_backup(void *data)
 {
 	struct ip_vs_sync_thread_data *tinfo = data;
 	int len;
-
+
 	pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
 		"syncid = %d\n",
-		ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
+		tinfo->net->ipvs->backup_mcast_ifn,
+		tinfo->net->ipvs->backup_syncid);

 	while (!kthread_should_stop()) {
 		wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
@@ -809,16 +796,15 @@ static int sync_thread_backup(void *data)
 		/* do we have data now? */
 		while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
 			len = ip_vs_receive(tinfo->sock, tinfo->buf,
-					sync_recv_mesg_maxlen);
+					tinfo->net->ipvs->sync_recv_mesg_maxlen);
 			if (len <= 0) {
 				pr_err("receiving message error\n");
 				break;
 			}
-
-			/* disable bottom half, because it accesses the data
+			/* disable bottom half per netns, because it accesses the data
 			   shared by softirq while getting/creating conns */
 			local_bh_disable();
-			ip_vs_process_message(tinfo->buf, len);
+			ip_vs_process_message(tinfo->net, tinfo->buf, len);
 			local_bh_enable();
 		}
 	}
@@ -832,41 +818,43 @@ static int sync_thread_backup(void *data)
 }


-int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
+int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
 {
 	struct ip_vs_sync_thread_data *tinfo;
 	struct task_struct **realtask, *task;
 	struct socket *sock;
+	struct netns_ipvs *ipvs = net->ipvs;
 	char *name, *buf = NULL;
 	int (*threadfn)(void *data);
 	int result = -ENOMEM;

-	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
+	IP_VS_DBG(7, "%s(): pid %d inc:%d\n", __func__, task_pid_nr(current),
+		                             ipvs->inc);
 	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
 		  sizeof(struct ip_vs_sync_conn));

 	if (state == IP_VS_STATE_MASTER) {
-		if (sync_master_thread)
+		if (ipvs->sync_master_thread)
 			return -EEXIST;

-		strlcpy(ip_vs_master_mcast_ifn, mcast_ifn,
-			sizeof(ip_vs_master_mcast_ifn));
-		ip_vs_master_syncid = syncid;
-		realtask = &sync_master_thread;
-		name = "ipvs_syncmaster";
+		strlcpy(ipvs->master_mcast_ifn, mcast_ifn,
+			sizeof(ipvs->master_mcast_ifn));
+		ipvs->master_syncid = syncid;
+		realtask = &ipvs->sync_master_thread;
+		name = "ipvs_master:%d";
 		threadfn = sync_thread_master;
-		sock = make_send_sock();
+		sock = make_send_sock(net);
 	} else if (state == IP_VS_STATE_BACKUP) {
-		if (sync_backup_thread)
+		if (ipvs->sync_backup_thread)
 			return -EEXIST;

-		strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn,
-			sizeof(ip_vs_backup_mcast_ifn));
-		ip_vs_backup_syncid = syncid;
-		realtask = &sync_backup_thread;
-		name = "ipvs_syncbackup";
+		strlcpy(ipvs->backup_mcast_ifn, mcast_ifn,
+			sizeof(ipvs->backup_mcast_ifn));
+		ipvs->backup_syncid = syncid;
+		realtask = &ipvs->sync_backup_thread;
+		name = "ipvs_backup:%d";
 		threadfn = sync_thread_backup;
-		sock = make_receive_sock();
+		sock = make_receive_sock(net);
 	} else {
 		return -EINVAL;
 	}
@@ -876,9 +864,9 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
 		goto out;
 	}

-	set_sync_mesg_maxlen(state);
+	set_sync_mesg_maxlen(net, state);
 	if (state == IP_VS_STATE_BACKUP) {
-		buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL);
+		buf = kmalloc(ipvs->sync_recv_mesg_maxlen, GFP_KERNEL);
 		if (!buf)
 			goto outsocket;
 	}
@@ -889,16 +877,17 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)

 	tinfo->sock = sock;
 	tinfo->buf = buf;
+	tinfo->net = net;

-	task = kthread_run(threadfn, tinfo, name);
+	task = kthread_run(threadfn, tinfo, name, ipvs->inc);
 	if (IS_ERR(task)) {
 		result = PTR_ERR(task);
 		goto outtinfo;
 	}
-
+	IP_VS_DBG(1, "kthread %s started (%d)\n", name, task->pid);
 	/* mark as active */
 	*realtask = task;
-	ip_vs_sync_state |= state;
+	ipvs->sync_state |= state;

 	/* increase the module use count */
 	ip_vs_use_count_inc();
@@ -916,16 +905,19 @@ out:
 }


-int stop_sync_thread(int state)
+int stop_sync_thread(struct net *net, int state)
 {
+	struct netns_ipvs *ipvs = net->ipvs;
+
 	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));

 	if (state == IP_VS_STATE_MASTER) {
-		if (!sync_master_thread)
+		if (!ipvs->sync_master_thread)
 			return -ESRCH;

-		pr_info("stopping master sync thread %d ...\n",
-			task_pid_nr(sync_master_thread));
+		pr_info("stopping master sync thread %d  inc:%d...\n",
+			task_pid_nr(ipvs->sync_master_thread),
+			ipvs->inc);

 		/*
 		 * The lock synchronizes with sb_queue_tail(), so that we don't
@@ -933,21 +925,22 @@ int stop_sync_thread(int state)
 		 * progress of stopping the master sync daemon.
 		 */

-		spin_lock_bh(&ip_vs_sync_lock);
-		ip_vs_sync_state &= ~IP_VS_STATE_MASTER;
-		spin_unlock_bh(&ip_vs_sync_lock);
-		kthread_stop(sync_master_thread);
-		sync_master_thread = NULL;
+		spin_lock_bh(&ipvs->sync_lock);
+		ipvs->sync_state &= ~IP_VS_STATE_MASTER;
+		spin_unlock_bh(&ipvs->sync_lock);
+		kthread_stop(ipvs->sync_master_thread);
+		ipvs->sync_master_thread = NULL;
 	} else if (state == IP_VS_STATE_BACKUP) {
-		if (!sync_backup_thread)
+		if (!ipvs->sync_backup_thread)
 			return -ESRCH;

-		pr_info("stopping backup sync thread %d ...\n",
-			task_pid_nr(sync_backup_thread));
+		pr_info("stopping backup sync thread %d inc:%d...\n",
+			task_pid_nr(ipvs->sync_backup_thread),
+			ipvs->inc);

-		ip_vs_sync_state &= ~IP_VS_STATE_BACKUP;
-		kthread_stop(sync_backup_thread);
-		sync_backup_thread = NULL;
+		ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
+		kthread_stop(ipvs->sync_backup_thread);
+		ipvs->sync_backup_thread = NULL;
 	} else {
 		return -EINVAL;
 	}
@@ -957,3 +950,41 @@ int stop_sync_thread(int state)

 	return 0;
 }
+
+/*
+ * Initialize data struct for each netns
+ */
+static int __net_init __ip_vs_sync_init(struct net *net)
+{
+	struct netns_ipvs *ipvs = net->ipvs;
+	INIT_LIST_HEAD(&ipvs->sync_queue);
+	spin_lock_init(&ipvs->sync_lock);
+	spin_lock_init(&ipvs->sync_buff_lock);
+
+	ipvs->sync_mcast_addr.sin_family = AF_INET;
+	ipvs->sync_mcast_addr.sin_port = cpu_to_be16(IP_VS_SYNC_PORT);
+	ipvs->sync_mcast_addr.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP);
+	return 0;
+}
+
+static void __ip_vs_sync_cleanup(struct net *net)
+{
+	stop_sync_thread(net, net->ipvs->sync_state &
+			      (IP_VS_STATE_MASTER | IP_VS_STATE_BACKUP));
+	return;
+}
+static struct pernet_operations ipvs_sync_ops = {
+	.init = __ip_vs_sync_init,
+	.exit = __ip_vs_sync_cleanup,
+};
+
+
+int __init ip_vs_sync_init(void)
+{
+	return register_pernet_subsys(&ipvs_sync_ops);
+}
+
+void __exit ip_vs_sync_cleanup(void)
+{
+	unregister_pernet_subsys(&ipvs_sync_ops);
+}
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index a2e8497..d68178f 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -410,13 +410,15 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* check if it is a connection of no-client-port */
 	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
 		__be16 _pt, *p;
+		struct net *net;
 		p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
 		if (p == NULL)
 			goto tx_error;
-		ip_vs_conn_fill_cport(cp, *p);
+		net = dev_net(skb->dev);
+		ip_vs_conn_fill_cport(net, cp, *p);
 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
 	}
-
+	IP_VS_DBG(10, "%s() dst:%x\n", __func__, iph->daddr);
 	if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
 		goto tx_error_icmp;

@@ -486,14 +488,16 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	/* check if it is a connection of no-client-port */
 	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
 		__be16 _pt, *p;
+		struct net *net;
 		p = skb_header_pointer(skb, sizeof(struct ipv6hdr),
 				       sizeof(_pt), &_pt);
 		if (p == NULL)
 			goto tx_error;
-		ip_vs_conn_fill_cport(cp, *p);
+		net = dev_net(skb->dev);
+		BUG_ON(!net);
+		ip_vs_conn_fill_cport(net, cp, *p);
 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
 	}
-
 	rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
 	if (!rt)
 		goto tx_error_icmp;

-- 
Regards
Hans Schillstrom <hans.schillstrom@ericsson.com>

^ permalink raw reply related

* Re: BUG ? ipip unregister_netdevice_many()
From: Daniel Lezcano @ 2010-10-08 11:19 UTC (permalink / raw)
  To: Hans Schillstrom; +Cc: netdev@vger.kernel.org, Eric W. Biederman
In-Reply-To: <201010071048.12817.hans.schillstrom@ericsson.com>

On 10/07/2010 10:48 AM, Hans Schillstrom wrote:
> Hello
> I'm trying to exit a network name space and it doesn't work (or am I doing something wrong?)
> The only netdevices left are lo and the tunnels ip6tnl0, sit0 and tunl0 when exiting netns.
>
> A netns is created by lxc-execute with two interfaces eth0 eth1 (macvlan)
> (see conf file at the end)
>
> Kernel: net-next-2.6 top from 4 october 2010
>    

Hi Hans,

I tried to reproduce your problem but I just get a big kernel crash when 
exiting the container :/

The stack is different but it may be related to the same problem.

BUG: unable to handle kernel paging request at ffff88003ba453a0
IP: [<ffffffff813020b6>] macvlan_stop+0x57/0x7d
PGD 180b063 PUD 180f063 PMD 1ffdb067 PTE 3ba45160
Oops: 0002 [#1] DEBUG_PAGEALLOC
last sysfs file: /sys/devices/virtual/net/mc0PyXBA/type
CPU 0
Pid: 5, comm: kworker/u:0 Not tainted 2.6.36-rc7-next-20101007+ #11 /Bochs
RIP: 0010:[<ffffffff813020b6>]  [<ffffffff813020b6>] macvlan_stop+0x57/0x7d
RSP: 0018:ffff88003f111c30  EFLAGS: 00010246
RAX: 0000000000000000 RBX: ffff88003bdd1e60 RCX: 000000000000c100
RDX: ffff88003ba453a0 RSI: ffff88003f111d70 RDI: ffffffff810300e5
RBP: ffff88003f111c50 R08: ffff88003f111d70 R09: 00000000000000cc
R10: 0000000000000001 R11: ffff88003f111ba0 R12: ffff88003bdd1800
R13: ffff880039fec800 R14: ffff88003f111d70 R15: ffff88003ba06830
FS:  0000000000000000(0000) GS:ffffffff8181b000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: ffff88003ba453a0 CR3: 000000003c284000 CR4: 00000000000006f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process kworker/u:0 (pid: 5, threadinfo ffff88003f110000, task 
ffff88003f0f20a0)
Stack:
  ffffffff81657a10 ffff88003bdd1800 ffffffff81657a10 ffff88003bb23800
<0> ffff88003f111c70 ffffffff81362d09 ffff88003bdd1800 ffff88003f111cf0
<0> ffff88003f111c90 ffffffff81362d31 ffff88003ba067c0 ffff88003bdd1800
Call Trace:
  [<ffffffff81362d09>] __dev_close+0x75/0x83
  [<ffffffff81362d31>] dev_close+0x1a/0x3f
  [<ffffffff81362e38>] rollback_registered_many+0xe2/0x21c
  [<ffffffff81362f88>] unregister_netdevice_many+0x16/0x6d
  [<ffffffff8136314d>] default_device_exit_batch+0xa7/0xbb
  [<ffffffff8135db06>] ops_exit_list+0x4e/0x56
  [<ffffffff8135e285>] cleanup_net+0xf5/0x195
  [<ffffffff8103e084>] process_one_work+0x25d/0x3e7
  [<ffffffff8103e027>] ? process_one_work+0x200/0x3e7
  [<ffffffff8135e190>] ? cleanup_net+0x0/0x195
  [<ffffffff8103e54a>] worker_thread+0x1b5/0x342
  [<ffffffff8103e395>] ? worker_thread+0x0/0x342
  [<ffffffff81041495>] kthread+0x7c/0x84
  [<ffffffff810034f4>] kernel_thread_helper+0x4/0x10
  [<ffffffff814389ba>] ? restore_args+0x0/0x30
  [<ffffffff81041419>] ? kthread+0x0/0x84
  [<ffffffff810034f0>] ? kernel_thread_helper+0x0/0x10
Code: 00 00 02 74 0b 83 ce ff 4c 89 ef e8 ab eb 05 00 49 8b b4 24 a0 02 
00 00 4c 89 ef e8 b9 52 06 00 48 8b 43 18 48 8b 53 20 48 85 c0 <48> 89 
02 74 04 48 89 50 08 48 be 00 02 20 00 00 00 ad de 48 89
RIP  [<ffffffff813020b6>] macvlan_stop+0x57/0x7d
  RSP <ffff88003f111c30>
CR2: ffff88003ba453a0
---[ end trace 05c41c2103816005 ]---
BUG: unable to handle kernel paging request at fffffffffffffff8
IP: [<ffffffff810410bf>] kthread_data+0xb/0x11
PGD 180c067 PUD 180d067 PMD 0
Oops: 0000 [#2] DEBUG_PAGEALLOC
last sysfs file: /sys/devices/virtual/net/mc0PyXBA/type
CPU 0
Pid: 5, comm: kworker/u:0 Tainted: G      D     
2.6.36-rc7-next-20101007+ #11 /Bochs
RIP: 0010:[<ffffffff810410bf>]  [<ffffffff810410bf>] kthread_data+0xb/0x11
RSP: 0018:ffff88003f111868  EFLAGS: 00010096
RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff88003f111fd8
RDX: ffff88003f0f20a0 RSI: 0000000000000000 RDI: ffff88003f0f20a0
RBP: ffff88003f111868 R08: 0000000000000002 R09: 0000000000000001
R10: 0000000000000246 R11: 09f911029d74e35b R12: 0000000000000000
R13: ffff88003f111948 R14: ffff88003f0c60a0 R15: ffff88003f0f2218
FS:  0000000000000000(0000) GS:ffffffff8181b000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: fffffffffffffff8 CR3: 000000003cb19000 CR4: 00000000000006f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process kworker/u:0 (pid: 5, threadinfo ffff88003f110000, task 
ffff88003f0f20a0)
Stack:
  ffff88003f111888 ffffffff8103d4e4 ffff88003f111888 ffff88003f0f2310
<0> ffff88003f110010 ffff88003f0f20a0 ffff88003f111fd8 ffff88003f111fd8
Call Trace:
  [<ffffffff8103d4e4>] wq_worker_sleeping+0x10/0x76
  [<ffffffff81435ffe>] schedule+0xf4/0x405
  [<ffffffff8102ebc4>] do_exit+0x647/0x660
  [<ffffffff81005ba0>] oops_end+0xb3/0xbb
  [<ffffffff8101c6b8>] no_context+0x1f5/0x204
  [<ffffffff8101c854>] __bad_area_nosemaphore+0x18d/0x1b0
  [<ffffffff8101c885>] bad_area_nosemaphore+0xe/0x10
  [<ffffffff8101cb52>] do_page_fault+0x16b/0x34d
  [<ffffffff81300a85>] ? ei_set_multicast_list+0x1f/0x3d
  [<ffffffff81437f46>] ? trace_hardirqs_off_thunk+0x3a/0x3c
  [<ffffffff81438b9f>] page_fault+0x1f/0x30
  [<ffffffff810300e5>] ? local_bh_enable_ip+0xb7/0xbd
  [<ffffffff813020b6>] ? macvlan_stop+0x57/0x7d
  [<ffffffff81362d09>] __dev_close+0x75/0x83
  [<ffffffff81362d31>] dev_close+0x1a/0x3f
  [<ffffffff81362e38>] rollback_registered_many+0xe2/0x21c
  [<ffffffff81362f88>] unregister_netdevice_many+0x16/0x6d
  [<ffffffff8136314d>] default_device_exit_batch+0xa7/0xbb
  [<ffffffff8135db06>] ops_exit_list+0x4e/0x56
  [<ffffffff8135e285>] cleanup_net+0xf5/0x195
  [<ffffffff8103e084>] process_one_work+0x25d/0x3e7
  [<ffffffff8103e027>] ? process_one_work+0x200/0x3e7
  [<ffffffff8135e190>] ? cleanup_net+0x0/0x195
  [<ffffffff8103e54a>] worker_thread+0x1b5/0x342
  [<ffffffff8103e395>] ? worker_thread+0x0/0x342
  [<ffffffff81041495>] kthread+0x7c/0x84
  [<ffffffff810034f4>] kernel_thread_helper+0x4/0x10
  [<ffffffff814389ba>] ? restore_args+0x0/0x30
  [<ffffffff81041419>] ? kthread+0x0/0x84
  [<ffffffff810034f0>] ? kernel_thread_helper+0x0/0x10
Code: 5c 41 5d 41 5e c9 c3 90 55 48 8b 04 25 40 a0 81 81 48 8b 80 18 02 
00 00 48 89 e5 8b 40 f0 c9 c3 48 8b 87 18 02 00 00 55 48 89 e5 <48> 8b 
40 f8 c9 c3 48 89 f0 c1 ee 06 55 89 f6 83 e0 3f 48 c1 e6
RIP  [<ffffffff810410bf>] kthread_data+0xb/0x11
  RSP <ffff88003f111868>
CR2: fffffffffffffff8
---[ end trace 05c41c2103816006 ]---



Thanks
   -- Daniel

> I added some printk's in ipip.c  ipip_exit_net()
> ...
>          rtnl_lock();
>          printk(KERN_ERR "ipip_exit_net(enter)\n");
>          ipip_destroy_tunnels(ipn,&list);
>          printk(KERN_ERR "ipip_exit_net(1)\n");
>          unregister_netdevice_queue(ipn->fb_tunnel_dev,&list);
>          printk(KERN_ERR "ipip_exit_net(2)\n");
>          unregister_netdevice_many(&list);
>          printk(KERN_ERR "ipip_exit_net(3)\n");
>          rtnl_unlock();
>          printk(KERN_ERR "ipip_exit_net(exit)\n");
>
>
> Exit steps:
> ===== Screen dump =====
>
>   # ifconfig eth0  0.0.0.0  down
>   # ifconfig eth1  0.0.0.0  down
>   # ifconfig lo  0.0.0.0  down
>   # ip li de eth0
>   # ip li de eth1
>   # ifconfig -a
> ip6tnl0   Link encap:UNSPEC  HWaddr 00-00-00-00-00-00-00-00-00-00-00-00-00-00-00-00
>            NOARP  MTU:1460  Metric:1
>            RX packets:0 errors:0 dropped:0 overruns:0 frame:0
>            TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
>            collisions:0 txqueuelen:0
>            RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)
>
> lo        Link encap:Local Loopback
>            inet addr:127.0.0.1  Mask:255.0.0.0
>            LOOPBACK  MTU:16436  Metric:1
>            RX packets:0 errors:0 dropped:0 overruns:0 frame:0
>            TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
>            collisions:0 txqueuelen:0
>            RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)
>
> sit0      Link encap:IPv6-in-IPv4
>            NOARP  MTU:1480  Metric:1
>            RX packets:0 errors:0 dropped:0 overruns:0 frame:0
>            TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
>            collisions:0 txqueuelen:0
>            RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)
>
> tunl0     Link encap:UNSPEC  HWaddr 00-00-00-00-00-00-00-00-00-00-00-00-00-00-00-00
>            NOARP  MTU:1480  Metric:1
>            RX packets:0 errors:0 dropped:0 overruns:0 frame:0
>            TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
>            collisions:0 txqueuelen:0
>            RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)
>
>   # ps
>    PID USER       VSZ STAT COMMAND
>      1 root     12412 S    /usr/lib64/lxc/lxc-init -- /var/bin/init
>      2 root      4540 S    /bin/ash /var/bin/init
>      7 root      6640 S    inetd
>      8 root      4544 S    /bin/ash
>     26 root      4544 R    ps
>   # lsmod
> Module                  Size  Used by    Not tainted
> macvlan                 8709  0
> pcnet32                29549  0
> tg3                   112093  0
> libphy                 21043  1 tg3
>   # kill 7 2
>   # ps
>    PID USER       VSZ STAT COMMAND
>      1 root     12412 S    /usr/lib64/lxc/lxc-init -- /var/bin/init
>      8 root      4544 S    /bin/ash
>     28 root      4544 R    ps
>   # exit  ( here is the exit from netns  )
>   # ipip_exit_net(enter)
> ipip_exit_net(1)
> ipip_exit_net(2)
> ------------[ cut here ]------------
> WARNING: at /home/hans/evip/kvm/net-next-2.6/kernel/sysctl.c:1953 unregister_sysctl_table+0xc7/0xf9()
> Hardware name: Bochs
> Modules linked in: macvlan pcnet32 tg3 libphy
> Pid: 5, comm: kworker/u:0 Not tainted 2.6.36-rc3+ #7
> Call Trace:
>   [<ffffffff8103e281>] warn_slowpath_common+0x85/0x9d
>   [<ffffffff8103e2b3>] warn_slowpath_null+0x1a/0x1c
>   [<ffffffff81045e64>] unregister_sysctl_table+0xc7/0xf9
>   [<ffffffff812c86a5>] neigh_sysctl_unregister+0x27/0x3f
>   [<ffffffff81342108>] addrconf_ifdown+0x415/0x45e
>   [<ffffffff81342b98>] addrconf_notify+0x756/0x7fe
>   [<ffffffff812cacfb>] ? neigh_ifdown+0xc3/0xd4
>   [<ffffffff813622b3>] ? ip6mr_device_event+0x8d/0x9e
>   [<ffffffff8105eddb>] notifier_call_chain+0x37/0x63
>   [<ffffffff8105ee8b>] raw_notifier_call_chain+0x14/0x16
>   [<ffffffff812c15c7>] call_netdevice_notifiers+0x4a/0x4f
>   [<ffffffff812c1c1b>] rollback_registered_many+0x121/0x208
>   [<ffffffff812c1d1d>] unregister_netdevice_many+0x1b/0x71
>   [<ffffffff81324209>] ipip_exit_net+0xea/0x11a
>   [<ffffffff812bc941>] ? cleanup_net+0x0/0x198
>   [<ffffffff812bc2cf>] ops_exit_list+0x2a/0x5b
>   [<ffffffff812bca39>] cleanup_net+0xf8/0x198
>   [<ffffffff810568c7>] process_one_work+0x2a2/0x44d
>   [<ffffffff81056e35>] worker_thread+0x1db/0x34e
>   [<ffffffff81056c5a>] ? worker_thread+0x0/0x34e
>   [<ffffffff8105a030>] kthread+0x82/0x8a
>   [<ffffffff81003954>] kernel_thread_helper+0x4/0x10
>   [<ffffffff81059fae>] ? kthread+0x0/0x8a
>   [<ffffffff81003950>] ? kernel_thread_helper+0x0/0x10
> ---[ end trace 939b5185219f32e7 ]---
> ipip_exit_net(3)
> ipip_exit_net(exit)
> unregister_netdevice: waiting for lo to become free. Usage count = 4
> unregister_netdevice: waiting for lo to become free. Usage count = 4
> unregister_netdevice: waiting for lo to become free. Usage count = 4
> ....
> ...
> ===== End of screen dump =====
>
> lxc conf file:
> # Container with network virtualized using the vlan device driver
> # Local eth0 uplink
> lxc.utsname = fee_0
> lxc.network.type = macvlan
> lxc.network.flags = up
> lxc.network.link = eth1
> lxc.network.hwaddr = 00:00:04:01:01:01
> lxc.network.ipv4 = 192.168.1.21/24
> lxc.network.ipv6 = 2003::2:1:1/96
> # local eth1 downlink - to the RS farm
> lxc.network.type = macvlan
> lxc.network.flags = up
> lxc.network.link = eth0
> lxc.network.hwaddr = 00:00:03:01:01:01
> lxc.network.ipv4 = 192.168.0.21/24
> lxc.network.ipv6 = 2003::1:1:1/96
> lxc.mount.entry = /var/lib/lxc/fee_0/var /var none rw,bind 0 0
>    

^ permalink raw reply

* Re: BUG ? ipip unregister_netdevice_many()
From: Hans Schillstrom @ 2010-10-08 11:53 UTC (permalink / raw)
  To: Daniel Lezcano; +Cc: netdev@vger.kernel.org, Eric W. Biederman
In-Reply-To: <4CAEFE2C.3010007@free.fr>

On Friday 08 October 2010 13:19:08 Daniel Lezcano wrote:
Hello
> On 10/07/2010 10:48 AM, Hans Schillstrom wrote:
> > Hello
> > I'm trying to exit a network name space and it doesn't work (or am I doing something wrong?)
> > The only netdevices left are lo and the tunnels ip6tnl0, sit0 and tunl0 when exiting netns.
> >
> > A netns is created by lxc-execute with two interfaces eth0 eth1 (macvlan)
> > (see conf file at the end)
> >
> > Kernel: net-next-2.6 top from 4 october 2010
> >
> 
> Hi Hans,
> 
> I tried to reproduce your problem but I just get a big kernel crash when
> exiting the container :/
> 
> The stack is different but it may be related to the same problem.
> 
> BUG: unable to handle kernel paging request at ffff88003ba453a0
> IP: [<ffffffff813020b6>] macvlan_stop+0x57/0x7d
> PGD 180b063 PUD 180f063 PMD 1ffdb067 PTE 3ba45160
> Oops: 0002 [#1] DEBUG_PAGEALLOC
> last sysfs file: /sys/devices/virtual/net/mc0PyXBA/type
> CPU 0
> Pid: 5, comm: kworker/u:0 Not tainted 2.6.36-rc7-next-20101007+ #11 /Bochs
> RIP: 0010:[<ffffffff813020b6>]  [<ffffffff813020b6>] macvlan_stop+0x57/0x7d
> RSP: 0018:ffff88003f111c30  EFLAGS: 00010246
> RAX: 0000000000000000 RBX: ffff88003bdd1e60 RCX: 000000000000c100
> RDX: ffff88003ba453a0 RSI: ffff88003f111d70 RDI: ffffffff810300e5
> RBP: ffff88003f111c50 R08: ffff88003f111d70 R09: 00000000000000cc
> R10: 0000000000000001 R11: ffff88003f111ba0 R12: ffff88003bdd1800
> R13: ffff880039fec800 R14: ffff88003f111d70 R15: ffff88003ba06830
> FS:  0000000000000000(0000) GS:ffffffff8181b000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> CR2: ffff88003ba453a0 CR3: 000000003c284000 CR4: 00000000000006f0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
> Process kworker/u:0 (pid: 5, threadinfo ffff88003f110000, task
> ffff88003f0f20a0)
> Stack:
>   ffffffff81657a10 ffff88003bdd1800 ffffffff81657a10 ffff88003bb23800
> <0> ffff88003f111c70 ffffffff81362d09 ffff88003bdd1800 ffff88003f111cf0
> <0> ffff88003f111c90 ffffffff81362d31 ffff88003ba067c0 ffff88003bdd1800
> Call Trace:
>   [<ffffffff81362d09>] __dev_close+0x75/0x83
>   [<ffffffff81362d31>] dev_close+0x1a/0x3f
>   [<ffffffff81362e38>] rollback_registered_many+0xe2/0x21c
>   [<ffffffff81362f88>] unregister_netdevice_many+0x16/0x6d
>   [<ffffffff8136314d>] default_device_exit_batch+0xa7/0xbb
>   [<ffffffff8135db06>] ops_exit_list+0x4e/0x56
>   [<ffffffff8135e285>] cleanup_net+0xf5/0x195
>   [<ffffffff8103e084>] process_one_work+0x25d/0x3e7
>   [<ffffffff8103e027>] ? process_one_work+0x200/0x3e7
>   [<ffffffff8135e190>] ? cleanup_net+0x0/0x195
>   [<ffffffff8103e54a>] worker_thread+0x1b5/0x342
>   [<ffffffff8103e395>] ? worker_thread+0x0/0x342
>   [<ffffffff81041495>] kthread+0x7c/0x84
>   [<ffffffff810034f4>] kernel_thread_helper+0x4/0x10
>   [<ffffffff814389ba>] ? restore_args+0x0/0x30
>   [<ffffffff81041419>] ? kthread+0x0/0x84
>   [<ffffffff810034f0>] ? kernel_thread_helper+0x0/0x10
> Code: 00 00 02 74 0b 83 ce ff 4c 89 ef e8 ab eb 05 00 49 8b b4 24 a0 02
> 00 00 4c 89 ef e8 b9 52 06 00 48 8b 43 18 48 8b 53 20 48 85 c0 <48> 89
> 02 74 04 48 89 50 08 48 be 00 02 20 00 00 00 ad de 48 89
> RIP  [<ffffffff813020b6>] macvlan_stop+0x57/0x7d
>   RSP <ffff88003f111c30>
> CR2: ffff88003ba453a0
> ---[ end trace 05c41c2103816005 ]---
> BUG: unable to handle kernel paging request at fffffffffffffff8
> IP: [<ffffffff810410bf>] kthread_data+0xb/0x11
> PGD 180c067 PUD 180d067 PMD 0
> Oops: 0000 [#2] DEBUG_PAGEALLOC
> last sysfs file: /sys/devices/virtual/net/mc0PyXBA/type
> CPU 0
> Pid: 5, comm: kworker/u:0 Tainted: G      D
> 2.6.36-rc7-next-20101007+ #11 /Bochs
> RIP: 0010:[<ffffffff810410bf>]  [<ffffffff810410bf>] kthread_data+0xb/0x11
> RSP: 0018:ffff88003f111868  EFLAGS: 00010096
> RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff88003f111fd8
> RDX: ffff88003f0f20a0 RSI: 0000000000000000 RDI: ffff88003f0f20a0
> RBP: ffff88003f111868 R08: 0000000000000002 R09: 0000000000000001
> R10: 0000000000000246 R11: 09f911029d74e35b R12: 0000000000000000
> R13: ffff88003f111948 R14: ffff88003f0c60a0 R15: ffff88003f0f2218
> FS:  0000000000000000(0000) GS:ffffffff8181b000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> CR2: fffffffffffffff8 CR3: 000000003cb19000 CR4: 00000000000006f0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
> Process kworker/u:0 (pid: 5, threadinfo ffff88003f110000, task
> ffff88003f0f20a0)
> Stack:
>   ffff88003f111888 ffffffff8103d4e4 ffff88003f111888 ffff88003f0f2310
> <0> ffff88003f110010 ffff88003f0f20a0 ffff88003f111fd8 ffff88003f111fd8
> Call Trace:
>   [<ffffffff8103d4e4>] wq_worker_sleeping+0x10/0x76
>   [<ffffffff81435ffe>] schedule+0xf4/0x405
>   [<ffffffff8102ebc4>] do_exit+0x647/0x660
>   [<ffffffff81005ba0>] oops_end+0xb3/0xbb
>   [<ffffffff8101c6b8>] no_context+0x1f5/0x204
>   [<ffffffff8101c854>] __bad_area_nosemaphore+0x18d/0x1b0
>   [<ffffffff8101c885>] bad_area_nosemaphore+0xe/0x10
>   [<ffffffff8101cb52>] do_page_fault+0x16b/0x34d
>   [<ffffffff81300a85>] ? ei_set_multicast_list+0x1f/0x3d
>   [<ffffffff81437f46>] ? trace_hardirqs_off_thunk+0x3a/0x3c
>   [<ffffffff81438b9f>] page_fault+0x1f/0x30
>   [<ffffffff810300e5>] ? local_bh_enable_ip+0xb7/0xbd
>   [<ffffffff813020b6>] ? macvlan_stop+0x57/0x7d
>   [<ffffffff81362d09>] __dev_close+0x75/0x83
>   [<ffffffff81362d31>] dev_close+0x1a/0x3f
>   [<ffffffff81362e38>] rollback_registered_many+0xe2/0x21c
>   [<ffffffff81362f88>] unregister_netdevice_many+0x16/0x6d
>   [<ffffffff8136314d>] default_device_exit_batch+0xa7/0xbb
>   [<ffffffff8135db06>] ops_exit_list+0x4e/0x56
>   [<ffffffff8135e285>] cleanup_net+0xf5/0x195
>   [<ffffffff8103e084>] process_one_work+0x25d/0x3e7
>   [<ffffffff8103e027>] ? process_one_work+0x200/0x3e7
>   [<ffffffff8135e190>] ? cleanup_net+0x0/0x195
>   [<ffffffff8103e54a>] worker_thread+0x1b5/0x342
>   [<ffffffff8103e395>] ? worker_thread+0x0/0x342
>   [<ffffffff81041495>] kthread+0x7c/0x84
>   [<ffffffff810034f4>] kernel_thread_helper+0x4/0x10
>   [<ffffffff814389ba>] ? restore_args+0x0/0x30
>   [<ffffffff81041419>] ? kthread+0x0/0x84
>   [<ffffffff810034f0>] ? kernel_thread_helper+0x0/0x10
> Code: 5c 41 5d 41 5e c9 c3 90 55 48 8b 04 25 40 a0 81 81 48 8b 80 18 02
> 00 00 48 89 e5 8b 40 f0 c9 c3 48 8b 87 18 02 00 00 55 48 89 e5 <48> 8b
> 40 f8 c9 c3 48 89 f0 c1 ee 06 55 89 f6 83 e0 3f 48 c1 e6
> RIP  [<ffffffff810410bf>] kthread_data+0xb/0x11
>   RSP <ffff88003f111868>
> CR2: fffffffffffffff8
> ---[ end trace 05c41c2103816006 ]---
> 
> 
> 
> Thanks
>    -- Daniel

I did the same setup without any tunnel modules loaded, and then it almost worked,
except for the freeing of the loopback interface :-(
"unregister_netdevice: waiting for lo to become free. Usage count = 4"

When adding a tunnel module (here ip6_tunnel) you'll have the crash

ex: 
/var/lib/lxc # ifconfig -a
ip6tnl0   Link encap:UNSPEC  HWaddr 00-00-00-00-00-00-00-00-00-00-00-00-00-00-00-00  
          NOARP  MTU:1460  Metric:1
          RX packets:0 errors:0 dropped:0 overruns:0 frame:0
          TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:0 
          RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)

lo        Link encap:Local Loopback  
          LOOPBACK  MTU:16436  Metric:1
          RX packets:0 errors:0 dropped:0 overruns:0 frame:0
          TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:0 
          RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)

 # ifconfig
 # ps
  PID USER       VSZ STAT COMMAND
    1 root     12412 S    /usr/lib64/lxc/lxc-init -- /var/bin/init
    2 root      4540 S    /bin/ash /var/bin/init
    6 root      4544 S    /bin/ash
   16 root      4544 R    ps
 # kill 2
 # ^D (exit of ns)
 # ------------[ cut here ]------------
WARNING: at /home/hans/evip/kvm/net-next-2.6/kernel/sysctl.c:1953 unregister_sysctl_table+0xc7/0xf9()
Hardware name: Bochs
Modules linked in: macvlan ip6_tunnel tunnel6 pcnet32 tg3 libphy
Pid: 5, comm: kworker/u:0 Not tainted 2.6.36-rc3 #2
Call Trace:
 [<ffffffff8103e281>] warn_slowpath_common+0x85/0x9d
 [<ffffffff8103e2b3>] warn_slowpath_null+0x1a/0x1c
 [<ffffffff81045e64>] unregister_sysctl_table+0xc7/0xf9
 [<ffffffff812c86a5>] neigh_sysctl_unregister+0x27/0x3f
 [<ffffffff81340c75>] addrconf_ifdown+0x415/0x45e
 [<ffffffff81341705>] addrconf_notify+0x756/0x7fe
 [<ffffffff812cacfb>] ? neigh_ifdown+0xc3/0xd4
 [<ffffffff81360eb3>] ? ip6mr_device_event+0x8d/0x9e
 [<ffffffff8105eddb>] notifier_call_chain+0x37/0x63
 [<ffffffff8105ee8b>] raw_notifier_call_chain+0x14/0x16
 [<ffffffff812c15c7>] call_netdevice_notifiers+0x4a/0x4f
 [<ffffffff812c1c1b>] rollback_registered_many+0x121/0x208
 [<ffffffff812c1d1d>] unregister_netdevice_many+0x1b/0x71
 [<ffffffffa0047244>] ip6_tnl_exit_net+0xa4/0xb8 [ip6_tunnel]
 [<ffffffff812bc941>] ? cleanup_net+0x0/0x198
 [<ffffffff812bc2cf>] ops_exit_list+0x2a/0x5b
 [<ffffffff812bca39>] cleanup_net+0xf8/0x198
 [<ffffffff810568c7>] process_one_work+0x2a2/0x44d
 [<ffffffff81056e35>] worker_thread+0x1db/0x34e
 [<ffffffff81056c5a>] ? worker_thread+0x0/0x34e
 [<ffffffff8105a030>] kthread+0x82/0x8a
 [<ffffffff81003954>] kernel_thread_helper+0x4/0x10
 [<ffffffff81059fae>] ? kthread+0x0/0x8a
 [<ffffffff81003950>] ? kernel_thread_helper+0x0/0x10
---[ end trace eb3bc950cf9a8748 ]---
unregister_netdevice: waiting for lo to become free. Usage count = 4
unregister_netdevice: waiting for lo to become free. Usage count = 4
unregister_netdevice: waiting for lo to become free. Usage count = 4

Regards 
Hans

> 
> > I added some printk's in ipip.c  ipip_exit_net()
> > ...
> >          rtnl_lock();
> >          printk(KERN_ERR "ipip_exit_net(enter)\n");
> >          ipip_destroy_tunnels(ipn,&list);
> >          printk(KERN_ERR "ipip_exit_net(1)\n");
> >          unregister_netdevice_queue(ipn->fb_tunnel_dev,&list);
> >          printk(KERN_ERR "ipip_exit_net(2)\n");
> >          unregister_netdevice_many(&list);
> >          printk(KERN_ERR "ipip_exit_net(3)\n");
> >          rtnl_unlock();
> >          printk(KERN_ERR "ipip_exit_net(exit)\n");
> >
> >
> > Exit steps:
> > ===== Screen dump =====
> >
> >   # ifconfig eth0  0.0.0.0  down
> >   # ifconfig eth1  0.0.0.0  down
> >   # ifconfig lo  0.0.0.0  down
> >   # ip li de eth0
> >   # ip li de eth1
> >   # ifconfig -a
> > ip6tnl0   Link encap:UNSPEC  HWaddr 00-00-00-00-00-00-00-00-00-00-00-00-00-00-00-00
> >            NOARP  MTU:1460  Metric:1
> >            RX packets:0 errors:0 dropped:0 overruns:0 frame:0
> >            TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
> >            collisions:0 txqueuelen:0
> >            RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)
> >
> > lo        Link encap:Local Loopback
> >            inet addr:127.0.0.1  Mask:255.0.0.0
> >            LOOPBACK  MTU:16436  Metric:1
> >            RX packets:0 errors:0 dropped:0 overruns:0 frame:0
> >            TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
> >            collisions:0 txqueuelen:0
> >            RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)
> >
> > sit0      Link encap:IPv6-in-IPv4
> >            NOARP  MTU:1480  Metric:1
> >            RX packets:0 errors:0 dropped:0 overruns:0 frame:0
> >            TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
> >            collisions:0 txqueuelen:0
> >            RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)
> >
> > tunl0     Link encap:UNSPEC  HWaddr 00-00-00-00-00-00-00-00-00-00-00-00-00-00-00-00
> >            NOARP  MTU:1480  Metric:1
> >            RX packets:0 errors:0 dropped:0 overruns:0 frame:0
> >            TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
> >            collisions:0 txqueuelen:0
> >            RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)
> >
> >   # ps
> >    PID USER       VSZ STAT COMMAND
> >      1 root     12412 S    /usr/lib64/lxc/lxc-init -- /var/bin/init
> >      2 root      4540 S    /bin/ash /var/bin/init
> >      7 root      6640 S    inetd
> >      8 root      4544 S    /bin/ash
> >     26 root      4544 R    ps
> >   # lsmod
> > Module                  Size  Used by    Not tainted
> > macvlan                 8709  0
> > pcnet32                29549  0
> > tg3                   112093  0
> > libphy                 21043  1 tg3
> >   # kill 7 2
> >   # ps
> >    PID USER       VSZ STAT COMMAND
> >      1 root     12412 S    /usr/lib64/lxc/lxc-init -- /var/bin/init
> >      8 root      4544 S    /bin/ash
> >     28 root      4544 R    ps
> >   # exit  ( here is the exit from netns  )
> >   # ipip_exit_net(enter)
> > ipip_exit_net(1)
> > ipip_exit_net(2)
> > ------------[ cut here ]------------
> > WARNING: at /home/hans/evip/kvm/net-next-2.6/kernel/sysctl.c:1953 unregister_sysctl_table+0xc7/0xf9()
> > Hardware name: Bochs
> > Modules linked in: macvlan pcnet32 tg3 libphy
> > Pid: 5, comm: kworker/u:0 Not tainted 2.6.36-rc3+ #7
> > Call Trace:
> >   [<ffffffff8103e281>] warn_slowpath_common+0x85/0x9d
> >   [<ffffffff8103e2b3>] warn_slowpath_null+0x1a/0x1c
> >   [<ffffffff81045e64>] unregister_sysctl_table+0xc7/0xf9
> >   [<ffffffff812c86a5>] neigh_sysctl_unregister+0x27/0x3f
> >   [<ffffffff81342108>] addrconf_ifdown+0x415/0x45e
> >   [<ffffffff81342b98>] addrconf_notify+0x756/0x7fe
> >   [<ffffffff812cacfb>] ? neigh_ifdown+0xc3/0xd4
> >   [<ffffffff813622b3>] ? ip6mr_device_event+0x8d/0x9e
> >   [<ffffffff8105eddb>] notifier_call_chain+0x37/0x63
> >   [<ffffffff8105ee8b>] raw_notifier_call_chain+0x14/0x16
> >   [<ffffffff812c15c7>] call_netdevice_notifiers+0x4a/0x4f
> >   [<ffffffff812c1c1b>] rollback_registered_many+0x121/0x208
> >   [<ffffffff812c1d1d>] unregister_netdevice_many+0x1b/0x71
> >   [<ffffffff81324209>] ipip_exit_net+0xea/0x11a
> >   [<ffffffff812bc941>] ? cleanup_net+0x0/0x198
> >   [<ffffffff812bc2cf>] ops_exit_list+0x2a/0x5b
> >   [<ffffffff812bca39>] cleanup_net+0xf8/0x198
> >   [<ffffffff810568c7>] process_one_work+0x2a2/0x44d
> >   [<ffffffff81056e35>] worker_thread+0x1db/0x34e
> >   [<ffffffff81056c5a>] ? worker_thread+0x0/0x34e
> >   [<ffffffff8105a030>] kthread+0x82/0x8a
> >   [<ffffffff81003954>] kernel_thread_helper+0x4/0x10
> >   [<ffffffff81059fae>] ? kthread+0x0/0x8a
> >   [<ffffffff81003950>] ? kernel_thread_helper+0x0/0x10
> > ---[ end trace 939b5185219f32e7 ]---
> > ipip_exit_net(3)
> > ipip_exit_net(exit)
> > unregister_netdevice: waiting for lo to become free. Usage count = 4
> > unregister_netdevice: waiting for lo to become free. Usage count = 4
> > unregister_netdevice: waiting for lo to become free. Usage count = 4
> > ....
> > ...
> > ===== End of screen dump =====
> >
> > lxc conf file:
> > # Container with network virtualized using the vlan device driver
> > # Local eth0 uplink
> > lxc.utsname = fee_0
> > lxc.network.type = macvlan
> > lxc.network.flags = up
> > lxc.network.link = eth1
> > lxc.network.hwaddr = 00:00:04:01:01:01
> > lxc.network.ipv4 = 192.168.1.21/24
> > lxc.network.ipv6 = 2003::2:1:1/96
> > # local eth1 downlink - to the RS farm
> > lxc.network.type = macvlan
> > lxc.network.flags = up
> > lxc.network.link = eth0
> > lxc.network.hwaddr = 00:00:03:01:01:01
> > lxc.network.ipv4 = 192.168.0.21/24
> > lxc.network.ipv6 = 2003::1:1:1/96
> > lxc.mount.entry = /var/lib/lxc/fee_0/var /var none rw,bind 0 0
> >
> 

-- 
Regards
Hans Schillstrom <hans.schillstrom@ericsson.com>

^ permalink raw reply

* Re: BUG ? ipip unregister_netdevice_many()
From: Hans Schillstrom @ 2010-10-08 12:28 UTC (permalink / raw)
  To: Daniel Lezcano, Eric W. Biederman; +Cc: netdev@vger.kernel.org
In-Reply-To: <201010081353.28056.hans.schillstrom@ericsson.com>

Hi Eric,
Any advice on how to track this down?
This rollback_registered_many() seems to have been on the lists before...
All IPv4 and IPv6 tunnels cause this crash; all you have to do is load the tunnel module(s),
enter a new ns and exit from it.

I have not tested any devices other than tunnels;
I did an "ip link delete" on my macvlans before exiting the ns.


snip
>  # ------------[ cut here ]------------
> WARNING: at /home/hans/evip/kvm/net-next-2.6/kernel/sysctl.c:1953 unregister_sysctl_table+0xc7/0xf9()
> Hardware name: Bochs
> Modules linked in: macvlan ip6_tunnel tunnel6 pcnet32 tg3 libphy
> Pid: 5, comm: kworker/u:0 Not tainted 2.6.36-rc3 #2
> Call Trace:
>  [<ffffffff8103e281>] warn_slowpath_common+0x85/0x9d
>  [<ffffffff8103e2b3>] warn_slowpath_null+0x1a/0x1c
>  [<ffffffff81045e64>] unregister_sysctl_table+0xc7/0xf9
>  [<ffffffff812c86a5>] neigh_sysctl_unregister+0x27/0x3f
>  [<ffffffff81340c75>] addrconf_ifdown+0x415/0x45e
>  [<ffffffff81341705>] addrconf_notify+0x756/0x7fe
>  [<ffffffff812cacfb>] ? neigh_ifdown+0xc3/0xd4
>  [<ffffffff81360eb3>] ? ip6mr_device_event+0x8d/0x9e
>  [<ffffffff8105eddb>] notifier_call_chain+0x37/0x63
>  [<ffffffff8105ee8b>] raw_notifier_call_chain+0x14/0x16
>  [<ffffffff812c15c7>] call_netdevice_notifiers+0x4a/0x4f
>  [<ffffffff812c1c1b>] rollback_registered_many+0x121/0x208
>  [<ffffffff812c1d1d>] unregister_netdevice_many+0x1b/0x71
>  [<ffffffffa0047244>] ip6_tnl_exit_net+0xa4/0xb8 [ip6_tunnel]
>  [<ffffffff812bc941>] ? cleanup_net+0x0/0x198
>  [<ffffffff812bc2cf>] ops_exit_list+0x2a/0x5b
>  [<ffffffff812bca39>] cleanup_net+0xf8/0x198
>  [<ffffffff810568c7>] process_one_work+0x2a2/0x44d
>  [<ffffffff81056e35>] worker_thread+0x1db/0x34e
>  [<ffffffff81056c5a>] ? worker_thread+0x0/0x34e
>  [<ffffffff8105a030>] kthread+0x82/0x8a
>  [<ffffffff81003954>] kernel_thread_helper+0x4/0x10
>  [<ffffffff81059fae>] ? kthread+0x0/0x8a
>  [<ffffffff81003950>] ? kernel_thread_helper+0x0/0x10
> ---[ end trace eb3bc950cf9a8748 ]---
> unregister_netdevice: waiting for lo to become free. Usage count = 4
> unregister_netdevice: waiting for lo to become free. Usage count = 4
> unregister_netdevice: waiting for lo to become free. Usage count = 4


-- 
Regards
Hans Schillstrom <hans.schillstrom@ericsson.com>

^ permalink raw reply

* [PATCH 1/4] Phonet: add to MAINTAINERS and add myself
From: Rémi Denis-Courmont @ 2010-10-08 14:02 UTC (permalink / raw)
  To: netdev; +Cc: Rémi Denis-Courmont

From: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>

Signed-off-by: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
---
 MAINTAINERS |    8 ++++++++
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 9ddb5ac..1fd58ef 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4541,6 +4541,14 @@ L:	linux-abi-devel@lists.sourceforge.net
 S:	Maintained
 F:	include/linux/personality.h
 
+PHONET PROTOCOL
+M:	Remi Denis-Courmont <remi.denis-courmont@nokia.com>
+S:	Supported
+F:	Documentation/networking/phonet.txt
+F:	include/linux/phonet.h
+F:	include/net/phonet/
+F:	net/phonet/
+
 PHRAM MTD DRIVER
 M:	Joern Engel <joern@lazybastard.org>
 L:	linux-mtd@lists.infradead.org
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 2/4] Phonet: advise against enabling the pipe controller
From: Rémi Denis-Courmont @ 2010-10-08 14:02 UTC (permalink / raw)
  To: netdev; +Cc: Rémi Denis-Courmont
In-Reply-To: <1286546523-3340-1-git-send-email-remi@remlab.net>

From: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>

As it currently stands, the new code path is not compatible with existing
Nokia modems. Enabling it would break existing userspace for Nokia modems,
such as the existing oFono ISI driver.

Signed-off-by: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
---
 net/phonet/Kconfig |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/net/phonet/Kconfig b/net/phonet/Kconfig
index 901956a..a4fceb8 100644
--- a/net/phonet/Kconfig
+++ b/net/phonet/Kconfig
@@ -24,4 +24,5 @@ config PHONET_PIPECTRLR
 	  data with Nokia Slim modems like WG2.5 used on ST-Ericsson U8500
 	  platform.
 
-	  If unsure, say N.
+	  This option is incompatible with older Nokia modems.
+	  Say N here unless you really know what you are doing.
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 3/4] Phonet: cleanup pipe enable socket option
From: Rémi Denis-Courmont @ 2010-10-08 14:02 UTC (permalink / raw)
  To: netdev; +Cc: Rémi Denis-Courmont, Kumar Sanghvi
In-Reply-To: <1286546523-3340-2-git-send-email-remi@remlab.net>

From: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>

The current code works like this:

  int garbage, status;
  socklen_t len = sizeof(status);

  /* enable pipe */
  setsockopt(fd, SOL_PNPIPE, PNPIPE_ENABLE, &garbage, sizeof(garbage));
  /* disable pipe */
  setsockopt(fd, SOL_PNPIPE, PNPIPE_DISABLE, &garbage, sizeof(garbage));
  /* get status */
  getsockopt(fd, SOL_PNPIPE, PNPIPE_INQ, &status, &len);

...which does not follow the usual socket option pattern. This patch
merges all three "options" into a single gettable&settable option,
before Linux 2.6.37 gets out:

  int status;
  socklen_t len = sizeof(status);

  /* enable pipe */
  status = 1;
  setsockopt(fd, SOL_PNPIPE, PNPIPE_ENABLE, &status, sizeof(status));
  /* disable pipe */
  status = 0;
  setsockopt(fd, SOL_PNPIPE, PNPIPE_ENABLE, &status, sizeof(status));
  /* get status */
  getsockopt(fd, SOL_PNPIPE, PNPIPE_ENABLE, &status, &len);

This also fixes the error code from EFAULT to ENOTCONN.

Signed-off-by: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
Cc: Kumar Sanghvi <kumar.sanghvi@stericsson.com>
---
 Documentation/networking/phonet.txt |   15 +------
 include/linux/phonet.h              |    3 +-
 net/phonet/pep.c                    |   72 ++++++++++++++--------------------
 3 files changed, 34 insertions(+), 56 deletions(-)

diff --git a/Documentation/networking/phonet.txt b/Documentation/networking/phonet.txt
index cccf5ff..2d9bc2b 100644
--- a/Documentation/networking/phonet.txt
+++ b/Documentation/networking/phonet.txt
@@ -213,12 +213,9 @@ The implementation adds socket options at SOL_PNPIPE level:
 	It then updates the pipe state associated with the sequenced socket to
 	be PIPE_DISABLED.
 
-  PNPIPE_ENABLE
-	It follows the same sequence as above for enabling a pipe by sending
-	PNS_PEP_ENABLE_REQ initially and then sending PNS_PEP_ENABLED_IND after
-	getting responses from sequenced socket and remote-pep.
-	It will also update the pipe state associated with the sequenced socket
-	to PIPE_ENABLED.
+  PNPIPE_ENABLE accepts one integer value (int). If set to zero, the pipe
+    is disabled. If the value is non-zero, the pipe is enabled. If the pipe
+    is not (yet) connected, the ENOTCONN error is returned.
 
    PNPIPE_DESTROY
 	This will send out PNS_PEP_DISCONNECT_REQ on the sequenced socket and
@@ -226,12 +223,6 @@ The implementation adds socket options at SOL_PNPIPE level:
 	It will also update the pipe state associated with the sequenced socket
 	to PIPE_IDLE
 
-   PNPIPE_INQ
-	This getsocktopt allows the user-space running on the sequenced socket
-	to examine the pipe state associated with that socket ie. whether the
-	pipe is created (PIPE_DISABLED) or enabled (PIPE_ENABLED) or disabled
-	(PIPE_DISABLED) or no pipe exists (PIPE_IDLE).
-
 After a pipe has been created and enabled successfully, the Pipe data can be
 exchanged between the host-pep and remote-pep (modem).
 
diff --git a/include/linux/phonet.h b/include/linux/phonet.h
index 96f5625..e27cbf9 100644
--- a/include/linux/phonet.h
+++ b/include/linux/phonet.h
@@ -38,9 +38,8 @@
 #define PNPIPE_IFINDEX		2
 #define PNPIPE_CREATE           3
 #define PNPIPE_ENABLE           4
-#define PNPIPE_DISABLE          5
+/* unused slot */
 #define PNPIPE_DESTROY          6
-#define PNPIPE_INQ              7
 
 #define PNADDR_ANY		0
 #define PNADDR_BROADCAST	0xFC
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index aa3d870..f818f76 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -327,29 +327,20 @@ static int pipe_handler_send_ind(struct sock *sk, u16 dobj, u8 utid,
 	return pn_skb_send(sk, skb, &spn);
 }
 
-static int pipe_handler_enable_pipe(struct sock *sk, int cmd)
+static int pipe_handler_enable_pipe(struct sock *sk, int enable)
 {
-	int ret;
 	struct pep_sock *pn = pep_sk(sk);
-
-	switch (cmd) {
-	case PNPIPE_ENABLE:
-		ret = pipe_handler_send_req(sk, pn->pn_sk.sobject,
-				PNS_PIPE_ENABLE_UTID, PNS_PEP_ENABLE_REQ,
-				pn->pipe_handle, GFP_ATOMIC);
-		break;
-
-	case PNPIPE_DISABLE:
-		ret = pipe_handler_send_req(sk, pn->pn_sk.sobject,
-				PNS_PIPE_DISABLE_UTID, PNS_PEP_DISABLE_REQ,
-				pn->pipe_handle, GFP_ATOMIC);
-		break;
-
-	default:
-		ret = -EINVAL;
+	int utid, req;
+
+	if (enable) {
+		utid = PNS_PIPE_ENABLE_UTID;
+		req = PNS_PEP_ENABLE_REQ;
+	} else {
+		utid = PNS_PIPE_DISABLE_UTID;
+		req = PNS_PEP_DISABLE_REQ;
 	}
-
-	return ret;
+	return pipe_handler_send_req(sk, pn->pn_sk.sobject, utid, req,
+			pn->pipe_handle, GFP_ATOMIC);
 }
 
 static int pipe_handler_create_pipe(struct sock *sk, int pipe_handle, int cmd)
@@ -1187,23 +1178,6 @@ static int pep_setsockopt(struct sock *sk, int level, int optname,
 			break;
 		}
 
-	case PNPIPE_ENABLE:
-		if (pn->pipe_state != PIPE_DISABLED) {
-			err = -EFAULT;
-			break;
-		}
-		err = pipe_handler_enable_pipe(sk, PNPIPE_ENABLE);
-		break;
-
-	case PNPIPE_DISABLE:
-		if (pn->pipe_state != PIPE_ENABLED) {
-			err = -EFAULT;
-			break;
-		}
-
-		err = pipe_handler_enable_pipe(sk, PNPIPE_DISABLE);
-		break;
-
 	case PNPIPE_DESTROY:
 		if (pn->pipe_state < PIPE_DISABLED) {
 			err = -EFAULT;
@@ -1239,6 +1213,17 @@ static int pep_setsockopt(struct sock *sk, int level, int optname,
 			err = 0;
 		}
 		goto out_norel;
+
+#ifdef CONFIG_PHONET_PIPECTRLR
+	case PNPIPE_ENABLE:
+		if (pn->pipe_state <= PIPE_IDLE) {
+			err = -ENOTCONN;
+			break;
+		}
+		err = pipe_handler_enable_pipe(sk, val);
+		break;
+#endif
+
 	default:
 		err = -ENOPROTOOPT;
 	}
@@ -1264,15 +1249,18 @@ static int pep_getsockopt(struct sock *sk, int level, int optname,
 		val = pn->ifindex ? PNPIPE_ENCAP_IP : PNPIPE_ENCAP_NONE;
 		break;
 
+	case PNPIPE_IFINDEX:
+		val = pn->ifindex;
+		break;
+
 #ifdef CONFIG_PHONET_PIPECTRLR
-	case PNPIPE_INQ:
-		val = pn->pipe_state;
+	case PNPIPE_ENABLE:
+		if (pn->pipe_state <= PIPE_IDLE)
+			return -ENOTCONN;
+		val = pn->pipe_state != PIPE_DISABLED;
 		break;
 #endif
 
-	case PNPIPE_IFINDEX:
-		val = pn->ifindex;
-		break;
 	default:
 		return -ENOPROTOOPT;
 	}
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 4/4] Phonet: mark the pipe controller as EXPERIMENTAL
From: Rémi Denis-Courmont @ 2010-10-08 14:02 UTC (permalink / raw)
  To: netdev; +Cc: Rémi Denis-Courmont
In-Reply-To: <1286546523-3340-3-git-send-email-remi@remlab.net>

From: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>

There are a bunch of issues that need to be fixed, including:
 - GFP_KERNEL allocations from atomic context
   (and GFP_ATOMIC in process context),
 - abuse of the setsockopt() call convention,
 - unprotected/unlocked static variables...

IMHO, we will need to alter the userspace ABI when we fix it. So mark
the configuration option as EXPERIMENTAL for the time being (or should
it be BROKEN instead?).

Signed-off-by: Rémi Denis-Courmont <remi.denis-courmont@nokia.com>
---
 net/phonet/Kconfig |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/phonet/Kconfig b/net/phonet/Kconfig
index a4fceb8..0d9b8a2 100644
--- a/net/phonet/Kconfig
+++ b/net/phonet/Kconfig
@@ -16,8 +16,8 @@ config PHONET
 	  will be called phonet. If unsure, say N.
 
 config PHONET_PIPECTRLR
-	bool "Phonet Pipe Controller"
-	depends on PHONET
+	bool "Phonet Pipe Controller (EXPERIMENTAL)"
+	depends on PHONET && EXPERIMENTAL
 	default N
 	help
 	  The Pipe Controller implementation in Phonet stack to support Pipe
-- 
1.7.0.4


^ permalink raw reply related

* Re: [PATCH] ehea: Fix a checksum issue on the receive path
From: Breno Leitao @ 2010-10-08 14:14 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: davem, netdev, Jay Vosburgh
In-Reply-To: <1286513130.6536.467.camel@edumazet-laptop>

Hi Eric

On 10/08/2010 01:45 AM, Eric Dumazet wrote:
> Just to be clear : packets with wrong checksums are not given to upper
> stack, so a tcpdump can not display them ? I am not sure many drivers do
> that.
Well, what my code does is: 1) if the current packet is UDP/TCP, then
the checksum is not necessary, since we already check the checksum in
ehea_proc_rwqes(), specifically at this part of the code:

                if (!ehea_check_cqe(cqe, &rq)) {
			// Send the packet to the up layers

And ehea_check_cqe() checks for packets with wrong checksums at:
	
         if ((cqe->status & EHEA_CQE_STAT_ERR_MASK) == 0)
                 return 0;


Bottom line: TCP/UDP packets with wrong checksums are dropped by
ehea_proc_rwqes(); others go to the upper layer.

So, back to your question: are you saying that we shouldn't do that,
meaning that we should send all packets to the upper layers, even those
that have a wrong checksum?

Thanks
Breno

^ permalink raw reply

* [PATCH 1/2] r8169: allocate with GFP_KERNEL flag when able to sleep
From: Stanislaw Gruszka @ 2010-10-08 14:25 UTC (permalink / raw)
  To: Francois Romieu, netdev; +Cc: Stanislaw Gruszka

We have a Fedora bug report where the driver fails to initialize after
suspend/resume because of memory allocation errors:
https://bugzilla.redhat.com/show_bug.cgi?id=629158

To fix this, use GFP_KERNEL allocations where possible.

Tested-by: Neal Becker <ndbecker2@gmail.com>
Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
---
 drivers/net/r8169.c |   12 ++++++------
 1 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index fe3b762..a7fb044 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -4006,7 +4006,7 @@ static inline void rtl8169_map_to_asic(struct RxDesc *desc, dma_addr_t mapping,
 static struct sk_buff *rtl8169_alloc_rx_skb(struct pci_dev *pdev,
 					    struct net_device *dev,
 					    struct RxDesc *desc, int rx_buf_sz,
-					    unsigned int align)
+					    unsigned int align, gfp_t gfp)
 {
 	struct sk_buff *skb;
 	dma_addr_t mapping;
@@ -4014,7 +4014,7 @@ static struct sk_buff *rtl8169_alloc_rx_skb(struct pci_dev *pdev,
 
 	pad = align ? align : NET_IP_ALIGN;
 
-	skb = netdev_alloc_skb(dev, rx_buf_sz + pad);
+	skb = __netdev_alloc_skb(dev, rx_buf_sz + pad, gfp);
 	if (!skb)
 		goto err_out;
 
@@ -4045,7 +4045,7 @@ static void rtl8169_rx_clear(struct rtl8169_private *tp)
 }
 
 static u32 rtl8169_rx_fill(struct rtl8169_private *tp, struct net_device *dev,
-			   u32 start, u32 end)
+			   u32 start, u32 end, gfp_t gfp)
 {
 	u32 cur;
 
@@ -4060,7 +4060,7 @@ static u32 rtl8169_rx_fill(struct rtl8169_private *tp, struct net_device *dev,
 
 		skb = rtl8169_alloc_rx_skb(tp->pci_dev, dev,
 					   tp->RxDescArray + i,
-					   tp->rx_buf_sz, tp->align);
+					   tp->rx_buf_sz, tp->align, gfp);
 		if (!skb)
 			break;
 
@@ -4088,7 +4088,7 @@ static int rtl8169_init_ring(struct net_device *dev)
 	memset(tp->tx_skb, 0x0, NUM_TX_DESC * sizeof(struct ring_info));
 	memset(tp->Rx_skbuff, 0x0, NUM_RX_DESC * sizeof(struct sk_buff *));
 
-	if (rtl8169_rx_fill(tp, dev, 0, NUM_RX_DESC) != NUM_RX_DESC)
+	if (rtl8169_rx_fill(tp, dev, 0, NUM_RX_DESC, GFP_KERNEL) != NUM_RX_DESC)
 		goto err_out;
 
 	rtl8169_mark_as_last_descriptor(tp->RxDescArray + NUM_RX_DESC - 1);
@@ -4587,7 +4587,7 @@ static int rtl8169_rx_interrupt(struct net_device *dev,
 	count = cur_rx - tp->cur_rx;
 	tp->cur_rx = cur_rx;
 
-	delta = rtl8169_rx_fill(tp, dev, tp->dirty_rx, tp->cur_rx);
+	delta = rtl8169_rx_fill(tp, dev, tp->dirty_rx, tp->cur_rx, GFP_ATOMIC);
 	if (!delta && count)
 		netif_info(tp, intr, dev, "no Rx buffer allocated\n");
 	tp->dirty_rx += delta;
-- 
1.7.1


^ permalink raw reply related

* [PATCH 2/2] r8169: use device model DMA API
From: Stanislaw Gruszka @ 2010-10-08 14:25 UTC (permalink / raw)
  To: Francois Romieu, netdev; +Cc: Stanislaw Gruszka
In-Reply-To: <1286547901-10782-1-git-send-email-sgruszka@redhat.com>

Use the DMA API, as the PCI equivalents will be deprecated. This change
also allows allocating with GFP_KERNEL where possible.

Tested-by: Neal Becker <ndbecker2@gmail.com>
Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
---
 drivers/net/r8169.c |   53 +++++++++++++++++++++++++++-----------------------
 1 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index a7fb044..bc669a4 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -1217,7 +1217,8 @@ static void rtl8169_update_counters(struct net_device *dev)
 	if ((RTL_R8(ChipCmd) & CmdRxEnb) == 0)
 		return;
 
-	counters = pci_alloc_consistent(tp->pci_dev, sizeof(*counters), &paddr);
+	counters = dma_alloc_coherent(&tp->pci_dev->dev, sizeof(*counters),
+				      &paddr, GFP_KERNEL);
 	if (!counters)
 		return;
 
@@ -1238,7 +1239,8 @@ static void rtl8169_update_counters(struct net_device *dev)
 	RTL_W32(CounterAddrLow, 0);
 	RTL_W32(CounterAddrHigh, 0);
 
-	pci_free_consistent(tp->pci_dev, sizeof(*counters), counters, paddr);
+	dma_free_coherent(&tp->pci_dev->dev, sizeof(*counters), counters,
+			  paddr);
 }
 
 static void rtl8169_get_ethtool_stats(struct net_device *dev,
@@ -3298,15 +3300,15 @@ static int rtl8169_open(struct net_device *dev)
 
 	/*
 	 * Rx and Tx desscriptors needs 256 bytes alignment.
-	 * pci_alloc_consistent provides more.
+	 * dma_alloc_coherent provides more.
 	 */
-	tp->TxDescArray = pci_alloc_consistent(pdev, R8169_TX_RING_BYTES,
-					       &tp->TxPhyAddr);
+	tp->TxDescArray = dma_alloc_coherent(&pdev->dev, R8169_TX_RING_BYTES,
+					     &tp->TxPhyAddr, GFP_KERNEL);
 	if (!tp->TxDescArray)
 		goto err_pm_runtime_put;
 
-	tp->RxDescArray = pci_alloc_consistent(pdev, R8169_RX_RING_BYTES,
-					       &tp->RxPhyAddr);
+	tp->RxDescArray = dma_alloc_coherent(&pdev->dev, R8169_RX_RING_BYTES,
+					     &tp->RxPhyAddr, GFP_KERNEL);
 	if (!tp->RxDescArray)
 		goto err_free_tx_0;
 
@@ -3340,12 +3342,12 @@ out:
 err_release_ring_2:
 	rtl8169_rx_clear(tp);
 err_free_rx_1:
-	pci_free_consistent(pdev, R8169_RX_RING_BYTES, tp->RxDescArray,
-			    tp->RxPhyAddr);
+	dma_free_coherent(&pdev->dev, R8169_RX_RING_BYTES, tp->RxDescArray,
+			  tp->RxPhyAddr);
 	tp->RxDescArray = NULL;
 err_free_tx_0:
-	pci_free_consistent(pdev, R8169_TX_RING_BYTES, tp->TxDescArray,
-			    tp->TxPhyAddr);
+	dma_free_coherent(&pdev->dev, R8169_TX_RING_BYTES, tp->TxDescArray,
+			  tp->TxPhyAddr);
 	tp->TxDescArray = NULL;
 err_pm_runtime_put:
 	pm_runtime_put_noidle(&pdev->dev);
@@ -3981,7 +3983,7 @@ static void rtl8169_free_rx_skb(struct rtl8169_private *tp,
 {
 	struct pci_dev *pdev = tp->pci_dev;
 
-	pci_unmap_single(pdev, le64_to_cpu(desc->addr), tp->rx_buf_sz,
+	dma_unmap_single(&pdev->dev, le64_to_cpu(desc->addr), tp->rx_buf_sz,
 			 PCI_DMA_FROMDEVICE);
 	dev_kfree_skb(*sk_buff);
 	*sk_buff = NULL;
@@ -4020,7 +4022,7 @@ static struct sk_buff *rtl8169_alloc_rx_skb(struct pci_dev *pdev,
 
 	skb_reserve(skb, align ? ((pad - 1) & (unsigned long)skb->data) : pad);
 
-	mapping = pci_map_single(pdev, skb->data, rx_buf_sz,
+	mapping = dma_map_single(&pdev->dev, skb->data, rx_buf_sz,
 				 PCI_DMA_FROMDEVICE);
 
 	rtl8169_map_to_asic(desc, mapping, rx_buf_sz);
@@ -4105,7 +4107,8 @@ static void rtl8169_unmap_tx_skb(struct pci_dev *pdev, struct ring_info *tx_skb,
 {
 	unsigned int len = tx_skb->len;
 
-	pci_unmap_single(pdev, le64_to_cpu(desc->addr), len, PCI_DMA_TODEVICE);
+	dma_unmap_single(&pdev->dev, le64_to_cpu(desc->addr), len,
+			 PCI_DMA_TODEVICE);
 	desc->opts1 = 0x00;
 	desc->opts2 = 0x00;
 	desc->addr = 0x00;
@@ -4249,7 +4252,8 @@ static int rtl8169_xmit_frags(struct rtl8169_private *tp, struct sk_buff *skb,
 		txd = tp->TxDescArray + entry;
 		len = frag->size;
 		addr = ((void *) page_address(frag->page)) + frag->page_offset;
-		mapping = pci_map_single(tp->pci_dev, addr, len, PCI_DMA_TODEVICE);
+		mapping = dma_map_single(&tp->pci_dev->dev, addr, len,
+					 PCI_DMA_TODEVICE);
 
 		/* anti gcc 2.95.3 bugware (sic) */
 		status = opts1 | len | (RingEnd * !((entry + 1) % NUM_TX_DESC));
@@ -4319,7 +4323,8 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb,
 		tp->tx_skb[entry].skb = skb;
 	}
 
-	mapping = pci_map_single(tp->pci_dev, skb->data, len, PCI_DMA_TODEVICE);
+	mapping = dma_map_single(&tp->pci_dev->dev, skb->data, len,
+				 PCI_DMA_TODEVICE);
 
 	tp->tx_skb[entry].len = len;
 	txd->addr = cpu_to_le64(mapping);
@@ -4482,8 +4487,8 @@ static inline bool rtl8169_try_rx_copy(struct sk_buff **sk_buff,
 	if (!skb)
 		goto out;
 
-	pci_dma_sync_single_for_cpu(tp->pci_dev, addr, pkt_size,
-				    PCI_DMA_FROMDEVICE);
+	dma_sync_single_for_cpu(&tp->pci_dev->dev, addr, pkt_size,
+				PCI_DMA_FROMDEVICE);
 	skb_copy_from_linear_data(*sk_buff, skb->data, pkt_size);
 	*sk_buff = skb;
 	done = true;
@@ -4552,11 +4557,11 @@ static int rtl8169_rx_interrupt(struct net_device *dev,
 			}
 
 			if (rtl8169_try_rx_copy(&skb, tp, pkt_size, addr)) {
-				pci_dma_sync_single_for_device(pdev, addr,
+				dma_sync_single_for_device(&pdev->dev, addr,
 					pkt_size, PCI_DMA_FROMDEVICE);
 				rtl8169_mark_to_asic(desc, tp->rx_buf_sz);
 			} else {
-				pci_unmap_single(pdev, addr, tp->rx_buf_sz,
+				dma_unmap_single(&pdev->dev, addr, tp->rx_buf_sz,
 						 PCI_DMA_FROMDEVICE);
 				tp->Rx_skbuff[entry] = NULL;
 			}
@@ -4773,10 +4778,10 @@ static int rtl8169_close(struct net_device *dev)
 
 	free_irq(dev->irq, dev);
 
-	pci_free_consistent(pdev, R8169_RX_RING_BYTES, tp->RxDescArray,
-			    tp->RxPhyAddr);
-	pci_free_consistent(pdev, R8169_TX_RING_BYTES, tp->TxDescArray,
-			    tp->TxPhyAddr);
+	dma_free_coherent(&pdev->dev, R8169_RX_RING_BYTES, tp->RxDescArray,
+			  tp->RxPhyAddr);
+	dma_free_coherent(&pdev->dev, R8169_TX_RING_BYTES, tp->TxDescArray,
+			  tp->TxPhyAddr);
 	tp->TxDescArray = NULL;
 	tp->RxDescArray = NULL;
 
-- 
1.7.1


^ permalink raw reply related

* [RFC PATCH 1/2] r8169: check dma mapping failures
From: Stanislaw Gruszka @ 2010-10-08 14:30 UTC (permalink / raw)
  To: Francois Romieu, netdev; +Cc: Denis Kirjanov, Stanislaw Gruszka

This is on top of the two r8169 patches I just sent.

Check for possible DMA mapping errors and clean up if one happens.
This patch has not been tested.

BTW: I see that many drivers do not check for these errors. Is it really
possible to hit them, and if so, when?
---
 drivers/net/r8169.c |   36 ++++++++++++++++++++++++++++--------
 1 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index bc669a4..b3b28b1 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -4018,20 +4018,24 @@ static struct sk_buff *rtl8169_alloc_rx_skb(struct pci_dev *pdev,
 
 	skb = __netdev_alloc_skb(dev, rx_buf_sz + pad, gfp);
 	if (!skb)
-		goto err_out;
+		goto err0;
 
 	skb_reserve(skb, align ? ((pad - 1) & (unsigned long)skb->data) : pad);
 
 	mapping = dma_map_single(&pdev->dev, skb->data, rx_buf_sz,
 				 PCI_DMA_FROMDEVICE);
+	if (dma_mapping_error(&pdev->dev, mapping))
+		goto err1;
 
 	rtl8169_map_to_asic(desc, mapping, rx_buf_sz);
-out:
+
 	return skb;
 
-err_out:
+err1:
+	dev_kfree_skb(skb);
+err0:
 	rtl8169_make_unusable_by_asic(desc);
-	goto out;
+	return NULL;
 }
 
 static void rtl8169_rx_clear(struct rtl8169_private *tp)
@@ -4115,11 +4119,11 @@ static void rtl8169_unmap_tx_skb(struct pci_dev *pdev, struct ring_info *tx_skb,
 	tx_skb->len = 0;
 }
 
-static void rtl8169_tx_clear(struct rtl8169_private *tp)
+static void rtl8169_tx_clear_range(struct rtl8169_private *tp, u32 start, u32 end)
 {
-	unsigned int i;
+	u32 i;
 
-	for (i = tp->dirty_tx; i < tp->dirty_tx + NUM_TX_DESC; i++) {
+	for (i = start; i < end; i++) {
 		unsigned int entry = i % NUM_TX_DESC;
 		struct ring_info *tx_skb = tp->tx_skb + entry;
 		unsigned int len = tx_skb->len;
@@ -4136,6 +4140,11 @@ static void rtl8169_tx_clear(struct rtl8169_private *tp)
 			tp->dev->stats.tx_dropped++;
 		}
 	}
+}
+
+static inline void rtl8169_tx_clear(struct rtl8169_private *tp)
+{
+	rtl8169_tx_clear_range(tp, tp->dirty_tx, tp->dirty_tx + NUM_TX_DESC);
 	tp->cur_tx = tp->dirty_tx = 0;
 }
 
@@ -4254,6 +4263,8 @@ static int rtl8169_xmit_frags(struct rtl8169_private *tp, struct sk_buff *skb,
 		addr = ((void *) page_address(frag->page)) + frag->page_offset;
 		mapping = dma_map_single(&tp->pci_dev->dev, addr, len,
 					 PCI_DMA_TODEVICE);
+		if (dma_mapping_error(&tp->pci_dev->dev, mapping))
+			return -cur_frag;
 
 		/* anti gcc 2.95.3 bugware (sic) */
 		status = opts1 | len | (RingEnd * !((entry + 1) % NUM_TX_DESC));
@@ -4314,7 +4325,10 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb,
 	opts1 = DescOwn | rtl8169_tso_csum(skb, dev);
 
 	frags = rtl8169_xmit_frags(tp, skb, opts1);
-	if (frags) {
+	if (frags < 0) {
+		frags = -frags;
+		goto err_dma;
+	} else if (frags) {
 		len = skb_headlen(skb);
 		opts1 |= FirstFrag;
 	} else {
@@ -4325,6 +4339,8 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb,
 
 	mapping = dma_map_single(&tp->pci_dev->dev, skb->data, len,
 				 PCI_DMA_TODEVICE);
+	if (dma_mapping_error(&tp->pci_dev->dev, mapping))
+		goto err_dma;
 
 	tp->tx_skb[entry].len = len;
 	txd->addr = cpu_to_le64(mapping);
@@ -4355,6 +4371,10 @@ err_stop:
 	netif_stop_queue(dev);
 	dev->stats.tx_dropped++;
 	return NETDEV_TX_BUSY;
+
+err_dma:
+	rtl8169_tx_clear_range(tp, entry, entry + frags + 1);
+	return NETDEV_TX_OK;
 }
 
 static void rtl8169_pcierr_interrupt(struct net_device *dev)
-- 
1.7.1


^ permalink raw reply related

* [RFC PATCH 2/2] r8169: reduce number of functions arguments
From: Stanislaw Gruszka @ 2010-10-08 14:30 UTC (permalink / raw)
  To: Francois Romieu, netdev; +Cc: Denis Kirjanov, Stanislaw Gruszka
In-Reply-To: <1286548203-10831-1-git-send-email-sgruszka@redhat.com>

We don't need to pass these arguments on the stack since we already have
them in the per-device private structure. This patch has not been tested.
---
 drivers/net/r8169.c |   30 ++++++++++++------------------
 1 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index b3b28b1..65d4219 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -4005,29 +4005,26 @@ static inline void rtl8169_map_to_asic(struct RxDesc *desc, dma_addr_t mapping,
 	rtl8169_mark_to_asic(desc, rx_buf_sz);
 }
 
-static struct sk_buff *rtl8169_alloc_rx_skb(struct pci_dev *pdev,
-					    struct net_device *dev,
-					    struct RxDesc *desc, int rx_buf_sz,
-					    unsigned int align, gfp_t gfp)
+static struct sk_buff *rtl8169_alloc_rx_skb(struct rtl8169_private *tp,
+					    struct RxDesc *desc, gfp_t gfp)
 {
 	struct sk_buff *skb;
 	dma_addr_t mapping;
-	unsigned int pad;
+	unsigned int align = tp->align;
+	unsigned int pad = align ? align : NET_IP_ALIGN;
 
-	pad = align ? align : NET_IP_ALIGN;
-
-	skb = __netdev_alloc_skb(dev, rx_buf_sz + pad, gfp);
+	skb = __netdev_alloc_skb(tp->dev, tp->rx_buf_sz + pad, gfp);
 	if (!skb)
 		goto err0;
 
 	skb_reserve(skb, align ? ((pad - 1) & (unsigned long)skb->data) : pad);
 
-	mapping = dma_map_single(&pdev->dev, skb->data, rx_buf_sz,
+	mapping = dma_map_single(&tp->pci_dev->dev, skb->data, tp->rx_buf_sz,
 				 PCI_DMA_FROMDEVICE);
-	if (dma_mapping_error(&pdev->dev, mapping))
+	if (dma_mapping_error(&tp->pci_dev->dev, mapping))
 		goto err1;
 
-	rtl8169_map_to_asic(desc, mapping, rx_buf_sz);
+	rtl8169_map_to_asic(desc, mapping, tp->rx_buf_sz);
 
 	return skb;
 
@@ -4050,8 +4047,7 @@ static void rtl8169_rx_clear(struct rtl8169_private *tp)
 	}
 }
 
-static u32 rtl8169_rx_fill(struct rtl8169_private *tp, struct net_device *dev,
-			   u32 start, u32 end, gfp_t gfp)
+static u32 rtl8169_rx_fill(struct rtl8169_private *tp, u32 start, u32 end, gfp_t gfp)
 {
 	u32 cur;
 
@@ -4064,9 +4060,7 @@ static u32 rtl8169_rx_fill(struct rtl8169_private *tp, struct net_device *dev,
 		if (tp->Rx_skbuff[i])
 			continue;
 
-		skb = rtl8169_alloc_rx_skb(tp->pci_dev, dev,
-					   tp->RxDescArray + i,
-					   tp->rx_buf_sz, tp->align, gfp);
+		skb = rtl8169_alloc_rx_skb(tp, tp->RxDescArray + i, gfp);
 		if (!skb)
 			break;
 
@@ -4094,7 +4088,7 @@ static int rtl8169_init_ring(struct net_device *dev)
 	memset(tp->tx_skb, 0x0, NUM_TX_DESC * sizeof(struct ring_info));
 	memset(tp->Rx_skbuff, 0x0, NUM_RX_DESC * sizeof(struct sk_buff *));
 
-	if (rtl8169_rx_fill(tp, dev, 0, NUM_RX_DESC, GFP_KERNEL) != NUM_RX_DESC)
+	if (rtl8169_rx_fill(tp, 0, NUM_RX_DESC, GFP_KERNEL) != NUM_RX_DESC)
 		goto err_out;
 
 	rtl8169_mark_as_last_descriptor(tp->RxDescArray + NUM_RX_DESC - 1);
@@ -4612,7 +4606,7 @@ static int rtl8169_rx_interrupt(struct net_device *dev,
 	count = cur_rx - tp->cur_rx;
 	tp->cur_rx = cur_rx;
 
-	delta = rtl8169_rx_fill(tp, dev, tp->dirty_rx, tp->cur_rx, GFP_ATOMIC);
+	delta = rtl8169_rx_fill(tp, tp->dirty_rx, tp->cur_rx, GFP_ATOMIC);
 	if (!delta && count)
 		netif_info(tp, intr, dev, "no Rx buffer allocated\n");
 	tp->dirty_rx += delta;
-- 
1.7.1


^ permalink raw reply related

* Re: [PATCH] ehea: Fix a checksum issue on the receive path
From: Eric Dumazet @ 2010-10-08 14:36 UTC (permalink / raw)
  To: Breno Leitao; +Cc: davem, netdev, Jay Vosburgh
In-Reply-To: <4CAF2732.90703@linux.vnet.ibm.com>

Le vendredi 08 octobre 2010 à 11:14 -0300, Breno Leitao a écrit :
> Hi Eric
> 
> On 10/08/2010 01:45 AM, Eric Dumazet wrote:
> > Just to be clear : packets with wrong checksums are not given to upper
> > stack, so a tcpdump can not display them ? I am not sure many drivers do
> > that.
> Well, what my code does is: 1) if the current packet is a UDP/TCP, then 
> the checksum is not necessary, since we would check the checksum on 
> ehea_proc_rwqes(), specific at this part of the code:
> 
>                 if (!ehea_check_cqe(cqe, &rq)) {
> 			// Send the packet to the up layers
> 
> And ehea_check_cqe() checks for wrong checksumed packets on:
> 	
>          if ((cqe->status & EHEA_CQE_STAT_ERR_MASK) == 0)
>                  return 0;
> 
> 
> Botton line, TCP/UDP packets with wrong checksums are dropped by 
> ehea_proc_rwqes(), others go to the up layer.
> 
> So, back to your question, you are saying that we shouldn't do that, 
> meaning that we should send to the upper layers all packets ? even those 
> that have the wrong checksum ?
> 

I am pretty sure most (if not all) netdev drivers pass the packet with
invalid checksum to upper stack, so that we can increment appropriate
SNMP counters, in IP stack or UDP/TCP/whatever stack.

tg3, bnx2, e1000, skge, sky2, bnx2x, niu, r8169, igb, ... seem to do
that.




^ permalink raw reply

* Re: [PATCH 1/2] r8169: allocate with GFP_KERNEL flag when able to sleep
From: Stanislaw Gruszka @ 2010-10-08 14:52 UTC (permalink / raw)
  To: Francois Romieu, netdev
In-Reply-To: <1286547901-10782-1-git-send-email-sgruszka@redhat.com>

On Fri, Oct 08, 2010 at 04:25:00PM +0200, Stanislaw Gruszka wrote:
> We have fedora bug report where driver fail to initialize after
> suspend/resume because of memory allocation errors:
> https://bugzilla.redhat.com/show_bug.cgi?id=629158

There is also one more thing to do regarding the above. Call traces from the
bug reports show that an order-3 allocation fails. On an arch with 4kB pages,
order 3 means a 32kB allocation. We want to allocate 16kB, but there is also
internal sk_buff data, which makes us exceed the boundary and take
32kB from the allocator, giving almost 50% wastage.

To fix this we can use a method similar to the niu or iwlwifi drivers: allocate
pages directly from the buddy allocator and attach them to the skb (with
skb_add_rx_frag, for example). I'm going to prepare such a patch, but
I have one doubt: what happens if the page size in the system is bigger
than 16kB? Should I care about such a case?

Stanislaw

^ permalink raw reply

* Re: [PATCH 1/2] r8169: allocate with GFP_KERNEL flag when able to sleep
From: Eric Dumazet @ 2010-10-08 15:04 UTC (permalink / raw)
  To: Stanislaw Gruszka; +Cc: Francois Romieu, netdev
In-Reply-To: <20101008145256.GB10393@redhat.com>

Le vendredi 08 octobre 2010 à 16:52 +0200, Stanislaw Gruszka a écrit :
> On Fri, Oct 08, 2010 at 04:25:00PM +0200, Stanislaw Gruszka wrote:
> > We have fedora bug report where driver fail to initialize after
> > suspend/resume because of memory allocation errors:
> > https://bugzilla.redhat.com/show_bug.cgi?id=629158
> 
> There is also one more thing to do regarding above. Calltraces from bug
> reports, shows that order 3 allocation fail. On arch with 4kB pages,
> order 3 mean 32kB allocation. We want to alloc 16kB, but there is also
> internal sk_buff data what make that we exceed the boundary and take
> 32kB from allocator, getting almost 50% wastage.
> 

Or it's only a 1460+overhead allocation, and SLUB uses order-3 pages to
satisfy 2048-byte allocations.

# grep 2048 /proc/slabinfo 
kmalloc-2048        8664   8752   2048   16    8 : tunables    0    0
0 : slabdata    547    547      0


The 8 in the <pagesperslab> column says just that: order-3 pages, even for
small allocations.

Switch to SLAB -> no more problem ;)


> To fix we can use similar method as in niu or iwlwifi drivers, alloc
> pages directly form buddy allocator and attach them to skb (by
> skb_add_rx_frag for example). I'm going to prepare such patch, but
> I have one doubt, what happens if page size in system is bigger
> than 16kB, should I care about such case? 

Seems tricky. Should we patch all drivers to do something like that ?




^ permalink raw reply

* Re: Linux 2.6.36-rc7
From: James Bottomley @ 2010-10-08 15:05 UTC (permalink / raw)
  To: Stephen Rothwell
  Cc: Linus Torvalds, Linux Kernel Mailing List, Russell King,
	David Miller, netdev, John W. Linville, Michal Marek,
	Dmitry Torokhov
In-Reply-To: <20101007114938.ad3d2c76.sfr@canb.auug.org.au>

On Thu, 2010-10-07 at 11:49 +1100, Stephen Rothwell wrote:
> Hi Linus,
> 
> On Wed, 6 Oct 2010 14:45:13 -0700 Linus Torvalds <torvalds@linux-foundation.org> wrote:
> >
> > This should be the last -rc, I'm not seeing any reason to keep
> > delaying a real release. There was still more changes to
> > drivers/gpu/drm than I really would have hoped for, but they all look
> > harmless and good. Famous last words.
> 
> I have no idea how critical any of this stuff is, but linux-next contain
> the following in it's "current" trees i.e. stuff that is supposed to go
> into 2.6.36.  These are from the arm-current, scsi-rc-fixes, net-current,
> wireless-current, kbuild-current, input-current and ide-curent trees
> (contacts cc'd).

The SCSI rc-fixes stuff is critical if you run into the bugs, but the
bugs are fairly rare cases for most people.  I'd still like to get them
in, though (and I have another 3 rc fixes candidates going through the
test pipeline).

James

^ permalink raw reply

* Re: BUG ? ipip unregister_netdevice_many()
From: Daniel Lezcano @ 2010-10-08 15:53 UTC (permalink / raw)
  To: Hans Schillstrom; +Cc: Eric W. Biederman, netdev@vger.kernel.org
In-Reply-To: <201010081428.37639.hans.schillstrom@ericsson.com>

On 10/08/2010 02:28 PM, Hans Schillstrom wrote:
> Hi Eric,
> Any advice how to trace this down ?
> This rollback_registered_many() seems to have on the lists before...
> All IPv4 and IPv6 tunnels causes this crash, all you have to do is load the tunnel module(s)
> enter a new ns and exit from it.
>
> Have not tested any more devices than tunnels,
> I did an "ip link delete" on my macvlans before exiting the ns.
>    

Ah! I succeeded in reproducing it.
In fact, it does not appear immediately.

I am trying to simplify the configuration, but I keep running into the bug I
talked about in the previous email.

> snip
>    
>>   # ------------[ cut here ]------------
>> WARNING: at /home/hans/evip/kvm/net-next-2.6/kernel/sysctl.c:1953 unregister_sysctl_table+0xc7/0xf9()
>> Hardware name: Bochs
>> Modules linked in: macvlan ip6_tunnel tunnel6 pcnet32 tg3 libphy
>> Pid: 5, comm: kworker/u:0 Not tainted 2.6.36-rc3 #2
>> Call Trace:
>>   [<ffffffff8103e281>] warn_slowpath_common+0x85/0x9d
>>   [<ffffffff8103e2b3>] warn_slowpath_null+0x1a/0x1c
>>   [<ffffffff81045e64>] unregister_sysctl_table+0xc7/0xf9
>>   [<ffffffff812c86a5>] neigh_sysctl_unregister+0x27/0x3f
>>   [<ffffffff81340c75>] addrconf_ifdown+0x415/0x45e
>>   [<ffffffff81341705>] addrconf_notify+0x756/0x7fe
>>   [<ffffffff812cacfb>] ? neigh_ifdown+0xc3/0xd4
>>   [<ffffffff81360eb3>] ? ip6mr_device_event+0x8d/0x9e
>>   [<ffffffff8105eddb>] notifier_call_chain+0x37/0x63
>>   [<ffffffff8105ee8b>] raw_notifier_call_chain+0x14/0x16
>>   [<ffffffff812c15c7>] call_netdevice_notifiers+0x4a/0x4f
>>   [<ffffffff812c1c1b>] rollback_registered_many+0x121/0x208
>>   [<ffffffff812c1d1d>] unregister_netdevice_many+0x1b/0x71
>>   [<ffffffffa0047244>] ip6_tnl_exit_net+0xa4/0xb8 [ip6_tunnel]
>>   [<ffffffff812bc941>] ? cleanup_net+0x0/0x198
>>   [<ffffffff812bc2cf>] ops_exit_list+0x2a/0x5b
>>   [<ffffffff812bca39>] cleanup_net+0xf8/0x198
>>   [<ffffffff810568c7>] process_one_work+0x2a2/0x44d
>>   [<ffffffff81056e35>] worker_thread+0x1db/0x34e
>>   [<ffffffff81056c5a>] ? worker_thread+0x0/0x34e
>>   [<ffffffff8105a030>] kthread+0x82/0x8a
>>   [<ffffffff81003954>] kernel_thread_helper+0x4/0x10
>>   [<ffffffff81059fae>] ? kthread+0x0/0x8a
>>   [<ffffffff81003950>] ? kernel_thread_helper+0x0/0x10
>> ---[ end trace eb3bc950cf9a8748 ]---
>> unregister_netdevice: waiting for lo to become free. Usage count = 4
>> unregister_netdevice: waiting for lo to become free. Usage count = 4
>> unregister_netdevice: waiting for lo to become free. Usage count = 4
>>      
>
>    


^ permalink raw reply

* Re: [PATCH 1/2] r8169: allocate with GFP_KERNEL flag when able to sleep
From: Stanislaw Gruszka @ 2010-10-08 16:03 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Francois Romieu, netdev
In-Reply-To: <1286550247.2959.444.camel@edumazet-laptop>

On Fri, Oct 08, 2010 at 05:04:07PM +0200, Eric Dumazet wrote:
> Le vendredi 08 octobre 2010 à 16:52 +0200, Stanislaw Gruszka a écrit :
> > On Fri, Oct 08, 2010 at 04:25:00PM +0200, Stanislaw Gruszka wrote:
> > > We have fedora bug report where driver fail to initialize after
> > > suspend/resume because of memory allocation errors:
> > > https://bugzilla.redhat.com/show_bug.cgi?id=629158
> > 
> > There is also one more thing to do regarding above. Calltraces from bug
> > reports, shows that order 3 allocation fail. On arch with 4kB pages,
> > order 3 mean 32kB allocation. We want to alloc 16kB, but there is also
> > internal sk_buff data what make that we exceed the boundary and take
> > 32kB from allocator, getting almost 50% wastage.
> > 
> 
> Or its only an 1460+overhead allocation, and SLUB uses order-3 pages to
> satisfy 2048 bytes allocations.

Rather not: the trace shows the failure in rtl8169_rx_fill, where we allocate rx
buffers, and these are 16kB big by default.

> Switch to SLAB -> no more problem ;)

Yeah, I wish I could, but Fedora uses SLUB because of some of its debugging
capabilities.

> > To fix we can use similar method as in niu or iwlwifi drivers, alloc
> > pages directly form buddy allocator and attach them to skb (by
> > skb_add_rx_frag for example). I'm going to prepare such patch, but
> > I have one doubt, what happens if page size in system is bigger
> > than 16kB, should I care about such case? 
> 
> Seems tricky. Should we patch all drivers to do something like that ?

I think only those drivers which do alloc_skb(n*PAGE_SIZE).
As an alternative, we could be smarter in alloc_skb.

Stanislaw
> 
> 
> 

^ permalink raw reply

* Re: BUG ? ipip unregister_netdevice_many()
From: Eric W. Biederman @ 2010-10-08 16:06 UTC (permalink / raw)
  To: Hans Schillstrom; +Cc: netdev@vger.kernel.org, Daniel Lezcano
In-Reply-To: <201010071048.12817.hans.schillstrom@ericsson.com>

Hans Schillstrom <hans.schillstrom@ericsson.com> writes:

> Hello
> I'm trying to exit a network name space and it doesn't work (or am I doing something wrong?)
> The only netdevices left are lo and the tunnels ip6tnl0, sit0 and tunl0 when exiting netns.
>
> A netns is created by lxc-execute with two interfaces eth0 eth1 (macvlan)
> (see conf file at the end)
>
> Kernel: net-next-2.6 top from 4 october 2010
>
> I added some printk's inn ipip.c  ipip_exit_net()
> ...
>         rtnl_lock();
>         printk(KERN_ERR "ipip_exit_net(enter)\n");
>         ipip_destroy_tunnels(ipn, &list);
>         printk(KERN_ERR "ipip_exit_net(1)\n");
>         unregister_netdevice_queue(ipn->fb_tunnel_dev, &list);
>         printk(KERN_ERR "ipip_exit_net(2)\n");
>         unregister_netdevice_many(&list);
>         printk(KERN_ERR "ipip_exit_net(3)\n");
>         rtnl_unlock();
>         printk(KERN_ERR "ipip_exit_net(exit)\n");
>
>
> Exit steps:
> ===== Screen dump =====
>
>  # ifconfig eth0  0.0.0.0  down
>  # ifconfig eth1  0.0.0.0  down
>  # ifconfig lo  0.0.0.0  down
>  # ip li de eth0
>  # ip li de eth1
>  # ifconfig -a
> ip6tnl0   Link encap:UNSPEC  HWaddr 00-00-00-00-00-00-00-00-00-00-00-00-00-00-00-00  
>           NOARP  MTU:1460  Metric:1
>           RX packets:0 errors:0 dropped:0 overruns:0 frame:0
>           TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
>           collisions:0 txqueuelen:0 
>           RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)
>
> lo        Link encap:Local Loopback  
>           inet addr:127.0.0.1  Mask:255.0.0.0
>           LOOPBACK  MTU:16436  Metric:1
>           RX packets:0 errors:0 dropped:0 overruns:0 frame:0
>           TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
>           collisions:0 txqueuelen:0 
>           RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)
>
> sit0      Link encap:IPv6-in-IPv4  
>           NOARP  MTU:1480  Metric:1
>           RX packets:0 errors:0 dropped:0 overruns:0 frame:0
>           TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
>           collisions:0 txqueuelen:0 
>           RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)
>
> tunl0     Link encap:UNSPEC  HWaddr 00-00-00-00-00-00-00-00-00-00-00-00-00-00-00-00  
>           NOARP  MTU:1480  Metric:1
>           RX packets:0 errors:0 dropped:0 overruns:0 frame:0
>           TX packets:0 errors:0 dropped:0 overruns:0 carrier:0
>           collisions:0 txqueuelen:0 
>           RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)
>
>  # ps
>   PID USER       VSZ STAT COMMAND
>     1 root     12412 S    /usr/lib64/lxc/lxc-init -- /var/bin/init
>     2 root      4540 S    /bin/ash /var/bin/init
>     7 root      6640 S    inetd
>     8 root      4544 S    /bin/ash
>    26 root      4544 R    ps
>  # lsmod 
> Module                  Size  Used by    Not tainted
> macvlan                 8709  0 
> pcnet32                29549  0 
> tg3                   112093  0 
> libphy                 21043  1 tg3
>  # kill 7 2
>  # ps
>   PID USER       VSZ STAT COMMAND
>     1 root     12412 S    /usr/lib64/lxc/lxc-init -- /var/bin/init
>     8 root      4544 S    /bin/ash
>    28 root      4544 R    ps
>  # exit  ( here is the exit from netns  )
>  # ipip_exit_net(enter)
> ipip_exit_net(1)
> ipip_exit_net(2)
> ------------[ cut here ]------------
> WARNING: at /home/hans/evip/kvm/net-next-2.6/kernel/sysctl.c:1953
>   unregister_sysctl_table+0xc7/0xf9()

This warning is caused by removing the parent directory
before the child in the sysctl tables.  Not strictly fatal but
it is a problem.  It may be worth looking at which sysctl
tables ipip registers to see if we can rectify this.

> Hardware name: Bochs
> Modules linked in: macvlan pcnet32 tg3 libphy
> Pid: 5, comm: kworker/u:0 Not tainted 2.6.36-rc3+ #7
> Call Trace:
>  [<ffffffff8103e281>] warn_slowpath_common+0x85/0x9d
>  [<ffffffff8103e2b3>] warn_slowpath_null+0x1a/0x1c
>  [<ffffffff81045e64>] unregister_sysctl_table+0xc7/0xf9
>  [<ffffffff812c86a5>] neigh_sysctl_unregister+0x27/0x3f
>  [<ffffffff81342108>] addrconf_ifdown+0x415/0x45e
>  [<ffffffff81342b98>] addrconf_notify+0x756/0x7fe
>  [<ffffffff812cacfb>] ? neigh_ifdown+0xc3/0xd4
>  [<ffffffff813622b3>] ? ip6mr_device_event+0x8d/0x9e
>  [<ffffffff8105eddb>] notifier_call_chain+0x37/0x63
>  [<ffffffff8105ee8b>] raw_notifier_call_chain+0x14/0x16
>  [<ffffffff812c15c7>] call_netdevice_notifiers+0x4a/0x4f
>  [<ffffffff812c1c1b>] rollback_registered_many+0x121/0x208
>  [<ffffffff812c1d1d>] unregister_netdevice_many+0x1b/0x71
>  [<ffffffff81324209>] ipip_exit_net+0xea/0x11a
>  [<ffffffff812bc941>] ? cleanup_net+0x0/0x198
>  [<ffffffff812bc2cf>] ops_exit_list+0x2a/0x5b
>  [<ffffffff812bca39>] cleanup_net+0xf8/0x198
>  [<ffffffff810568c7>] process_one_work+0x2a2/0x44d
>  [<ffffffff81056e35>] worker_thread+0x1db/0x34e
>  [<ffffffff81056c5a>] ? worker_thread+0x0/0x34e
>  [<ffffffff8105a030>] kthread+0x82/0x8a
>  [<ffffffff81003954>] kernel_thread_helper+0x4/0x10
>  [<ffffffff81059fae>] ? kthread+0x0/0x8a
>  [<ffffffff81003950>] ? kernel_thread_helper+0x0/0x10
> ---[ end trace 939b5185219f32e7 ]---
> ipip_exit_net(3)
> ipip_exit_net(exit)
> unregister_netdevice: waiting for lo to become free. Usage count = 4
> unregister_netdevice: waiting for lo to become free. Usage count = 4
> unregister_netdevice: waiting for lo to become free. Usage count = 4

Nasty. Someone has left a reference lying around to one of the network
devices.  It is a reference that we can transfer to the loopback device
at device exit time, but we never drop the reference and so the loopback
interface never frees up.

Ouch!

There is the painful method of instrumenting dev_hold and dev_release,
which may give you a clue.  It may also be worth seeing which kinds of
device reference we transfer from the loopback device when a device
exits.

Eric

^ permalink raw reply

* Re: [PATCH] sysctl: fix min/max handling in __do_proc_doulongvec_minmax()
From: Américo Wang @ 2010-10-08 16:13 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Américo Wang, Robin Holt, Andrew Morton, linux-kernel,
	Willy Tarreau, David S. Miller, netdev, James Morris,
	Pekka Savola (ipv6), Patrick McHardy, Alexey Kuznetsov, ebiederm
In-Reply-To: <1286445081.2912.15.camel@edumazet-laptop>

On Thu, Oct 07, 2010 at 11:51:21AM +0200, Eric Dumazet wrote:
>Le jeudi 07 octobre 2010 à 17:25 +0800, Américo Wang a écrit :
>> >>
>> >
>> >Here is the final one.
>> 
>> Oops, that one is not correct. Hopefully this one
>> is correct.
>> 
>> --------------->
>> 
>> Eric D. noticed that we may trigger an OOPS if we leave ->extra{1,2}
>> to NULL when we use proc_doulongvec_minmax().
>> 
>> Actually, we don't need to store min/max values in a vector,
>> because all the elements in the vector should share the same min/max
>> value, like what proc_dointvec_minmax() does.
>> 
>
>If we assert same min/max limits are to be applied to all elements,
>a much simpler fix than yours would be :
>
>diff --git a/kernel/sysctl.c b/kernel/sysctl.c
>index f88552c..8e45451 100644
>--- a/kernel/sysctl.c
>+++ b/kernel/sysctl.c
>@@ -2485,7 +2485,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
> 		kbuf[left] = 0;
> 	}
> 
>-	for (; left && vleft--; i++, min++, max++, first=0) {
>+	for (; left && vleft--; i++, first=0) {
> 		unsigned long val;
> 
> 		if (write) {
>
>
>Please dont send huge patches like this to 'fix' a bug,
>especially on slow path.

Well, my patch makes that horrible code a little better. :)

>
>First we fix the bug, _then_ we can try to make code more 
>efficient or more pretty or shorter.
>
>So the _real_ question is :
>
>Should the min/max limits should be a single pair,
>shared by all elements, or a vector of limits.
>

Yes, actually I talked with Eric W. about this before
sending the patch.

I also checked the users of proc_doulongvec_minmax(),
none of them are using more than one limit, so it is
safe to remove that.


-- 
Live like a child, think like the god.
 

^ permalink raw reply

* Re: BUG ? ipip unregister_netdevice_many()
From: Daniel Lezcano @ 2010-10-08 16:17 UTC (permalink / raw)
  Cc: Hans Schillstrom, Eric W. Biederman, netdev@vger.kernel.org
In-Reply-To: <4CAF3E78.8030202@free.fr>

On 10/08/2010 05:53 PM, Daniel Lezcano wrote:
> On 10/08/2010 02:28 PM, Hans Schillstrom wrote:
>> Hi Eric,
>> Any advice how to trace this down ?
>> This rollback_registered_many() seems to have on the lists before...
>> All IPv4 and IPv6 tunnels causes this crash, all you have to do is
>> load the tunnel module(s)
>> enter a new ns and exit from it.
>>
>> Have not tested any more devices than tunnels,
>> I did an "ip link delete" on my macvlans before exiting the ns.
>
> Ah ! I succeed to reproduce it.
> It does not appear immediately in fact.
>
> I am trying to simplify the configuration but I am falling in the bug I
> talked about in the previous email.

OK, so after investigating: we just need to create a macvlan and assign it an
IPv6 address (inside a new netns, of course), and the loopback is not
released. I compiled out the tunnels, so I think they are not related to
this problem.

That reduces the scope of investigation :)

Looking forward ...

   -- Daniel

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox