Netdev List
 help / color / mirror / Atom feed
* [PATCH 6/9] tproxy: added IPv6 socket lookup function to nf_tproxy_core
From: KOVACS Krisztian @ 2010-10-20 11:21 UTC (permalink / raw)
  To: netdev, netfilter-devel; +Cc: Patrick McHardy, David Miller
In-Reply-To: <20101020112118.6260.31618.stgit@este.odu>

From: Balazs Scheidler <bazsi@balabit.hu>

Signed-off-by: Balazs Scheidler <bazsi@balabit.hu>
Signed-off-by: KOVACS Krisztian <hidden@balabit.hu>
---
 include/net/netfilter/nf_tproxy_core.h |   72 ++++++++++++++++++++++++++++++++
 1 files changed, 71 insertions(+), 1 deletions(-)

diff --git a/include/net/netfilter/nf_tproxy_core.h b/include/net/netfilter/nf_tproxy_core.h
index 1027d7f..cd85b3b 100644
--- a/include/net/netfilter/nf_tproxy_core.h
+++ b/include/net/netfilter/nf_tproxy_core.h
@@ -5,7 +5,8 @@
 #include <linux/in.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
-#include <net/inet_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
 #include <net/tcp.h>
 
 #define NFT_LOOKUP_ANY         0
@@ -130,6 +131,75 @@ nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
 	return sk;
 }
 
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static inline struct sock *
+nf_tproxy_get_sock_v6(struct net *net, const u8 protocol,
+		      const struct in6_addr *saddr, const struct in6_addr *daddr,
+		      const __be16 sport, const __be16 dport,
+		      const struct net_device *in, int lookup_type)
+{
+	struct sock *sk;
+
+	/* look up socket */
+	switch (protocol) {
+	case IPPROTO_TCP:
+		switch (lookup_type) {
+		case NFT_LOOKUP_ANY:
+			sk = inet6_lookup(net, &tcp_hashinfo,
+					  saddr, sport, daddr, dport,
+					  in->ifindex);
+			break;
+		case NFT_LOOKUP_LISTENER:
+			sk = inet6_lookup_listener(net, &tcp_hashinfo,
+						   daddr, ntohs(dport),
+						   in->ifindex);
+
+			/* NOTE: we return listeners even if bound to
+			 * 0.0.0.0, those are filtered out in
+			 * xt_socket, since xt_TPROXY needs 0 bound
+			 * listeners too */
+
+			break;
+		case NFT_LOOKUP_ESTABLISHED:
+			sk = __inet6_lookup_established(net, &tcp_hashinfo,
+							saddr, sport, daddr, ntohs(dport),
+							in->ifindex);
+			break;
+		default:
+			WARN_ON(1);
+			sk = NULL;
+			break;
+		}
+		break;
+	case IPPROTO_UDP:
+		sk = udp6_lib_lookup(net, saddr, sport, daddr, dport,
+				     in->ifindex);
+		if (sk && lookup_type != NFT_LOOKUP_ANY) {
+			int connected = (sk->sk_state == TCP_ESTABLISHED);
+			int wildcard = ipv6_addr_any(&inet6_sk(sk)->rcv_saddr);
+
+			/* NOTE: we return listeners even if bound to
+			 * 0.0.0.0, those are filtered out in
+			 * xt_socket, since xt_TPROXY needs 0 bound
+			 * listeners too */
+			if ((lookup_type == NFT_LOOKUP_ESTABLISHED && (!connected || wildcard)) ||
+			    (lookup_type == NFT_LOOKUP_LISTENER && connected)) {
+				sock_put(sk);
+				sk = NULL;
+			}
+		}
+		break;
+	default:
+		WARN_ON(1);
+		sk = NULL;
+	}
+
+	pr_debug("tproxy socket lookup: proto %u %pI6:%u -> %pI6:%u, lookup type: %d, sock %p\n",
+		 protocol, saddr, ntohs(sport), daddr, ntohs(dport), lookup_type, sk);
+
+	return sk;
+}
+#endif
 
 static inline void
 nf_tproxy_put_sock(struct sock *sk)



^ permalink raw reply related

* [PATCH 7/9] tproxy: added IPv6 support to the TPROXY target
From: KOVACS Krisztian @ 2010-10-20 11:21 UTC (permalink / raw)
  To: netdev, netfilter-devel; +Cc: Patrick McHardy, David Miller
In-Reply-To: <20101020112118.6260.31618.stgit@este.odu>

From: Balazs Scheidler <bazsi@balabit.hu>

This requires a new revision as the old target structure was
IPv4 specific.

Signed-off-by: Balazs Scheidler <bazsi@balabit.hu>
Signed-off-by: KOVACS Krisztian <hidden@balabit.hu>
---
 include/linux/netfilter/xt_TPROXY.h |   15 +-
 net/netfilter/xt_TPROXY.c           |  262 ++++++++++++++++++++++++++++++-----
 2 files changed, 236 insertions(+), 41 deletions(-)

diff --git a/include/linux/netfilter/xt_TPROXY.h b/include/linux/netfilter/xt_TPROXY.h
index 152e8f9..7b4e06d 100644
--- a/include/linux/netfilter/xt_TPROXY.h
+++ b/include/linux/netfilter/xt_TPROXY.h
@@ -1,14 +1,21 @@
-#ifndef _XT_TPROXY_H_target
-#define _XT_TPROXY_H_target
+#ifndef _XT_TPROXY_H
+#define _XT_TPROXY_H
 
 /* TPROXY target is capable of marking the packet to perform
  * redirection. We can get rid of that whenever we get support for
  * mutliple targets in the same rule. */
-struct xt_tproxy_target_info {
+struct xt_tproxy_target_info_v0 {
 	u_int32_t mark_mask;
 	u_int32_t mark_value;
 	__be32 laddr;
 	__be16 lport;
 };
 
-#endif /* _XT_TPROXY_H_target */
+struct xt_tproxy_target_info_v1 {
+	u_int32_t mark_mask;
+	u_int32_t mark_value;
+	union nf_inet_addr laddr;
+	__be16 lport;
+};
+
+#endif /* _XT_TPROXY_H */
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 67cbed8..6ce76d6 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -1,7 +1,7 @@
 /*
  * Transparent proxy support for Linux/iptables
  *
- * Copyright (c) 2006-2007 BalaBit IT Ltd.
+ * Copyright (c) 2006-2010 BalaBit IT Ltd.
  * Author: Balazs Scheidler, Krisztian Kovacs
  *
  * This program is free software; you can redistribute it and/or modify
@@ -19,15 +19,18 @@
 
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
 #include <linux/netfilter/xt_TPROXY.h>
 
 #include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 #include <net/netfilter/nf_tproxy_core.h>
 
 /**
- * tproxy_handle_time_wait() - handle TCP TIME_WAIT reopen redirections
+ * tproxy_handle_time_wait4() - handle IPv4 TCP TIME_WAIT reopen redirections
  * @skb:	The skb being processed.
- * @par:	Iptables target parameters.
+ * @laddr:	IPv4 address to redirect to or zero.
+ * @lport:	TCP port to redirect to or zero.
  * @sk:		The TIME_WAIT TCP socket found by the lookup.
  *
  * We have to handle SYN packets arriving to TIME_WAIT sockets
@@ -35,16 +38,16 @@
  * redirect the new connection to the proxy if there's a listener
  * socket present.
  *
- * tproxy_handle_time_wait() consumes the socket reference passed in.
+ * tproxy_handle_time_wait4() consumes the socket reference passed in.
  *
  * Returns the listener socket if there's one, the TIME_WAIT socket if
  * no such listener is found, or NULL if the TCP header is incomplete.
  */
 static struct sock *
-tproxy_handle_time_wait(struct sk_buff *skb, const struct xt_action_param *par, struct sock *sk)
+tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
+			struct sock *sk)
 {
 	const struct iphdr *iph = ip_hdr(skb);
-	const struct xt_tproxy_target_info *tgi = par->targinfo;
 	struct tcphdr _hdr, *hp;
 
 	hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
@@ -59,13 +62,64 @@ tproxy_handle_time_wait(struct sk_buff *skb, const struct xt_action_param *par,
 		struct sock *sk2;
 
 		sk2 = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
-					    iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr,
-					    hp->source, tgi->lport ? tgi->lport : hp->dest,
-					    par->in, NFT_LOOKUP_LISTENER);
+					    iph->saddr, laddr ? laddr : iph->daddr,
+					    hp->source, lport ? lport : hp->dest,
+					    skb->dev, NFT_LOOKUP_LISTENER);
+		if (sk2) {
+			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+			inet_twsk_put(inet_twsk(sk));
+			sk = sk2;
+		}
+	}
+
+	return sk;
+}
+
+/**
+ * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections
+ * @skb:	The skb being processed.
+ * @tproto:	Transport protocol.
+ * @thoff:	Transport protocol header offset.
+ * @par:	Iptables target parameters.
+ * @sk:		The TIME_WAIT TCP socket found by the lookup.
+ *
+ * We have to handle SYN packets arriving to TIME_WAIT sockets
+ * differently: instead of reopening the connection we should rather
+ * redirect the new connection to the proxy if there's a listener
+ * socket present.
+ *
+ * tproxy_handle_time_wait6() consumes the socket reference passed in.
+ *
+ * Returns the listener socket if there's one, the TIME_WAIT socket if
+ * no such listener is found, or NULL if the TCP header is incomplete.
+ */
+static struct sock *
+tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
+			 const struct xt_action_param *par,
+			 struct sock *sk)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct tcphdr _hdr, *hp;
+	const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+
+	hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+	if (hp == NULL) {
+		inet_twsk_put(inet_twsk(sk));
+		return NULL;
+	}
+
+	if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+		/* SYN to a TIME_WAIT socket, we'd rather redirect it
+		 * to a listener socket if there's one */
+		struct sock *sk2;
+
+		sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+					    &iph->saddr,
+					    !ipv6_addr_any(&tgi->laddr.in6) ? &tgi->laddr.in6 : &iph->daddr,
+					    hp->source,
+					    tgi->lport ? tgi->lport : hp->dest,
+					    skb->dev, NFT_LOOKUP_LISTENER);
 		if (sk2) {
-			/* yeah, there's one, let's kill the TIME_WAIT
-			 * socket and redirect to the listener
-			 */
 			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
 			inet_twsk_put(inet_twsk(sk));
 			sk = sk2;
@@ -76,10 +130,10 @@ tproxy_handle_time_wait(struct sk_buff *skb, const struct xt_action_param *par,
 }
 
 static unsigned int
-tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
+tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
+	   u_int32_t mark_mask, u_int32_t mark_value)
 {
 	const struct iphdr *iph = ip_hdr(skb);
-	const struct xt_tproxy_target_info *tgi = par->targinfo;
 	struct udphdr _hdr, *hp;
 	struct sock *sk;
 
@@ -87,39 +141,140 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	if (hp == NULL)
 		return NF_DROP;
 
+	/* check if there's an ongoing connection on the packet
+	 * addresses, this happens if the redirect already happened
+	 * and the current packet belongs to an already established
+	 * connection */
 	sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
 				   iph->saddr, iph->daddr,
 				   hp->source, hp->dest,
-				   par->in, NFT_LOOKUP_ESTABLISHED);
+				   skb->dev, NFT_LOOKUP_ESTABLISHED);
 
 	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
 	if (sk && sk->sk_state == TCP_TIME_WAIT)
-		sk = tproxy_handle_time_wait(skb, par, sk);
+		/* reopening a TIME_WAIT connection needs special handling */
+		sk = tproxy_handle_time_wait4(skb, laddr, lport, sk);
 	else if (!sk)
+		/* no, there's no established connection, check if
+		 * there's a listener on the redirected addr/port */
 		sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
-					   iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr,
-					   hp->source, tgi->lport ? tgi->lport : hp->dest,
-					   par->in, NFT_LOOKUP_LISTENER);
+					   iph->saddr, laddr ? laddr : iph->daddr,
+					   hp->source, lport ? lport : hp->dest,
+					   skb->dev, NFT_LOOKUP_LISTENER);
 
 	/* NOTE: assign_sock consumes our sk reference */
 	if (sk && nf_tproxy_assign_sock(skb, sk)) {
 		/* This should be in a separate target, but we don't do multiple
 		   targets on the same rule yet */
-		skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
+		skb->mark = (skb->mark & ~mark_mask) ^ mark_value;
 
-		pr_debug("redirecting: proto %u %08x:%u -> %08x:%u, mark: %x\n",
-			 iph->protocol, ntohl(iph->daddr), ntohs(hp->dest),
-			 ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark);
+		pr_debug("redirecting: proto %u %pI4:%u -> %pI4:%u, mark: %x\n",
+			 iph->protocol, &iph->daddr, ntohs(hp->dest),
+			 &laddr, ntohs(lport), skb->mark);
 		return NF_ACCEPT;
 	}
 
 	pr_debug("no socket, dropping: proto %u %08x:%u -> %08x:%u, mark: %x\n",
 		 iph->protocol, ntohl(iph->daddr), ntohs(hp->dest),
-		 ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark);
+		 ntohl(laddr), ntohs(lport), skb->mark);
+	return NF_DROP;
+}
+
+static unsigned int
+tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_tproxy_target_info_v0 *tgi = par->targinfo;
+
+	return tproxy_tg4(skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value);
+}
+
+static unsigned int
+tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+
+	return tproxy_tg4(skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value);
+}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static unsigned int
+tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+	struct udphdr _hdr, *hp;
+	struct sock *sk;
+	int thoff;
+	int tproto;
+
+	tproto = ipv6_find_hdr(skb, &thoff, -1, NULL);
+	if (tproto < 0) {
+		pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+		return NF_DROP;
+	}
+
+	hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+	if (hp == NULL) {
+		pr_debug("unable to grab transport header contents in IPv6 packet, dropping\n");
+		return NF_DROP;
+	}
+
+	/* check if there's an ongoing connection on the packet
+	 * addresses, this happens if the redirect already happened
+	 * and the current packet belongs to an already established
+	 * connection */
+	sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+				   &iph->saddr, &iph->daddr,
+				   hp->source, hp->dest,
+				   par->in, NFT_LOOKUP_ESTABLISHED);
+
+	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
+	if (sk && sk->sk_state == TCP_TIME_WAIT)
+		/* reopening a TIME_WAIT connection needs special handling */
+		sk = tproxy_handle_time_wait6(skb, tproto, thoff, par, sk);
+	else if (!sk)
+		/* no there's no established connection, check if
+		 * there's a listener on the redirected addr/port */
+		sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+					   &iph->saddr,
+					   !ipv6_addr_any(&tgi->laddr.in6) ? &tgi->laddr.in6 : &iph->daddr,
+					   hp->source,
+					   tgi->lport ? tgi->lport : hp->dest,
+					   par->in, NFT_LOOKUP_LISTENER);
+
+	/* NOTE: assign_sock consumes our sk reference */
+	if (sk && nf_tproxy_assign_sock(skb, sk)) {
+		/* This should be in a separate target, but we don't do multiple
+		   targets on the same rule yet */
+		skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
+
+		pr_debug("redirecting: proto %u %pI6:%u -> %pI6:%u, mark: %x\n",
+			 tproto, &iph->saddr, ntohs(hp->dest),
+			 &tgi->laddr.in6, ntohs(tgi->lport), skb->mark);
+		return NF_ACCEPT;
+	}
+
+	pr_debug("no socket, dropping: proto %u %pI6:%u -> %pI6:%u, mark: %x\n",
+		 tproto, &iph->saddr, ntohs(hp->dest),
+		 &tgi->laddr.in6, ntohs(tgi->lport), skb->mark);
 	return NF_DROP;
 }
 
-static int tproxy_tg_check(const struct xt_tgchk_param *par)
+static int tproxy_tg6_check(const struct xt_tgchk_param *par)
+{
+	const struct ip6t_ip6 *i = par->entryinfo;
+
+	if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP)
+	    && !(i->flags & IP6T_INV_PROTO))
+		return 0;
+
+	pr_info("Can be used only in combination with "
+		"either -p tcp or -p udp\n");
+	return -EINVAL;
+}
+#endif
+
+static int tproxy_tg4_check(const struct xt_tgchk_param *par)
 {
 	const struct ipt_ip *i = par->entryinfo;
 
@@ -132,31 +287,64 @@ static int tproxy_tg_check(const struct xt_tgchk_param *par)
 	return -EINVAL;
 }
 
-static struct xt_target tproxy_tg_reg __read_mostly = {
-	.name		= "TPROXY",
-	.family		= AF_INET,
-	.table		= "mangle",
-	.target		= tproxy_tg,
-	.targetsize	= sizeof(struct xt_tproxy_target_info),
-	.checkentry	= tproxy_tg_check,
-	.hooks		= 1 << NF_INET_PRE_ROUTING,
-	.me		= THIS_MODULE,
+static struct xt_target tproxy_tg_reg[] __read_mostly = {
+	{
+		.name		= "TPROXY",
+		.family		= NFPROTO_IPV4,
+		.table		= "mangle",
+		.target		= tproxy_tg4_v0,
+		.revision	= 0,
+		.targetsize	= sizeof(struct xt_tproxy_target_info_v0),
+		.checkentry	= tproxy_tg4_check,
+		.hooks		= 1 << NF_INET_PRE_ROUTING,
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "TPROXY",
+		.family		= NFPROTO_IPV4,
+		.table		= "mangle",
+		.target		= tproxy_tg4_v1,
+		.revision	= 1,
+		.targetsize	= sizeof(struct xt_tproxy_target_info_v1),
+		.checkentry	= tproxy_tg4_check,
+		.hooks		= 1 << NF_INET_PRE_ROUTING,
+		.me		= THIS_MODULE,
+	},
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	{
+		.name		= "TPROXY",
+		.family		= NFPROTO_IPV6,
+		.table		= "mangle",
+		.target		= tproxy_tg6_v1,
+		.revision	= 1,
+		.targetsize	= sizeof(struct xt_tproxy_target_info_v1),
+		.checkentry	= tproxy_tg6_check,
+		.hooks		= 1 << NF_INET_PRE_ROUTING,
+		.me		= THIS_MODULE,
+	},
+#endif
+
 };
 
 static int __init tproxy_tg_init(void)
 {
 	nf_defrag_ipv4_enable();
-	return xt_register_target(&tproxy_tg_reg);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	nf_defrag_ipv6_enable();
+#endif
+
+	return xt_register_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
 }
 
 static void __exit tproxy_tg_exit(void)
 {
-	xt_unregister_target(&tproxy_tg_reg);
+	xt_unregister_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg));
 }
 
 module_init(tproxy_tg_init);
 module_exit(tproxy_tg_exit);
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Krisztian Kovacs");
+MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs");
 MODULE_DESCRIPTION("Netfilter transparent proxy (TPROXY) target module.");
 MODULE_ALIAS("ipt_TPROXY");
+MODULE_ALIAS("ip6t_TPROXY");



^ permalink raw reply related

* [PATCH 9/9] tproxy: use the interface primary IP address as a default value for --on-ip
From: KOVACS Krisztian @ 2010-10-20 11:21 UTC (permalink / raw)
  To: netdev, netfilter-devel; +Cc: Patrick McHardy, David Miller
In-Reply-To: <20101020112118.6260.31618.stgit@este.odu>

From: Balazs Scheidler <bazsi@balabit.hu>

The REDIRECT target and the older TProxy versions used the primary address
of the incoming interface as the default value of the --on-ip parameter.
This was unintentionally changed during the initial TProxy submission and
caused confusion among users.

Since IPv6 has no notion of primary address, we just select the first address
on the list: this way the socket lookup finds wildcard bound sockets
properly and we cannot really do better without the user telling us the
IPv6 address of the proxy.

This is implemented for both IPv4 and IPv6.

Signed-off-by: Balazs Scheidler <bazsi@balabit.hu>
Signed-off-by: KOVACS Krisztian <hidden@balabit.hu>
---
 net/netfilter/xt_TPROXY.c |  198 +++++++++++++++++++++++++++++----------------
 1 files changed, 128 insertions(+), 70 deletions(-)

diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 6ce76d6..a90ff20 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -16,15 +16,41 @@
 #include <net/checksum.h>
 #include <net/udp.h>
 #include <net/inet_sock.h>
-
+#include <linux/inetdevice.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv6/ip6_tables.h>
-#include <linux/netfilter/xt_TPROXY.h>
 
 #include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#include <net/if_inet6.h>
+#include <net/addrconf.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#endif
+
 #include <net/netfilter/nf_tproxy_core.h>
+#include <linux/netfilter/xt_TPROXY.h>
+
+static inline __be32
+tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
+{
+	struct in_device *indev;
+	__be32 laddr;
+
+	if (user_laddr)
+		return user_laddr;
+
+	laddr = 0;
+	rcu_read_lock();
+	indev = __in_dev_get_rcu(skb->dev);
+	for_primary_ifa(indev) {
+		laddr = ifa->ifa_local;
+		break;
+	} endfor_ifa(indev);
+	rcu_read_unlock();
+
+	return laddr ? laddr : daddr;
+}
 
 /**
  * tproxy_handle_time_wait4() - handle IPv4 TCP TIME_WAIT reopen redirections
@@ -75,60 +101,6 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
 	return sk;
 }
 
-/**
- * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections
- * @skb:	The skb being processed.
- * @tproto:	Transport protocol.
- * @thoff:	Transport protocol header offset.
- * @par:	Iptables target parameters.
- * @sk:		The TIME_WAIT TCP socket found by the lookup.
- *
- * We have to handle SYN packets arriving to TIME_WAIT sockets
- * differently: instead of reopening the connection we should rather
- * redirect the new connection to the proxy if there's a listener
- * socket present.
- *
- * tproxy_handle_time_wait6() consumes the socket reference passed in.
- *
- * Returns the listener socket if there's one, the TIME_WAIT socket if
- * no such listener is found, or NULL if the TCP header is incomplete.
- */
-static struct sock *
-tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
-			 const struct xt_action_param *par,
-			 struct sock *sk)
-{
-	const struct ipv6hdr *iph = ipv6_hdr(skb);
-	struct tcphdr _hdr, *hp;
-	const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
-
-	hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
-	if (hp == NULL) {
-		inet_twsk_put(inet_twsk(sk));
-		return NULL;
-	}
-
-	if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
-		/* SYN to a TIME_WAIT socket, we'd rather redirect it
-		 * to a listener socket if there's one */
-		struct sock *sk2;
-
-		sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
-					    &iph->saddr,
-					    !ipv6_addr_any(&tgi->laddr.in6) ? &tgi->laddr.in6 : &iph->daddr,
-					    hp->source,
-					    tgi->lport ? tgi->lport : hp->dest,
-					    skb->dev, NFT_LOOKUP_LISTENER);
-		if (sk2) {
-			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
-			inet_twsk_put(inet_twsk(sk));
-			sk = sk2;
-		}
-	}
-
-	return sk;
-}
-
 static unsigned int
 tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
 	   u_int32_t mark_mask, u_int32_t mark_value)
@@ -150,6 +122,10 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
 				   hp->source, hp->dest,
 				   skb->dev, NFT_LOOKUP_ESTABLISHED);
 
+	laddr = tproxy_laddr4(skb, laddr, iph->daddr);
+	if (!lport)
+		lport = hp->dest;
+
 	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
 	if (sk && sk->sk_state == TCP_TIME_WAIT)
 		/* reopening a TIME_WAIT connection needs special handling */
@@ -158,8 +134,8 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
 		/* no, there's no established connection, check if
 		 * there's a listener on the redirected addr/port */
 		sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol,
-					   iph->saddr, laddr ? laddr : iph->daddr,
-					   hp->source, lport ? lport : hp->dest,
+					   iph->saddr, laddr,
+					   hp->source, lport,
 					   skb->dev, NFT_LOOKUP_LISTENER);
 
 	/* NOTE: assign_sock consumes our sk reference */
@@ -174,9 +150,9 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport,
 		return NF_ACCEPT;
 	}
 
-	pr_debug("no socket, dropping: proto %u %08x:%u -> %08x:%u, mark: %x\n",
-		 iph->protocol, ntohl(iph->daddr), ntohs(hp->dest),
-		 ntohl(laddr), ntohs(lport), skb->mark);
+	pr_debug("no socket, dropping: proto %u %pI4:%u -> %pI4:%u, mark: %x\n",
+		 iph->protocol, &iph->saddr, ntohs(hp->source),
+		 &iph->daddr, ntohs(hp->dest), skb->mark);
 	return NF_DROP;
 }
 
@@ -197,6 +173,85 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par)
 }
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+
+static inline const struct in6_addr *
+tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr, const struct in6_addr *daddr)
+{
+	struct inet6_dev *indev;
+	struct inet6_ifaddr *ifa;
+	struct in6_addr *laddr;
+
+	if (!ipv6_addr_any(user_laddr))
+		return user_laddr;
+	laddr = NULL;
+
+	rcu_read_lock();
+	indev = __in6_dev_get(skb->dev);
+	if (indev)
+		list_for_each_entry(ifa, &indev->addr_list, if_list) {
+			/* FIXME: address selection */
+			laddr = &ifa->addr;
+			break;
+		}
+	rcu_read_unlock();
+
+	return laddr ? laddr : daddr;
+}
+
+/**
+ * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections
+ * @skb:	The skb being processed.
+ * @tproto:	Transport protocol.
+ * @thoff:	Transport protocol header offset.
+ * @par:	Iptables target parameters.
+ * @sk:		The TIME_WAIT TCP socket found by the lookup.
+ *
+ * We have to handle SYN packets arriving to TIME_WAIT sockets
+ * differently: instead of reopening the connection we should rather
+ * redirect the new connection to the proxy if there's a listener
+ * socket present.
+ *
+ * tproxy_handle_time_wait6() consumes the socket reference passed in.
+ *
+ * Returns the listener socket if there's one, the TIME_WAIT socket if
+ * no such listener is found, or NULL if the TCP header is incomplete.
+ */
+static struct sock *
+tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
+			 const struct xt_action_param *par,
+			 struct sock *sk)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct tcphdr _hdr, *hp;
+	const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
+
+	hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+	if (hp == NULL) {
+		inet_twsk_put(inet_twsk(sk));
+		return NULL;
+	}
+
+	if (hp->syn && !hp->rst && !hp->ack && !hp->fin) {
+		/* SYN to a TIME_WAIT socket, we'd rather redirect it
+		 * to a listener socket if there's one */
+		struct sock *sk2;
+
+		sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+					    &iph->saddr,
+					    tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr),
+					    hp->source,
+					    tgi->lport ? tgi->lport : hp->dest,
+					    skb->dev, NFT_LOOKUP_LISTENER);
+		if (sk2) {
+			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+			inet_twsk_put(inet_twsk(sk));
+			sk = sk2;
+		}
+	}
+
+	return sk;
+}
+
 static unsigned int
 tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 {
@@ -204,6 +259,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 	const struct xt_tproxy_target_info_v1 *tgi = par->targinfo;
 	struct udphdr _hdr, *hp;
 	struct sock *sk;
+	const struct in6_addr *laddr;
+	__be16 lport;
 	int thoff;
 	int tproto;
 
@@ -228,6 +285,9 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 				   hp->source, hp->dest,
 				   par->in, NFT_LOOKUP_ESTABLISHED);
 
+	laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr);
+	lport = tgi->lport ? tgi->lport : hp->dest;
+
 	/* UDP has no TCP_TIME_WAIT state, so we never enter here */
 	if (sk && sk->sk_state == TCP_TIME_WAIT)
 		/* reopening a TIME_WAIT connection needs special handling */
@@ -236,10 +296,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 		/* no there's no established connection, check if
 		 * there's a listener on the redirected addr/port */
 		sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
-					   &iph->saddr,
-					   !ipv6_addr_any(&tgi->laddr.in6) ? &tgi->laddr.in6 : &iph->daddr,
-					   hp->source,
-					   tgi->lport ? tgi->lport : hp->dest,
+					   &iph->saddr, laddr,
+					   hp->source, lport,
 					   par->in, NFT_LOOKUP_LISTENER);
 
 	/* NOTE: assign_sock consumes our sk reference */
@@ -249,14 +307,14 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
 		skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value;
 
 		pr_debug("redirecting: proto %u %pI6:%u -> %pI6:%u, mark: %x\n",
-			 tproto, &iph->saddr, ntohs(hp->dest),
-			 &tgi->laddr.in6, ntohs(tgi->lport), skb->mark);
+			 tproto, &iph->saddr, ntohs(hp->source),
+			 laddr, ntohs(lport), skb->mark);
 		return NF_ACCEPT;
 	}
 
 	pr_debug("no socket, dropping: proto %u %pI6:%u -> %pI6:%u, mark: %x\n",
-		 tproto, &iph->saddr, ntohs(hp->dest),
-		 &tgi->laddr.in6, ntohs(tgi->lport), skb->mark);
+		 tproto, &iph->saddr, ntohs(hp->source),
+		 &iph->daddr, ntohs(hp->dest), skb->mark);
 	return NF_DROP;
 }
 



^ permalink raw reply related

* [PATCH 8/9] tproxy: added IPv6 support to the socket match
From: KOVACS Krisztian @ 2010-10-20 11:21 UTC (permalink / raw)
  To: netdev, netfilter-devel; +Cc: Patrick McHardy, David Miller
In-Reply-To: <20101020112118.6260.31618.stgit@este.odu>

From: Balazs Scheidler <bazsi@balabit.hu>

The ICMP extraction bits were contributed by Harry Mason.

Signed-off-by: Balazs Scheidler <bazsi@balabit.hu>
Signed-off-by: KOVACS Krisztian <hidden@balabit.hu>
---
 net/netfilter/xt_socket.c |  165 ++++++++++++++++++++++++++++++++++++++++++---
 1 files changed, 154 insertions(+), 11 deletions(-)

diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 266faa0..1dc2784 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -14,6 +14,7 @@
 #include <linux/skbuff.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
 #include <net/tcp.h>
 #include <net/udp.h>
 #include <net/icmp.h>
@@ -21,6 +22,7 @@
 #include <net/inet_sock.h>
 #include <net/netfilter/nf_tproxy_core.h>
 #include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 
 #include <linux/netfilter/xt_socket.h>
 
@@ -30,7 +32,7 @@
 #endif
 
 static int
-extract_icmp_fields(const struct sk_buff *skb,
+extract_icmp4_fields(const struct sk_buff *skb,
 		    u8 *protocol,
 		    __be32 *raddr,
 		    __be32 *laddr,
@@ -86,7 +88,6 @@ extract_icmp_fields(const struct sk_buff *skb,
 	return 0;
 }
 
-
 static bool
 socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 	     const struct xt_socket_mtinfo1 *info)
@@ -115,7 +116,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 		dport = hp->dest;
 
 	} else if (iph->protocol == IPPROTO_ICMP) {
-		if (extract_icmp_fields(skb, &protocol, &saddr, &daddr,
+		if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr,
 					&sport, &dport))
 			return false;
 	} else {
@@ -165,32 +166,157 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
 			sk = NULL;
 	}
 
-	pr_debug("proto %u %08x:%u -> %08x:%u (orig %08x:%u) sock %p\n",
-		 protocol, ntohl(saddr), ntohs(sport),
-		 ntohl(daddr), ntohs(dport),
-		 ntohl(iph->daddr), hp ? ntohs(hp->dest) : 0, sk);
+	pr_debug("proto %u %pI4:%u -> %pI4:%u (orig %pI4:%u) sock %p\n",
+		 protocol, &saddr, ntohs(sport),
+		 &daddr, ntohs(dport),
+		 &iph->daddr, hp ? ntohs(hp->dest) : 0, sk);
 
 	return (sk != NULL);
 }
 
 static bool
-socket_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
+socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	return socket_match(skb, par, NULL);
 }
 
 static bool
-socket_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+socket_mt4_v1(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	return socket_match(skb, par, par->matchinfo);
 }
 
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+
+static int
+extract_icmp6_fields(const struct sk_buff *skb,
+		     unsigned int outside_hdrlen,
+		     u8 *protocol,
+		     struct in6_addr **raddr,
+		     struct in6_addr **laddr,
+		     __be16 *rport,
+		     __be16 *lport)
+{
+	struct ipv6hdr *inside_iph, _inside_iph;
+	struct icmp6hdr *icmph, _icmph;
+	__be16 *ports, _ports[2];
+	u8 inside_nexthdr;
+	int inside_hdrlen;
+
+	icmph = skb_header_pointer(skb, outside_hdrlen,
+				   sizeof(_icmph), &_icmph);
+	if (icmph == NULL)
+		return 1;
+
+	if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK)
+		return 1;
+
+	inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph), sizeof(_inside_iph), &_inside_iph);
+	if (inside_iph == NULL)
+		return 1;
+	inside_nexthdr = inside_iph->nexthdr;
+
+	inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + sizeof(_inside_iph), &inside_nexthdr);
+	if (inside_hdrlen < 0)
+		return 1; /* hjm: Packet has no/incomplete transport layer headers. */
+
+	if (inside_nexthdr != IPPROTO_TCP &&
+	    inside_nexthdr != IPPROTO_UDP)
+		return 1;
+
+	ports = skb_header_pointer(skb, inside_hdrlen,
+				   sizeof(_ports), &_ports);
+	if (ports == NULL)
+		return 1;
+
+	/* the inside IP packet is the one quoted from our side, thus
+	 * its saddr is the local address */
+	*protocol = inside_nexthdr;
+	*laddr = &inside_iph->saddr;
+	*lport = ports[0];
+	*raddr = &inside_iph->daddr;
+	*rport = ports[1];
+
+	return 0;
+}
+
+static bool
+socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct udphdr _hdr, *hp = NULL;
+	struct sock *sk;
+	struct in6_addr *daddr, *saddr;
+	__be16 dport, sport;
+	int thoff;
+	u8 tproto;
+	const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo;
+
+	tproto = ipv6_find_hdr(skb, &thoff, -1, NULL);
+	if (tproto < 0) {
+		pr_debug("unable to find transport header in IPv6 packet, dropping\n");
+		return NF_DROP;
+	}
+
+	if (tproto == IPPROTO_UDP || tproto == IPPROTO_TCP) {
+		hp = skb_header_pointer(skb, thoff,
+					sizeof(_hdr), &_hdr);
+		if (hp == NULL)
+			return false;
+
+		saddr = &iph->saddr;
+		sport = hp->source;
+		daddr = &iph->daddr;
+		dport = hp->dest;
+
+	} else if (tproto == IPPROTO_ICMPV6) {
+		if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr,
+					 &sport, &dport))
+			return false;
+	} else {
+		return false;
+	}
+
+	sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto,
+				   saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY);
+	if (sk != NULL) {
+		bool wildcard;
+		bool transparent = true;
+
+		/* Ignore sockets listening on INADDR_ANY */
+		wildcard = (sk->sk_state != TCP_TIME_WAIT &&
+			    ipv6_addr_any(&inet6_sk(sk)->rcv_saddr));
+
+		/* Ignore non-transparent sockets,
+		   if XT_SOCKET_TRANSPARENT is used */
+		if (info && info->flags & XT_SOCKET_TRANSPARENT)
+			transparent = ((sk->sk_state != TCP_TIME_WAIT &&
+					inet_sk(sk)->transparent) ||
+				       (sk->sk_state == TCP_TIME_WAIT &&
+					inet_twsk(sk)->tw_transparent));
+
+		nf_tproxy_put_sock(sk);
+
+		if (wildcard || !transparent)
+			sk = NULL;
+	}
+
+	pr_debug("proto %u %pI6:%u -> %pI6:%u "
+		 "(orig %pI6:%u) sock %p\n",
+		 tproto, saddr, ntohs(sport),
+		 daddr, ntohs(dport),
+		 &iph->daddr, hp ? ntohs(hp->dest) : 0, sk);
+
+	return (sk != NULL);
+}
+#endif
+
 static struct xt_match socket_mt_reg[] __read_mostly = {
 	{
 		.name		= "socket",
 		.revision	= 0,
 		.family		= NFPROTO_IPV4,
-		.match		= socket_mt_v0,
+		.match		= socket_mt4_v0,
 		.hooks		= (1 << NF_INET_PRE_ROUTING) |
 				  (1 << NF_INET_LOCAL_IN),
 		.me		= THIS_MODULE,
@@ -199,17 +325,33 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
 		.name		= "socket",
 		.revision	= 1,
 		.family		= NFPROTO_IPV4,
-		.match		= socket_mt_v1,
+		.match		= socket_mt4_v1,
 		.matchsize	= sizeof(struct xt_socket_mtinfo1),
 		.hooks		= (1 << NF_INET_PRE_ROUTING) |
 				  (1 << NF_INET_LOCAL_IN),
 		.me		= THIS_MODULE,
 	},
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	{
+		.name		= "socket",
+		.revision	= 1,
+		.family		= NFPROTO_IPV6,
+		.match		= socket_mt6_v1,
+		.matchsize	= sizeof(struct xt_socket_mtinfo1),
+		.hooks		= (1 << NF_INET_PRE_ROUTING) |
+				  (1 << NF_INET_LOCAL_IN),
+		.me		= THIS_MODULE,
+	},
+#endif
 };
 
 static int __init socket_mt_init(void)
 {
 	nf_defrag_ipv4_enable();
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	nf_defrag_ipv6_enable();
+#endif
+
 	return xt_register_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg));
 }
 
@@ -225,3 +367,4 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler");
 MODULE_DESCRIPTION("x_tables socket match module");
 MODULE_ALIAS("ipt_socket");
+MODULE_ALIAS("ip6t_socket");



^ permalink raw reply related

* [PATCH 3/9] tproxy: added udp6_lib_lookup function
From: KOVACS Krisztian @ 2010-10-20 11:21 UTC (permalink / raw)
  To: netdev, netfilter-devel; +Cc: Patrick McHardy, David Miller
In-Reply-To: <20101020112118.6260.31618.stgit@este.odu>

From: Balazs Scheidler <bazsi@balabit.hu>

Just like with IPv4, we need access to the UDP hash table to look up local
sockets, but instead of exporting the global udp_table, export a lookup
function.

Signed-off-by: Balazs Scheidler <bazsi@balabit.hu>
Signed-off-by: KOVACS Krisztian <hidden@balabit.hu>
---
 include/net/udp.h |    3 +++
 net/ipv6/udp.c    |    8 ++++++++
 2 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/include/net/udp.h b/include/net/udp.h
index a184d34..200b828 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -183,6 +183,9 @@ extern int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 extern struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
 				    __be32 daddr, __be16 dport,
 				    int dif);
+extern struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
+				    const struct in6_addr *daddr, __be16 dport,
+				    int dif);
 
 /*
  * 	SNMP statistics for UDP and UDP-Lite
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 33e3683..c84dad4 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -320,6 +320,14 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
 				 udptable);
 }
 
+struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
+			     const struct in6_addr *daddr, __be16 dport, int dif)
+{
+	return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
+}
+EXPORT_SYMBOL_GPL(udp6_lib_lookup);
+
+
 /*
  * 	This should be easy, if there is something there we
  * 	return it, otherwise we block.



^ permalink raw reply related

* [PATCH 2/9] tproxy: added const specifiers to udp lookup functions
From: KOVACS Krisztian @ 2010-10-20 11:21 UTC (permalink / raw)
  To: netdev, netfilter-devel; +Cc: Patrick McHardy, David Miller
In-Reply-To: <20101020112118.6260.31618.stgit@este.odu>

From: Balazs Scheidler <bazsi@balabit.hu>

The parameters for various UDP lookup functions were non-const, even though
they could be const. TProxy has some const references and instead of
downcasting it, I added const specifiers along the path.

Signed-off-by: Balazs Scheidler <bazsi@balabit.hu>
Signed-off-by: KOVACS Krisztian <hidden@balabit.hu>
---
 net/ipv6/udp.c |    8 ++++----
 1 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 5acb356..33e3683 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -122,8 +122,8 @@ static void udp_v6_rehash(struct sock *sk)
 
 static inline int compute_score(struct sock *sk, struct net *net,
 				unsigned short hnum,
-				struct in6_addr *saddr, __be16 sport,
-				struct in6_addr *daddr, __be16 dport,
+				const struct in6_addr *saddr, __be16 sport,
+				const struct in6_addr *daddr, __be16 dport,
 				int dif)
 {
 	int score = -1;
@@ -239,8 +239,8 @@ exact_match:
 }
 
 static struct sock *__udp6_lib_lookup(struct net *net,
-				      struct in6_addr *saddr, __be16 sport,
-				      struct in6_addr *daddr, __be16 dport,
+				      const struct in6_addr *saddr, __be16 sport,
+				      const struct in6_addr *daddr, __be16 dport,
 				      int dif, struct udp_table *udptable)
 {
 	struct sock *sk, *result;



^ permalink raw reply related

* [PATCH 0/9] tproxy: add IPv6 support
From: KOVACS Krisztian @ 2010-10-20 11:21 UTC (permalink / raw)
  To: netdev, netfilter-devel; +Cc: Patrick McHardy, David Miller

The following series adds IPv6 support for tproxy. The parts touching
non-Netfilter code include exporting the UDP lookup function, adding the
sockopt infrastructure for getting the original destination address and
allowing non-local binds if the IP_TRANSPARENT socket option is set.

Netfilter changes are splitting the defragmentation code off of conntrack,
adding IPv6 socket lookup helpers to the tproxy core module and updating the
socket match and the TPROXY target.

The last patch in the series tries to make it easier to use the TPROXY target
by selecting a meaningful address to redirect to in case the user did not
explicitly specify it with '--on-ip'.

---

Balazs Scheidler (9):
      tproxy: split off ipv6 defragmentation to a separate module
      tproxy: added const specifiers to udp lookup functions
      tproxy: added udp6_lib_lookup function
      tproxy: added tproxy sockopt interface in the IPV6 layer
      tproxy: allow non-local binds of IPv6 sockets if IP_TRANSPARENT is enabled
      tproxy: added IPv6 socket lookup function to nf_tproxy_core
      tproxy: added IPv6 support to the TPROXY target
      tproxy: added IPv6 support to the socket match
      tproxy: use the interface primary IP address as a default value for --on-ip


 include/linux/in6.h                            |    4 
 include/linux/ipv6.h                           |    4 
 include/linux/netfilter/xt_TPROXY.h            |   15 +
 include/net/netfilter/ipv6/nf_defrag_ipv6.h    |    6 
 include/net/netfilter/nf_tproxy_core.h         |   72 +++++
 include/net/udp.h                              |    3 
 net/ipv6/af_inet6.c                            |    2 
 net/ipv6/datagram.c                            |   19 +
 net/ipv6/ipv6_sockglue.c                       |   23 ++
 net/ipv6/netfilter/Makefile                    |    5 
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c |   78 ------
 net/ipv6/netfilter/nf_conntrack_reasm.c        |   12 +
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c      |  131 ++++++++++
 net/ipv6/udp.c                                 |   16 +
 net/netfilter/xt_TPROXY.c                      |  324 +++++++++++++++++++++---
 net/netfilter/xt_socket.c                      |  165 +++++++++++-
 16 files changed, 740 insertions(+), 139 deletions(-)
 create mode 100644 include/net/netfilter/ipv6/nf_defrag_ipv6.h
 create mode 100644 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c

-- 
KOVACS Krisztian


^ permalink raw reply

* Re: tbf/htb qdisc limitations
From: Jarek Poplawski @ 2010-10-20 11:06 UTC (permalink / raw)
  To: Bill Fink; +Cc: Eric Dumazet, Rick Jones, Steven Brudenell, netdev
In-Reply-To: <20101019033724.0bffbe88.billfink@mindspring.com>

On Tue, Oct 19, 2010 at 03:37:24AM -0400, Bill Fink wrote:
> On Sun, 17 Oct 2010, Jarek Poplawski wrote:
> 
> > On Sat, Oct 16, 2010 at 09:24:34PM -0400, Bill Fink wrote:
> > > On Sat, 16 Oct 2010, Jarek Poplawski wrote:
> > ...
> > > > http://code.google.com/p/pspacer/wiki/HTBon10GbE
> > > >  
> > > > If it doesn't help reconsider hfsc.
> > > 
> > > Thanks for the link.  From his results, it appears you can
> > > get better accuracy by keeping TSO/GSO enabled and upping
> > > the tc mtu parameter to 64000.  I will have to try that out.
> > 
> > Sure, but you have to remember that scheduler doesn't know real packet
> > sizes and rate tables are less accurate especially for smaller packets,
> > so it depends on conditions.
> 
> On my testing on the real data path, TSO/GSO enabled did seem
> to give more accurate results for a single stream.  But when
> I tried multiple 10-GigE paths simultaneously, each with a
> single stream across it, non-TSO/GSO seemed to fare better
> overall.

Btw, if you find time I would be interested in checking an opposite
concept of lower than real mtu (256) to use rate tables different way
(other tbf parameters without change). The patch below is needed for
this to work.

Thanks,
Jarek P.
---

diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 641a30d..9ac3460 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -123,9 +123,6 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 	struct tbf_sched_data *q = qdisc_priv(sch);
 	int ret;
 
-	if (qdisc_pkt_len(skb) > q->max_size)
-		return qdisc_reshape_fail(skb, sch);
-
 	ret = qdisc_enqueue(skb, q->qdisc);
 	if (ret != NET_XMIT_SUCCESS) {
 		if (net_xmit_drop_count(ret))

^ permalink raw reply related

* Re: [PATCH 1/2] Remove netpoll blocking from uninit path
From: Neil Horman @ 2010-10-20 10:51 UTC (permalink / raw)
  To: David Miller; +Cc: amwang, netdev, bonding-devel, fubar, andy
In-Reply-To: <20101020.014538.193720733.davem@davemloft.net>

On Wed, Oct 20, 2010 at 01:45:38AM -0700, David Miller wrote:
> From: Cong Wang <amwang@redhat.com>
> Date: Wed, 20 Oct 2010 15:47:11 +0800
> 
> > On 10/20/10 01:04, nhorman@tuxdriver.com wrote:
> >> From: Neil Horman<nhorman@tuxdriver.com>
> >>
> >> Some recent testing in netpoll with bonding showed this backtrace
> ...
> >> Signed-off-by: Neil Horman<nhorman@tuxdriver.com>
> > 
> > Reviewed-by: WANG Cong <amwang@redhat.com>
> 
> Applied.
> 
> Neil, please add proper subsystem prefixes to your subject
> lines, I've had to add them by hand as I add your patches.
> 
Copy that, apologies.

Thanks!
Neil

> Thanks.
> 

^ permalink raw reply

* Re: RFC: Crypto API User-interface
From: Nikos Mavrogiannopoulos @ 2010-10-20 10:24 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Linux Crypto Mailing List, netdev, Linux Kernel Mailing List,
	Cryptodev-linux-devel
In-Reply-To: <20101019134418.GA13514@gondor.apana.org.au>

On Tue, Oct 19, 2010 at 3:44 PM, Herbert Xu
<herbert@gondor.hengli.com.au> wrote:
> OK I've gone ahead and implemented the user-space API for hashes
> and ciphers.
> To recap this interface is designed to allow user-space programs
> to access hardware cryptographic accelerators that we have added
> to the kernel.
> The intended usage scenario is where a large amount of data needs
> to be processed where the benefits offered by hardware acceleration
> that is normally unavailable in user-space (as opposed to ones
> such as the Intel AES instruction which may be used directly from
> user-space) outweigh the overhead of going through the kernel.

What is the overall advantage of this API comparing to other existing
ones that achieve similar goals[0][1]?

Some observations:
1. To perform an encryption of data 6 system calls are made (I don't
count the 2 used for socket initialization since I suppose can be global
for all operations) and a file descriptor is assigned. The number of
system calls
made has great impact to the actual speed seen by userspace (as you said this
API is for user-space to access the high-speed peripherals that do encryption).
2. Due to the usage of read() and write() no zero-copy can happen for
user-space buffers[3].

regards,
Nikos

[0]. http://home.gna.org/cryptodev-linux/
[1]. http://home.gna.org/cryptodev-linux/ncr.html
[2]. The openbsd[0] api can do it with 3 system calls and NCR[1] with one,
and both require no file descriptor for each operation.
[3]. The openbsd[0] api and NCR[1] do zero-copy for user-space buffers.

^ permalink raw reply

* Why RTM_NEWADDR is sent before FIB update?
From: Timo Teräs @ 2010-10-20 10:23 UTC (permalink / raw)
  To: netdev

Hi,

I'm wonder why does devinet.c:__inet_insert_ifa() (and other places)
send first the RTM_NEWADDR notification and only after that calls
inetaddr_chain notifiers?

The reason I'm asking that this gives a race condition to user land:
 1. process A changes IP address
 2. kernel sends RTM_NEWADDR
 3. process B gets notification
 4. process B tries to bind() to new IP but that fails with
EADDRNOTAVAIL because FIB is not yet updated and inet_addr_type() in
inet_bind() does not recognize the IP as local
 5. kernel calls inetaddr_chain notifiers which updates FIB

My understanding was that RTM_NEWADDR was notification about new address
being usable. But it's currently a notification about "new address will
be usable soon".

So should we:
 a) call first notifiers and after that send RTM_NEWADDR?
 b) synchronise inet_bind() with address changes somehow?
 c) live with the bad semantics of the notification in userland
 d) ??

- Timo

^ permalink raw reply

* Re: [PATCH 0/2] can: mcp251x: fix error handling and error frames bug
From: Marc Kleine-Budde @ 2010-10-20 10:16 UTC (permalink / raw)
  To: Wolfgang Grandegger
  Cc: socketcan-core-0fE9KPoRgkgATYTw5x5z8w,
	netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <4CBEC06D.6000102-5Yr1BZd7O62+XT7JhA+gdA@public.gmane.org>


[-- Attachment #1.1: Type: text/plain, Size: 798 bytes --]

On 10/20/2010 12:11 PM, Wolfgang Grandegger wrote:
>> this series of patches fixes two bugs, one introduced by me in the other 
>> mcp251x series.
>>
>> Please test, especially the error handling, i.e. wrong bitrate, shortcut in
>> CAN bus. I'm also interested in tests with non i.MX SPI can cores.
> 
> You can add my Acked-by but more Tested-by's are definitely more useful.

Thanks and ACK :)

> Unfortunately, I do not have that hardware at hand.

I'm sure Jargalan is going to test it.

cheers, Marc

-- 
Pengutronix e.K.                  | Marc Kleine-Budde           |
Industrial Linux Solutions        | Phone: +49-231-2826-924     |
Vertretung West/Dortmund          | Fax:   +49-5121-206917-5555 |
Amtsgericht Hildesheim, HRA 2686  | http://www.pengutronix.de   |


[-- Attachment #1.2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 262 bytes --]

[-- Attachment #2: Type: text/plain, Size: 188 bytes --]

_______________________________________________
Socketcan-core mailing list
Socketcan-core-0fE9KPoRgkgATYTw5x5z8w@public.gmane.org
https://lists.berlios.de/mailman/listinfo/socketcan-core

^ permalink raw reply

* Re: [PATCH 0/2] can: mcp251x: fix error handling and error frames bug
From: Wolfgang Grandegger @ 2010-10-20 10:11 UTC (permalink / raw)
  To: Marc Kleine-Budde
  Cc: socketcan-core-0fE9KPoRgkgATYTw5x5z8w,
	netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1287568946-32727-1-git-send-email-mkl-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>

On 10/20/2010 12:02 PM, Marc Kleine-Budde wrote:
> Hello,
> 
> this series of patches fixes two bugs, one introduced by me in the other 
> mcp251x series.
> 
> Please test, especially the error handling, i.e. wrong bitrate, shortcut in
> CAN bus. I'm also interested in tests with non i.MX SPI can cores.

You can add my Acked-by but more Tested-by's are definitely more useful.
Unfortunately, I do not have that hardware at hand.

Wolfgang.

^ permalink raw reply

* Re: [PATCH net-next] net: avoid RCU for NOCACHE dst
From: David Miller @ 2010-10-20 10:03 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1287157451.2647.126.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 15 Oct 2010 17:44:11 +0200

> There is no point using RCU for dst we allocate for a very short time
> (used once).
> 
> Change dst_release() to take DST_NOCACHE into account, but also change
> skb_dst_set_noref() to force a refcount increment for such dst.
> 
> This is a _huge_ gain, because we dont waste memory to store xx thousand
> of dsts. Instead of queueing them to RCU, we can free them instantly. 
 ...
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Wonderful, we are now faster without the routing cache. :-)

Applied.

^ permalink raw reply

* [PATCH 1/2] can: mcp251x: fix endless loop in interrupt handler if CANINTF_MERRF is set
From: Marc Kleine-Budde @ 2010-10-20 10:02 UTC (permalink / raw)
  To: socketcan-core; +Cc: netdev, Marc Kleine-Budde
In-Reply-To: <1287568946-32727-1-git-send-email-mkl@pengutronix.de>

Commit d3cd15657516141adce387810be8cb377baf020e introduced a bug, the
interrupt handler would loop endlessly if the CANINTF_MERRF bit is set,
because it's not cleared.

This patch fixes the problem by masking out the CANINTF_MERRF and all other
non interesting bits.

Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/mcp251x.c |   14 +++++++++-----
 1 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/net/can/mcp251x.c b/drivers/net/can/mcp251x.c
index c664be2..59f40bc 100644
--- a/drivers/net/can/mcp251x.c
+++ b/drivers/net/can/mcp251x.c
@@ -125,8 +125,9 @@
 #  define CANINTF_TX0IF 0x04
 #  define CANINTF_RX1IF 0x02
 #  define CANINTF_RX0IF 0x01
-#  define CANINTF_ERR_TX \
-	(CANINTF_ERRIF | CANINTF_TX2IF | CANINTF_TX1IF | CANINTF_TX0IF)
+#  define CANINTF_RX (CANINTF_RX0IF | CANINTF_RX1IF)
+#  define CANINTF_TX (CANINTF_TX2IF | CANINTF_TX1IF | CANINTF_TX0IF)
+#  define CANINTF_ERR (CANINTF_ERRIF)
 #define EFLG	      0x2d
 #  define EFLG_EWARN	0x01
 #  define EFLG_RXWAR	0x02
@@ -790,6 +791,9 @@ static irqreturn_t mcp251x_can_ist(int irq, void *dev_id)
 
 		mcp251x_read_2regs(spi, CANINTF, &intf, &eflag);
 
+		/* mask out flags we don't care about */
+		intf &= CANINTF_RX | CANINTF_TX | CANINTF_ERR;
+
 		/* receive buffer 0 */
 		if (intf & CANINTF_RX0IF) {
 			mcp251x_hw_rx(spi, 0);
@@ -810,8 +814,8 @@ static irqreturn_t mcp251x_can_ist(int irq, void *dev_id)
 		}
 
 		/* any error or tx interrupt we need to clear? */
-		if (intf & CANINTF_ERR_TX)
-			clear_intf |= intf & CANINTF_ERR_TX;
+		if (intf & (CANINTF_ERR | CANINTF_TX))
+			clear_intf |= intf & (CANINTF_ERR | CANINTF_TX);
 		if (clear_intf)
 			mcp251x_write_bits(spi, CANINTF, clear_intf, 0x00);
 
@@ -887,7 +891,7 @@ static irqreturn_t mcp251x_can_ist(int irq, void *dev_id)
 		if (intf == 0)
 			break;
 
-		if (intf & (CANINTF_TX2IF | CANINTF_TX1IF | CANINTF_TX0IF)) {
+		if (intf & CANINTF_TX) {
 			net->stats.tx_packets++;
 			net->stats.tx_bytes += priv->tx_len - 1;
 			if (priv->tx_len) {
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 2/2] can: mcp251x: fix generation of error frames
From: Marc Kleine-Budde @ 2010-10-20 10:02 UTC (permalink / raw)
  To: socketcan-core-0fE9KPoRgkgATYTw5x5z8w
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, Marc Kleine-Budde
In-Reply-To: <1287568946-32727-1-git-send-email-mkl-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>

The function "mcp251x_error_skb" is used to generate error frames.
They are identified by the "CAN_ERR_FLAG" in can_id. The function
overwrites the can_id so that the frames show up as normal frames instead
of error frames.

This patch fixes the problem by or'ing the can_id instead of overwriting it.

Signed-off-by: Marc Kleine-Budde <mkl-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>
Tested-by: Jargalan Nermunkh <jargalan.nermunkh-wZX4cNJlHJ2sVWG7oymsAA@public.gmane.org>
---
 drivers/net/can/mcp251x.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/can/mcp251x.c b/drivers/net/can/mcp251x.c
index 59f40bc..6aadc3e 100644
--- a/drivers/net/can/mcp251x.c
+++ b/drivers/net/can/mcp251x.c
@@ -705,7 +705,7 @@ static void mcp251x_error_skb(struct net_device *net, int can_id, int data1)
 
 	skb = alloc_can_err_skb(net, &frame);
 	if (skb) {
-		frame->can_id = can_id;
+		frame->can_id |= can_id;
 		frame->data[1] = data1;
 		netif_rx_ni(skb);
 	} else {
-- 
1.7.0.4

^ permalink raw reply related

* [PATCH 0/2] can: mcp251x: fix error handling and error frames bug
From: Marc Kleine-Budde @ 2010-10-20 10:02 UTC (permalink / raw)
  To: socketcan-core-0fE9KPoRgkgATYTw5x5z8w; +Cc: netdev-u79uwXL29TY76Z2rM5mHXA

Hello,

this series of patches fixes two bugs, one introduced by me in the other 
mcp251x series.

Please test, especially the error handling, i.e. wrong bitrate, shortcut in
CAN bus. I'm also interested in tests with non i.MX SPI can cores.

cheers, Marc

^ permalink raw reply

* Re: [PATCH 0/3] net: RX and TX queue allocation cleanup
From: David Miller @ 2010-10-20  9:38 UTC (permalink / raw)
  To: therbert; +Cc: netdev, eric.dumazet, bhutchings
In-Reply-To: <20101020.023041.112586469.davem@davemloft.net>

From: David Miller <davem@davemloft.net>
Date: Wed, 20 Oct 2010 02:30:41 -0700 (PDT)

> From: Tom Herbert <therbert@google.com>
> Date: Mon, 18 Oct 2010 11:02:06 -0700 (PDT)
> 
>> Move TX queue allocation to register_netdevice.  Enforce
>> alloc_netdev_mq & netif_set_real_num_[rt]x_queues netif_alloc_rx_queues
>> are called with at least one queue to allocate.
> 
> All applied, and I fixed the indentation issue Eric noticed
> while integrating.

Just to be clear, I did make sure I used the "v2" series
of the patches, not the original. :-)


^ permalink raw reply

* Re: [PATCH 0/3] net: RX and TX queue allocation cleanup
From: David Miller @ 2010-10-20  9:30 UTC (permalink / raw)
  To: therbert; +Cc: netdev, eric.dumazet, bhutchings
In-Reply-To: <alpine.DEB.1.00.1010181053200.15812@pokey.mtv.corp.google.com>

From: Tom Herbert <therbert@google.com>
Date: Mon, 18 Oct 2010 11:02:06 -0700 (PDT)

> Move TX queue allocation to register_netdevice.  Enforce
> alloc_netdev_mq & netif_set_real_num_[rt]x_queues netif_alloc_rx_queues
> are called with at least one queue to allocate.

All applied, and I fixed the indentation issue Eric noticed
while integrating.

Thanks.

^ permalink raw reply

* Re: SCTP AUTO-ASCONF patch
From: Shan Wei @ 2010-10-20  9:30 UTC (permalink / raw)
  To: Michio Honda; +Cc: Vlad Yasevich, netdev, hadi
In-Reply-To: <EB46A57B-530A-4F52-8A29-120192604816@sfc.wide.ad.jp>

Michio Honda wrote, at 10/12/2010 08:27 PM:
> Hi, 
> 
> I'm resubmitting a patch to enable AUTO_ASCONF for Linux SCTP. (version is for 2.6.36-rc7).

For so big patch, please provide more documents for reviewing. :;)
See some trivial comment in the context.

> 
> Thanks,
> - Michio
> 
> Only in linux-2.6/include: asm-arm
> Only in linux-2.6/include: asm-mn10300
> diff -ru -x '\.git' -x arch -x drivers -x fs -x asm -x Documentation -p linux-2.6.orig/include/linux/sysctl.h linux-2.6/include/linux/sysctl.h
> --- linux-2.6.orig/include/linux/sysctl.h	2010-10-11 08:24:33.000000000 +0900
> +++ linux-2.6/include/linux/sysctl.h	2010-10-11 07:21:40.000000000 +0900
> @@ -767,6 +767,7 @@ enum {
>  	NET_SCTP_SNDBUF_POLICY		 = 15,
>  	NET_SCTP_SACK_TIMEOUT		 = 16,
>  	NET_SCTP_RCVBUF_POLICY		 = 17,
> +	NET_SCTP_AUTO_ASCONF_ENABLE	 = 18,
>  };
>  
>  /* /proc/sys/net/bridge */
> diff -ru -x '\.git' -x arch -x drivers -x fs -x asm -x Documentation -p linux-2.6.orig/include/net/sctp/sctp.h linux-2.6/include/net/sctp/sctp.h
> --- linux-2.6.orig/include/net/sctp/sctp.h	2010-10-11 08:24:33.000000000 +0900
> +++ linux-2.6/include/net/sctp/sctp.h	2010-10-11 07:21:40.000000000 +0900
> @@ -121,6 +121,8 @@ extern int sctp_copy_local_addr_list(str
>  				     int flags);
>  extern struct sctp_pf *sctp_get_pf_specific(sa_family_t family);
>  extern int sctp_register_pf(struct sctp_pf *, sa_family_t);
> +void sctp_addr_wq_mgmt(union sctp_addr *, int);
> +void sctp_path_check_and_react(struct sctp_association *, struct sockaddr *);
>  
>  /*
>   * sctp/socket.c
> @@ -135,6 +137,9 @@ void sctp_sock_rfree(struct sk_buff *skb
>  void sctp_copy_sock(struct sock *newsk, struct sock *sk,
>  		    struct sctp_association *asoc);
>  extern struct percpu_counter sctp_sockets_allocated;
> +int sctp_asconf_mgmt(struct sctp_endpoint *, struct sock *sk);
> +void sctp_add_addr_to_laddr(struct sockaddr *, struct sctp_association *);
> +void sctp_trans_immediate_retrans(struct sctp_transport *);
>  
>  /*
>   * sctp/primitive.c
> diff -ru -x '\.git' -x arch -x drivers -x fs -x asm -x Documentation -p linux-2.6.orig/include/net/sctp/sm.h linux-2.6/include/net/sctp/sm.h
> --- linux-2.6.orig/include/net/sctp/sm.h	2010-10-11 08:24:33.000000000 +0900
> +++ linux-2.6/include/net/sctp/sm.h	2010-10-11 07:21:40.000000000 +0900
> @@ -295,6 +295,8 @@ int sctp_addip_addr_config(struct sctp_a
>  __u32 sctp_generate_tag(const struct sctp_endpoint *);
>  __u32 sctp_generate_tsn(const struct sctp_endpoint *);
>  
> +void sctp_path_check_and_react(struct sctp_association *, struct sockaddr *);
> +
>  /* Extern declarations for major data structures.  */
>  extern sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES];
>  
> diff -ru -x '\.git' -x arch -x drivers -x fs -x asm -x Documentation -p linux-2.6.orig/include/net/sctp/structs.h linux-2.6/include/net/sctp/structs.h
> --- linux-2.6.orig/include/net/sctp/structs.h	2010-10-11 08:24:33.000000000 +0900
> +++ linux-2.6/include/net/sctp/structs.h	2010-10-11 07:21:40.000000000 +0900
> @@ -205,6 +205,10 @@ extern struct sctp_globals {
>  	 * It is a list of sctp_sockaddr_entry.
>  	 */
>  	struct list_head local_addr_list;
> +	int auto_asconf_enable;
> +	struct list_head addr_waitq;
> +	struct timer_list addr_wq_timer;
> +	spinlock_t addr_wq_lock;
>  
>  	/* Lock that protects the local_addr_list writers */
>  	spinlock_t addr_list_lock;
> @@ -265,6 +269,10 @@ extern struct sctp_globals {
>  #define sctp_port_alloc_lock		(sctp_globals.port_alloc_lock)
>  #define sctp_port_hashtable		(sctp_globals.port_hashtable)
>  #define sctp_local_addr_list		(sctp_globals.local_addr_list)
> +#define sctp_addr_waitq			(sctp_globals.addr_waitq)

be align.

> +#define sctp_addr_wq_timer		(sctp_globals.addr_wq_timer)
> +#define sctp_addr_wq_lock		(sctp_globals.addr_wq_lock)
> +#define sctp_auto_asconf_enable		(sctp_globals.auto_asconf_enable)

be align.

>  #define sctp_local_addr_lock		(sctp_globals.addr_list_lock)
>  #define sctp_scope_policy		(sctp_globals.ipv4_scope_policy)
>  #define sctp_addip_enable		(sctp_globals.addip_enable)
> @@ -798,6 +806,16 @@ struct sctp_sockaddr_entry {
>  	__u8 valid;
>  };
>  
> +#define SCTP_NEWADDR	1
> +#define SCTP_DELADDR	2
> +#define SCTP_ADDRESS_TICK_DELAY	500
> +struct sctp_addr_wait {
> +	struct list_head list;
> +	struct rcu_head rcu;
> +	union sctp_addr a;
> +	int	cmd;
> +};
> +
>  typedef struct sctp_chunk *(sctp_packet_phandler_t)(struct sctp_association *);
>  
>  /* This structure holds lists of chunks as we are assembling for
> @@ -1241,6 +1259,7 @@ sctp_scope_t sctp_scope(const union sctp
>  int sctp_in_scope(const union sctp_addr *addr, const sctp_scope_t scope);
>  int sctp_is_any(struct sock *sk, const union sctp_addr *addr);
>  int sctp_addr_is_valid(const union sctp_addr *addr);
> +int sctp_is_ep_boundall(struct sock *sk);
>  
>  
>  /* What type of endpoint?  */
> @@ -1903,6 +1922,11 @@ struct sctp_association {
>  	 * after reaching 4294967295.
>  	 */
>  	__u32 addip_serial;
> +	/* list of valid address in association local */
> +	struct list_head asoc_laddr_list; 
> +	union sctp_addr *asconf_addr_del_pending;
> +	__u32 asconf_del_pending_cid;
> +	int src_out_of_asoc_ok;
>  
>  	/* SCTP AUTH: list of the endpoint shared keys.  These
>  	 * keys are provided out of band by the user applicaton
> diff -ru -x '\.git' -x arch -x drivers -x fs -x asm -x Documentation -p linux-2.6.orig/kernel/sysctl_check.c linux-2.6/kernel/sysctl_check.c
> --- linux-2.6.orig/kernel/sysctl_check.c	2010-04-22 17:55:41.000000000 +0900
> +++ linux-2.6/kernel/sysctl_check.c	2010-04-21 18:40:22.000000000 +0900
> @@ -5,7 +5,6 @@
>  #include <linux/string.h>
>  #include <net/ip_vs.h>
>  
> -

an redundant change.

>  static int sysctl_depth(struct ctl_table *table)
>  {
>  	struct ctl_table *tmp;
> diff -ru -x '\.git' -x arch -x drivers -x fs -x asm -x Documentation -p linux-2.6.orig/net/sctp/associola.c linux-2.6/net/sctp/associola.c
> --- linux-2.6.orig/net/sctp/associola.c	2010-10-11 08:24:34.000000000 +0900
> +++ linux-2.6/net/sctp/associola.c	2010-10-11 07:21:40.000000000 +0900
> @@ -278,6 +278,10 @@ static struct sctp_association *sctp_ass
>  	if (sctp_addip_noauth)
>  		asoc->peer.asconf_capable = 1;
>  
> +	asoc->asconf_addr_del_pending = NULL;
> +	asoc->asconf_del_pending_cid = 0;
> +	asoc->src_out_of_asoc_ok = 0;
> +	INIT_LIST_HEAD(&asoc->asoc_laddr_list);
>  	/* Create an input queue.  */
>  	sctp_inq_init(&asoc->base.inqueue);
>  	sctp_inq_set_th_handler(&asoc->base.inqueue, sctp_assoc_bh_rcv);
> @@ -444,6 +448,17 @@ void sctp_association_free(struct sctp_a
>  	/* Free any cached ASCONF_ACK chunk. */
>  	sctp_assoc_free_asconf_acks(asoc);
>  
> +	/* Free pending address space being deleted */
> +	if (asoc->asconf_addr_del_pending != NULL) 
> +		kfree(asoc->asconf_addr_del_pending);
> +	if (!list_empty(&asoc->asoc_laddr_list)) {
> +		struct sctp_sockaddr_entry *laddr = NULL;
> +		list_for_each_entry(laddr, &asoc->asoc_laddr_list, list) {
> +			list_del(&laddr->list);
> +			kfree(laddr);
> +		}
> +	}
> +
>  	/* Free any cached ASCONF chunk. */
>  	if (asoc->addip_last_asconf)
>  		sctp_chunk_free(asoc->addip_last_asconf);
> @@ -618,6 +633,7 @@ void sctp_assoc_rm_peer(struct sctp_asso
>  			if (!mod_timer(&active->T3_rtx_timer,
>  					jiffies + active->rto))
>  				sctp_transport_hold(active);
> +		active->flight_size += peer->flight_size;
>  	}
>  
>  	asoc->peer.transport_count--;
> diff -ru -x '\.git' -x arch -x drivers -x fs -x asm -x Documentation -p linux-2.6.orig/net/sctp/bind_addr.c linux-2.6/net/sctp/bind_addr.c
> --- linux-2.6.orig/net/sctp/bind_addr.c	2010-04-22 17:55:41.000000000 +0900
> +++ linux-2.6/net/sctp/bind_addr.c	2010-06-24 02:33:20.000000000 +0900
> @@ -536,6 +536,24 @@ int sctp_in_scope(const union sctp_addr 
>  	return 0;
>  }
>  
> +int sctp_is_ep_boundall(struct sock *sk)
> +{
> +	struct sctp_bind_addr *bp;
> +	struct sctp_sockaddr_entry *addr;
> +
> +	if (!sk) 
> +		return 0;
> +       
> +	bp = &sctp_sk(sk)->ep->base.bind_addr;
> +	if (sctp_list_single_entry(&bp->address_list)) {
> +		addr = list_entry(bp->address_list.next,
> +				  struct sctp_sockaddr_entry, list);
> +		if (sctp_is_any(sk, &addr->a)) 
> +			return 1;
> +	}
> +	return 0;
> +}
> +
>  /********************************************************************
>   * 3rd Level Abstractions
>   ********************************************************************/
> diff -ru -x '\.git' -x arch -x drivers -x fs -x asm -x Documentation -p linux-2.6.orig/net/sctp/ipv6.c linux-2.6/net/sctp/ipv6.c
> --- linux-2.6.orig/net/sctp/ipv6.c	2010-10-11 08:24:34.000000000 +0900
> +++ linux-2.6/net/sctp/ipv6.c	2010-10-11 07:21:40.000000000 +0900
> @@ -103,6 +103,7 @@ static int sctp_inet6addr_event(struct n
>  			addr->valid = 1;
>  			spin_lock_bh(&sctp_local_addr_lock);
>  			list_add_tail_rcu(&addr->list, &sctp_local_addr_list);
> +			sctp_addr_wq_mgmt(&addr->a, SCTP_NEWADDR);
>  			spin_unlock_bh(&sctp_local_addr_lock);
>  		}
>  		break;
> @@ -113,6 +114,7 @@ static int sctp_inet6addr_event(struct n
>  			if (addr->a.sa.sa_family == AF_INET6 &&
>  					ipv6_addr_equal(&addr->a.v6.sin6_addr,
>  						&ifa->addr)) {
> +				sctp_addr_wq_mgmt(&addr->a, SCTP_DELADDR);
>  				found = 1;
>  				addr->valid = 0;
>  				list_del_rcu(&addr->list);
> @@ -330,6 +332,25 @@ static void sctp_v6_get_saddr(struct sct
>  				matchlen = bmatchlen;
>  			}
>  		}
> +		if ((laddr->state == SCTP_ADDR_NEW) && asoc->src_out_of_asoc_ok) {
> +			bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a);
> +			if (!baddr || (matchlen < bmatchlen)) {
> +				baddr = &laddr->a;
> +				matchlen = bmatchlen;
> +			}
> +		}
> +	}
> +	if (baddr == NULL) {
> +		/* We don't have a valid src addr in "endpoint-wide".  
> +		 * Looking up in assoc-locally valid address list.  
> +		 */
> +		list_for_each_entry(laddr, &asoc->asoc_laddr_list, list) {
> +			bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a);
> +			if (!baddr || (matchlen < bmatchlen)) {
> +				baddr = &laddr->a;
> +				matchlen = bmatchlen;
> +			}
> +		}
>  	}
>  
>  	if (baddr) {
> diff -ru -x '\.git' -x arch -x drivers -x fs -x asm -x Documentation -p linux-2.6.orig/net/sctp/outqueue.c linux-2.6/net/sctp/outqueue.c
> --- linux-2.6.orig/net/sctp/outqueue.c	2010-10-11 08:24:34.000000000 +0900
> +++ linux-2.6/net/sctp/outqueue.c	2010-10-11 07:21:40.000000000 +0900
> @@ -342,7 +342,13 @@ int sctp_outq_tail(struct sctp_outq *q, 
>  			break;
>  		}
>  	} else {
> -		list_add_tail(&chunk->list, &q->control_chunk_list);
> +		/* We add the ASCONF for the only one newly added address at 
> +		 * the front of the queue 
> +		 */
> +		if (q->asoc->src_out_of_asoc_ok && (chunk->chunk_hdr->type == SCTP_CID_ASCONF))
> +			list_add(&chunk->list, &q->control_chunk_list);
> +		else
> +			list_add_tail(&chunk->list, &q->control_chunk_list);
>  		SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
>  	}
>  
> @@ -850,6 +856,24 @@ static int sctp_outq_flush(struct sctp_o
>  		case SCTP_CID_SHUTDOWN:
>  		case SCTP_CID_ECN_ECNE:
>  		case SCTP_CID_ASCONF:
> +			/* RFC 5061, 5.3
> +			 * F1) This means that until such time as the ASCONF 
> +			 * containing the add is acknowledged, the sender MUST 
> +			 * NOT use the new IP address as a source for ANY SCTP 
> +			 * packet except on carrying an ASCONF Chunk.
> +			 */
> +			if (asoc->src_out_of_asoc_ok) {
> +				SCTP_DEBUG_PRINTK("outq_flush: out_of_asoc_ok, transmit chunk type %d\n", chunk->chunk_hdr->type);
> +				packet = &transport->packet;
> +				sctp_packet_config(packet, vtag, 
> +						asoc->peer.ecn_capable);
> +				sctp_packet_append_chunk(packet, chunk);
> +				error = sctp_packet_transmit(packet);
> +				if (error < 0) {
> +					return error;
> +				}
> +				goto sctp_flush_out;
> +			}
>  		case SCTP_CID_FWD_TSN:
>  			status = sctp_packet_transmit_chunk(packet, chunk,
>  							    one_packet);
> diff -ru -x '\.git' -x arch -x drivers -x fs -x asm -x Documentation -p linux-2.6.orig/net/sctp/protocol.c linux-2.6/net/sctp/protocol.c
> --- linux-2.6.orig/net/sctp/protocol.c	2010-10-11 08:24:34.000000000 +0900
> +++ linux-2.6/net/sctp/protocol.c	2010-10-11 07:21:40.000000000 +0900
> @@ -508,12 +508,19 @@ static struct dst_entry *sctp_v4_get_dst
>  		sctp_v4_dst_saddr(&dst_saddr, dst, htons(bp->port));
>  		rcu_read_lock();
>  		list_for_each_entry_rcu(laddr, &bp->address_list, list) {
> -			if (!laddr->valid || (laddr->state != SCTP_ADDR_SRC))
> +			if (!laddr->valid || ((laddr->state != SCTP_ADDR_SRC) && (asoc->src_out_of_asoc_ok == 0)))
>  				continue;
>  			if (sctp_v4_cmp_addr(&dst_saddr, &laddr->a))
>  				goto out_unlock;
>  		}
>  		rcu_read_unlock();
> +		/* We don't have a valid src addr in "endpoint-wide".  
> +		 * Looking up in assoc-locally valid address list.  
> +		 */
> +		list_for_each_entry(laddr, &asoc->asoc_laddr_list, list) {
> +			if (sctp_v4_cmp_addr(&dst_saddr, &laddr->a))
> +				goto out_unlock;
> +		}
>  
>  		/* None of the bound addresses match the source address of the
>  		 * dst. So release it.
> @@ -633,6 +640,184 @@ static void sctp_v4_ecn_capable(struct s
>  	INET_ECN_xmit(sk);
>  }
>  
> +void sctp_addr_wq_timeout_handler(unsigned long arg)
> +{
> +	struct sctp_addr_wait *addrw = NULL;
> +	union sctp_addr *addr = NULL;
> +	struct sctp_ep_common *epb = NULL;
> +	struct sctp_endpoint *ep = NULL;
> +	struct hlist_node *node = NULL;
> +	struct sctp_hashbucket *head = NULL;
> +	int cnt=0;
> +	int i; 
> +
> +	spin_lock_bh(&sctp_addr_wq_lock);
> +retry_wq:
> +	if (list_empty(&sctp_addr_waitq)) {
> +		SCTP_DEBUG_PRINTK("sctp_addrwq_timo_handler: nothing in addr waitq\n");
> +		spin_unlock_bh(&sctp_addr_wq_lock);
> +		return;
> +	}
> +	addrw = list_first_entry(&sctp_addr_waitq, struct sctp_addr_wait, list);
> +	if ((addrw->cmd != SCTP_NEWADDR) && (addrw->cmd != SCTP_DELADDR)) {
> +		SCTP_DEBUG_PRINTK("sctp_addrwq_timo_handler: Huh, cmd is neither NEWADDR nor DELADDR\n");
> +		list_del(&addrw->list);
> +		kfree(addrw);
> +		goto retry_wq;
> +	}
> +
> +	addr = &addrw->a;
> +	SCTP_DEBUG_PRINTK_IPADDR("sctp_addrwq_timo_handler: the first ent in wq %p is "," for cmd %d at entry %p\n", &sctp_addr_waitq, addr, addrw->cmd, addrw);
> +
> +	/* Now we send an ASCONF for each association */
> +	/* Note. we currently don't handle link local IPv6 addressees */
> +	if (addr->sa.sa_family == AF_INET6) {
> +		struct in6_addr *in6 = (struct in6_addr *)&addr->v6.sin6_addr;
> +
> +		if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) {
> +			SCTP_DEBUG_PRINTK("sctp_timo_handler: link local, hence don't tell eps\n");
> +			list_del(&addrw->list);
> +			kfree(addrw);
> +			goto retry_wq;
> +		}
> +		if ((ipv6_chk_addr(&init_net, in6, NULL, 0) == 0) && (addrw->cmd == SCTP_NEWADDR)) {
> +			unsigned long timeo_val;
> +
> +			SCTP_DEBUG_PRINTK("sctp_timo_handler: this is on DAD, trying %d sec later\n", SCTP_ADDRESS_TICK_DELAY);
> +			timeo_val = jiffies;
> +			timeo_val += msecs_to_jiffies(SCTP_ADDRESS_TICK_DELAY);
> +			(void)mod_timer(&sctp_addr_wq_timer, timeo_val);
> +			spin_unlock_bh(&sctp_addr_wq_lock);
> +			return;
> +		}
> +	}
> +	for (i = 0; i < sctp_ep_hashsize; ++i) {
> +		head = &sctp_ep_hashtable[i];
> +		if (head == NULL) {
> +			SCTP_DEBUG_PRINTK("addrwq_timo_handler: no head in hash\n");
> +			continue;
> +		}
> +		write_lock(&head->lock);
> +		epb = NULL;
> +		sctp_for_each_hentry(epb, node, &head->chain) {
> +
> +			if (epb == NULL) {
> +				SCTP_DEBUG_PRINTK("addrwq_timo_handler: no epb\n");
> +				continue;
> +			}
> +			if (!sctp_is_ep_boundall(epb->sk)) {
> +				/* ignore bound-specific endpoints */
> +				continue;
> +			}
> +			ep = sctp_ep(epb);
> +			if (sctp_asconf_mgmt(ep, epb->sk) < 0) {
> +				SCTP_DEBUG_PRINTK("sctp_addrwq_timo_handler: sctp_asconf_mgmt failed\n");
> +				continue;
> +			}
> +			++cnt;
> +		}
> +		write_unlock(&head->lock);
> +	}
> +
> +	list_del(&addrw->list);
> +	kfree(addrw);
> +
> +	if (list_empty(&sctp_addr_waitq)) {
> +		spin_unlock_bh(&sctp_addr_wq_lock);
> +		return;
> +	} else {
> +		goto retry_wq;
> +	}
> +	spin_unlock_bh(&sctp_addr_wq_lock);
> +}
> +
> +void sctp_addr_wq_mgmt(union sctp_addr *reqaddr, int cmd)
> +{
> +	struct sctp_addr_wait *addrw = NULL;
> +	struct sctp_addr_wait *addrw_new = NULL;
> +	unsigned long timeo_val;
> +	union sctp_addr *tmpaddr; // for debugging
> +
> +	/* first, we check if an opposite message already exist in the queue.  
> +	 * If we found such message, it is removed.  
> +	 * This operation is a bit stupid, but the DHCP client attaches the 
> +	 * new address after a couple of addition and deletion of that address
> +	 */
> +
> +	if (reqaddr == NULL) {
> +		SCTP_DEBUG_PRINTK("sctp_addr_wq_mgmt: no address message?\n");
> +		return;
> +	}
> +	
> +	spin_lock_bh(&sctp_addr_wq_lock);
> +	/* Offsets existing events in addr_wq */
> +	list_for_each_entry(addrw, &sctp_addr_waitq, list) {
> +		if (addrw->a.sa.sa_family != reqaddr->sa.sa_family) {
> +			continue;
> +		}
> +		if (reqaddr->sa.sa_family == AF_INET) {
> +			if (reqaddr->v4.sin_addr.s_addr == addrw->a.v4.sin_addr.s_addr) {
> +				if (cmd != addrw->cmd) {
> +					tmpaddr = &addrw->a;
> +					SCTP_DEBUG_PRINTK_IPADDR("sctp_addr_wq_mgmt offsets existing entry for %d "," in waitq %p\n", addrw->cmd, tmpaddr, &sctp_addr_waitq);
> +					list_del(&addrw->list);
> +					kfree(addrw);
> +					/* nothing to do anymore */
> +					spin_unlock_bh(&sctp_addr_wq_lock);
> +					return;
> +				}
> +			}
> +		}
> +		else if (reqaddr->sa.sa_family == AF_INET6) {
> +			if (memcmp(&reqaddr->v6.sin6_addr, &addrw->a.v6.sin6_addr, sizeof(struct in6_addr)) == 0) {

try ipv6_addr_equal().
ipv6_addr_equal() is used to check whether these two IPv6 addresss is equal.

> +				if (cmd != addrw->cmd) {
> +					tmpaddr = &addrw->a;
> +					SCTP_DEBUG_PRINTK_IPADDR("sctp_addr_wq_mgmt: offsets existing entry for %d "," in waitq %p\n", addrw->cmd, tmpaddr, &sctp_addr_waitq);
> +					list_del(&addrw->list);
> +					kfree(addrw);
> +					spin_unlock_bh(&sctp_addr_wq_lock);
> +					return;
> +				}
> +			}
> +		}
> +	}
> +				
> +	/* OK, we have to add the new address to the wait queue */
> +	addrw_new = kmalloc(sizeof(struct sctp_addr_wait), GFP_ATOMIC);
> +	if (addrw_new == NULL) {
> +		SCTP_DEBUG_PRINTK("sctp_addr_weitq_mgmt no memory? return\n");
> +		spin_unlock_bh(&sctp_addr_wq_lock);
> +		return;
> +	}
> +	memset(addrw_new, 0, sizeof(struct sctp_addr_wait));

what about kzalloc()?

> +	if (reqaddr->sa.sa_family == AF_INET) {
> +		addrw_new->a.v4.sin_family = AF_INET;
> +		memcpy(&addrw_new->a.v4.sin_addr, &reqaddr->v4.sin_addr, sizeof(struct in_addr));

&addrw_new->a.v4.sin_addr.s_addr = &reqaddr->v4.sin_addr.s_addr.

> +	} else if (reqaddr->sa.sa_family == AF_INET6) {
> +		addrw_new->a.v6.sin6_family = AF_INET6;
> +		memcpy(&addrw_new->a.v6.sin6_addr, &reqaddr->v6.sin6_addr, sizeof(struct in6_addr));

using ipv6_addr_copy().

> +	} else {
> +		SCTP_DEBUG_PRINTK("sctp_addr_waitq_mgmt: Unknown family of request addr, return\n");
> +		kfree(addrw_new);
> +		spin_unlock_bh(&sctp_addr_wq_lock);
> +		return;
> +	}
> +	addrw_new->cmd = cmd;
> +	list_add_tail(&addrw_new->list, &sctp_addr_waitq);
> +	tmpaddr = &addrw_new->a;
> +	SCTP_DEBUG_PRINTK_IPADDR("sctp_addr_wq_mgmt add new entry for cmd:%d "," in waitq %p, start a timer\n", addrw_new->cmd, tmpaddr, &sctp_addr_waitq);
> +
> +	if (timer_pending(&sctp_addr_wq_timer)) {
> +		SCTP_DEBUG_PRINTK("sctp_addr_wq_mgmt: addr_wq timer is already running\n");
> +		spin_unlock_bh(&sctp_addr_wq_lock);
> +		return;
> +	}
> +	timeo_val = jiffies;
> +	timeo_val += msecs_to_jiffies(SCTP_ADDRESS_TICK_DELAY);
> +	(void)mod_timer(&sctp_addr_wq_timer, timeo_val);
> +	spin_unlock_bh(&sctp_addr_wq_lock);
> +}
> +
>  /* Event handler for inet address addition/deletion events.
>   * The sctp_local_addr_list needs to be protocted by a spin lock since
>   * multiple notifiers (say IPv4 and IPv6) may be running at the same
> @@ -660,6 +845,7 @@ static int sctp_inetaddr_event(struct no
>  			addr->valid = 1;
>  			spin_lock_bh(&sctp_local_addr_lock);
>  			list_add_tail_rcu(&addr->list, &sctp_local_addr_list);
> +			sctp_addr_wq_mgmt(&addr->a, SCTP_NEWADDR);
>  			spin_unlock_bh(&sctp_local_addr_lock);
>  		}
>  		break;
> @@ -670,6 +856,7 @@ static int sctp_inetaddr_event(struct no
>  			if (addr->a.sa.sa_family == AF_INET &&
>  					addr->a.v4.sin_addr.s_addr ==
>  					ifa->ifa_local) {
> +				sctp_addr_wq_mgmt(&addr->a, SCTP_DELADDR);
>  				found = 1;
>  				addr->valid = 0;
>  				list_del_rcu(&addr->list);
> @@ -1276,6 +1463,10 @@ SCTP_STATIC __init int sctp_init(void)
>  
>  	/* Initialize the local address list. */
>  	INIT_LIST_HEAD(&sctp_local_addr_list);
> +	INIT_LIST_HEAD(&sctp_addr_waitq);
> +	spin_lock_init(&sctp_addr_wq_lock);
> +	sctp_addr_wq_timer.expires = 0;
> +	setup_timer(&sctp_addr_wq_timer, sctp_addr_wq_timeout_handler, (unsigned long)NULL); 
>  	spin_lock_init(&sctp_local_addr_lock);
>  	sctp_get_local_addr_list();
>  
> diff -ru -x '\.git' -x arch -x drivers -x fs -x asm -x Documentation -p linux-2.6.orig/net/sctp/sm_make_chunk.c linux-2.6/net/sctp/sm_make_chunk.c
> --- linux-2.6.orig/net/sctp/sm_make_chunk.c	2010-10-11 08:24:34.000000000 +0900
> +++ linux-2.6/net/sctp/sm_make_chunk.c	2010-10-11 07:21:40.000000000 +0900
> @@ -2649,6 +2649,78 @@ __u32 sctp_generate_tsn(const struct sct
>  	return retval;
>  }
>  
> +void
> +sctp_trans_immediate_retrans(struct sctp_transport *trans)
> +{
> +	struct sctp_association *asoc = trans->asoc;
> +
> +	/* Stop pending T3_rtx_timer on this transport */
> +	if (timer_pending(&trans->T3_rtx_timer)) {
> +		(void)del_timer(&trans->T3_rtx_timer);
> +		sctp_transport_put(trans);
> +	} 
> +
> +	/* We consider the event as if the T3RTX timer expires */
> +	sctp_retransmit(&asoc->outqueue, trans, SCTP_RTXR_T3_RTX);
> +	if (!timer_pending(&trans->T3_rtx_timer)) {
> +		if (!mod_timer(&trans->T3_rtx_timer, jiffies + trans->rto)) 
> +			sctp_transport_hold(trans);
> +	}
> +
> +	return;
> +}
> +
> +void
> +sctp_path_check_and_react(struct sctp_association *asoc, struct sockaddr *sa)
> +{
> +	struct sctp_transport *trans;
> +	int addrnum, family;
> +	struct sctp_sockaddr_entry *saddr;
> +	struct sctp_bind_addr *bp;
> +	union sctp_addr *tmpaddr;
> +
> +	family = sa->sa_family;
> +	bp = &asoc->base.bind_addr;
> +	addrnum = 0;
> +	/* count up the number of local addresses in the same family */
> +	list_for_each_entry(saddr, &bp->address_list, list) {
> +		if (saddr->a.sa.sa_family == family) {
> +			tmpaddr = &saddr->a;
> +			if (family == AF_INET6 && 
> +			    ipv6_addr_type(&tmpaddr->v6.sin6_addr) & 
> +			    IPV6_ADDR_LINKLOCAL) {
> +				continue;
> +			}
> +			addrnum++;
> +		}
> +	}
> +	if (addrnum == 1) {
> +		union sctp_addr *tmpaddr;
> +		tmpaddr = (union sctp_addr *)sa;
> +		SCTP_DEBUG_PRINTK_IPADDR("pcheck_react: only 1 local addr in asoc %p "," family %d\n", asoc, tmpaddr, family);
> +		list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) {
> +			/* reset path information and release refcount to the 
> +			 * dst_entry  based on the src change */
> +			sctp_transport_hold(trans);
> +			trans->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380));
> +			trans->ssthresh = asoc->peer.i.a_rwnd;
> +			trans->rtt = 0;
> +			trans->srtt = 0;
> +			trans->rttvar = 0;
> +			trans->rto = asoc->rto_initial;
> +			dst_release(trans->dst);
> +			trans->dst = NULL;
> +			memset(&trans->saddr, 0, sizeof(union sctp_addr));
> +			sctp_transport_route(trans, NULL, sctp_sk(asoc->base.sk));
> +			SCTP_DEBUG_PRINTK_IPADDR("we freed dst_entry (asoc: %p dst: "," trans: %p)\n", asoc, (&trans->ipaddr), trans);
> +			trans->rto_pending = 1;
> +			sctp_trans_immediate_retrans(trans);
> +			sctp_transport_put(trans);
> +		}
> +	}
> +	return;
> +}
> +
>  /*
>   * ADDIP 3.1.1 Address Configuration Change Chunk (ASCONF)
>   *      0                   1                   2                   3
> @@ -2742,11 +2814,30 @@ struct sctp_chunk *sctp_make_asconf_upda
>  	int			addr_param_len = 0;
>  	int 			totallen = 0;
>  	int 			i;
> +	sctp_addip_param_t del_param; // 8 Bytes (Type(0xC002), Len and CrrID)
> +	sctp_addip_param_t spr_param;
> +	struct sctp_af *del_af;
> +	struct sctp_af *spr_af;
> +	int del_addr_param_len = 0;
> +	int spr_addr_param_len = 0;
> +	int del_paramlen = sizeof(sctp_addip_param_t);
> +	int spr_paramlen = sizeof(sctp_addip_param_t);
> +	union sctp_addr_param del_addr_param; // (v4) 8 Bytes, (v6) 20 Bytes
> +	union sctp_addr_param spr_addr_param;
> +	int			v4 = 0;
> +	int			v6 = 0;
>  
>  	/* Get total length of all the address parameters. */
>  	addr_buf = addrs;
>  	for (i = 0; i < addrcnt; i++) {
>  		addr = (union sctp_addr *)addr_buf;
> +		if (addr != NULL) {
> +			if (addr->sa.sa_family == AF_INET) {
> +				v4 = 1;
> +			} else if (addr->sa.sa_family == AF_INET6) {
> +				v6 = 1;
> +			}
> +		}
>  		af = sctp_get_af_specific(addr->v4.sin_family);
>  		addr_param_len = af->to_addr_param(addr, &addr_param);
>  
> @@ -2755,6 +2846,35 @@ struct sctp_chunk *sctp_make_asconf_upda
>  
>  		addr_buf += af->sockaddr_len;
>  	}
> +	/* Add the length of a pending address being deleted */
> +	if ((flags == SCTP_PARAM_ADD_IP) && 
> +	    (asoc->asconf_addr_del_pending != NULL)) {
> +		if (((asoc->asconf_addr_del_pending->sa.sa_family == AF_INET) 
> +		    && v4) || 
> +		    ((asoc->asconf_addr_del_pending->sa.sa_family == AF_INET6)
> +		    && v6)) {
> +			del_af = sctp_get_af_specific(asoc->asconf_addr_del_pending->sa.sa_family);
> +			del_addr_param_len = del_af->to_addr_param(asoc->asconf_addr_del_pending, &del_addr_param);
> +			totallen += del_paramlen;
> +			totallen += del_addr_param_len;
> +			SCTP_DEBUG_PRINTK("mkasconf_update_ip: now we picked del_pending addr, totallen for all addresses is %d\n", totallen);
> +			/* for Set Primary (equal size as del parameters */
> +			totallen += del_paramlen;
> +			totallen += del_addr_param_len;
> +		}
> +		if (v4) {
> +			if ((totallen != 32) && (totallen != 48)) {
> +				SCTP_DEBUG_PRINTK("mkasconf_update_ip: incorrect total length of ASCONF parameters, del + add MUST be 32 bytes, but %d bytes\n", totallen);
> +			return NULL;
> +			}
> +		} else if (v6) {
> +			if ((totallen != 56) && (totallen != 84)) {
> +				SCTP_DEBUG_PRINTK("mkasconf_update_ip: incorrect total length of ASCONF parameters, del + add MUST be 56 bytes, but %d bytes\n", totallen);
> +			return NULL;
> +			}
> +		}
> +	}
> +	SCTP_DEBUG_PRINTK("mkasconf_update_ip: call mkasconf() for %d bytes\n", totallen);
>  
>  	/* Create an asconf chunk with the required length. */
>  	retval = sctp_make_asconf(asoc, laddr, totallen);
> @@ -2776,6 +2896,29 @@ struct sctp_chunk *sctp_make_asconf_upda
>  
>  		addr_buf += af->sockaddr_len;
>  	}
> +	if ((flags == SCTP_PARAM_ADD_IP) && 
> +	    (asoc->asconf_addr_del_pending != NULL)) {
> +		addr = asoc->asconf_addr_del_pending;
> +		del_af = sctp_get_af_specific(addr->v4.sin_family);
> +		del_addr_param_len = del_af->to_addr_param(addr, &del_addr_param);
> +		del_param.param_hdr.type = SCTP_PARAM_DEL_IP;
> +		del_param.param_hdr.length = htons(del_paramlen + del_addr_param_len);
> +		del_param.crr_id = i;
> +		asoc->asconf_del_pending_cid = i;
> +
> +		sctp_addto_chunk(retval, del_paramlen, &del_param);
> +		sctp_addto_chunk(retval, del_addr_param_len, &del_addr_param);
> +		/* For SET_PRIMARY */
> +		addr_buf = addrs;
> +		addr = (union sctp_addr *)addr_buf;
> +		spr_af = sctp_get_af_specific(addr->v4.sin_family);
> +		spr_addr_param_len = spr_af->to_addr_param(addr, &spr_addr_param);
> +		spr_param.param_hdr.type = SCTP_PARAM_SET_PRIMARY;
> +		spr_param.param_hdr.length = htons(spr_paramlen + spr_addr_param_len);
> +		spr_param.crr_id = (i+1);
> +		sctp_addto_chunk(retval, spr_paramlen, &spr_param);
> +		sctp_addto_chunk(retval, spr_addr_param_len, &spr_addr_param);
> +	}
>  	return retval;
>  }
>  
> @@ -2988,7 +3131,7 @@ static __be16 sctp_process_asconf_param(
>  		 * an Error Cause TLV set to the new error code 'Request to
>  		 * Delete Source IP Address'
>  		 */
> -		if (sctp_cmp_addr_exact(sctp_source(asconf), &addr))
> +		if (sctp_cmp_addr_exact(&asconf->source, &addr))
>  			return SCTP_ERROR_DEL_SRC_IP;
>  
>  		/* Section 4.2.2
> @@ -3169,7 +3312,6 @@ static void sctp_asconf_param_success(st
>  	struct sctp_bind_addr *bp = &asoc->base.bind_addr;
>  	union sctp_addr_param *addr_param;
>  	struct sctp_transport *transport;
> -	struct sctp_sockaddr_entry *saddr;
>  
>  	addr_param = (union sctp_addr_param *)
>  			((void *)asconf_param + sizeof(sctp_addip_param_t));
> @@ -3184,9 +3326,16 @@ static void sctp_asconf_param_success(st
>  		 * held, so the list can not change.
>  		 */
>  		local_bh_disable();
> -		list_for_each_entry(saddr, &bp->address_list, list) {
> -			if (sctp_cmp_addr_exact(&saddr->a, &addr))
> -				saddr->state = SCTP_ADDR_SRC;
> +		/* Until this ASCONF is acked on all associations, we cannot 
> +		 * consider this address as ADDR_SRC
> +		 */
> +		asoc->src_out_of_asoc_ok = 0;
> +		sctp_add_addr_to_laddr(&addr.sa, asoc);
> +		list_for_each_entry(transport, &asoc->peer.transport_addr_list,
> +				transports) {
> +			dst_release(transport->dst);
> +			sctp_transport_route(transport, NULL,
> +					     sctp_sk(asoc->base.sk));
>  		}
>  		local_bh_enable();
>  		list_for_each_entry(transport, &asoc->peer.transport_addr_list,
> @@ -3201,6 +3350,26 @@ static void sctp_asconf_param_success(st
>  	case SCTP_PARAM_DEL_IP:
>  		local_bh_disable();
>  		sctp_del_bind_addr(bp, &addr);
> +		if (asoc->asconf_addr_del_pending != NULL) {
> +			if ((addr.sa.sa_family == AF_INET) && 
> +			    (asoc->asconf_addr_del_pending->sa.sa_family == 
> +			     AF_INET)) {
> +				if (asoc->asconf_addr_del_pending->v4.sin_addr.s_addr == addr.v4.sin_addr.s_addr) {
> +					kfree(asoc->asconf_addr_del_pending);
> +					asoc->asconf_del_pending_cid = 0;
> +					asoc->asconf_addr_del_pending = NULL;
> +				}
> +			} 
> +			else if ((addr.sa.sa_family == AF_INET6) && 
> +				(asoc->asconf_addr_del_pending->sa.sa_family == 
> +				 AF_INET6)) {
> +				if (memcmp(&asoc->asconf_addr_del_pending->v6.sin6_addr, &addr.v6.sin6_addr, sizeof(struct in6_addr)) == 0) {
> +					kfree(asoc->asconf_addr_del_pending);
> +					asoc->asconf_del_pending_cid = 0;
> +					asoc->asconf_addr_del_pending = NULL;
> +				}
> +			}
> +		}
>  		local_bh_enable();
>  		list_for_each_entry(transport, &asoc->peer.transport_addr_list,
>  				transports) {
> @@ -3291,6 +3460,8 @@ int sctp_process_asconf_ack(struct sctp_
>  	int	no_err = 1;
>  	int	retval = 0;
>  	__be16	err_code = SCTP_ERROR_NO_ERROR;
> +	sctp_addip_param_t *first_asconf_param = NULL;
> +	int first_asconf_paramlen;
>  
>  	/* Skip the chunkhdr and addiphdr from the last asconf sent and store
>  	 * a pointer to address parameter.
> @@ -3305,6 +3476,8 @@ int sctp_process_asconf_ack(struct sctp_
>  	length = ntohs(addr_param->v4.param_hdr.length);
>  	asconf_param = (sctp_addip_param_t *)((void *)addr_param + length);
>  	asconf_len -= length;
> +	first_asconf_param = asconf_param;
> +	first_asconf_paramlen = ntohs(first_asconf_param->param_hdr.length);
>  
>  	/* ADDIP 4.1
>  	 * A8) If there is no response(s) to specific TLV parameter(s), and no
> @@ -3359,6 +3532,34 @@ int sctp_process_asconf_ack(struct sctp_
>  		asconf_len -= length;
>  	}
>  
> +	/* When the source address obviously changes to newly added one, we 
> +	   reset the cwnd to re-probe the path condition
> +	*/
> +	if (no_err && (first_asconf_param->param_hdr.type == SCTP_PARAM_ADD_IP)) {
> +		if (first_asconf_paramlen == 16) {
> +			struct sockaddr_in sin;
> +
> +			memset(&sin, 0, sizeof(struct sockaddr_in));
> +			sin.sin_family = AF_INET;
> +			memcpy(&sin.sin_addr.s_addr, first_asconf_param + 1, 
> +					sizeof(struct in_addr));
> +			sctp_path_check_and_react(asoc, 
> +					(struct sockaddr *)&sin);
> +
> +		} else if (first_asconf_paramlen == 28) {
> +			struct sockaddr_in6 sin6;
> +
> +			memset(&sin6, 0, sizeof(struct sockaddr_in6));
> +			sin6.sin6_family = AF_INET6;
> +			memcpy(&sin6.sin6_addr, first_asconf_param + 1, 
> +					sizeof(struct in6_addr));
> +			sctp_path_check_and_react(asoc, 
> +					(struct sockaddr *)&sin6);
> +		} else {
> +			SCTP_DEBUG_PRINTK("funny asconf_paramlen? (%d)\n", first_asconf_paramlen);
> +		}
> +	}
> +
>  	/* Free the cached last sent asconf chunk. */
>  	list_del_init(&asconf->transmitted_list);
>  	sctp_chunk_free(asconf);
> diff -ru -x '\.git' -x arch -x drivers -x fs -x asm -x Documentation -p linux-2.6.orig/net/sctp/socket.c linux-2.6/net/sctp/socket.c
> --- linux-2.6.orig/net/sctp/socket.c	2010-10-11 08:24:34.000000000 +0900
> +++ linux-2.6/net/sctp/socket.c	2010-10-11 07:21:40.000000000 +0900
> @@ -525,6 +525,7 @@ static int sctp_send_asconf_add_ip(struc
>  	struct list_head		*p;
>  	int 				i;
>  	int 				retval = 0;
> +	struct sctp_transport 		*trans = NULL;
>  
>  	if (!sctp_addip_enable)
>  		return retval;
> @@ -581,13 +582,11 @@ static int sctp_send_asconf_add_ip(struc
>  			goto out;
>  		}
>  
> -		retval = sctp_send_asconf(asoc, chunk);
> -		if (retval)
> -			goto out;
>  
>  		/* Add the new addresses to the bind address list with
>  		 * use_as_src set to 0.
>  		 */
> +		SCTP_DEBUG_PRINTK("snd_asconf_addip: next, add_bind_addr with ADDR_NEW flag\n");
>  		addr_buf = addrs;
>  		for (i = 0; i < addrcnt; i++) {
>  			addr = (union sctp_addr *)addr_buf;
> @@ -597,6 +596,26 @@ static int sctp_send_asconf_add_ip(struc
>  						    SCTP_ADDR_NEW, GFP_ATOMIC);
>  			addr_buf += af->sockaddr_len;
>  		}
> +		list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) {
> +			if (asoc->asconf_addr_del_pending != NULL) {
> +				/* This ADDIP ASCONF piggybacks DELIP for the 
> +				 * last address, so need to select src addr 
> +				 * from the out_of_asoc addrs 
> +				 */
> +				asoc->src_out_of_asoc_ok = 1;
> +			}
> +			/* Clear the source and route cache in the path */
> +			memset(&trans->saddr, 0, sizeof(union sctp_addr));
> +			dst_release(trans->dst);
> +			trans->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380));
> +			trans->ssthresh = asoc->peer.i.a_rwnd;
> +			trans->rto = asoc->rto_initial;
> +			trans->rtt = 0;
> +			trans->srtt = 0;
> +			trans->rttvar = 0;
> +			sctp_transport_route(trans, NULL, sctp_sk(asoc->base.sk));
> +		}
> +		retval = sctp_send_asconf(asoc, chunk);
>  	}
>  
>  out:
> @@ -638,6 +657,7 @@ static int sctp_bindx_rem(struct sock *s
>  		 * bind address, there is nothing more to be removed (we need
>  		 * at least one address here).
>  		 */
> +		
>  		if (list_empty(&bp->address_list) ||
>  		    (sctp_list_single_entry(&bp->address_list))) {
>  			retval = -EBUSY;
> @@ -709,7 +729,9 @@ static int sctp_send_asconf_del_ip(struc
>  	struct sctp_sockaddr_entry *saddr;
>  	int 			i;
>  	int 			retval = 0;
> +	int			stored = 0;
>  
> +	chunk = NULL;
>  	if (!sctp_addip_enable)
>  		return retval;
>  
> @@ -760,8 +782,36 @@ static int sctp_send_asconf_del_ip(struc
>  		bp = &asoc->base.bind_addr;
>  		laddr = sctp_find_unmatch_addr(bp, (union sctp_addr *)addrs,
>  					       addrcnt, sp);
> -		if (!laddr)
> -			continue;
> +		if ((laddr == NULL) && (addrcnt == 1)) {
> +			union sctp_addr *sa_addr = NULL;
> +
> +			if (asoc->asconf_addr_del_pending == NULL) {
> +				asoc->asconf_addr_del_pending = kmalloc(sizeof(union sctp_addr), GFP_ATOMIC);
> +				memset(asoc->asconf_addr_del_pending, 0, 
> +						sizeof(union sctp_addr));
> +				if (addrs->sa_family == AF_INET) {
> +					struct sockaddr_in *sin;
> +
> +					sin = (struct sockaddr_in *)addrs;
> +					asoc->asconf_addr_del_pending->v4.sin_family = AF_INET;
> +					memcpy(&asoc->asconf_addr_del_pending->v4.sin_addr, &sin->sin_addr, sizeof(struct in_addr));
> +				} else if (addrs->sa_family == AF_INET6) {
> +					struct sockaddr_in6 *sin6;
> +
> +					sin6 = (struct sockaddr_in6 *)addrs;
> +					asoc->asconf_addr_del_pending->v6.sin6_family = AF_INET6;
> +					memcpy(&asoc->asconf_addr_del_pending->v6.sin6_addr, &sin6->sin6_addr, sizeof(struct in6_addr));
> +				}
> +				sa_addr = (union sctp_addr *)addrs;
> +				SCTP_DEBUG_PRINTK_IPADDR("send_asconf_del_ip: keep the last address asoc: %p "," at %p\n", asoc, sa_addr, asoc->asconf_addr_del_pending);
> +				stored = 1;
> +				goto skip_mkasconf;
> +			} else {
> +				SCTP_DEBUG_PRINTK_IPADDR("send_asconf_del_ip: asoc %p, deleting last address "," is already stored at %p\n", asoc, asoc->asconf_addr_del_pending, asoc->asconf_addr_del_pending);
> +				continue;
> +			}
> +		}
> +
>  
>  		/* We do not need RCU protection throughout this loop
>  		 * because this is done under a socket lock from the
> @@ -774,6 +824,7 @@ static int sctp_send_asconf_del_ip(struc
>  			goto out;
>  		}
>  
> +skip_mkasconf:
>  		/* Reset use_as_src flag for the addresses in the bind address
>  		 * list that are to be deleted.
>  		 */
> @@ -795,16 +846,222 @@ static int sctp_send_asconf_del_ip(struc
>  		list_for_each_entry(transport, &asoc->peer.transport_addr_list,
>  					transports) {
>  			dst_release(transport->dst);
> +			/* Clear source address cache */
> +			memset(&transport->saddr, 0, sizeof(union sctp_addr));
>  			sctp_transport_route(transport, NULL,
>  					     sctp_sk(asoc->base.sk));
>  		}
>  
> +		if (stored) {
> +			/* We don't need to transmit ASCONF */
> +			continue;
> +		}
>  		retval = sctp_send_asconf(asoc, chunk);
>  	}
>  out:
>  	return retval;
>  }
>  
> +/* Add a new address to the list contains available addresses only in the 
> + * association.  If the new address is also available on the other associations 
> + * on the endpoint, it is marked as SCTP_ADDR_SRC in the bind address list on 
> + * the endpoint.  This situation is possible when some of associations receive
> + * ASCONF-ACK for ADD_IP at the endpoint
> + */
> +void
> +sctp_add_addr_to_laddr(struct sockaddr *sa, struct sctp_association *asoc)
> +{
> +	struct sctp_endpoint *ep = asoc->ep;
> +	struct sctp_association *tmp = NULL;
> +	struct sctp_bind_addr *bp;
> +	struct sctp_sockaddr_entry *addr;
> +	struct sockaddr_in *sin = NULL;
> +	struct sockaddr_in6 *sin6 = NULL;
> +	int local;
> +	int found;
> +
> +	union sctp_addr *tmpaddr = NULL;
> +	tmpaddr = (union sctp_addr *)sa;
> +	SCTP_DEBUG_PRINTK_IPADDR("add_addr_to_laddr: asoc: %p "," ep: %p", asoc, tmpaddr, ep);
> +	if (sa->sa_family == AF_INET) {
> +		sin = (struct sockaddr_in *)sa;
> +	} else if (sa->sa_family == AF_INET6) {
> +		sin6 = (struct sockaddr_in6 *)sa;
> +	}
> +
> +	/* Check if this address is locally available in the other asocs */
> +	local = 0;
> +	list_for_each_entry(tmp, &ep->asocs, asocs) {
> +		if (tmp == asoc) {
> +			continue;
> +		}
> +		found = 0;
> +		list_for_each_entry(addr, &tmp->asoc_laddr_list, list) {
> +			tmpaddr = &addr->a;
> +			if (sa->sa_family != addr->a.sa.sa_family) {
> +				continue;
> +			}
> +			if (sa->sa_family == AF_INET) {
> +				if (sin->sin_addr.s_addr == addr->a.v4.sin_addr.s_addr) {
> +					found = 1;
> +				}
> +			} else if (sa->sa_family == AF_INET6) {
> +				if (memcmp(&sin6->sin6_addr, &addr->a.v6.sin6_addr, sizeof(struct in6_addr)) == 0) {
> +					found = 1;
> +
> +				}
> +			}
> +		}
> +		if (!found) {
> +			SCTP_DEBUG_PRINTK("add_addr_to_laddr: not found in asoc %p\n", tmp);
> +			local = 1;
> +			break;
> +		}
> +	}
> +	addr = NULL;
> +
> +	if (local) {
> +		/* this address is not available in some of the other 
> +		 * associations.  So add as locally-available in this 
> +		 * asocciation 
> +		 */
> +		addr = kmalloc(sizeof(struct sctp_sockaddr_entry), GFP_ATOMIC);
> +		if  (addr == NULL) {
> +			SCTP_DEBUG_PRINTK("add_addr_to_laddr: failed to allocate memory for this address\n");
> +			return;
> +		}
> +		memset(addr, 0, sizeof(struct sctp_sockaddr_entry));
> +		if (sa->sa_family == AF_INET) {
> +			addr->a.sa.sa_family = AF_INET;
> +			addr->a.v4.sin_port = sin->sin_port;
> +			addr->a.v4.sin_addr.s_addr = sin->sin_addr.s_addr;
> +		} else if (sa->sa_family == AF_INET6) {
> +			addr->a.sa.sa_family = AF_INET6;
> +			addr->a.v6.sin6_port = sin6->sin6_port;
> +			memcpy(&addr->a.v6.sin6_addr, &sin6->sin6_addr, sizeof(struct in6_addr));
> +		}
> +		list_add_tail(&addr->list, &asoc->asoc_laddr_list);
> +		SCTP_DEBUG_PRINTK("add_addr_to_laddr: now we added this address to the local list on asoc %p\n", asoc);
> +	} else {
> +		/* this address is also available in all other asocs.  So set 
> +		 * it as ADDR_SRC in the bind-addr list in the endpoint, then 
> +		 * remove from the asoc_laddr_list on the associations.  
> +		 */
> +		SCTP_DEBUG_PRINTK("add_addr_to_laddr: this address is available in all other asocs\n");
> +		bp = &asoc->base.bind_addr;
> +
> +		/* change state of the new address in the bind list */
> +		list_for_each_entry(addr, &bp->address_list, list) {
> +			if (addr->state != SCTP_ADDR_NEW) {
> +				continue;
> +			}
> +			if (addr->a.sa.sa_family != sa->sa_family) {
> +				continue;
> +			}
> +			if (addr->a.sa.sa_family == AF_INET) {
> +				if (sin->sin_port != addr->a.v4.sin_port) {
> +					continue;
> +				}
> +				if (sin->sin_addr.s_addr != 
> +				    addr->a.v4.sin_addr.s_addr) {
> +					continue;
> +				}
> +			} else if (addr->a.sa.sa_family == AF_INET6) {
> +				if (sin6->sin6_port != addr->a.v6.sin6_port) {
> +					continue;
> +				}
> +				if (memcmp(&sin6->sin6_addr, 
> +				    &addr->a.v6.sin6_addr, 
> +				    sizeof(struct in6_addr)) != 0) {
> +					continue;
> +				}
> +			}
> +			SCTP_DEBUG_PRINTK("add_addr_to_laddr: found the entry for this address with ADDR_NEW flag, set to ADDR_SRC\n");
> +			addr->state = SCTP_ADDR_SRC;
> +		}
> +
> +		/* remove the entry of this address from the asoc-local list */
> +		list_for_each_entry(tmp, &ep->asocs, asocs) {
> +			if (tmp == asoc) {
> +				continue;
> +			}
> +			addr = NULL;
> +			list_for_each_entry(addr, &tmp->asoc_laddr_list, list) {
> +				if (sa->sa_family != addr->a.sa.sa_family) {
> +					continue;
> +				}
> +				if (sa->sa_family == AF_INET) {
> +					if (sin->sin_addr.s_addr != addr->a.v4.sin_addr.s_addr) {
> +						continue;
> +					}
> +				} else if (sa->sa_family == AF_INET6) {
> +					if (memcmp(&sin6->sin6_addr, &addr->a.v6.sin6_addr, sizeof(struct in6_addr)) != 0) {
> +						continue;
> +					}
> +				}
> +				break;
> +			}
> +			if (addr == NULL) {
> +				SCTP_DEBUG_PRINTK("add_addr_to_laddr: Huh, asoc %p doesn't have the entry for this address?\n", asoc);
> +				continue;
> +			}
> +			list_del(&addr->list);
> +			kfree(addr);
> +		}
> +	}
> +}
> +
> +/* set address events to associations in the given endpoint.  We assume the ep 
> + * is write-locked, and addr_wq is read-locked.  
> + */
> +int
> +sctp_asconf_mgmt(struct sctp_endpoint *ep, struct sock *sk)
> +{
> +	struct sctp_addr_wait *addrw = NULL;
> +	union sctp_addr *addr = NULL;
> +	int cmd;
> +	int error = 0;
> +
> +	if (!sctp_auto_asconf_enable) {
> +		return (0);
> +	}
> +	if ((ep == NULL) || (sk == NULL)) {
> +		return(-EINVAL);
> +	}
> +	if (list_empty(&sctp_addr_waitq)) {
> +		SCTP_DEBUG_PRINTK("asconf_mgmt: nothing in the wq\n");
> +		return(-EINVAL);
> +	}
> +	addrw = list_first_entry(&sctp_addr_waitq, struct sctp_addr_wait, list);
> +	if (addrw->cmd != SCTP_NEWADDR && addrw->cmd != SCTP_DELADDR) {
> +		return(-EINVAL);
> +	}
> +	addr = &addrw->a;
> +	cmd = addrw->cmd;
> +
> +	if (addr->sa.sa_family == AF_INET) {
> +		addr->v4.sin_port = htons(ep->base.bind_addr.port);
> +	} else if (addr->sa.sa_family == AF_INET6) {
> +		addr->v6.sin6_port = htons(ep->base.bind_addr.port);
> +	}
> +
> +	if (cmd == SCTP_NEWADDR) {
> +		error = sctp_send_asconf_add_ip(sk, (struct sockaddr *)addr, 1);
> +		if (error) {
> +			SCTP_DEBUG_PRINTK("asconf_mgmt: send_asconf_add_ip returns %d\n", error);
> +			return(error);
> +		}
> +	} else if (cmd == SCTP_DELADDR) {
> +		error = sctp_send_asconf_del_ip(sk, (struct sockaddr *)addr, 1);
> +		if (error) {
> +			SCTP_DEBUG_PRINTK("asconf_mgmt: send_asconf_del_ip returns %d\n", error);
> +			return(error);
> +		}
> +	}
> +
> +	return(0);
> +}
> +
>  /* Helper for tunneling sctp_bindx() requests through sctp_setsockopt()
>   *
>   * API 8.1
> @@ -1146,6 +1403,7 @@ static int __sctp_connect(struct sock* s
>  	if ((err == 0 || err == -EINPROGRESS) && assoc_id)
>  		*assoc_id = asoc->assoc_id;
>  
> +	sctp_hash_endpoint(ep);
>  	/* Don't free association on exit. */
>  	asoc = NULL;
>  
> @@ -3559,6 +3817,8 @@ SCTP_STATIC struct sock *sctp_accept(str
>  	struct sctp_association *asoc;
>  	long timeo;
>  	int error = 0;
> +	struct sctp_sock *newsp = NULL;
> +	struct sctp_endpoint *newep = NULL;
>  
>  	sctp_lock_sock(sk);
>  
> @@ -3596,6 +3856,9 @@ SCTP_STATIC struct sock *sctp_accept(str
>  	 * asoc to the newsk.
>  	 */
>  	sctp_sock_migrate(sk, newsk, asoc, SCTP_SOCKET_TCP);
> +	newsp = sctp_sk(newsk);
> +	newep = newsp->ep;
> +	sctp_hash_endpoint(newep);
>  
>  out:
>  	sctp_release_sock(sk);
> diff -ru -x '\.git' -x arch -x drivers -x fs -x asm -x Documentation -p linux-2.6.orig/net/sctp/sysctl.c linux-2.6/net/sctp/sysctl.c
> --- linux-2.6.orig/net/sctp/sysctl.c	2010-04-22 17:55:41.000000000 +0900
> +++ linux-2.6/net/sctp/sysctl.c	2010-06-23 09:11:02.000000000 +0900
> @@ -183,6 +183,13 @@ static ctl_table sctp_table[] = {
>  		.proc_handler	= proc_dointvec,
>  	},
>  	{
> +		.procname	= "auto_asconf_enable",
> +		.data		= &sctp_auto_asconf_enable,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec,
> +	},
> +	{
>  		.procname	= "prsctp_enable",
>  		.data		= &sctp_prsctp_enable,
>  		.maxlen		= sizeof(int),
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 


-- 

Best Regards
-----
Shan Wei



^ permalink raw reply

* Re: [RFC PATCH 0/9] ipvs network name space (netns) aware
From: Simon Horman @ 2010-10-20  9:17 UTC (permalink / raw)
  To: Hans Schillstrom
  Cc: lvs-devel@vger.kernel.org, netdev@vger.kernel.org,
	netfilter-devel@vger.kernel.org, ja@ssi.bg, wensong@linux-vs.org,
	daniel.lezcano@free.fr
In-Reply-To: <201010181355.59763.hans.schillstrom@ericsson.com>

On Mon, Oct 18, 2010 at 01:55:58PM +0200, Hans Schillstrom wrote:
> On Sunday 17 October 2010 08:47:31 Simon Horman wrote:
> > On Fri, Oct 08, 2010 at 01:16:36PM +0200, Hans Schillstrom wrote:
> > > This patch series adds network name space (netns) support to the LVS.
> > > 
> > > REVISION
> > > 
> > > This is version 1
> > > 
> > > OVERVIEW
> > > 
> > > The patch doesn't remove or add any functionality except for netns.
> > > For users that don't use network name space (netns) this patch is
> > > completely transparent.
> > > 
> > > No it's possible to run LVS in a Linux container (see lxc-tools)
> > > i.e.  a light weight virtualization. For example it's possible to run
> > > one or several lvs on a real server in their own network name spaces.
> > > >From the LVS point of view it looks like it runs on it's own machine.
> > > 
> > > IMPLEMENTATION
> > > Basic requirements for netns awareness
> > >  - Global variables has to be moved to dyn. allocated memory.
> > > 
> > > Most global variables now resides in a struct ipvs { } in netns/ip_vs.h.
> > > What is moved and what is not ?
> > > 
> > > Some cache aligned locks are still in global, module init params and some debug_level.
> > > 
> > > Algorithm files they are untouched.
> > > 
> > > QUESTIONS
> > > Drop rate in ip_vs_ctl per netns or grand total ?
> 
> This is a tricky one (I think), 
> if the interface is shared with root name-space and/or other name-spaces
>  - use grand total
> if it's an "own interface"  
>  - drop rate can/should be in netns...

I hadn't thought about shared devices - yes that is tricky.

> > My gut-feeling is that per netns makes more sense.
> > 
> > > Should more lock variables be moved (or less) ?
> > 
> > I'm unsure what you are asking here but I will make a general statement
> > about locking in IPVS: it needs work.
> 
> Some locks still resides as global variables, and others in netns_ipvs struct.
> Since you have a lot of experience with IPVS locks, 
> you might have ideas what to move and what to not move.

My basic thought is that locks tend to either related to a connection
or the configuration of a service. And it seems to me that if you
have a per-namespace connection hash table then both of these categories
of locks are good candidates to be made per-namespace.

Do you have any particular locks that you are worried about?

> > > PATCH SET
> > > This patch set is based upon net-next-2.6 (2.6.36-rc3) from 4 oct 2010
> > > and [patch v4] ipvs: IPv6 tunnel mode
> > > 
> > > Note: ip_vs_xmit.c will not work without "[patch v4] ipvs: IPv6 tunnel mode"
> > 
> > Unfortunately the patches don't apply with the persistence engine
> > patches which were recently merged into nf-next-2.6 (although
> > "[patch v4.1 ]ipvs: IPv6 tunnel mode" is still unmerged).
> > 
> I do have a patch based on the nf-next without the SIP/PE patch
> 
> > I'm happy to work with you to make the required changes there.
> 
> I would appreciate that.

No problem. I am a bit busy this week as I am attending the Netfilter
Workshop. But I will try to find some time to rebase your changes soon.

> > (I realise those patches weren't merged when you made your post.
> >  But regardless, either your or me will need to update the patches).
> > 
> > Another issue is that your patches seem to be split in a way
> > where the build breaks along the way. E.g. after applying
> > patch 1, the build breaks. Could you please split things up
> > in a manner such that this doesn't happen. The reason being
> > that it breaks bisection.
> > 
> Hmm, Daniel also pointed at this,
> The Patch is quite large, and will become even larger with pe and sip.
> My Idea was to review the patch in pieces and put it together in one or two large patches when submitting it.
> I don't know that might be a stupid ?
> It's hard to break it up, making the code reentrant causes changes every where.
> 
> Daniel L, had another approach break it into many many tiny patches.

I would prefer the tiny patch approach.

> > Lastly, could you provide a unique subject for each patch.
> > I know its a bit tedious, but it does make a difference when
> > browsing the changelog.
> > 
> Yepp, no problem

^ permalink raw reply

* Re: [PATCH 1/4] net - Add AF_ALG macros
From: David Miller @ 2010-10-20  9:01 UTC (permalink / raw)
  To: herbert; +Cc: linux-crypto, netdev, linux-kernel
In-Reply-To: <E1P8CVt-0003Yv-Ml@gondolin.me.apana.org.au>

From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 19 Oct 2010 21:46:01 +0800

> net - Add AF_ALG macros
> 
> This patch adds the socket family/level macros for the yet-to-be-born
> AF_ALG family.  The AF_ALG family provides the user-space interface
> for the kernel crypto API.
> 
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Acked-by: David S. Miller <davem@davemloft.net>

^ permalink raw reply

* Re: pull request: wireless-next-2.6 2010-10-15
From: David Miller @ 2010-10-20  9:00 UTC (permalink / raw)
  To: linville; +Cc: linux-wireless, netdev
In-Reply-To: <20101015204048.GC2438@tuxdriver.com>

From: "John W. Linville" <linville@tuxdriver.com>
Date: Fri, 15 Oct 2010 16:40:50 -0400

> Here is the latest batch of updates intended for 2.6.37.  As usual,
> it is primarily a bunch of driver updates and fixes to code already
> in -next.  This also includes a batch of Bluetooth updates courtesy
> of Gustavo Padovan.  There is also the movement of the wl1251 driver
> out of the wl12xx directory.  This is a prelude to the expansion of
> the wl1271 code to cover some new hardware, all of which is actually
> largely unrelated to wl1251.

Pulled, thanks John.

^ permalink raw reply

* Re: [PATCH] phonet: remove the unused variable pn
From: David Miller @ 2010-10-20  8:56 UTC (permalink / raw)
  To: xiaosuo; +Cc: remi.denis-courmont, netdev
In-Reply-To: <1287561063-24548-1-git-send-email-xiaosuo@gmail.com>

From: Changli Gao <xiaosuo@gmail.com>
Date: Wed, 20 Oct 2010 15:51:03 +0800

> Signed-off-by: Changli Gao <xiaosuo@gmail.com>

Applied, thanks.

^ permalink raw reply

* [v3 RFC PATCH 4/4] qemu changes
From: Krishna Kumar @ 2010-10-20  8:55 UTC (permalink / raw)
  To: rusty, davem, mst
  Cc: eric.dumazet, kvm, netdev, arnd, avi, anthony, Krishna Kumar
In-Reply-To: <20101020085452.15579.76002.sendpatchset@krkumar2.in.ibm.com>

Changes in qemu to support mq TX.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
---     
 hw/vhost.c      |    7 ++++--
 hw/vhost.h      |    2 -
 hw/vhost_net.c  |   16 +++++++++----
 hw/vhost_net.h  |    2 -
 hw/virtio-net.c |   53 ++++++++++++++++++++++++++++++++--------------
 hw/virtio-net.h |    2 +
 hw/virtio-pci.c |    2 +
 net.c           |   17 ++++++++++++++
 net.h           |    1 
 net/tap.c       |   34 ++++++++++++++++++++++++++---
 10 files changed, 107 insertions(+), 29 deletions(-)

diff -ruNp org3/hw/vhost.c new3/hw/vhost.c
--- org3/hw/vhost.c	2010-10-19 19:38:11.000000000 +0530
+++ new3/hw/vhost.c	2010-10-20 12:44:21.000000000 +0530
@@ -580,7 +580,7 @@ static void vhost_virtqueue_cleanup(stru
                               0, virtio_queue_get_desc_size(vdev, idx));
 }
 
-int vhost_dev_init(struct vhost_dev *hdev, int devfd)
+int vhost_dev_init(struct vhost_dev *hdev, int devfd, int numtxqs)
 {
     uint64_t features;
     int r;
@@ -592,11 +592,14 @@ int vhost_dev_init(struct vhost_dev *hde
             return -errno;
         }
     }
-    r = ioctl(hdev->control, VHOST_SET_OWNER, NULL);
+
+    r = ioctl(hdev->control, VHOST_SET_OWNER, numtxqs);
     if (r < 0) {
         goto fail;
     }
 
+    hdev->nvqs = numtxqs + 1;
+
     r = ioctl(hdev->control, VHOST_GET_FEATURES, &features);
     if (r < 0) {
         goto fail;
diff -ruNp org3/hw/vhost.h new3/hw/vhost.h
--- org3/hw/vhost.h	2010-07-01 11:42:09.000000000 +0530
+++ new3/hw/vhost.h	2010-10-20 12:47:10.000000000 +0530
@@ -40,7 +40,7 @@ struct vhost_dev {
     unsigned long long log_size;
 };
 
-int vhost_dev_init(struct vhost_dev *hdev, int devfd);
+int vhost_dev_init(struct vhost_dev *hdev, int devfd, int numtxqs);
 void vhost_dev_cleanup(struct vhost_dev *hdev);
 int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev);
 void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev);
diff -ruNp org3/hw/vhost_net.c new3/hw/vhost_net.c
--- org3/hw/vhost_net.c	2010-09-28 10:07:31.000000000 +0530
+++ new3/hw/vhost_net.c	2010-10-19 19:46:52.000000000 +0530
@@ -36,7 +36,8 @@
 
 struct vhost_net {
     struct vhost_dev dev;
-    struct vhost_virtqueue vqs[2];
+    struct vhost_virtqueue *vqs;
+    int nvqs;
     int backend;
     VLANClientState *vc;
 };
@@ -81,7 +82,8 @@ static int vhost_net_get_fd(VLANClientSt
     }
 }
 
-struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd)
+struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd,
+				 int numtxqs)
 {
     int r;
     struct vhost_net *net = qemu_malloc(sizeof *net);
@@ -98,10 +100,14 @@ struct vhost_net *vhost_net_init(VLANCli
         (1 << VHOST_NET_F_VIRTIO_NET_HDR);
     net->backend = r;
 
-    r = vhost_dev_init(&net->dev, devfd);
+    r = vhost_dev_init(&net->dev, devfd, numtxqs);
     if (r < 0) {
         goto fail;
     }
+
+    net->nvqs = numtxqs + 1;
+    net->vqs = qemu_malloc(net->nvqs * (sizeof *net->vqs));
+
     if (!tap_has_vnet_hdr_len(backend,
                               sizeof(struct virtio_net_hdr_mrg_rxbuf))) {
         net->dev.features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF);
@@ -131,7 +137,6 @@ int vhost_net_start(struct vhost_net *ne
                              sizeof(struct virtio_net_hdr_mrg_rxbuf));
     }
 
-    net->dev.nvqs = 2;
     net->dev.vqs = net->vqs;
     r = vhost_dev_start(&net->dev, dev);
     if (r < 0) {
@@ -188,7 +193,8 @@ void vhost_net_cleanup(struct vhost_net 
     qemu_free(net);
 }
 #else
-struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd)
+struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd,
+				 int nvqs)
 {
 	return NULL;
 }
diff -ruNp org3/hw/vhost_net.h new3/hw/vhost_net.h
--- org3/hw/vhost_net.h	2010-07-01 11:42:09.000000000 +0530
+++ new3/hw/vhost_net.h	2010-10-19 19:46:52.000000000 +0530
@@ -6,7 +6,7 @@
 struct vhost_net;
 typedef struct vhost_net VHostNetState;
 
-VHostNetState *vhost_net_init(VLANClientState *backend, int devfd);
+VHostNetState *vhost_net_init(VLANClientState *backend, int devfd, int nvqs);
 
 int vhost_net_start(VHostNetState *net, VirtIODevice *dev);
 void vhost_net_stop(VHostNetState *net, VirtIODevice *dev);
diff -ruNp org3/hw/virtio-net.c new3/hw/virtio-net.c
--- org3/hw/virtio-net.c	2010-10-19 19:38:11.000000000 +0530
+++ new3/hw/virtio-net.c	2010-10-19 21:02:33.000000000 +0530
@@ -32,7 +32,7 @@ typedef struct VirtIONet
     uint8_t mac[ETH_ALEN];
     uint16_t status;
     VirtQueue *rx_vq;
-    VirtQueue *tx_vq;
+    VirtQueue **tx_vq;
     VirtQueue *ctrl_vq;
     NICState *nic;
     QEMUTimer *tx_timer;
@@ -65,6 +65,7 @@ typedef struct VirtIONet
     } mac_table;
     uint32_t *vlans;
     DeviceState *qdev;
+    uint16_t numtxqs;
 } VirtIONet;
 
 /* TODO
@@ -82,6 +83,7 @@ static void virtio_net_get_config(VirtIO
     struct virtio_net_config netcfg;
 
     netcfg.status = n->status;
+    netcfg.numtxqs = n->numtxqs;
     memcpy(netcfg.mac, n->mac, ETH_ALEN);
     memcpy(config, &netcfg, sizeof(netcfg));
 }
@@ -196,6 +198,8 @@ static uint32_t virtio_net_get_features(
     VirtIONet *n = to_virtio_net(vdev);
 
     features |= (1 << VIRTIO_NET_F_MAC);
+    if (n->numtxqs > 1)
+        features |= (1 << VIRTIO_NET_F_NUMTXQS);
 
     if (peer_has_vnet_hdr(n)) {
         tap_using_vnet_hdr(n->nic->nc.peer, 1);
@@ -659,13 +663,16 @@ static void virtio_net_tx_complete(VLANC
 {
     VirtIONet *n = DO_UPCAST(NICState, nc, nc)->opaque;
 
-    virtqueue_push(n->tx_vq, &n->async_tx.elem, n->async_tx.len);
-    virtio_notify(&n->vdev, n->tx_vq);
+    /*
+     * If this function executes, we are single TX and hence use only txq[0]
+     */
+    virtqueue_push(n->tx_vq[0], &n->async_tx.elem, n->async_tx.len);
+    virtio_notify(&n->vdev, n->tx_vq[0]);
 
     n->async_tx.elem.out_num = n->async_tx.len = 0;
 
-    virtio_queue_set_notification(n->tx_vq, 1);
-    virtio_net_flush_tx(n, n->tx_vq);
+    virtio_queue_set_notification(n->tx_vq[0], 1);
+    virtio_net_flush_tx(n, n->tx_vq[0]);
 }
 
 /* TX */
@@ -679,7 +686,7 @@ static int32_t virtio_net_flush_tx(VirtI
     }
 
     if (n->async_tx.elem.out_num) {
-        virtio_queue_set_notification(n->tx_vq, 0);
+        virtio_queue_set_notification(n->tx_vq[0], 0);
         return num_packets;
     }
 
@@ -714,7 +721,7 @@ static int32_t virtio_net_flush_tx(VirtI
         ret = qemu_sendv_packet_async(&n->nic->nc, out_sg, out_num,
                                       virtio_net_tx_complete);
         if (ret == 0) {
-            virtio_queue_set_notification(n->tx_vq, 0);
+            virtio_queue_set_notification(n->tx_vq[0], 0);
             n->async_tx.elem = elem;
             n->async_tx.len  = len;
             return -EBUSY;
@@ -771,8 +778,8 @@ static void virtio_net_tx_timer(void *op
     if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK))
         return;
 
-    virtio_queue_set_notification(n->tx_vq, 1);
-    virtio_net_flush_tx(n, n->tx_vq);
+    virtio_queue_set_notification(n->tx_vq[0], 1);
+    virtio_net_flush_tx(n, n->tx_vq[0]);
 }
 
 static void virtio_net_tx_bh(void *opaque)
@@ -786,7 +793,7 @@ static void virtio_net_tx_bh(void *opaqu
     if (unlikely(!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)))
         return;
 
-    ret = virtio_net_flush_tx(n, n->tx_vq);
+    ret = virtio_net_flush_tx(n, n->tx_vq[0]);
     if (ret == -EBUSY) {
         return; /* Notification re-enable handled by tx_complete */
     }
@@ -802,9 +809,9 @@ static void virtio_net_tx_bh(void *opaqu
     /* If less than a full burst, re-enable notification and flush
      * anything that may have come in while we weren't looking.  If
      * we find something, assume the guest is still active and reschedule */
-    virtio_queue_set_notification(n->tx_vq, 1);
-    if (virtio_net_flush_tx(n, n->tx_vq) > 0) {
-        virtio_queue_set_notification(n->tx_vq, 0);
+    virtio_queue_set_notification(n->tx_vq[0], 1);
+    if (virtio_net_flush_tx(n, n->tx_vq[0]) > 0) {
+        virtio_queue_set_notification(n->tx_vq[0], 0);
         qemu_bh_schedule(n->tx_bh);
         n->tx_waiting = 1;
     }
@@ -820,6 +827,7 @@ static void virtio_net_save(QEMUFile *f,
     virtio_save(&n->vdev, f);
 
     qemu_put_buffer(f, n->mac, ETH_ALEN);
+    qemu_put_be16(f, n->numtxqs);
     qemu_put_be32(f, n->tx_waiting);
     qemu_put_be32(f, n->mergeable_rx_bufs);
     qemu_put_be16(f, n->status);
@@ -849,6 +857,7 @@ static int virtio_net_load(QEMUFile *f, 
     virtio_load(&n->vdev, f);
 
     qemu_get_buffer(f, n->mac, ETH_ALEN);
+    n->numtxqs = qemu_get_be32(f);
     n->tx_waiting = qemu_get_be32(f);
     n->mergeable_rx_bufs = qemu_get_be32(f);
 
@@ -966,11 +975,14 @@ VirtIODevice *virtio_net_init(DeviceStat
                               virtio_net_conf *net)
 {
     VirtIONet *n;
+    int i;
 
     n = (VirtIONet *)virtio_common_init("virtio-net", VIRTIO_ID_NET,
                                         sizeof(struct virtio_net_config),
                                         sizeof(VirtIONet));
 
+    n->numtxqs = conf->peer->numtxqs;
+
     n->vdev.get_config = virtio_net_get_config;
     n->vdev.set_config = virtio_net_set_config;
     n->vdev.get_features = virtio_net_get_features;
@@ -978,8 +990,8 @@ VirtIODevice *virtio_net_init(DeviceStat
     n->vdev.bad_features = virtio_net_bad_features;
     n->vdev.reset = virtio_net_reset;
     n->vdev.set_status = virtio_net_set_status;
-    n->rx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_rx);
 
+    n->rx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_rx);
     if (net->tx && strcmp(net->tx, "timer") && strcmp(net->tx, "bh")) {
         fprintf(stderr, "virtio-net: "
                 "Unknown option tx=%s, valid options: \"timer\" \"bh\"\n",
@@ -987,12 +999,21 @@ VirtIODevice *virtio_net_init(DeviceStat
         fprintf(stderr, "Defaulting to \"bh\"\n");
     }
 
+    /* Allocate per tx vq's */
+    n->tx_vq = qemu_mallocz(n->numtxqs * sizeof(*n->tx_vq));
+    for (i = 0; i < n->numtxqs; i++) {
+        if (net->tx && !strcmp(net->tx, "timer")) {
+            n->tx_vq[i] = virtio_add_queue(&n->vdev, 256,
+                                           virtio_net_handle_tx_timer);
+        } else {
+            n->tx_vq[i] = virtio_add_queue(&n->vdev, 256,
+                                           virtio_net_handle_tx_bh);
+        }
+    }
     if (net->tx && !strcmp(net->tx, "timer")) {
-        n->tx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_tx_timer);
         n->tx_timer = qemu_new_timer(vm_clock, virtio_net_tx_timer, n);
         n->tx_timeout = net->txtimer;
     } else {
-        n->tx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_tx_bh);
         n->tx_bh = qemu_bh_new(virtio_net_tx_bh, n);
     }
     n->ctrl_vq = virtio_add_queue(&n->vdev, 64, virtio_net_handle_ctrl);
diff -ruNp org3/hw/virtio-net.h new3/hw/virtio-net.h
--- org3/hw/virtio-net.h	2010-09-28 10:07:31.000000000 +0530
+++ new3/hw/virtio-net.h	2010-10-19 19:46:52.000000000 +0530
@@ -44,6 +44,7 @@
 #define VIRTIO_NET_F_CTRL_RX    18      /* Control channel RX mode support */
 #define VIRTIO_NET_F_CTRL_VLAN  19      /* Control channel VLAN filtering */
 #define VIRTIO_NET_F_CTRL_RX_EXTRA 20   /* Extra RX mode control support */
+#define VIRTIO_NET_F_NUMTXQS    21      /* Supports multiple TX queues */
 
 #define VIRTIO_NET_S_LINK_UP    1       /* Link is up */
 
@@ -72,6 +73,7 @@ struct virtio_net_config
     uint8_t mac[ETH_ALEN];
     /* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */
     uint16_t status;
+    uint16_t numtxqs;	/* number of transmit queues */
 } __attribute__((packed));
 
 /* This is the first element of the scatter-gather list.  If you don't
diff -ruNp org3/hw/virtio-pci.c new3/hw/virtio-pci.c
--- org3/hw/virtio-pci.c	2010-10-19 19:38:11.000000000 +0530
+++ new3/hw/virtio-pci.c	2010-10-19 19:46:52.000000000 +0530
@@ -99,6 +99,7 @@ typedef struct {
     uint32_t addr;
     uint32_t class_code;
     uint32_t nvectors;
+    uint32_t mq;
     BlockConf block;
     NICConf nic;
     uint32_t host_features;
@@ -788,6 +789,7 @@ static PCIDeviceInfo virtio_info[] = {
         .romfile    = "pxe-virtio.bin",
         .qdev.props = (Property[]) {
             DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 3),
+	    DEFINE_PROP_UINT32("mq", VirtIOPCIProxy, mq, 1),
             DEFINE_VIRTIO_NET_FEATURES(VirtIOPCIProxy, host_features),
             DEFINE_NIC_PROPERTIES(VirtIOPCIProxy, nic),
             DEFINE_PROP_UINT32("x-txtimer", VirtIOPCIProxy,
diff -ruNp org3/net/tap.c new3/net/tap.c
--- org3/net/tap.c	2010-09-28 10:07:31.000000000 +0530
+++ new3/net/tap.c	2010-10-20 12:39:56.000000000 +0530
@@ -320,13 +320,14 @@ static NetClientInfo net_tap_info = {
 static TAPState *net_tap_fd_init(VLANState *vlan,
                                  const char *model,
                                  const char *name,
-                                 int fd,
+                                 int fd, int numtxqs,
                                  int vnet_hdr)
 {
     VLANClientState *nc;
     TAPState *s;
 
     nc = qemu_new_net_client(&net_tap_info, vlan, NULL, model, name);
+    nc->numtxqs = numtxqs;
 
     s = DO_UPCAST(TAPState, nc, nc);
 
@@ -424,6 +425,27 @@ int net_init_tap(QemuOpts *opts, Monitor
 {
     TAPState *s;
     int fd, vnet_hdr = 0;
+    int vhost;
+    int numtxqs = 1;
+
+    vhost = qemu_opt_get_bool(opts, "vhost", 0);
+
+    /*
+     * We support multiple tx queues if:
+     *      1. smp > 1
+     *      2. vhost=on
+     *      3. mq=on
+     * In this case, #txqueues = #cpus. This value can be changed by
+     * using the "numtxqs" option.
+     */
+    if (vhost && smp_cpus > 1) {
+        if (qemu_opt_get_bool(opts, "mq", 0)) {
+#define VIRTIO_MAX_TXQS         32
+            int dflt = MIN(smp_cpus, VIRTIO_MAX_TXQS);
+
+            numtxqs = qemu_opt_get_number(opts, "numtxqs", dflt);
+        }
+    }
 
     if (qemu_opt_get(opts, "fd")) {
         if (qemu_opt_get(opts, "ifname") ||
@@ -457,7 +479,7 @@ int net_init_tap(QemuOpts *opts, Monitor
         }
     }
 
-    s = net_tap_fd_init(vlan, "tap", name, fd, vnet_hdr);
+    s = net_tap_fd_init(vlan, "tap", name, fd, numtxqs, vnet_hdr);
     if (!s) {
         close(fd);
         return -1;
@@ -486,7 +508,7 @@ int net_init_tap(QemuOpts *opts, Monitor
         }
     }
 
-    if (qemu_opt_get_bool(opts, "vhost", !!qemu_opt_get(opts, "vhostfd"))) {
+    if (vhost) {
         int vhostfd, r;
         if (qemu_opt_get(opts, "vhostfd")) {
             r = net_handle_fd_param(mon, qemu_opt_get(opts, "vhostfd"));
@@ -497,9 +519,13 @@ int net_init_tap(QemuOpts *opts, Monitor
         } else {
             vhostfd = -1;
         }
-        s->vhost_net = vhost_net_init(&s->nc, vhostfd);
+        s->vhost_net = vhost_net_init(&s->nc, vhostfd, numtxqs);
         if (!s->vhost_net) {
             error_report("vhost-net requested but could not be initialized");
+            if (numtxqs > 1) {
+                error_report("Need vhost support for numtxqs > 1, exiting...");
+                exit(1);
+            }
             return -1;
         }
     } else if (qemu_opt_get(opts, "vhostfd")) {
diff -ruNp org3/net.c new3/net.c
--- org3/net.c	2010-10-19 19:38:11.000000000 +0530
+++ new3/net.c	2010-10-19 19:46:52.000000000 +0530
@@ -849,6 +849,15 @@ static int net_init_nic(QemuOpts *opts,
         return -1;
     }
 
+    if (nd->netdev->numtxqs > 1 && nd->nvectors == DEV_NVECTORS_UNSPECIFIED) {
+        /*
+         * User specified mq for guest, but no "vectors=", tune
+         * it automatically to 'numtxqs' TX + 1 RX + 1 controlq.
+         */
+        nd->nvectors = nd->netdev->numtxqs + 1 + 1;
+        monitor_printf(mon, "nvectors tuned to %d\n", nd->nvectors);
+    }
+
     nd->used = 1;
     nb_nics++;
 
@@ -992,6 +1001,14 @@ static const struct {
             },
 #ifndef _WIN32
             {
+                .name = "mq",
+                .type = QEMU_OPT_BOOL,
+                .help = "enable multiqueue on network i/f",
+            }, {
+                .name = "numtxqs",
+                .type = QEMU_OPT_NUMBER,
+                .help = "optional number of TX queues, if mq is enabled",
+            }, {
                 .name = "fd",
                 .type = QEMU_OPT_STRING,
                 .help = "file descriptor of an already opened tap",
diff -ruNp org3/net.h new3/net.h
--- org3/net.h	2010-10-19 19:38:11.000000000 +0530
+++ new3/net.h	2010-10-19 19:46:52.000000000 +0530
@@ -62,6 +62,7 @@ struct VLANClientState {
     struct VLANState *vlan;
     VLANClientState *peer;
     NetQueue *send_queue;
+    int numtxqs;
     char *model;
     char *name;
     char info_str[256];

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox