Netdev List
 help / color / mirror / Atom feed
* [PATCH 47/84] netfilter: xtables: shorten up return clause
From: kaber @ 2010-05-10 20:18 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1273522735-24672-1-git-send-email-kaber@trash.net>

From: Jan Engelhardt <jengelh@medozas.de>

The return value of nf_ct_l3proto_get can directly be returned even in
the case of success.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c |    7 ++-----
 net/netfilter/xt_CONNSECMARK.c     |    6 ++----
 net/netfilter/xt_connbytes.c       |    7 ++-----
 net/netfilter/xt_connmark.c        |   12 ++++--------
 net/netfilter/xt_conntrack.c       |    6 ++----
 net/netfilter/xt_state.c           |    6 ++----
 6 files changed, 14 insertions(+), 30 deletions(-)

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 1faf5fa..5d70c43 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -403,13 +403,10 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
 	cipinfo->config = config;
 
 	ret = nf_ct_l3proto_try_module_get(par->family);
-	if (ret < 0) {
+	if (ret < 0)
 		pr_info("cannot load conntrack support for proto=%u\n",
 			par->family);
-		return ret;
-	}
-
-	return 0;
+	return ret;
 }
 
 /* drop reference count of cluster config when rule is deleted */
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c
index 105a62e..e953e30 100644
--- a/net/netfilter/xt_CONNSECMARK.c
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -107,12 +107,10 @@ static int connsecmark_tg_check(const struct xt_tgchk_param *par)
 	}
 
 	ret = nf_ct_l3proto_try_module_get(par->family);
-	if (ret < 0) {
+	if (ret < 0)
 		pr_info("cannot load conntrack support for proto=%u\n",
 			par->family);
-		return ret;
-	}
-	return 0;
+	return ret;
 }
 
 static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par)
diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c
index 2ff332e..ff738a5 100644
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -109,13 +109,10 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par)
 		return -EINVAL;
 
 	ret = nf_ct_l3proto_try_module_get(par->family);
-	if (ret < 0) {
+	if (ret < 0)
 		pr_info("cannot load conntrack support for proto=%u\n",
 			par->family);
-		return ret;
-	}
-
-	return 0;
+	return ret;
 }
 
 static void connbytes_mt_destroy(const struct xt_mtdtor_param *par)
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index 71e38a1..ae10154 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -79,12 +79,10 @@ static int connmark_tg_check(const struct xt_tgchk_param *par)
 	int ret;
 
 	ret = nf_ct_l3proto_try_module_get(par->family);
-	if (ret < 0) {
+	if (ret < 0)
 		pr_info("cannot load conntrack support for proto=%u\n",
 			par->family);
-		return ret;
-	}
-	return 0;
+	return ret;
 }
 
 static void connmark_tg_destroy(const struct xt_tgdtor_param *par)
@@ -111,12 +109,10 @@ static int connmark_mt_check(const struct xt_mtchk_param *par)
 	int ret;
 
 	ret = nf_ct_l3proto_try_module_get(par->family);
-	if (ret < 0) {
+	if (ret < 0)
 		pr_info("cannot load conntrack support for proto=%u\n",
 			par->family);
-		return ret;
-	}
-	return 0;
+	return ret;
 }
 
 static void connmark_mt_destroy(const struct xt_mtdtor_param *par)
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index e0bcf8d..3348706 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -211,12 +211,10 @@ static int conntrack_mt_check(const struct xt_mtchk_param *par)
 	int ret;
 
 	ret = nf_ct_l3proto_try_module_get(par->family);
-	if (ret < 0) {
+	if (ret < 0)
 		pr_info("cannot load conntrack support for proto=%u\n",
 			par->family);
-		return ret;
-	}
-	return 0;
+	return ret;
 }
 
 static void conntrack_mt_destroy(const struct xt_mtdtor_param *par)
diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c
index 2b75230..be00d7b 100644
--- a/net/netfilter/xt_state.c
+++ b/net/netfilter/xt_state.c
@@ -42,12 +42,10 @@ static int state_mt_check(const struct xt_mtchk_param *par)
 	int ret;
 
 	ret = nf_ct_l3proto_try_module_get(par->family);
-	if (ret < 0) {
+	if (ret < 0)
 		pr_info("cannot load conntrack support for proto=%u\n",
 			par->family);
-		return ret;
-	}
-	return 0;
+	return ret;
 }
 
 static void state_mt_destroy(const struct xt_mtdtor_param *par)
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 49/84] netfilter: xtables: remove xt_multiport revision 0
From: kaber @ 2010-05-10 20:18 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1273522735-24672-1-git-send-email-kaber@trash.net>

From: Jan Engelhardt <jengelh@medozas.de>

Superseded by xt_multiport revision 1 (introduction already predates
linux.git).

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
---
 net/netfilter/xt_multiport.c |   77 ------------------------------------------
 1 files changed, 0 insertions(+), 77 deletions(-)

diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c
index b446738..83b77ce 100644
--- a/net/netfilter/xt_multiport.c
+++ b/net/netfilter/xt_multiport.c
@@ -28,23 +28,6 @@ MODULE_ALIAS("ip6t_multiport");
 
 /* Returns 1 if the port is matched by the test, 0 otherwise. */
 static inline bool
-ports_match_v0(const u_int16_t *portlist, enum xt_multiport_flags flags,
-	       u_int8_t count, u_int16_t src, u_int16_t dst)
-{
-	unsigned int i;
-	for (i = 0; i < count; i++) {
-		if (flags != XT_MULTIPORT_DESTINATION && portlist[i] == src)
-			return true;
-
-		if (flags != XT_MULTIPORT_SOURCE && portlist[i] == dst)
-			return true;
-	}
-
-	return false;
-}
-
-/* Returns 1 if the port is matched by the test, 0 otherwise. */
-static inline bool
 ports_match_v1(const struct xt_multiport_v1 *minfo,
 	       u_int16_t src, u_int16_t dst)
 {
@@ -89,30 +72,6 @@ ports_match_v1(const struct xt_multiport_v1 *minfo,
 }
 
 static bool
-multiport_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par)
-{
-	const __be16 *pptr;
-	__be16 _ports[2];
-	const struct xt_multiport *multiinfo = par->matchinfo;
-
-	if (par->fragoff != 0)
-		return false;
-
-	pptr = skb_header_pointer(skb, par->thoff, sizeof(_ports), _ports);
-	if (pptr == NULL) {
-		/* We've been asked to examine this packet, and we
-		 * can't.  Hence, no choice but to drop.
-		 */
-		pr_debug("Dropping evil offset=0 tinygram.\n");
-		*par->hotdrop = true;
-		return false;
-	}
-
-	return ports_match_v0(multiinfo->ports, multiinfo->flags,
-	       multiinfo->count, ntohs(pptr[0]), ntohs(pptr[1]));
-}
-
-static bool
 multiport_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
 	const __be16 *pptr;
@@ -152,15 +111,6 @@ check(u_int16_t proto,
 		&& count <= XT_MULTI_PORTS;
 }
 
-static int multiport_mt_check_v0(const struct xt_mtchk_param *par)
-{
-	const struct ipt_ip *ip = par->entryinfo;
-	const struct xt_multiport *multiinfo = par->matchinfo;
-
-	return check(ip->proto, ip->invflags, multiinfo->flags,
-		     multiinfo->count);
-}
-
 static int multiport_mt_check(const struct xt_mtchk_param *par)
 {
 	const struct ipt_ip *ip = par->entryinfo;
@@ -170,15 +120,6 @@ static int multiport_mt_check(const struct xt_mtchk_param *par)
 		     multiinfo->count);
 }
 
-static int multiport_mt6_check_v0(const struct xt_mtchk_param *par)
-{
-	const struct ip6t_ip6 *ip = par->entryinfo;
-	const struct xt_multiport *multiinfo = par->matchinfo;
-
-	return check(ip->proto, ip->invflags, multiinfo->flags,
-		     multiinfo->count);
-}
-
 static int multiport_mt6_check(const struct xt_mtchk_param *par)
 {
 	const struct ip6t_ip6 *ip = par->entryinfo;
@@ -192,15 +133,6 @@ static struct xt_match multiport_mt_reg[] __read_mostly = {
 	{
 		.name		= "multiport",
 		.family		= NFPROTO_IPV4,
-		.revision	= 0,
-		.checkentry	= multiport_mt_check_v0,
-		.match		= multiport_mt_v0,
-		.matchsize	= sizeof(struct xt_multiport),
-		.me		= THIS_MODULE,
-	},
-	{
-		.name		= "multiport",
-		.family		= NFPROTO_IPV4,
 		.revision	= 1,
 		.checkentry	= multiport_mt_check,
 		.match		= multiport_mt,
@@ -210,15 +142,6 @@ static struct xt_match multiport_mt_reg[] __read_mostly = {
 	{
 		.name		= "multiport",
 		.family		= NFPROTO_IPV6,
-		.revision	= 0,
-		.checkentry	= multiport_mt6_check_v0,
-		.match		= multiport_mt_v0,
-		.matchsize	= sizeof(struct xt_multiport),
-		.me		= THIS_MODULE,
-	},
-	{
-		.name		= "multiport",
-		.family		= NFPROTO_IPV6,
 		.revision	= 1,
 		.checkentry	= multiport_mt6_check,
 		.match		= multiport_mt,
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 54/84] netfilter: xt_hashlimit: RCU conversion
From: kaber @ 2010-05-10 20:18 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1273522735-24672-1-git-send-email-kaber@trash.net>

From: Eric Dumazet <eric.dumazet@gmail.com>

xt_hashlimit uses a central lock per hash table and suffers from
contention on some workloads. (Multiqueue NIC or if RPS is enabled)

After RCU conversion, central lock is only used when a writer wants to
add or delete an entry.

For 'readers', updating an existing entry, they use an individual lock
per entry.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/xt_hashlimit.c |   70 ++++++++++++++++++++++++++++--------------
 1 files changed, 47 insertions(+), 23 deletions(-)

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 5470bb0..453178d 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -81,12 +81,14 @@ struct dsthash_ent {
 	struct dsthash_dst dst;
 
 	/* modified structure members in the end */
+	spinlock_t lock;
 	unsigned long expires;		/* precalculated expiry time */
 	struct {
 		unsigned long prev;	/* last modification */
 		u_int32_t credit;
 		u_int32_t credit_cap, cost;
 	} rateinfo;
+	struct rcu_head rcu;
 };
 
 struct xt_hashlimit_htable {
@@ -143,9 +145,11 @@ dsthash_find(const struct xt_hashlimit_htable *ht,
 	u_int32_t hash = hash_dst(ht, dst);
 
 	if (!hlist_empty(&ht->hash[hash])) {
-		hlist_for_each_entry(ent, pos, &ht->hash[hash], node)
-			if (dst_cmp(ent, dst))
+		hlist_for_each_entry_rcu(ent, pos, &ht->hash[hash], node)
+			if (dst_cmp(ent, dst)) {
+				spin_lock(&ent->lock);
 				return ent;
+			}
 	}
 	return NULL;
 }
@@ -157,9 +161,10 @@ dsthash_alloc_init(struct xt_hashlimit_htable *ht,
 {
 	struct dsthash_ent *ent;
 
+	spin_lock(&ht->lock);
 	/* initialize hash with random val at the time we allocate
 	 * the first hashtable entry */
-	if (!ht->rnd_initialized) {
+	if (unlikely(!ht->rnd_initialized)) {
 		get_random_bytes(&ht->rnd, sizeof(ht->rnd));
 		ht->rnd_initialized = true;
 	}
@@ -168,27 +173,36 @@ dsthash_alloc_init(struct xt_hashlimit_htable *ht,
 		/* FIXME: do something. question is what.. */
 		if (net_ratelimit())
 			pr_err("max count of %u reached\n", ht->cfg.max);
-		return NULL;
-	}
-
-	ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
+		ent = NULL;
+	} else
+		ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
 	if (!ent) {
 		if (net_ratelimit())
 			pr_err("cannot allocate dsthash_ent\n");
-		return NULL;
-	}
-	memcpy(&ent->dst, dst, sizeof(ent->dst));
+	} else {
+		memcpy(&ent->dst, dst, sizeof(ent->dst));
+		spin_lock_init(&ent->lock);
 
-	hlist_add_head(&ent->node, &ht->hash[hash_dst(ht, dst)]);
-	ht->count++;
+		spin_lock(&ent->lock);
+		hlist_add_head_rcu(&ent->node, &ht->hash[hash_dst(ht, dst)]);
+		ht->count++;
+	}
+	spin_unlock(&ht->lock);
 	return ent;
 }
 
+static void dsthash_free_rcu(struct rcu_head *head)
+{
+	struct dsthash_ent *ent = container_of(head, struct dsthash_ent, rcu);
+
+	kmem_cache_free(hashlimit_cachep, ent);
+}
+
 static inline void
 dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
 {
-	hlist_del(&ent->node);
-	kmem_cache_free(hashlimit_cachep, ent);
+	hlist_del_rcu(&ent->node);
+	call_rcu_bh(&ent->rcu, dsthash_free_rcu);
 	ht->count--;
 }
 static void htable_gc(unsigned long htlong);
@@ -512,15 +526,14 @@ hashlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
 		goto hotdrop;
 
-	spin_lock_bh(&hinfo->lock);
+	rcu_read_lock_bh();
 	dh = dsthash_find(hinfo, &dst);
 	if (dh == NULL) {
 		dh = dsthash_alloc_init(hinfo, &dst);
 		if (dh == NULL) {
-			spin_unlock_bh(&hinfo->lock);
+			rcu_read_unlock_bh();
 			goto hotdrop;
 		}
-
 		dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
 		dh->rateinfo.prev = jiffies;
 		dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
@@ -537,11 +550,13 @@ hashlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 	if (dh->rateinfo.credit >= dh->rateinfo.cost) {
 		/* below the limit */
 		dh->rateinfo.credit -= dh->rateinfo.cost;
-		spin_unlock_bh(&hinfo->lock);
+		spin_unlock(&dh->lock);
+		rcu_read_unlock_bh();
 		return !(info->cfg.mode & XT_HASHLIMIT_INVERT);
 	}
 
-	spin_unlock_bh(&hinfo->lock);
+	spin_unlock(&dh->lock);
+	rcu_read_unlock_bh();
 	/* default match is underlimit - so over the limit, we need to invert */
 	return info->cfg.mode & XT_HASHLIMIT_INVERT;
 
@@ -666,12 +681,15 @@ static void dl_seq_stop(struct seq_file *s, void *v)
 static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 				   struct seq_file *s)
 {
+	int res;
+
+	spin_lock(&ent->lock);
 	/* recalculate to show accurate numbers */
 	rateinfo_recalc(ent, jiffies);
 
 	switch (family) {
 	case NFPROTO_IPV4:
-		return seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n",
+		res = seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n",
 				 (long)(ent->expires - jiffies)/HZ,
 				 &ent->dst.ip.src,
 				 ntohs(ent->dst.src_port),
@@ -679,9 +697,10 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 				 ntohs(ent->dst.dst_port),
 				 ent->rateinfo.credit, ent->rateinfo.credit_cap,
 				 ent->rateinfo.cost);
+		break;
 #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
 	case NFPROTO_IPV6:
-		return seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
+		res = seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
 				 (long)(ent->expires - jiffies)/HZ,
 				 &ent->dst.ip6.src,
 				 ntohs(ent->dst.src_port),
@@ -689,11 +708,14 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
 				 ntohs(ent->dst.dst_port),
 				 ent->rateinfo.credit, ent->rateinfo.credit_cap,
 				 ent->rateinfo.cost);
+		break;
 #endif
 	default:
 		BUG();
-		return 0;
+		res = 0;
 	}
+	spin_unlock(&ent->lock);
+	return res;
 }
 
 static int dl_seq_show(struct seq_file *s, void *v)
@@ -817,9 +839,11 @@ err1:
 
 static void __exit hashlimit_mt_exit(void)
 {
-	kmem_cache_destroy(hashlimit_cachep);
 	xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg));
 	unregister_pernet_subsys(&hashlimit_net_ops);
+
+	rcu_barrier_bh();
+	kmem_cache_destroy(hashlimit_cachep);
 }
 
 module_init(hashlimit_mt_init);
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 56/84] netfilter: only do skb_checksum_help on CHECKSUM_PARTIAL in ip_queue
From: kaber @ 2010-05-10 20:18 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1273522735-24672-1-git-send-email-kaber@trash.net>

From: Herbert Xu <herbert@gondor.apana.org.au>

While doing yet another audit on ip_summed I noticed ip_queue
calling skb_checksum_help unnecessarily.  As we will set ip_summed
to CHECKSUM_NONE when necessary in ipq_mangle_ipv4, there is no
need to zap CHECKSUM_COMPLETE in ipq_build_packet_message.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/ip_queue.c |    3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index 2855f1f..d781513 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -160,8 +160,7 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
 		break;
 
 	case IPQ_COPY_PACKET:
-		if ((entry->skb->ip_summed == CHECKSUM_PARTIAL ||
-		     entry->skb->ip_summed == CHECKSUM_COMPLETE) &&
+		if (entry->skb->ip_summed == CHECKSUM_PARTIAL &&
 		    (*errp = skb_checksum_help(entry->skb))) {
 			read_unlock_bh(&queue_lock);
 			return NULL;
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 57/84] netfilter: only do skb_checksum_help on CHECKSUM_PARTIAL in ip6_queue
From: kaber @ 2010-05-10 20:18 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1273522735-24672-1-git-send-email-kaber@trash.net>

From: Herbert Xu <herbert@gondor.apana.org.au>

As we will set ip_summed to CHECKSUM_NONE when necessary in
ipq_mangle_ipv6, there is no need to zap CHECKSUM_COMPLETE in
ipq_build_packet_message.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv6/netfilter/ip6_queue.c |    3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c
index 7854052..39856a2 100644
--- a/net/ipv6/netfilter/ip6_queue.c
+++ b/net/ipv6/netfilter/ip6_queue.c
@@ -161,8 +161,7 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp)
 		break;
 
 	case IPQ_COPY_PACKET:
-		if ((entry->skb->ip_summed == CHECKSUM_PARTIAL ||
-		     entry->skb->ip_summed == CHECKSUM_COMPLETE) &&
+		if (entry->skb->ip_summed == CHECKSUM_PARTIAL &&
 		    (*errp = skb_checksum_help(entry->skb))) {
 			read_unlock_bh(&queue_lock);
 			return NULL;
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 62/84] netfilter: fix some coding styles and remove moduleparam.h
From: kaber @ 2010-05-10 20:18 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1273522735-24672-1-git-send-email-kaber@trash.net>

From: Zhitong Wang <zhitong.wangzt@alibaba-inc.com>

Fix some coding styles and remove moduleparam.h

Signed-off-by: Zhitong Wang <zhitong.wangzt@alibaba-inc.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/nf_nat_standalone.c |    3 +--
 net/ipv4/netfilter/nf_nat_tftp.c       |    1 -
 net/netfilter/nf_conntrack_proto.c     |    2 --
 3 files changed, 1 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index 5678e95..0b49248 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -137,9 +137,8 @@ nf_nat_fn(unsigned int hooknum,
 				ret = nf_nat_rule_find(skb, hooknum, in, out,
 						       ct);
 
-			if (ret != NF_ACCEPT) {
+			if (ret != NF_ACCEPT)
 				return ret;
-			}
 		} else
 			pr_debug("Already setup manip %s for ct %p\n",
 				 maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST",
diff --git a/net/ipv4/netfilter/nf_nat_tftp.c b/net/ipv4/netfilter/nf_nat_tftp.c
index b096e81..7274a43 100644
--- a/net/ipv4/netfilter/nf_nat_tftp.c
+++ b/net/ipv4/netfilter/nf_nat_tftp.c
@@ -6,7 +6,6 @@
  */
 
 #include <linux/module.h>
-#include <linux/moduleparam.h>
 #include <linux/udp.h>
 
 #include <net/netfilter/nf_nat_helper.h>
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 1a4568b..f71cd5d 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -13,12 +13,10 @@
 #include <linux/netfilter.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
-#include <linux/skbuff.h>
 #include <linux/vmalloc.h>
 #include <linux/stddef.h>
 #include <linux/err.h>
 #include <linux/percpu.h>
-#include <linux/moduleparam.h>
 #include <linux/notifier.h>
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 69/84] netfilter: bridge-netfilter: Fix MAC header handling with IP DNAT
From: kaber @ 2010-05-10 20:18 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1273522735-24672-1-git-send-email-kaber@trash.net>

From: Bart De Schuymer <bdschuym@pandora.be>

- fix IP DNAT on vlan- or pppoe-encapsulated traffic: The functions
neigh_hh_output() or dst->neighbour->output() overwrite the complete
Ethernet header, although we only need the destination MAC address.
For encapsulated packets, they ended up overwriting the encapsulating
header. The new code copies the Ethernet source MAC address and
protocol number before calling dst->neighbour->output(). The Ethernet
source MAC and protocol number are copied back in place in
br_nf_pre_routing_finish_bridge_slow(). This also makes the IP DNAT
more transparent because in the old scheme the source MAC of the
bridge was copied into the source address in the Ethernet header. We
also let skb->protocol equal ETH_P_IP resp. ETH_P_IPV6 during the
execution of the PF_INET resp. PF_INET6 hooks.

- Speed up IP DNAT by calling neigh_hh_bridge() instead of
neigh_hh_output(): if dst->hh is available, we already know the MAC
address so we can just copy it.

Signed-off-by: Bart De Schuymer <bdschuym@pandora.be>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter_bridge.h |    5 ++-
 include/net/neighbour.h          |   14 ++++++
 net/bridge/br_netfilter.c        |   90 +++++++++++++++++++++++++++----------
 3 files changed, 83 insertions(+), 26 deletions(-)

diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h
index ffab6c4..ea0e44b 100644
--- a/include/linux/netfilter_bridge.h
+++ b/include/linux/netfilter_bridge.h
@@ -43,7 +43,8 @@ enum nf_br_hook_priorities {
 #define BRNF_BRIDGED_DNAT		0x02
 #define BRNF_BRIDGED			0x04
 #define BRNF_NF_BRIDGE_PREROUTING	0x08
-
+#define BRNF_8021Q			0x10
+#define BRNF_PPPoE			0x20
 
 /* Only used in br_forward.c */
 extern int nf_bridge_copy_header(struct sk_buff *skb);
@@ -75,6 +76,8 @@ static inline int br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
 
 	skb_pull(skb, ETH_HLEN);
 	nf_bridge->mask ^= BRNF_BRIDGED_DNAT;
+	skb_copy_to_linear_data_offset(skb, -(ETH_HLEN-ETH_ALEN),
+				       skb->nf_bridge->data, ETH_HLEN-ETH_ALEN);
 	skb->dev = nf_bridge->physindev;
 	return br_handle_frame_finish(skb);
 }
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index da1d58b..eb21340 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -299,6 +299,20 @@ static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
 	return 0;
 }
 
+#ifdef CONFIG_BRIDGE_NETFILTER
+static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb)
+{
+	unsigned seq, hh_alen;
+
+	do {
+		seq = read_seqbegin(&hh->hh_lock);
+		hh_alen = HH_DATA_ALIGN(ETH_HLEN);
+		memcpy(skb->data - hh_alen, hh->hh_data, ETH_ALEN + hh_alen - ETH_HLEN);
+	} while (read_seqretry(&hh->hh_lock, seq));
+	return 0;
+}
+#endif
+
 static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
 {
 	unsigned seq;
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 05dc630..b7e405d 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -196,15 +196,24 @@ static inline void nf_bridge_save_header(struct sk_buff *skb)
 					 skb->nf_bridge->data, header_size);
 }
 
-/*
- * When forwarding bridge frames, we save a copy of the original
- * header before processing.
+static inline void nf_bridge_update_protocol(struct sk_buff *skb)
+{
+	if (skb->nf_bridge->mask & BRNF_8021Q)
+		skb->protocol = htons(ETH_P_8021Q);
+	else if (skb->nf_bridge->mask & BRNF_PPPoE)
+		skb->protocol = htons(ETH_P_PPP_SES);
+}
+
+/* Fill in the header for fragmented IP packets handled by
+ * the IPv4 connection tracking code.
  */
 int nf_bridge_copy_header(struct sk_buff *skb)
 {
 	int err;
-	int header_size = ETH_HLEN + nf_bridge_encap_header_len(skb);
+	unsigned int header_size;
 
+	nf_bridge_update_protocol(skb);
+	header_size = ETH_HLEN + nf_bridge_encap_header_len(skb);
 	err = skb_cow_head(skb, header_size);
 	if (err)
 		return err;
@@ -238,6 +247,7 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
 	skb_dst_set(skb, &rt->u.dst);
 
 	skb->dev = nf_bridge->physindev;
+	nf_bridge_update_protocol(skb);
 	nf_bridge_push_encap_header(skb);
 	NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
 		       br_handle_frame_finish, 1);
@@ -245,6 +255,38 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
 	return 0;
 }
 
+/* Obtain the correct destination MAC address, while preserving the original
+ * source MAC address. If we already know this address, we just copy it. If we
+ * don't, we use the neighbour framework to find out. In both cases, we make
+ * sure that br_handle_frame_finish() is called afterwards.
+ */
+static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb)
+{
+	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+	struct dst_entry *dst;
+
+	skb->dev = bridge_parent(skb->dev);
+	if (!skb->dev)
+		goto free_skb;
+	dst = skb_dst(skb);
+	if (dst->hh) {
+		neigh_hh_bridge(dst->hh, skb);
+		skb->dev = nf_bridge->physindev;
+		return br_handle_frame_finish(skb);
+	} else if (dst->neighbour) {
+		/* the neighbour function below overwrites the complete
+		 * MAC header, so we save the Ethernet source address and
+		 * protocol number. */
+		skb_copy_from_linear_data_offset(skb, -(ETH_HLEN-ETH_ALEN), skb->nf_bridge->data, ETH_HLEN-ETH_ALEN);
+		/* tell br_dev_xmit to continue with forwarding */
+		nf_bridge->mask |= BRNF_BRIDGED_DNAT;
+		return dst->neighbour->output(skb);
+	}
+free_skb:
+	kfree_skb(skb);
+	return 0;
+}
+
 /* This requires some explaining. If DNAT has taken place,
  * we will need to fix up the destination Ethernet address.
  *
@@ -283,25 +325,6 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
  * device, we proceed as if ip_route_input() succeeded. If it differs from the
  * logical bridge port or if ip_route_output_key() fails we drop the packet.
  */
-
-static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb)
-{
-	skb->dev = bridge_parent(skb->dev);
-	if (skb->dev) {
-		struct dst_entry *dst = skb_dst(skb);
-
-		nf_bridge_pull_encap_header(skb);
-		skb->nf_bridge->mask |= BRNF_BRIDGED_DNAT;
-
-		if (dst->hh)
-			return neigh_hh_output(dst->hh, skb);
-		else if (dst->neighbour)
-			return dst->neighbour->output(skb);
-	}
-	kfree_skb(skb);
-	return 0;
-}
-
 static int br_nf_pre_routing_finish(struct sk_buff *skb)
 {
 	struct net_device *dev = skb->dev;
@@ -354,6 +377,7 @@ free_skb:
 			if (skb_dst(skb)->dev == dev) {
 bridged_dnat:
 				skb->dev = nf_bridge->physindev;
+				nf_bridge_update_protocol(skb);
 				nf_bridge_push_encap_header(skb);
 				NF_HOOK_THRESH(NFPROTO_BRIDGE,
 					       NF_BR_PRE_ROUTING,
@@ -376,6 +400,7 @@ bridged_dnat:
 	}
 
 	skb->dev = nf_bridge->physindev;
+	nf_bridge_update_protocol(skb);
 	nf_bridge_push_encap_header(skb);
 	NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
 		       br_handle_frame_finish, 1);
@@ -396,6 +421,10 @@ static struct net_device *setup_pre_routing(struct sk_buff *skb)
 	nf_bridge->mask |= BRNF_NF_BRIDGE_PREROUTING;
 	nf_bridge->physindev = skb->dev;
 	skb->dev = bridge_parent(skb->dev);
+	if (skb->protocol == htons(ETH_P_8021Q))
+		nf_bridge->mask |= BRNF_8021Q;
+	else if (skb->protocol == htons(ETH_P_PPP_SES))
+		nf_bridge->mask |= BRNF_PPPoE;
 
 	return skb->dev;
 }
@@ -494,6 +523,7 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook,
 	if (!setup_pre_routing(skb))
 		return NF_DROP;
 
+	skb->protocol = htons(ETH_P_IPV6);
 	NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, skb->dev, NULL,
 		br_nf_pre_routing_finish_ipv6);
 
@@ -566,6 +596,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
 	if (!setup_pre_routing(skb))
 		return NF_DROP;
 	store_orig_dstaddr(skb);
+	skb->protocol = htons(ETH_P_IP);
 
 	NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, skb->dev, NULL,
 		br_nf_pre_routing_finish);
@@ -614,7 +645,9 @@ static int br_nf_forward_finish(struct sk_buff *skb)
 	} else {
 		in = *((struct net_device **)(skb->cb));
 	}
+	nf_bridge_update_protocol(skb);
 	nf_bridge_push_encap_header(skb);
+
 	NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_FORWARD, skb, in,
 		       skb->dev, br_forward_finish, 1);
 	return 0;
@@ -666,6 +699,10 @@ static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff *skb,
 	/* The physdev module checks on this */
 	nf_bridge->mask |= BRNF_BRIDGED;
 	nf_bridge->physoutdev = skb->dev;
+	if (pf == PF_INET)
+		skb->protocol = htons(ETH_P_IP);
+	else
+		skb->protocol = htons(ETH_P_IPV6);
 
 	NF_HOOK(pf, NF_INET_FORWARD, skb, bridge_parent(in), parent,
 		br_nf_forward_finish);
@@ -706,8 +743,7 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff *skb,
 #if defined(CONFIG_NF_CONNTRACK_IPV4) || defined(CONFIG_NF_CONNTRACK_IPV4_MODULE)
 static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 {
-	if (skb->nfct != NULL &&
-	    (skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb)) &&
+	if (skb->nfct != NULL && skb->protocol == htons(ETH_P_IP) &&
 	    skb->len > skb->dev->mtu &&
 	    !skb_is_gso(skb))
 		return ip_fragment(skb, br_dev_queue_push_xmit);
@@ -755,6 +791,10 @@ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff *skb,
 
 	nf_bridge_pull_encap_header(skb);
 	nf_bridge_save_header(skb);
+	if (pf == PF_INET)
+		skb->protocol = htons(ETH_P_IP);
+	else
+		skb->protocol = htons(ETH_P_IPV6);
 
 	NF_HOOK(pf, NF_INET_POST_ROUTING, skb, NULL, realoutdev,
 		br_nf_dev_queue_xmit);
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 68/84] netfilter: bridge-netfilter: simplify IP DNAT
From: kaber @ 2010-05-10 20:18 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1273522735-24672-1-git-send-email-kaber@trash.net>

From: Bart De Schuymer <bdschuym@pandora.be>

Remove br_netfilter.c::br_nf_local_out(). The function
br_nf_local_out() was needed because the PF_BRIDGE::LOCAL_OUT hook
could be called when IP DNAT happens on to-be-bridged traffic. The
new scheme eliminates this mess.

Signed-off-by: Bart De Schuymer <bdschuym@pandora.be>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter_bridge.h |   17 +++++-
 net/bridge/br_device.c           |    9 +++-
 net/bridge/br_netfilter.c        |  114 ++++++--------------------------------
 3 files changed, 40 insertions(+), 100 deletions(-)

diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h
index f8105e5..ffab6c4 100644
--- a/include/linux/netfilter_bridge.h
+++ b/include/linux/netfilter_bridge.h
@@ -41,9 +41,8 @@ enum nf_br_hook_priorities {
 
 #define BRNF_PKT_TYPE			0x01
 #define BRNF_BRIDGED_DNAT		0x02
-#define BRNF_DONT_TAKE_PARENT		0x04
-#define BRNF_BRIDGED			0x08
-#define BRNF_NF_BRIDGE_PREROUTING	0x10
+#define BRNF_BRIDGED			0x04
+#define BRNF_NF_BRIDGE_PREROUTING	0x08
 
 
 /* Only used in br_forward.c */
@@ -68,6 +67,18 @@ static inline unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb)
 	}
 }
 
+extern int br_handle_frame_finish(struct sk_buff *skb);
+/* Only used in br_device.c */
+static inline int br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
+{
+	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+
+	skb_pull(skb, ETH_HLEN);
+	nf_bridge->mask ^= BRNF_BRIDGED_DNAT;
+	skb->dev = nf_bridge->physindev;
+	return br_handle_frame_finish(skb);
+}
+
 /* This is called by the IP fragmenting code and it ensures there is
  * enough room for the encapsulating header (if there is one). */
 static inline unsigned int nf_bridge_pad(const struct sk_buff *skb)
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 5b8a6e7..007bde8 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -15,7 +15,7 @@
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
-
+#include <linux/netfilter_bridge.h>
 #include <asm/uaccess.h>
 #include "br_private.h"
 
@@ -28,6 +28,13 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct net_bridge_mdb_entry *mdst;
 	struct br_cpu_netstats *brstats = this_cpu_ptr(br->stats);
 
+#ifdef CONFIG_BRIDGE_NETFILTER
+	if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) {
+		br_nf_pre_routing_finish_bridge_slow(skb);
+		return NETDEV_TX_OK;
+	}
+#endif
+
 	brstats->tx_packets++;
 	brstats->tx_bytes += skb->len;
 
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index dd6f538..05dc630 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -246,8 +246,7 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
 }
 
 /* This requires some explaining. If DNAT has taken place,
- * we will need to fix up the destination Ethernet address,
- * and this is a tricky process.
+ * we will need to fix up the destination Ethernet address.
  *
  * There are two cases to consider:
  * 1. The packet was DNAT'ed to a device in the same bridge
@@ -261,52 +260,38 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
  * call ip_route_input() and to look at skb->dst->dev, which is
  * changed to the destination device if ip_route_input() succeeds.
  *
- * Let us first consider the case that ip_route_input() succeeds:
- *
- * If skb->dst->dev equals the logical bridge device the packet
- * came in on, we can consider this bridging. The packet is passed
- * through the neighbour output function to build a new destination
- * MAC address, which will make the packet enter br_nf_local_out()
- * not much later. In that function it is assured that the iptables
- * FORWARD chain is traversed for the packet.
+ * Let's first consider the case that ip_route_input() succeeds:
  *
+ * If the output device equals the logical bridge device the packet
+ * came in on, we can consider this bridging. The corresponding MAC
+ * address will be obtained in br_nf_pre_routing_finish_bridge.
  * Otherwise, the packet is considered to be routed and we just
  * change the destination MAC address so that the packet will
  * later be passed up to the IP stack to be routed. For a redirected
  * packet, ip_route_input() will give back the localhost as output device,
  * which differs from the bridge device.
  *
- * Let us now consider the case that ip_route_input() fails:
+ * Let's now consider the case that ip_route_input() fails:
  *
  * This can be because the destination address is martian, in which case
  * the packet will be dropped.
- * After a "echo '0' > /proc/sys/net/ipv4/ip_forward" ip_route_input()
- * will fail, while __ip_route_output_key() will return success. The source
- * address for __ip_route_output_key() is set to zero, so __ip_route_output_key
+ * If IP forwarding is disabled, ip_route_input() will fail, while
+ * ip_route_output_key() can return success. The source
+ * address for ip_route_output_key() is set to zero, so ip_route_output_key()
  * thinks we're handling a locally generated packet and won't care
- * if IP forwarding is allowed. We send a warning message to the users's
- * log telling her to put IP forwarding on.
- *
- * ip_route_input() will also fail if there is no route available.
- * In that case we just drop the packet.
- *
- * --Lennert, 20020411
- * --Bart, 20020416 (updated)
- * --Bart, 20021007 (updated)
- * --Bart, 20062711 (updated) */
+ * if IP forwarding is enabled. If the output device equals the logical bridge
+ * device, we proceed as if ip_route_input() succeeded. If it differs from the
+ * logical bridge port or if ip_route_output_key() fails we drop the packet.
+ */
+
 static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb)
 {
-	if (skb->pkt_type == PACKET_OTHERHOST) {
-		skb->pkt_type = PACKET_HOST;
-		skb->nf_bridge->mask |= BRNF_PKT_TYPE;
-	}
-	skb->nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING;
-
 	skb->dev = bridge_parent(skb->dev);
 	if (skb->dev) {
 		struct dst_entry *dst = skb_dst(skb);
 
 		nf_bridge_pull_encap_header(skb);
+		skb->nf_bridge->mask |= BRNF_BRIDGED_DNAT;
 
 		if (dst->hh)
 			return neigh_hh_output(dst->hh, skb);
@@ -368,9 +353,6 @@ free_skb:
 		} else {
 			if (skb_dst(skb)->dev == dev) {
 bridged_dnat:
-				/* Tell br_nf_local_out this is a
-				 * bridged frame */
-				nf_bridge->mask |= BRNF_BRIDGED_DNAT;
 				skb->dev = nf_bridge->physindev;
 				nf_bridge_push_encap_header(skb);
 				NF_HOOK_THRESH(NFPROTO_BRIDGE,
@@ -721,54 +703,6 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff *skb,
 	return NF_STOLEN;
 }
 
-/* PF_BRIDGE/LOCAL_OUT ***********************************************
- *
- * This function sees both locally originated IP packets and forwarded
- * IP packets (in both cases the destination device is a bridge
- * device). It also sees bridged-and-DNAT'ed packets.
- *
- * If (nf_bridge->mask & BRNF_BRIDGED_DNAT) then the packet is bridged
- * and we fake the PF_BRIDGE/FORWARD hook. The function br_nf_forward()
- * will then fake the PF_INET/FORWARD hook. br_nf_local_out() has priority
- * NF_BR_PRI_FIRST, so no relevant PF_BRIDGE/INPUT functions have been nor
- * will be executed.
- */
-static unsigned int br_nf_local_out(unsigned int hook, struct sk_buff *skb,
-				    const struct net_device *in,
-				    const struct net_device *out,
-				    int (*okfn)(struct sk_buff *))
-{
-	struct net_device *realindev;
-	struct nf_bridge_info *nf_bridge;
-
-	if (!skb->nf_bridge)
-		return NF_ACCEPT;
-
-	/* Need exclusive nf_bridge_info since we might have multiple
-	 * different physoutdevs. */
-	if (!nf_bridge_unshare(skb))
-		return NF_DROP;
-
-	nf_bridge = skb->nf_bridge;
-	if (!(nf_bridge->mask & BRNF_BRIDGED_DNAT))
-		return NF_ACCEPT;
-
-	/* Bridged, take PF_BRIDGE/FORWARD.
-	 * (see big note in front of br_nf_pre_routing_finish) */
-	nf_bridge->physoutdev = skb->dev;
-	realindev = nf_bridge->physindev;
-
-	if (nf_bridge->mask & BRNF_PKT_TYPE) {
-		skb->pkt_type = PACKET_OTHERHOST;
-		nf_bridge->mask ^= BRNF_PKT_TYPE;
-	}
-	nf_bridge_push_encap_header(skb);
-
-	NF_HOOK(NFPROTO_BRIDGE, NF_BR_FORWARD, skb, realindev, skb->dev,
-		br_forward_finish);
-	return NF_STOLEN;
-}
-
 #if defined(CONFIG_NF_CONNTRACK_IPV4) || defined(CONFIG_NF_CONNTRACK_IPV4_MODULE)
 static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 {
@@ -797,10 +731,7 @@ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff *skb,
 	struct net_device *realoutdev = bridge_parent(skb->dev);
 	u_int8_t pf;
 
-	if (!nf_bridge)
-		return NF_ACCEPT;
-
-	if (!(nf_bridge->mask & (BRNF_BRIDGED | BRNF_BRIDGED_DNAT)))
+	if (!nf_bridge || !(nf_bridge->mask & BRNF_BRIDGED))
 		return NF_ACCEPT;
 
 	if (!realoutdev)
@@ -847,10 +778,8 @@ static unsigned int ip_sabotage_in(unsigned int hook, struct sk_buff *skb,
 	return NF_ACCEPT;
 }
 
-/* For br_nf_local_out we need (prio = NF_BR_PRI_FIRST), to insure that innocent
- * PF_BRIDGE/NF_BR_LOCAL_OUT functions don't get bridged traffic as input.
- * For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because
- * ip_refrag() can return NF_STOLEN. */
+/* For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because
+ * br_dev_queue_push_xmit is called afterwards */
 static struct nf_hook_ops br_nf_ops[] __read_mostly = {
 	{
 		.hook = br_nf_pre_routing,
@@ -881,13 +810,6 @@ static struct nf_hook_ops br_nf_ops[] __read_mostly = {
 		.priority = NF_BR_PRI_BRNF,
 	},
 	{
-		.hook = br_nf_local_out,
-		.owner = THIS_MODULE,
-		.pf = PF_BRIDGE,
-		.hooknum = NF_BR_LOCAL_OUT,
-		.priority = NF_BR_PRI_FIRST,
-	},
-	{
 		.hook = br_nf_post_routing,
 		.owner = THIS_MODULE,
 		.pf = PF_BRIDGE,
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 65/84] Restore __ALIGN_MASK()
From: kaber @ 2010-05-10 20:18 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1273522735-24672-1-git-send-email-kaber@trash.net>

From: Alexey Dobriyan <adobriyan@gmail.com>

Fix lib/bitmap.c compile failure due to __ALIGN_KERNEL changes.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/kernel.h |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 284ea99..db6717d 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -40,6 +40,7 @@ extern const char linux_proc_banner[];
 #define STACK_MAGIC	0xdeadbeef
 
 #define ALIGN(x, a)		__ALIGN_KERNEL((x), (a))
+#define __ALIGN_MASK(x, mask)	__ALIGN_KERNEL_MASK((x), (mask))
 #define PTR_ALIGN(p, a)		((typeof(p))ALIGN((unsigned long)(p), (a)))
 #define IS_ALIGNED(x, a)		(((x) & ((typeof(x))(a) - 1)) == 0)
 
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 75/84] netfilter: xtables: remove old comments about reentrancy
From: kaber @ 2010-05-10 20:18 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1273522735-24672-1-git-send-email-kaber@trash.net>

From: Jan Engelhardt <jengelh@medozas.de>

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/ip_tables.c   |    2 --
 net/ipv4/netfilter/ipt_REJECT.c  |    3 ---
 net/ipv6/netfilter/ip6_tables.c  |    2 --
 net/ipv6/netfilter/ip6t_REJECT.c |    3 ---
 4 files changed, 0 insertions(+), 10 deletions(-)

diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 70900ec..bb5e0d9 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -434,8 +434,6 @@ ipt_do_table(struct sk_buff *skb,
 			continue;
 		}
 
-		/* Targets which reenter must return
-		   abs. verdicts */
 		tgpar.target   = t->u.kernel.target;
 		tgpar.targinfo = t->data;
 
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index b026014..038fa0b 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -139,9 +139,6 @@ reject_tg(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	const struct ipt_reject_info *reject = par->targinfo;
 
-	/* WARNING: This code causes reentry within iptables.
-	   This means that the iptables jump stack is now crap.  We
-	   must return an absolute verdict. --RR */
 	switch (reject->with) {
 	case IPT_ICMP_NET_UNREACHABLE:
 		send_unreach(skb, ICMP_NET_UNREACH);
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 2a2770b..7afa117 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -451,8 +451,6 @@ ip6t_do_table(struct sk_buff *skb,
 			continue;
 		}
 
-		/* Targets which reenter must return
-		   abs. verdicts */
 		tgpar.target   = t->u.kernel.target;
 		tgpar.targinfo = t->data;
 
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index 55b9b2d..dad9762 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -179,9 +179,6 @@ reject_tg6(struct sk_buff *skb, const struct xt_target_param *par)
 	struct net *net = dev_net((par->in != NULL) ? par->in : par->out);
 
 	pr_debug("%s: medium point\n", __func__);
-	/* WARNING: This code causes reentry within ip6tables.
-	   This means that the ip6tables jump stack is now crap.  We
-	   must return an absolute verdict. --RR */
 	switch (reject->with) {
 	case IP6T_ICMP6_NO_ROUTE:
 		send_unreach(net, skb, ICMPV6_NOROUTE, par->hooknum);
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 72/84] netfilter: xtables: inclusion of xt_TEE
From: kaber @ 2010-05-10 20:18 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1273522735-24672-1-git-send-email-kaber@trash.net>

From: Jan Engelhardt <jengelh@medozas.de>

xt_TEE can be used to clone and reroute a packet. This can for
example be used to copy traffic at a router for logging purposes
to another dedicated machine.

References: http://www.gossamer-threads.com/lists/iptables/devel/68781
Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/Kbuild   |    1 +
 include/linux/netfilter/xt_TEE.h |    9 ++
 net/ipv4/ip_output.c             |    1 +
 net/ipv6/ip6_output.c            |    1 +
 net/netfilter/Kconfig            |    7 +
 net/netfilter/Makefile           |    1 +
 net/netfilter/xt_TEE.c           |  256 ++++++++++++++++++++++++++++++++++++++
 7 files changed, 276 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/netfilter/xt_TEE.h
 create mode 100644 net/netfilter/xt_TEE.c

diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild
index a5a63e4..48767cd 100644
--- a/include/linux/netfilter/Kbuild
+++ b/include/linux/netfilter/Kbuild
@@ -16,6 +16,7 @@ header-y += xt_RATEEST.h
 header-y += xt_SECMARK.h
 header-y += xt_TCPMSS.h
 header-y += xt_TCPOPTSTRIP.h
+header-y += xt_TEE.h
 header-y += xt_TPROXY.h
 header-y += xt_comment.h
 header-y += xt_connbytes.h
diff --git a/include/linux/netfilter/xt_TEE.h b/include/linux/netfilter/xt_TEE.h
new file mode 100644
index 0000000..55d4a50
--- /dev/null
+++ b/include/linux/netfilter/xt_TEE.h
@@ -0,0 +1,9 @@
+#ifndef _XT_TEE_TARGET_H
+#define _XT_TEE_TARGET_H
+
+struct xt_tee_tginfo {
+	union nf_inet_addr gw;
+	char oif[16];
+};
+
+#endif /* _XT_TEE_TARGET_H */
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index f09135e..0abfdde 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -309,6 +309,7 @@ int ip_output(struct sk_buff *skb)
 			    ip_finish_output,
 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
 }
+EXPORT_SYMBOL_GPL(ip_output);
 
 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index c10a38a..d09be7f 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -176,6 +176,7 @@ int ip6_output(struct sk_buff *skb)
 			    ip6_finish_output,
 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 }
+EXPORT_SYMBOL_GPL(ip6_output);
 
 /*
  *	xmit an sk_buff (used by TCP)
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 8055786..673a6c8 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -502,6 +502,13 @@ config NETFILTER_XT_TARGET_RATEEST
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NETFILTER_XT_TARGET_TEE
+	tristate '"TEE" - packet cloning to alternate destiantion'
+	depends on NETFILTER_ADVANCED
+	---help---
+	This option adds a "TEE" target with which a packet can be cloned and
+	this clone be rerouted to another nexthop.
+
 config NETFILTER_XT_TARGET_TPROXY
 	tristate '"TPROXY" target support (EXPERIMENTAL)'
 	depends on EXPERIMENTAL
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index cd31afe..14e3a8f 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -59,6 +59,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_TPROXY) += xt_TPROXY.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP) += xt_TCPOPTSTRIP.o
+obj-$(CONFIG_NETFILTER_XT_TARGET_TEE) += xt_TEE.o
 obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o
 
 # matches
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
new file mode 100644
index 0000000..b3d7301
--- /dev/null
+++ b/net/netfilter/xt_TEE.c
@@ -0,0 +1,256 @@
+/*
+ *	"TEE" target extension for Xtables
+ *	Copyright © Sebastian Claßen, 2007
+ *	Jan Engelhardt, 2007-2010
+ *
+ *	based on ipt_ROUTE.c from Cédric de Launois
+ *	<delaunois@info.ucl.be>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	version 2 or later, as published by the Free Software Foundation.
+ */
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/route.h>
+#include <linux/skbuff.h>
+#include <net/checksum.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/route.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_TEE.h>
+
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#	define WITH_CONNTRACK 1
+#	include <net/netfilter/nf_conntrack.h>
+#endif
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#	define WITH_IPV6 1
+#endif
+
+static const union nf_inet_addr tee_zero_address;
+
+static struct net *pick_net(struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_NS
+	const struct dst_entry *dst;
+
+	if (skb->dev != NULL)
+		return dev_net(skb->dev);
+	dst = skb_dst(skb);
+	if (dst != NULL && dst->dev != NULL)
+		return dev_net(dst->dev);
+#endif
+	return &init_net;
+}
+
+static bool tee_tg_route_oif(struct flowi *f, struct net *net,
+			     const struct xt_tee_tginfo *info)
+{
+	const struct net_device *dev;
+
+	if (*info->oif != '\0')
+		return true;
+	dev = dev_get_by_name(net, info->oif);
+	if (dev == NULL)
+		return false;
+	f->oif = dev->ifindex;
+	return true;
+}
+
+static bool
+tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	struct net *net = pick_net(skb);
+	struct rtable *rt;
+	struct flowi fl;
+
+	memset(&fl, 0, sizeof(fl));
+	if (!tee_tg_route_oif(&fl, net, info))
+		return false;
+	fl.nl_u.ip4_u.daddr = info->gw.ip;
+	fl.nl_u.ip4_u.tos   = RT_TOS(iph->tos);
+	fl.nl_u.ip4_u.scope = RT_SCOPE_UNIVERSE;
+	if (ip_route_output_key(net, &rt, &fl) != 0)
+		return false;
+
+	dst_release(skb_dst(skb));
+	skb_dst_set(skb, &rt->u.dst);
+	skb->dev      = rt->u.dst.dev;
+	skb->protocol = htons(ETH_P_IP);
+	return true;
+}
+
+static unsigned int
+tee_tg4(struct sk_buff *skb, const struct xt_target_param *par)
+{
+	const struct xt_tee_tginfo *info = par->targinfo;
+	struct iphdr *iph;
+
+	/*
+	 * Copy the skb, and route the copy. Will later return %XT_CONTINUE for
+	 * the original skb, which should continue on its way as if nothing has
+	 * happened. The copy should be independently delivered to the TEE
+	 * --gateway.
+	 */
+	skb = pskb_copy(skb, GFP_ATOMIC);
+	if (skb == NULL)
+		return XT_CONTINUE;
+
+#ifdef WITH_CONNTRACK
+	/* Avoid counting cloned packets towards the original connection. */
+	nf_conntrack_put(skb->nfct);
+	skb->nfct     = &nf_conntrack_untracked.ct_general;
+	skb->nfctinfo = IP_CT_NEW;
+	nf_conntrack_get(skb->nfct);
+#endif
+	/*
+	 * If we are in PREROUTING/INPUT, the checksum must be recalculated
+	 * since the length could have changed as a result of defragmentation.
+	 *
+	 * We also decrease the TTL to mitigate potential TEE loops
+	 * between two hosts.
+	 *
+	 * Set %IP_DF so that the original source is notified of a potentially
+	 * decreased MTU on the clone route. IPv6 does this too.
+	 */
+	iph = ip_hdr(skb);
+	iph->frag_off |= htons(IP_DF);
+	if (par->hooknum == NF_INET_PRE_ROUTING ||
+	    par->hooknum == NF_INET_LOCAL_IN)
+		--iph->ttl;
+	ip_send_check(iph);
+
+	/*
+	 * Xtables is not reentrant currently, so a choice has to be made:
+	 * 1. return absolute verdict for the original and let the cloned
+	 *    packet travel through the chains
+	 * 2. let the original continue travelling and not pass the clone
+	 *    to Xtables.
+	 * #2 is chosen. Normally, we would use ip_local_out for the clone.
+	 * Because iph->check is already correct and we don't pass it to
+	 * Xtables anyway, a shortcut to dst_output [forwards to ip_output] can
+	 * be taken. %IPSKB_REROUTED needs to be set so that ip_output does not
+	 * invoke POSTROUTING on the cloned packet.
+	 */
+	IPCB(skb)->flags |= IPSKB_REROUTED;
+	if (tee_tg_route4(skb, info))
+		ip_output(skb);
+	else
+		kfree_skb(skb);
+
+	return XT_CONTINUE;
+}
+
+#ifdef WITH_IPV6
+static bool
+tee_tg_route6(struct sk_buff *skb, const struct xt_tee_tginfo *info)
+{
+	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	struct net *net = pick_net(skb);
+	struct dst_entry *dst;
+	struct flowi fl;
+
+	memset(&fl, 0, sizeof(fl));
+	if (!tee_tg_route_oif(&fl, net, info))
+		return false;
+	fl.nl_u.ip6_u.daddr = info->gw.in6;
+	fl.nl_u.ip6_u.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) |
+				  (iph->flow_lbl[1] << 8) | iph->flow_lbl[2];
+	dst = ip6_route_output(net, NULL, &fl);
+	if (dst == NULL)
+		return false;
+
+	dst_release(skb_dst(skb));
+	skb_dst_set(skb, dst);
+	skb->dev      = dst->dev;
+	skb->protocol = htons(ETH_P_IPV6);
+	return true;
+}
+
+static unsigned int
+tee_tg6(struct sk_buff *skb, const struct xt_target_param *par)
+{
+	const struct xt_tee_tginfo *info = par->targinfo;
+
+	skb = pskb_copy(skb, GFP_ATOMIC);
+	if (skb == NULL)
+		return XT_CONTINUE;
+
+#ifdef WITH_CONNTRACK
+	nf_conntrack_put(skb->nfct);
+	skb->nfct     = &nf_conntrack_untracked.ct_general;
+	skb->nfctinfo = IP_CT_NEW;
+	nf_conntrack_get(skb->nfct);
+#endif
+	if (par->hooknum == NF_INET_PRE_ROUTING ||
+	    par->hooknum == NF_INET_LOCAL_IN) {
+		struct ipv6hdr *iph = ipv6_hdr(skb);
+		--iph->hop_limit;
+	}
+	IP6CB(skb)->flags |= IP6SKB_REROUTED;
+	if (tee_tg_route6(skb, info))
+		ip6_output(skb);
+	else
+		kfree_skb(skb);
+
+	return XT_CONTINUE;
+}
+#endif /* WITH_IPV6 */
+
+static int tee_tg_check(const struct xt_tgchk_param *par)
+{
+	const struct xt_tee_tginfo *info = par->targinfo;
+
+	if (info->oif[sizeof(info->oif)-1] != '\0')
+		return -EINVAL;
+	/* 0.0.0.0 and :: not allowed */
+	return (memcmp(&info->gw, &tee_zero_address,
+	       sizeof(tee_zero_address)) == 0) ? -EINVAL : 0;
+}
+
+static struct xt_target tee_tg_reg[] __read_mostly = {
+	{
+		.name       = "TEE",
+		.revision   = 1,
+		.family     = NFPROTO_IPV4,
+		.target     = tee_tg4,
+		.targetsize = sizeof(struct xt_tee_tginfo),
+		.checkentry = tee_tg_check,
+		.me         = THIS_MODULE,
+	},
+#ifdef WITH_IPV6
+	{
+		.name       = "TEE",
+		.revision   = 1,
+		.family     = NFPROTO_IPV6,
+		.target     = tee_tg6,
+		.targetsize = sizeof(struct xt_tee_tginfo),
+		.checkentry = tee_tg_check,
+		.me         = THIS_MODULE,
+	},
+#endif
+};
+
+static int __init tee_tg_init(void)
+{
+	return xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg));
+}
+
+static void __exit tee_tg_exit(void)
+{
+	xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg));
+}
+
+module_init(tee_tg_init);
+module_exit(tee_tg_exit);
+MODULE_AUTHOR("Sebastian Claßen <sebastian.classen@freenet.ag>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_DESCRIPTION("Xtables: Reroute packet copy");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_TEE");
+MODULE_ALIAS("ip6t_TEE");
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 74/84] netfilter: xt_TEE: have cloned packet travel through Xtables too
From: kaber @ 2010-05-10 20:18 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1273522735-24672-1-git-send-email-kaber@trash.net>

From: Jan Engelhardt <jengelh@medozas.de>

Since Xtables is now reentrant/nestable, the cloned packet can also go
through Xtables and be subject to rules itself.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/ip_output.c   |    1 -
 net/ipv6/ip6_output.c  |    1 -
 net/netfilter/xt_TEE.c |   40 ++++++++++++++++++----------------------
 3 files changed, 18 insertions(+), 24 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 0abfdde..f09135e 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -309,7 +309,6 @@ int ip_output(struct sk_buff *skb)
 			    ip_finish_output,
 			    !(IPCB(skb)->flags & IPSKB_REROUTED));
 }
-EXPORT_SYMBOL_GPL(ip_output);
 
 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
 {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index d09be7f..c10a38a 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -176,7 +176,6 @@ int ip6_output(struct sk_buff *skb)
 			    ip6_finish_output,
 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 }
-EXPORT_SYMBOL_GPL(ip6_output);
 
 /*
  *	xmit an sk_buff (used by TCP)
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index b3d7301..842e701 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -12,6 +12,7 @@
  */
 #include <linux/ip.h>
 #include <linux/module.h>
+#include <linux/percpu.h>
 #include <linux/route.h>
 #include <linux/skbuff.h>
 #include <net/checksum.h>
@@ -32,6 +33,7 @@
 #endif
 
 static const union nf_inet_addr tee_zero_address;
+static DEFINE_PER_CPU(bool, tee_active);
 
 static struct net *pick_net(struct sk_buff *skb)
 {
@@ -91,6 +93,8 @@ tee_tg4(struct sk_buff *skb, const struct xt_target_param *par)
 	const struct xt_tee_tginfo *info = par->targinfo;
 	struct iphdr *iph;
 
+	if (percpu_read(tee_active))
+		return XT_CONTINUE;
 	/*
 	 * Copy the skb, and route the copy. Will later return %XT_CONTINUE for
 	 * the original skb, which should continue on its way as if nothing has
@@ -125,24 +129,13 @@ tee_tg4(struct sk_buff *skb, const struct xt_target_param *par)
 		--iph->ttl;
 	ip_send_check(iph);
 
-	/*
-	 * Xtables is not reentrant currently, so a choice has to be made:
-	 * 1. return absolute verdict for the original and let the cloned
-	 *    packet travel through the chains
-	 * 2. let the original continue travelling and not pass the clone
-	 *    to Xtables.
-	 * #2 is chosen. Normally, we would use ip_local_out for the clone.
-	 * Because iph->check is already correct and we don't pass it to
-	 * Xtables anyway, a shortcut to dst_output [forwards to ip_output] can
-	 * be taken. %IPSKB_REROUTED needs to be set so that ip_output does not
-	 * invoke POSTROUTING on the cloned packet.
-	 */
-	IPCB(skb)->flags |= IPSKB_REROUTED;
-	if (tee_tg_route4(skb, info))
-		ip_output(skb);
-	else
+	if (tee_tg_route4(skb, info)) {
+		percpu_write(tee_active, true);
+		ip_local_out(skb);
+		percpu_write(tee_active, false);
+	} else {
 		kfree_skb(skb);
-
+	}
 	return XT_CONTINUE;
 }
 
@@ -177,6 +170,8 @@ tee_tg6(struct sk_buff *skb, const struct xt_target_param *par)
 {
 	const struct xt_tee_tginfo *info = par->targinfo;
 
+	if (percpu_read(tee_active))
+		return XT_CONTINUE;
 	skb = pskb_copy(skb, GFP_ATOMIC);
 	if (skb == NULL)
 		return XT_CONTINUE;
@@ -192,12 +187,13 @@ tee_tg6(struct sk_buff *skb, const struct xt_target_param *par)
 		struct ipv6hdr *iph = ipv6_hdr(skb);
 		--iph->hop_limit;
 	}
-	IP6CB(skb)->flags |= IP6SKB_REROUTED;
-	if (tee_tg_route6(skb, info))
-		ip6_output(skb);
-	else
+	if (tee_tg_route6(skb, info)) {
+		percpu_write(tee_active, true);
+		ip6_local_out(skb);
+		percpu_write(tee_active, false);
+	} else {
 		kfree_skb(skb);
-
+	}
 	return XT_CONTINUE;
 }
 #endif /* WITH_IPV6 */
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 76/84] netfilter: xt_TEE: resolve oif using netdevice notifiers
From: kaber @ 2010-05-10 20:18 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1273522735-24672-1-git-send-email-kaber@trash.net>

From: Patrick McHardy <kaber@trash.net>

Replace the runtime oif name resolving by netdevice notifier based
resolving. When an oif is given, a netdevice notifier is registered
to resolve the name on NETDEV_REGISTER or NETDEV_CHANGE and unresolve
it again on NETDEV_UNREGISTER or NETDEV_CHANGE to a different name.

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/xt_TEE.h |    3 +
 net/netfilter/xt_TEE.c           |  103 +++++++++++++++++++++++++++++--------
 2 files changed, 83 insertions(+), 23 deletions(-)

diff --git a/include/linux/netfilter/xt_TEE.h b/include/linux/netfilter/xt_TEE.h
index 55d4a50..5c21d5c 100644
--- a/include/linux/netfilter/xt_TEE.h
+++ b/include/linux/netfilter/xt_TEE.h
@@ -4,6 +4,9 @@
 struct xt_tee_tginfo {
 	union nf_inet_addr gw;
 	char oif[16];
+
+	/* used internally by the kernel */
+	struct xt_tee_priv *priv __attribute__((aligned(8)));
 };
 
 #endif /* _XT_TEE_TARGET_H */
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 842e701..49da6c0 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -15,6 +15,7 @@
 #include <linux/percpu.h>
 #include <linux/route.h>
 #include <linux/skbuff.h>
+#include <linux/notifier.h>
 #include <net/checksum.h>
 #include <net/icmp.h>
 #include <net/ip.h>
@@ -32,6 +33,12 @@
 #	define WITH_IPV6 1
 #endif
 
+struct xt_tee_priv {
+	struct notifier_block	notifier;
+	struct xt_tee_tginfo	*tginfo;
+	int			oif;
+};
+
 static const union nf_inet_addr tee_zero_address;
 static DEFINE_PER_CPU(bool, tee_active);
 
@@ -49,20 +56,6 @@ static struct net *pick_net(struct sk_buff *skb)
 	return &init_net;
 }
 
-static bool tee_tg_route_oif(struct flowi *f, struct net *net,
-			     const struct xt_tee_tginfo *info)
-{
-	const struct net_device *dev;
-
-	if (*info->oif != '\0')
-		return true;
-	dev = dev_get_by_name(net, info->oif);
-	if (dev == NULL)
-		return false;
-	f->oif = dev->ifindex;
-	return true;
-}
-
 static bool
 tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info)
 {
@@ -72,8 +65,11 @@ tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info)
 	struct flowi fl;
 
 	memset(&fl, 0, sizeof(fl));
-	if (!tee_tg_route_oif(&fl, net, info))
-		return false;
+	if (info->priv) {
+		if (info->priv->oif == -1)
+			return false;
+		fl.oif = info->priv->oif;
+	}
 	fl.nl_u.ip4_u.daddr = info->gw.ip;
 	fl.nl_u.ip4_u.tos   = RT_TOS(iph->tos);
 	fl.nl_u.ip4_u.scope = RT_SCOPE_UNIVERSE;
@@ -149,8 +145,11 @@ tee_tg_route6(struct sk_buff *skb, const struct xt_tee_tginfo *info)
 	struct flowi fl;
 
 	memset(&fl, 0, sizeof(fl));
-	if (!tee_tg_route_oif(&fl, net, info))
-		return false;
+	if (info->priv) {
+		if (info->priv->oif == -1)
+			return false;
+		fl.oif = info->priv->oif;
+	}
 	fl.nl_u.ip6_u.daddr = info->gw.in6;
 	fl.nl_u.ip6_u.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) |
 				  (iph->flow_lbl[1] << 8) | iph->flow_lbl[2];
@@ -198,15 +197,71 @@ tee_tg6(struct sk_buff *skb, const struct xt_target_param *par)
 }
 #endif /* WITH_IPV6 */
 
+static int tee_netdev_event(struct notifier_block *this, unsigned long event,
+			    void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct xt_tee_priv *priv;
+
+	priv = container_of(this, struct xt_tee_priv, notifier);
+	switch (event) {
+	case NETDEV_REGISTER:
+		if (!strcmp(dev->name, priv->tginfo->oif))
+			priv->oif = dev->ifindex;
+		break;
+	case NETDEV_UNREGISTER:
+		if (dev->ifindex == priv->oif)
+			priv->oif = -1;
+		break;
+	case NETDEV_CHANGENAME:
+		if (!strcmp(dev->name, priv->tginfo->oif))
+			priv->oif = dev->ifindex;
+		else if (dev->ifindex == priv->oif)
+			priv->oif = -1;
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
 static int tee_tg_check(const struct xt_tgchk_param *par)
 {
-	const struct xt_tee_tginfo *info = par->targinfo;
+	struct xt_tee_tginfo *info = par->targinfo;
+	struct xt_tee_priv *priv;
 
-	if (info->oif[sizeof(info->oif)-1] != '\0')
-		return -EINVAL;
 	/* 0.0.0.0 and :: not allowed */
-	return (memcmp(&info->gw, &tee_zero_address,
-	       sizeof(tee_zero_address)) == 0) ? -EINVAL : 0;
+	if (memcmp(&info->gw, &tee_zero_address,
+		   sizeof(tee_zero_address)) == 0)
+		return -EINVAL;
+
+	if (info->oif[0]) {
+		if (info->oif[sizeof(info->oif)-1] != '\0')
+			return -EINVAL;
+
+		priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+		if (priv == NULL)
+			return -ENOMEM;
+
+		priv->tginfo  = info;
+		priv->oif     = -1;
+		priv->notifier.notifier_call = tee_netdev_event;
+		info->priv    = priv;
+
+		register_netdevice_notifier(&priv->notifier);
+	} else
+		info->priv = NULL;
+
+	return 0;
+}
+
+static void tee_tg_destroy(const struct xt_tgdtor_param *par)
+{
+	struct xt_tee_tginfo *info = par->targinfo;
+
+	if (info->priv) {
+		unregister_netdevice_notifier(&info->priv->notifier);
+		kfree(info->priv);
+	}
 }
 
 static struct xt_target tee_tg_reg[] __read_mostly = {
@@ -217,6 +272,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
 		.target     = tee_tg4,
 		.targetsize = sizeof(struct xt_tee_tginfo),
 		.checkentry = tee_tg_check,
+		.destroy    = tee_tg_destroy,
 		.me         = THIS_MODULE,
 	},
 #ifdef WITH_IPV6
@@ -227,6 +283,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
 		.target     = tee_tg6,
 		.targetsize = sizeof(struct xt_tee_tginfo),
 		.checkentry = tee_tg_check,
+		.destroy    = tee_tg_destroy,
 		.me         = THIS_MODULE,
 	},
 #endif
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 77/84] netfilter: bridge-netfilter: fix refragmenting IP traffic encapsulated in PPPoE traffic
From: kaber @ 2010-05-10 20:18 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1273522735-24672-1-git-send-email-kaber@trash.net>

From: Bart De Schuymer <bdschuym@pandora.be>

The MTU for IP traffic encapsulated inside PPPoE traffic is smaller
than the MTU of the Ethernet device (1500). Connection tracking
gathers all IP packets and sometimes will refragment them in
ip_fragment(). We then need to subtract the length of the
encapsulating header from the mtu used in ip_fragment(). The check in
br_nf_dev_queue_xmit() which determines if ip_fragment() has to be
called is also updated for the PPPoE-encapsulated packets.
nf_bridge_copy_header() is also updated to make sure the PPPoE data
length field has the correct value.

Signed-off-by: Bart De Schuymer <bdschuym@pandora.be>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter_bridge.h |    7 +++++++
 net/bridge/br_netfilter.c        |    2 +-
 net/ipv4/ip_output.c             |    4 ++++
 3 files changed, 12 insertions(+), 1 deletions(-)

diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h
index ea0e44b..0ddd161 100644
--- a/include/linux/netfilter_bridge.h
+++ b/include/linux/netfilter_bridge.h
@@ -68,6 +68,13 @@ static inline unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb)
 	}
 }
 
+static inline unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb)
+{
+	if (unlikely(skb->nf_bridge->mask & BRNF_PPPoE))
+		return PPPOE_SES_HLEN;
+	return 0;
+}
+
 extern int br_handle_frame_finish(struct sk_buff *skb);
 /* Only used in br_device.c */
 static inline int br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 6b80ebc..93f80fe 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -745,7 +745,7 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff *skb,
 static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 {
 	if (skb->nfct != NULL && skb->protocol == htons(ETH_P_IP) &&
-	    skb->len > skb->dev->mtu &&
+	    skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu &&
 	    !skb_is_gso(skb))
 		return ip_fragment(skb, br_dev_queue_push_xmit);
 	else
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index b0b2e30..d979710 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -469,6 +469,10 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 
 	hlen = iph->ihl * 4;
 	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */
+#ifdef CONFIG_BRIDGE_NETFILTER
+	if (skb->nf_bridge)
+		mtu -= nf_bridge_mtu_reduction(skb);
+#endif
 	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
 
 	/* When frag_list is given, use it. First, check its validity:
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 81/84] netfilter: x_tables: rectify XT_FUNCTION_MAXNAMELEN usage
From: kaber @ 2010-05-10 20:18 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1273522735-24672-1-git-send-email-kaber@trash.net>

From: Jan Engelhardt <jengelh@medozas.de>

There has been quite a confusion in userspace about
XT_FUNCTION_MAXNAMELEN; because struct xt_entry_match used MAX-1,
userspace would have to do an awkward MAX-2 for maximum length
checking (due to '\0'). This patch adds a new define that matches the
definition of XT_TABLE_MAXNAMELEN - being the size of the actual
struct member, not one off.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/x_tables.h |   14 ++++++--------
 1 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 50c8672..eeb4884 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -4,6 +4,7 @@
 #include <linux/types.h>
 
 #define XT_FUNCTION_MAXNAMELEN 30
+#define XT_EXTENSION_MAXNAMELEN 29
 #define XT_TABLE_MAXNAMELEN 32
 
 struct xt_entry_match {
@@ -12,8 +13,7 @@ struct xt_entry_match {
 			__u16 match_size;
 
 			/* Used by userspace */
-			char name[XT_FUNCTION_MAXNAMELEN-1];
-
+			char name[XT_EXTENSION_MAXNAMELEN];
 			__u8 revision;
 		} user;
 		struct {
@@ -36,8 +36,7 @@ struct xt_entry_target {
 			__u16 target_size;
 
 			/* Used by userspace */
-			char name[XT_FUNCTION_MAXNAMELEN-1];
-
+			char name[XT_EXTENSION_MAXNAMELEN];
 			__u8 revision;
 		} user;
 		struct {
@@ -70,8 +69,7 @@ struct xt_standard_target {
 /* The argument to IPT_SO_GET_REVISION_*.  Returns highest revision
  * kernel supports, if >= revision. */
 struct xt_get_revision {
-	char name[XT_FUNCTION_MAXNAMELEN-1];
-
+	char name[XT_EXTENSION_MAXNAMELEN];
 	__u8 revision;
 };
 
@@ -291,7 +289,7 @@ struct xt_tgdtor_param {
 struct xt_match {
 	struct list_head list;
 
-	const char name[XT_FUNCTION_MAXNAMELEN-1];
+	const char name[XT_EXTENSION_MAXNAMELEN];
 	u_int8_t revision;
 
 	/* Return true or false: return FALSE and set *hotdrop = 1 to
@@ -330,7 +328,7 @@ struct xt_match {
 struct xt_target {
 	struct list_head list;
 
-	const char name[XT_FUNCTION_MAXNAMELEN-1];
+	const char name[XT_EXTENSION_MAXNAMELEN];
 	u_int8_t revision;
 
 	/* Returns verdict. Argument order changed since 2.6.9, as this
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 80/84] netfilter: nf_conntrack: extend with extra stat counter
From: kaber @ 2010-05-10 20:18 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1273522735-24672-1-git-send-email-kaber@trash.net>

From: Jesper Dangaard Brouer <hawk@comx.dk>

I suspect an unfortunatly series of events occuring under a DDoS
attack, in function __nf_conntrack_find() nf_contrack_core.c.

Adding a stats counter to see if the search is restarted too often.

Signed-off-by: Jesper Dangaard Brouer <hawk@comx.dk>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/nf_conntrack_common.h      |    1 +
 .../netfilter/nf_conntrack_l3proto_ipv4_compat.c   |    7 ++++---
 net/netfilter/nf_conntrack_core.c                  |    4 +++-
 net/netfilter/nf_conntrack_standalone.c            |    7 ++++---
 4 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h
index c608677..14e6d32 100644
--- a/include/linux/netfilter/nf_conntrack_common.h
+++ b/include/linux/netfilter/nf_conntrack_common.h
@@ -113,6 +113,7 @@ struct ip_conntrack_stat {
 	unsigned int expect_new;
 	unsigned int expect_create;
 	unsigned int expect_delete;
+	unsigned int search_restart;
 };
 
 /* call to create an explicit dependency on nf_conntrack. */
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 2fb7b76..244f7cb 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -336,12 +336,12 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v)
 	const struct ip_conntrack_stat *st = v;
 
 	if (v == SEQ_START_TOKEN) {
-		seq_printf(seq, "entries  searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error  expect_new expect_create expect_delete\n");
+		seq_printf(seq, "entries  searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error  expect_new expect_create expect_delete search_restart\n");
 		return 0;
 	}
 
 	seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
-			"%08x %08x %08x %08x %08x  %08x %08x %08x \n",
+			"%08x %08x %08x %08x %08x  %08x %08x %08x %08x\n",
 		   nr_conntracks,
 		   st->searched,
 		   st->found,
@@ -358,7 +358,8 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v)
 
 		   st->expect_new,
 		   st->expect_create,
-		   st->expect_delete
+		   st->expect_delete,
+		   st->search_restart
 		);
 	return 0;
 }
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 0c9bbe9..3907efb 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -319,8 +319,10 @@ begin:
 	 * not the expected one, we must restart lookup.
 	 * We probably met an item that was moved to another chain.
 	 */
-	if (get_nulls_value(n) != hash)
+	if (get_nulls_value(n) != hash) {
+		NF_CT_STAT_INC(net, search_restart);
 		goto begin;
+	}
 	local_bh_enable();
 
 	return NULL;
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index faa8eb3..ea4a8d3 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -252,12 +252,12 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v)
 	const struct ip_conntrack_stat *st = v;
 
 	if (v == SEQ_START_TOKEN) {
-		seq_printf(seq, "entries  searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error  expect_new expect_create expect_delete\n");
+		seq_printf(seq, "entries  searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error  expect_new expect_create expect_delete search_restart\n");
 		return 0;
 	}
 
 	seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
-			"%08x %08x %08x %08x %08x  %08x %08x %08x \n",
+			"%08x %08x %08x %08x %08x  %08x %08x %08x %08x\n",
 		   nr_conntracks,
 		   st->searched,
 		   st->found,
@@ -274,7 +274,8 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v)
 
 		   st->expect_new,
 		   st->expect_create,
-		   st->expect_delete
+		   st->expect_delete,
+		   st->search_restart
 		);
 	return 0;
 }
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 83/84] netfilter: nf_conntrack_proto: fix warning with CONFIG_PROVE_RCU
From: kaber @ 2010-05-10 20:18 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1273522735-24672-1-git-send-email-kaber@trash.net>

From: Patrick McHardy <kaber@trash.net>

===================================================
[ INFO: suspicious rcu_dereference_check() usage. ]
---------------------------------------------------
include/net/netfilter/nf_conntrack_l3proto.h:92 invoked rcu_dereference_check()
without protection!

other info that might help us debug this:

rcu_scheduler_active = 1, debug_locks = 0
2 locks held by iptables/3197:
 #0:  (sk_lock-AF_INET){+.+.+.}, at: [<ffffffff8149bd8c>]
ip_setsockopt+0x7c/0xa0
 #1:  (&xt[i].mutex){+.+.+.}, at: [<ffffffff8148a5fe>]
xt_find_table_lock+0x3e/0x110

stack backtrace:
Pid: 3197, comm: iptables Not tainted 2.6.34-rc4 #2
Call Trace:
 [<ffffffff8105e2e8>] lockdep_rcu_dereference+0xb8/0xc0
 [<ffffffff8147fb3b>] nf_ct_l3proto_module_put+0x6b/0x70
 [<ffffffff8148d891>] state_mt_destroy+0x11/0x20
 [<ffffffff814d3baf>] cleanup_match+0x2f/0x50
 [<ffffffff814d3c63>] cleanup_entry+0x33/0x90
 [<ffffffff814d5653>] ? __do_replace+0x1a3/0x210
 [<ffffffff814d564c>] __do_replace+0x19c/0x210
 [<ffffffff814d651a>] do_ipt_set_ctl+0x16a/0x1b0
 [<ffffffff8147a610>] nf_sockopt+0x60/0xa0
...

The __nf_ct_l3proto_find() call doesn't actually need rcu read side
protection since the caller holds a reference to the protocol. Use
rcu_read_lock() anyways to avoid the warning.

Kernel bugzilla #15781: https://bugzilla.kernel.org/show_bug.cgi?id=15781

Reported-by: Christian Casteyde <casteyde.christian@free.fr>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_proto.c |    6 +++++-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index a6defc7..5886ba1 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -117,9 +117,13 @@ void nf_ct_l3proto_module_put(unsigned short l3proto)
 {
 	struct nf_conntrack_l3proto *p;
 
-	/* rcu_read_lock not necessary since the caller holds a reference */
+	/* rcu_read_lock not necessary since the caller holds a reference, but
+	 * taken anyways to avoid lockdep warnings in __nf_ct_l3proto_find()
+	 */
+	rcu_read_lock();
 	p = __nf_ct_l3proto_find(l3proto);
 	module_put(p->me);
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put);
 
-- 
1.7.0.4


^ permalink raw reply related

* Re: RFC: Network Plugin Architecture (NPA) for vmxnet3
From: Pankaj Thakkar @ 2010-05-10 20:46 UTC (permalink / raw)
  To: Avi Kivity
  Cc: linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	virtualization@lists.linux-foundation.org, pv-drivers@vmware.com,
	Shreyas Bhatewara
In-Reply-To: <4BE284CE.7010808@redhat.com>

On Thu, May 06, 2010 at 01:58:54AM -0700, Avi Kivity wrote:
> > We don't pass the whole VF to the guest. Only the BAR which is responsible for
> > TX/RX/intr is mapped into guest space.
> 
> Does the SR/IOV spec guarantee that you will have such a separation?

No. This is a guideline which we provided to IHVs and would have to be enforced
through testing/certification.

> How can you unmap the VF without guest cooperation?  If you're executing 
> Plugin code, you can't yank anything out.

In our Kawela plugin we don't have any reads from the memory space at all.
Hence you can yank the VF anytime (the code loaded in the guest address space
will keep on executing). Even if there were reads we can map the memory
pages to a NULL page and return 0xffffffff so that the plugin can detect this
and return an error to the shell. Remember there are no control operations in
the plugin and the code is really small (about 1k lines compared to 5k lines in
the full VF driver).

> 
> Are plugins executed with preemption/interrupts disabled?

Depends on the model. Today the plugin code for checking the TX/RX rings runs
in the deferred napi context.

> What ISAs do those plugins support?

x86 and x64.

Thanks,

-pankaj


^ permalink raw reply

* Re: [PATCH 72/84] netfilter: xtables: inclusion of xt_TEE
From: Eric Dumazet @ 2010-05-10 20:52 UTC (permalink / raw)
  To: kaber; +Cc: davem, netfilter-devel, netdev
In-Reply-To: <1273522735-24672-73-git-send-email-kaber@trash.net>

Le lundi 10 mai 2010 à 22:18 +0200, kaber@trash.net a écrit :
> From: Jan Engelhardt <jengelh@medozas.de>
> 
> xt_TEE can be used to clone and reroute a packet. This can for
> example be used to copy traffic at a router for logging purposes
> to another dedicated machine.
> 
> References: http://www.gossamer-threads.com/lists/iptables/devel/68781
> Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
> Signed-off-by: Patrick McHardy <kaber@trash.net>
> ---

> +static bool tee_tg_route_oif(struct flowi *f, struct net *net,
> +			     const struct xt_tee_tginfo *info)
> +{
> +	const struct net_device *dev;
> +
> +	if (*info->oif != '\0')
> +		return true;
> +	dev = dev_get_by_name(net, info->oif);
> +	if (dev == NULL)
> +		return false;
> +	f->oif = dev->ifindex;
> +	return true;
> +}
> +

This leaks a refcount on device.

But I see patch 76/84 replaces the whole thing, so this is probably
harmless.





^ permalink raw reply

* [PATCH V4  0/4] net: relax dst refcnt for net-next-2.6
From: Eric Dumazet @ 2010-05-10 21:08 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Here is V4 of a patch previously sent last year

One serious point of contention in network stack is the IP route cache
refcounts in input path, on SMP setups.

On stress situation, one cpu (say A) handles network softirq RX processing.
When a packet is received, we need to find a dst_entry, take
a reference on this dst_entry and associate skb to this dst_entry.
skb is queued on a socket receive queue.

When application (running from another CPU B) dequeues this packet,
it has to release the dst_entry, which refcount is hot and dirty on
another CPU A cache, involving an expensive cache line ping-pong.

Back in November 2008, we tried to keep this cache line only
in CPU A (commit 703556028792)
(net: release skb->dst in sock_queue_rcv_skb()), but we had
to revert this commit because it broke IP_PKTINFO handling,
as noticed by Mark McLoughlin

Then David suggested not taking the reference at the first place,
which this patch does when possible.

We prepared this work with commit adf30907 (net: skb->dst accessors),
introducing accessors to work on skb->dst

We now can use the low order bit of skb->_skb_dst to tell
if a reference was _not_ taken on dst for this skb

We make sure a dst leaving rcu protected region has a refcount.
This is done on enqueueing on any kind of queue (backlog, qdisc,
nf_queue, ...)

Net effect of this patch is avoiding two atomic ops per
incoming packet, and two atomic ops per outgoing TCP packet.

Same for outgoing path, if device has IFF_XMIT_DST_RELEASE,
or qdisc is work-conserving (or no queue)

V2: Forwarding is taken into account by changes in dev_queue_xmit(),
forcing a dst refcount on !IFF_XMIT_DST_RELEASE devices.

V3: As pointed by Patrick, we must force a dst refcount in
__nf_queue(), before queueing a packet.

V4: 
- output path (ip_queue_xmit()) handled as well.

- commit f84af32cbca70 (net: ip_queue_rcv_skb() helper) already in tree.

- Some interim checks make sure a dst does not escape unrefcounted
from a RCU section (thanks to lockdep)

- Better handling of queueing (backlog, qdisc)

Patch split into 4 parts :

1/4 : add a noref bit on skb dst (dstref infrastructure)

2/4 : ip_route_input_noref() introduction

3/4 : Use ip_route_input_noref() in three input paths

4/4 : norefcounting in ip_queue_xmit()


 include/linux/skbuff.h   |   58 ++++++++++++++++++++++++++++++++++---
 include/net/dst.h        |   48 ++++++++++++++++++++++++++++--
 include/net/route.h      |   17 ++++++++++
 include/net/sock.h       |   13 +++++---
 net/core/dev.c           |    3 +
 net/core/skbuff.c        |    2 -
 net/core/sock.c          |    6 +++
 net/ipv4/arp.c           |    2 -
 net/ipv4/icmp.c          |    6 +--
 net/ipv4/ip_input.c      |    4 +-
 net/ipv4/ip_options.c    |    9 +++--
 net/ipv4/ip_output.c     |    9 ++++-
 net/ipv4/netfilter.c     |    6 +--
 net/ipv4/route.c         |   17 +++++++---
 net/ipv4/xfrm4_input.c   |    4 +-
 net/netfilter/nf_queue.c |    2 +
 net/sched/sch_generic.c  |    2 -
 17 files changed, 170 insertions(+), 38 deletions(-)



^ permalink raw reply

* [PATCH V4 4/4] net: No dst refcounting in ip_queue_xmit()
From: Eric Dumazet @ 2010-05-10 21:31 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

TCP outgoing packets can avoid two atomic ops, and dirtying
of previously higly contended cache line using new refdst
infrastructure.

Note 1: loopback device excluded because of !IFF_XMIT_DST_RELEASE
Note 2: UDP packets dsts are built before ip_queue_xmit().

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
net/ipv4/ip_output.c |    9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index f039219..012f1c9 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -318,10 +318,12 @@ int ip_queue_xmit(struct sk_buff *skb)
 	struct ip_options *opt = inet->opt;
 	struct rtable *rt;
 	struct iphdr *iph;
+	int res;
 
 	/* Skip all of this if the packet is already routed,
 	 * f.e. by something like SCTP.
 	 */
+	rcu_read_lock();
 	rt = skb_rtable(skb);
 	if (rt != NULL)
 		goto packet_routed;
@@ -359,7 +361,7 @@ int ip_queue_xmit(struct sk_buff *skb)
 		}
 		sk_setup_caps(sk, &rt->u.dst);
 	}
-	skb_dst_set(skb, dst_clone(&rt->u.dst));
+	skb_dst_set_noref(skb, &rt->u.dst);
 
 packet_routed:
 	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
@@ -391,9 +393,12 @@ packet_routed:
 	skb->priority = sk->sk_priority;
 	skb->mark = sk->sk_mark;
 
-	return ip_local_out(skb);
+	res = ip_local_out(skb);
+	rcu_read_unlock();
+	return res;
 
 no_route:
+	rcu_read_unlock();
 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 	kfree_skb(skb);
 	return -EHOSTUNREACH;




^ permalink raw reply related

* [PATCH V4 1/4] net: add a noref bit on skb dst
From: Eric Dumazet @ 2010-05-10 21:32 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Use low order bit of skb->_skb_dst to tell dst is not refcounted.

Change _skb_dst to _skb_refdst to make sure all uses are catched.

skb_dst() returns the dst, regardless of noref bit set or not, but
with a lockdep check to make sure a noref dst is not given if current
user is not rcu protected.

New skb_dst_set_noref() helper to set an notrefcounted dst on a skb.
(with lockdep check)

skb_dst_drop() drops a reference only if skb dst was refcounted.

skb_dst_force() helper is used to force a refcount on dst, when skb
is queued and not anymore RCU protected.

Use skb_dst_force() in __sk_add_backlog(), __dev_xmit_skb() if 
!IFF_XMIT_DST_RELEASE or skb enqueued on qdisc queue, in
sock_queue_rcv_skb(), in __nf_queue().

Note: dst_use_noref() still dirties dst, we might transform it
later to do one dirtying per jiffies.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
include/linux/skbuff.h   |   58 ++++++++++++++++++++++++++++++++++---
include/net/dst.h        |   48 ++++++++++++++++++++++++++++--
include/net/sock.h       |   13 +++++---
net/core/dev.c           |    3 +
net/core/skbuff.c        |    2 -
net/core/sock.c          |    6 +++
net/ipv4/icmp.c          |    6 +--
net/ipv4/ip_options.c    |    9 +++--
net/ipv4/netfilter.c     |    6 +--
net/ipv4/route.c         |    2 -
net/netfilter/nf_queue.c |    2 +
net/sched/sch_generic.c  |    2 -
12 files changed, 132 insertions(+), 25 deletions(-) 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c9525bc..7cdfb4d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -264,7 +264,7 @@ typedef unsigned char *sk_buff_data_t;
  *	@transport_header: Transport layer header
  *	@network_header: Network layer header
  *	@mac_header: Link layer header
- *	@_skb_dst: destination entry
+ *	@_skb_refdst: destination entry (with norefcount bit)
  *	@sp: the security path, used for xfrm
  *	@cb: Control buffer. Free for use by every layer. Put private vars here
  *	@len: Length of actual data
@@ -328,7 +328,7 @@ struct sk_buff {
 	 */
 	char			cb[48] __aligned(8);
 
-	unsigned long		_skb_dst;
+	unsigned long		_skb_refdst;
 #ifdef CONFIG_XFRM
 	struct	sec_path	*sp;
 #endif
@@ -419,14 +419,64 @@ struct sk_buff {
 
 #include <asm/system.h>
 
+/*
+ * skb might have a dst pointer attached, refcounted or not.
+ * _skb_refdst low order bit is set if refcount was _not_ taken
+ */
+#define SKB_DST_NOREF	1UL
+#define SKB_DST_PTRMASK	~(SKB_DST_NOREF)
+
+/**
+ * skb_dst - returns skb dst_entry
+ * @skb: buffer
+ *
+ * Returns skb dst_entry, regardless of reference taken or not.
+ */
 static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
 {
-	return (struct dst_entry *)skb->_skb_dst;
+	/* If refdst was not refcounted, check we still are in a 
+	 * rcu_read_lock section
+	 */
+	WARN_ON((skb->_skb_refdst & SKB_DST_NOREF) &&
+		!rcu_read_lock_held() &&
+		!rcu_read_lock_bh_held());
+	return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK);
 }
 
+/**
+ * skb_dst_set - sets skb dst
+ * @skb: buffer
+ * @dst: dst entry
+ *
+ * Sets skb dst, assuming a reference was taken on dst and should
+ * be released by skb_dst_drop()
+ */
 static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
 {
-	skb->_skb_dst = (unsigned long)dst;
+	skb->_skb_refdst = (unsigned long)dst;
+}
+
+/**
+ * skb_dst_set_noref - sets skb dst, without a reference
+ * @skb: buffer
+ * @dst: dst entry
+ *
+ * Sets skb dst, assuming a reference was not taken on dst
+ * skb_dst_drop() should not dst_release() this dst
+ */
+static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
+{
+	WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+	skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
+}
+
+/**
+ * skb_dst_is_noref - Test if skb dst isnt refcounted
+ * @skb: buffer
+ */
+static inline bool skb_dst_is_noref(const struct sk_buff *skb)
+{
+	return (skb->_skb_refdst & SKB_DST_NOREF) && skb_dst(skb);
 }
 
 static inline struct rtable *skb_rtable(const struct sk_buff *skb)
diff --git a/include/net/dst.h b/include/net/dst.h
index aac5a5f..27207a1 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -168,6 +168,12 @@ static inline void dst_use(struct dst_entry *dst, unsigned long time)
 	dst->lastuse = time;
 }
 
+static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
+{
+	dst->__use++;
+	dst->lastuse = time;
+}
+
 static inline
 struct dst_entry * dst_clone(struct dst_entry * dst)
 {
@@ -177,11 +183,47 @@ struct dst_entry * dst_clone(struct dst_entry * dst)
 }
 
 extern void dst_release(struct dst_entry *dst);
+
+static inline void refdst_drop(unsigned long refdst)
+{
+	if (!(refdst & SKB_DST_NOREF))
+		dst_release((struct dst_entry *)(refdst & SKB_DST_PTRMASK));
+}
+
+/**
+ * skb_dst_drop - drops skb dst
+ * @skb: buffer
+ *
+ * Drops dst reference count if a reference was taken.
+ */
 static inline void skb_dst_drop(struct sk_buff *skb)
 {
-	if (skb->_skb_dst)
-		dst_release(skb_dst(skb));
-	skb->_skb_dst = 0UL;
+	if (skb->_skb_refdst) {
+		refdst_drop(skb->_skb_refdst);
+		skb->_skb_refdst = 0UL;
+	}
+}
+
+static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb)
+{
+	nskb->_skb_refdst = oskb->_skb_refdst;
+	if (!(nskb->_skb_refdst & SKB_DST_NOREF))
+		dst_clone(skb_dst(nskb));
+}
+
+/**
+ * skb_dst_force - makes sure skb dst is refcounted
+ * @skb: buffer
+ *
+ * If dst is not yet refcounted, let's do it
+ */
+static inline void skb_dst_force(struct sk_buff *skb)
+{
+	if (skb_dst_is_noref(skb)) {
+		WARN_ON(!rcu_read_lock_held());
+		skb->_skb_refdst &= ~SKB_DST_NOREF;
+		dst_clone(skb_dst(skb));
+	}
 }
 
 /* Children define the path of the packet through the
diff --git a/include/net/sock.h b/include/net/sock.h
index 328e03f..307affa 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -598,12 +598,15 @@ static inline int sk_stream_memory_free(struct sock *sk)
 /* OOB backlog add */
 static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
-	if (!sk->sk_backlog.tail) {
-		sk->sk_backlog.head = sk->sk_backlog.tail = skb;
-	} else {
+	/* dont let skb dst not refcounted, we are going to leave rcu lock */
+	skb_dst_force(skb);
+
+	if (!sk->sk_backlog.tail)
+		sk->sk_backlog.head = skb;
+	else
 		sk->sk_backlog.tail->next = skb;
-		sk->sk_backlog.tail = skb;
-	}
+
+	sk->sk_backlog.tail = skb;
 	skb->next = NULL;
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 32611c8..dfe6ba6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2047,6 +2047,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		 * waiting to be sent out; and the qdisc is not running -
 		 * xmit the skb directly.
 		 */
+		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
+			skb_dst_force(skb);
 		__qdisc_update_bstats(q, skb->len);
 		if (sch_direct_xmit(skb, q, dev, txq, root_lock))
 			__qdisc_run(q);
@@ -2055,6 +2057,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 
 		rc = NET_XMIT_SUCCESS;
 	} else {
+		skb_dst_force(skb);
 		rc = qdisc_enqueue_root(skb, q);
 		qdisc_run(q);
 	}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a9b0e1f..c543dd2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -520,7 +520,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->transport_header	= old->transport_header;
 	new->network_header	= old->network_header;
 	new->mac_header		= old->mac_header;
-	skb_dst_set(new, dst_clone(skb_dst(old)));
+	skb_dst_copy(new, old);
 	new->rxhash		= old->rxhash;
 #ifdef CONFIG_XFRM
 	new->sp			= secpath_get(old->sp);
diff --git a/net/core/sock.c b/net/core/sock.c
index 94c4aff..d24b5c1 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -307,6 +307,11 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	 */
 	skb_len = skb->len;
 
+	/* we escape from rcu protected region, make sure we dont leak
+	 * a norefcounted dst
+	 */
+	skb_dst_force(skb);
+
 	spin_lock_irqsave(&list->lock, flags);
 	skb->dropcount = atomic_read(&sk->sk_drops);
 	__skb_queue_tail(list, skb);
@@ -1535,6 +1540,7 @@ static void __release_sock(struct sock *sk)
 		do {
 			struct sk_buff *next = skb->next;
 
+			WARN_ON_ONCE(skb_dst_is_noref(skb));
 			skb->next = NULL;
 			sk_backlog_rcv(sk, skb);
 
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index f3d339f..d65e921 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -587,20 +587,20 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 			err = __ip_route_output_key(net, &rt2, &fl);
 		else {
 			struct flowi fl2 = {};
-			struct dst_entry *odst;
+			unsigned long orefdst;
 
 			fl2.fl4_dst = fl.fl4_src;
 			if (ip_route_output_key(net, &rt2, &fl2))
 				goto relookup_failed;
 
 			/* Ugh! */
-			odst = skb_dst(skb_in);
+			orefdst = skb_in->_skb_refdst; /* save old refdst */
 			err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src,
 					     RT_TOS(tos), rt2->u.dst.dev);
 
 			dst_release(&rt2->u.dst);
 			rt2 = skb_rtable(skb_in);
-			skb_dst_set(skb_in, odst);
+			skb_in->_skb_refdst = orefdst; /* restore old refdst */
 		}
 
 		if (err)
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 4c09a31..3244133 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -601,6 +601,7 @@ int ip_options_rcv_srr(struct sk_buff *skb)
 	unsigned char *optptr = skb_network_header(skb) + opt->srr;
 	struct rtable *rt = skb_rtable(skb);
 	struct rtable *rt2;
+	unsigned long orefdst;
 	int err;
 
 	if (!opt->srr)
@@ -624,16 +625,16 @@ int ip_options_rcv_srr(struct sk_buff *skb)
 		}
 		memcpy(&nexthop, &optptr[srrptr-1], 4);
 
-		rt = skb_rtable(skb);
+		orefdst = skb->_skb_refdst;
 		skb_dst_set(skb, NULL);
 		err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
 		rt2 = skb_rtable(skb);
 		if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
-			ip_rt_put(rt2);
-			skb_dst_set(skb, &rt->u.dst);
+			skb_dst_drop(skb);
+			skb->_skb_refdst = orefdst;
 			return -EINVAL;
 		}
-		ip_rt_put(rt);
+		refdst_drop(orefdst);
 		if (rt2->rt_type != RTN_LOCAL)
 			break;
 		/* Superfast 8) loopback forward */
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 82fb43c..07de855 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -17,7 +17,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
 	const struct iphdr *iph = ip_hdr(skb);
 	struct rtable *rt;
 	struct flowi fl = {};
-	struct dst_entry *odst;
+	unsigned long orefdst;
 	unsigned int hh_len;
 	unsigned int type;
 
@@ -51,14 +51,14 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type)
 		if (ip_route_output_key(net, &rt, &fl) != 0)
 			return -1;
 
-		odst = skb_dst(skb);
+		orefdst = skb->_skb_refdst;
 		if (ip_route_input(skb, iph->daddr, iph->saddr,
 				   RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
 			dst_release(&rt->u.dst);
 			return -1;
 		}
 		dst_release(&rt->u.dst);
-		dst_release(odst);
+		refdst_drop(orefdst);
 	}
 
 	if (skb_dst(skb)->error)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index dea3f92..705eccf 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -3033,7 +3033,7 @@ int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
 				continue;
 			if (rt_is_expired(rt))
 				continue;
-			skb_dst_set(skb, dst_clone(&rt->u.dst));
+			skb_dst_set_noref(skb, &rt->u.dst);
 			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
 					 1, NLM_F_MULTI) <= 0) {
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index c49ef21..cb3cde4 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -9,6 +9,7 @@
 #include <linux/rcupdate.h>
 #include <net/protocol.h>
 #include <net/netfilter/nf_queue.h>
+#include <net/dst.h>
 
 #include "nf_internals.h"
 
@@ -170,6 +171,7 @@ static int __nf_queue(struct sk_buff *skb,
 			dev_hold(physoutdev);
 	}
 #endif
+	skb_dst_force(skb);
 	afinfo->saveroute(skb, entry);
 	status = qh->outfn(entry, queuenum);
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index a969b11..418bc2e 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -179,7 +179,7 @@ static inline int qdisc_restart(struct Qdisc *q)
 	skb = dequeue_skb(q);
 	if (unlikely(!skb))
 		return 0;
-
+	WARN_ON_ONCE(skb_dst_is_noref(skb));
 	root_lock = qdisc_lock(q);
 	dev = qdisc_dev(q);
 	txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));





^ permalink raw reply related

* [PATCH V4 2/4] net: implements ip_route_input_noref()
From: Eric Dumazet @ 2010-05-10 21:32 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

ip_route_input() is the version returning a refcounted dst, while
ip_route_input_noref() returns a non refcounted one.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
include/net/route.h |   17 ++++++++++++++++-
net/ipv4/route.c    |   15 ++++++++++-----
2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index 2c9fba7..af6cf4b 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -112,7 +112,22 @@ extern void		rt_cache_flush_batch(void);
 extern int		__ip_route_output_key(struct net *, struct rtable **, const struct flowi *flp);
 extern int		ip_route_output_key(struct net *, struct rtable **, struct flowi *flp);
 extern int		ip_route_output_flow(struct net *, struct rtable **rp, struct flowi *flp, struct sock *sk, int flags);
-extern int		ip_route_input(struct sk_buff*, __be32 dst, __be32 src, u8 tos, struct net_device *devin);
+
+extern int ip_route_input_common(struct sk_buff *skb, __be32 dst, __be32 src,
+				 u8 tos, struct net_device *devin, bool noref);
+
+static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
+				 u8 tos, struct net_device *devin)
+{
+	return ip_route_input_common(skb, dst, src, tos, devin, false);
+}
+
+static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src,
+				       u8 tos, struct net_device *devin)
+{
+	return ip_route_input_common(skb, dst, src, tos, devin, true);
+}
+
 extern unsigned short	ip_rt_frag_needed(struct net *net, struct iphdr *iph, unsigned short new_mtu, struct net_device *dev);
 extern void		ip_rt_send_redirect(struct sk_buff *skb);
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 705eccf..560acc6 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2277,8 +2277,8 @@ martian_source:
 	goto e_inval;
 }
 
-int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-		   u8 tos, struct net_device *dev)
+int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+			   u8 tos, struct net_device *dev, bool noref)
 {
 	struct rtable * rth;
 	unsigned	hash;
@@ -2304,10 +2304,15 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		    rth->fl.mark == skb->mark &&
 		    net_eq(dev_net(rth->u.dst.dev), net) &&
 		    !rt_is_expired(rth)) {
-			dst_use(&rth->u.dst, jiffies);
+			if (noref) {
+				dst_use_noref(&rth->u.dst, jiffies);
+				skb_dst_set_noref(skb, &rth->u.dst);
+			} else {
+				dst_use(&rth->u.dst, jiffies);
+				skb_dst_set(skb, &rth->u.dst);
+			}
 			RT_CACHE_STAT_INC(in_hit);
 			rcu_read_unlock();
-			skb_dst_set(skb, &rth->u.dst);
 			return 0;
 		}
 		RT_CACHE_STAT_INC(in_hlist_search);
@@ -2350,6 +2355,7 @@ skip_cache:
 	}
 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
 }
+EXPORT_SYMBOL(ip_route_input_common);
 
 static int __mkroute_output(struct rtable **result,
 			    struct fib_result *res,
@@ -3361,5 +3367,4 @@ void __init ip_static_sysctl_init(void)
 #endif
 
 EXPORT_SYMBOL(__ip_select_ident);
-EXPORT_SYMBOL(ip_route_input);
 EXPORT_SYMBOL(ip_route_output_key);





^ permalink raw reply related

* [PATCH V4 3/4] net: Use ip_route_input_noref() in input path
From: Eric Dumazet @ 2010-05-10 21:33 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Use ip_route_input_noref() in ip fast path, to avoid two atomic ops per
incoming packet.

Note: loopback is excluded from this optimization in ip_rcv_finish()

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
net/ipv4/arp.c         |    2 +-
net/ipv4/ip_input.c    |    4 ++--
net/ipv4/xfrm4_input.c |    4 ++--
3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 6e74706..f4ae3d4 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -854,7 +854,7 @@ static int arp_process(struct sk_buff *skb)
 	}
 
 	if (arp->ar_op == htons(ARPOP_REQUEST) &&
-	    ip_route_input(skb, tip, sip, 0, dev) == 0) {
+	    ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
 
 		rt = skb_rtable(skb);
 		addr_type = rt->rt_type;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index f8ab7a3..6d5b65a 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -331,8 +331,8 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	 *	how the packet travels inside Linux networking.
 	 */
 	if (skb_dst(skb) == NULL) {
-		int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
-					 skb->dev);
+		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+					       iph->tos, skb->dev);
 		if (unlikely(err)) {
 			if (err == -EHOSTUNREACH)
 				IP_INC_STATS_BH(dev_net(skb->dev),
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index c791bb6..e7972d8 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -27,8 +27,8 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
 	if (skb_dst(skb) == NULL) {
 		const struct iphdr *iph = ip_hdr(skb);
 
-		if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
-				   skb->dev))
+		if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
+					 iph->tos, skb->dev))
 			goto drop;
 	}
 	return dst_input(skb);





^ permalink raw reply related

* Re: [PATCH] virtif: initial interface extensions
From: Arnd Bergmann @ 2010-05-10 21:46 UTC (permalink / raw)
  To: Scott Feldman; +Cc: Stefan Berger, netdev, Arnd Bergmann
In-Reply-To: <C80DA4F7.2FBCF%scofeldm@cisco.com>

On Monday 10 May 2010 20:56:39 Scott Feldman wrote:
> On 5/10/10 8:37 AM, "Stefan Berger" <stefanb@us.ibm.com> wrote:
> > In what case would the IFLA_VIRTIF_PORT_PROFILE be provided? Would libvirt for
> > example need to be aware of whether the Ethernet device can handle the setup
> > protocol via its firmware and in this case provide the port profile parameter
> > and in other cases provide other parameters? Certainly the user or upper layer
> > management software would have to know it when creating the domain XML and in
> > fact different types of parameters were needed.
> 
> > Obviously we should have one
> > common set of (XML) parameters that go into the netlink message and that can
> > be handled by the kernel driver if the firmware knows how to handle it or by
> > LLDPAD. 
> 
> With Arnd's latest additions, we have a single netlink msg, but the
> parameter sets are disjoint between VDP/CDCP and what we need for the kernel
> driver.  So that means the sender (libvirt in this case) needs to know about
> both setups to send a single netlink msg.  An alternative is a have two
> netlink msgs, one for each setup.  That still requires the sender to know
> about two setups.

There are two separate issues here. The first one is whether we're doing
the association in the device driver or in user space. The assumption here
is that if it's in the device driver, there will be a VF number to identify
the channel, while in user space that is not needed.

The other question is which protocol we're using. There are as far as I
can tell five options:

1. enic device driver
2. VDP
3. CDCP
4. CDCP + VDP
5. enic + VDP

The first two ones are the most interesting for now, since Linux cannot do
S-VLANs yet, and they are required for CDCP. However, each of these options
could theoreticall be done in the kernel (plus firmware) or in user space.

If it's done in user space, the VF number is meaningless, because the setup
of the software device is also done from software, but instead you need to
take care of creating the software device with the correct parameters, e.g.
a macvtap device connected to a VLAN interface using the numbers you pass
in the VDP protocol.

Right now, we're not planning to do the protocol that enic uses in LLDPAD,
because it's not publically released. Similarly, there are no adapters that
do VDP in firmware, but both these cases should be covered by the protocol
and it would be good if libvirt could handle them.

Stefan, can you just define the XML in a way that matches the netlink
definition? What you need is something like

1. VF number (optional, signifies that 2/3 are done in firmware)
2. Lower-level protocol
  2.1. CDCP
     2.1.1 SVID
     2.1.2 SCID
  2.2. enic
     2.2.1 port profile name
     2.2.2 ...
3. VDP
  3.1 VSI type/version/provider
  3.2 UUID
  3.3 MAC/VLAN

You need to have 2. or 3. or both, and 2.1/2.2 are mutually exclusive.

	Arnd

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox