Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 13/17] netfilter: nf_tables: complete net namespace support
From: Pablo Neira Ayuso @ 2013-10-14 16:38 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, kaber, netdev
In-Reply-To: <1381768738-17739-1-git-send-email-pablo@netfilter.org>

Register each family per net namespace to ensure that sets are
only visible in their appropriate namespace.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/net_namespace.h             |    4 ++
 include/net/netfilter/nf_tables.h       |    4 +-
 include/net/netns/nftables.h            |   15 ++++++
 net/bridge/netfilter/nf_tables_bridge.c |   32 +++++++++++-
 net/ipv4/netfilter/nf_tables_ipv4.c     |   32 +++++++++++-
 net/ipv6/netfilter/nf_tables_ipv6.c     |   33 +++++++++++-
 net/netfilter/nf_tables_api.c           |   83 ++++++++++++++++++++-----------
 7 files changed, 168 insertions(+), 35 deletions(-)
 create mode 100644 include/net/netns/nftables.h

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index bcc4a8e..da68c9a 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -22,6 +22,7 @@
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 #include <net/netns/conntrack.h>
 #endif
+#include <net/netns/nftables.h>
 #include <net/netns/xfrm.h>
 
 struct user_namespace;
@@ -101,6 +102,9 @@ struct net {
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 	struct netns_ct		ct;
 #endif
+#if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE)
+	struct netns_nftables	nft;
+#endif
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
 	struct netns_nf_frag	nf_frag;
 #endif
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index a68f45f..d3272e9 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -68,6 +68,7 @@ static inline void nft_data_debug(const struct nft_data *data)
 /**
  *	struct nft_ctx - nf_tables rule/set context
  *
+ *	@net: net namespace
  * 	@skb: netlink skb
  * 	@nlh: netlink message header
  * 	@afi: address family info
@@ -76,6 +77,7 @@ static inline void nft_data_debug(const struct nft_data *data)
  *	@nla: netlink attributes
  */
 struct nft_ctx {
+	struct net			*net;
 	const struct sk_buff		*skb;
 	const struct nlmsghdr		*nlh;
 	const struct nft_af_info	*afi;
@@ -462,7 +464,7 @@ struct nft_af_info {
 	nf_hookfn			*hooks[NF_MAX_HOOKS];
 };
 
-extern int nft_register_afinfo(struct nft_af_info *);
+extern int nft_register_afinfo(struct net *, struct nft_af_info *);
 extern void nft_unregister_afinfo(struct nft_af_info *);
 
 struct nf_chain_type {
diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h
new file mode 100644
index 0000000..a98b1c5
--- /dev/null
+++ b/include/net/netns/nftables.h
@@ -0,0 +1,15 @@
+#ifndef _NETNS_NFTABLES_H_
+#define _NETNS_NFTABLES_H_
+
+#include <linux/list.h>
+
+struct nft_af_info;
+
+struct netns_nftables {
+	struct list_head	af_info;
+	struct nft_af_info	*ipv4;
+	struct nft_af_info	*ipv6;
+	struct nft_af_info	*bridge;
+};
+
+#endif
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index bc5c21c..e8cb016 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -19,14 +19,42 @@ static struct nft_af_info nft_af_bridge __read_mostly = {
 	.owner		= THIS_MODULE,
 };
 
+static int nf_tables_bridge_init_net(struct net *net)
+{
+	net->nft.bridge = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
+	if (net->nft.bridge == NULL)
+		return -ENOMEM;
+
+	memcpy(net->nft.bridge, &nft_af_bridge, sizeof(nft_af_bridge));
+
+	if (nft_register_afinfo(net, net->nft.bridge) < 0)
+		goto err;
+
+	return 0;
+err:
+	kfree(net->nft.bridge);
+	return -ENOMEM;
+}
+
+static void nf_tables_bridge_exit_net(struct net *net)
+{
+	nft_unregister_afinfo(net->nft.bridge);
+	kfree(net->nft.bridge);
+}
+
+static struct pernet_operations nf_tables_bridge_net_ops = {
+	.init	= nf_tables_bridge_init_net,
+	.exit	= nf_tables_bridge_exit_net,
+};
+
 static int __init nf_tables_bridge_init(void)
 {
-	return nft_register_afinfo(&nft_af_bridge);
+	return register_pernet_subsys(&nf_tables_bridge_net_ops);
 }
 
 static void __exit nf_tables_bridge_exit(void)
 {
-	nft_unregister_afinfo(&nft_af_bridge);
+	unregister_pernet_subsys(&nf_tables_bridge_net_ops);
 }
 
 module_init(nf_tables_bridge_init);
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index c61cffb..8f7536b 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -14,6 +14,7 @@
 #include <linux/ip.h>
 #include <linux/netfilter_ipv4.h>
 #include <net/netfilter/nf_tables.h>
+#include <net/net_namespace.h>
 #include <net/ip.h>
 #include <net/net_namespace.h>
 #include <net/netfilter/nf_tables_ipv4.h>
@@ -47,6 +48,33 @@ static struct nft_af_info nft_af_ipv4 __read_mostly = {
 	},
 };
 
+static int nf_tables_ipv4_init_net(struct net *net)
+{
+	net->nft.ipv4 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
+	if (net->nft.ipv4 == NULL)
+		return -ENOMEM;
+
+	memcpy(net->nft.ipv4, &nft_af_ipv4, sizeof(nft_af_ipv4));
+
+	if (nft_register_afinfo(net, net->nft.ipv4) < 0)
+		goto err;
+
+	return 0;
+err:
+	kfree(net->nft.ipv4);
+	return -ENOMEM;
+}
+
+static void nf_tables_ipv4_exit_net(struct net *net)
+{
+	nft_unregister_afinfo(net->nft.ipv4);
+	kfree(net->nft.ipv4);
+}
+
+static struct pernet_operations nf_tables_ipv4_net_ops = {
+	.init	= nf_tables_ipv4_init_net,
+	.exit	= nf_tables_ipv4_exit_net,
+};
 
 static unsigned int
 nft_do_chain_ipv4(const struct nf_hook_ops *ops,
@@ -83,12 +111,12 @@ static struct nf_chain_type filter_ipv4 = {
 static int __init nf_tables_ipv4_init(void)
 {
 	nft_register_chain_type(&filter_ipv4);
-	return nft_register_afinfo(&nft_af_ipv4);
+	return register_pernet_subsys(&nf_tables_ipv4_net_ops);
 }
 
 static void __exit nf_tables_ipv4_exit(void)
 {
-	nft_unregister_afinfo(&nft_af_ipv4);
+	unregister_pernet_subsys(&nf_tables_ipv4_net_ops);
 	nft_unregister_chain_type(&filter_ipv4);
 }
 
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index 42f905a..d77db8a 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -45,6 +45,34 @@ static struct nft_af_info nft_af_ipv6 __read_mostly = {
 	},
 };
 
+static int nf_tables_ipv6_init_net(struct net *net)
+{
+	net->nft.ipv6 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
+	if (net->nft.ipv6 == NULL)
+		return -ENOMEM;
+
+	memcpy(net->nft.ipv6, &nft_af_ipv6, sizeof(nft_af_ipv6));
+
+	if (nft_register_afinfo(net, net->nft.ipv6) < 0)
+		goto err;
+
+	return 0;
+err:
+	kfree(net->nft.ipv6);
+	return -ENOMEM;
+}
+
+static void nf_tables_ipv6_exit_net(struct net *net)
+{
+	nft_unregister_afinfo(net->nft.ipv6);
+	kfree(net->nft.ipv6);
+}
+
+static struct pernet_operations nf_tables_ipv6_net_ops = {
+	.init	= nf_tables_ipv6_init_net,
+	.exit	= nf_tables_ipv6_exit_net,
+};
+
 static unsigned int
 nft_do_chain_ipv6(const struct nf_hook_ops *ops,
 		  struct sk_buff *skb,
@@ -82,11 +110,12 @@ static struct nf_chain_type filter_ipv6 = {
 static int __init nf_tables_ipv6_init(void)
 {
 	nft_register_chain_type(&filter_ipv6);
-	return nft_register_afinfo(&nft_af_ipv6);
+	return register_pernet_subsys(&nf_tables_ipv6_net_ops);
 }
+
 static void __exit nf_tables_ipv6_exit(void)
 {
-	nft_unregister_afinfo(&nft_af_ipv6);
+	unregister_pernet_subsys(&nf_tables_ipv6_net_ops);
 	nft_unregister_chain_type(&filter_ipv6);
 }
 
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index a4dd7ce..e1ee850 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -18,9 +18,9 @@
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables_core.h>
 #include <net/netfilter/nf_tables.h>
+#include <net/net_namespace.h>
 #include <net/sock.h>
 
-static LIST_HEAD(nf_tables_afinfo);
 static LIST_HEAD(nf_tables_expressions);
 
 /**
@@ -31,11 +31,11 @@ static LIST_HEAD(nf_tables_expressions);
  *	Register the address family for use with nf_tables. Returns zero on
  *	success or a negative errno code otherwise.
  */
-int nft_register_afinfo(struct nft_af_info *afi)
+int nft_register_afinfo(struct net *net, struct nft_af_info *afi)
 {
 	INIT_LIST_HEAD(&afi->tables);
 	nfnl_lock(NFNL_SUBSYS_NFTABLES);
-	list_add_tail(&afi->list, &nf_tables_afinfo);
+	list_add_tail(&afi->list, &net->nft.af_info);
 	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
 	return 0;
 }
@@ -56,22 +56,23 @@ void nft_unregister_afinfo(struct nft_af_info *afi)
 }
 EXPORT_SYMBOL_GPL(nft_unregister_afinfo);
 
-static struct nft_af_info *nft_afinfo_lookup(int family)
+static struct nft_af_info *nft_afinfo_lookup(struct net *net, int family)
 {
 	struct nft_af_info *afi;
 
-	list_for_each_entry(afi, &nf_tables_afinfo, list) {
+	list_for_each_entry(afi, &net->nft.af_info, list) {
 		if (afi->family == family)
 			return afi;
 	}
 	return NULL;
 }
 
-static struct nft_af_info *nf_tables_afinfo_lookup(int family, bool autoload)
+static struct nft_af_info *
+nf_tables_afinfo_lookup(struct net *net, int family, bool autoload)
 {
 	struct nft_af_info *afi;
 
-	afi = nft_afinfo_lookup(family);
+	afi = nft_afinfo_lookup(net, family);
 	if (afi != NULL)
 		return afi;
 #ifdef CONFIG_MODULES
@@ -79,7 +80,7 @@ static struct nft_af_info *nf_tables_afinfo_lookup(int family, bool autoload)
 		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
 		request_module("nft-afinfo-%u", family);
 		nfnl_lock(NFNL_SUBSYS_NFTABLES);
-		afi = nft_afinfo_lookup(family);
+		afi = nft_afinfo_lookup(net, family);
 		if (afi != NULL)
 			return ERR_PTR(-EAGAIN);
 	}
@@ -232,9 +233,10 @@ static int nf_tables_dump_tables(struct sk_buff *skb,
 	const struct nft_af_info *afi;
 	const struct nft_table *table;
 	unsigned int idx = 0, s_idx = cb->args[0];
+	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 
-	list_for_each_entry(afi, &nf_tables_afinfo, list) {
+	list_for_each_entry(afi, &net->nft.af_info, list) {
 		if (family != NFPROTO_UNSPEC && family != afi->family)
 			continue;
 
@@ -268,6 +270,7 @@ static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb,
 	const struct nft_af_info *afi;
 	const struct nft_table *table;
 	struct sk_buff *skb2;
+	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 	int err;
 
@@ -278,7 +281,7 @@ static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb,
 		return netlink_dump_start(nlsk, skb, nlh, &c);
 	}
 
-	afi = nf_tables_afinfo_lookup(family, false);
+	afi = nf_tables_afinfo_lookup(net, family, false);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -379,9 +382,10 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
 	const struct nlattr *name;
 	struct nft_af_info *afi;
 	struct nft_table *table;
+	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 
-	afi = nf_tables_afinfo_lookup(family, true);
+	afi = nf_tables_afinfo_lookup(net, family, true);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -433,9 +437,10 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb,
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	struct nft_af_info *afi;
 	struct nft_table *table;
+	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 
-	afi = nf_tables_afinfo_lookup(family, false);
+	afi = nf_tables_afinfo_lookup(net, family, false);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -663,9 +668,10 @@ static int nf_tables_dump_chains(struct sk_buff *skb,
 	const struct nft_table *table;
 	const struct nft_chain *chain;
 	unsigned int idx = 0, s_idx = cb->args[0];
+	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 
-	list_for_each_entry(afi, &nf_tables_afinfo, list) {
+	list_for_each_entry(afi, &net->nft.af_info, list) {
 		if (family != NFPROTO_UNSPEC && family != afi->family)
 			continue;
 
@@ -702,6 +708,7 @@ static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb,
 	const struct nft_table *table;
 	const struct nft_chain *chain;
 	struct sk_buff *skb2;
+	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 	int err;
 
@@ -712,7 +719,7 @@ static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb,
 		return netlink_dump_start(nlsk, skb, nlh, &c);
 	}
 
-	afi = nf_tables_afinfo_lookup(family, false);
+	afi = nf_tables_afinfo_lookup(net, family, false);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -813,6 +820,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 	struct nft_chain *chain;
 	struct nft_base_chain *basechain = NULL;
 	struct nlattr *ha[NFTA_HOOK_MAX + 1];
+	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 	u64 handle = 0;
 	int err;
@@ -820,7 +828,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 
 	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
 
-	afi = nf_tables_afinfo_lookup(family, true);
+	afi = nf_tables_afinfo_lookup(net, family, true);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -1010,9 +1018,10 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
 	const struct nft_af_info *afi;
 	struct nft_table *table;
 	struct nft_chain *chain;
+	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 
-	afi = nf_tables_afinfo_lookup(family, false);
+	afi = nf_tables_afinfo_lookup(net, family, false);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -1050,6 +1059,7 @@ static void nft_ctx_init(struct nft_ctx *ctx,
 			 const struct nft_chain *chain,
 			 const struct nlattr * const *nla)
 {
+	ctx->net   = sock_net(skb->sk);
 	ctx->skb   = skb;
 	ctx->nlh   = nlh;
 	ctx->afi   = afi;
@@ -1361,9 +1371,10 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
 	const struct nft_chain *chain;
 	const struct nft_rule *rule;
 	unsigned int idx = 0, s_idx = cb->args[0];
+	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 
-	list_for_each_entry(afi, &nf_tables_afinfo, list) {
+	list_for_each_entry(afi, &net->nft.af_info, list) {
 		if (family != NFPROTO_UNSPEC && family != afi->family)
 			continue;
 
@@ -1402,6 +1413,7 @@ static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb,
 	const struct nft_chain *chain;
 	const struct nft_rule *rule;
 	struct sk_buff *skb2;
+	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 	int err;
 
@@ -1412,7 +1424,7 @@ static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb,
 		return netlink_dump_start(nlsk, skb, nlh, &c);
 	}
 
-	afi = nf_tables_afinfo_lookup(family, false);
+	afi = nf_tables_afinfo_lookup(net, family, false);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -1477,6 +1489,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	const struct nft_af_info *afi;
+	struct net *net = sock_net(skb->sk);
 	struct nft_table *table;
 	struct nft_chain *chain;
 	struct nft_rule *rule, *old_rule = NULL;
@@ -1490,7 +1503,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 
 	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
 
-	afi = nf_tables_afinfo_lookup(nfmsg->nfgen_family, create);
+	afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -1585,12 +1598,13 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	const struct nft_af_info *afi;
+	struct net *net = sock_net(skb->sk);
 	const struct nft_table *table;
 	struct nft_chain *chain;
 	struct nft_rule *rule, *tmp;
 	int family = nfmsg->nfgen_family;
 
-	afi = nf_tables_afinfo_lookup(family, false);
+	afi = nf_tables_afinfo_lookup(net, family, false);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -1697,11 +1711,12 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx,
 				     const struct nlmsghdr *nlh,
 				     const struct nlattr * const nla[])
 {
+	struct net *net = sock_net(skb->sk);
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	const struct nft_af_info *afi;
 	const struct nft_table *table = NULL;
 
-	afi = nf_tables_afinfo_lookup(nfmsg->nfgen_family, false);
+	afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -1818,12 +1833,11 @@ static int nf_tables_set_notify(const struct nft_ctx *ctx,
 {
 	struct sk_buff *skb;
 	u32 portid = NETLINK_CB(ctx->skb).portid;
-	struct net *net = sock_net(ctx->skb->sk);
 	bool report;
 	int err;
 
 	report = nlmsg_report(ctx->nlh);
-	if (!report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
+	if (!report && !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
 		return 0;
 
 	err = -ENOBUFS;
@@ -1837,11 +1851,11 @@ static int nf_tables_set_notify(const struct nft_ctx *ctx,
 		goto err;
 	}
 
-	err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report,
+	err = nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, report,
 			     GFP_KERNEL);
 err:
 	if (err < 0)
-		nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err);
+		nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, err);
 	return err;
 }
 
@@ -1974,6 +1988,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	const struct nft_set_ops *ops;
 	const struct nft_af_info *afi;
+	struct net *net = sock_net(skb->sk);
 	struct nft_table *table;
 	struct nft_set *set;
 	struct nft_ctx ctx;
@@ -2032,7 +2047,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
 
 	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
 
-	afi = nf_tables_afinfo_lookup(nfmsg->nfgen_family, create);
+	afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, create);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -2219,8 +2234,9 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx,
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	const struct nft_af_info *afi;
 	const struct nft_table *table;
+	struct net *net = sock_net(skb->sk);
 
-	afi = nf_tables_afinfo_lookup(nfmsg->nfgen_family, false);
+	afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false);
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
@@ -3011,6 +3027,16 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
 }
 EXPORT_SYMBOL_GPL(nft_data_dump);
 
+static int nf_tables_init_net(struct net *net)
+{
+	INIT_LIST_HEAD(&net->nft.af_info);
+	return 0;
+}
+
+static struct pernet_operations nf_tables_net_ops = {
+	.init	= nf_tables_init_net,
+};
+
 static int __init nf_tables_module_init(void)
 {
 	int err;
@@ -3031,7 +3057,7 @@ static int __init nf_tables_module_init(void)
 		goto err3;
 
 	pr_info("nf_tables: (c) 2007-2009 Patrick McHardy <kaber@trash.net>\n");
-	return 0;
+	return register_pernet_subsys(&nf_tables_net_ops);
 err3:
 	nf_tables_core_module_exit();
 err2:
@@ -3042,6 +3068,7 @@ err1:
 
 static void __exit nf_tables_module_exit(void)
 {
+	unregister_pernet_subsys(&nf_tables_net_ops);
 	nfnetlink_subsys_unregister(&nf_tables_subsys);
 	nf_tables_core_module_exit();
 	kfree(info);
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 12/17] netfilter: nf_tables: Add support for IPv6 NAT
From: Pablo Neira Ayuso @ 2013-10-14 16:38 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, kaber, netdev
In-Reply-To: <1381768738-17739-1-git-send-email-pablo@netfilter.org>

From: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>

This patch generalizes the NAT expression to support both IPv4 and IPv6
using the existing IPv4/IPv6 NAT infrastructure. This also adds the
NAT chain type for IPv6.

This patch collapses the following patches that were posted to the
netfilter-devel mailing list, from Tomasz:

* nf_tables: Change NFTA_NAT_ attributes to better semantic significance
* nf_tables: Split IPv4 NAT into NAT expression and IPv4 NAT chain
* nf_tables: Add support for IPv6 NAT expression
* nf_tables: Add support for IPv6 NAT chain
* nf_tables: Fix up build issue on IPv6 NAT support

And, from Pablo Neira Ayuso:

* fix missing dependencies in nft_chain_nat

Signed-off-by: Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |   18 +--
 net/ipv4/netfilter/Kconfig               |    1 +
 net/ipv4/netfilter/nft_chain_nat_ipv4.c  |  156 +--------------------
 net/ipv6/netfilter/Kconfig               |    5 +
 net/ipv6/netfilter/Makefile              |    1 +
 net/ipv6/netfilter/nft_chain_nat_ipv6.c  |  211 ++++++++++++++++++++++++++++
 net/netfilter/Kconfig                    |    6 +
 net/netfilter/Makefile                   |    1 +
 net/netfilter/nft_nat.c                  |  220 ++++++++++++++++++++++++++++++
 9 files changed, 457 insertions(+), 162 deletions(-)
 create mode 100644 net/ipv6/netfilter/nft_chain_nat_ipv6.c
 create mode 100644 net/netfilter/nft_nat.c

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index a9c4bce..7d4a199 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -695,18 +695,20 @@ enum nft_nat_types {
  * enum nft_nat_attributes - nf_tables nat expression netlink attributes
  *
  * @NFTA_NAT_TYPE: NAT type (NLA_U32: nft_nat_types)
- * @NFTA_NAT_ADDR_MIN: source register of address range start (NLA_U32: nft_registers)
- * @NFTA_NAT_ADDR_MAX: source register of address range end (NLA_U32: nft_registers)
- * @NFTA_NAT_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers)
- * @NFTA_NAT_PROTO_MAX: source register of proto range end (NLA_U32: nft_registers)
+ * @NFTA_NAT_FAMILY: NAT family (NLA_U32)
+ * @NFTA_NAT_REG_ADDR_MIN: source register of address range start (NLA_U32: nft_registers)
+ * @NFTA_NAT_REG_ADDR_MAX: source register of address range end (NLA_U32: nft_registers)
+ * @NFTA_NAT_REG_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers)
+ * @NFTA_NAT_REG_PROTO_MAX: source register of proto range end (NLA_U32: nft_registers)
  */
 enum nft_nat_attributes {
 	NFTA_NAT_UNSPEC,
 	NFTA_NAT_TYPE,
-	NFTA_NAT_ADDR_MIN,
-	NFTA_NAT_ADDR_MAX,
-	NFTA_NAT_PROTO_MIN,
-	NFTA_NAT_PROTO_MAX,
+	NFTA_NAT_FAMILY,
+	NFTA_NAT_REG_ADDR_MIN,
+	NFTA_NAT_REG_ADDR_MAX,
+	NFTA_NAT_REG_PROTO_MIN,
+	NFTA_NAT_REG_PROTO_MAX,
 	__NFTA_NAT_MAX
 };
 #define NFTA_NAT_MAX		(__NFTA_NAT_MAX - 1)
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index ae65fe9..1f37ef6 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -50,6 +50,7 @@ config NFT_CHAIN_ROUTE_IPV4
 
 config NFT_CHAIN_NAT_IPV4
 	depends on NF_TABLES_IPV4
+	depends on NF_NAT_IPV4 && NFT_NAT
 	tristate "IPv4 nf_tables nat chain support"
 
 config IP_NF_IPTABLES
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index e09c201..cf2c792 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
  * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ * Copyright (c) 2012 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -14,10 +15,8 @@
 #include <linux/list.h>
 #include <linux/skbuff.h>
 #include <linux/ip.h>
-#include <linux/netlink.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_nat.h>
@@ -27,147 +26,6 @@
 #include <net/netfilter/nf_nat_l3proto.h>
 #include <net/ip.h>
 
-struct nft_nat {
-	enum nft_registers	sreg_addr_min:8;
-	enum nft_registers	sreg_addr_max:8;
-	enum nft_registers	sreg_proto_min:8;
-	enum nft_registers	sreg_proto_max:8;
-	enum nf_nat_manip_type	type;
-};
-
-static void nft_nat_eval(const struct nft_expr *expr,
-			 struct nft_data data[NFT_REG_MAX + 1],
-			 const struct nft_pktinfo *pkt)
-{
-	const struct nft_nat *priv = nft_expr_priv(expr);
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo);
-	struct nf_nat_range range;
-
-	memset(&range, 0, sizeof(range));
-	if (priv->sreg_addr_min) {
-		range.min_addr.ip = data[priv->sreg_addr_min].data[0];
-		range.max_addr.ip = data[priv->sreg_addr_max].data[0];
-		range.flags |= NF_NAT_RANGE_MAP_IPS;
-	}
-
-	if (priv->sreg_proto_min) {
-		range.min_proto.all = data[priv->sreg_proto_min].data[0];
-		range.max_proto.all = data[priv->sreg_proto_max].data[0];
-		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
-	}
-
-	data[NFT_REG_VERDICT].verdict =
-		nf_nat_setup_info(ct, &range, priv->type);
-}
-
-static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = {
-	[NFTA_NAT_ADDR_MIN]	= { .type = NLA_U32 },
-	[NFTA_NAT_ADDR_MAX]	= { .type = NLA_U32 },
-	[NFTA_NAT_PROTO_MIN]	= { .type = NLA_U32 },
-	[NFTA_NAT_PROTO_MAX]	= { .type = NLA_U32 },
-	[NFTA_NAT_TYPE]		= { .type = NLA_U32 },
-};
-
-static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
-			const struct nlattr * const tb[])
-{
-	struct nft_nat *priv = nft_expr_priv(expr);
-	int err;
-
-	if (tb[NFTA_NAT_TYPE] == NULL)
-		return -EINVAL;
-
-	switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) {
-	case NFT_NAT_SNAT:
-		priv->type = NF_NAT_MANIP_SRC;
-		break;
-	case NFT_NAT_DNAT:
-		priv->type = NF_NAT_MANIP_DST;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	if (tb[NFTA_NAT_ADDR_MIN]) {
-		priv->sreg_addr_min = ntohl(nla_get_be32(tb[NFTA_NAT_ADDR_MIN]));
-		err = nft_validate_input_register(priv->sreg_addr_min);
-		if (err < 0)
-			return err;
-	}
-
-	if (tb[NFTA_NAT_ADDR_MAX]) {
-		priv->sreg_addr_max = ntohl(nla_get_be32(tb[NFTA_NAT_ADDR_MAX]));
-		err = nft_validate_input_register(priv->sreg_addr_max);
-		if (err < 0)
-			return err;
-	} else
-		priv->sreg_addr_max = priv->sreg_addr_min;
-
-	if (tb[NFTA_NAT_PROTO_MIN]) {
-		priv->sreg_proto_min = ntohl(nla_get_be32(tb[NFTA_NAT_PROTO_MIN]));
-		err = nft_validate_input_register(priv->sreg_proto_min);
-		if (err < 0)
-			return err;
-	}
-
-	if (tb[NFTA_NAT_PROTO_MAX]) {
-		priv->sreg_proto_max = ntohl(nla_get_be32(tb[NFTA_NAT_PROTO_MAX]));
-		err = nft_validate_input_register(priv->sreg_proto_max);
-		if (err < 0)
-			return err;
-	} else
-		priv->sreg_proto_max = priv->sreg_proto_min;
-
-	return 0;
-}
-
-static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr)
-{
-	const struct nft_nat *priv = nft_expr_priv(expr);
-
-	switch (priv->type) {
-	case NF_NAT_MANIP_SRC:
-		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT)))
-			goto nla_put_failure;
-		break;
-	case NF_NAT_MANIP_DST:
-		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT)))
-			goto nla_put_failure;
-		break;
-	}
-
-	if (nla_put_be32(skb, NFTA_NAT_ADDR_MIN, htonl(priv->sreg_addr_min)))
-		goto nla_put_failure;
-	if (nla_put_be32(skb, NFTA_NAT_ADDR_MAX, htonl(priv->sreg_addr_max)))
-		goto nla_put_failure;
-	if (nla_put_be32(skb, NFTA_NAT_PROTO_MIN, htonl(priv->sreg_proto_min)))
-		goto nla_put_failure;
-	if (nla_put_be32(skb, NFTA_NAT_PROTO_MAX, htonl(priv->sreg_proto_max)))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -1;
-}
-
-static struct nft_expr_type nft_nat_type;
-static const struct nft_expr_ops nft_nat_ops = {
-	.type		= &nft_nat_type,
-	.size		= NFT_EXPR_SIZE(sizeof(struct nft_nat)),
-	.eval		= nft_nat_eval,
-	.init		= nft_nat_init,
-	.dump		= nft_nat_dump,
-};
-
-static struct nft_expr_type nft_nat_type __read_mostly = {
-	.name		= "nat",
-	.ops		= &nft_nat_ops,
-	.policy		= nft_nat_policy,
-	.maxattr	= NFTA_NAT_MAX,
-	.owner		= THIS_MODULE,
-};
-
 /*
  * NAT chains
  */
@@ -306,7 +164,7 @@ static unsigned int nf_nat_output(const struct nf_hook_ops *ops,
 	return ret;
 }
 
-struct nf_chain_type nft_chain_nat_ipv4 = {
+static struct nf_chain_type nft_chain_nat_ipv4 = {
 	.family		= NFPROTO_IPV4,
 	.name		= "nat",
 	.type		= NFT_CHAIN_T_NAT,
@@ -331,20 +189,11 @@ static int __init nft_chain_nat_init(void)
 	if (err < 0)
 		return err;
 
-	err = nft_register_expr(&nft_nat_type);
-	if (err < 0)
-		goto err;
-
 	return 0;
-
-err:
-	nft_unregister_chain_type(&nft_chain_nat_ipv4);
-	return err;
 }
 
 static void __exit nft_chain_nat_exit(void)
 {
-	nft_unregister_expr(&nft_nat_type);
 	nft_unregister_chain_type(&nft_chain_nat_ipv4);
 }
 
@@ -354,4 +203,3 @@ module_exit(nft_chain_nat_exit);
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
 MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat");
-MODULE_ALIAS_NFT_EXPR("nat");
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 2383306..7702f9e 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -33,6 +33,11 @@ config NFT_CHAIN_ROUTE_IPV6
 	depends on NF_TABLES_IPV6
 	tristate "IPv6 nf_tables route chain support"
 
+config NFT_CHAIN_NAT_IPV6
+	depends on NF_TABLES_IPV6
+	depends on NF_NAT_IPV6 && NFT_NAT
+	tristate "IPv6 nf_tables nat chain support"
+
 config IP6_NF_IPTABLES
 	tristate "IP6 tables support (required for filtering)"
 	depends on INET && IPV6
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index be4913a..d1b4928 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
 # nf_tables
 obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o
 obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o
+obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o
 
 # matches
 obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
new file mode 100644
index 0000000..e86dcd7
--- /dev/null
+++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv6.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/ipv6.h>
+
+/*
+ * IPv6 NAT chains
+ */
+
+static unsigned int nf_nat_ipv6_fn(const struct nf_hook_ops *ops,
+			      struct sk_buff *skb,
+			      const struct net_device *in,
+			      const struct net_device *out,
+			      int (*okfn)(struct sk_buff *))
+{
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+	struct nf_conn_nat *nat;
+	enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
+	__be16 frag_off;
+	int hdrlen;
+	u8 nexthdr;
+	struct nft_pktinfo pkt;
+	unsigned int ret;
+
+	if (ct == NULL || nf_ct_is_untracked(ct))
+		return NF_ACCEPT;
+
+	nat = nfct_nat(ct);
+	if (nat == NULL) {
+		/* Conntrack module was loaded late, can't add extension. */
+		if (nf_ct_is_confirmed(ct))
+			return NF_ACCEPT;
+		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
+		if (nat == NULL)
+			return NF_ACCEPT;
+	}
+
+	switch (ctinfo) {
+	case IP_CT_RELATED:
+	case IP_CT_RELATED + IP_CT_IS_REPLY:
+		nexthdr = ipv6_hdr(skb)->nexthdr;
+		hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr),
+					  &nexthdr, &frag_off);
+
+		if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
+			if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo,
+							   ops->hooknum,
+							   hdrlen))
+				return NF_DROP;
+			else
+				return NF_ACCEPT;
+		}
+		/* Fall through */
+	case IP_CT_NEW:
+		if (nf_nat_initialized(ct, maniptype))
+			break;
+
+		nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out);
+
+		ret = nft_do_chain_pktinfo(&pkt, ops);
+		if (ret != NF_ACCEPT)
+			return ret;
+		if (!nf_nat_initialized(ct, maniptype)) {
+			ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
+			if (ret != NF_ACCEPT)
+				return ret;
+		}
+	default:
+		break;
+	}
+
+	return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
+}
+
+static unsigned int nf_nat_ipv6_prerouting(const struct nf_hook_ops *ops,
+				      struct sk_buff *skb,
+				      const struct net_device *in,
+				      const struct net_device *out,
+				      int (*okfn)(struct sk_buff *))
+{
+	struct in6_addr daddr = ipv6_hdr(skb)->daddr;
+	unsigned int ret;
+
+	ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr))
+		skb_dst_drop(skb);
+
+	return ret;
+}
+
+static unsigned int nf_nat_ipv6_postrouting(const struct nf_hook_ops *ops,
+				       struct sk_buff *skb,
+				       const struct net_device *in,
+				       const struct net_device *out,
+				       int (*okfn)(struct sk_buff *))
+{
+	enum ip_conntrack_info ctinfo __maybe_unused;
+	const struct nf_conn *ct __maybe_unused;
+	unsigned int ret;
+
+	ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn);
+#ifdef CONFIG_XFRM
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
+	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+		if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3,
+				      &ct->tuplehash[!dir].tuple.dst.u3) ||
+		    (ct->tuplehash[dir].tuple.src.u.all !=
+		     ct->tuplehash[!dir].tuple.dst.u.all))
+			if (nf_xfrm_me_harder(skb, AF_INET6) < 0)
+				ret = NF_DROP;
+	}
+#endif
+	return ret;
+}
+
+static unsigned int nf_nat_ipv6_output(const struct nf_hook_ops *ops,
+				  struct sk_buff *skb,
+				  const struct net_device *in,
+				  const struct net_device *out,
+				  int (*okfn)(struct sk_buff *))
+{
+	enum ip_conntrack_info ctinfo;
+	const struct nf_conn *ct;
+	unsigned int ret;
+
+	ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_STOLEN &&
+	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+		if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3,
+				      &ct->tuplehash[!dir].tuple.src.u3)) {
+			if (ip6_route_me_harder(skb))
+				ret = NF_DROP;
+		}
+#ifdef CONFIG_XFRM
+		else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
+			 ct->tuplehash[dir].tuple.dst.u.all !=
+			 ct->tuplehash[!dir].tuple.src.u.all)
+			if (nf_xfrm_me_harder(skb, AF_INET6))
+				ret = NF_DROP;
+#endif
+	}
+	return ret;
+}
+
+static struct nf_chain_type nft_chain_nat_ipv6 = {
+	.family		= NFPROTO_IPV6,
+	.name		= "nat",
+	.type		= NFT_CHAIN_T_NAT,
+	.hook_mask	= (1 << NF_INET_PRE_ROUTING) |
+			  (1 << NF_INET_POST_ROUTING) |
+			  (1 << NF_INET_LOCAL_OUT) |
+			  (1 << NF_INET_LOCAL_IN),
+	.fn		= {
+		[NF_INET_PRE_ROUTING]	= nf_nat_ipv6_prerouting,
+		[NF_INET_POST_ROUTING]	= nf_nat_ipv6_postrouting,
+		[NF_INET_LOCAL_OUT]	= nf_nat_ipv6_output,
+		[NF_INET_LOCAL_IN]	= nf_nat_ipv6_fn,
+	},
+	.me		= THIS_MODULE,
+};
+
+static int __init nft_chain_nat_ipv6_init(void)
+{
+	int err;
+
+	err = nft_register_chain_type(&nft_chain_nat_ipv6);
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
+static void __exit nft_chain_nat_ipv6_exit(void)
+{
+	nft_unregister_chain_type(&nft_chain_nat_ipv6);
+}
+
+module_init(nft_chain_nat_ipv6_init);
+module_exit(nft_chain_nat_ipv6_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>");
+MODULE_ALIAS_NFT_CHAIN(AF_INET6, "nat");
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 49e3627..48acec1 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -450,6 +450,12 @@ config NFT_LIMIT
 	depends on NF_TABLES
 	tristate "Netfilter nf_tables limit module"
 
+config NFT_NAT
+	depends on NF_TABLES
+	depends on NF_CONNTRACK
+	depends on NF_NAT
+	tristate "Netfilter nf_tables nat module"
+
 config NFT_COMPAT
 	depends on NF_TABLES
 	depends on NETFILTER_XTABLES
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index a678145..394483b 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -75,6 +75,7 @@ obj-$(CONFIG_NFT_EXTHDR)	+= nft_exthdr.o
 obj-$(CONFIG_NFT_META)		+= nft_meta.o
 obj-$(CONFIG_NFT_CT)		+= nft_ct.o
 obj-$(CONFIG_NFT_LIMIT)		+= nft_limit.o
+obj-$(CONFIG_NFT_NAT)		+= nft_nat.o
 #nf_tables-objs			+= nft_meta_target.o
 obj-$(CONFIG_NFT_RBTREE)	+= nft_rbtree.o
 obj-$(CONFIG_NFT_HASH)		+= nft_hash.o
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
new file mode 100644
index 0000000..b0b87b2
--- /dev/null
+++ b/net/netfilter/nft_nat.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
+ * Copyright (c) 2012 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/string.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#include <net/ip.h>
+
+struct nft_nat {
+	enum nft_registers      sreg_addr_min:8;
+	enum nft_registers      sreg_addr_max:8;
+	enum nft_registers      sreg_proto_min:8;
+	enum nft_registers      sreg_proto_max:8;
+	int                     family;
+	enum nf_nat_manip_type  type;
+};
+
+static void nft_nat_eval(const struct nft_expr *expr,
+			 struct nft_data data[NFT_REG_MAX + 1],
+			 const struct nft_pktinfo *pkt)
+{
+	const struct nft_nat *priv = nft_expr_priv(expr);
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct = nf_ct_get(pkt->skb, &ctinfo);
+	struct nf_nat_range range;
+
+	memset(&range, 0, sizeof(range));
+	if (priv->sreg_addr_min) {
+		if (priv->family == AF_INET) {
+			range.min_addr.ip = data[priv->sreg_addr_min].data[0];
+			range.max_addr.ip = data[priv->sreg_addr_max].data[0];
+
+		} else {
+			memcpy(range.min_addr.ip6,
+			       data[priv->sreg_addr_min].data,
+			       sizeof(struct nft_data));
+			memcpy(range.max_addr.ip6,
+			       data[priv->sreg_addr_max].data,
+			       sizeof(struct nft_data));
+		}
+		range.flags |= NF_NAT_RANGE_MAP_IPS;
+	}
+
+	if (priv->sreg_proto_min) {
+		range.min_proto.all = data[priv->sreg_proto_min].data[0];
+		range.max_proto.all = data[priv->sreg_proto_max].data[0];
+		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+	}
+
+	data[NFT_REG_VERDICT].verdict =
+		nf_nat_setup_info(ct, &range, priv->type);
+}
+
+static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = {
+	[NFTA_NAT_TYPE]		 = { .type = NLA_U32 },
+	[NFTA_NAT_FAMILY]	 = { .type = NLA_U32 },
+	[NFTA_NAT_REG_ADDR_MIN]	 = { .type = NLA_U32 },
+	[NFTA_NAT_REG_ADDR_MAX]	 = { .type = NLA_U32 },
+	[NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 },
+	[NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 },
+};
+
+static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+			const struct nlattr * const tb[])
+{
+	struct nft_nat *priv = nft_expr_priv(expr);
+	int err;
+
+	if (tb[NFTA_NAT_TYPE] == NULL)
+		return -EINVAL;
+
+	switch (ntohl(nla_get_be32(tb[NFTA_NAT_TYPE]))) {
+	case NFT_NAT_SNAT:
+		priv->type = NF_NAT_MANIP_SRC;
+		break;
+	case NFT_NAT_DNAT:
+		priv->type = NF_NAT_MANIP_DST;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (tb[NFTA_NAT_FAMILY] == NULL)
+		return -EINVAL;
+
+	priv->family = ntohl(nla_get_be32(tb[NFTA_NAT_FAMILY]));
+	if (priv->family != AF_INET && priv->family != AF_INET6)
+		return -EINVAL;
+
+	if (tb[NFTA_NAT_REG_ADDR_MIN]) {
+		priv->sreg_addr_min = ntohl(nla_get_be32(
+						tb[NFTA_NAT_REG_ADDR_MIN]));
+		err = nft_validate_input_register(priv->sreg_addr_min);
+		if (err < 0)
+			return err;
+	}
+
+	if (tb[NFTA_NAT_REG_ADDR_MAX]) {
+		priv->sreg_addr_max = ntohl(nla_get_be32(
+						tb[NFTA_NAT_REG_ADDR_MAX]));
+		err = nft_validate_input_register(priv->sreg_addr_max);
+		if (err < 0)
+			return err;
+	} else
+		priv->sreg_addr_max = priv->sreg_addr_min;
+
+	if (tb[NFTA_NAT_REG_PROTO_MIN]) {
+		priv->sreg_proto_min = ntohl(nla_get_be32(
+						tb[NFTA_NAT_REG_PROTO_MIN]));
+		err = nft_validate_input_register(priv->sreg_proto_min);
+		if (err < 0)
+			return err;
+	}
+
+	if (tb[NFTA_NAT_REG_PROTO_MAX]) {
+		priv->sreg_proto_max = ntohl(nla_get_be32(
+						tb[NFTA_NAT_REG_PROTO_MAX]));
+		err = nft_validate_input_register(priv->sreg_proto_max);
+		if (err < 0)
+			return err;
+	} else
+		priv->sreg_proto_max = priv->sreg_proto_min;
+
+	return 0;
+}
+
+static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_nat *priv = nft_expr_priv(expr);
+
+	switch (priv->type) {
+	case NF_NAT_MANIP_SRC:
+		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_SNAT)))
+			goto nla_put_failure;
+		break;
+	case NF_NAT_MANIP_DST:
+		if (nla_put_be32(skb, NFTA_NAT_TYPE, htonl(NFT_NAT_DNAT)))
+			goto nla_put_failure;
+		break;
+	}
+
+	if (nla_put_be32(skb, NFTA_NAT_FAMILY, htonl(priv->family)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb,
+			 NFTA_NAT_REG_ADDR_MIN, htonl(priv->sreg_addr_min)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb,
+			 NFTA_NAT_REG_ADDR_MAX, htonl(priv->sreg_addr_max)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb,
+			 NFTA_NAT_REG_PROTO_MIN, htonl(priv->sreg_proto_min)))
+		goto nla_put_failure;
+	if (nla_put_be32(skb,
+			 NFTA_NAT_REG_PROTO_MAX, htonl(priv->sreg_proto_max)))
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static struct nft_expr_type nft_nat_type;
+static const struct nft_expr_ops nft_nat_ops = {
+	.type           = &nft_nat_type,
+	.size           = NFT_EXPR_SIZE(sizeof(struct nft_nat)),
+	.eval           = nft_nat_eval,
+	.init           = nft_nat_init,
+	.dump           = nft_nat_dump,
+};
+
+static struct nft_expr_type nft_nat_type __read_mostly = {
+	.name           = "nat",
+	.ops            = &nft_nat_ops,
+	.policy         = nft_nat_policy,
+	.maxattr        = NFTA_NAT_MAX,
+	.owner          = THIS_MODULE,
+};
+
+static int __init nft_nat_module_init(void)
+{
+	int err;
+
+	err = nft_register_expr(&nft_nat_type);
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
+static void __exit nft_nat_module_exit(void)
+{
+	nft_unregister_expr(&nft_nat_type);
+}
+
+module_init(nft_nat_module_init);
+module_exit(nft_nat_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>");
+MODULE_ALIAS_NFT_EXPR("nat");
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 09/17] netfilter: nf_tables: add compatibility layer for x_tables
From: Pablo Neira Ayuso @ 2013-10-14 16:38 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, kaber, netdev
In-Reply-To: <1381768738-17739-1-git-send-email-pablo@netfilter.org>

This patch adds the x_tables compatibility layer. This allows you
to use existing x_tables matches and targets from nf_tables.

This compatibility layer allows us to use existing matches/targets
for features that are still missing in nf_tables. We can progressively
replace them with native nf_tables extensions. It also provides the
userspace compatibility software that allows you to express the
rule-set using the iptables syntax but using the nf_tables kernel
components.

In order to get this compatibility layer working, I've done the
following things:

* add NFNL_SUBSYS_NFT_COMPAT: this new nfnetlink subsystem is used
to query the x_tables match/target revision, so we don't need to
use the native x_tables getsockopt interface.

* emulate xt structures: this required extending the struct nft_pktinfo
to include the fragment offset, which is already obtained from
ip[6]_tables and that is used by some matches/targets.

* add support for default policy to base chains, required to emulate
  x_tables.

* add NFTA_CHAIN_USE attribute to obtain the number of references to
  chains, required by x_tables emulation.

* add chain packet/byte counters using per-cpu.

* support 32-64 bits compat.

For historical reasons, this patch includes the following patches
that were posted in the netfilter-devel mailing list.

From Pablo Neira Ayuso:
* nf_tables: add default policy to base chains
* netfilter: nf_tables: add NFTA_CHAIN_USE attribute
* nf_tables: nft_compat: private data of target and matches in contiguous area
* nf_tables: validate hooks for compat match/target
* nf_tables: nft_compat: release cached matches/targets
* nf_tables: x_tables support as a compile time option
* nf_tables: fix alias for xtables over nftables module
* nf_tables: add packet and byte counters per chain
* nf_tables: fix per-chain counter stats if no counters are passed
* nf_tables: don't bump chain stats
* nf_tables: add protocol and flags for xtables over nf_tables
* nf_tables: add ip[6]t_entry emulation
* nf_tables: move specific layer 3 compat code to nf_tables_ipv[4|6]
* nf_tables: support 32bits-64bits x_tables compat
* nf_tables: fix compilation if CONFIG_COMPAT is disabled

From Patrick McHardy:
* nf_tables: move policy to struct nft_base_chain
* nf_tables: send notifications for base chain policy changes

From Alexander Primak:
* nf_tables: remove the duplicate NF_INET_LOCAL_OUT

From Nicolas Dichtel:
* nf_tables: fix compilation when nf-netlink is a module

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h               |   44 +-
 include/net/netfilter/nf_tables_ipv4.h          |   23 +
 include/net/netfilter/nf_tables_ipv6.h          |   30 +
 include/uapi/linux/netfilter/Kbuild             |    1 +
 include/uapi/linux/netfilter/nf_tables.h        |   32 +
 include/uapi/linux/netfilter/nf_tables_compat.h |   38 ++
 include/uapi/linux/netfilter/nfnetlink.h        |    3 +-
 net/ipv4/netfilter/nf_tables_ipv4.c             |   32 +-
 net/ipv4/netfilter/nft_chain_nat_ipv4.c         |    6 +-
 net/ipv4/netfilter/nft_chain_route_ipv4.c       |    6 +-
 net/ipv6/netfilter/nf_tables_ipv6.c             |   33 +-
 net/ipv6/netfilter/nft_chain_route_ipv6.c       |    8 +-
 net/netfilter/Kconfig                           |    9 +
 net/netfilter/Makefile                          |    1 +
 net/netfilter/nf_tables_api.c                   |  220 ++++++-
 net/netfilter/nf_tables_core.c                  |   46 +-
 net/netfilter/nft_cmp.c                         |    3 +-
 net/netfilter/nft_compat.c                      |  768 +++++++++++++++++++++++
 net/netfilter/nft_immediate.c                   |   12 +-
 net/netfilter/nft_payload.c                     |    4 +-
 20 files changed, 1241 insertions(+), 78 deletions(-)
 create mode 100644 include/net/netfilter/nf_tables_ipv4.h
 create mode 100644 include/net/netfilter/nf_tables_ipv6.h
 create mode 100644 include/uapi/linux/netfilter/nf_tables_compat.h
 create mode 100644 net/netfilter/nft_compat.c

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 8403f7f5..a68f45f 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -3,6 +3,7 @@
 
 #include <linux/list.h>
 #include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/nf_tables.h>
 #include <net/netlink.h>
 
@@ -15,8 +16,23 @@ struct nft_pktinfo {
 	u8				hooknum;
 	u8				nhoff;
 	u8				thoff;
+	/* for x_tables compatibility */
+	struct xt_action_param		xt;
 };
 
+static inline void nft_set_pktinfo(struct nft_pktinfo *pkt,
+				   const struct nf_hook_ops *ops,
+				   struct sk_buff *skb,
+				   const struct net_device *in,
+				   const struct net_device *out)
+{
+	pkt->skb = skb;
+	pkt->in = pkt->xt.in = in;
+	pkt->out = pkt->xt.out = out;
+	pkt->hooknum = pkt->xt.hooknum = ops->hooknum;
+	pkt->xt.family = ops->pf;
+}
+
 struct nft_data {
 	union {
 		u32				data[4];
@@ -57,6 +73,7 @@ static inline void nft_data_debug(const struct nft_data *data)
  * 	@afi: address family info
  * 	@table: the table the chain is contained in
  * 	@chain: the chain the rule is contained in
+ *	@nla: netlink attributes
  */
 struct nft_ctx {
 	const struct sk_buff		*skb;
@@ -64,6 +81,7 @@ struct nft_ctx {
 	const struct nft_af_info	*afi;
 	const struct nft_table		*table;
 	const struct nft_chain		*chain;
+	const struct nlattr * const 	*nla;
 };
 
 struct nft_data_desc {
@@ -235,7 +253,8 @@ extern void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
  *	@maxattr: highest netlink attribute number
  */
 struct nft_expr_type {
-	const struct nft_expr_ops	*(*select_ops)(const struct nlattr * const tb[]);
+	const struct nft_expr_ops	*(*select_ops)(const struct nft_ctx *,
+						       const struct nlattr * const tb[]);
 	const struct nft_expr_ops	*ops;
 	struct list_head		list;
 	const char			*name;
@@ -253,6 +272,8 @@ struct nft_expr_type {
  *	@destroy: destruction function
  *	@dump: function to dump parameters
  *	@type: expression type
+ *	@validate: validate expression, called during loop detection
+ *	@data: extra data to attach to this expression operation
  */
 struct nft_expr;
 struct nft_expr_ops {
@@ -267,8 +288,11 @@ struct nft_expr_ops {
 	void				(*destroy)(const struct nft_expr *expr);
 	int				(*dump)(struct sk_buff *skb,
 						const struct nft_expr *expr);
-	const struct nft_data *		(*get_verdict)(const struct nft_expr *expr);
+	int				(*validate)(const struct nft_ctx *ctx,
+						    const struct nft_expr *expr,
+						    const struct nft_data **data);
 	const struct nft_expr_type	*type;
+	void				*data;
 };
 
 #define NFT_EXPR_MAXATTR		16
@@ -368,16 +392,25 @@ enum nft_chain_type {
 	NFT_CHAIN_T_MAX
 };
 
+struct nft_stats {
+	u64 bytes;
+	u64 pkts;
+};
+
 /**
  *	struct nft_base_chain - nf_tables base chain
  *
  *	@ops: netfilter hook ops
  *	@type: chain type
+ *	@policy: default policy
+ *	@stats: per-cpu chain stats
  *	@chain: the chain
  */
 struct nft_base_chain {
 	struct nf_hook_ops		ops;
 	enum nft_chain_type		type;
+	u8				policy;
+	struct nft_stats __percpu	*stats;
 	struct nft_chain		chain;
 };
 
@@ -386,11 +419,8 @@ static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chai
 	return container_of(chain, struct nft_base_chain, chain);
 }
 
-extern unsigned int nft_do_chain(const struct nf_hook_ops *ops,
-				 struct sk_buff *skb,
-				 const struct net_device *in,
-				 const struct net_device *out,
-				 int (*okfn)(struct sk_buff *));
+extern unsigned int nft_do_chain_pktinfo(struct nft_pktinfo *pkt,
+					 const struct nf_hook_ops *ops);
 
 /**
  *	struct nft_table - nf_tables table
diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h
new file mode 100644
index 0000000..1be1c2c
--- /dev/null
+++ b/include/net/netfilter/nf_tables_ipv4.h
@@ -0,0 +1,23 @@
+#ifndef _NF_TABLES_IPV4_H_
+#define _NF_TABLES_IPV4_H_
+
+#include <net/netfilter/nf_tables.h>
+#include <net/ip.h>
+
+static inline void
+nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt,
+		     const struct nf_hook_ops *ops,
+		     struct sk_buff *skb,
+		     const struct net_device *in,
+		     const struct net_device *out)
+{
+	struct iphdr *ip;
+
+	nft_set_pktinfo(pkt, ops, skb, in, out);
+
+	pkt->xt.thoff = ip_hdrlen(pkt->skb);
+	ip = ip_hdr(pkt->skb);
+	pkt->xt.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
+}
+
+#endif
diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h
new file mode 100644
index 0000000..4a9b88a
--- /dev/null
+++ b/include/net/netfilter/nf_tables_ipv6.h
@@ -0,0 +1,30 @@
+#ifndef _NF_TABLES_IPV6_H_
+#define _NF_TABLES_IPV6_H_
+
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/ipv6.h>
+
+static inline int
+nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt,
+		     const struct nf_hook_ops *ops,
+		     struct sk_buff *skb,
+		     const struct net_device *in,
+		     const struct net_device *out)
+{
+	int protohdr, thoff = 0;
+	unsigned short frag_off;
+
+	nft_set_pktinfo(pkt, ops, skb, in, out);
+
+	protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, NULL);
+	/* If malformed, drop it */
+	if (protohdr < 0)
+		return -1;
+
+	pkt->xt.thoff = thoff;
+	pkt->xt.fragoff = frag_off;
+
+	return 0;
+}
+
+#endif
diff --git a/include/uapi/linux/netfilter/Kbuild b/include/uapi/linux/netfilter/Kbuild
index 6ce0b7f..17c3af2 100644
--- a/include/uapi/linux/netfilter/Kbuild
+++ b/include/uapi/linux/netfilter/Kbuild
@@ -6,6 +6,7 @@ header-y += nf_conntrack_sctp.h
 header-y += nf_conntrack_tcp.h
 header-y += nf_conntrack_tuple_common.h
 header-y += nf_tables.h
+header-y += nf_tables_compat.h
 header-y += nf_nat.h
 header-y += nfnetlink.h
 header-y += nfnetlink_acct.h
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 779cf95..1563875 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -115,7 +115,10 @@ enum nft_table_attributes {
  * @NFTA_CHAIN_HANDLE: numeric handle of the chain (NLA_U64)
  * @NFTA_CHAIN_NAME: name of the chain (NLA_STRING)
  * @NFTA_CHAIN_HOOK: hook specification for basechains (NLA_NESTED: nft_hook_attributes)
+ * @NFTA_CHAIN_POLICY: numeric policy of the chain (NLA_U32)
+ * @NFTA_CHAIN_USE: number of references to this chain (NLA_U32)
  * @NFTA_CHAIN_TYPE: type name of the string (NLA_NUL_STRING)
+ * @NFTA_CHAIN_COUNTERS: counter specification of the chain (NLA_NESTED: nft_counter_attributes)
  */
 enum nft_chain_attributes {
 	NFTA_CHAIN_UNSPEC,
@@ -123,7 +126,10 @@ enum nft_chain_attributes {
 	NFTA_CHAIN_HANDLE,
 	NFTA_CHAIN_NAME,
 	NFTA_CHAIN_HOOK,
+	NFTA_CHAIN_POLICY,
+	NFTA_CHAIN_USE,
 	NFTA_CHAIN_TYPE,
+	NFTA_CHAIN_COUNTERS,
 	__NFTA_CHAIN_MAX
 };
 #define NFTA_CHAIN_MAX		(__NFTA_CHAIN_MAX - 1)
@@ -135,6 +141,7 @@ enum nft_chain_attributes {
  * @NFTA_RULE_CHAIN: name of the chain containing the rule (NLA_STRING)
  * @NFTA_RULE_HANDLE: numeric handle of the rule (NLA_U64)
  * @NFTA_RULE_EXPRESSIONS: list of expressions (NLA_NESTED: nft_expr_attributes)
+ * @NFTA_RULE_COMPAT: compatibility specifications of the rule (NLA_NESTED: nft_rule_compat_attributes)
  */
 enum nft_rule_attributes {
 	NFTA_RULE_UNSPEC,
@@ -142,11 +149,36 @@ enum nft_rule_attributes {
 	NFTA_RULE_CHAIN,
 	NFTA_RULE_HANDLE,
 	NFTA_RULE_EXPRESSIONS,
+	NFTA_RULE_COMPAT,
 	__NFTA_RULE_MAX
 };
 #define NFTA_RULE_MAX		(__NFTA_RULE_MAX - 1)
 
 /**
+ * enum nft_rule_compat_flags - nf_tables rule compat flags
+ *
+ * @NFT_RULE_COMPAT_F_INV: invert the check result
+ */
+enum nft_rule_compat_flags {
+	NFT_RULE_COMPAT_F_INV	= (1 << 1),
+	NFT_RULE_COMPAT_F_MASK	= NFT_RULE_COMPAT_F_INV,
+};
+
+/**
+ * enum nft_rule_compat_attributes - nf_tables rule compat attributes
+ *
+ * @NFTA_RULE_COMPAT_PROTO: numeric value of handled protocol (NLA_U32)
+ * @NFTA_RULE_COMPAT_FLAGS: bitmask of enum nft_rule_compat_flags (NLA_U32)
+ */
+enum nft_rule_compat_attributes {
+	NFTA_RULE_COMPAT_UNSPEC,
+	NFTA_RULE_COMPAT_PROTO,
+	NFTA_RULE_COMPAT_FLAGS,
+	__NFTA_RULE_COMPAT_MAX
+};
+#define NFTA_RULE_COMPAT_MAX	(__NFTA_RULE_COMPAT_MAX - 1)
+
+/**
  * enum nft_set_flags - nf_tables set flags
  *
  * @NFT_SET_ANONYMOUS: name allocation, automatic cleanup on unlink
diff --git a/include/uapi/linux/netfilter/nf_tables_compat.h b/include/uapi/linux/netfilter/nf_tables_compat.h
new file mode 100644
index 0000000..8310f5f
--- /dev/null
+++ b/include/uapi/linux/netfilter/nf_tables_compat.h
@@ -0,0 +1,38 @@
+#ifndef _NFT_COMPAT_NFNETLINK_H_
+#define _NFT_COMPAT_NFNETLINK_H_
+
+enum nft_target_attributes {
+	NFTA_TARGET_UNSPEC,
+	NFTA_TARGET_NAME,
+	NFTA_TARGET_REV,
+	NFTA_TARGET_INFO,
+	__NFTA_TARGET_MAX
+};
+#define NFTA_TARGET_MAX		(__NFTA_TARGET_MAX - 1)
+
+enum nft_match_attributes {
+	NFTA_MATCH_UNSPEC,
+	NFTA_MATCH_NAME,
+	NFTA_MATCH_REV,
+	NFTA_MATCH_INFO,
+	__NFTA_MATCH_MAX
+};
+#define NFTA_MATCH_MAX		(__NFTA_MATCH_MAX - 1)
+
+#define NFT_COMPAT_NAME_MAX	32
+
+enum {
+	NFNL_MSG_COMPAT_GET,
+	NFNL_MSG_COMPAT_MAX
+};
+
+enum {
+	NFTA_COMPAT_UNSPEC = 0,
+	NFTA_COMPAT_NAME,
+	NFTA_COMPAT_REV,
+	NFTA_COMPAT_TYPE,
+	__NFTA_COMPAT_MAX,
+};
+#define NFTA_COMPAT_MAX (__NFTA_COMPAT_MAX - 1)
+
+#endif
diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h
index d276c3b..2889594 100644
--- a/include/uapi/linux/netfilter/nfnetlink.h
+++ b/include/uapi/linux/netfilter/nfnetlink.h
@@ -54,6 +54,7 @@ struct nfgenmsg {
 #define NFNL_SUBSYS_CTNETLINK_TIMEOUT	8
 #define NFNL_SUBSYS_CTHELPER		9
 #define NFNL_SUBSYS_NFTABLES		10
-#define NFNL_SUBSYS_COUNT		11
+#define NFNL_SUBSYS_NFT_COMPAT		11
+#define NFNL_SUBSYS_COUNT		12
 
 #endif /* _UAPI_NFNETLINK_H */
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index 23525c4..c61cffb 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -15,6 +15,8 @@
 #include <linux/netfilter_ipv4.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/ip.h>
+#include <net/net_namespace.h>
+#include <net/netfilter/nf_tables_ipv4.h>
 
 static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
 				    struct sk_buff *skb,
@@ -22,6 +24,8 @@ static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
 				    const struct net_device *out,
 				    int (*okfn)(struct sk_buff *))
 {
+	struct nft_pktinfo pkt;
+
 	if (unlikely(skb->len < sizeof(struct iphdr) ||
 		     ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) {
 		if (net_ratelimit())
@@ -29,8 +33,9 @@ static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
 				"packet\n");
 		return NF_ACCEPT;
 	}
+	nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
 
-	return nft_do_chain(ops, skb, in, out, okfn);
+	return nft_do_chain_pktinfo(&pkt, ops);
 }
 
 static struct nft_af_info nft_af_ipv4 __read_mostly = {
@@ -42,6 +47,21 @@ static struct nft_af_info nft_af_ipv4 __read_mostly = {
 	},
 };
 
+
+static unsigned int
+nft_do_chain_ipv4(const struct nf_hook_ops *ops,
+		  struct sk_buff *skb,
+		  const struct net_device *in,
+		  const struct net_device *out,
+		  int (*okfn)(struct sk_buff *))
+{
+	struct nft_pktinfo pkt;
+
+	nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
+
+	return nft_do_chain_pktinfo(&pkt, ops);
+}
+
 static struct nf_chain_type filter_ipv4 = {
 	.family		= NFPROTO_IPV4,
 	.name		= "filter",
@@ -52,11 +72,11 @@ static struct nf_chain_type filter_ipv4 = {
 			  (1 << NF_INET_PRE_ROUTING) |
 			  (1 << NF_INET_POST_ROUTING),
 	.fn		= {
-		[NF_INET_LOCAL_IN]	= nft_do_chain,
-		[NF_INET_LOCAL_OUT]	= nft_do_chain,
-		[NF_INET_FORWARD]	= nft_do_chain,
-		[NF_INET_PRE_ROUTING]	= nft_do_chain,
-		[NF_INET_POST_ROUTING]	= nft_do_chain,
+		[NF_INET_LOCAL_IN]	= nft_do_chain_ipv4,
+		[NF_INET_LOCAL_OUT]	= nft_ipv4_output,
+		[NF_INET_FORWARD]	= nft_do_chain_ipv4,
+		[NF_INET_PRE_ROUTING]	= nft_do_chain_ipv4,
+		[NF_INET_POST_ROUTING]	= nft_do_chain_ipv4,
 	},
 };
 
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index cd28630..e09c201 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -23,6 +23,7 @@
 #include <net/netfilter/nf_nat.h>
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv4.h>
 #include <net/netfilter/nf_nat_l3proto.h>
 #include <net/ip.h>
 
@@ -181,6 +182,7 @@ static unsigned int nf_nat_fn(const struct nf_hook_ops *ops,
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
 	struct nf_conn_nat *nat;
 	enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
+	struct nft_pktinfo pkt;
 	unsigned int ret;
 
 	if (ct == NULL || nf_ct_is_untracked(ct))
@@ -213,7 +215,9 @@ static unsigned int nf_nat_fn(const struct nf_hook_ops *ops,
 		if (nf_nat_initialized(ct, maniptype))
 			break;
 
-		ret = nft_do_chain(ops, skb, in, out, okfn);
+		nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
+
+		ret = nft_do_chain_pktinfo(&pkt, ops);
 		if (ret != NF_ACCEPT)
 			return ret;
 		if (!nf_nat_initialized(ct, maniptype)) {
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
index 6b84e09..4e6bf9a 100644
--- a/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -17,6 +17,7 @@
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv4.h>
 #include <net/route.h>
 #include <net/ip.h>
 
@@ -27,6 +28,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
 					int (*okfn)(struct sk_buff *))
 {
 	unsigned int ret;
+	struct nft_pktinfo pkt;
 	u32 mark;
 	__be32 saddr, daddr;
 	u_int8_t tos;
@@ -37,13 +39,15 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
 	    ip_hdrlen(skb) < sizeof(struct iphdr))
 		return NF_ACCEPT;
 
+	nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
+
 	mark = skb->mark;
 	iph = ip_hdr(skb);
 	saddr = iph->saddr;
 	daddr = iph->daddr;
 	tos = iph->tos;
 
-	ret = nft_do_chain(ops, skb, in, out, okfn);
+	ret = nft_do_chain_pktinfo(&pkt, ops);
 	if (ret != NF_DROP && ret != NF_QUEUE) {
 		iph = ip_hdr(skb);
 
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index 3631d62..42f905a 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -14,6 +14,7 @@
 #include <linux/ipv6.h>
 #include <linux/netfilter_ipv6.h>
 #include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv6.h>
 
 static unsigned int nft_ipv6_output(const struct nf_hook_ops *ops,
 				    struct sk_buff *skb,
@@ -21,14 +22,18 @@ static unsigned int nft_ipv6_output(const struct nf_hook_ops *ops,
 				    const struct net_device *out,
 				    int (*okfn)(struct sk_buff *))
 {
+	struct nft_pktinfo pkt;
+
 	if (unlikely(skb->len < sizeof(struct ipv6hdr))) {
 		if (net_ratelimit())
 			pr_info("nf_tables_ipv6: ignoring short SOCK_RAW "
 				"packet\n");
 		return NF_ACCEPT;
 	}
+	if (nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out) < 0)
+		return NF_DROP;
 
-	return nft_do_chain(ops, skb, in, out, okfn);
+	return nft_do_chain_pktinfo(&pkt, ops);
 }
 
 static struct nft_af_info nft_af_ipv6 __read_mostly = {
@@ -40,6 +45,22 @@ static struct nft_af_info nft_af_ipv6 __read_mostly = {
 	},
 };
 
+/*
+ * Generic IPv6 hook entry point: build the packet info for @skb and run
+ * the chain attached to @ops.  The packet is dropped when
+ * nft_set_pktinfo_ipv6() reports an error.  @okfn is not used.
+ */
+static unsigned int
+nft_do_chain_ipv6(const struct nf_hook_ops *ops,
+		  struct sk_buff *skb,
+		  const struct net_device *in,
+		  const struct net_device *out,
+		  int (*okfn)(struct sk_buff *))
+{
+	struct nft_pktinfo pkt;
+	int err;
+
+	err = nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out);
+	if (err < 0)
+		return NF_DROP;	/* malformed packet, drop it */
+
+	return nft_do_chain_pktinfo(&pkt, ops);
+}
+
 static struct nf_chain_type filter_ipv6 = {
 	.family		= NFPROTO_IPV6,
 	.name		= "filter",
@@ -50,11 +71,11 @@ static struct nf_chain_type filter_ipv6 = {
 			  (1 << NF_INET_PRE_ROUTING) |
 			  (1 << NF_INET_POST_ROUTING),
 	.fn		= {
-		[NF_INET_LOCAL_IN]	= nft_do_chain,
-		[NF_INET_LOCAL_OUT]	= nft_do_chain,
-		[NF_INET_FORWARD]	= nft_do_chain,
-		[NF_INET_PRE_ROUTING]	= nft_do_chain,
-		[NF_INET_POST_ROUTING]	= nft_do_chain,
+		[NF_INET_LOCAL_IN]	= nft_do_chain_ipv6,
+		[NF_INET_LOCAL_OUT]	= nft_ipv6_output,
+		[NF_INET_FORWARD]	= nft_do_chain_ipv6,
+		[NF_INET_PRE_ROUTING]	= nft_do_chain_ipv6,
+		[NF_INET_POST_ROUTING]	= nft_do_chain_ipv6,
 	},
 };
 
diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c
index 4cdc992..3fe40f0 100644
--- a/net/ipv6/netfilter/nft_chain_route_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c
@@ -19,6 +19,7 @@
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nf_tables.h>
 #include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_ipv6.h>
 #include <net/route.h>
 
 static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
@@ -28,10 +29,15 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
 					int (*okfn)(struct sk_buff *))
 {
 	unsigned int ret;
+	struct nft_pktinfo pkt;
 	struct in6_addr saddr, daddr;
 	u_int8_t hop_limit;
 	u32 mark, flowlabel;
 
+	/* malformed packet, drop it */
+	if (nft_set_pktinfo_ipv6(&pkt, ops, skb, in, out) < 0)
+		return NF_DROP;
+
 	/* save source/dest address, mark, hoplimit, flowlabel, priority */
 	memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
 	memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr));
@@ -41,7 +47,7 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
 	/* flowlabel and prio (includes version, which shouldn't change either */
 	flowlabel = *((u32 *)ipv6_hdr(skb));
 
-	ret = nft_do_chain(ops, skb, in, out, okfn);
+	ret = nft_do_chain_pktinfo(&pkt, ops);
 	if (ret != NF_DROP && ret != NF_QUEUE &&
 	    (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) ||
 	     memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) ||
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index aa184a46..49e3627 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -450,6 +450,15 @@ config NFT_LIMIT
 	depends on NF_TABLES
 	tristate "Netfilter nf_tables limit module"
 
+config NFT_COMPAT
+	depends on NF_TABLES
+	depends on NETFILTER_XTABLES
+	tristate "Netfilter x_tables over nf_tables module"
+	help
+	  This is required if you intend to use any of the existing
+	  x_tables match/target extensions over the nf_tables
+	  framework.
+
 config NETFILTER_XTABLES
 	tristate "Netfilter Xtables support (required for ip_tables)"
 	default m if NETFILTER_ADVANCED=n
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index b6b7875..a678145 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -70,6 +70,7 @@ nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o
 nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o
 
 obj-$(CONFIG_NF_TABLES)		+= nf_tables.o
+obj-$(CONFIG_NFT_COMPAT)	+= nft_compat.o
 obj-$(CONFIG_NFT_EXTHDR)	+= nft_exthdr.o
 obj-$(CONFIG_NFT_META)		+= nft_meta.o
 obj-$(CONFIG_NFT_CT)		+= nft_ct.o
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 9c2d8d5..61e017b 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -438,7 +438,9 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
 	[NFTA_CHAIN_NAME]	= { .type = NLA_STRING,
 				    .len = NFT_CHAIN_MAXNAMELEN - 1 },
 	[NFTA_CHAIN_HOOK]	= { .type = NLA_NESTED },
+	[NFTA_CHAIN_POLICY]	= { .type = NLA_U32 },
 	[NFTA_CHAIN_TYPE]	= { .type = NLA_NUL_STRING },
+	[NFTA_CHAIN_COUNTERS]	= { .type = NLA_NESTED },
 };
 
 static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
@@ -446,6 +448,33 @@ static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
 	[NFTA_HOOK_PRIORITY]	= { .type = NLA_U32 },
 };
 
+/*
+ * Sum the per-cpu packet/byte counters in @stats over all possible CPUs
+ * and emit the totals into @skb as a nested NFTA_CHAIN_COUNTERS attribute.
+ *
+ * Returns 0 on success, -ENOSPC if an attribute could not be added.
+ */
+static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats)
+{
+	struct nft_stats sum;
+	struct nlattr *counters;
+	int cpu;
+
+	memset(&sum, 0, sizeof(sum));
+	for_each_possible_cpu(cpu) {
+		const struct nft_stats *percpu = per_cpu_ptr(stats, cpu);
+
+		sum.pkts  += percpu->pkts;
+		sum.bytes += percpu->bytes;
+	}
+
+	counters = nla_nest_start(skb, NFTA_CHAIN_COUNTERS);
+	if (counters == NULL)
+		return -ENOSPC;
+
+	if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(sum.pkts)))
+		return -ENOSPC;
+	if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(sum.bytes)))
+		return -ENOSPC;
+
+	nla_nest_end(skb, counters);
+	return 0;
+}
+
 static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq,
 				     int event, u32 flags, int family,
 				     const struct nft_table *table,
@@ -472,8 +501,11 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq,
 		goto nla_put_failure;
 
 	if (chain->flags & NFT_BASE_CHAIN) {
-		const struct nf_hook_ops *ops = &nft_base_chain(chain)->ops;
-		struct nlattr *nest = nla_nest_start(skb, NFTA_CHAIN_HOOK);
+		const struct nft_base_chain *basechain = nft_base_chain(chain);
+		const struct nf_hook_ops *ops = &basechain->ops;
+		struct nlattr *nest;
+
+		nest = nla_nest_start(skb, NFTA_CHAIN_HOOK);
 		if (nest == NULL)
 			goto nla_put_failure;
 		if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum)))
@@ -482,11 +514,21 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq,
 			goto nla_put_failure;
 		nla_nest_end(skb, nest);
 
+		if (nla_put_be32(skb, NFTA_CHAIN_POLICY,
+				 htonl(basechain->policy)))
+			goto nla_put_failure;
+
 		if (nla_put_string(skb, NFTA_CHAIN_TYPE,
 			chain_type[ops->pf][nft_base_chain(chain)->type]->name))
 				goto nla_put_failure;
+
+		if (nft_dump_stats(skb, nft_base_chain(chain)->stats))
+			goto nla_put_failure;
 	}
 
+	if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use)))
+		goto nla_put_failure;
+
 	return nlmsg_end(skb, nlh);
 
 nla_put_failure:
@@ -617,6 +659,67 @@ err:
 	return err;
 }
 
+/*
+ * Set the default policy of base chain @chain from netlink attribute @attr
+ * (a big-endian 32-bit verdict).  Only NF_DROP and NF_ACCEPT are accepted;
+ * any other value yields -EINVAL and leaves the chain untouched.
+ */
+static int
+nf_tables_chain_policy(struct nft_base_chain *chain, const struct nlattr *attr)
+{
+	u32 policy = ntohl(nla_get_be32(attr));
+
+	if (policy != NF_DROP && policy != NF_ACCEPT)
+		return -EINVAL;
+
+	chain->policy = policy;
+	return 0;
+}
+
+/* Netlink policy for the attributes nested inside NFTA_CHAIN_COUNTERS. */
+static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = {
+	[NFTA_COUNTER_PACKETS]	= { .type = NLA_U64 },
+	[NFTA_COUNTER_BYTES]	= { .type = NLA_U64 },
+};
+
+/*
+ * Replace the per-cpu counters of base chain @chain with the totals carried
+ * in the nested NFTA_CHAIN_COUNTERS attribute @attr.
+ *
+ * Both NFTA_COUNTER_BYTES and NFTA_COUNTER_PACKETS must be present.
+ * Returns 0 on success, the negative errno from attribute parsing, -EINVAL
+ * if a counter is missing, or -ENOMEM if the per-cpu allocation fails.
+ */
+static int
+nf_tables_counters(struct nft_base_chain *chain, const struct nlattr *attr)
+{
+	struct nlattr *tb[NFTA_COUNTER_MAX+1];
+	struct nft_stats __percpu *newstats;
+	struct nft_stats *stats;
+	int err;
+
+	err = nla_parse_nested(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS])
+		return -EINVAL;
+
+	newstats = alloc_percpu(struct nft_stats);
+	if (newstats == NULL)
+		return -ENOMEM;
+
+	/* Restore old counters on this cpu, no problem. Per-cpu statistics
+	 * are not exposed to userspace.  This function may sleep (see the
+	 * synchronize_rcu() below), so it is entered preemptible: pin the
+	 * CPU while the per-cpu slot is being written.
+	 */
+	stats = get_cpu_ptr(newstats);
+	stats->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
+	stats->pkts = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
+	put_cpu_ptr(newstats);
+
+	if (chain->stats) {
+		/* nfnl_lock is held, add some nfnl function for this, later */
+		struct nft_stats __percpu *oldstats =
+			rcu_dereference_protected(chain->stats, 1);
+
+		/* Publish the new counters, then wait for readers of the
+		 * old ones before freeing them.
+		 */
+		rcu_assign_pointer(chain->stats, newstats);
+		synchronize_rcu();
+		free_percpu(oldstats);
+	} else
+		rcu_assign_pointer(chain->stats, newstats);
+
+	return 0;
+}
+
 static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 			      const struct nlmsghdr *nlh,
 			      const struct nlattr * const nla[])
@@ -626,7 +729,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 	const struct nft_af_info *afi;
 	struct nft_table *table;
 	struct nft_chain *chain;
-	struct nft_base_chain *basechain;
+	struct nft_base_chain *basechain = NULL;
 	struct nlattr *ha[NFTA_HOOK_MAX + 1];
 	int family = nfmsg->nfgen_family;
 	u64 handle = 0;
@@ -673,6 +776,26 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 		    !IS_ERR(nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME])))
 			return -EEXIST;
 
+		if (nla[NFTA_CHAIN_POLICY]) {
+			if (!(chain->flags & NFT_BASE_CHAIN))
+				return -EOPNOTSUPP;
+
+			err = nf_tables_chain_policy(nft_base_chain(chain),
+						     nla[NFTA_CHAIN_POLICY]);
+			if (err < 0)
+				return err;
+		}
+
+		if (nla[NFTA_CHAIN_COUNTERS]) {
+			if (!(chain->flags & NFT_BASE_CHAIN))
+				return -EOPNOTSUPP;
+
+			err = nf_tables_counters(nft_base_chain(chain),
+						 nla[NFTA_CHAIN_COUNTERS]);
+			if (err < 0)
+				return err;
+		}
+
 		if (nla[NFTA_CHAIN_HANDLE] && name)
 			nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN);
 
@@ -727,6 +850,36 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 			ops->hook = afi->hooks[ops->hooknum];
 
 		chain->flags |= NFT_BASE_CHAIN;
+
+		if (nla[NFTA_CHAIN_POLICY]) {
+			err = nf_tables_chain_policy(basechain,
+						     nla[NFTA_CHAIN_POLICY]);
+			if (err < 0) {
+				free_percpu(basechain->stats);
+				kfree(basechain);
+				return err;
+			}
+		} else
+			basechain->policy = NF_ACCEPT;
+
+		if (nla[NFTA_CHAIN_COUNTERS]) {
+			err = nf_tables_counters(basechain,
+						 nla[NFTA_CHAIN_COUNTERS]);
+			if (err < 0) {
+				free_percpu(basechain->stats);
+				kfree(basechain);
+				return err;
+			}
+		} else {
+			struct nft_stats __percpu *newstats;
+
+			newstats = alloc_percpu(struct nft_stats);
+			if (newstats == NULL)
+				return -ENOMEM;
+
+			rcu_assign_pointer(nft_base_chain(chain)->stats,
+					   newstats);
+		}
 	} else {
 		chain = kzalloc(sizeof(*chain), GFP_KERNEL);
 		if (chain == NULL)
@@ -739,6 +892,15 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 
 	list_add_tail(&chain->list, &table->chains);
 	table->use++;
+
+	if (chain->flags & NFT_BASE_CHAIN) {
+		err = nf_register_hook(&nft_base_chain(chain)->ops);
+		if (err < 0) {
+			free_percpu(basechain->stats);
+			kfree(basechain);
+			return err;
+		}
+	}
 notify:
 	nf_tables_chain_notify(skb, nlh, table, chain, NFT_MSG_NEWCHAIN,
 			       family);
@@ -751,9 +913,10 @@ static void nf_tables_rcu_chain_destroy(struct rcu_head *head)
 
 	BUG_ON(chain->use > 0);
 
-	if (chain->flags & NFT_BASE_CHAIN)
+	if (chain->flags & NFT_BASE_CHAIN) {
+		free_percpu(nft_base_chain(chain)->stats);
 		kfree(nft_base_chain(chain));
-	else
+	} else
 		kfree(chain);
 }
 
@@ -801,13 +964,15 @@ static void nft_ctx_init(struct nft_ctx *ctx,
 			 const struct nlmsghdr *nlh,
 			 const struct nft_af_info *afi,
 			 const struct nft_table *table,
-			 const struct nft_chain *chain)
+			 const struct nft_chain *chain,
+			 const struct nlattr * const *nla)
 {
 	ctx->skb   = skb;
 	ctx->nlh   = nlh;
 	ctx->afi   = afi;
 	ctx->table = table;
 	ctx->chain = chain;
+	ctx->nla   = nla;
 }
 
 /*
@@ -910,7 +1075,8 @@ struct nft_expr_info {
 	struct nlattr			*tb[NFT_EXPR_MAXATTR + 1];
 };
 
-static int nf_tables_expr_parse(const struct nlattr *nla,
+static int nf_tables_expr_parse(const struct nft_ctx *ctx,
+				const struct nlattr *nla,
 				struct nft_expr_info *info)
 {
 	const struct nft_expr_type *type;
@@ -935,7 +1101,8 @@ static int nf_tables_expr_parse(const struct nlattr *nla,
 		memset(info->tb, 0, sizeof(info->tb[0]) * (type->maxattr + 1));
 
 	if (type->select_ops != NULL) {
-		ops = type->select_ops((const struct nlattr * const *)info->tb);
+		ops = type->select_ops(ctx,
+				       (const struct nlattr * const *)info->tb);
 		if (IS_ERR(ops)) {
 			err = PTR_ERR(ops);
 			goto err1;
@@ -1012,6 +1179,7 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
 				    .len = NFT_CHAIN_MAXNAMELEN - 1 },
 	[NFTA_RULE_HANDLE]	= { .type = NLA_U64 },
 	[NFTA_RULE_EXPRESSIONS]	= { .type = NLA_NESTED },
+	[NFTA_RULE_COMPAT]	= { .type = NLA_NESTED },
 };
 
 static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq,
@@ -1269,6 +1437,8 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 		handle = nf_tables_alloc_handle(table);
 	}
 
+	nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+
 	n = 0;
 	size = 0;
 	if (nla[NFTA_RULE_EXPRESSIONS]) {
@@ -1278,7 +1448,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 				goto err1;
 			if (n == NFT_RULE_MAXEXPRS)
 				goto err1;
-			err = nf_tables_expr_parse(tmp, &info[n]);
+			err = nf_tables_expr_parse(&ctx, tmp, &info[n]);
 			if (err < 0)
 				goto err1;
 			size += info[n].ops->size;
@@ -1294,7 +1464,6 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 	rule->handle = handle;
 	rule->dlen   = size;
 
-	nft_ctx_init(&ctx, skb, nlh, afi, table, chain);
 	expr = nft_expr_first(rule);
 	for (i = 0; i < n; i++) {
 		err = nf_tables_newexpr(&ctx, &info[i], expr);
@@ -1304,13 +1473,6 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 		expr = nft_expr_next(expr);
 	}
 
-	/* Register hook when first rule is inserted into a base chain */
-	if (list_empty(&chain->rules) && chain->flags & NFT_BASE_CHAIN) {
-		err = nf_register_hook(&nft_base_chain(chain)->ops);
-		if (err < 0)
-			goto err2;
-	}
-
 	if (nlh->nlmsg_flags & NLM_F_REPLACE) {
 		list_replace_rcu(&old_rule->list, &rule->list);
 		nf_tables_rule_destroy(old_rule);
@@ -1379,10 +1541,6 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
 		}
 	}
 
-	/* Unregister hook when last rule from base chain is deleted */
-	if (list_empty(&chain->rules) && chain->flags & NFT_BASE_CHAIN)
-		nf_unregister_hook(&nft_base_chain(chain)->ops);
-
 	return 0;
 }
 
@@ -1470,7 +1628,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx,
 			return PTR_ERR(table);
 	}
 
-	nft_ctx_init(ctx, skb, nlh, afi, table, NULL);
+	nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla);
 	return 0;
 }
 
@@ -1799,7 +1957,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	nft_ctx_init(&ctx, skb, nlh, afi, table, NULL);
+	nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
 
 	set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]);
 	if (IS_ERR(set)) {
@@ -1987,7 +2145,7 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx,
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	nft_ctx_init(ctx, skb, nlh, afi, table, NULL);
+	nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla);
 	return 0;
 }
 
@@ -2435,23 +2593,27 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
 {
 	const struct nft_rule *rule;
 	const struct nft_expr *expr, *last;
-	const struct nft_data *data;
 	const struct nft_set *set;
 	struct nft_set_binding *binding;
 	struct nft_set_iter iter;
-	int err;
 
 	if (ctx->chain == chain)
 		return -ELOOP;
 
 	list_for_each_entry(rule, &chain->rules, list) {
 		nft_rule_for_each_expr(expr, last, rule) {
-			if (!expr->ops->get_verdict)
+			const struct nft_data *data = NULL;
+			int err;
+
+			if (!expr->ops->validate)
 				continue;
 
-			data = expr->ops->get_verdict(expr);
+			err = expr->ops->validate(ctx, expr, &data);
+			if (err < 0)
+				return err;
+
 			if (data == NULL)
-				break;
+				continue;
 
 			switch (data->verdict) {
 			case NFT_JUMP:
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 9aede59..e51a45c 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -60,27 +60,34 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr,
 	return true;
 }
 
-unsigned int nft_do_chain(const struct nf_hook_ops *ops,
-			  struct sk_buff *skb,
-			  const struct net_device *in,
-			  const struct net_device *out,
-			  int (*okfn)(struct sk_buff *))
+/* One saved position on the jump stack used by nft_do_chain_pktinfo():
+ * the chain and the rule recorded for a stack slot.
+ */
+struct nft_jumpstack {
+	const struct nft_chain	*chain;
+	const struct nft_rule	*rule;
+};
+
+/*
+ * Account @pkt against the base chain's per-cpu counters.  When the
+ * traversal is inside a jump (@stackptr != 0) the base chain is taken from
+ * the bottom of @jumpstack; otherwise @this is used as the base chain.
+ */
+static inline void
+nft_chain_stats(const struct nft_chain *this, const struct nft_pktinfo *pkt,
+		struct nft_jumpstack *jumpstack, unsigned int stackptr)
+{
+	const struct nft_chain *base = this;
+	struct nft_stats __percpu *counters;
+
+	if (stackptr)
+		base = jumpstack[0].chain;
+
+	rcu_read_lock_bh();
+	counters = rcu_dereference(nft_base_chain(base)->stats);
+	__this_cpu_inc(counters->pkts);
+	__this_cpu_add(counters->bytes, pkt->skb->len);
+	rcu_read_unlock_bh();
+}
+
+unsigned int
+nft_do_chain_pktinfo(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops)
 {
 	const struct nft_chain *chain = ops->priv;
 	const struct nft_rule *rule;
 	const struct nft_expr *expr, *last;
 	struct nft_data data[NFT_REG_MAX + 1];
-	const struct nft_pktinfo pkt = {
-		.skb		= skb,
-		.in		= in,
-		.out		= out,
-		.hooknum	= ops->hooknum,
-	};
 	unsigned int stackptr = 0;
-	struct {
-		const struct nft_chain	*chain;
-		const struct nft_rule	*rule;
-	} jumpstack[NFT_JUMP_STACK_SIZE];
+	struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE];
 
 do_chain:
 	rule = list_entry(&chain->rules, struct nft_rule, list);
@@ -91,8 +98,8 @@ next_rule:
 			if (expr->ops == &nft_cmp_fast_ops)
 				nft_cmp_fast_eval(expr, data);
 			else if (expr->ops != &nft_payload_fast_ops ||
-				 !nft_payload_fast_eval(expr, data, &pkt))
-				expr->ops->eval(expr, data, &pkt);
+				 !nft_payload_fast_eval(expr, data, pkt))
+				expr->ops->eval(expr, data, pkt);
 
 			if (data[NFT_REG_VERDICT].verdict != NFT_CONTINUE)
 				break;
@@ -135,10 +142,11 @@ next_rule:
 		rule  = jumpstack[stackptr].rule;
 		goto next_rule;
 	}
+	nft_chain_stats(chain, pkt, jumpstack, stackptr);
 
-	return NF_ACCEPT;
+	return nft_base_chain(chain)->policy;
 }
-EXPORT_SYMBOL_GPL(nft_do_chain);
+EXPORT_SYMBOL_GPL(nft_do_chain_pktinfo);
 
 int __init nf_tables_core_module_init(void)
 {
diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c
index 37134f3..954925d 100644
--- a/net/netfilter/nft_cmp.c
+++ b/net/netfilter/nft_cmp.c
@@ -162,7 +162,8 @@ const struct nft_expr_ops nft_cmp_fast_ops = {
 	.dump		= nft_cmp_fast_dump,
 };
 
-static const struct nft_expr_ops *nft_cmp_select_ops(const struct nlattr * const tb[])
+static const struct nft_expr_ops *
+nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[])
 {
 	struct nft_data_desc desc;
 	struct nft_data data;
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
new file mode 100644
index 0000000..4811f76
--- /dev/null
+++ b/net/netfilter/nft_compat.c
@@ -0,0 +1,768 @@
+/*
+ * (C) 2012-2013 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This software has been sponsored by Sophos Astaro <http://www.sophos.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <linux/netfilter/nf_tables_compat.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <asm/uaccess.h> /* for set_fs */
+#include <net/netfilter/nf_tables.h>
+
+/* Scratch rule entry handed to xtables extensions as par->entryinfo;
+ * holds either an IPv4 or an IPv6 entry.
+ */
+union nft_entry {
+	struct ipt_entry e4;
+	struct ip6t_entry e6;
+};
+
+/* Point @par at the extension @xt and its private data @xt_info, and
+ * clear the hotdrop flag.
+ */
+static inline void
+nft_compat_set_par(struct xt_action_param *par, void *xt, const void *xt_info)
+{
+	par->hotdrop	= false;
+	par->targinfo	= xt_info;
+	par->target	= xt;
+}
+
+/*
+ * Run the xtables target bound to @expr on the packet described by @pkt
+ * and store the result in the verdict register: XT_CONTINUE maps to
+ * NFT_CONTINUE, a hotdrop request forces NF_DROP, and any other return
+ * value is stored unchanged.
+ */
+static void nft_target_eval(const struct nft_expr *expr,
+			    struct nft_data data[NFT_REG_MAX + 1],
+			    const struct nft_pktinfo *pkt)
+{
+	struct xt_target *target = expr->ops->data;
+	void *info = nft_expr_priv(expr);
+	int verdict;
+
+	nft_compat_set_par((struct xt_action_param *)&pkt->xt, target, info);
+
+	verdict = target->target(pkt->skb, &pkt->xt);
+	if (pkt->xt.hotdrop)
+		verdict = NF_DROP;
+
+	if (verdict == XT_CONTINUE)
+		data[NFT_REG_VERDICT].verdict = NFT_CONTINUE;
+	else
+		data[NFT_REG_VERDICT].verdict = verdict;
+}
+
+/* Netlink policy for target expression attributes: name, revision and
+ * the target's binary private data.
+ */
+static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = {
+	[NFTA_TARGET_NAME]	= { .type = NLA_NUL_STRING },
+	[NFTA_TARGET_REV]	= { .type = NLA_U32 },
+	[NFTA_TARGET_INFO]	= { .type = NLA_BINARY },
+};
+
+/*
+ * Fill in the xtables checkentry parameters @par for @target from the
+ * nf_tables context @ctx.
+ *
+ * @entry is handed to the target as par->entryinfo; only its protocol and
+ * protocol-inversion flag are set here, from @proto and @inv, and only for
+ * AF_INET and AF_INET6.  par->hook_mask holds the hook bit of a base chain
+ * and is 0 for any other chain.
+ */
+static void
+nft_target_set_tgchk_param(struct xt_tgchk_param *par,
+			   const struct nft_ctx *ctx,
+			   struct xt_target *target, void *info,
+			   union nft_entry *entry, u8 proto, bool inv)
+{
+	par->net	= &init_net;
+	par->table	= ctx->table->name;
+	switch (ctx->afi->family) {
+	case AF_INET:
+		entry->e4.ip.proto = proto;
+		entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0;
+		break;
+	case AF_INET6:
+		entry->e6.ipv6.proto = proto;
+		entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0;
+		break;
+	}
+	par->entryinfo	= entry;
+	par->target	= target;
+	par->targinfo	= info;
+	if (ctx->chain->flags & NFT_BASE_CHAIN) {
+		const struct nft_base_chain *basechain =
+						nft_base_chain(ctx->chain);
+		const struct nf_hook_ops *ops = &basechain->ops;
+
+		par->hook_mask = 1 << ops->hooknum;
+	} else {
+		/* @par is an uninitialised stack variable in the caller:
+		 * do not leave stale bits in the mask for non-base chains.
+		 */
+		par->hook_mask = 0;
+	}
+	par->family	= ctx->afi->family;
+}
+
+/*
+ * Copy the target's private data from @in to @out.
+ *
+ * With CONFIG_COMPAT and a compat_from_user() handler, the handler does
+ * the conversion and the XT_ALIGN() padding after targetsize is zeroed;
+ * otherwise XT_ALIGN(targetsize) bytes are copied verbatim.
+ */
+static void target_compat_from_user(struct xt_target *t, void *in, void *out)
+{
+#ifdef CONFIG_COMPAT
+	if (t->compat_from_user) {
+		int pad;
+
+		t->compat_from_user(out, in);
+		pad = XT_ALIGN(t->targetsize) - t->targetsize;
+		if (pad > 0)
+			memset(out + t->targetsize, 0, pad);
+	} else
+#endif
+		/* body of the else above when CONFIG_COMPAT is set */
+		memcpy(out, in, XT_ALIGN(t->targetsize));
+}
+
+/* Return xt_compat_target_offset() for @target when CONFIG_COMPAT is
+ * enabled, 0 otherwise.
+ */
+static inline int nft_compat_target_offset(struct xt_target *target)
+{
+#ifdef CONFIG_COMPAT
+	return xt_compat_target_offset(target);
+#else
+	return 0;
+#endif
+}
+
+/* Netlink policy for the attributes nested inside NFTA_RULE_COMPAT. */
+static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1] = {
+	[NFTA_RULE_COMPAT_PROTO]	= { .type = NLA_U32 },
+	[NFTA_RULE_COMPAT_FLAGS]	= { .type = NLA_U32 },
+};
+
+/*
+ * Parse the nested NFTA_RULE_COMPAT attribute @attr.
+ *
+ * Sets *@inv to true when NFT_RULE_COMPAT_F_INV is present in the flags
+ * (it is never cleared here) and returns the protocol value.
+ *
+ * NOTE(review): the return type is u8, so the negative errno values
+ * returned on the error paths below, and any protocol value above 255,
+ * are truncated; callers cannot tell an error from a protocol number.
+ */
+static u8 nft_parse_compat(const struct nlattr *attr, bool *inv)
+{
+	struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1];
+	u32 flags;
+	int err;
+
+	err = nla_parse_nested(tb, NFTA_RULE_COMPAT_MAX, attr,
+			       nft_rule_compat_policy);
+	if (err < 0)
+		return err;
+
+	/* both the protocol and the flags attribute are mandatory */
+	if (!tb[NFTA_RULE_COMPAT_PROTO] || !tb[NFTA_RULE_COMPAT_FLAGS])
+		return -EINVAL;
+
+	flags = ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_FLAGS]));
+	/* reject flag bits outside the supported mask */
+	if (flags & ~NFT_RULE_COMPAT_F_MASK)
+		return -EINVAL;
+	if (flags & NFT_RULE_COMPAT_F_INV)
+		*inv = true;
+
+	return ntohl(nla_get_be32(tb[NFTA_RULE_COMPAT_PROTO]));
+}
+
+/*
+ * Initialise a target expression: copy the target's private data out of
+ * NFTA_TARGET_INFO, pick up the protocol/inversion values from the rule's
+ * NFTA_RULE_COMPAT attribute when present, and run xt_check_target().
+ *
+ * Returns 0 on success or a negative errno.  On failure a reference on
+ * the target's module is dropped with module_put().
+ */
+static int
+nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+		const struct nlattr * const tb[])
+{
+	void *info = nft_expr_priv(expr);
+	struct xt_target *target = expr->ops->data;
+	struct xt_tgchk_param par;
+	size_t size = XT_ALIGN(nla_len(tb[NFTA_TARGET_INFO]));
+	u8 proto = 0;
+	bool inv = false;
+	union nft_entry e = {};
+	int ret;
+
+	target_compat_from_user(target, nla_data(tb[NFTA_TARGET_INFO]), info);
+
+	/* NOTE(review): nft_parse_compat() returns u8, so a parse failure
+	 * cannot be detected here.
+	 */
+	if (ctx->nla[NFTA_RULE_COMPAT])
+		proto = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &inv);
+
+	nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv);
+
+	ret = xt_check_target(&par, size, proto, inv);
+	if (ret < 0)
+		goto err;
+
+	/* The standard target cannot be used */
+	if (target->target == NULL) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	return 0;
+err:
+	module_put(target->me);
+	return ret;
+}
+
+/* Drop a reference on the module providing this expression's target. */
+static void
+nft_target_destroy(const struct nft_expr *expr)
+{
+	const struct xt_target *tg = expr->ops->data;
+
+	module_put(tg->me);
+}
+
+/*
+ * Emit the target's private data @in as an NFTA_TARGET_INFO attribute of
+ * XT_ALIGN(targetsize) bytes.
+ *
+ * With CONFIG_COMPAT and a compat_to_user() handler the data is first
+ * converted into a temporary buffer.  Returns the nla_put() result, or
+ * -ENOMEM if that buffer cannot be allocated.
+ */
+static int
+target_dump_info(struct sk_buff *skb, const struct xt_target *t, const void *in)
+{
+	int ret;
+
+#ifdef CONFIG_COMPAT
+	if (t->compat_to_user) {
+		mm_segment_t old_fs;
+		void *out;
+
+		out = kmalloc(XT_ALIGN(t->targetsize), GFP_ATOMIC);
+		if (out == NULL)
+			return -ENOMEM;
+
+		/* We want to reuse existing compat_to_user */
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		t->compat_to_user(out, in);
+		set_fs(old_fs);
+		ret = nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(t->targetsize), out);
+		kfree(out);
+	} else
+#endif
+		/* body of the else above when CONFIG_COMPAT is set */
+		ret = nla_put(skb, NFTA_TARGET_INFO, XT_ALIGN(t->targetsize), in);
+
+	return ret;
+}
+
+/* Dump the target's name, revision and private data into @skb.
+ * Returns 0 on success, -1 if any attribute could not be added.
+ */
+static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct xt_target *target = expr->ops->data;
+	const void *info = nft_expr_priv(expr);
+
+	if (nla_put_string(skb, NFTA_TARGET_NAME, target->name))
+		return -1;
+	if (nla_put_be32(skb, NFTA_TARGET_REV, htonl(target->revision)))
+		return -1;
+	if (target_dump_info(skb, target, info))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Check that the target may be used from the chain in @ctx.  For a base
+ * chain the chain's hook must be one of the hooks the target declares;
+ * any other chain is accepted.  @data is not touched.
+ *
+ * Returns 0 if the target is allowed, -EINVAL otherwise.
+ */
+static int nft_target_validate(const struct nft_ctx *ctx,
+			       const struct nft_expr *expr,
+			       const struct nft_data **data)
+{
+	const struct xt_target *target = expr->ops->data;
+	const struct nf_hook_ops *ops;
+
+	if (!(ctx->chain->flags & NFT_BASE_CHAIN))
+		return 0;
+
+	ops = &nft_base_chain(ctx->chain)->ops;
+	if (target->hooks & (1 << ops->hooknum))
+		return 0;
+
+	/* This target is being called from an invalid chain */
+	return -EINVAL;
+}
+
+/*
+ * Run the xtables match bound to @expr on the packet described by @pkt.
+ * A hotdrop request sets the verdict to NF_DROP; otherwise a hit yields
+ * NFT_CONTINUE and a miss NFT_BREAK.
+ */
+static void nft_match_eval(const struct nft_expr *expr,
+			   struct nft_data data[NFT_REG_MAX + 1],
+			   const struct nft_pktinfo *pkt)
+{
+	struct xt_action_param *par = (struct xt_action_param *)&pkt->xt;
+	struct xt_match *match = expr->ops->data;
+	void *info = nft_expr_priv(expr);
+	bool matched;
+
+	nft_compat_set_par(par, match, info);
+
+	matched = match->match(pkt->skb, par);
+
+	if (pkt->xt.hotdrop) {
+		data[NFT_REG_VERDICT].verdict = NF_DROP;
+		return;
+	}
+
+	data[NFT_REG_VERDICT].verdict = matched ? NFT_CONTINUE : NFT_BREAK;
+}
+
+/* Netlink policy for match expression attributes: name, revision and
+ * the match's binary private data.
+ */
+static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = {
+	[NFTA_MATCH_NAME]	= { .type = NLA_NUL_STRING },
+	[NFTA_MATCH_REV]	= { .type = NLA_U32 },
+	[NFTA_MATCH_INFO]	= { .type = NLA_BINARY },
+};
+
+/*
+ * Fill in the xtables checkentry parameters @par for @match from the
+ * nf_tables context @ctx.
+ *
+ * @entry is handed to the match as par->entryinfo; only its protocol and
+ * protocol-inversion flag are set here, from @proto and @inv, and only for
+ * AF_INET and AF_INET6.  par->hook_mask holds the hook bit of a base chain
+ * and is 0 for any other chain.
+ *
+ * struct xt_mtchk_param and xt_tgchk_param look very similar.
+ */
+static void
+nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx,
+			  struct xt_match *match, void *info,
+			  union nft_entry *entry, u8 proto, bool inv)
+{
+	par->net	= &init_net;
+	par->table	= ctx->table->name;
+	switch (ctx->afi->family) {
+	case AF_INET:
+		entry->e4.ip.proto = proto;
+		entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0;
+		break;
+	case AF_INET6:
+		entry->e6.ipv6.proto = proto;
+		entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0;
+		break;
+	}
+	par->entryinfo	= entry;
+	par->match	= match;
+	par->matchinfo	= info;
+	if (ctx->chain->flags & NFT_BASE_CHAIN) {
+		const struct nft_base_chain *basechain =
+						nft_base_chain(ctx->chain);
+		const struct nf_hook_ops *ops = &basechain->ops;
+
+		par->hook_mask = 1 << ops->hooknum;
+	} else {
+		/* @par is an uninitialised stack variable in the caller:
+		 * do not leave stale bits in the mask for non-base chains.
+		 */
+		par->hook_mask = 0;
+	}
+	par->family	= ctx->afi->family;
+}
+
+/*
+ * Copy the match's private data from @in to @out.
+ *
+ * With CONFIG_COMPAT and a compat_from_user() handler, the handler does
+ * the conversion and the XT_ALIGN() padding after matchsize is zeroed;
+ * otherwise XT_ALIGN(matchsize) bytes are copied verbatim.
+ */
+static void match_compat_from_user(struct xt_match *m, void *in, void *out)
+{
+#ifdef CONFIG_COMPAT
+	if (m->compat_from_user) {
+		int pad;
+
+		m->compat_from_user(out, in);
+		pad = XT_ALIGN(m->matchsize) - m->matchsize;
+		if (pad > 0)
+			memset(out + m->matchsize, 0, pad);
+	} else
+#endif
+		/* body of the else above when CONFIG_COMPAT is set */
+		memcpy(out, in, XT_ALIGN(m->matchsize));
+}
+
+/*
+ * Initialise a match expression: copy the match's private data out of
+ * NFTA_MATCH_INFO, pick up the protocol/inversion values from the rule's
+ * NFTA_RULE_COMPAT attribute when present, and run xt_check_match().
+ *
+ * Returns 0 on success or a negative errno.  On failure a reference on
+ * the match's module is dropped with module_put().
+ */
+static int
+nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
+		const struct nlattr * const tb[])
+{
+	void *info = nft_expr_priv(expr);
+	struct xt_match *match = expr->ops->data;
+	struct xt_mtchk_param par;
+	size_t size = XT_ALIGN(nla_len(tb[NFTA_MATCH_INFO]));
+	u8 proto = 0;
+	bool inv = false;
+	union nft_entry e = {};
+	int ret;
+
+	match_compat_from_user(match, nla_data(tb[NFTA_MATCH_INFO]), info);
+
+	/* NOTE(review): nft_parse_compat() returns u8, so a parse failure
+	 * cannot be detected here.
+	 */
+	if (ctx->nla[NFTA_RULE_COMPAT])
+		proto = nft_parse_compat(ctx->nla[NFTA_RULE_COMPAT], &inv);
+
+	nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv);
+
+	ret = xt_check_match(&par, size, proto, inv);
+	if (ret < 0)
+		goto err;
+
+	return 0;
+err:
+	module_put(match->me);
+	return ret;
+}
+
+/* Drop a reference on the module providing this expression's match. */
+static void
+nft_match_destroy(const struct nft_expr *expr)
+{
+	const struct xt_match *mt = expr->ops->data;
+
+	module_put(mt->me);
+}
+
+/*
+ * Emit the match's private data @in as an NFTA_MATCH_INFO attribute of
+ * XT_ALIGN(matchsize) bytes.
+ *
+ * With CONFIG_COMPAT and a compat_to_user() handler the data is first
+ * converted into a temporary buffer.  Returns the nla_put() result, or
+ * -ENOMEM if that buffer cannot be allocated.
+ */
+static int
+match_dump_info(struct sk_buff *skb, const struct xt_match *m, const void *in)
+{
+	int ret;
+
+#ifdef CONFIG_COMPAT
+	if (m->compat_to_user) {
+		mm_segment_t old_fs;
+		void *out;
+
+		out = kmalloc(XT_ALIGN(m->matchsize), GFP_ATOMIC);
+		if (out == NULL)
+			return -ENOMEM;
+
+		/* We want to reuse existing compat_to_user */
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		m->compat_to_user(out, in);
+		set_fs(old_fs);
+		ret = nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(m->matchsize), out);
+		kfree(out);
+	} else
+#endif
+		/* body of the else above when CONFIG_COMPAT is set */
+		ret = nla_put(skb, NFTA_MATCH_INFO, XT_ALIGN(m->matchsize), in);
+
+	return ret;
+}
+
+/* Return xt_compat_match_offset() for @match when CONFIG_COMPAT is
+ * enabled, 0 otherwise.
+ */
+static inline int nft_compat_match_offset(struct xt_match *match)
+{
+#ifdef CONFIG_COMPAT
+	return xt_compat_match_offset(match);
+#else
+	return 0;
+#endif
+}
+
+/* Dump the match's name, revision and private data into @skb.
+ * Returns 0 on success, -1 if any attribute could not be added.
+ */
+static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct xt_match *match = expr->ops->data;
+	const void *info = nft_expr_priv(expr);
+
+	if (nla_put_string(skb, NFTA_MATCH_NAME, match->name))
+		return -1;
+	if (nla_put_be32(skb, NFTA_MATCH_REV, htonl(match->revision)))
+		return -1;
+	if (match_dump_info(skb, match, info))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Check that the match may be used from the chain in @ctx.  For a base
+ * chain the chain's hook must be one of the hooks the match declares;
+ * any other chain is accepted.  @data is not touched.
+ *
+ * Returns 0 if the match is allowed, -EINVAL otherwise.
+ */
+static int nft_match_validate(const struct nft_ctx *ctx,
+			      const struct nft_expr *expr,
+			      const struct nft_data **data)
+{
+	const struct xt_match *match = expr->ops->data;
+	const struct nf_hook_ops *ops;
+
+	if (!(ctx->chain->flags & NFT_BASE_CHAIN))
+		return 0;
+
+	ops = &nft_base_chain(ctx->chain)->ops;
+	if (match->hooks & (1 << ops->hooknum))
+		return 0;
+
+	/* This match is being called from an invalid chain */
+	return -EINVAL;
+}
+
+/*
+ * Build an NFNL_SUBSYS_NFT_COMPAT netlink message in @skb carrying the
+ * extension @name, its revision @rev and the @target flag.
+ *
+ * NLM_F_MULTI is set when @portid is non-zero.  The @type argument is not
+ * used.  Returns skb->len on success, -1 if the header or an attribute
+ * does not fit.
+ */
+static int
+nfnl_compat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
+		      int event, u16 family, const char *name,
+		      int rev, int target)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	unsigned int flags = portid ? NLM_F_MULTI : 0;
+
+	/* the subsystem id goes in the upper byte of the message type */
+	event |= NFNL_SUBSYS_NFT_COMPAT << 8;
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags);
+	if (nlh == NULL)
+		goto nlmsg_failure;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family = family;
+	nfmsg->version = NFNETLINK_V0;
+	nfmsg->res_id = 0;
+
+	if (nla_put_string(skb, NFTA_COMPAT_NAME, name) ||
+	    nla_put_be32(skb, NFTA_COMPAT_REV, htonl(rev)) ||
+	    nla_put_be32(skb, NFTA_COMPAT_TYPE, htonl(target)))
+		goto nla_put_failure;
+
+	nlmsg_end(skb, nlh);
+	return skb->len;
+
+	/* NOTE(review): nlh is NULL when reached through nlmsg_failure */
+nlmsg_failure:
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -1;
+}
+
+/*
+ * NFNL_MSG_COMPAT_GET handler: look up the requested xtables extension
+ * (name, revision, type), requesting the ipt_/ip6t_ module if the lookup
+ * fails, and unicast a reply carrying the revision reported by
+ * xt_find_revision() back to the requesting port.
+ *
+ * Returns 0 on success, -EINVAL on missing attributes or an unsupported
+ * family, -ENOMEM/-ENOSPC when the reply cannot be built, -ENOBUFS when
+ * the unicast reports -EAGAIN, or the error from the lookup/unicast.
+ */
+static int
+nfnl_compat_get(struct sock *nfnl, struct sk_buff *skb,
+		const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	struct sk_buff *skb2;
+	const char *name;
+	const char *fmt;
+	int ret = 0, target;
+	int filled;
+	u32 rev;
+
+	if (!tb[NFTA_COMPAT_NAME] || !tb[NFTA_COMPAT_REV] ||
+	    !tb[NFTA_COMPAT_TYPE])
+		return -EINVAL;
+
+	name = nla_data(tb[NFTA_COMPAT_NAME]);
+	rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV]));
+	target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE]));
+
+	/* Pick the module name template matching the address family */
+	if (nfmsg->nfgen_family == AF_INET) {
+		fmt = "ipt_%s";
+	} else if (nfmsg->nfgen_family == AF_INET6) {
+		fmt = "ip6t_%s";
+	} else {
+		pr_err("nft_compat: unsupported protocol %d\n",
+			nfmsg->nfgen_family);
+		return -EINVAL;
+	}
+
+	try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name,
+						 rev, target, &ret),
+						 fmt, name);
+
+	if (ret < 0)
+		return ret;
+
+	skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (skb2 == NULL)
+		return -ENOMEM;
+
+	/* include the best revision for this extension in the message */
+	filled = nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid,
+				       nlh->nlmsg_seq,
+				       NFNL_MSG_TYPE(nlh->nlmsg_type),
+				       NFNL_MSG_COMPAT_GET,
+				       nfmsg->nfgen_family,
+				       name, ret, target);
+	if (filled <= 0) {
+		kfree_skb(skb2);
+		return -ENOSPC;
+	}
+
+	ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid,
+				MSG_DONTWAIT);
+	if (ret > 0)
+		return 0;
+	if (ret == -EAGAIN)
+		return -ENOBUFS;
+
+	return ret;
+}
+
+static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = { /* attribute policy for NFNL_MSG_COMPAT_GET */
+	[NFTA_COMPAT_NAME]	= { .type = NLA_NUL_STRING,
+				    .len = NFT_COMPAT_NAME_MAX-1 },
+	[NFTA_COMPAT_REV]	= { .type = NLA_U32 }, /* read with nla_get_be32() by the handler */
+	[NFTA_COMPAT_TYPE]	= { .type = NLA_U32 }, /* read with nla_get_be32() by the handler */
+};
+
+static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = { /* message handlers of the nft-compat subsystem */
+	[NFNL_MSG_COMPAT_GET]		= { .call = nfnl_compat_get,
+					    .attr_count = NFTA_COMPAT_MAX,
+					    .policy = nfnl_compat_policy_get },
+};
+
+static const struct nfnetlink_subsystem nfnl_compat_subsys = { /* registered with nfnetlink in nft_compat_module_init() */
+	.name		= "nft-compat",
+	.subsys_id	= NFNL_SUBSYS_NFT_COMPAT,
+	.cb_count	= NFNL_MSG_COMPAT_MAX,
+	.cb		= nfnl_nft_compat_cb,
+};
+
+static LIST_HEAD(nft_match_list);
+
+struct nft_xt { /* cached expression ops for one xtables match or target */
+	struct list_head	head; /* link in the list of cached entries */
+	struct nft_expr_ops	ops; /* ops.data points to the xt_match/xt_target */
+};
+
+static struct nft_expr_type nft_match_type;
+
+/*
+ * Select the expression operations for the xtables match described by the
+ * NFTA_MATCH_NAME / NFTA_MATCH_REV attributes in @tb for the family of @ctx.
+ *
+ * An nft_xt entry already cached in nft_match_list for the same name,
+ * revision and family is re-used. Otherwise the match is looked up with
+ * xt_request_find_match() and a new entry is allocated and added to the
+ * list.
+ *
+ * Returns the ops on success, or ERR_PTR(-EINVAL) if an attribute is
+ * missing, ERR_PTR(-ENOENT) if the match cannot be found and
+ * ERR_PTR(-ENOMEM) if the entry cannot be allocated.
+ */
+static const struct nft_expr_ops *
+nft_match_select_ops(const struct nft_ctx *ctx,
+		     const struct nlattr * const tb[])
+{
+	struct nft_xt *nft_match;
+	struct xt_match *match;
+	char *mt_name;
+	__u32 rev, family;
+
+	if (tb[NFTA_MATCH_NAME] == NULL ||
+	    tb[NFTA_MATCH_REV] == NULL ||
+	    tb[NFTA_MATCH_INFO] == NULL)
+		return ERR_PTR(-EINVAL);
+
+	mt_name = nla_data(tb[NFTA_MATCH_NAME]);
+	rev = ntohl(nla_get_be32(tb[NFTA_MATCH_REV]));
+	family = ctx->afi->family;
+
+	/* Re-use the existing match if it's already loaded. */
+	list_for_each_entry(nft_match, &nft_match_list, head) {
+		match = nft_match->ops.data;
+
+		if (strcmp(match->name, mt_name) == 0 &&
+		    match->revision == rev && match->family == family)
+			return &nft_match->ops;
+	}
+
+	match = xt_request_find_match(family, mt_name, rev);
+	if (IS_ERR(match))
+		return ERR_PTR(-ENOENT);
+
+	/* This is the first time we use this match, allocate operations */
+	nft_match = kzalloc(sizeof(*nft_match), GFP_KERNEL);
+	if (nft_match == NULL) {
+		/* Drop the module reference taken by xt_request_find_match() */
+		module_put(match->me);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	nft_match->ops.type = &nft_match_type;
+	nft_match->ops.size = NFT_EXPR_SIZE(XT_ALIGN(match->matchsize) +
+					    nft_compat_match_offset(match));
+	nft_match->ops.eval = nft_match_eval;
+	nft_match->ops.init = nft_match_init;
+	nft_match->ops.destroy = nft_match_destroy;
+	nft_match->ops.dump = nft_match_dump;
+	nft_match->ops.validate = nft_match_validate;
+	nft_match->ops.data = match;
+
+	list_add(&nft_match->head, &nft_match_list);
+
+	return &nft_match->ops;
+}
+
+/*
+ * Free every cached nft_xt entry on nft_match_list.
+ *
+ * The _safe iterator is required: each node is freed inside the loop, so
+ * the next pointer must be fetched before kfree() releases the entry.
+ */
+static void nft_match_release(void)
+{
+	struct nft_xt *nft_match, *tmp;
+
+	list_for_each_entry_safe(nft_match, tmp, &nft_match_list, head)
+		kfree(nft_match);
+}
+
+static struct nft_expr_type nft_match_type __read_mostly = { /* "match" expression type, registered at module init */
+	.name		= "match",
+	.select_ops	= nft_match_select_ops, /* ops are chosen per match name/revision/family */
+	.policy		= nft_match_policy,
+	.maxattr	= NFTA_MATCH_MAX,
+	.owner		= THIS_MODULE,
+};
+
+static LIST_HEAD(nft_target_list);
+
+static struct nft_expr_type nft_target_type;
+
+/*
+ * Select the expression operations for the xtables target described by the
+ * NFTA_TARGET_NAME / NFTA_TARGET_REV attributes in @tb for the family of
+ * @ctx.
+ *
+ * An nft_xt entry already cached in nft_target_list for the same name,
+ * revision and family is re-used. Otherwise the target is looked up with
+ * xt_request_find_target() and a new entry is allocated and added to the
+ * list.
+ *
+ * Returns the ops on success, or ERR_PTR(-EINVAL) if an attribute is
+ * missing, ERR_PTR(-ENOENT) if the target cannot be found and
+ * ERR_PTR(-ENOMEM) if the entry cannot be allocated.
+ */
+static const struct nft_expr_ops *
+nft_target_select_ops(const struct nft_ctx *ctx,
+		      const struct nlattr * const tb[])
+{
+	struct nft_xt *nft_target;
+	struct xt_target *target;
+	char *tg_name;
+	__u32 rev, family;
+
+	if (tb[NFTA_TARGET_NAME] == NULL ||
+	    tb[NFTA_TARGET_REV] == NULL ||
+	    tb[NFTA_TARGET_INFO] == NULL)
+		return ERR_PTR(-EINVAL);
+
+	tg_name = nla_data(tb[NFTA_TARGET_NAME]);
+	rev = ntohl(nla_get_be32(tb[NFTA_TARGET_REV]));
+	family = ctx->afi->family;
+
+	/*
+	 * Re-use the existing target if it's already loaded. Targets are
+	 * cached on nft_target_list; entries on nft_match_list hold
+	 * xt_match pointers and must not be interpreted as targets.
+	 */
+	list_for_each_entry(nft_target, &nft_target_list, head) {
+		target = nft_target->ops.data;
+
+		if (strcmp(target->name, tg_name) == 0 &&
+		    target->revision == rev && target->family == family)
+			return &nft_target->ops;
+	}
+
+	target = xt_request_find_target(family, tg_name, rev);
+	if (IS_ERR(target))
+		return ERR_PTR(-ENOENT);
+
+	/* This is the first time we use this target, allocate operations */
+	nft_target = kzalloc(sizeof(*nft_target), GFP_KERNEL);
+	if (nft_target == NULL) {
+		/* Drop the module reference taken by xt_request_find_target() */
+		module_put(target->me);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	nft_target->ops.type = &nft_target_type;
+	nft_target->ops.size = NFT_EXPR_SIZE(XT_ALIGN(target->targetsize) +
+					     nft_compat_target_offset(target));
+	nft_target->ops.eval = nft_target_eval;
+	nft_target->ops.init = nft_target_init;
+	nft_target->ops.destroy = nft_target_destroy;
+	nft_target->ops.dump = nft_target_dump;
+	nft_target->ops.validate = nft_target_validate;
+	nft_target->ops.data = target;
+
+	list_add(&nft_target->head, &nft_target_list);
+
+	return &nft_target->ops;
+}
+
+/*
+ * Free every cached nft_xt entry on nft_target_list.
+ *
+ * The _safe iterator is required: each node is freed inside the loop, so
+ * the next pointer must be fetched before kfree() releases the entry.
+ */
+static void nft_target_release(void)
+{
+	struct nft_xt *nft_target, *tmp;
+
+	list_for_each_entry_safe(nft_target, tmp, &nft_target_list, head)
+		kfree(nft_target);
+}
+
+static struct nft_expr_type nft_target_type __read_mostly = { /* "target" expression type, registered at module init */
+	.name		= "target",
+	.select_ops	= nft_target_select_ops, /* ops are chosen per target name/revision/family */
+	.policy		= nft_target_policy,
+	.maxattr	= NFTA_TARGET_MAX,
+	.owner		= THIS_MODULE,
+};
+
+/*
+ * Module init: register the "match" and "target" expression types, then
+ * the nft-compat nfnetlink subsystem. On failure, whatever was registered
+ * before the failing step is unregistered in reverse order.
+ */
+static int __init nft_compat_module_init(void)
+{
+	int err;
+
+	err = nft_register_expr(&nft_match_type);
+	if (err < 0)
+		return err;
+
+	err = nft_register_expr(&nft_target_type);
+	if (err < 0)
+		goto unregister_match;
+
+	err = nfnetlink_subsys_register(&nfnl_compat_subsys);
+	if (err < 0) {
+		pr_err("nft_compat: cannot register with nfnetlink.\n");
+		goto unregister_target;
+	}
+
+	pr_info("nf_tables_compat: (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>\n");
+
+	return err;
+
+unregister_target:
+	nft_unregister_expr(&nft_target_type);
+unregister_match:
+	nft_unregister_expr(&nft_match_type);
+	return err;
+}
+
+static void __exit nft_compat_module_exit(void) /* undoes nft_compat_module_init() in reverse order */
+{
+	nfnetlink_subsys_unregister(&nfnl_compat_subsys);
+	nft_unregister_expr(&nft_target_type);
+	nft_unregister_expr(&nft_match_type);
+	nft_match_release(); /* free the entries cached on nft_match_list */
+	nft_target_release(); /* free the entries cached on nft_target_list */
+}
+
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFT_COMPAT);
+
+module_init(nft_compat_module_init);
+module_exit(nft_compat_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_EXPR("match");
+MODULE_ALIAS_NFT_EXPR("target");
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index 1bfeeaf..f169501 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -90,14 +90,16 @@ nla_put_failure:
 	return -1;
 }
 
-static const struct nft_data *nft_immediate_get_verdict(const struct nft_expr *expr)
+static int nft_immediate_validate(const struct nft_ctx *ctx,
+				  const struct nft_expr *expr,
+				  const struct nft_data **data)
 {
 	const struct nft_immediate_expr *priv = nft_expr_priv(expr);
 
 	if (priv->dreg == NFT_REG_VERDICT)
-		return &priv->data;
-	else
-		return NULL;
+		*data = &priv->data;
+
+	return 0;
 }
 
 static struct nft_expr_type nft_imm_type;
@@ -108,7 +110,7 @@ static const struct nft_expr_ops nft_imm_ops = {
 	.init		= nft_immediate_init,
 	.destroy	= nft_immediate_destroy,
 	.dump		= nft_immediate_dump,
-	.get_verdict	= nft_immediate_get_verdict,
+	.validate	= nft_immediate_validate,
 };
 
 static struct nft_expr_type nft_imm_type __read_mostly = {
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 7cf13f7..bc8bdb2 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -107,7 +107,9 @@ const struct nft_expr_ops nft_payload_fast_ops = {
 	.dump		= nft_payload_dump,
 };
 
-static const struct nft_expr_ops *nft_payload_select_ops(const struct nlattr * const tb[])
+static const struct nft_expr_ops *
+nft_payload_select_ops(const struct nft_ctx *ctx,
+		       const struct nlattr * const tb[])
 {
 	enum nft_payload_bases base;
 	unsigned int offset, len;
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 08/17] netfilter: nf_tables: convert built-in tables/chains to chain types
From: Pablo Neira Ayuso @ 2013-10-14 16:38 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, kaber, netdev
In-Reply-To: <1381768738-17739-1-git-send-email-pablo@netfilter.org>

This patch converts built-in tables/chains to chain types, which
allows you to deploy customized table and chain configurations from
userspace.

After this patch, you have to specify the chain type when
creating a new chain:

 add chain ip filter output { type filter hook input priority 0; }
                              ^^^^ ------

The existing chain types after this patch are: filter, route and
nat. Note that tables are just containers of chains with no specific
semantics, which is a significant change with respect to iptables.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h                  |   31 ++-
 include/uapi/linux/netfilter/nf_tables.h           |    2 +
 net/ipv4/netfilter/Kconfig                         |    8 +-
 net/ipv4/netfilter/Makefile                        |    4 +-
 net/ipv4/netfilter/nf_tables_ipv4.c                |   21 +++
 .../{nf_table_nat_ipv4.c => nft_chain_nat_ipv4.c}  |  116 +++---------
 ...f_table_route_ipv4.c => nft_chain_route_ipv4.c} |   43 ++---
 net/ipv6/netfilter/Kconfig                         |    4 +-
 net/ipv6/netfilter/Makefile                        |    2 +-
 net/ipv6/netfilter/nf_tables_ipv6.c                |   22 ++-
 ...f_table_route_ipv6.c => nft_chain_route_ipv6.c} |   45 ++---
 net/netfilter/nf_tables_api.c                      |  197 +++++++++-----------
 12 files changed, 221 insertions(+), 274 deletions(-)
 rename net/ipv4/netfilter/{nf_table_nat_ipv4.c => nft_chain_nat_ipv4.c} (76%)
 rename net/ipv4/netfilter/{nf_table_route_ipv4.c => nft_chain_route_ipv4.c} (61%)
 rename net/ipv6/netfilter/{nf_table_route_ipv6.c => nft_chain_route_ipv6.c} (65%)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 66d0359..8403f7f5 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -336,7 +336,6 @@ static inline struct nft_expr *nft_expr_last(const struct nft_rule *rule)
 
 enum nft_chain_flags {
 	NFT_BASE_CHAIN			= 0x1,
-	NFT_CHAIN_BUILTIN		= 0x2,
 };
 
 /**
@@ -362,14 +361,23 @@ struct nft_chain {
 	char				name[NFT_CHAIN_MAXNAMELEN];
 };
 
+enum nft_chain_type { /* value stored in nf_chain_type.type and nft_base_chain.type */
+	NFT_CHAIN_T_DEFAULT = 0, /* used by the "filter" chain types */
+	NFT_CHAIN_T_ROUTE, /* used by the "route" chain types */
+	NFT_CHAIN_T_NAT, /* used by the "nat" chain type */
+	NFT_CHAIN_T_MAX /* number of chain types; bounds the per-family lookup table */
+};
+
 /**
  *	struct nft_base_chain - nf_tables base chain
  *
  *	@ops: netfilter hook ops
+ *	@type: chain type
  *	@chain: the chain
  */
 struct nft_base_chain {
 	struct nf_hook_ops		ops;
+	enum nft_chain_type		type;
 	struct nft_chain		chain;
 };
 
@@ -384,10 +392,6 @@ extern unsigned int nft_do_chain(const struct nf_hook_ops *ops,
 				 const struct net_device *out,
 				 int (*okfn)(struct sk_buff *));
 
-enum nft_table_flags {
-	NFT_TABLE_BUILTIN		= 0x1,
-};
-
 /**
  *	struct nft_table - nf_tables table
  *
@@ -431,8 +435,17 @@ struct nft_af_info {
 extern int nft_register_afinfo(struct nft_af_info *);
 extern void nft_unregister_afinfo(struct nft_af_info *);
 
-extern int nft_register_table(struct nft_table *, int family);
-extern void nft_unregister_table(struct nft_table *, int family);
+struct nf_chain_type { /* descriptor passed to nft_register_chain_type() */
+	unsigned int		hook_mask; /* OR of (1 << hook number) for each supported hook */
+	const char		*name; /* compared with nla_strcmp() during chain type lookup */
+	enum nft_chain_type	type; /* second index into the chain_type[family][type] table */
+	nf_hookfn		*fn[NF_MAX_HOOKS]; /* hook function, indexed by hook number */
+	struct module		*me; /* module passed to try_module_get() on registration */
+	int			family; /* NFPROTO_* family; first index into the chain_type table */
+};
+
+extern int nft_register_chain_type(struct nf_chain_type *);
+extern void nft_unregister_chain_type(struct nf_chain_type *);
 
 extern int nft_register_expr(struct nft_expr_type *);
 extern void nft_unregister_expr(struct nft_expr_type *);
@@ -440,8 +453,8 @@ extern void nft_unregister_expr(struct nft_expr_type *);
 #define MODULE_ALIAS_NFT_FAMILY(family)	\
 	MODULE_ALIAS("nft-afinfo-" __stringify(family))
 
-#define MODULE_ALIAS_NFT_TABLE(family, name) \
-	MODULE_ALIAS("nft-table-" __stringify(family) "-" name)
+#define MODULE_ALIAS_NFT_CHAIN(family, name) \
+	MODULE_ALIAS("nft-chain-" __stringify(family) "-" name)
 
 #define MODULE_ALIAS_NFT_EXPR(name) \
 	MODULE_ALIAS("nft-expr-" name)
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 9e92401..779cf95 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -115,6 +115,7 @@ enum nft_table_attributes {
  * @NFTA_CHAIN_HANDLE: numeric handle of the chain (NLA_U64)
  * @NFTA_CHAIN_NAME: name of the chain (NLA_STRING)
  * @NFTA_CHAIN_HOOK: hook specification for basechains (NLA_NESTED: nft_hook_attributes)
+ * @NFTA_CHAIN_TYPE: type name of the chain (NLA_NUL_STRING)
  */
 enum nft_chain_attributes {
 	NFTA_CHAIN_UNSPEC,
@@ -122,6 +123,7 @@ enum nft_chain_attributes {
 	NFTA_CHAIN_HANDLE,
 	NFTA_CHAIN_NAME,
 	NFTA_CHAIN_HOOK,
+	NFTA_CHAIN_TYPE,
 	__NFTA_CHAIN_MAX
 };
 #define NFTA_CHAIN_MAX		(__NFTA_CHAIN_MAX - 1)
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index eb1d56e..ae65fe9 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -44,13 +44,13 @@ config NFT_REJECT_IPV4
 	depends on NF_TABLES_IPV4
 	tristate "nf_tables IPv4 reject support"
 
-config NF_TABLE_ROUTE_IPV4
+config NFT_CHAIN_ROUTE_IPV4
 	depends on NF_TABLES_IPV4
-	tristate "IPv4 nf_tables route table support"
+	tristate "IPv4 nf_tables route chain support"
 
-config NF_TABLE_NAT_IPV4
+config NFT_CHAIN_NAT_IPV4
 	depends on NF_TABLES_IPV4
-	tristate "IPv4 nf_tables nat table support"
+	tristate "IPv4 nf_tables nat chain support"
 
 config IP_NF_IPTABLES
 	tristate "IP tables support (required for filtering/masq/NAT)"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index b2f01cd..91e0bd7 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -29,8 +29,8 @@ obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
 
 obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o
 obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
-obj-$(CONFIG_NF_TABLE_ROUTE_IPV4) += nf_table_route_ipv4.o
-obj-$(CONFIG_NF_TABLE_NAT_IPV4) += nf_table_nat_ipv4.o
+obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
+obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
 
 # generic IP tables 
 obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index 63d0a3b..23525c4 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -41,14 +42,34 @@ static struct nft_af_info nft_af_ipv4 __read_mostly = {
 	},
 };
 
+static struct nf_chain_type filter_ipv4 = { /* IPv4 "filter" chain type: nft_do_chain() on all five inet hooks */
+	.family		= NFPROTO_IPV4,
+	.name		= "filter",
+	.type		= NFT_CHAIN_T_DEFAULT,
+	.hook_mask	= (1 << NF_INET_LOCAL_IN) |
+			  (1 << NF_INET_LOCAL_OUT) |
+			  (1 << NF_INET_FORWARD) |
+			  (1 << NF_INET_PRE_ROUTING) |
+			  (1 << NF_INET_POST_ROUTING),
+	.fn		= { /* one entry per bit set in hook_mask */
+		[NF_INET_LOCAL_IN]	= nft_do_chain,
+		[NF_INET_LOCAL_OUT]	= nft_do_chain,
+		[NF_INET_FORWARD]	= nft_do_chain,
+		[NF_INET_PRE_ROUTING]	= nft_do_chain,
+		[NF_INET_POST_ROUTING]	= nft_do_chain,
+	},
+};
+
 static int __init nf_tables_ipv4_init(void)
 {
+	nft_register_chain_type(&filter_ipv4);
 	return nft_register_afinfo(&nft_af_ipv4);
 }
 
 static void __exit nf_tables_ipv4_exit(void)
 {
 	nft_unregister_afinfo(&nft_af_ipv4);
+	nft_unregister_chain_type(&filter_ipv4);
 }
 
 module_init(nf_tables_ipv4_init);
diff --git a/net/ipv4/netfilter/nf_table_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
similarity index 76%
rename from net/ipv4/netfilter/nf_table_nat_ipv4.c
rename to net/ipv4/netfilter/nft_chain_nat_ipv4.c
index 2ecce39..cd28630 100644
--- a/net/ipv4/netfilter/nf_table_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -167,7 +168,7 @@ static struct nft_expr_type nft_nat_type __read_mostly = {
 };
 
 /*
- * NAT table
+ * NAT chains
  */
 
 static unsigned int nf_nat_fn(const struct nf_hook_ops *ops,
@@ -301,115 +302,52 @@ static unsigned int nf_nat_output(const struct nf_hook_ops *ops,
 	return ret;
 }
 
-static struct nft_base_chain nf_chain_nat_prerouting __read_mostly = {
-	.chain	= {
-		.name		= "PREROUTING",
-		.rules		= LIST_HEAD_INIT(nf_chain_nat_prerouting.chain.rules),
-		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
-	},
-	.ops	= {
-		.hook		= nf_nat_prerouting,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_PRE_ROUTING,
-		.priority	= NF_IP_PRI_NAT_DST,
-		.priv		= &nf_chain_nat_prerouting.chain,
-	},
-};
-
-static struct nft_base_chain nf_chain_nat_postrouting __read_mostly = {
-	.chain	= {
-		.name		= "POSTROUTING",
-		.rules		= LIST_HEAD_INIT(nf_chain_nat_postrouting.chain.rules),
-		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
-	},
-	.ops	= {
-		.hook		= nf_nat_postrouting,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_POST_ROUTING,
-		.priority	= NF_IP_PRI_NAT_SRC,
-		.priv		= &nf_chain_nat_postrouting.chain,
-	},
-};
-
-static struct nft_base_chain nf_chain_nat_output __read_mostly = {
-	.chain	= {
-		.name		= "OUTPUT",
-		.rules		= LIST_HEAD_INIT(nf_chain_nat_output.chain.rules),
-		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
-	},
-	.ops	= {
-		.hook		= nf_nat_output,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_LOCAL_OUT,
-		.priority	= NF_IP_PRI_NAT_DST,
-		.priv		= &nf_chain_nat_output.chain,
-	},
-};
-
-static struct nft_base_chain nf_chain_nat_input __read_mostly = {
-	.chain	= {
-		.name		= "INPUT",
-		.rules		= LIST_HEAD_INIT(nf_chain_nat_input.chain.rules),
-		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
-	},
-	.ops	= {
-		.hook		= nf_nat_fn,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_LOCAL_IN,
-		.priority	= NF_IP_PRI_NAT_SRC,
-		.priv		= &nf_chain_nat_input.chain,
+/*
+ * IPv4 "nat" chain type: one NAT hook function per supported inet hook.
+ * File-local, like the other chain type descriptors, since it is only
+ * handed to nft_register_chain_type()/nft_unregister_chain_type() here.
+ */
+static struct nf_chain_type nft_chain_nat_ipv4 = {
+	.family		= NFPROTO_IPV4,
+	.name		= "nat",
+	.type		= NFT_CHAIN_T_NAT,
+	.hook_mask	= (1 << NF_INET_PRE_ROUTING) |
+			  (1 << NF_INET_POST_ROUTING) |
+			  (1 << NF_INET_LOCAL_OUT) |
+			  (1 << NF_INET_LOCAL_IN),
+	.fn		= {
+		[NF_INET_PRE_ROUTING]	= nf_nat_prerouting,
+		[NF_INET_POST_ROUTING]	= nf_nat_postrouting,
+		[NF_INET_LOCAL_OUT]	= nf_nat_output,
+		[NF_INET_LOCAL_IN]	= nf_nat_fn,
 	},
+	.me		= THIS_MODULE,
 };
 
-
-static struct nft_table nf_table_nat_ipv4 __read_mostly = {
-	.name	= "nat",
-	.chains	= LIST_HEAD_INIT(nf_table_nat_ipv4.chains),
-};
-
-static int __init nf_table_nat_init(void)
+static int __init nft_chain_nat_init(void)
 {
 	int err;
 
-	list_add_tail(&nf_chain_nat_prerouting.chain.list,
-		      &nf_table_nat_ipv4.chains);
-	list_add_tail(&nf_chain_nat_postrouting.chain.list,
-		      &nf_table_nat_ipv4.chains);
-	list_add_tail(&nf_chain_nat_output.chain.list,
-		      &nf_table_nat_ipv4.chains);
-	list_add_tail(&nf_chain_nat_input.chain.list,
-		      &nf_table_nat_ipv4.chains);
-
-	err = nft_register_table(&nf_table_nat_ipv4, NFPROTO_IPV4);
+	err = nft_register_chain_type(&nft_chain_nat_ipv4);
 	if (err < 0)
-		goto err1;
+		return err;
 
 	err = nft_register_expr(&nft_nat_type);
 	if (err < 0)
-		goto err2;
+		goto err;
 
 	return 0;
 
-err2:
-	nft_unregister_table(&nf_table_nat_ipv4, NFPROTO_IPV4);
-err1:
+err:
+	nft_unregister_chain_type(&nft_chain_nat_ipv4);
 	return err;
 }
 
-static void __exit nf_table_nat_exit(void)
+static void __exit nft_chain_nat_exit(void)
 {
 	nft_unregister_expr(&nft_nat_type);
-	nft_unregister_table(&nf_table_nat_ipv4, AF_INET);
+	nft_unregister_chain_type(&nft_chain_nat_ipv4);
 }
 
-module_init(nf_table_nat_init);
-module_exit(nf_table_nat_exit);
+module_init(nft_chain_nat_init);
+module_exit(nft_chain_nat_exit);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_TABLE(AF_INET, "nat");
+MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat");
 MODULE_ALIAS_NFT_EXPR("nat");
diff --git a/net/ipv4/netfilter/nf_table_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
similarity index 61%
rename from net/ipv4/netfilter/nf_table_route_ipv4.c
rename to net/ipv4/netfilter/nft_chain_route_ipv4.c
index 4f257a1..6b84e09 100644
--- a/net/ipv4/netfilter/nf_table_route_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -56,42 +57,30 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
 	return ret;
 }
 
-static struct nft_base_chain nf_chain_route_output __read_mostly = {
-	.chain	= {
-		.name		= "OUTPUT",
-		.rules		= LIST_HEAD_INIT(nf_chain_route_output.chain.rules),
-		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
+static struct nf_chain_type nft_chain_route_ipv4 = {
+	.family		= NFPROTO_IPV4,
+	.name		= "route",
+	.type		= NFT_CHAIN_T_ROUTE,
+	.hook_mask	= (1 << NF_INET_LOCAL_OUT),
+	.fn		= {
+		[NF_INET_LOCAL_OUT]	= nf_route_table_hook,
 	},
-	.ops	= {
-		.hook		= nf_route_table_hook,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_LOCAL_OUT,
-		.priority	= NF_IP_PRI_MANGLE,
-		.priv		= &nf_chain_route_output.chain,
-	},
-};
-
-static struct nft_table nf_table_route_ipv4 __read_mostly = {
-	.name	= "route",
-	.chains	= LIST_HEAD_INIT(nf_table_route_ipv4.chains),
+	.me		= THIS_MODULE,
 };
 
-static int __init nf_table_route_init(void)
+static int __init nft_chain_route_init(void)
 {
-	list_add_tail(&nf_chain_route_output.chain.list,
-		      &nf_table_route_ipv4.chains);
-	return nft_register_table(&nf_table_route_ipv4, NFPROTO_IPV4);
+	return nft_register_chain_type(&nft_chain_route_ipv4);
 }
 
-static void __exit nf_table_route_exit(void)
+static void __exit nft_chain_route_exit(void)
 {
-	nft_unregister_table(&nf_table_route_ipv4, NFPROTO_IPV4);
+	nft_unregister_chain_type(&nft_chain_route_ipv4);
 }
 
-module_init(nf_table_route_init);
-module_exit(nf_table_route_exit);
+module_init(nft_chain_route_init);
+module_exit(nft_chain_route_exit);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_TABLE(AF_INET, "route");
+MODULE_ALIAS_NFT_CHAIN(AF_INET, "route");
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 5677e38..2383306 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -29,9 +29,9 @@ config NF_TABLES_IPV6
 	depends on NF_TABLES
 	tristate "IPv6 nf_tables support"
 
-config NF_TABLE_ROUTE_IPV6
+config NFT_CHAIN_ROUTE_IPV6
 	depends on NF_TABLES_IPV6
-	tristate "IPv6 nf_tables route table support"
+	tristate "IPv6 nf_tables route chain support"
 
 config IP6_NF_IPTABLES
 	tristate "IP6 tables support (required for filtering)"
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 956af44..be4913a 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -25,7 +25,7 @@ obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
 
 # nf_tables
 obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o
-obj-$(CONFIG_NF_TABLE_ROUTE_IPV6) += nf_table_route_ipv6.o
+obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o
 
 # matches
 obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index e0717ce..3631d62 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -39,14 +40,33 @@ static struct nft_af_info nft_af_ipv6 __read_mostly = {
 	},
 };
 
+static struct nf_chain_type filter_ipv6 = { /* IPv6 "filter" chain type: nft_do_chain() on all five inet hooks */
+	.family		= NFPROTO_IPV6,
+	.name		= "filter",
+	.type		= NFT_CHAIN_T_DEFAULT,
+	.hook_mask	= (1 << NF_INET_LOCAL_IN) |
+			  (1 << NF_INET_LOCAL_OUT) |
+			  (1 << NF_INET_FORWARD) |
+			  (1 << NF_INET_PRE_ROUTING) |
+			  (1 << NF_INET_POST_ROUTING),
+	.fn		= { /* one entry per bit set in hook_mask */
+		[NF_INET_LOCAL_IN]	= nft_do_chain,
+		[NF_INET_LOCAL_OUT]	= nft_do_chain,
+		[NF_INET_FORWARD]	= nft_do_chain,
+		[NF_INET_PRE_ROUTING]	= nft_do_chain,
+		[NF_INET_POST_ROUTING]	= nft_do_chain,
+	},
+};
+
 static int __init nf_tables_ipv6_init(void)
 {
+	nft_register_chain_type(&filter_ipv6);
 	return nft_register_afinfo(&nft_af_ipv6);
 }
-
 static void __exit nf_tables_ipv6_exit(void)
 {
 	nft_unregister_afinfo(&nft_af_ipv6);
+	nft_unregister_chain_type(&filter_ipv6);
 }
 
 module_init(nf_tables_ipv6_init);
diff --git a/net/ipv6/netfilter/nf_table_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c
similarity index 65%
rename from net/ipv6/netfilter/nf_table_route_ipv6.c
rename to net/ipv6/netfilter/nft_chain_route_ipv6.c
index 48ac65c..4cdc992 100644
--- a/net/ipv6/netfilter/nf_table_route_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -52,42 +53,30 @@ static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
 	return ret;
 }
 
-static struct nft_base_chain nf_chain_route_output __read_mostly = {
-	.chain	= {
-		.name		= "OUTPUT",
-		.rules		= LIST_HEAD_INIT(nf_chain_route_output.chain.rules),
-		.flags		= NFT_BASE_CHAIN | NFT_CHAIN_BUILTIN,
-	},
-	.ops	= {
-		.hook		= nf_route_table_hook,
-		.owner		= THIS_MODULE,
-		.pf		= NFPROTO_IPV6,
-		.hooknum	= NF_INET_LOCAL_OUT,
-		.priority	= NF_IP6_PRI_MANGLE,
-		.priv		= &nf_chain_route_output.chain,
-	},
+/* IPv6 "route" chain type: only the NF_INET_LOCAL_OUT hook is provided. */
+static struct nf_chain_type nft_chain_route_ipv6 = {
+	.family		= NFPROTO_IPV6,
+	.name		= "route",
+	.type		= NFT_CHAIN_T_ROUTE,
+	.hook_mask	= (1 << NF_INET_LOCAL_OUT),
+	.fn		= {
+		[NF_INET_LOCAL_OUT]	= nf_route_table_hook,
+	},
+	.me		= THIS_MODULE,
 };
 
-static struct nft_table nf_table_route_ipv6 __read_mostly = {
-	.name	= "route",
-	.chains	= LIST_HEAD_INIT(nf_table_route_ipv6.chains),
-};
-
-static int __init nf_table_route_init(void)
+static int __init nft_chain_route_init(void)
 {
-	list_add_tail(&nf_chain_route_output.chain.list,
-		      &nf_table_route_ipv6.chains);
-	return nft_register_table(&nf_table_route_ipv6, NFPROTO_IPV6);
+	return nft_register_chain_type(&nft_chain_route_ipv6);
 }
 
-static void __exit nf_table_route_exit(void)
+static void __exit nft_chain_route_exit(void)
 {
-	nft_unregister_table(&nf_table_route_ipv6, NFPROTO_IPV6);
+	nft_unregister_chain_type(&nft_chain_route_ipv6);
 }
 
-module_init(nf_table_route_init);
-module_exit(nf_table_route_exit);
+module_init(nft_chain_route_init);
+module_exit(nft_chain_route_exit);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_TABLE(AF_INET6, "route");
+MODULE_ALIAS_NFT_CHAIN(AF_INET6, "route");
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 6dac9a3..9c2d8d5 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -104,8 +104,7 @@ static struct nft_table *nft_table_lookup(const struct nft_af_info *afi,
 }
 
 static struct nft_table *nf_tables_table_lookup(const struct nft_af_info *afi,
-						const struct nlattr *nla,
-						bool autoload)
+						const struct nlattr *nla)
 {
 	struct nft_table *table;
 
@@ -116,16 +115,6 @@ static struct nft_table *nf_tables_table_lookup(const struct nft_af_info *afi,
 	if (table != NULL)
 		return table;
 
-#ifdef CONFIG_MODULES
-	if (autoload) {
-		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-		request_module("nft-table-%u-%*.s", afi->family,
-			       nla_len(nla)-1, (const char *)nla_data(nla));
-		nfnl_lock(NFNL_SUBSYS_NFTABLES);
-		if (nft_table_lookup(afi, nla))
-			return ERR_PTR(-EAGAIN);
-	}
-#endif
 	return ERR_PTR(-ENOENT);
 }
 
@@ -134,6 +123,39 @@ static inline u64 nf_tables_alloc_handle(struct nft_table *table)
 	return ++table->hgenerator;
 }
 
+static struct nf_chain_type *chain_type[AF_MAX][NFT_CHAIN_T_MAX];
+
+/*
+ * Find the registered chain type of @family whose name equals the string
+ * attribute @nla. Returns its index in chain_type[family][] or -1 if no
+ * registered chain type matches.
+ */
+static int __nf_tables_chain_type_lookup(int family, const struct nlattr *nla)
+{
+	const struct nf_chain_type *ctype;
+	int type;
+
+	for (type = 0; type < NFT_CHAIN_T_MAX; type++) {
+		ctype = chain_type[family][type];
+		if (ctype == NULL)
+			continue;
+
+		if (nla_strcmp(nla, ctype->name) == 0)
+			return type;
+	}
+	return -1;
+}
+
+/*
+ * Look up a chain type by name for the family of @afi.
+ *
+ * If nothing is registered under that name and @autoload is set (and the
+ * kernel is built with CONFIG_MODULES), the nfnetlink subsystem lock is
+ * dropped to request the "nft-chain-<family>-<name>" module, then the
+ * lookup is retried with the lock held again.
+ *
+ * Returns the chain type index, or a negative value if it is not found.
+ */
+static int nf_tables_chain_type_lookup(const struct nft_af_info *afi,
+				       const struct nlattr *nla,
+				       bool autoload)
+{
+	int type;
+
+	type = __nf_tables_chain_type_lookup(afi->family, nla);
+#ifdef CONFIG_MODULES
+	if (type < 0 && autoload) {
+		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+		/*
+		 * "%.*s" bounds the name by the attribute length; the
+		 * previous "%*.s" was a field width with zero precision,
+		 * which emits no characters of the name at all.
+		 */
+		request_module("nft-chain-%u-%.*s", afi->family,
+			       nla_len(nla), (const char *)nla_data(nla));
+		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		type = __nf_tables_chain_type_lookup(afi->family, nla);
+	}
+#endif
+	return type;
+}
+
 static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
 	[NFTA_TABLE_NAME]	= { .type = NLA_STRING },
 };
@@ -258,7 +280,7 @@ static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], false);
+	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -294,7 +316,7 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
 		return PTR_ERR(afi);
 
 	name = nla[NFTA_TABLE_NAME];
-	table = nf_tables_table_lookup(afi, name, false);
+	table = nf_tables_table_lookup(afi, name);
 	if (IS_ERR(table)) {
 		if (PTR_ERR(table) != -ENOENT)
 			return PTR_ERR(table);
@@ -335,13 +357,10 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME], false);
+	table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-	if (table->flags & NFT_TABLE_BUILTIN)
-		return -EOPNOTSUPP;
-
 	if (table->use)
 		return -EBUSY;
 
@@ -351,99 +370,34 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb,
 	return 0;
 }
 
-static struct nft_table *__nf_tables_table_lookup(const struct nft_af_info *afi,
-						  const char *name)
+int nft_register_chain_type(struct nf_chain_type *ctype)
 {
-	struct nft_table *table;
-
-	list_for_each_entry(table, &afi->tables, list) {
-		if (!strcmp(name, table->name))
-			return table;
-	}
-
-	return ERR_PTR(-ENOENT);
-}
-
-static int nf_tables_chain_notify(const struct sk_buff *oskb,
-				  const struct nlmsghdr *nlh,
-				  const struct nft_table *table,
-				  const struct nft_chain *chain,
-				  int event, int family);
-
-/**
- *	nft_register_table - register a built-in table
- *
- *	@table: the table to register
- *	@family: protocol family to register table with
- *
- *	Register a built-in table for use with nf_tables. Returns zero on
- *	success or a negative errno code otherwise.
- */
-int nft_register_table(struct nft_table *table, int family)
-{
-	struct nft_af_info *afi;
-	struct nft_table *t;
-	struct nft_chain *chain;
-	int err;
+	int err = 0;
 
 	nfnl_lock(NFNL_SUBSYS_NFTABLES);
-again:
-	afi = nf_tables_afinfo_lookup(family, true);
-	if (IS_ERR(afi)) {
-		err = PTR_ERR(afi);
-		if (err == -EAGAIN)
-			goto again;
-		goto err;
-	}
-
-	t = __nf_tables_table_lookup(afi, table->name);
-	if (IS_ERR(t)) {
-		err = PTR_ERR(t);
-		if (err != -ENOENT)
-			goto err;
-		t = NULL;
+	if (chain_type[ctype->family][ctype->type] != NULL) {
+		err = -EBUSY;
+		goto out;
 	}
 
-	if (t != NULL) {
-		err = -EEXIST;
-		goto err;
-	}
+	if (!try_module_get(ctype->me))
+		goto out;
 
-	table->flags |= NFT_TABLE_BUILTIN;
-	INIT_LIST_HEAD(&table->sets);
-	list_add_tail(&table->list, &afi->tables);
-	nf_tables_table_notify(NULL, NULL, table, NFT_MSG_NEWTABLE, family);
-	list_for_each_entry(chain, &table->chains, list)
-		nf_tables_chain_notify(NULL, NULL, table, chain,
-				       NFT_MSG_NEWCHAIN, family);
-	err = 0;
-err:
+	chain_type[ctype->family][ctype->type] = ctype;
+out:
 	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
 	return err;
 }
-EXPORT_SYMBOL_GPL(nft_register_table);
+EXPORT_SYMBOL_GPL(nft_register_chain_type);
 
-/**
- *	nft_unregister_table - unregister a built-in table
- *
- *	@table: the table to unregister
- *	@family: protocol family to unregister table with
- *
- *	Unregister a built-in table for use with nf_tables.
- */
-void nft_unregister_table(struct nft_table *table, int family)
+void nft_unregister_chain_type(struct nf_chain_type *ctype)
 {
-	struct nft_chain *chain;
-
 	nfnl_lock(NFNL_SUBSYS_NFTABLES);
-	list_del(&table->list);
-	list_for_each_entry(chain, &table->chains, list)
-		nf_tables_chain_notify(NULL, NULL, table, chain,
-				       NFT_MSG_DELCHAIN, family);
-	nf_tables_table_notify(NULL, NULL, table, NFT_MSG_DELTABLE, family);
+	chain_type[ctype->family][ctype->type] = NULL;
+	module_put(ctype->me);
 	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
 }
-EXPORT_SYMBOL_GPL(nft_unregister_table);
+EXPORT_SYMBOL_GPL(nft_unregister_chain_type);
 
 /*
  * Chains
@@ -484,6 +438,7 @@ static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
 	[NFTA_CHAIN_NAME]	= { .type = NLA_STRING,
 				    .len = NFT_CHAIN_MAXNAMELEN - 1 },
 	[NFTA_CHAIN_HOOK]	= { .type = NLA_NESTED },
+	[NFTA_CHAIN_TYPE]	= { .type = NLA_NUL_STRING },
 };
 
 static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
@@ -526,6 +481,10 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq,
 		if (nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority)))
 			goto nla_put_failure;
 		nla_nest_end(skb, nest);
+
+		if (nla_put_string(skb, NFTA_CHAIN_TYPE,
+			chain_type[ops->pf][nft_base_chain(chain)->type]->name))
+				goto nla_put_failure;
 	}
 
 	return nlmsg_end(skb, nlh);
@@ -633,7 +592,7 @@ static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], false);
+	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -680,7 +639,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], create);
+	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -722,6 +681,17 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 
 	if (nla[NFTA_CHAIN_HOOK]) {
 		struct nf_hook_ops *ops;
+		nf_hookfn *hookfn;
+		u32 hooknum;
+		int type = NFT_CHAIN_T_DEFAULT;
+
+		if (nla[NFTA_CHAIN_TYPE]) {
+			type = nf_tables_chain_type_lookup(afi,
+							   nla[NFTA_CHAIN_TYPE],
+							   create);
+			if (type < 0)
+				return -ENOENT;
+		}
 
 		err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK],
 				       nft_hook_policy);
@@ -730,12 +700,20 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 		if (ha[NFTA_HOOK_HOOKNUM] == NULL ||
 		    ha[NFTA_HOOK_PRIORITY] == NULL)
 			return -EINVAL;
-		if (ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM])) >= afi->nhooks)
+
+		hooknum = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
+		if (hooknum >= afi->nhooks)
 			return -EINVAL;
 
+		hookfn = chain_type[family][type]->fn[hooknum];
+		if (hookfn == NULL)
+			return -EOPNOTSUPP;
+
 		basechain = kzalloc(sizeof(*basechain), GFP_KERNEL);
 		if (basechain == NULL)
 			return -ENOMEM;
+
+		basechain->type = type;
 		chain = &basechain->chain;
 
 		ops = &basechain->ops;
@@ -744,7 +722,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
 		ops->hooknum	= ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
 		ops->priority	= ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
 		ops->priv	= chain;
-		ops->hook	= nft_do_chain;
+		ops->hook       = hookfn;
 		if (afi->hooks[ops->hooknum])
 			ops->hook = afi->hooks[ops->hooknum];
 
@@ -793,7 +771,7 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE], false);
+	table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -801,9 +779,6 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(chain))
 		return PTR_ERR(chain);
 
-	if (chain->flags & NFT_CHAIN_BUILTIN)
-		return -EOPNOTSUPP;
-
 	if (!list_empty(&chain->rules))
 		return -EBUSY;
 
@@ -1190,7 +1165,7 @@ static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], false);
+	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -1268,7 +1243,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], create);
+	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -1374,7 +1349,7 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE], false);
+	table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -1490,7 +1465,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx,
 		return PTR_ERR(afi);
 
 	if (nla[NFTA_SET_TABLE] != NULL) {
-		table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE], false);
+		table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]);
 		if (IS_ERR(table))
 			return PTR_ERR(table);
 	}
@@ -1820,7 +1795,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE], create);
+	table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
@@ -2008,7 +1983,7 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx,
 	if (IS_ERR(afi))
 		return PTR_ERR(afi);
 
-	table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE], false);
+	table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE]);
 	if (IS_ERR(table))
 		return PTR_ERR(table);
 
-- 
1.7.10.4


^ permalink raw reply related

* [PATCH 01/17] netfilter: pass hook ops to hookfn
From: Pablo Neira Ayuso @ 2013-10-14 16:38 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, kaber, netdev
In-Reply-To: <1381768738-17739-1-git-send-email-pablo@netfilter.org>

From: Patrick McHardy <kaber@trash.net>

Pass the hook ops to the hookfn to allow for generic hook
functions. This change is required by nf_tables.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter.h                      |    3 +-
 net/bridge/br_netfilter.c                      |   22 ++++++++-----
 net/bridge/netfilter/ebtable_filter.c          |   16 +++++----
 net/bridge/netfilter/ebtable_nat.c             |   16 +++++----
 net/decnet/netfilter/dn_rtmsg.c                |    2 +-
 net/ipv4/netfilter/arptable_filter.c           |    5 +--
 net/ipv4/netfilter/ipt_CLUSTERIP.c             |    2 +-
 net/ipv4/netfilter/ipt_SYNPROXY.c              |    2 +-
 net/ipv4/netfilter/iptable_filter.c            |    7 ++--
 net/ipv4/netfilter/iptable_mangle.c            |   10 +++---
 net/ipv4/netfilter/iptable_nat.c               |   26 +++++++--------
 net/ipv4/netfilter/iptable_raw.c               |    6 ++--
 net/ipv4/netfilter/iptable_security.c          |    7 ++--
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c |   12 +++----
 net/ipv4/netfilter/nf_defrag_ipv4.c            |    6 ++--
 net/ipv6/netfilter/ip6t_SYNPROXY.c             |    2 +-
 net/ipv6/netfilter/ip6table_filter.c           |    5 +--
 net/ipv6/netfilter/ip6table_mangle.c           |   10 +++---
 net/ipv6/netfilter/ip6table_nat.c              |   27 +++++++--------
 net/ipv6/netfilter/ip6table_raw.c              |    5 +--
 net/ipv6/netfilter/ip6table_security.c         |    5 +--
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c |   14 ++++----
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c      |    6 ++--
 net/netfilter/core.c                           |    2 +-
 net/netfilter/ipvs/ip_vs_core.c                |   42 ++++++++++++------------
 security/selinux/hooks.c                       |   10 +++---
 26 files changed, 148 insertions(+), 122 deletions(-)

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 61223c5..fef7e67 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -42,7 +42,8 @@ int netfilter_init(void);
 
 struct sk_buff;
 
-typedef unsigned int nf_hookfn(unsigned int hooknum,
+struct nf_hook_ops;
+typedef unsigned int nf_hookfn(const struct nf_hook_ops *ops,
 			       struct sk_buff *skb,
 			       const struct net_device *in,
 			       const struct net_device *out,
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index f877362..878f008 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -619,7 +619,7 @@ bad:
 
 /* Replicate the checks that IPv6 does on packet reception and pass the packet
  * to ip6tables, which doesn't support NAT, so things are fairly simple. */
-static unsigned int br_nf_pre_routing_ipv6(unsigned int hook,
+static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops,
 					   struct sk_buff *skb,
 					   const struct net_device *in,
 					   const struct net_device *out,
@@ -669,7 +669,8 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook,
  * receiving device) to make netfilter happy, the REDIRECT
  * target in particular.  Save the original destination IP
  * address to be able to detect DNAT afterwards. */
-static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
+static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops,
+				      struct sk_buff *skb,
 				      const struct net_device *in,
 				      const struct net_device *out,
 				      int (*okfn)(struct sk_buff *))
@@ -691,7 +692,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
 			return NF_ACCEPT;
 
 		nf_bridge_pull_encap_header_rcsum(skb);
-		return br_nf_pre_routing_ipv6(hook, skb, in, out, okfn);
+		return br_nf_pre_routing_ipv6(ops, skb, in, out, okfn);
 	}
 
 	if (!brnf_call_iptables && !br->nf_call_iptables)
@@ -727,7 +728,8 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
  * took place when the packet entered the bridge), but we
  * register an IPv4 PRE_ROUTING 'sabotage' hook that will
  * prevent this from happening. */
-static unsigned int br_nf_local_in(unsigned int hook, struct sk_buff *skb,
+static unsigned int br_nf_local_in(const struct nf_hook_ops *ops,
+				   struct sk_buff *skb,
 				   const struct net_device *in,
 				   const struct net_device *out,
 				   int (*okfn)(struct sk_buff *))
@@ -765,7 +767,8 @@ static int br_nf_forward_finish(struct sk_buff *skb)
  * but we are still able to filter on the 'real' indev/outdev
  * because of the physdev module. For ARP, indev and outdev are the
  * bridge ports. */
-static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff *skb,
+static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops,
+				     struct sk_buff *skb,
 				     const struct net_device *in,
 				     const struct net_device *out,
 				     int (*okfn)(struct sk_buff *))
@@ -818,7 +821,8 @@ static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff *skb,
 	return NF_STOLEN;
 }
 
-static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff *skb,
+static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops,
+				      struct sk_buff *skb,
 				      const struct net_device *in,
 				      const struct net_device *out,
 				      int (*okfn)(struct sk_buff *))
@@ -878,7 +882,8 @@ static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 #endif
 
 /* PF_BRIDGE/POST_ROUTING ********************************************/
-static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff *skb,
+static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops,
+				       struct sk_buff *skb,
 				       const struct net_device *in,
 				       const struct net_device *out,
 				       int (*okfn)(struct sk_buff *))
@@ -923,7 +928,8 @@ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff *skb,
 /* IP/SABOTAGE *****************************************************/
 /* Don't hand locally destined packets to PF_INET(6)/PRE_ROUTING
  * for the second time. */
-static unsigned int ip_sabotage_in(unsigned int hook, struct sk_buff *skb,
+static unsigned int ip_sabotage_in(const struct nf_hook_ops *ops,
+				   struct sk_buff *skb,
 				   const struct net_device *in,
 				   const struct net_device *out,
 				   int (*okfn)(struct sk_buff *))
diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c
index 94b2b70..bb2da7b 100644
--- a/net/bridge/netfilter/ebtable_filter.c
+++ b/net/bridge/netfilter/ebtable_filter.c
@@ -60,17 +60,21 @@ static const struct ebt_table frame_filter =
 };
 
 static unsigned int
-ebt_in_hook(unsigned int hook, struct sk_buff *skb, const struct net_device *in,
-   const struct net_device *out, int (*okfn)(struct sk_buff *))
+ebt_in_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+	    const struct net_device *in, const struct net_device *out,
+	    int (*okfn)(struct sk_buff *))
 {
-	return ebt_do_table(hook, skb, in, out, dev_net(in)->xt.frame_filter);
+	return ebt_do_table(ops->hooknum, skb, in, out,
+			    dev_net(in)->xt.frame_filter);
 }
 
 static unsigned int
-ebt_out_hook(unsigned int hook, struct sk_buff *skb, const struct net_device *in,
-   const struct net_device *out, int (*okfn)(struct sk_buff *))
+ebt_out_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
+	     const struct net_device *in, const struct net_device *out,
+	     int (*okfn)(struct sk_buff *))
 {
-	return ebt_do_table(hook, skb, in, out, dev_net(out)->xt.frame_filter);
+	return ebt_do_table(ops->hooknum, skb, in, out,
+			    dev_net(out)->xt.frame_filter);
 }
 
 static struct nf_hook_ops ebt_ops_filter[] __read_mostly = {
diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c
index 322555a..bd238f1 100644
--- a/net/bridge/netfilter/ebtable_nat.c
+++ b/net/bridge/netfilter/ebtable_nat.c
@@ -60,17 +60,21 @@ static struct ebt_table frame_nat =
 };
 
 static unsigned int
-ebt_nat_in(unsigned int hook, struct sk_buff *skb, const struct net_device *in
-   , const struct net_device *out, int (*okfn)(struct sk_buff *))
+ebt_nat_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
+	   const struct net_device *in, const struct net_device *out,
+	   int (*okfn)(struct sk_buff *))
 {
-	return ebt_do_table(hook, skb, in, out, dev_net(in)->xt.frame_nat);
+	return ebt_do_table(ops->hooknum, skb, in, out,
+			    dev_net(in)->xt.frame_nat);
 }
 
 static unsigned int
-ebt_nat_out(unsigned int hook, struct sk_buff *skb, const struct net_device *in
-   , const struct net_device *out, int (*okfn)(struct sk_buff *))
+ebt_nat_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
+	    const struct net_device *in, const struct net_device *out,
+	    int (*okfn)(struct sk_buff *))
 {
-	return ebt_do_table(hook, skb, in, out, dev_net(out)->xt.frame_nat);
+	return ebt_do_table(ops->hooknum, skb, in, out,
+			    dev_net(out)->xt.frame_nat);
 }
 
 static struct nf_hook_ops ebt_ops_nat[] __read_mostly = {
diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c
index 2a7efe3..e83015c 100644
--- a/net/decnet/netfilter/dn_rtmsg.c
+++ b/net/decnet/netfilter/dn_rtmsg.c
@@ -87,7 +87,7 @@ static void dnrmg_send_peer(struct sk_buff *skb)
 }
 
 
-static unsigned int dnrmg_hook(unsigned int hook,
+static unsigned int dnrmg_hook(const struct nf_hook_ops *ops,
 			struct sk_buff *skb,
 			const struct net_device *in,
 			const struct net_device *out,
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index a865f6f..802ddec 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -27,13 +27,14 @@ static const struct xt_table packet_filter = {
 
 /* The work comes in here from netfilter.c */
 static unsigned int
-arptable_filter_hook(unsigned int hook, struct sk_buff *skb,
+arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		     const struct net_device *in, const struct net_device *out,
 		     int (*okfn)(struct sk_buff *))
 {
 	const struct net *net = dev_net((in != NULL) ? in : out);
 
-	return arpt_do_table(skb, hook, in, out, net->ipv4.arptable_filter);
+	return arpt_do_table(skb, ops->hooknum, in, out,
+			     net->ipv4.arptable_filter);
 }
 
 static struct nf_hook_ops *arpfilter_ops __read_mostly;
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 0b732ef..a2e2b61 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -483,7 +483,7 @@ static void arp_print(struct arp_payload *payload)
 #endif
 
 static unsigned int
-arp_mangle(unsigned int hook,
+arp_mangle(const struct nf_hook_ops *ops,
 	   struct sk_buff *skb,
 	   const struct net_device *in,
 	   const struct net_device *out,
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index b6346bf..01cffea 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -297,7 +297,7 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 	return XT_CONTINUE;
 }
 
-static unsigned int ipv4_synproxy_hook(unsigned int hooknum,
+static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops,
 				       struct sk_buff *skb,
 				       const struct net_device *in,
 				       const struct net_device *out,
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 50af5b4..e08a74a 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -33,20 +33,21 @@ static const struct xt_table packet_filter = {
 };
 
 static unsigned int
-iptable_filter_hook(unsigned int hook, struct sk_buff *skb,
+iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		    const struct net_device *in, const struct net_device *out,
 		    int (*okfn)(struct sk_buff *))
 {
 	const struct net *net;
 
-	if (hook == NF_INET_LOCAL_OUT &&
+	if (ops->hooknum == NF_INET_LOCAL_OUT &&
 	    (skb->len < sizeof(struct iphdr) ||
 	     ip_hdrlen(skb) < sizeof(struct iphdr)))
 		/* root is playing with raw sockets. */
 		return NF_ACCEPT;
 
 	net = dev_net((in != NULL) ? in : out);
-	return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_filter);
+	return ipt_do_table(skb, ops->hooknum, in, out,
+			    net->ipv4.iptable_filter);
 }
 
 static struct nf_hook_ops *filter_ops __read_mostly;
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 0d8cd82..6a5079c 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -79,19 +79,19 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
 
 /* The work comes in here from netfilter.c. */
 static unsigned int
-iptable_mangle_hook(unsigned int hook,
+iptable_mangle_hook(const struct nf_hook_ops *ops,
 		     struct sk_buff *skb,
 		     const struct net_device *in,
 		     const struct net_device *out,
 		     int (*okfn)(struct sk_buff *))
 {
-	if (hook == NF_INET_LOCAL_OUT)
+	if (ops->hooknum == NF_INET_LOCAL_OUT)
 		return ipt_mangle_out(skb, out);
-	if (hook == NF_INET_POST_ROUTING)
-		return ipt_do_table(skb, hook, in, out,
+	if (ops->hooknum == NF_INET_POST_ROUTING)
+		return ipt_do_table(skb, ops->hooknum, in, out,
 				    dev_net(out)->ipv4.iptable_mangle);
 	/* PREROUTING/INPUT/FORWARD: */
-	return ipt_do_table(skb, hook, in, out,
+	return ipt_do_table(skb, ops->hooknum, in, out,
 			    dev_net(in)->ipv4.iptable_mangle);
 }
 
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index 683bfaf..ee28861 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -61,7 +61,7 @@ static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum,
 }
 
 static unsigned int
-nf_nat_ipv4_fn(unsigned int hooknum,
+nf_nat_ipv4_fn(const struct nf_hook_ops *ops,
 	       struct sk_buff *skb,
 	       const struct net_device *in,
 	       const struct net_device *out,
@@ -71,7 +71,7 @@ nf_nat_ipv4_fn(unsigned int hooknum,
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn_nat *nat;
 	/* maniptype == SRC for postrouting. */
-	enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);
+	enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
 
 	/* We never see fragments: conntrack defrags on pre-routing
 	 * and local-out, and nf_nat_out protects post-routing.
@@ -108,7 +108,7 @@ nf_nat_ipv4_fn(unsigned int hooknum,
 	case IP_CT_RELATED_REPLY:
 		if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
 			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
-							   hooknum))
+							   ops->hooknum))
 				return NF_DROP;
 			else
 				return NF_ACCEPT;
@@ -121,14 +121,14 @@ nf_nat_ipv4_fn(unsigned int hooknum,
 		if (!nf_nat_initialized(ct, maniptype)) {
 			unsigned int ret;
 
-			ret = nf_nat_rule_find(skb, hooknum, in, out, ct);
+			ret = nf_nat_rule_find(skb, ops->hooknum, in, out, ct);
 			if (ret != NF_ACCEPT)
 				return ret;
 		} else {
 			pr_debug("Already setup manip %s for ct %p\n",
 				 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
 				 ct);
-			if (nf_nat_oif_changed(hooknum, ctinfo, nat, out))
+			if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
 				goto oif_changed;
 		}
 		break;
@@ -137,11 +137,11 @@ nf_nat_ipv4_fn(unsigned int hooknum,
 		/* ESTABLISHED */
 		NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
 			     ctinfo == IP_CT_ESTABLISHED_REPLY);
-		if (nf_nat_oif_changed(hooknum, ctinfo, nat, out))
+		if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
 			goto oif_changed;
 	}
 
-	return nf_nat_packet(ct, ctinfo, hooknum, skb);
+	return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
 
 oif_changed:
 	nf_ct_kill_acct(ct, ctinfo, skb);
@@ -149,7 +149,7 @@ oif_changed:
 }
 
 static unsigned int
-nf_nat_ipv4_in(unsigned int hooknum,
+nf_nat_ipv4_in(const struct nf_hook_ops *ops,
 	       struct sk_buff *skb,
 	       const struct net_device *in,
 	       const struct net_device *out,
@@ -158,7 +158,7 @@ nf_nat_ipv4_in(unsigned int hooknum,
 	unsigned int ret;
 	__be32 daddr = ip_hdr(skb)->daddr;
 
-	ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
+	ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);
 	if (ret != NF_DROP && ret != NF_STOLEN &&
 	    daddr != ip_hdr(skb)->daddr)
 		skb_dst_drop(skb);
@@ -167,7 +167,7 @@ nf_nat_ipv4_in(unsigned int hooknum,
 }
 
 static unsigned int
-nf_nat_ipv4_out(unsigned int hooknum,
+nf_nat_ipv4_out(const struct nf_hook_ops *ops,
 		struct sk_buff *skb,
 		const struct net_device *in,
 		const struct net_device *out,
@@ -185,7 +185,7 @@ nf_nat_ipv4_out(unsigned int hooknum,
 	    ip_hdrlen(skb) < sizeof(struct iphdr))
 		return NF_ACCEPT;
 
-	ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
+	ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);
 #ifdef CONFIG_XFRM
 	if (ret != NF_DROP && ret != NF_STOLEN &&
 	    !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
@@ -207,7 +207,7 @@ nf_nat_ipv4_out(unsigned int hooknum,
 }
 
 static unsigned int
-nf_nat_ipv4_local_fn(unsigned int hooknum,
+nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
 		     struct sk_buff *skb,
 		     const struct net_device *in,
 		     const struct net_device *out,
@@ -223,7 +223,7 @@ nf_nat_ipv4_local_fn(unsigned int hooknum,
 	    ip_hdrlen(skb) < sizeof(struct iphdr))
 		return NF_ACCEPT;
 
-	ret = nf_nat_ipv4_fn(hooknum, skb, in, out, okfn);
+	ret = nf_nat_ipv4_fn(ops, skb, in, out, okfn);
 	if (ret != NF_DROP && ret != NF_STOLEN &&
 	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
 		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 1f82aea..b2f7e8f 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -20,20 +20,20 @@ static const struct xt_table packet_raw = {
 
 /* The work comes in here from netfilter.c. */
 static unsigned int
-iptable_raw_hook(unsigned int hook, struct sk_buff *skb,
+iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		 const struct net_device *in, const struct net_device *out,
 		 int (*okfn)(struct sk_buff *))
 {
 	const struct net *net;
 
-	if (hook == NF_INET_LOCAL_OUT && 
+	if (ops->hooknum == NF_INET_LOCAL_OUT &&
 	    (skb->len < sizeof(struct iphdr) ||
 	     ip_hdrlen(skb) < sizeof(struct iphdr)))
 		/* root is playing with raw sockets. */
 		return NF_ACCEPT;
 
 	net = dev_net((in != NULL) ? in : out);
-	return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_raw);
+	return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.iptable_raw);
 }
 
 static struct nf_hook_ops *rawtable_ops __read_mostly;
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index f867a8d..c86647e 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -37,21 +37,22 @@ static const struct xt_table security_table = {
 };
 
 static unsigned int
-iptable_security_hook(unsigned int hook, struct sk_buff *skb,
+iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		      const struct net_device *in,
 		      const struct net_device *out,
 		      int (*okfn)(struct sk_buff *))
 {
 	const struct net *net;
 
-	if (hook == NF_INET_LOCAL_OUT &&
+	if (ops->hooknum == NF_INET_LOCAL_OUT &&
 	    (skb->len < sizeof(struct iphdr) ||
 	     ip_hdrlen(skb) < sizeof(struct iphdr)))
 		/* Somebody is playing with raw sockets. */
 		return NF_ACCEPT;
 
 	net = dev_net((in != NULL) ? in : out);
-	return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_security);
+	return ipt_do_table(skb, ops->hooknum, in, out,
+			    net->ipv4.iptable_security);
 }
 
 static struct nf_hook_ops *sectbl_ops __read_mostly;
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 86f5b34..ecd8bec 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -92,7 +92,7 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
 	return NF_ACCEPT;
 }
 
-static unsigned int ipv4_helper(unsigned int hooknum,
+static unsigned int ipv4_helper(const struct nf_hook_ops *ops,
 				struct sk_buff *skb,
 				const struct net_device *in,
 				const struct net_device *out,
@@ -121,7 +121,7 @@ static unsigned int ipv4_helper(unsigned int hooknum,
 			    ct, ctinfo);
 }
 
-static unsigned int ipv4_confirm(unsigned int hooknum,
+static unsigned int ipv4_confirm(const struct nf_hook_ops *ops,
 				 struct sk_buff *skb,
 				 const struct net_device *in,
 				 const struct net_device *out,
@@ -147,16 +147,16 @@ out:
 	return nf_conntrack_confirm(skb);
 }
 
-static unsigned int ipv4_conntrack_in(unsigned int hooknum,
+static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops,
 				      struct sk_buff *skb,
 				      const struct net_device *in,
 				      const struct net_device *out,
 				      int (*okfn)(struct sk_buff *))
 {
-	return nf_conntrack_in(dev_net(in), PF_INET, hooknum, skb);
+	return nf_conntrack_in(dev_net(in), PF_INET, ops->hooknum, skb);
 }
 
-static unsigned int ipv4_conntrack_local(unsigned int hooknum,
+static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops,
 					 struct sk_buff *skb,
 					 const struct net_device *in,
 					 const struct net_device *out,
@@ -166,7 +166,7 @@ static unsigned int ipv4_conntrack_local(unsigned int hooknum,
 	if (skb->len < sizeof(struct iphdr) ||
 	    ip_hdrlen(skb) < sizeof(struct iphdr))
 		return NF_ACCEPT;
-	return nf_conntrack_in(dev_net(out), PF_INET, hooknum, skb);
+	return nf_conntrack_in(dev_net(out), PF_INET, ops->hooknum, skb);
 }
 
 /* Connection tracking may drop packets, but never alters them, so
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 7428155..12e13bd 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -60,7 +60,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
 		return IP_DEFRAG_CONNTRACK_OUT + zone;
 }
 
-static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
+static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
 					  struct sk_buff *skb,
 					  const struct net_device *in,
 					  const struct net_device *out,
@@ -83,7 +83,9 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
 #endif
 	/* Gather fragments. */
 	if (ip_is_fragment(ip_hdr(skb))) {
-		enum ip_defrag_users user = nf_ct_defrag_user(hooknum, skb);
+		enum ip_defrag_users user =
+			nf_ct_defrag_user(ops->hooknum, skb);
+
 		if (nf_ct_ipv4_gather_frags(skb, user))
 			return NF_STOLEN;
 	}
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index 2748b04..bf9f612 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -312,7 +312,7 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 	return XT_CONTINUE;
 }
 
-static unsigned int ipv6_synproxy_hook(unsigned int hooknum,
+static unsigned int ipv6_synproxy_hook(const struct nf_hook_ops *ops,
 				       struct sk_buff *skb,
 				       const struct net_device *in,
 				       const struct net_device *out,
diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
index 29b44b1..ca7f6c1 100644
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -32,13 +32,14 @@ static const struct xt_table packet_filter = {
 
 /* The work comes in here from netfilter.c. */
 static unsigned int
-ip6table_filter_hook(unsigned int hook, struct sk_buff *skb,
+ip6table_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		     const struct net_device *in, const struct net_device *out,
 		     int (*okfn)(struct sk_buff *))
 {
 	const struct net *net = dev_net((in != NULL) ? in : out);
 
-	return ip6t_do_table(skb, hook, in, out, net->ipv6.ip6table_filter);
+	return ip6t_do_table(skb, ops->hooknum, in, out,
+			     net->ipv6.ip6table_filter);
 }
 
 static struct nf_hook_ops *filter_ops __read_mostly;
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index c705907..307bbb7 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -76,17 +76,17 @@ ip6t_mangle_out(struct sk_buff *skb, const struct net_device *out)
 
 /* The work comes in here from netfilter.c. */
 static unsigned int
-ip6table_mangle_hook(unsigned int hook, struct sk_buff *skb,
+ip6table_mangle_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		     const struct net_device *in, const struct net_device *out,
 		     int (*okfn)(struct sk_buff *))
 {
-	if (hook == NF_INET_LOCAL_OUT)
+	if (ops->hooknum == NF_INET_LOCAL_OUT)
 		return ip6t_mangle_out(skb, out);
-	if (hook == NF_INET_POST_ROUTING)
-		return ip6t_do_table(skb, hook, in, out,
+	if (ops->hooknum == NF_INET_POST_ROUTING)
+		return ip6t_do_table(skb, ops->hooknum, in, out,
 				     dev_net(out)->ipv6.ip6table_mangle);
 	/* INPUT/FORWARD */
-	return ip6t_do_table(skb, hook, in, out,
+	return ip6t_do_table(skb, ops->hooknum, in, out,
 			     dev_net(in)->ipv6.ip6table_mangle);
 }
 
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index 9b076d2..84c7f33 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -63,7 +63,7 @@ static unsigned int nf_nat_rule_find(struct sk_buff *skb, unsigned int hooknum,
 }
 
 static unsigned int
-nf_nat_ipv6_fn(unsigned int hooknum,
+nf_nat_ipv6_fn(const struct nf_hook_ops *ops,
 	       struct sk_buff *skb,
 	       const struct net_device *in,
 	       const struct net_device *out,
@@ -72,7 +72,7 @@ nf_nat_ipv6_fn(unsigned int hooknum,
 	struct nf_conn *ct;
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn_nat *nat;
-	enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);
+	enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);
 	__be16 frag_off;
 	int hdrlen;
 	u8 nexthdr;
@@ -111,7 +111,8 @@ nf_nat_ipv6_fn(unsigned int hooknum,
 
 		if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
 			if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo,
-							     hooknum, hdrlen))
+							     ops->hooknum,
+							     hdrlen))
 				return NF_DROP;
 			else
 				return NF_ACCEPT;
@@ -124,14 +125,14 @@ nf_nat_ipv6_fn(unsigned int hooknum,
 		if (!nf_nat_initialized(ct, maniptype)) {
 			unsigned int ret;
 
-			ret = nf_nat_rule_find(skb, hooknum, in, out, ct);
+			ret = nf_nat_rule_find(skb, ops->hooknum, in, out, ct);
 			if (ret != NF_ACCEPT)
 				return ret;
 		} else {
 			pr_debug("Already setup manip %s for ct %p\n",
 				 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
 				 ct);
-			if (nf_nat_oif_changed(hooknum, ctinfo, nat, out))
+			if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
 				goto oif_changed;
 		}
 		break;
@@ -140,11 +141,11 @@ nf_nat_ipv6_fn(unsigned int hooknum,
 		/* ESTABLISHED */
 		NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
 			     ctinfo == IP_CT_ESTABLISHED_REPLY);
-		if (nf_nat_oif_changed(hooknum, ctinfo, nat, out))
+		if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
 			goto oif_changed;
 	}
 
-	return nf_nat_packet(ct, ctinfo, hooknum, skb);
+	return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);
 
 oif_changed:
 	nf_ct_kill_acct(ct, ctinfo, skb);
@@ -152,7 +153,7 @@ oif_changed:
 }
 
 static unsigned int
-nf_nat_ipv6_in(unsigned int hooknum,
+nf_nat_ipv6_in(const struct nf_hook_ops *ops,
 	       struct sk_buff *skb,
 	       const struct net_device *in,
 	       const struct net_device *out,
@@ -161,7 +162,7 @@ nf_nat_ipv6_in(unsigned int hooknum,
 	unsigned int ret;
 	struct in6_addr daddr = ipv6_hdr(skb)->daddr;
 
-	ret = nf_nat_ipv6_fn(hooknum, skb, in, out, okfn);
+	ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn);
 	if (ret != NF_DROP && ret != NF_STOLEN &&
 	    ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr))
 		skb_dst_drop(skb);
@@ -170,7 +171,7 @@ nf_nat_ipv6_in(unsigned int hooknum,
 }
 
 static unsigned int
-nf_nat_ipv6_out(unsigned int hooknum,
+nf_nat_ipv6_out(const struct nf_hook_ops *ops,
 		struct sk_buff *skb,
 		const struct net_device *in,
 		const struct net_device *out,
@@ -187,7 +188,7 @@ nf_nat_ipv6_out(unsigned int hooknum,
 	if (skb->len < sizeof(struct ipv6hdr))
 		return NF_ACCEPT;
 
-	ret = nf_nat_ipv6_fn(hooknum, skb, in, out, okfn);
+	ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn);
 #ifdef CONFIG_XFRM
 	if (ret != NF_DROP && ret != NF_STOLEN &&
 	    !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
@@ -209,7 +210,7 @@ nf_nat_ipv6_out(unsigned int hooknum,
 }
 
 static unsigned int
-nf_nat_ipv6_local_fn(unsigned int hooknum,
+nf_nat_ipv6_local_fn(const struct nf_hook_ops *ops,
 		     struct sk_buff *skb,
 		     const struct net_device *in,
 		     const struct net_device *out,
@@ -224,7 +225,7 @@ nf_nat_ipv6_local_fn(unsigned int hooknum,
 	if (skb->len < sizeof(struct ipv6hdr))
 		return NF_ACCEPT;
 
-	ret = nf_nat_ipv6_fn(hooknum, skb, in, out, okfn);
+	ret = nf_nat_ipv6_fn(ops, skb, in, out, okfn);
 	if (ret != NF_DROP && ret != NF_STOLEN &&
 	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
 		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 9a626d8..5274740 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -19,13 +19,14 @@ static const struct xt_table packet_raw = {
 
 /* The work comes in here from netfilter.c. */
 static unsigned int
-ip6table_raw_hook(unsigned int hook, struct sk_buff *skb,
+ip6table_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		  const struct net_device *in, const struct net_device *out,
 		  int (*okfn)(struct sk_buff *))
 {
 	const struct net *net = dev_net((in != NULL) ? in : out);
 
-	return ip6t_do_table(skb, hook, in, out, net->ipv6.ip6table_raw);
+	return ip6t_do_table(skb, ops->hooknum, in, out,
+			     net->ipv6.ip6table_raw);
 }
 
 static struct nf_hook_ops *rawtable_ops __read_mostly;
diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c
index ce88d1d..ab3b021 100644
--- a/net/ipv6/netfilter/ip6table_security.c
+++ b/net/ipv6/netfilter/ip6table_security.c
@@ -36,14 +36,15 @@ static const struct xt_table security_table = {
 };
 
 static unsigned int
-ip6table_security_hook(unsigned int hook, struct sk_buff *skb,
+ip6table_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		       const struct net_device *in,
 		       const struct net_device *out,
 		       int (*okfn)(struct sk_buff *))
 {
 	const struct net *net = dev_net((in != NULL) ? in : out);
 
-	return ip6t_do_table(skb, hook, in, out, net->ipv6.ip6table_security);
+	return ip6t_do_table(skb, ops->hooknum, in, out,
+			     net->ipv6.ip6table_security);
 }
 
 static struct nf_hook_ops *sectbl_ops __read_mostly;
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 54b75ea..486545e 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -95,7 +95,7 @@ static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
 	return NF_ACCEPT;
 }
 
-static unsigned int ipv6_helper(unsigned int hooknum,
+static unsigned int ipv6_helper(const struct nf_hook_ops *ops,
 				struct sk_buff *skb,
 				const struct net_device *in,
 				const struct net_device *out,
@@ -133,7 +133,7 @@ static unsigned int ipv6_helper(unsigned int hooknum,
 	return helper->help(skb, protoff, ct, ctinfo);
 }
 
-static unsigned int ipv6_confirm(unsigned int hooknum,
+static unsigned int ipv6_confirm(const struct nf_hook_ops *ops,
 				 struct sk_buff *skb,
 				 const struct net_device *in,
 				 const struct net_device *out,
@@ -219,16 +219,17 @@ static unsigned int __ipv6_conntrack_in(struct net *net,
 	return nf_conntrack_in(net, PF_INET6, hooknum, skb);
 }
 
-static unsigned int ipv6_conntrack_in(unsigned int hooknum,
+static unsigned int ipv6_conntrack_in(const struct nf_hook_ops *ops,
 				      struct sk_buff *skb,
 				      const struct net_device *in,
 				      const struct net_device *out,
 				      int (*okfn)(struct sk_buff *))
 {
-	return __ipv6_conntrack_in(dev_net(in), hooknum, skb, in, out, okfn);
+	return __ipv6_conntrack_in(dev_net(in), ops->hooknum, skb, in, out,
+				   okfn);
 }
 
-static unsigned int ipv6_conntrack_local(unsigned int hooknum,
+static unsigned int ipv6_conntrack_local(const struct nf_hook_ops *ops,
 					 struct sk_buff *skb,
 					 const struct net_device *in,
 					 const struct net_device *out,
@@ -239,7 +240,8 @@ static unsigned int ipv6_conntrack_local(unsigned int hooknum,
 		net_notice_ratelimited("ipv6_conntrack_local: packet too short\n");
 		return NF_ACCEPT;
 	}
-	return __ipv6_conntrack_in(dev_net(out), hooknum, skb, in, out, okfn);
+	return __ipv6_conntrack_in(dev_net(out), ops->hooknum, skb, in, out,
+				   okfn);
 }
 
 static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = {
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index aacd121..ec483aa 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -52,7 +52,7 @@ static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum,
 
 }
 
-static unsigned int ipv6_defrag(unsigned int hooknum,
+static unsigned int ipv6_defrag(const struct nf_hook_ops *ops,
 				struct sk_buff *skb,
 				const struct net_device *in,
 				const struct net_device *out,
@@ -66,7 +66,7 @@ static unsigned int ipv6_defrag(unsigned int hooknum,
 		return NF_ACCEPT;
 #endif
 
-	reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(hooknum, skb));
+	reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(ops->hooknum, skb));
 	/* queued */
 	if (reasm == NULL)
 		return NF_STOLEN;
@@ -75,7 +75,7 @@ static unsigned int ipv6_defrag(unsigned int hooknum,
 	if (reasm == skb)
 		return NF_ACCEPT;
 
-	nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in,
+	nf_ct_frag6_output(ops->hooknum, reasm, (struct net_device *)in,
 			   (struct net_device *)out, okfn);
 
 	return NF_STOLEN;
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 593b16e..1fbab0c 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -146,7 +146,7 @@ unsigned int nf_iterate(struct list_head *head,
 		/* Optimization: we don't need to hold module
 		   reference here, since function can't sleep. --RR */
 repeat:
-		verdict = (*elemp)->hook(hook, skb, indev, outdev, okfn);
+		verdict = (*elemp)->hook(*elemp, skb, indev, outdev, okfn);
 		if (verdict != NF_ACCEPT) {
 #ifdef CONFIG_NETFILTER_DEBUG
 			if (unlikely((verdict & NF_VERDICT_MASK)
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 74fd00c..34fda62 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1239,11 +1239,11 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
  *	Check if packet is reply for established ip_vs_conn.
  */
 static unsigned int
-ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
 	     const struct net_device *in, const struct net_device *out,
 	     int (*okfn)(struct sk_buff *))
 {
-	return ip_vs_out(hooknum, skb, AF_INET);
+	return ip_vs_out(ops->hooknum, skb, AF_INET);
 }
 
 /*
@@ -1251,11 +1251,11 @@ ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
  *	Check if packet is reply for established ip_vs_conn.
  */
 static unsigned int
-ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		   const struct net_device *in, const struct net_device *out,
 		   int (*okfn)(struct sk_buff *))
 {
-	return ip_vs_out(hooknum, skb, AF_INET);
+	return ip_vs_out(ops->hooknum, skb, AF_INET);
 }
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -1266,11 +1266,11 @@ ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
  *	Check if packet is reply for established ip_vs_conn.
  */
 static unsigned int
-ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
 	     const struct net_device *in, const struct net_device *out,
 	     int (*okfn)(struct sk_buff *))
 {
-	return ip_vs_out(hooknum, skb, AF_INET6);
+	return ip_vs_out(ops->hooknum, skb, AF_INET6);
 }
 
 /*
@@ -1278,11 +1278,11 @@ ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
  *	Check if packet is reply for established ip_vs_conn.
  */
 static unsigned int
-ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		   const struct net_device *in, const struct net_device *out,
 		   int (*okfn)(struct sk_buff *))
 {
-	return ip_vs_out(hooknum, skb, AF_INET6);
+	return ip_vs_out(ops->hooknum, skb, AF_INET6);
 }
 
 #endif
@@ -1733,12 +1733,12 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
  *	Schedule and forward packets from remote clients
  */
 static unsigned int
-ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		      const struct net_device *in,
 		      const struct net_device *out,
 		      int (*okfn)(struct sk_buff *))
 {
-	return ip_vs_in(hooknum, skb, AF_INET);
+	return ip_vs_in(ops->hooknum, skb, AF_INET);
 }
 
 /*
@@ -1746,11 +1746,11 @@ ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
  *	Schedule and forward packets from local clients
  */
 static unsigned int
-ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		     const struct net_device *in, const struct net_device *out,
 		     int (*okfn)(struct sk_buff *))
 {
-	return ip_vs_in(hooknum, skb, AF_INET);
+	return ip_vs_in(ops->hooknum, skb, AF_INET);
 }
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -1760,7 +1760,7 @@ ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
  * Copy info from first fragment, to the rest of them.
  */
 static unsigned int
-ip_vs_preroute_frag6(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_preroute_frag6(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		     const struct net_device *in,
 		     const struct net_device *out,
 		     int (*okfn)(struct sk_buff *))
@@ -1792,12 +1792,12 @@ ip_vs_preroute_frag6(unsigned int hooknum, struct sk_buff *skb,
  *	Schedule and forward packets from remote clients
  */
 static unsigned int
-ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		      const struct net_device *in,
 		      const struct net_device *out,
 		      int (*okfn)(struct sk_buff *))
 {
-	return ip_vs_in(hooknum, skb, AF_INET6);
+	return ip_vs_in(ops->hooknum, skb, AF_INET6);
 }
 
 /*
@@ -1805,11 +1805,11 @@ ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
  *	Schedule and forward packets from local clients
  */
 static unsigned int
-ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		     const struct net_device *in, const struct net_device *out,
 		     int (*okfn)(struct sk_buff *))
 {
-	return ip_vs_in(hooknum, skb, AF_INET6);
+	return ip_vs_in(ops->hooknum, skb, AF_INET6);
 }
 
 #endif
@@ -1825,7 +1825,7 @@ ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
  *      and send them to ip_vs_in_icmp.
  */
 static unsigned int
-ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_forward_icmp(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		   const struct net_device *in, const struct net_device *out,
 		   int (*okfn)(struct sk_buff *))
 {
@@ -1842,12 +1842,12 @@ ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
 	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
 		return NF_ACCEPT;
 
-	return ip_vs_in_icmp(skb, &r, hooknum);
+	return ip_vs_in_icmp(skb, &r, ops->hooknum);
 }
 
 #ifdef CONFIG_IP_VS_IPV6
 static unsigned int
-ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
+ip_vs_forward_icmp_v6(const struct nf_hook_ops *ops, struct sk_buff *skb,
 		      const struct net_device *in, const struct net_device *out,
 		      int (*okfn)(struct sk_buff *))
 {
@@ -1866,7 +1866,7 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
 	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
 		return NF_ACCEPT;
 
-	return ip_vs_in_icmp_v6(skb, &r, hooknum, &iphdr);
+	return ip_vs_in_icmp_v6(skb, &r, ops->hooknum, &iphdr);
 }
 #endif
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 568c769..3f224d7 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4668,7 +4668,7 @@ static unsigned int selinux_ip_forward(struct sk_buff *skb, int ifindex,
 	return NF_ACCEPT;
 }
 
-static unsigned int selinux_ipv4_forward(unsigned int hooknum,
+static unsigned int selinux_ipv4_forward(const struct nf_hook_ops *ops,
 					 struct sk_buff *skb,
 					 const struct net_device *in,
 					 const struct net_device *out,
@@ -4678,7 +4678,7 @@ static unsigned int selinux_ipv4_forward(unsigned int hooknum,
 }
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static unsigned int selinux_ipv6_forward(unsigned int hooknum,
+static unsigned int selinux_ipv6_forward(const struct nf_hook_ops *ops,
 					 struct sk_buff *skb,
 					 const struct net_device *in,
 					 const struct net_device *out,
@@ -4710,7 +4710,7 @@ static unsigned int selinux_ip_output(struct sk_buff *skb,
 	return NF_ACCEPT;
 }
 
-static unsigned int selinux_ipv4_output(unsigned int hooknum,
+static unsigned int selinux_ipv4_output(const struct nf_hook_ops *ops,
 					struct sk_buff *skb,
 					const struct net_device *in,
 					const struct net_device *out,
@@ -4837,7 +4837,7 @@ static unsigned int selinux_ip_postroute(struct sk_buff *skb, int ifindex,
 	return NF_ACCEPT;
 }
 
-static unsigned int selinux_ipv4_postroute(unsigned int hooknum,
+static unsigned int selinux_ipv4_postroute(const struct nf_hook_ops *ops,
 					   struct sk_buff *skb,
 					   const struct net_device *in,
 					   const struct net_device *out,
@@ -4847,7 +4847,7 @@ static unsigned int selinux_ipv4_postroute(unsigned int hooknum,
 }
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static unsigned int selinux_ipv6_postroute(unsigned int hooknum,
+static unsigned int selinux_ipv6_postroute(const struct nf_hook_ops *ops,
 					   struct sk_buff *skb,
 					   const struct net_device *in,
 					   const struct net_device *out,
-- 
1.7.10.4


^ permalink raw reply related

* kernel policy routing table src ip not respected since 2.6.37 and commit 9fc3bbb4a752
From: Vincent Li @ 2013-10-14 16:13 UTC (permalink / raw)
  To: netdev@vger.kernel.org; +Cc: jsing

I wrote a simple bash script to test whether the policy routing table
src ip is respected. git bisect found that commit 9fc3bbb4a752
changed the policy routing table source ip behavior.

commit 9fc3bbb4a752f108cf096d96640f3b548bbbce6c
Author: Joel Sing <jsing@google.com>
Date:   Mon Jan 3 20:24:20 2011 +0000

    ipv4/route.c: respect prefsrc for local routes

    The preferred source address is currently ignored for local routes,
    which results in all local connections having a src address that is the
    same as the local dst address. Fix this by respecting the preferred source
    address when it is provided for local routes.

test script:

#!/bin/bash
ip addr add 10.1.1.1/24 dev eth0
ip addr add 10.1.1.2/24 dev eth0
ip rule add priority 245 table 245
ip route add 10.1.1.0/24 dev eth0  proto kernel  scope link  src
10.1.1.2 table 245 <===source ip 10.1.1.2 to be preferred

ip addr show dev eth0
ip route list table main
ip route list table 245


tcpdump -nn -i eth0 host 10.1.1.9 and icmp &

ping 10.1.1.9



--before commit 9fc3bbb4a752

the source is from ip 10.1.1.2 as expected

--after commit 9fc3bbb4a752

the source is from ip 10.1.1.1, which is not expected since I have high
priority table 245 with source ip 10.1.1.2

Is this a regression from commit 9fc3bbb4a752?

Vincent

^ permalink raw reply

* [PATCH ipsec] xfrm: prevent ipcomp scratch buffer race condition
From: Michal Kubecek @ 2013-10-14 16:03 UTC (permalink / raw)
  To: Steffen Klassert; +Cc: Herbert Xu, David S. Miller, netdev

In ipcomp_compress(), softirq is enabled too early, allowing the
per-cpu scratch buffer to be rewritten by ipcomp_decompress()
(called on the same CPU in softirq context) between populating
the buffer and copying the compressed data to the skb.

Add similar protection into ipcomp_decompress() as it can be
called from process context as well (even if such scenario seems
a bit artificial).

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
---
 net/xfrm/xfrm_ipcomp.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c
index 2906d52..96946fb 100644
--- a/net/xfrm/xfrm_ipcomp.c
+++ b/net/xfrm/xfrm_ipcomp.c
@@ -48,9 +48,11 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
 	const int cpu = get_cpu();
 	u8 *scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
 	struct crypto_comp *tfm = *per_cpu_ptr(ipcd->tfms, cpu);
-	int err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen);
+	int err;
 	int len;
 
+	local_bh_disable();
+	err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen);
 	if (err)
 		goto out;
 
@@ -103,6 +105,7 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
 	err = 0;
 
 out:
+	local_bh_enable();
 	put_cpu();
 	return err;
 }
@@ -148,7 +151,6 @@ static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb)
 
 	local_bh_disable();
 	err = crypto_comp_compress(tfm, start, plen, scratch, &dlen);
-	local_bh_enable();
 	if (err)
 		goto out;
 
@@ -158,12 +160,14 @@ static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb)
 	}
 
 	memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen);
+	local_bh_enable();
 	put_cpu();
 
 	pskb_trim(skb, dlen + sizeof(struct ip_comp_hdr));
 	return 0;
 
 out:
+	local_bh_enable();
 	put_cpu();
 	return err;
 }
-- 
1.8.1.4

^ permalink raw reply related

* Re: [RFC] ipv6: always join solicited-node address
From: Hannes Frederic Sowa @ 2013-10-14 15:56 UTC (permalink / raw)
  To: Bjørn Mork; +Cc: netdev
In-Reply-To: <87fvs4s4zc.fsf@nemi.mork.no>

On Mon, Oct 14, 2013 at 09:09:59AM +0200, Bjørn Mork wrote:
> Yes, but that would also make the IP layer try to resolve IP to link
> layer addresses both for IPv4 and IPv6, which just won't work. At least
> not for IPv4, where there just is no way to transport an ARP to the
> modem.  And I assume it may fail for IPv6 too on any sane device.

I don't think that clearing the IFF_NOARP flag would kill connectivity
for either IPv4 or IPv6. It may compromise security for IPv6 though
(no idea what the telco network behind the modem looks like).

> > Is this a specific bug of the modem you are using or are all devices
> > powered by this driver like this?
> 
> Unfortunately I have no IPv6 enabled SIM myself, so I have no
> information about other devices.  This report was based on user
> feedback.
> 
> I assume the bug is specific to this firmware implementation, probably
> extending to all similar devices from the same vendor.  But it could be
> more common than that.  The fact that the bug is there indicates that
> this works just fine in Windows.
> 
> Yes, I realize that I am in ugly-hack-to-workaround-firmware-issues land
> again... But it sure would be nice to have some way for a driver to
> indicate that L2 neighbour tables are meaningless, but that any incoming
> requests should still be answered.

L2 neighbour tables are resolved on demand and won't be queried for the
link you are talking about (at least for IPv6, but I assume IPv4, too).

A new flag should have clear semantics then:

* split IFF_NOARP to IFF_NOARP and IFF_NONDISC
* split IFF_NOARP to IFF_NOLLRESOLV_RESPONSE and IFF_NOLLRESOLV_MODIFY
  (each one flag which is applicable for both IPv4 and IPv6)

I tend to lean towards the last alternative but still wonder if this is
just overhead for this one buggy device.

Greetings,

  Hannes

^ permalink raw reply

* Re: [PATCH 1/1] net: fix cipso packet validation when !NETLABEL
From: Paul Moore @ 2013-10-14 15:12 UTC (permalink / raw)
  To: Seif Mazareeb
  Cc: davem@davemloft.net, netdev@vger.kernel.org,
	thomas.petazzoni@free-electrons.com, Dmitri Epshtein
In-Reply-To: <0DB595A2CB707F458400BE9663B6A72269C00479DF@SC-VEXCH2.marvell.com>

On Saturday, October 12, 2013 10:21:50 PM Seif Mazareeb wrote:
> When CONFIG_NETLABEL is disabled, the cipso_v4_validate() function could
> loop forever in the main loop if opt[opt_iter +1] == 0, this will cause a
> kernel crash in an SMP system, since the CPU executing this function will
> stall /not respond to IPIs.
> 
> This problem can be reproduced by running the IP Stack Integrity Checker
> (http://isic.sourceforge.net) using the following command on a Linux machine
> connected to DUT:
> 
> "icmpsic -s rand -d <DUT IP address> -r 123456"
> wait (1-2 min)
> 
> Signed-off-by: Seif Mazareeb <seif@marvell.com>

Thanks for sticking with this.

Acked-by: Paul Moore <paul@paul-moore.com>

> ---
>  include/net/cipso_ipv4.h | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/include/net/cipso_ipv4.h b/include/net/cipso_ipv4.h
> index a7a683e..a8c2ef6 100644
> --- a/include/net/cipso_ipv4.h
> +++ b/include/net/cipso_ipv4.h
> @@ -290,6 +290,7 @@ static inline int cipso_v4_validate(const struct sk_buff
> *skb, unsigned char err_offset = 0;
>         u8 opt_len = opt[1];
>         u8 opt_iter;
> +       u8 tag_len;
> 
>         if (opt_len < 8) {
>                 err_offset = 1;
> @@ -302,11 +303,12 @@ static inline int cipso_v4_validate(const struct
> sk_buff *skb, }
> 
>         for (opt_iter = 6; opt_iter < opt_len;) {
> -               if (opt[opt_iter + 1] > (opt_len - opt_iter)) {
> +               tag_len = opt[opt_iter + 1];
> +               if ((tag_len == 0) || (opt[opt_iter + 1] > (opt_len -
> opt_iter))) { err_offset = opt_iter + 1;
>                         goto out;
>                 }
> -               opt_iter += opt[opt_iter + 1];
> +               opt_iter += tag_len;
>         }
> 
>  out:
> --
> 1.8.1.2

-- 
paul moore
www.paul-moore.com

^ permalink raw reply

* Re: [PATCH][net-next] gianfar: Simplify MQ polling to avoid soft lockup
From: Claudiu Manoil @ 2013-10-14 15:11 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, David S. Miller
In-Reply-To: <1381761267.3392.49.camel@edumazet-glaptop.roam.corp.google.com>

On 10/14/2013 5:34 PM, Eric Dumazet wrote:
> On Mon, 2013-10-14 at 17:05 +0300, Claudiu Manoil wrote:
>> Under certain low traffic conditions, the single core
>> devices with multiple Rx/Tx queues (MQ mode) may reach
>> soft lockup due to gfar_poll not returning in proper time.
>> The following exception was obtained using iperf on a 100Mbit
>> half-duplex link, for a p1010 single core device:
>>
>> BUG: soft lockup - CPU#0 stuck for 23s! [iperf:2847]
>> Modules linked in:
>> CPU: 0 PID: 2847 Comm: iperf Not tainted 3.12.0-rc3 #16
>> task: e8bf8000 ti: eeb16000 task.ti: ee646000
>> NIP: c0255b6c LR: c0367ae8 CTR: c0461c18
>> REGS: eeb17e70 TRAP: 0901   Not tainted  (3.12.0-rc3)
>> MSR: 00029000 <CE,EE,ME>  CR: 44228428  XER: 20000000
>>
>> GPR00: c0367ad4 eeb17f20 e8bf8000 ee01f4b4 00000008 ffffffff ffffffff
>> 00000000
>> GPR08: 000000c0 00000008 000000ff ffffffc0 000193fe
>> NIP [c0255b6c] find_next_bit+0xb8/0xc4
>> LR [c0367ae8] gfar_poll+0xc8/0x1d8
>> Call Trace:
>> [eeb17f20] [c0367ad4] gfar_poll+0xb4/0x1d8 (unreliable)
>> [eeb17f70] [c0422100] net_rx_action+0xa4/0x158
>> [eeb17fa0] [c003ec6c] __do_softirq+0xcc/0x17c
>> [eeb17ff0] [c000c28c] call_do_softirq+0x24/0x3c
>> [ee647cc0] [c0004660] do_softirq+0x6c/0x94
>> [ee647ce0] [c003eb9c] local_bh_enable+0x9c/0xa0
>> [ee647cf0] [c0454fe8] tcp_prequeue_process+0xa4/0xdc
>> [ee647d10] [c0457e44] tcp_recvmsg+0x498/0x96c
>> [ee647d80] [c047b630] inet_recvmsg+0x40/0x64
>> [ee647da0] [c040ca8c] sock_recvmsg+0x90/0xc0
>> [ee647e30] [c040edb8] SyS_recvfrom+0x98/0xfc
>>
>> To prevent this, the outer while() loop has been removed
>> allowing gfar_poll() to return faster even if there's
>> still budget left.  Also, there's no need to recompute
>> the budget per Rx queue anymore.
>
> It seems there is a race condition, and this patch only makes it happen
> less often ?
>
> return faster means what exactly ?
>

Hi Eric,
Because of the outer while loop, gfar_poll may not return due
to continuous tx work. The later implementation of gfar_poll
allows only one iteration of the Tx queues before returning
control to net_rx_action(), that's what I meant with "returns faster".
I tested this fix with different loads, and the soft lockup
didn't trigger (without the fix it triggers right away).

Besides, isn't this a more appropriate napi poll implementation
than the former one with the outer while() loop?

Thanks,
Claudiu

^ permalink raw reply

* [RFC PATCH 2/2] net: Add trace events for all receive entry points, exposing more skb fields
From: Ben Hutchings @ 2013-10-14 14:48 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers
In-Reply-To: <1381761552.1626.8.camel@bwh-desktop.uk.level5networks.com>

The existing net/netif_rx and net/netif_receive_skb trace events
provide little information about the skb, nor do they indicate how it
entered the stack.

Add trace events at entry of each of the exported functions, including
most fields that are likely to be interesting for debugging driver
datapath behaviour.  Split netif_rx() and netif_receive_skb() so that
internal calls are not traced.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
There is a call to netif_rx() from xfrm_input() where I think the skb
has not passed through any layered device.  I'm thinking that perhaps
this should call netif_rx_internal() to avoid a confusing trace event.
Are there any other cases like this?

I'm not that happy about the event names here and am open to
suggestions.

Ben.

 include/trace/events/net.h | 100 +++++++++++++++++++++++++++++++++++++++++++++
 net/core/dev.c             | 100 +++++++++++++++++++++++++++------------------
 2 files changed, 161 insertions(+), 39 deletions(-)

diff --git a/include/trace/events/net.h b/include/trace/events/net.h
index 0b61f2a..731907c 100644
--- a/include/trace/events/net.h
+++ b/include/trace/events/net.h
@@ -136,6 +136,106 @@ DEFINE_EVENT(net_dev_template, netif_rx,
 
 	TP_ARGS(skb)
 );
+
+DECLARE_EVENT_CLASS(net_dev_rx_verbose_template,
+
+	TP_PROTO(const struct sk_buff *skb),
+
+	TP_ARGS(skb),
+
+	TP_STRUCT__entry(
+		__string(	name,			skb->dev->name	)
+		__field(	unsigned int,		napi_id		)
+		__field(	u16,			queue_mapping	)
+		__field(	const void *,		skbaddr		)
+		__field(	bool,			vlan_tagged	)
+		__field(	u16,			vlan_proto	)
+		__field(	u16,			vlan_tci	)
+		__field(	u16,			protocol	)
+		__field(	u8,			ip_summed	)
+		__field(	u32,			rxhash		)
+		__field(	bool,			l4_rxhash	)
+		__field(	unsigned int,		len		)
+		__field(	unsigned int,		data_len	)
+		__field(	unsigned int,		truesize	)
+		__field(	bool,			mac_header_valid)
+		__field(	int,			mac_header	)
+		__field(	unsigned char,		nr_frags	)
+		__field(	u16,			gso_size	)
+		__field(	u16,			gso_type	)
+	),
+
+	TP_fast_assign(
+		__assign_str(name, skb->dev->name);
+#ifdef CONFIG_NET_RX_BUSY_POLL
+		__entry->napi_id = skb->napi_id;
+#else
+		__entry->napi_id = 0;
+#endif
+		__entry->queue_mapping = skb->queue_mapping;
+		__entry->skbaddr = skb;
+		__entry->vlan_tagged = vlan_tx_tag_present(skb);
+		__entry->vlan_proto = ntohs(skb->vlan_proto);
+		__entry->vlan_tci = vlan_tx_tag_get(skb);
+		__entry->protocol = ntohs(skb->protocol);
+		__entry->ip_summed = skb->ip_summed;
+		__entry->rxhash = skb->rxhash;
+		__entry->l4_rxhash = skb->l4_rxhash;
+		__entry->len = skb->len;
+		__entry->data_len = skb->data_len;
+		__entry->truesize = skb->truesize;
+		__entry->mac_header_valid = skb_mac_header_was_set(skb);
+		__entry->mac_header = skb_mac_header(skb) - skb->data;
+		__entry->nr_frags = skb_shinfo(skb)->nr_frags;
+		__entry->gso_size = skb_shinfo(skb)->gso_size;
+		__entry->gso_type = skb_shinfo(skb)->gso_type;
+	),
+
+	TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d rxhash=0x%08x l4_rxhash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x",
+		  __get_str(name), __entry->napi_id, __entry->queue_mapping,
+		  __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto,
+		  __entry->vlan_tci, __entry->protocol, __entry->ip_summed,
+		  __entry->rxhash, __entry->l4_rxhash, __entry->len,
+		  __entry->data_len, __entry->truesize,
+		  __entry->mac_header_valid, __entry->mac_header,
+		  __entry->nr_frags, __entry->gso_size, __entry->gso_type)
+);
+
+DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_frags_entry,
+
+	TP_PROTO(const struct sk_buff *skb),
+
+	TP_ARGS(skb)
+);
+
+DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_receive_entry,
+
+	TP_PROTO(const struct sk_buff *skb),
+
+	TP_ARGS(skb)
+);
+
+DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_entry,
+
+	TP_PROTO(const struct sk_buff *skb),
+
+	TP_ARGS(skb)
+);
+
+DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_entry,
+
+	TP_PROTO(const struct sk_buff *skb),
+
+	TP_ARGS(skb)
+);
+
+DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_ni_entry,
+
+	TP_PROTO(const struct sk_buff *skb),
+
+	TP_ARGS(skb)
+);
+
 #endif /* _TRACE_NET_H */
 
 /* This part must be outside protection */
diff --git a/net/core/dev.c b/net/core/dev.c
index e221963..faf49b7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -146,6 +146,8 @@ struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 struct list_head ptype_all __read_mostly;	/* Taps */
 static struct list_head offload_base __read_mostly;
 
+static int netif_rx_internal(struct sk_buff *skb);
+
 /*
  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
  * semaphore.
@@ -1698,7 +1700,7 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 	 */
 	skb_scrub_packet(skb, true);
 
-	return netif_rx(skb);
+	return netif_rx_internal(skb);
 }
 EXPORT_SYMBOL_GPL(dev_forward_skb);
 
@@ -3213,22 +3215,7 @@ enqueue:
 	return NET_RX_DROP;
 }
 
-/**
- *	netif_rx	-	post buffer to the network code
- *	@skb: buffer to post
- *
- *	This function receives a packet from a device driver and queues it for
- *	the upper (protocol) levels to process.  It always succeeds. The buffer
- *	may be dropped during processing for congestion control or by the
- *	protocol layers.
- *
- *	return values:
- *	NET_RX_SUCCESS	(no congestion)
- *	NET_RX_DROP     (packet was dropped)
- *
- */
-
-int netif_rx(struct sk_buff *skb)
+static int netif_rx_internal(struct sk_buff *skb)
 {
 	int ret;
 
@@ -3264,14 +3251,38 @@ int netif_rx(struct sk_buff *skb)
 	}
 	return ret;
 }
+
+/**
+ *	netif_rx	-	post buffer to the network code
+ *	@skb: buffer to post
+ *
+ *	This function receives a packet from a device driver and queues it for
+ *	the upper (protocol) levels to process.  It always succeeds. The buffer
+ *	may be dropped during processing for congestion control or by the
+ *	protocol layers.
+ *
+ *	return values:
+ *	NET_RX_SUCCESS	(no congestion)
+ *	NET_RX_DROP     (packet was dropped)
+ *
+ */
+
+int netif_rx(struct sk_buff *skb)
+{
+	trace_netif_rx_entry(skb);
+
+	return netif_rx_internal(skb);
+}
 EXPORT_SYMBOL(netif_rx);
 
 int netif_rx_ni(struct sk_buff *skb)
 {
 	int err;
 
+	trace_netif_rx_ni_entry(skb);
+
 	preempt_disable();
-	err = netif_rx(skb);
+	err = netif_rx_internal(skb);
 	if (local_softirq_pending())
 		do_softirq();
 	preempt_enable();
@@ -3653,22 +3664,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	return ret;
 }
 
-/**
- *	netif_receive_skb - process receive buffer from network
- *	@skb: buffer to process
- *
- *	netif_receive_skb() is the main receive data processing function.
- *	It always succeeds. The buffer may be dropped during processing
- *	for congestion control or by the protocol layers.
- *
- *	This function may only be called from softirq context and interrupts
- *	should be enabled.
- *
- *	Return values (usually ignored):
- *	NET_RX_SUCCESS: no congestion
- *	NET_RX_DROP: packet was dropped
- */
-int netif_receive_skb(struct sk_buff *skb)
+static int netif_receive_skb_internal(struct sk_buff *skb)
 {
 	net_timestamp_check(netdev_tstamp_prequeue, skb);
 
@@ -3694,6 +3690,28 @@ int netif_receive_skb(struct sk_buff *skb)
 #endif
 	return __netif_receive_skb(skb);
 }
+
+/**
+ *	netif_receive_skb - process receive buffer from network
+ *	@skb: buffer to process
+ *
+ *	netif_receive_skb() is the main receive data processing function.
+ *	It always succeeds. The buffer may be dropped during processing
+ *	for congestion control or by the protocol layers.
+ *
+ *	This function may only be called from softirq context and interrupts
+ *	should be enabled.
+ *
+ *	Return values (usually ignored):
+ *	NET_RX_SUCCESS: no congestion
+ *	NET_RX_DROP: packet was dropped
+ */
+int netif_receive_skb(struct sk_buff *skb)
+{
+	trace_netif_receive_skb_entry(skb);
+
+	return netif_receive_skb_internal(skb);
+}
 EXPORT_SYMBOL(netif_receive_skb);
 
 /* Network device is going away, flush any packets still pending
@@ -3755,7 +3773,7 @@ static int napi_gro_complete(struct sk_buff *skb)
 	}
 
 out:
-	return netif_receive_skb(skb);
+	return netif_receive_skb_internal(skb);
 }
 
 /* napi->gro_list contains packets ordered by age.
@@ -3906,7 +3924,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 {
 	switch (ret) {
 	case GRO_NORMAL:
-		if (netif_receive_skb(skb))
+		if (netif_receive_skb_internal(skb))
 			ret = GRO_DROP;
 		break;
 
@@ -3948,6 +3966,8 @@ static void skb_gro_reset_offset(struct sk_buff *skb)
 
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
+	trace_napi_gro_receive_entry(skb);
+
 	skb_gro_reset_offset(skb);
 
 	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
@@ -3989,7 +4009,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *
 
 		if (ret == GRO_HELD)
 			skb_gro_pull(skb, -ETH_HLEN);
-		else if (netif_receive_skb(skb))
+		else if (netif_receive_skb_internal(skb))
 			ret = GRO_DROP;
 		break;
 
@@ -4048,6 +4068,8 @@ gro_result_t napi_gro_frags(struct napi_struct *napi)
 	if (!skb)
 		return GRO_DROP;
 
+	trace_napi_gro_frags_entry(skb);
+
 	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
 }
 EXPORT_SYMBOL(napi_gro_frags);
@@ -6591,11 +6613,11 @@ static int dev_cpu_callback(struct notifier_block *nfb,
 
 	/* Process offline CPU's input_pkt_queue */
 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
-		netif_rx(skb);
+		netif_rx_internal(skb);
 		input_queue_head_incr(oldsd);
 	}
 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
-		netif_rx(skb);
+		netif_rx_internal(skb);
 		input_queue_head_incr(oldsd);
 	}
 

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* [RFC PATCH 1/2] net: Add net_dev_start_xmit trace event, exposing more skb fields
From: Ben Hutchings @ 2013-10-14 14:41 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers
In-Reply-To: <1381761552.1626.8.camel@bwh-desktop.uk.level5networks.com>

The existing net/net_dev_xmit trace event provides little information
about the skb that has been passed to the driver, and it is not
simple to add more since the skb may already have been freed at
the point the event is emitted.

Add a separate trace event before the skb is passed to the driver,
including most fields that are likely to be interesting for debugging
driver datapath behaviour.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 include/trace/events/net.h | 58 ++++++++++++++++++++++++++++++++++++++++++++++
 net/core/dev.c             |  2 ++
 2 files changed, 60 insertions(+)

diff --git a/include/trace/events/net.h b/include/trace/events/net.h
index f99645d..0b61f2a 100644
--- a/include/trace/events/net.h
+++ b/include/trace/events/net.h
@@ -6,9 +6,67 @@
 
 #include <linux/skbuff.h>
 #include <linux/netdevice.h>
+#include <linux/if_vlan.h>
 #include <linux/ip.h>
 #include <linux/tracepoint.h>
 
+TRACE_EVENT(net_dev_start_xmit,
+
+	TP_PROTO(const struct sk_buff *skb, const struct net_device *dev),
+
+	TP_ARGS(skb, dev),
+
+	TP_STRUCT__entry(
+		__string(	name,			dev->name	)
+		__field(	u16,			queue_mapping	)
+		__field(	const void *,		skbaddr		)
+		__field(	bool,			vlan_tagged	)
+		__field(	u16,			vlan_proto	)
+		__field(	u16,			vlan_tci	)
+		__field(	u16,			protocol	)
+		__field(	u8,			ip_summed	)
+		__field(	unsigned int,		len		)
+		__field(	unsigned int,		data_len	)
+		__field(	int,			network_offset	)
+		__field(	bool,			transport_offset_valid)
+		__field(	int,			transport_offset)
+		__field(	u8,			tx_flags	)
+		__field(	u16,			gso_size	)
+		__field(	u16,			gso_segs	)
+		__field(	u16,			gso_type	)
+	),
+
+	TP_fast_assign(
+		__assign_str(name, dev->name);
+		__entry->queue_mapping = skb->queue_mapping;
+		__entry->skbaddr = skb;
+		__entry->vlan_tagged = vlan_tx_tag_present(skb);
+		__entry->vlan_proto = ntohs(skb->vlan_proto);
+		__entry->vlan_tci = vlan_tx_tag_get(skb);
+		__entry->protocol = ntohs(skb->protocol);
+		__entry->ip_summed = skb->ip_summed;
+		__entry->len = skb->len;
+		__entry->data_len = skb->data_len;
+		__entry->network_offset = skb_network_offset(skb);
+		__entry->transport_offset_valid =
+			skb_transport_header_was_set(skb);
+		__entry->transport_offset = skb_transport_offset(skb);
+		__entry->tx_flags = skb_shinfo(skb)->tx_flags;
+		__entry->gso_size = skb_shinfo(skb)->gso_size;
+		__entry->gso_segs = skb_shinfo(skb)->gso_segs;
+		__entry->gso_type = skb_shinfo(skb)->gso_type;
+	),
+
+	TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x",
+		  __get_str(name), __entry->queue_mapping, __entry->skbaddr,
+		  __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci,
+		  __entry->protocol, __entry->ip_summed, __entry->len,
+		  __entry->data_len, 
+		  __entry->network_offset, __entry->transport_offset_valid,
+		  __entry->transport_offset, __entry->tx_flags,
+		  __entry->gso_size, __entry->gso_segs, __entry->gso_type)
+);
+
 TRACE_EVENT(net_dev_xmit,
 
 	TP_PROTO(struct sk_buff *skb,
diff --git a/net/core/dev.c b/net/core/dev.c
index 1b6eadf..e221963 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2602,6 +2602,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 			dev_queue_xmit_nit(skb, dev);
 
 		skb_len = skb->len;
+		trace_net_dev_start_xmit(skb, dev);
 		rc = ops->ndo_start_xmit(skb, dev);
 		trace_net_dev_xmit(skb, rc, dev, skb_len);
 		if (rc == NETDEV_TX_OK)
@@ -2620,6 +2621,7 @@ gso:
 			dev_queue_xmit_nit(nskb, dev);
 
 		skb_len = nskb->len;
+		trace_net_dev_start_xmit(nskb, dev);
 		rc = ops->ndo_start_xmit(nskb, dev);
 		trace_net_dev_xmit(nskb, rc, dev, skb_len);
 		if (unlikely(rc != NETDEV_TX_OK)) {


-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* [RFC PATCH 0/2] Improve tracing at the driver/core boundary
From: Ben Hutchings @ 2013-10-14 14:39 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers

These patches add static tracepoints at the driver/core boundary which
record various skb fields likely to be useful for datapath debugging.
On the TX side the boundary is where the core calls ndo_start_xmit, and
on the RX side it is where any of the various exported receive functions
is called.

The set of skb fields is mostly based on what I thought would be
interesting for sfc, and may need to be augmented to be more general.

Ben.

Ben Hutchings (2):
  net: Add net_dev_start_xmit trace event, exposing more skb fields
  net: Add trace events for all receive entry points, exposing more skb
    fields

 include/trace/events/net.h | 158 +++++++++++++++++++++++++++++++++++++++++++++
 net/core/dev.c             | 102 ++++++++++++++++++-----------
 2 files changed, 221 insertions(+), 39 deletions(-)

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: [PATCH][net-next] gianfar: Simplify MQ polling to avoid soft lockup
From: Eric Dumazet @ 2013-10-14 14:34 UTC (permalink / raw)
  To: Claudiu Manoil; +Cc: netdev, David S. Miller
In-Reply-To: <1381759509-26882-1-git-send-email-claudiu.manoil@freescale.com>

On Mon, 2013-10-14 at 17:05 +0300, Claudiu Manoil wrote:
> Under certain low traffic conditions, the single core
> devices with multiple Rx/Tx queues (MQ mode) may reach
> soft lockup due to gfar_poll not returning in proper time.
> The following exception was obtained using iperf on a 100Mbit
> half-duplex link, for a p1010 single core device:
> 
> BUG: soft lockup - CPU#0 stuck for 23s! [iperf:2847]
> Modules linked in:
> CPU: 0 PID: 2847 Comm: iperf Not tainted 3.12.0-rc3 #16
> task: e8bf8000 ti: eeb16000 task.ti: ee646000
> NIP: c0255b6c LR: c0367ae8 CTR: c0461c18
> REGS: eeb17e70 TRAP: 0901   Not tainted  (3.12.0-rc3)
> MSR: 00029000 <CE,EE,ME>  CR: 44228428  XER: 20000000
> 
> GPR00: c0367ad4 eeb17f20 e8bf8000 ee01f4b4 00000008 ffffffff ffffffff
> 00000000
> GPR08: 000000c0 00000008 000000ff ffffffc0 000193fe
> NIP [c0255b6c] find_next_bit+0xb8/0xc4
> LR [c0367ae8] gfar_poll+0xc8/0x1d8
> Call Trace:
> [eeb17f20] [c0367ad4] gfar_poll+0xb4/0x1d8 (unreliable)
> [eeb17f70] [c0422100] net_rx_action+0xa4/0x158
> [eeb17fa0] [c003ec6c] __do_softirq+0xcc/0x17c
> [eeb17ff0] [c000c28c] call_do_softirq+0x24/0x3c
> [ee647cc0] [c0004660] do_softirq+0x6c/0x94
> [ee647ce0] [c003eb9c] local_bh_enable+0x9c/0xa0
> [ee647cf0] [c0454fe8] tcp_prequeue_process+0xa4/0xdc
> [ee647d10] [c0457e44] tcp_recvmsg+0x498/0x96c
> [ee647d80] [c047b630] inet_recvmsg+0x40/0x64
> [ee647da0] [c040ca8c] sock_recvmsg+0x90/0xc0
> [ee647e30] [c040edb8] SyS_recvfrom+0x98/0xfc
> 
> To prevent this, the outer while() loop has been removed
> allowing gfar_poll() to return faster even if there's
> still budget left.  Also, there's no need to recompute
> the budget per Rx queue anymore.

It seems there is a race condition, and this patch only makes it happen
less often ?

return faster means what exactly ?

^ permalink raw reply

* [PATCH][net-next] gianfar: Simplify MQ polling to avoid soft lockup
From: Claudiu Manoil @ 2013-10-14 14:05 UTC (permalink / raw)
  To: netdev; +Cc: David S. Miller

Under certain low traffic conditions, the single core
devices with multiple Rx/Tx queues (MQ mode) may reach
soft lockup due to gfar_poll not returning in proper time.
The following exception was obtained using iperf on a 100Mbit
half-duplex link, for a p1010 single core device:

BUG: soft lockup - CPU#0 stuck for 23s! [iperf:2847]
Modules linked in:
CPU: 0 PID: 2847 Comm: iperf Not tainted 3.12.0-rc3 #16
task: e8bf8000 ti: eeb16000 task.ti: ee646000
NIP: c0255b6c LR: c0367ae8 CTR: c0461c18
REGS: eeb17e70 TRAP: 0901   Not tainted  (3.12.0-rc3)
MSR: 00029000 <CE,EE,ME>  CR: 44228428  XER: 20000000

GPR00: c0367ad4 eeb17f20 e8bf8000 ee01f4b4 00000008 ffffffff ffffffff
00000000
GPR08: 000000c0 00000008 000000ff ffffffc0 000193fe
NIP [c0255b6c] find_next_bit+0xb8/0xc4
LR [c0367ae8] gfar_poll+0xc8/0x1d8
Call Trace:
[eeb17f20] [c0367ad4] gfar_poll+0xb4/0x1d8 (unreliable)
[eeb17f70] [c0422100] net_rx_action+0xa4/0x158
[eeb17fa0] [c003ec6c] __do_softirq+0xcc/0x17c
[eeb17ff0] [c000c28c] call_do_softirq+0x24/0x3c
[ee647cc0] [c0004660] do_softirq+0x6c/0x94
[ee647ce0] [c003eb9c] local_bh_enable+0x9c/0xa0
[ee647cf0] [c0454fe8] tcp_prequeue_process+0xa4/0xdc
[ee647d10] [c0457e44] tcp_recvmsg+0x498/0x96c
[ee647d80] [c047b630] inet_recvmsg+0x40/0x64
[ee647da0] [c040ca8c] sock_recvmsg+0x90/0xc0
[ee647e30] [c040edb8] SyS_recvfrom+0x98/0xfc

To prevent this, the outer while() loop has been removed
allowing gfar_poll() to return faster even if there's
still budget left.  Also, there's no need to recompute
the budget per Rx queue anymore.

Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com>
---
 drivers/net/ethernet/freescale/gianfar.c | 87 ++++++++++++++------------------
 1 file changed, 38 insertions(+), 49 deletions(-)

diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
index 9fbe4dd..d6d810c 100644
--- a/drivers/net/ethernet/freescale/gianfar.c
+++ b/drivers/net/ethernet/freescale/gianfar.c
@@ -2918,7 +2918,7 @@ static int gfar_poll(struct napi_struct *napi, int budget)
 	struct gfar_priv_rx_q *rx_queue = NULL;
 	int work_done = 0, work_done_per_q = 0;
 	int i, budget_per_q = 0;
-	int has_tx_work;
+	int has_tx_work = 0;
 	unsigned long rstat_rxf;
 	int num_act_queues;
 
@@ -2933,62 +2933,51 @@ static int gfar_poll(struct napi_struct *napi, int budget)
 	if (num_act_queues)
 		budget_per_q = budget/num_act_queues;
 
-	while (1) {
-		has_tx_work = 0;
-		for_each_set_bit(i, &gfargrp->tx_bit_map, priv->num_tx_queues) {
-			tx_queue = priv->tx_queue[i];
-			/* run Tx cleanup to completion */
-			if (tx_queue->tx_skbuff[tx_queue->skb_dirtytx]) {
-				gfar_clean_tx_ring(tx_queue);
-				has_tx_work = 1;
-			}
+	for_each_set_bit(i, &gfargrp->tx_bit_map, priv->num_tx_queues) {
+		tx_queue = priv->tx_queue[i];
+		/* run Tx cleanup to completion */
+		if (tx_queue->tx_skbuff[tx_queue->skb_dirtytx]) {
+			gfar_clean_tx_ring(tx_queue);
+			has_tx_work = 1;
 		}
+	}
 
-		for_each_set_bit(i, &gfargrp->rx_bit_map, priv->num_rx_queues) {
-			/* skip queue if not active */
-			if (!(rstat_rxf & (RSTAT_CLEAR_RXF0 >> i)))
-				continue;
-
-			rx_queue = priv->rx_queue[i];
-			work_done_per_q =
-				gfar_clean_rx_ring(rx_queue, budget_per_q);
-			work_done += work_done_per_q;
-
-			/* finished processing this queue */
-			if (work_done_per_q < budget_per_q) {
-				/* clear active queue hw indication */
-				gfar_write(&regs->rstat,
-					   RSTAT_CLEAR_RXF0 >> i);
-				rstat_rxf &= ~(RSTAT_CLEAR_RXF0 >> i);
-				num_act_queues--;
-
-				if (!num_act_queues)
-					break;
-				/* recompute budget per Rx queue */
-				budget_per_q =
-					(budget - work_done) / num_act_queues;
-			}
-		}
+	for_each_set_bit(i, &gfargrp->rx_bit_map, priv->num_rx_queues) {
+		/* skip queue if not active */
+		if (!(rstat_rxf & (RSTAT_CLEAR_RXF0 >> i)))
+			continue;
 
-		if (work_done >= budget)
-			break;
+		rx_queue = priv->rx_queue[i];
+		work_done_per_q =
+			gfar_clean_rx_ring(rx_queue, budget_per_q);
+		work_done += work_done_per_q;
+
+		/* finished processing this queue */
+		if (work_done_per_q < budget_per_q) {
+			/* clear active queue hw indication */
+			gfar_write(&regs->rstat,
+				   RSTAT_CLEAR_RXF0 >> i);
+			num_act_queues--;
+
+			if (!num_act_queues)
+				break;
+		}
+	}
 
-		if (!num_act_queues && !has_tx_work) {
+	if (!num_act_queues && !has_tx_work) {
 
-			napi_complete(napi);
+		napi_complete(napi);
 
-			/* Clear the halt bit in RSTAT */
-			gfar_write(&regs->rstat, gfargrp->rstat);
+		/* Clear the halt bit in RSTAT */
+		gfar_write(&regs->rstat, gfargrp->rstat);
 
-			gfar_write(&regs->imask, IMASK_DEFAULT);
+		gfar_write(&regs->imask, IMASK_DEFAULT);
 
-			/* If we are coalescing interrupts, update the timer
-			 * Otherwise, clear it
-			 */
-			gfar_configure_coalescing(priv, gfargrp->rx_bit_map,
-						  gfargrp->tx_bit_map);
-			break;
-		}
+		/* If we are coalescing interrupts, update the timer
+		 * Otherwise, clear it
+		 */
+		gfar_configure_coalescing(priv, gfargrp->rx_bit_map,
+					  gfargrp->tx_bit_map);
 	}
 
 	return work_done;
-- 
1.7.11.7

^ permalink raw reply related

* Re: [PATCH net-next] sctp: Namespacify checksum_disable
From: Vlad Yasevich @ 2013-10-14 14:08 UTC (permalink / raw)
  To: Fan Du, nhorman; +Cc: davem, netdev
In-Reply-To: <1381739545-674-1-git-send-email-fan.du@windriver.com>



Fan Du <fan.du@windriver.com> wrote:

>SCTP CRC32-C checksum computing and verifying should be
>namespace-sensible,
>as each, e.g. tenant instance might need different checksum
>configuration for
>its requirement. So this patch enhance SCTP with this feature.
>
>Signed-off-by: Fan Du <fan.du@windriver.com>

NACK.  We don't want that setting to be sysctl configurable.  It is only useful in very limited circumstances and is not really for production/everyday use.

In fact, I am going to send in a patch that makes this module parameter read only in /sys. 

-vlad
>---
> include/net/netns/sctp.h   |    5 +++++
> include/net/sctp/structs.h |    3 ---
> net/sctp/input.c           |    2 +-
> net/sctp/output.c          |    4 +++-
> net/sctp/protocol.c        |    5 +++--
> net/sctp/sysctl.c          |    7 +++++++
> 6 files changed, 19 insertions(+), 7 deletions(-)
>
>diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
>index 3573a81..704adb3 100644
>--- a/include/net/netns/sctp.h
>+++ b/include/net/netns/sctp.h
>@@ -129,6 +129,11 @@ struct netns_sctp {
> 
> 	/* Threshold for autoclose timeout, in seconds. */
> 	unsigned long max_autoclose;
>+
>+	/* Set to 1 to disable CRC32-C checksum computing and verifying.
>+	 * Default to 0 to enable this feature.
>+	 */
>+	int checksum_disable;
> };
> 
> #endif /* __NETNS_SCTP_H__ */
>diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
>index 2174d8d..820895e 100644
>--- a/include/net/sctp/structs.h
>+++ b/include/net/sctp/structs.h
>@@ -134,9 +134,6 @@ extern struct sctp_globals {
> 	__u16 max_instreams;
> 	__u16 max_outstreams;
> 
>-	/* Flag to indicate whether computing and verifying checksum
>-	 * is disabled. */
>-        bool checksum_disable;
> } sctp_globals;
> 
> #define sctp_max_instreams		(sctp_globals.max_instreams)
>diff --git a/net/sctp/input.c b/net/sctp/input.c
>index 98b69bb..9db2a65 100644
>--- a/net/sctp/input.c
>+++ b/net/sctp/input.c
>@@ -134,7 +134,7 @@ int sctp_rcv(struct sk_buff *skb)
> 	__skb_pull(skb, skb_transport_offset(skb));
> 	if (skb->len < sizeof(struct sctphdr))
> 		goto discard_it;
>-	if (!sctp_checksum_disable && !skb_csum_unnecessary(skb) &&
>+	if (!net->sctp.checksum_disable && !skb_csum_unnecessary(skb) &&
> 		  sctp_rcv_checksum(net, skb) < 0)
> 		goto discard_it;
> 
>diff --git a/net/sctp/output.c b/net/sctp/output.c
>index 6de6402..5d0a45e 100644
>--- a/net/sctp/output.c
>+++ b/net/sctp/output.c
>@@ -395,6 +395,7 @@ int sctp_packet_transmit(struct sctp_packet
>*packet)
> 	struct sk_buff *nskb;
> 	struct sctp_chunk *chunk, *tmp;
> 	struct sock *sk;
>+	struct net *net;
> 	int err = 0;
> 	int padding;		/* How much padding do we need?  */
> 	__u8 has_data = 0;
>@@ -411,6 +412,7 @@ int sctp_packet_transmit(struct sctp_packet
>*packet)
> 	/* Set up convenience variables... */
> 	chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
> 	sk = chunk->skb->sk;
>+	net = sock_net(sk);
> 
> 	/* Allocate the new skb.  */
> 	nskb = alloc_skb(packet->size + LL_MAX_HEADER, GFP_ATOMIC);
>@@ -545,7 +547,7 @@ int sctp_packet_transmit(struct sctp_packet
>*packet)
> 	 * Note: Adler-32 is no longer applicable, as has been replaced
> 	 * by CRC32-C as described in <draft-ietf-tsvwg-sctpcsum-02.txt>.
> 	 */
>-	if (!sctp_checksum_disable) {
>+	if (!net->sctp.checksum_disable) {
> 		if ((!(dst->dev->features & NETIF_F_SCTP_CSUM)) ||
> 			is_xfrm_armed(dst)) {
> 
>diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
>index 5e17092..b3c51ce 100644
>--- a/net/sctp/protocol.c
>+++ b/net/sctp/protocol.c
>@@ -1239,6 +1239,9 @@ static int __net_init sctp_net_init(struct net
>*net)
> 	/* Initialize maximum autoclose timeout. */
> 	net->sctp.max_autoclose		= INT_MAX / HZ;
> 
>+	/* Enable checksum by default. */
>+	net->sctp.checksum_disable = 0;
>+
> 	status = sctp_sysctl_net_register(net);
> 	if (status)
> 		goto err_sysctl_register;
>@@ -1543,6 +1546,4 @@ MODULE_ALIAS("net-pf-" __stringify(PF_INET)
>"-proto-132");
> MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-132");
>MODULE_AUTHOR("Linux Kernel SCTP developers
><linux-sctp@vger.kernel.org>");
> MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)");
>-module_param_named(no_checksums, sctp_checksum_disable, bool, 0644);
>-MODULE_PARM_DESC(no_checksums, "Disable checksums computing and
>verification");
> MODULE_LICENSE("GPL");
>diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
>index 6b36561..d6a6cca 100644
>--- a/net/sctp/sysctl.c
>+++ b/net/sctp/sysctl.c
>@@ -290,6 +290,13 @@ static struct ctl_table sctp_net_table[] = {
> 		.extra1		= &max_autoclose_min,
> 		.extra2		= &max_autoclose_max,
> 	},
>+	{
>+		.procname	= "checksum_disable",
>+		.data		= &init_net.sctp.checksum_disable,
>+		.maxlen		= sizeof(int),
>+		.mode		= 0644,
>+		.proc_handler	= proc_dointvec,
>+	},
> 
> 	{ /* sentinel */ }
> };

-- 
Sent from my Android phone with K-9 Mail. Please excuse my brevity.

^ permalink raw reply

* [PATCH] net: sctp: fix a cacc_saw_newack missetting issue
From: Chang Xiangzhong @ 2013-10-14 13:33 UTC (permalink / raw)
  To: nhorman, vyasevich
  Cc: davem, linux-sctp, netdev, linux-kernel, Chang Xiangzhong

For each TSN t being newly acked (not only cumulatively,
but also SELECTIVELY) cacc_saw_newack should be set to 1.

Signed-off-by: Xiangzhong Chang <changxiangzhong@gmail.com>
---
 net/sctp/outqueue.c |   42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 94df758..d86032b 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -1398,6 +1398,27 @@ static void sctp_check_transmitted(struct sctp_outq *q,
 				forward_progress = true;
 			}
 
+			if (!tchunk->tsn_gap_acked) {
+				/*
+				 * SFR-CACC algorithm:
+				 * 2) If the SACK contains gap acks
+				 * and the flag CHANGEOVER_ACTIVE is
+				 * set the receiver of the SACK MUST
+				 * take the following action:
+				 *
+				 * B) For each TSN t being acked that
+				 * has not been acked in any SACK so
+				 * far, set cacc_saw_newack to 1 for
+				 * the destination that the TSN was
+				 * sent to.
+				 */
+				if (transport &&
+				    sack->num_gap_ack_blocks &&
+				    q->asoc->peer.primary_path->cacc.
+				    changeover_active)
+					transport->cacc.cacc_saw_newack	= 1;
+			}
+
 			if (TSN_lte(tsn, sack_ctsn)) {
 				/* RFC 2960  6.3.2 Retransmission Timer Rules
 				 *
@@ -1411,27 +1432,6 @@ static void sctp_check_transmitted(struct sctp_outq *q,
 				restart_timer = 1;
 				forward_progress = true;
 
-				if (!tchunk->tsn_gap_acked) {
-					/*
-					 * SFR-CACC algorithm:
-					 * 2) If the SACK contains gap acks
-					 * and the flag CHANGEOVER_ACTIVE is
-					 * set the receiver of the SACK MUST
-					 * take the following action:
-					 *
-					 * B) For each TSN t being acked that
-					 * has not been acked in any SACK so
-					 * far, set cacc_saw_newack to 1 for
-					 * the destination that the TSN was
-					 * sent to.
-					 */
-					if (transport &&
-					    sack->num_gap_ack_blocks &&
-					    q->asoc->peer.primary_path->cacc.
-					    changeover_active)
-						transport->cacc.cacc_saw_newack
-							= 1;
-				}
 
 				list_add_tail(&tchunk->transmitted_list,
 					      &q->sacked);
-- 
1.7.9.5

^ permalink raw reply related

* [patch] yam: remove a no-op in yam_ioctl()
From: Dan Carpenter @ 2013-10-14 12:46 UTC (permalink / raw)
  To: Jean-Paul Roubelat; +Cc: linux-hams, netdev, kernel-janitors

We overwrite the ->bitrate with the user supplied information on the
next line.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>

diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c
index 0721e72..5af1c3e 100644
--- a/drivers/net/hamradio/yam.c
+++ b/drivers/net/hamradio/yam.c
@@ -975,7 +975,6 @@ static int yam_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 			return -EINVAL;		/* Cannot change this parameter when up */
 		if ((ym = kmalloc(sizeof(struct yamdrv_ioctl_mcs), GFP_KERNEL)) == NULL)
 			return -ENOBUFS;
-		ym->bitrate = 9600;
 		if (copy_from_user(ym, ifr->ifr_data, sizeof(struct yamdrv_ioctl_mcs))) {
 			kfree(ym);
 			return -EFAULT;

^ permalink raw reply related

* RE: [Xen-devel] [PATCH net-next v4 2/5] xen-netback: add support for IPv6 checksum offload from guest
From: Paul Durrant @ 2013-10-14 12:34 UTC (permalink / raw)
  To: David Vrabel, Wei Liu
  Cc: netdev@vger.kernel.org, Ian Campbell, xen-devel@lists.xen.org
In-Reply-To: <525BE148.1010508@citrix.com>

> -----Original Message-----
> From: David Vrabel
> Sent: 14 October 2013 13:19
> To: Wei Liu
> Cc: Paul Durrant; netdev@vger.kernel.org; Ian Campbell; David Vrabel; xen-
> devel@lists.xen.org
> Subject: Re: [Xen-devel] [PATCH net-next v4 2/5] xen-netback: add support
> for IPv6 checksum offload from guest
> 
> On 14/10/13 11:55, Wei Liu wrote:
> > On Mon, Oct 14, 2013 at 11:49:20AM +0100, Paul Durrant wrote:
> >>> -----Original Message-----
> >>> From: Wei Liu [mailto:wei.liu2@citrix.com]
> >>> Sent: 14 October 2013 11:43
> >>> To: Paul Durrant
> >>> Cc: xen-devel@lists.xen.org; netdev@vger.kernel.org; Wei Liu; David
> Vrabel;
> >>> Ian Campbell
> >>> Subject: Re: [PATCH net-next v4 2/5] xen-netback: add support for IPv6
> >>> checksum offload from guest
> >>>
> >>> On Fri, Oct 11, 2013 at 04:06:19PM +0100, Paul Durrant wrote:
> >>> [...]
> >>>> -/*
> >>>> - * This is the amount of packet we copy rather than map, so that the
> >>>> - * guest can't fiddle with the contents of the headers while we do
> >>>> - * packet processing on them (netfilter, routing, etc).
> >>>> +/* This is a minimum size for the linear area to avoid lots of
> >>>> + * calls to __pskb_pull_tail() as we set up checksum offsets.
> >>>>   */
> >>>
> >>> You seem to forget to explain why 128 is chosen. :-)
> >>
> >> Is that not sufficient explanation? What sort of thing are you looking for?
> >>
> >
> > From the second version of this patch, we had a conversation.
> >
> >> Where does 128 come from?
> >>
> >
> > "It's just an arbitrary power of 2 that was chosen because it seems to
> > cover most likely v6 headers and all v4 headers."
> >
> > So something like: "We choose 128 which is likely to cover most V6
> > headers and all V4 headers" would be sufficient.
> 
> Is "most IPv6 headers" actually good enough?  Don't we need to ensure
> netback copies all IP headers?
> 

It will do if checksum offload is in use, but perhaps the pull as far as the transport header needs to be done anyway? I'm unsure of the expectations of other code.

  Paul

^ permalink raw reply

* [patch] yam: integer underflow in yam_ioctl()
From: Dan Carpenter @ 2013-10-14 12:28 UTC (permalink / raw)
  To: Jean-Paul Roubelat; +Cc: linux-hams, netdev, kernel-janitors

We cap bitrate at YAM_MAXBITRATE in yam_ioctl(), but it could also be
negative.  I don't know the impact of using a negative bitrate but let's
prevent it.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>

diff --git a/include/linux/yam.h b/include/linux/yam.h
index 7fe2822..512cdc2 100644
--- a/include/linux/yam.h
+++ b/include/linux/yam.h
@@ -77,6 +77,6 @@ struct yamdrv_ioctl_cfg {
 
 struct yamdrv_ioctl_mcs {
 	int cmd;
-	int bitrate;
+	unsigned int bitrate;
 	unsigned char bits[YAM_FPGA_SIZE];
 };

^ permalink raw reply related

* Re: [Xen-devel] [PATCH net-next v4 2/5] xen-netback: add support for IPv6 checksum offload from guest
From: David Vrabel @ 2013-10-14 12:19 UTC (permalink / raw)
  To: Wei Liu
  Cc: Paul Durrant, netdev@vger.kernel.org, Ian Campbell, David Vrabel,
	xen-devel@lists.xen.org
In-Reply-To: <20131014105527.GD11739@zion.uk.xensource.com>

On 14/10/13 11:55, Wei Liu wrote:
> On Mon, Oct 14, 2013 at 11:49:20AM +0100, Paul Durrant wrote:
>>> -----Original Message-----
>>> From: Wei Liu [mailto:wei.liu2@citrix.com]
>>> Sent: 14 October 2013 11:43
>>> To: Paul Durrant
>>> Cc: xen-devel@lists.xen.org; netdev@vger.kernel.org; Wei Liu; David Vrabel;
>>> Ian Campbell
>>> Subject: Re: [PATCH net-next v4 2/5] xen-netback: add support for IPv6
>>> checksum offload from guest
>>>
>>> On Fri, Oct 11, 2013 at 04:06:19PM +0100, Paul Durrant wrote:
>>> [...]
>>>> -/*
>>>> - * This is the amount of packet we copy rather than map, so that the
>>>> - * guest can't fiddle with the contents of the headers while we do
>>>> - * packet processing on them (netfilter, routing, etc).
>>>> +/* This is a minimum size for the linear area to avoid lots of
>>>> + * calls to __pskb_pull_tail() as we set up checksum offsets.
>>>>   */
>>>
>>> You seem to forget to explain why 128 is chosen. :-)
>>
>> Is that not sufficient explanation? What sort of thing are you looking for?
>>
> 
> From the second version of this patch, we had a conversation.
> 
>> Where does 128 come from?
>>
> 
> "It's just an arbitrary power of 2 that was chosen because it seems to
> cover most likely v6 headers and all v4 headers."
> 
> So something like: "We choose 128 which is likely to cover most V6
> headers and all V4 headers" would be sufficient.

Is "most IPv6 headers" actually good enough?  Don't we need to ensure
netback copies all IP headers?

David

^ permalink raw reply

* Re: [PATCH] net/ethernet: cpsw: Bugfix interrupts before enabling napi
From: Peter Korsgaard @ 2013-10-14 11:48 UTC (permalink / raw)
  To: Markus Pargmann
  Cc: David S. Miller, Florian Fainelli, Mugunthan V N,
	linux-arm-kernel, netdev, kernel
In-Reply-To: <1381691821-25498-1-git-send-email-mpa@pengutronix.de>

>>>>> "Markus" == Markus Pargmann <mpa@pengutronix.de> writes:

 Markus> If interrupts happen before napi_enable was called, the driver will not
 Markus> work as expected. Network transmissions are impossible in this state.
 Markus> This bug can be reproduced easily by restarting the network interface in
 Markus> a loop. After some time any network transmissions on the network
 Markus> interface will fail.

 Markus> This patch fixes the bug by enabling napi before enabling the network
 Markus> interface interrupts.

 Markus> Signed-off-by: Markus Pargmann <mpa@pengutronix.de>

Acked-by: Peter Korsgaard <jacmet@sunsite.dk>

-- 
Bye, Peter Korsgaard

^ permalink raw reply

* Re: DomU's network interface will hung when Dom0 running 32bit
From: Wei Liu @ 2013-10-14 11:19 UTC (permalink / raw)
  To: jianhai luan; +Cc: Ian Campbell, Wei Liu, xen-devel, netdev
In-Reply-To: <52590DFE.6080203@oracle.com>

On Sat, Oct 12, 2013 at 04:53:18PM +0800, jianhai luan wrote:
> Hi Ian,
>   I meet the DomU's network interface hung issue recently, and have
> been working on the issue from that time. I find that DomU's network
> interface, which send lesser package, will hung if Dom0 running
> 32bit and DomU's up-time is very long.  I think that one jiffies
> overflow bug exist in the function tx_credit_exceeded().
>   I know the inline function time_after_eq(a,b) handles jiffies
> overflow, but the function has one limit: a should be less than (b +
> MAX_SIGNAL_LONG). If a is larger than that value, time_after_eq will
> return false. MAX_SIGNAL_LONG should be 0x7fffffff on a 32-bit
> machine.
>   If DomU's network interface send lesser package (<0.5k/s if
> jiffies=250 and credit_bytes=ULONG_MAX), jiffies will beyond out
> (credit_timeout.expires + MAX_SIGNAL_LONG) and time_after_eq(now,
> next_credit) will failure (should be true). So one timer which will
> not be trigger in short time, and later process will be aborted when
> timer_pending(&vif->credit_timeout) is true. The result will be
> DomU's network interface will be hung in long time (> 40days).
>   Please think about the below scenario:
>   Condition:
>     Dom0 running 32-bit and HZ = 1000
>     vif->credit_timeout->expire = 0xffffffff, vif->remaining_credit
> = 0xffffffff, vif->credit_usec=0 jiffies=0
>     vif receives few packets (DomU sends few packets). If the
> rate is less than 2K/s, consuming 4G (0xffffffff) will take 582.55
> hours. jiffies will be larger than 0x7fffffff. Suppose jiffies =
> 0x800000ff: time_after_eq(0x800000ff, 0xffffffff) will fail, and
> a timer whose expiry is 0xffffffff will be queued in the system. So
> the interface will hang until jiffies counts up to 0xffffffff (which
> will take a very long time).

If I'm not mistaken you meant time_after_eq(now, next_credit) in
netback. How does next_credit become 0xffffffff?

Wei.

> 
>   If some error exist in above explain, please help me point it out.
> 
> Thanks,
> Jason

^ permalink raw reply

* RE: [PATCH net-next v4 1/5] xen-netback: add support for IPv6 checksum offload to guest
From: Paul Durrant @ 2013-10-14 11:10 UTC (permalink / raw)
  To: Ian Campbell
  Cc: xen-devel@lists.xen.org, netdev@vger.kernel.org, Wei Liu,
	David Vrabel
In-Reply-To: <1381748013.24708.102.camel@kazak.uk.xensource.com>

> -----Original Message-----
> From: Ian Campbell
> Sent: 14 October 2013 11:54
> To: Paul Durrant
> Cc: xen-devel@lists.xen.org; netdev@vger.kernel.org; Wei Liu; David Vrabel
> Subject: Re: [PATCH net-next v4 1/5] xen-netback: add support for IPv6
> checksum offload to guest
> 
> On Fri, 2013-10-11 at 16:06 +0100, Paul Durrant wrote:
> > Check xenstore flag feature-ipv6-csum-offload to determine if a
> > guest is happy to accept IPv6 packets with only partial checksum.
> > Also check analogous feature-ip-csum-offload to determine if a
> > guest is happy to accept IPv4 packets with only partial checksum
> > as a replacement for a negated feature-no-csum-offload value and
> > add a comment to deprecate use of feature-no-csum-offload.
> >
> > Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
> > Cc: Wei Liu <wei.liu2@citrix.com>
> > Cc: David Vrabel <david.vrabel@citrix.com>
> > Cc: Ian Campbell <ian.campbell@citrix.com>
> 
> Shouldn't this come later in the series, i.e. after netback is actually
> able to cope with ipv6 offloads?
> 

I guess that's debatable. The patches don't have any dependency relation; offloads to and from the guest are quite independent.

> > diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-
> netback/common.h
> > index 5715318..b4a9a3c 100644
> > --- a/drivers/net/xen-netback/common.h
> > +++ b/drivers/net/xen-netback/common.h
> > @@ -153,7 +153,8 @@ struct xenvif {
> >  	u8 can_sg:1;
> >  	u8 gso:1;
> >  	u8 gso_prefix:1;
> > -	u8 csum:1;
> > +	u8 ip_csum:1;
> > +	u8 ipv6_csum:1;
> 
> Why not ipv4_csum for consistency/unambiguity?
> 

I followed general linux naming conventions e.g. ip_hdr and ipv6_hdr.

> > diff --git a/include/xen/interface/io/netif.h
> b/include/xen/interface/io/netif.h
> > index eb262e3..d9fb44739 100644
> > --- a/include/xen/interface/io/netif.h
> > +++ b/include/xen/interface/io/netif.h
> > @@ -51,6 +51,16 @@
> >   */
> >
> >  /*
> > + * "feature-no-csum-offload" was used to turn off IPv4 TCP/UDP
> checksum
> > + * offload but is now deprecated. Two new feature flags should now be
> used
> > + * to control checksum offload:
> 
> How is a frontend to know which sort of backend it is talking to? Is
> there going to be a feature flag to indicate support for these new
> flags?
> 
> In particular a new frontend running on an old backend needs to know
> that it needs to set no-csum-offload instead of ip-csum-offload somehow.
> 

Good point. Without any version I guess we have to live with the old flag forever. I'll stick with it for v4 and just leave the new one for v6.

  Paul

> > + * "feature-ip-csum-offload" should be used to turn IPv4 TCP/UDP
> checksum
> 
> "ipv4" again?
> 
> > + * offload on or off. If it is missing then the feature is assumed to be on.
> > + * "feature-ipv6-csum-offload" should be used to turn IPv6 TCP/UDP
> checksum
> > + * offload on or off. If it is missing then the feature is assumed to be off.
> > + */
> > +
> > +/*
> >   * This is the 'wire' format for packets:
> >   *  Request 1: xen_netif_tx_request  -- XEN_NETTXF_* (any flags)
> >   * [Request 2: xen_netif_extra_info]    (only if request 1 has
> XEN_NETTXF_extra_info)
> 


^ permalink raw reply

* Re: [PATCH net 2/2] virtio-net: refill only when device is up during setting queues
From: Michael S. Tsirkin @ 2013-10-14 11:09 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, virtualization
In-Reply-To: <1381744595-26881-2-git-send-email-jasowang@redhat.com>

On Mon, Oct 14, 2013 at 05:56:35PM +0800, Jason Wang wrote:
> We used to schedule the refill work unconditionally after changing the
> number of queues. This may lead to an issue if the device is not
> up. Since we only try to cancel the work in ndo_stop(), this may cause
> the refill work to still run after removing the device. Fix this by only
> scheduling the work when the device is up.
> 
> The bug was introduced by commit 9b9cd8024a2882e896c65222aa421d461354e3f2.
> (virtio-net: fix the race between channels setting and refill)
> 
> Cc: Rusty Russell <rusty@rustcorp.com.au>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Signed-off-by: Jason Wang <jasowang@redhat.com>

It bothers me that we look at the flag without any
locks here.
I think we'll need to take the rtnl lock at least
on restore.

> ---
> The patch is needed for 3.10 and above.
> ---
>  drivers/net/virtio_net.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index c4bc1cc..92f0096 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -938,7 +938,9 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
>  		return -EINVAL;
>  	} else {
>  		vi->curr_queue_pairs = queue_pairs;
> -		schedule_delayed_work(&vi->refill, 0);
> +		/* virtnet_open() will refill when device is going to up. */
> +		if (dev->flags & IFF_UP)
> +			schedule_delayed_work(&vi->refill, 0);
>  	}
>  
>  	return 0;
> -- 
> 1.8.1.2

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox