Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 18/51] netfilter: nf_flow_table: track flow tables in nf_flow_table directly
From: Pablo Neira Ayuso @ 2018-05-06 22:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180506224709.29100-1-pablo@netfilter.org>

From: Felix Fietkau <nbd@nbd.name>

Avoids having nf_flow_table depend on nftables (useful for future
iptables backport work)

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h |  1 +
 include/net/netfilter/nf_tables.h     |  3 ---
 net/netfilter/nf_flow_table_core.c    | 21 ++++++++++++++++++---
 net/netfilter/nf_tables_api.c         | 17 -----------------
 4 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index f876e32a60b8..ab408adba688 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -21,6 +21,7 @@ struct nf_flowtable_type {
 };
 
 struct nf_flowtable {
+	struct list_head		list;
 	struct rhashtable		rhashtable;
 	const struct nf_flowtable_type	*type;
 	struct delayed_work		gc_work;
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index cd368d1b8cb8..2f2062ae1c45 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1109,9 +1109,6 @@ struct nft_flowtable {
 struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table,
 						 const struct nlattr *nla,
 						 u8 genmask);
-void nft_flow_table_iterate(struct net *net,
-			    void (*iter)(struct nf_flowtable *flowtable, void *data),
-			    void *data);
 
 void nft_register_flowtable_type(struct nf_flowtable_type *type);
 void nft_unregister_flowtable_type(struct nf_flowtable_type *type);
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 09d1be669c39..e761359b56a9 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -18,6 +18,9 @@ struct flow_offload_entry {
 	struct rcu_head		rcu_head;
 };
 
+static DEFINE_MUTEX(flowtable_lock);
+static LIST_HEAD(flowtables);
+
 static void
 flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
 		      struct nf_flow_route *route,
@@ -410,6 +413,10 @@ int nf_flow_table_init(struct nf_flowtable *flowtable)
 	queue_delayed_work(system_power_efficient_wq,
 			   &flowtable->gc_work, HZ);
 
+	mutex_lock(&flowtable_lock);
+	list_add(&flowtable->list, &flowtables);
+	mutex_unlock(&flowtable_lock);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(nf_flow_table_init);
@@ -425,20 +432,28 @@ static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
 }
 
 static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
-					  void *data)
+					  struct net_device *dev)
 {
-	nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, data);
+	nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
 	flush_delayed_work(&flowtable->gc_work);
 }
 
 void nf_flow_table_cleanup(struct net *net, struct net_device *dev)
 {
-	nft_flow_table_iterate(net, nf_flow_table_iterate_cleanup, dev);
+	struct nf_flowtable *flowtable;
+
+	mutex_lock(&flowtable_lock);
+	list_for_each_entry(flowtable, &flowtables, list)
+		nf_flow_table_iterate_cleanup(flowtable, dev);
+	mutex_unlock(&flowtable_lock);
 }
 EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
 
 void nf_flow_table_free(struct nf_flowtable *flow_table)
 {
+	mutex_lock(&flowtable_lock);
+	list_del(&flow_table->list);
+	mutex_unlock(&flowtable_lock);
 	cancel_delayed_work_sync(&flow_table->gc_work);
 	nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
 	WARN_ON(!nf_flow_offload_gc_step(flow_table));
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 517bb93c00fb..16b67f54b3d2 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5060,23 +5060,6 @@ static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family)
 	return ERR_PTR(-ENOENT);
 }
 
-void nft_flow_table_iterate(struct net *net,
-			    void (*iter)(struct nf_flowtable *flowtable, void *data),
-			    void *data)
-{
-	struct nft_flowtable *flowtable;
-	const struct nft_table *table;
-
-	nfnl_lock(NFNL_SUBSYS_NFTABLES);
-	list_for_each_entry(table, &net->nft.tables, list) {
-		list_for_each_entry(flowtable, &table->flowtables, list) {
-			iter(&flowtable->data, data);
-		}
-	}
-	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-}
-EXPORT_SYMBOL_GPL(nft_flow_table_iterate);
-
 static void nft_unregister_flowtable_net_hooks(struct net *net,
 					       struct nft_flowtable *flowtable)
 {
-- 
2.11.0

^ permalink raw reply related

* [PATCH 19/51] netfilter: nf_flow_table: make flow_offload_dead inline
From: Pablo Neira Ayuso @ 2018-05-06 22:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180506224709.29100-1-pablo@netfilter.org>

From: Felix Fietkau <nbd@nbd.name>

It is too trivial to keep as a separate exported function

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h | 5 ++++-
 net/netfilter/nf_flow_table_core.c    | 6 ------
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index ab408adba688..5aa49524ebef 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -103,7 +103,10 @@ void nf_flow_table_cleanup(struct net *net, struct net_device *dev);
 int nf_flow_table_init(struct nf_flowtable *flow_table);
 void nf_flow_table_free(struct nf_flowtable *flow_table);
 
-void flow_offload_dead(struct flow_offload *flow);
+static inline void flow_offload_dead(struct flow_offload *flow)
+{
+	flow->flags |= FLOW_OFFLOAD_DYING;
+}
 
 int nf_flow_snat_port(const struct flow_offload *flow,
 		      struct sk_buff *skb, unsigned int thoff,
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index e761359b56a9..0d38f20fd226 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -113,12 +113,6 @@ void flow_offload_free(struct flow_offload *flow)
 }
 EXPORT_SYMBOL_GPL(flow_offload_free);
 
-void flow_offload_dead(struct flow_offload *flow)
-{
-	flow->flags |= FLOW_OFFLOAD_DYING;
-}
-EXPORT_SYMBOL_GPL(flow_offload_dead);
-
 static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
 {
 	const struct flow_offload_tuple *tuple = data;
-- 
2.11.0

^ permalink raw reply related

* [PATCH 16/51] netfilter: nf_flow_table: move init code to nf_flow_table_core.c
From: Pablo Neira Ayuso @ 2018-05-06 22:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180506224709.29100-1-pablo@netfilter.org>

From: Felix Fietkau <nbd@nbd.name>

Reduces duplication of .gc and .params in flowtable type definitions and
makes the API clearer

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h   |   6 +-
 net/ipv4/netfilter/nf_flow_table_ipv4.c |   3 +-
 net/ipv6/netfilter/nf_flow_table_ipv6.c |   3 +-
 net/netfilter/nf_flow_table_core.c      | 102 +++++++++++++++++++-------------
 net/netfilter/nf_flow_table_inet.c      |   3 +-
 net/netfilter/nf_tables_api.c           |  22 +++----
 6 files changed, 74 insertions(+), 65 deletions(-)

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 76ee5c81b752..f876e32a60b8 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -14,9 +14,8 @@ struct nf_flowtable;
 struct nf_flowtable_type {
 	struct list_head		list;
 	int				family;
-	void				(*gc)(struct work_struct *work);
+	int				(*init)(struct nf_flowtable *ft);
 	void				(*free)(struct nf_flowtable *ft);
-	const struct rhashtable_params	*params;
 	nf_hookfn			*hook;
 	struct module			*owner;
 };
@@ -100,9 +99,8 @@ int nf_flow_table_iterate(struct nf_flowtable *flow_table,
 
 void nf_flow_table_cleanup(struct net *net, struct net_device *dev);
 
+int nf_flow_table_init(struct nf_flowtable *flow_table);
 void nf_flow_table_free(struct nf_flowtable *flow_table);
-void nf_flow_offload_work_gc(struct work_struct *work);
-extern const struct rhashtable_params nf_flow_offload_rhash_params;
 
 void flow_offload_dead(struct flow_offload *flow);
 
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
index b6e43ff0c7b7..e1e56d7123d2 100644
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -7,8 +7,7 @@
 
 static struct nf_flowtable_type flowtable_ipv4 = {
 	.family		= NFPROTO_IPV4,
-	.params		= &nf_flow_offload_rhash_params,
-	.gc		= nf_flow_offload_work_gc,
+	.init		= nf_flow_table_init,
 	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_ip_hook,
 	.owner		= THIS_MODULE,
diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c
index f1804ce8d561..c511d206bf9b 100644
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -8,8 +8,7 @@
 
 static struct nf_flowtable_type flowtable_ipv6 = {
 	.family		= NFPROTO_IPV6,
-	.params		= &nf_flow_offload_rhash_params,
-	.gc		= nf_flow_offload_work_gc,
+	.init		= nf_flow_table_init,
 	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_ipv6_hook,
 	.owner		= THIS_MODULE,
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 7403a0dfddf7..09d1be669c39 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -116,16 +116,50 @@ void flow_offload_dead(struct flow_offload *flow)
 }
 EXPORT_SYMBOL_GPL(flow_offload_dead);
 
+static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
+{
+	const struct flow_offload_tuple *tuple = data;
+
+	return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
+}
+
+static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
+{
+	const struct flow_offload_tuple_rhash *tuplehash = data;
+
+	return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
+}
+
+static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
+					const void *ptr)
+{
+	const struct flow_offload_tuple *tuple = arg->key;
+	const struct flow_offload_tuple_rhash *x = ptr;
+
+	if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
+		return 1;
+
+	return 0;
+}
+
+static const struct rhashtable_params nf_flow_offload_rhash_params = {
+	.head_offset		= offsetof(struct flow_offload_tuple_rhash, node),
+	.hashfn			= flow_offload_hash,
+	.obj_hashfn		= flow_offload_hash_obj,
+	.obj_cmpfn		= flow_offload_hash_cmp,
+	.automatic_shrinking	= true,
+};
+
 int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
 {
 	flow->timeout = (u32)jiffies;
 
 	rhashtable_insert_fast(&flow_table->rhashtable,
 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
-			       *flow_table->type->params);
+			       nf_flow_offload_rhash_params);
 	rhashtable_insert_fast(&flow_table->rhashtable,
 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
-			       *flow_table->type->params);
+			       nf_flow_offload_rhash_params);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(flow_offload_add);
@@ -135,10 +169,10 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
 {
 	rhashtable_remove_fast(&flow_table->rhashtable,
 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
-			       *flow_table->type->params);
+			       nf_flow_offload_rhash_params);
 	rhashtable_remove_fast(&flow_table->rhashtable,
 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
-			       *flow_table->type->params);
+			       nf_flow_offload_rhash_params);
 
 	flow_offload_free(flow);
 }
@@ -148,7 +182,7 @@ flow_offload_lookup(struct nf_flowtable *flow_table,
 		    struct flow_offload_tuple *tuple)
 {
 	return rhashtable_lookup_fast(&flow_table->rhashtable, tuple,
-				      *flow_table->type->params);
+				      nf_flow_offload_rhash_params);
 }
 EXPORT_SYMBOL_GPL(flow_offload_lookup);
 
@@ -237,7 +271,7 @@ static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
 	return 1;
 }
 
-void nf_flow_offload_work_gc(struct work_struct *work)
+static void nf_flow_offload_work_gc(struct work_struct *work)
 {
 	struct nf_flowtable *flow_table;
 
@@ -245,42 +279,6 @@ void nf_flow_offload_work_gc(struct work_struct *work)
 	nf_flow_offload_gc_step(flow_table);
 	queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
 }
-EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc);
-
-static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
-{
-	const struct flow_offload_tuple *tuple = data;
-
-	return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
-}
-
-static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
-{
-	const struct flow_offload_tuple_rhash *tuplehash = data;
-
-	return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
-}
-
-static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
-					const void *ptr)
-{
-	const struct flow_offload_tuple *tuple = arg->key;
-	const struct flow_offload_tuple_rhash *x = ptr;
-
-	if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
-		return 1;
-
-	return 0;
-}
-
-const struct rhashtable_params nf_flow_offload_rhash_params = {
-	.head_offset		= offsetof(struct flow_offload_tuple_rhash, node),
-	.hashfn			= flow_offload_hash,
-	.obj_hashfn		= flow_offload_hash_obj,
-	.obj_cmpfn		= flow_offload_hash_cmp,
-	.automatic_shrinking	= true,
-};
-EXPORT_SYMBOL_GPL(nf_flow_offload_rhash_params);
 
 static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
 				__be16 port, __be16 new_port)
@@ -398,6 +396,24 @@ int nf_flow_dnat_port(const struct flow_offload *flow,
 }
 EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
 
+int nf_flow_table_init(struct nf_flowtable *flowtable)
+{
+	int err;
+
+	INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
+
+	err = rhashtable_init(&flowtable->rhashtable,
+			      &nf_flow_offload_rhash_params);
+	if (err < 0)
+		return err;
+
+	queue_delayed_work(system_power_efficient_wq,
+			   &flowtable->gc_work, HZ);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_init);
+
 static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
 {
 	struct net_device *dev = data;
@@ -423,8 +439,10 @@ EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
 
 void nf_flow_table_free(struct nf_flowtable *flow_table)
 {
+	cancel_delayed_work_sync(&flow_table->gc_work);
 	nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
 	WARN_ON(!nf_flow_offload_gc_step(flow_table));
+	rhashtable_destroy(&flow_table->rhashtable);
 }
 EXPORT_SYMBOL_GPL(nf_flow_table_free);
 
diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c
index 375a1881d93d..99771aa7e7ea 100644
--- a/net/netfilter/nf_flow_table_inet.c
+++ b/net/netfilter/nf_flow_table_inet.c
@@ -22,8 +22,7 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb,
 
 static struct nf_flowtable_type flowtable_inet = {
 	.family		= NFPROTO_INET,
-	.params		= &nf_flow_offload_rhash_params,
-	.gc		= nf_flow_offload_work_gc,
+	.init		= nf_flow_table_init,
 	.free		= nf_flow_table_free,
 	.hook		= nf_flow_offload_inet_hook,
 	.owner		= THIS_MODULE,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 9134cc429ad4..6cd9955916e5 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5150,14 +5150,14 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 	}
 
 	flowtable->data.type = type;
-	err = rhashtable_init(&flowtable->data.rhashtable, type->params);
+	err = type->init(&flowtable->data);
 	if (err < 0)
 		goto err3;
 
 	err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK],
 					     flowtable);
 	if (err < 0)
-		goto err3;
+		goto err4;
 
 	for (i = 0; i < flowtable->ops_len; i++) {
 		if (!flowtable->ops[i].dev)
@@ -5171,37 +5171,35 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 				if (flowtable->ops[i].dev == ft->ops[k].dev &&
 				    flowtable->ops[i].pf == ft->ops[k].pf) {
 					err = -EBUSY;
-					goto err4;
+					goto err5;
 				}
 			}
 		}
 
 		err = nf_register_net_hook(net, &flowtable->ops[i]);
 		if (err < 0)
-			goto err4;
+			goto err5;
 	}
 
 	err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable);
 	if (err < 0)
-		goto err5;
-
-	INIT_DEFERRABLE_WORK(&flowtable->data.gc_work, type->gc);
-	queue_delayed_work(system_power_efficient_wq,
-			   &flowtable->data.gc_work, HZ);
+		goto err6;
 
 	list_add_tail_rcu(&flowtable->list, &table->flowtables);
 	table->use++;
 
 	return 0;
-err5:
+err6:
 	i = flowtable->ops_len;
-err4:
+err5:
 	for (k = i - 1; k >= 0; k--) {
 		kfree(flowtable->dev_name[k]);
 		nf_unregister_net_hook(net, &flowtable->ops[k]);
 	}
 
 	kfree(flowtable->ops);
+err4:
+	flowtable->data.type->free(&flowtable->data);
 err3:
 	module_put(type->owner);
 err2:
@@ -5485,11 +5483,9 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
 
 static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
 {
-	cancel_delayed_work_sync(&flowtable->data.gc_work);
 	kfree(flowtable->ops);
 	kfree(flowtable->name);
 	flowtable->data.type->free(&flowtable->data);
-	rhashtable_destroy(&flowtable->data.rhashtable);
 	module_put(flowtable->data.type->owner);
 }
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH 17/51] netfilter: nf_flow_table: fix priv pointer for netdev hook
From: Pablo Neira Ayuso @ 2018-05-06 22:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180506224709.29100-1-pablo@netfilter.org>

From: Felix Fietkau <nbd@nbd.name>

The offload ip hook expects a pointer to the flowtable, not to the
rhashtable. Since the rhashtable is the first member, this is safe for
the moment, but breaks as soon as the structure layout changes

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 6cd9955916e5..517bb93c00fb 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5019,7 +5019,7 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx,
 		flowtable->ops[i].pf		= NFPROTO_NETDEV;
 		flowtable->ops[i].hooknum	= hooknum;
 		flowtable->ops[i].priority	= priority;
-		flowtable->ops[i].priv		= &flowtable->data.rhashtable;
+		flowtable->ops[i].priv		= &flowtable->data;
 		flowtable->ops[i].hook		= flowtable->data.type->hook;
 		flowtable->ops[i].dev		= dev_array[i];
 		flowtable->dev_name[i]		= kstrdup(dev_array[i]->name,
-- 
2.11.0

^ permalink raw reply related

* [PATCH 14/51] netfilter: nf_flow_table: move ipv6 offload hook code to nf_flow_table
From: Pablo Neira Ayuso @ 2018-05-06 22:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180506224709.29100-1-pablo@netfilter.org>

From: Felix Fietkau <nbd@nbd.name>

Useful as preparation for adding iptables support for offload.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_flow_table_ipv6.c | 232 --------------------------------
 net/netfilter/nf_flow_table_ip.c        | 215 +++++++++++++++++++++++++++++
 2 files changed, 215 insertions(+), 232 deletions(-)

diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c
index 0e6328490142..f1804ce8d561 100644
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -3,240 +3,8 @@
 #include <linux/module.h>
 #include <linux/netfilter.h>
 #include <linux/rhashtable.h>
-#include <linux/ipv6.h>
-#include <linux/netdevice.h>
-#include <net/ipv6.h>
-#include <net/ip6_route.h>
-#include <net/neighbour.h>
 #include <net/netfilter/nf_flow_table.h>
 #include <net/netfilter/nf_tables.h>
-/* For layer 4 checksum field offset. */
-#include <linux/tcp.h>
-#include <linux/udp.h>
-
-static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
-				struct in6_addr *addr,
-				struct in6_addr *new_addr)
-{
-	struct tcphdr *tcph;
-
-	if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
-	    skb_try_make_writable(skb, thoff + sizeof(*tcph)))
-		return -1;
-
-	tcph = (void *)(skb_network_header(skb) + thoff);
-	inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32,
-				  new_addr->s6_addr32, true);
-
-	return 0;
-}
-
-static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
-				struct in6_addr *addr,
-				struct in6_addr *new_addr)
-{
-	struct udphdr *udph;
-
-	if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
-	    skb_try_make_writable(skb, thoff + sizeof(*udph)))
-		return -1;
-
-	udph = (void *)(skb_network_header(skb) + thoff);
-	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
-		inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32,
-					  new_addr->s6_addr32, true);
-		if (!udph->check)
-			udph->check = CSUM_MANGLED_0;
-	}
-
-	return 0;
-}
-
-static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h,
-				    unsigned int thoff, struct in6_addr *addr,
-				    struct in6_addr *new_addr)
-{
-	switch (ip6h->nexthdr) {
-	case IPPROTO_TCP:
-		if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0)
-			return NF_DROP;
-		break;
-	case IPPROTO_UDP:
-		if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0)
-			return NF_DROP;
-		break;
-	}
-
-	return 0;
-}
-
-static int nf_flow_snat_ipv6(const struct flow_offload *flow,
-			     struct sk_buff *skb, struct ipv6hdr *ip6h,
-			     unsigned int thoff,
-			     enum flow_offload_tuple_dir dir)
-{
-	struct in6_addr addr, new_addr;
-
-	switch (dir) {
-	case FLOW_OFFLOAD_DIR_ORIGINAL:
-		addr = ip6h->saddr;
-		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6;
-		ip6h->saddr = new_addr;
-		break;
-	case FLOW_OFFLOAD_DIR_REPLY:
-		addr = ip6h->daddr;
-		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6;
-		ip6h->daddr = new_addr;
-		break;
-	default:
-		return -1;
-	}
-
-	return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
-}
-
-static int nf_flow_dnat_ipv6(const struct flow_offload *flow,
-			     struct sk_buff *skb, struct ipv6hdr *ip6h,
-			     unsigned int thoff,
-			     enum flow_offload_tuple_dir dir)
-{
-	struct in6_addr addr, new_addr;
-
-	switch (dir) {
-	case FLOW_OFFLOAD_DIR_ORIGINAL:
-		addr = ip6h->daddr;
-		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6;
-		ip6h->daddr = new_addr;
-		break;
-	case FLOW_OFFLOAD_DIR_REPLY:
-		addr = ip6h->saddr;
-		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6;
-		ip6h->saddr = new_addr;
-		break;
-	default:
-		return -1;
-	}
-
-	return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
-}
-
-static int nf_flow_nat_ipv6(const struct flow_offload *flow,
-			    struct sk_buff *skb,
-			    enum flow_offload_tuple_dir dir)
-{
-	struct ipv6hdr *ip6h = ipv6_hdr(skb);
-	unsigned int thoff = sizeof(*ip6h);
-
-	if (flow->flags & FLOW_OFFLOAD_SNAT &&
-	    (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
-	     nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
-		return -1;
-	if (flow->flags & FLOW_OFFLOAD_DNAT &&
-	    (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
-	     nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
-		return -1;
-
-	return 0;
-}
-
-static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
-			      struct flow_offload_tuple *tuple)
-{
-	struct flow_ports *ports;
-	struct ipv6hdr *ip6h;
-	unsigned int thoff;
-
-	if (!pskb_may_pull(skb, sizeof(*ip6h)))
-		return -1;
-
-	ip6h = ipv6_hdr(skb);
-
-	if (ip6h->nexthdr != IPPROTO_TCP &&
-	    ip6h->nexthdr != IPPROTO_UDP)
-		return -1;
-
-	thoff = sizeof(*ip6h);
-	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
-		return -1;
-
-	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
-
-	tuple->src_v6		= ip6h->saddr;
-	tuple->dst_v6		= ip6h->daddr;
-	tuple->src_port		= ports->source;
-	tuple->dst_port		= ports->dest;
-	tuple->l3proto		= AF_INET6;
-	tuple->l4proto		= ip6h->nexthdr;
-	tuple->iifidx		= dev->ifindex;
-
-	return 0;
-}
-
-/* Based on ip_exceeds_mtu(). */
-static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
-{
-	if (skb->len <= mtu)
-		return false;
-
-	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
-		return false;
-
-	return true;
-}
-
-unsigned int
-nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
-			  const struct nf_hook_state *state)
-{
-	struct flow_offload_tuple_rhash *tuplehash;
-	struct nf_flowtable *flow_table = priv;
-	struct flow_offload_tuple tuple = {};
-	enum flow_offload_tuple_dir dir;
-	struct flow_offload *flow;
-	struct net_device *outdev;
-	struct in6_addr *nexthop;
-	struct ipv6hdr *ip6h;
-	struct rt6_info *rt;
-
-	if (skb->protocol != htons(ETH_P_IPV6))
-		return NF_ACCEPT;
-
-	if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0)
-		return NF_ACCEPT;
-
-	tuplehash = flow_offload_lookup(flow_table, &tuple);
-	if (tuplehash == NULL)
-		return NF_ACCEPT;
-
-	outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
-	if (!outdev)
-		return NF_ACCEPT;
-
-	dir = tuplehash->tuple.dir;
-	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
-	rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
-
-	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
-		return NF_ACCEPT;
-
-	if (skb_try_make_writable(skb, sizeof(*ip6h)))
-		return NF_DROP;
-
-	if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
-	    nf_flow_nat_ipv6(flow, skb, dir) < 0)
-		return NF_DROP;
-
-	flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
-	ip6h = ipv6_hdr(skb);
-	ip6h->hop_limit--;
-
-	skb->dev = outdev;
-	nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
-	neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
-
-	return NF_STOLEN;
-}
-EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
 
 static struct nf_flowtable_type flowtable_ipv6 = {
 	.family		= NFPROTO_IPV6,
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 103263e0c7c2..dc570fb7641d 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -4,8 +4,11 @@
 #include <linux/netfilter.h>
 #include <linux/rhashtable.h>
 #include <linux/ip.h>
+#include <linux/ipv6.h>
 #include <linux/netdevice.h>
 #include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
 #include <net/neighbour.h>
 #include <net/netfilter/nf_flow_table.h>
 /* For layer 4 checksum field offset. */
@@ -242,3 +245,215 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 	return NF_STOLEN;
 }
 EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
+
+static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
+				struct in6_addr *addr,
+				struct in6_addr *new_addr)
+{
+	struct tcphdr *tcph;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
+	    skb_try_make_writable(skb, thoff + sizeof(*tcph)))
+		return -1;
+
+	tcph = (void *)(skb_network_header(skb) + thoff);
+	inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32,
+				  new_addr->s6_addr32, true);
+
+	return 0;
+}
+
+static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
+				struct in6_addr *addr,
+				struct in6_addr *new_addr)
+{
+	struct udphdr *udph;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
+	    skb_try_make_writable(skb, thoff + sizeof(*udph)))
+		return -1;
+
+	udph = (void *)(skb_network_header(skb) + thoff);
+	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+		inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32,
+					  new_addr->s6_addr32, true);
+		if (!udph->check)
+			udph->check = CSUM_MANGLED_0;
+	}
+
+	return 0;
+}
+
+static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h,
+				    unsigned int thoff, struct in6_addr *addr,
+				    struct in6_addr *new_addr)
+{
+	switch (ip6h->nexthdr) {
+	case IPPROTO_TCP:
+		if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0)
+			return NF_DROP;
+		break;
+	case IPPROTO_UDP:
+		if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0)
+			return NF_DROP;
+		break;
+	}
+
+	return 0;
+}
+
+static int nf_flow_snat_ipv6(const struct flow_offload *flow,
+			     struct sk_buff *skb, struct ipv6hdr *ip6h,
+			     unsigned int thoff,
+			     enum flow_offload_tuple_dir dir)
+{
+	struct in6_addr addr, new_addr;
+
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		addr = ip6h->saddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6;
+		ip6h->saddr = new_addr;
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		addr = ip6h->daddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6;
+		ip6h->daddr = new_addr;
+		break;
+	default:
+		return -1;
+	}
+
+	return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
+}
+
+static int nf_flow_dnat_ipv6(const struct flow_offload *flow,
+			     struct sk_buff *skb, struct ipv6hdr *ip6h,
+			     unsigned int thoff,
+			     enum flow_offload_tuple_dir dir)
+{
+	struct in6_addr addr, new_addr;
+
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		addr = ip6h->daddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6;
+		ip6h->daddr = new_addr;
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		addr = ip6h->saddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6;
+		ip6h->saddr = new_addr;
+		break;
+	default:
+		return -1;
+	}
+
+	return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
+}
+
+static int nf_flow_nat_ipv6(const struct flow_offload *flow,
+			    struct sk_buff *skb,
+			    enum flow_offload_tuple_dir dir)
+{
+	struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	unsigned int thoff = sizeof(*ip6h);
+
+	if (flow->flags & FLOW_OFFLOAD_SNAT &&
+	    (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
+	     nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
+		return -1;
+	if (flow->flags & FLOW_OFFLOAD_DNAT &&
+	    (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
+	     nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
+		return -1;
+
+	return 0;
+}
+
+static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
+			      struct flow_offload_tuple *tuple)
+{
+	struct flow_ports *ports;
+	struct ipv6hdr *ip6h;
+	unsigned int thoff;
+
+	if (!pskb_may_pull(skb, sizeof(*ip6h)))
+		return -1;
+
+	ip6h = ipv6_hdr(skb);
+
+	if (ip6h->nexthdr != IPPROTO_TCP &&
+	    ip6h->nexthdr != IPPROTO_UDP)
+		return -1;
+
+	thoff = sizeof(*ip6h);
+	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+		return -1;
+
+	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+
+	tuple->src_v6		= ip6h->saddr;
+	tuple->dst_v6		= ip6h->daddr;
+	tuple->src_port		= ports->source;
+	tuple->dst_port		= ports->dest;
+	tuple->l3proto		= AF_INET6;
+	tuple->l4proto		= ip6h->nexthdr;
+	tuple->iifidx		= dev->ifindex;
+
+	return 0;
+}
+
+unsigned int
+nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
+			  const struct nf_hook_state *state)
+{
+	struct flow_offload_tuple_rhash *tuplehash;
+	struct nf_flowtable *flow_table = priv;
+	struct flow_offload_tuple tuple = {};
+	enum flow_offload_tuple_dir dir;
+	struct flow_offload *flow;
+	struct net_device *outdev;
+	struct in6_addr *nexthop;
+	struct ipv6hdr *ip6h;
+	struct rt6_info *rt;
+
+	if (skb->protocol != htons(ETH_P_IPV6))
+		return NF_ACCEPT;
+
+	if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0)
+		return NF_ACCEPT;
+
+	tuplehash = flow_offload_lookup(flow_table, &tuple);
+	if (tuplehash == NULL)
+		return NF_ACCEPT;
+
+	outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
+	if (!outdev)
+		return NF_ACCEPT;
+
+	dir = tuplehash->tuple.dir;
+	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+	rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
+
+	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
+		return NF_ACCEPT;
+
+	if (skb_try_make_writable(skb, sizeof(*ip6h)))
+		return NF_DROP;
+
+	if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
+	    nf_flow_nat_ipv6(flow, skb, dir) < 0)
+		return NF_DROP;
+
+	flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
+	ip6h = ipv6_hdr(skb);
+	ip6h->hop_limit--;
+
+	skb->dev = outdev;
+	nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
+	neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
+
+	return NF_STOLEN;
+}
+EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
-- 
2.11.0

^ permalink raw reply related

* [PATCH 15/51] netfilter: nf_flow_table: relax mixed ipv4/ipv6 flowtable dependencies
From: Pablo Neira Ayuso @ 2018-05-06 22:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180506224709.29100-1-pablo@netfilter.org>

From: Felix Fietkau <nbd@nbd.name>

Since the offload hook code was moved, this table no longer depends on
the IPv4 and IPv6 flowtable modules

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/Kconfig | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 704b3832dbad..d20664b02ae4 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -666,8 +666,7 @@ endif # NF_TABLES
 
 config NF_FLOW_TABLE_INET
 	tristate "Netfilter flow table mixed IPv4/IPv6 module"
-	depends on NF_FLOW_TABLE_IPV4
-	depends on NF_FLOW_TABLE_IPV6
+	depends on NF_FLOW_TABLE
 	help
           This option adds the flow table mixed IPv4/IPv6 support.
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH 13/51] netfilter: nf_flow_table: move ip header check out of nf_flow_exceeds_mtu
From: Pablo Neira Ayuso @ 2018-05-06 22:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180506224709.29100-1-pablo@netfilter.org>

From: Felix Fietkau <nbd@nbd.name>

Allows the function to be shared with the IPv6 hook code

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_flow_table_ip.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 034fda963392..103263e0c7c2 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -182,9 +182,6 @@ static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
 	if (skb->len <= mtu)
 		return false;
 
-	if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)
-		return false;
-
 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 		return false;
 
@@ -223,7 +220,8 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
 	rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
 
-	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
+	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) &&
+	    (ip_hdr(skb)->frag_off & htons(IP_DF)) != 0)
 		return NF_ACCEPT;
 
 	if (skb_try_make_writable(skb, sizeof(*iph)))
-- 
2.11.0

^ permalink raw reply related

* [PATCH 12/51] netfilter: nf_flow_table: move ipv4 offload hook code to nf_flow_table
From: Pablo Neira Ayuso @ 2018-05-06 22:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180506224709.29100-1-pablo@netfilter.org>

From: Felix Fietkau <nbd@nbd.name>

Allows some minor code sharing with the ipv6 hook code and is also
useful as preparation for adding iptables support for offload

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_flow_table_ipv4.c | 241 -------------------------------
 net/netfilter/Makefile                  |   2 +-
 net/netfilter/nf_flow_table_ip.c        | 246 ++++++++++++++++++++++++++++++++
 3 files changed, 247 insertions(+), 242 deletions(-)
 create mode 100644 net/netfilter/nf_flow_table_ip.c

diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
index 461b1815e633..b6e43ff0c7b7 100644
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -2,249 +2,8 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/netfilter.h>
-#include <linux/rhashtable.h>
-#include <linux/ip.h>
-#include <linux/netdevice.h>
-#include <net/ip.h>
-#include <net/neighbour.h>
 #include <net/netfilter/nf_flow_table.h>
 #include <net/netfilter/nf_tables.h>
-/* For layer 4 checksum field offset. */
-#include <linux/tcp.h>
-#include <linux/udp.h>
-
-static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
-			      __be32 addr, __be32 new_addr)
-{
-	struct tcphdr *tcph;
-
-	if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
-	    skb_try_make_writable(skb, thoff + sizeof(*tcph)))
-		return -1;
-
-	tcph = (void *)(skb_network_header(skb) + thoff);
-	inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true);
-
-	return 0;
-}
-
-static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
-			      __be32 addr, __be32 new_addr)
-{
-	struct udphdr *udph;
-
-	if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
-	    skb_try_make_writable(skb, thoff + sizeof(*udph)))
-		return -1;
-
-	udph = (void *)(skb_network_header(skb) + thoff);
-	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
-		inet_proto_csum_replace4(&udph->check, skb, addr,
-					 new_addr, true);
-		if (!udph->check)
-			udph->check = CSUM_MANGLED_0;
-	}
-
-	return 0;
-}
-
-static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
-				  unsigned int thoff, __be32 addr,
-				  __be32 new_addr)
-{
-	switch (iph->protocol) {
-	case IPPROTO_TCP:
-		if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0)
-			return NF_DROP;
-		break;
-	case IPPROTO_UDP:
-		if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0)
-			return NF_DROP;
-		break;
-	}
-
-	return 0;
-}
-
-static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
-			   struct iphdr *iph, unsigned int thoff,
-			   enum flow_offload_tuple_dir dir)
-{
-	__be32 addr, new_addr;
-
-	switch (dir) {
-	case FLOW_OFFLOAD_DIR_ORIGINAL:
-		addr = iph->saddr;
-		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr;
-		iph->saddr = new_addr;
-		break;
-	case FLOW_OFFLOAD_DIR_REPLY:
-		addr = iph->daddr;
-		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
-		iph->daddr = new_addr;
-		break;
-	default:
-		return -1;
-	}
-	csum_replace4(&iph->check, addr, new_addr);
-
-	return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
-}
-
-static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
-			   struct iphdr *iph, unsigned int thoff,
-			   enum flow_offload_tuple_dir dir)
-{
-	__be32 addr, new_addr;
-
-	switch (dir) {
-	case FLOW_OFFLOAD_DIR_ORIGINAL:
-		addr = iph->daddr;
-		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr;
-		iph->daddr = new_addr;
-		break;
-	case FLOW_OFFLOAD_DIR_REPLY:
-		addr = iph->saddr;
-		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
-		iph->saddr = new_addr;
-		break;
-	default:
-		return -1;
-	}
-	csum_replace4(&iph->check, addr, new_addr);
-
-	return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
-}
-
-static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
-			  enum flow_offload_tuple_dir dir)
-{
-	struct iphdr *iph = ip_hdr(skb);
-	unsigned int thoff = iph->ihl * 4;
-
-	if (flow->flags & FLOW_OFFLOAD_SNAT &&
-	    (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
-	     nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0))
-		return -1;
-	if (flow->flags & FLOW_OFFLOAD_DNAT &&
-	    (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
-	     nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0))
-		return -1;
-
-	return 0;
-}
-
-static bool ip_has_options(unsigned int thoff)
-{
-	return thoff != sizeof(struct iphdr);
-}
-
-static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
-			    struct flow_offload_tuple *tuple)
-{
-	struct flow_ports *ports;
-	unsigned int thoff;
-	struct iphdr *iph;
-
-	if (!pskb_may_pull(skb, sizeof(*iph)))
-		return -1;
-
-	iph = ip_hdr(skb);
-	thoff = iph->ihl * 4;
-
-	if (ip_is_fragment(iph) ||
-	    unlikely(ip_has_options(thoff)))
-		return -1;
-
-	if (iph->protocol != IPPROTO_TCP &&
-	    iph->protocol != IPPROTO_UDP)
-		return -1;
-
-	thoff = iph->ihl * 4;
-	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
-		return -1;
-
-	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
-
-	tuple->src_v4.s_addr	= iph->saddr;
-	tuple->dst_v4.s_addr	= iph->daddr;
-	tuple->src_port		= ports->source;
-	tuple->dst_port		= ports->dest;
-	tuple->l3proto		= AF_INET;
-	tuple->l4proto		= iph->protocol;
-	tuple->iifidx		= dev->ifindex;
-
-	return 0;
-}
-
-/* Based on ip_exceeds_mtu(). */
-static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
-{
-	if (skb->len <= mtu)
-		return false;
-
-	if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)
-		return false;
-
-	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
-		return false;
-
-	return true;
-}
-
-unsigned int
-nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
-			const struct nf_hook_state *state)
-{
-	struct flow_offload_tuple_rhash *tuplehash;
-	struct nf_flowtable *flow_table = priv;
-	struct flow_offload_tuple tuple = {};
-	enum flow_offload_tuple_dir dir;
-	struct flow_offload *flow;
-	struct net_device *outdev;
-	const struct rtable *rt;
-	struct iphdr *iph;
-	__be32 nexthop;
-
-	if (skb->protocol != htons(ETH_P_IP))
-		return NF_ACCEPT;
-
-	if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0)
-		return NF_ACCEPT;
-
-	tuplehash = flow_offload_lookup(flow_table, &tuple);
-	if (tuplehash == NULL)
-		return NF_ACCEPT;
-
-	outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
-	if (!outdev)
-		return NF_ACCEPT;
-
-	dir = tuplehash->tuple.dir;
-	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
-	rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
-
-	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
-		return NF_ACCEPT;
-
-	if (skb_try_make_writable(skb, sizeof(*iph)))
-		return NF_DROP;
-
-	if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
-	    nf_flow_nat_ip(flow, skb, dir) < 0)
-		return NF_DROP;
-
-	flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
-	iph = ip_hdr(skb);
-	ip_decrease_ttl(iph);
-
-	skb->dev = outdev;
-	nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
-	neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
-
-	return NF_STOLEN;
-}
-EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
 
 static struct nf_flowtable_type flowtable_ipv4 = {
 	.family		= NFPROTO_IPV4,
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 700c5d51e405..3103ed1efe17 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -111,7 +111,7 @@ obj-$(CONFIG_NFT_FWD_NETDEV)	+= nft_fwd_netdev.o
 
 # flow table infrastructure
 obj-$(CONFIG_NF_FLOW_TABLE)	+= nf_flow_table.o
-nf_flow_table-objs := nf_flow_table_core.o
+nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o
 
 obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
 
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
new file mode 100644
index 000000000000..034fda963392
--- /dev/null
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -0,0 +1,246 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/neighbour.h>
+#include <net/netfilter/nf_flow_table.h>
+/* For layer 4 checksum field offset. */
+#include <linux/tcp.h>
+#include <linux/udp.h>
+
+static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
+			      __be32 addr, __be32 new_addr)
+{
+	struct tcphdr *tcph;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
+	    skb_try_make_writable(skb, thoff + sizeof(*tcph)))
+		return -1;
+
+	tcph = (void *)(skb_network_header(skb) + thoff);
+	inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true);
+
+	return 0;
+}
+
+static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
+			      __be32 addr, __be32 new_addr)
+{
+	struct udphdr *udph;
+
+	if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
+	    skb_try_make_writable(skb, thoff + sizeof(*udph)))
+		return -1;
+
+	udph = (void *)(skb_network_header(skb) + thoff);
+	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+		inet_proto_csum_replace4(&udph->check, skb, addr,
+					 new_addr, true);
+		if (!udph->check)
+			udph->check = CSUM_MANGLED_0;
+	}
+
+	return 0;
+}
+
+static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
+				  unsigned int thoff, __be32 addr,
+				  __be32 new_addr)
+{
+	switch (iph->protocol) {
+	case IPPROTO_TCP:
+		if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0)
+			return NF_DROP;
+		break;
+	case IPPROTO_UDP:
+		if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0)
+			return NF_DROP;
+		break;
+	}
+
+	return 0;
+}
+
+static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+			   struct iphdr *iph, unsigned int thoff,
+			   enum flow_offload_tuple_dir dir)
+{
+	__be32 addr, new_addr;
+
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		addr = iph->saddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr;
+		iph->saddr = new_addr;
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		addr = iph->daddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
+		iph->daddr = new_addr;
+		break;
+	default:
+		return -1;
+	}
+	csum_replace4(&iph->check, addr, new_addr);
+
+	return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+}
+
+static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+			   struct iphdr *iph, unsigned int thoff,
+			   enum flow_offload_tuple_dir dir)
+{
+	__be32 addr, new_addr;
+
+	switch (dir) {
+	case FLOW_OFFLOAD_DIR_ORIGINAL:
+		addr = iph->daddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr;
+		iph->daddr = new_addr;
+		break;
+	case FLOW_OFFLOAD_DIR_REPLY:
+		addr = iph->saddr;
+		new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
+		iph->saddr = new_addr;
+		break;
+	default:
+		return -1;
+	}
+	csum_replace4(&iph->check, addr, new_addr);
+
+	return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+}
+
+static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+			  enum flow_offload_tuple_dir dir)
+{
+	struct iphdr *iph = ip_hdr(skb);
+	unsigned int thoff = iph->ihl * 4;
+
+	if (flow->flags & FLOW_OFFLOAD_SNAT &&
+	    (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
+	     nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0))
+		return -1;
+	if (flow->flags & FLOW_OFFLOAD_DNAT &&
+	    (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
+	     nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0))
+		return -1;
+
+	return 0;
+}
+
+static bool ip_has_options(unsigned int thoff)
+{
+	return thoff != sizeof(struct iphdr);
+}
+
+static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
+			    struct flow_offload_tuple *tuple)
+{
+	struct flow_ports *ports;
+	unsigned int thoff;
+	struct iphdr *iph;
+
+	if (!pskb_may_pull(skb, sizeof(*iph)))
+		return -1;
+
+	iph = ip_hdr(skb);
+	thoff = iph->ihl * 4;
+
+	if (ip_is_fragment(iph) ||
+	    unlikely(ip_has_options(thoff)))
+		return -1;
+
+	if (iph->protocol != IPPROTO_TCP &&
+	    iph->protocol != IPPROTO_UDP)
+		return -1;
+
+	thoff = iph->ihl * 4;
+	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+		return -1;
+
+	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+
+	tuple->src_v4.s_addr	= iph->saddr;
+	tuple->dst_v4.s_addr	= iph->daddr;
+	tuple->src_port		= ports->source;
+	tuple->dst_port		= ports->dest;
+	tuple->l3proto		= AF_INET;
+	tuple->l4proto		= iph->protocol;
+	tuple->iifidx		= dev->ifindex;
+
+	return 0;
+}
+
+/* Based on ip_exceeds_mtu(). */
+static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+{
+	if (skb->len <= mtu)
+		return false;
+
+	if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)
+		return false;
+
+	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
+		return false;
+
+	return true;
+}
+
+unsigned int
+nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
+			const struct nf_hook_state *state)
+{
+	struct flow_offload_tuple_rhash *tuplehash;
+	struct nf_flowtable *flow_table = priv;
+	struct flow_offload_tuple tuple = {};
+	enum flow_offload_tuple_dir dir;
+	struct flow_offload *flow;
+	struct net_device *outdev;
+	const struct rtable *rt;
+	struct iphdr *iph;
+	__be32 nexthop;
+
+	if (skb->protocol != htons(ETH_P_IP))
+		return NF_ACCEPT;
+
+	if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0)
+		return NF_ACCEPT;
+
+	tuplehash = flow_offload_lookup(flow_table, &tuple);
+	if (tuplehash == NULL)
+		return NF_ACCEPT;
+
+	outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
+	if (!outdev)
+		return NF_ACCEPT;
+
+	dir = tuplehash->tuple.dir;
+	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+	rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
+
+	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
+		return NF_ACCEPT;
+
+	if (skb_try_make_writable(skb, sizeof(*iph)))
+		return NF_DROP;
+
+	if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
+	    nf_flow_nat_ip(flow, skb, dir) < 0)
+		return NF_DROP;
+
+	flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
+	iph = ip_hdr(skb);
+	ip_decrease_ttl(iph);
+
+	skb->dev = outdev;
+	nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
+	neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
+
+	return NF_STOLEN;
+}
+EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
-- 
2.11.0

^ permalink raw reply related

* [PATCH 11/51] netfilter: nf_flow_table: rename nf_flow_table.c to nf_flow_table_core.c
From: Pablo Neira Ayuso @ 2018-05-06 22:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180506224709.29100-1-pablo@netfilter.org>

From: Felix Fietkau <nbd@nbd.name>

Preparation for adding more code to the same module

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/Makefile                                  | 2 ++
 net/netfilter/{nf_flow_table.c => nf_flow_table_core.c} | 0
 2 files changed, 2 insertions(+)
 rename net/netfilter/{nf_flow_table.c => nf_flow_table_core.c} (100%)

diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index fd32bd2c9521..700c5d51e405 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -111,6 +111,8 @@ obj-$(CONFIG_NFT_FWD_NETDEV)	+= nft_fwd_netdev.o
 
 # flow table infrastructure
 obj-$(CONFIG_NF_FLOW_TABLE)	+= nf_flow_table.o
+nf_flow_table-objs := nf_flow_table_core.o
+
 obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
 
 # generic X tables 
diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table_core.c
similarity index 100%
rename from net/netfilter/nf_flow_table.c
rename to net/netfilter/nf_flow_table_core.c
-- 
2.11.0

^ permalink raw reply related

* [PATCH 10/51] netfilter: nf_flow_table: cache mtu in struct flow_offload_tuple
From: Pablo Neira Ayuso @ 2018-05-06 22:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180506224709.29100-1-pablo@netfilter.org>

From: Felix Fietkau <nbd@nbd.name>

Reduces the number of cache lines touched in the offload forwarding
path. This is safe because PMTU limits are bypassed for the forwarding
path (see commit f87c10a8aa1e for more details).

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h   |  2 ++
 net/ipv4/netfilter/nf_flow_table_ipv4.c | 17 +++--------------
 net/ipv6/netfilter/nf_flow_table_ipv6.c | 17 +++--------------
 net/netfilter/nf_flow_table.c           |  8 ++++++--
 4 files changed, 14 insertions(+), 30 deletions(-)

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 09ba67598991..76ee5c81b752 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -55,6 +55,8 @@ struct flow_offload_tuple {
 
 	int				oifidx;
 
+	u16				mtu;
+
 	struct dst_entry		*dst_cache;
 };
 
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
index 0cd46bffa469..461b1815e633 100644
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -178,7 +178,7 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
 }
 
 /* Based on ip_exceeds_mtu(). */
-static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
 {
 	if (skb->len <= mtu)
 		return false;
@@ -192,17 +192,6 @@ static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
 	return true;
 }
 
-static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rtable *rt)
-{
-	u32 mtu;
-
-	mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
-	if (__nf_flow_exceeds_mtu(skb, mtu))
-		return true;
-
-	return false;
-}
-
 unsigned int
 nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 			const struct nf_hook_state *state)
@@ -233,9 +222,9 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 
 	dir = tuplehash->tuple.dir;
 	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
-
 	rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
-	if (unlikely(nf_flow_exceeds_mtu(skb, rt)))
+
+	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
 		return NF_ACCEPT;
 
 	if (skb_try_make_writable(skb, sizeof(*iph)))
diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c
index 207cb35569b1..0e6328490142 100644
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -173,7 +173,7 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
 }
 
 /* Based on ip_exceeds_mtu(). */
-static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
 {
 	if (skb->len <= mtu)
 		return false;
@@ -184,17 +184,6 @@ static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
 	return true;
 }
 
-static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rt6_info *rt)
-{
-	u32 mtu;
-
-	mtu = ip6_dst_mtu_forward(&rt->dst);
-	if (__nf_flow_exceeds_mtu(skb, mtu))
-		return true;
-
-	return false;
-}
-
 unsigned int
 nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 			  const struct nf_hook_state *state)
@@ -225,9 +214,9 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 
 	dir = tuplehash->tuple.dir;
 	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
-
 	rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
-	if (unlikely(nf_flow_exceeds_mtu(skb, rt)))
+
+	if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
 		return NF_ACCEPT;
 
 	if (skb_try_make_writable(skb, sizeof(*ip6h)))
diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table.c
index db0673a40b97..7403a0dfddf7 100644
--- a/net/netfilter/nf_flow_table.c
+++ b/net/netfilter/nf_flow_table.c
@@ -4,6 +4,8 @@
 #include <linux/netfilter.h>
 #include <linux/rhashtable.h>
 #include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/ip6_route.h>
 #include <net/netfilter/nf_tables.h>
 #include <net/netfilter/nf_flow_table.h>
 #include <net/netfilter/nf_conntrack.h>
@@ -23,6 +25,7 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
 {
 	struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
 	struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple;
+	struct dst_entry *dst = route->tuple[dir].dst;
 
 	ft->dir = dir;
 
@@ -30,10 +33,12 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
 	case NFPROTO_IPV4:
 		ft->src_v4 = ctt->src.u3.in;
 		ft->dst_v4 = ctt->dst.u3.in;
+		ft->mtu = ip_dst_mtu_maybe_forward(dst, true);
 		break;
 	case NFPROTO_IPV6:
 		ft->src_v6 = ctt->src.u3.in6;
 		ft->dst_v6 = ctt->dst.u3.in6;
+		ft->mtu = ip6_dst_mtu_forward(dst);
 		break;
 	}
 
@@ -44,8 +49,7 @@ flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
 
 	ft->iifidx = route->tuple[dir].ifindex;
 	ft->oifidx = route->tuple[!dir].ifindex;
-
-	ft->dst_cache = route->tuple[dir].dst;
+	ft->dst_cache = dst;
 }
 
 struct flow_offload *
-- 
2.11.0

^ permalink raw reply related

* [PATCH 08/51] netfilter: nf_flow_table: clean up flow_offload_alloc
From: Pablo Neira Ayuso @ 2018-05-06 22:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180506224709.29100-1-pablo@netfilter.org>

From: Felix Fietkau <nbd@nbd.name>

Reduce code duplication and make it much easier to read

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_flow_table.c | 93 ++++++++++++++++---------------------------
 1 file changed, 34 insertions(+), 59 deletions(-)

diff --git a/net/netfilter/nf_flow_table.c b/net/netfilter/nf_flow_table.c
index ec410cae9307..db0673a40b97 100644
--- a/net/netfilter/nf_flow_table.c
+++ b/net/netfilter/nf_flow_table.c
@@ -16,6 +16,38 @@ struct flow_offload_entry {
 	struct rcu_head		rcu_head;
 };
 
+static void
+flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
+		      struct nf_flow_route *route,
+		      enum flow_offload_tuple_dir dir)
+{
+	struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
+	struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple;
+
+	ft->dir = dir;
+
+	switch (ctt->src.l3num) {
+	case NFPROTO_IPV4:
+		ft->src_v4 = ctt->src.u3.in;
+		ft->dst_v4 = ctt->dst.u3.in;
+		break;
+	case NFPROTO_IPV6:
+		ft->src_v6 = ctt->src.u3.in6;
+		ft->dst_v6 = ctt->dst.u3.in6;
+		break;
+	}
+
+	ft->l3proto = ctt->src.l3num;
+	ft->l4proto = ctt->dst.protonum;
+	ft->src_port = ctt->src.u.tcp.port;
+	ft->dst_port = ctt->dst.u.tcp.port;
+
+	ft->iifidx = route->tuple[dir].ifindex;
+	ft->oifidx = route->tuple[!dir].ifindex;
+
+	ft->dst_cache = route->tuple[dir].dst;
+}
+
 struct flow_offload *
 flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
 {
@@ -40,65 +72,8 @@ flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
 
 	entry->ct = ct;
 
-	switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num) {
-	case NFPROTO_IPV4:
-		flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4 =
-			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in;
-		flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4 =
-			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in;
-		flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4 =
-			ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in;
-		flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4 =
-			ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in;
-		break;
-	case NFPROTO_IPV6:
-		flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6 =
-			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6;
-		flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6 =
-			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6;
-		flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6 =
-			ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in6;
-		flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6 =
-			ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in6;
-		break;
-	}
-
-	flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l3proto =
-		ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
-	flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto =
-		ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
-	flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l3proto =
-		ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
-	flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l4proto =
-		ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
-
-	flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache =
-		  route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst;
-	flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache =
-		  route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst;
-
-	flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port =
-		ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port;
-	flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port =
-		ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port;
-	flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port =
-		ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.tcp.port;
-	flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port =
-		ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;
-
-	flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dir =
-						FLOW_OFFLOAD_DIR_ORIGINAL;
-	flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dir =
-						FLOW_OFFLOAD_DIR_REPLY;
-
-	flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx =
-		route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex;
-	flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.oifidx =
-		route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex;
-	flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.iifidx =
-		route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex;
-	flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.oifidx =
-		route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex;
+	flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_ORIGINAL);
+	flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_REPLY);
 
 	if (ct->status & IPS_SRC_NAT)
 		flow->flags |= FLOW_OFFLOAD_SNAT;
-- 
2.11.0

^ permalink raw reply related

* [PATCH 09/51] ipv6: make ip6_dst_mtu_forward inline
From: Pablo Neira Ayuso @ 2018-05-06 22:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180506224709.29100-1-pablo@netfilter.org>

From: Felix Fietkau <nbd@nbd.name>

Just like ip_dst_mtu_maybe_forward(), to avoid a dependency with ipv6.ko.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip6_route.h | 21 +++++++++++++++++++++
 include/net/ipv6.h      |  2 --
 net/ipv6/ip6_output.c   | 22 ----------------------
 3 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index d5fb1e4ae7ac..376928c26d2d 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -279,6 +279,27 @@ static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *
 	       !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate);
 }
 
+static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
+{
+	struct inet6_dev *idev;
+	unsigned int mtu;
+
+	if (dst_metric_locked(dst, RTAX_MTU)) {
+		mtu = dst_metric_raw(dst, RTAX_MTU);
+		if (mtu)
+			return mtu;
+	}
+
+	mtu = IPV6_MIN_MTU;
+	rcu_read_lock();
+	idev = __in6_dev_get(dst->dev);
+	if (idev)
+		mtu = idev->cnf.mtu6;
+	rcu_read_unlock();
+
+	return mtu;
+}
+
 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
 				   struct net_device *dev, struct sk_buff *skb,
 				   const void *daddr);
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 68b167d98879..765441867cfa 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -958,8 +958,6 @@ static inline struct sk_buff *ip6_finish_skb(struct sock *sk)
 			      &inet6_sk(sk)->cork);
 }
 
-unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst);
-
 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
 		   struct flowi6 *fl6);
 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 3db47986ef38..cec49e137dbb 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -383,28 +383,6 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 	return dst_output(net, sk, skb);
 }
 
-unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
-{
-	unsigned int mtu;
-	struct inet6_dev *idev;
-
-	if (dst_metric_locked(dst, RTAX_MTU)) {
-		mtu = dst_metric_raw(dst, RTAX_MTU);
-		if (mtu)
-			return mtu;
-	}
-
-	mtu = IPV6_MIN_MTU;
-	rcu_read_lock();
-	idev = __in6_dev_get(dst->dev);
-	if (idev)
-		mtu = idev->cnf.mtu6;
-	rcu_read_unlock();
-
-	return mtu;
-}
-EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward);
-
 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 {
 	if (skb->len <= mtu)
-- 
2.11.0

^ permalink raw reply related

* [PATCH 07/51] netfilter: nf_flow_table: use IP_CT_DIR_* values for FLOW_OFFLOAD_DIR_*
From: Pablo Neira Ayuso @ 2018-05-06 22:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180506224709.29100-1-pablo@netfilter.org>

From: Felix Fietkau <nbd@nbd.name>

Simplifies further code cleanups

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_flow_table.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index 833752dd0c58..09ba67598991 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -6,6 +6,7 @@
 #include <linux/netdevice.h>
 #include <linux/rhashtable.h>
 #include <linux/rcupdate.h>
+#include <linux/netfilter/nf_conntrack_tuple_common.h>
 #include <net/dst.h>
 
 struct nf_flowtable;
@@ -27,11 +28,10 @@ struct nf_flowtable {
 };
 
 enum flow_offload_tuple_dir {
-	FLOW_OFFLOAD_DIR_ORIGINAL,
-	FLOW_OFFLOAD_DIR_REPLY,
-	__FLOW_OFFLOAD_DIR_MAX		= FLOW_OFFLOAD_DIR_REPLY,
+	FLOW_OFFLOAD_DIR_ORIGINAL = IP_CT_DIR_ORIGINAL,
+	FLOW_OFFLOAD_DIR_REPLY = IP_CT_DIR_REPLY,
+	FLOW_OFFLOAD_DIR_MAX = IP_CT_DIR_MAX
 };
-#define FLOW_OFFLOAD_DIR_MAX	(__FLOW_OFFLOAD_DIR_MAX + 1)
 
 struct flow_offload_tuple {
 	union {
-- 
2.11.0

^ permalink raw reply related

* [PATCH 06/51] netfilter: xt_NFLOG: use nf_log_packet instead of nfulnl_log_packet.
From: Pablo Neira Ayuso @ 2018-05-06 22:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180506224709.29100-1-pablo@netfilter.org>

From: Taehee Yoo <ap420073@gmail.com>

The nfulnl_log_packet() is added to make sure that the NFLOG target
works as only user-space logger. but now, nf_log_packet() can find proper
log function using NF_LOG_TYPE_ULOG and NF_LOG_TYPE_LOG.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nfnetlink_log.h | 17 -----------------
 net/netfilter/nfnetlink_log.c         |  8 +++-----
 net/netfilter/xt_NFLOG.c              | 15 +++++++++++----
 3 files changed, 14 insertions(+), 26 deletions(-)

diff --git a/include/net/netfilter/nfnetlink_log.h b/include/net/netfilter/nfnetlink_log.h
index 612cfb63ac68..ea32a7d3cf1b 100644
--- a/include/net/netfilter/nfnetlink_log.h
+++ b/include/net/netfilter/nfnetlink_log.h
@@ -1,18 +1 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _KER_NFNETLINK_LOG_H
-#define _KER_NFNETLINK_LOG_H
-
-void
-nfulnl_log_packet(struct net *net,
-		  u_int8_t pf,
-		  unsigned int hooknum,
-		  const struct sk_buff *skb,
-		  const struct net_device *in,
-		  const struct net_device *out,
-		  const struct nf_loginfo *li_user,
-		  const char *prefix);
-
-#define NFULNL_COPY_DISABLED    0xff
-
-#endif /* _KER_NFNETLINK_LOG_H */
-
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 7b46aa4c478d..e5cc4d9b9ce7 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -37,7 +37,6 @@
 #include <net/sock.h>
 #include <net/netfilter/nf_log.h>
 #include <net/netns/generic.h>
-#include <net/netfilter/nfnetlink_log.h>
 
 #include <linux/atomic.h>
 #include <linux/refcount.h>
@@ -47,6 +46,7 @@
 #include "../bridge/br_private.h"
 #endif
 
+#define NFULNL_COPY_DISABLED	0xff
 #define NFULNL_NLBUFSIZ_DEFAULT	NLMSG_GOODSIZE
 #define NFULNL_TIMEOUT_DEFAULT 	100	/* every second */
 #define NFULNL_QTHRESH_DEFAULT 	100	/* 100 packets */
@@ -618,7 +618,7 @@ static const struct nf_loginfo default_loginfo = {
 };
 
 /* log handler for internal netfilter logging api */
-void
+static void
 nfulnl_log_packet(struct net *net,
 		  u_int8_t pf,
 		  unsigned int hooknum,
@@ -633,7 +633,7 @@ nfulnl_log_packet(struct net *net,
 	struct nfulnl_instance *inst;
 	const struct nf_loginfo *li;
 	unsigned int qthreshold;
-	unsigned int plen;
+	unsigned int plen = 0;
 	struct nfnl_log_net *log = nfnl_log_pernet(net);
 	const struct nfnl_ct_hook *nfnl_ct = NULL;
 	struct nf_conn *ct = NULL;
@@ -648,7 +648,6 @@ nfulnl_log_packet(struct net *net,
 	if (!inst)
 		return;
 
-	plen = 0;
 	if (prefix)
 		plen = strlen(prefix) + 1;
 
@@ -760,7 +759,6 @@ nfulnl_log_packet(struct net *net,
 	/* FIXME: statistics */
 	goto unlock_and_release;
 }
-EXPORT_SYMBOL_GPL(nfulnl_log_packet);
 
 static int
 nfulnl_rcv_nl_event(struct notifier_block *this,
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index c7f8958cea4a..1ed0cac585c4 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -13,7 +13,6 @@
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_NFLOG.h>
 #include <net/netfilter/nf_log.h>
-#include <net/netfilter/nfnetlink_log.h>
 
 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
 MODULE_DESCRIPTION("Xtables: packet logging to netlink using NFLOG");
@@ -37,8 +36,9 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	if (info->flags & XT_NFLOG_F_COPY_LEN)
 		li.u.ulog.flags |= NF_LOG_F_COPY_LEN;
 
-	nfulnl_log_packet(net, xt_family(par), xt_hooknum(par), skb,
-			  xt_in(par), xt_out(par), &li, info->prefix);
+	nf_log_packet(net, xt_family(par), xt_hooknum(par), skb, xt_in(par),
+		      xt_out(par), &li, "%s", info->prefix);
+
 	return XT_CONTINUE;
 }
 
@@ -50,7 +50,13 @@ static int nflog_tg_check(const struct xt_tgchk_param *par)
 		return -EINVAL;
 	if (info->prefix[sizeof(info->prefix) - 1] != '\0')
 		return -EINVAL;
-	return 0;
+
+	return nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG);
+}
+
+static void nflog_tg_destroy(const struct xt_tgdtor_param *par)
+{
+	nf_logger_put(par->family, NF_LOG_TYPE_ULOG);
 }
 
 static struct xt_target nflog_tg_reg __read_mostly = {
@@ -58,6 +64,7 @@ static struct xt_target nflog_tg_reg __read_mostly = {
 	.revision   = 0,
 	.family     = NFPROTO_UNSPEC,
 	.checkentry = nflog_tg_check,
+	.destroy    = nflog_tg_destroy,
 	.target     = nflog_tg,
 	.targetsize = sizeof(struct xt_nflog_info),
 	.me         = THIS_MODULE,
-- 
2.11.0

^ permalink raw reply related

* [PATCH 05/51] ipvs: fix multiplicative hashing in sh/dh/lblc/lblcr algorithms
From: Pablo Neira Ayuso @ 2018-05-06 22:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180506224709.29100-1-pablo@netfilter.org>

From: Vincent Bernat <vincent@bernat.im>

The sh/dh/lblc/lblcr algorithms are using Knuth's multiplicative
hashing incorrectly. Replace its use by the hash_32() macro, which
correctly implements this algorithm. It doesn't use the same constant,
but it shouldn't matter.

Signed-off-by: Vincent Bernat <vincent@bernat.im>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_dh.c    | 3 ++-
 net/netfilter/ipvs/ip_vs_lblc.c  | 3 ++-
 net/netfilter/ipvs/ip_vs_lblcr.c | 3 ++-
 net/netfilter/ipvs/ip_vs_sh.c    | 3 ++-
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
index 75f798f8e83b..07459e71d907 100644
--- a/net/netfilter/ipvs/ip_vs_dh.c
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -43,6 +43,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/skbuff.h>
+#include <linux/hash.h>
 
 #include <net/ip_vs.h>
 
@@ -81,7 +82,7 @@ static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *ad
 		addr_fold = addr->ip6[0]^addr->ip6[1]^
 			    addr->ip6[2]^addr->ip6[3];
 #endif
-	return (ntohl(addr_fold)*2654435761UL) & IP_VS_DH_TAB_MASK;
+	return hash_32(ntohl(addr_fold), IP_VS_DH_TAB_BITS);
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 3057e453bf31..08147fc6400c 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -48,6 +48,7 @@
 #include <linux/kernel.h>
 #include <linux/skbuff.h>
 #include <linux/jiffies.h>
+#include <linux/hash.h>
 
 /* for sysctl */
 #include <linux/fs.h>
@@ -160,7 +161,7 @@ ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
 		addr_fold = addr->ip6[0]^addr->ip6[1]^
 			    addr->ip6[2]^addr->ip6[3];
 #endif
-	return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
+	return hash_32(ntohl(addr_fold), IP_VS_LBLC_TAB_BITS);
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 92adc04557ed..9b6a6c9e9cfa 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -47,6 +47,7 @@
 #include <linux/jiffies.h>
 #include <linux/list.h>
 #include <linux/slab.h>
+#include <linux/hash.h>
 
 /* for sysctl */
 #include <linux/fs.h>
@@ -323,7 +324,7 @@ ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)
 		addr_fold = addr->ip6[0]^addr->ip6[1]^
 			    addr->ip6[2]^addr->ip6[3];
 #endif
-	return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
+	return hash_32(ntohl(addr_fold), IP_VS_LBLCR_TAB_BITS);
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index 16aaac6eedc9..1e01c782583a 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -96,7 +96,8 @@ ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
 		addr_fold = addr->ip6[0]^addr->ip6[1]^
 			    addr->ip6[2]^addr->ip6[3];
 #endif
-	return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
+	return (offset + hash_32(ntohs(port) + ntohl(addr_fold),
+				 IP_VS_SH_TAB_BITS)) &
 		IP_VS_SH_TAB_MASK;
 }
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH 04/51] netfilter: ipvs: Add configurations of Maglev hashing
From: Pablo Neira Ayuso @ 2018-05-06 22:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180506224709.29100-1-pablo@netfilter.org>

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset=y, Size: 3306 bytes --]

From: Inju Song <inju.song@navercorp.com>

To build the maglev hashing scheduler, add some configuration
to Kconfig and Makefile.

 - The compile configurations of MH are added to the Kconfig.

 - The MH build rule is added to the Makefile.

Signed-off-by: Inju Song <inju.song@navercorp.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/Kconfig  | 37 +++++++++++++++++++++++++++++++++++++
 net/netfilter/ipvs/Makefile |  1 +
 2 files changed, 38 insertions(+)

diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index b32fb0dbe237..05dc1b77e466 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -225,6 +225,25 @@ config	IP_VS_SH
 	  If you want to compile it in kernel, say Y. To compile it as a
 	  module, choose M here. If unsure, say N.
 
+config	IP_VS_MH
+	tristate "maglev hashing scheduling"
+	---help---
+	  The maglev consistent hashing scheduling algorithm provides the
+	  Google's Maglev hashing algorithm as a IPVS scheduler. It assigns
+	  network connections to the servers through looking up a statically
+	  assigned special hash table called the lookup table. Maglev hashing
+	  is to assign a preference list of all the lookup table positions
+	  to each destination.
+
+	  Through this operation, The maglev hashing gives an almost equal
+	  share of the lookup table to each of the destinations and provides
+	  minimal disruption by using the lookup table. When the set of
+	  destinations changes, a connection will likely be sent to the same
+	  destination as it was before.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
 config	IP_VS_SED
 	tristate "shortest expected delay scheduling"
 	---help---
@@ -266,6 +285,24 @@ config IP_VS_SH_TAB_BITS
 	  needs to be large enough to effectively fit all the destinations
 	  multiplied by their respective weights.
 
+comment 'IPVS MH scheduler'
+
+config IP_VS_MH_TAB_INDEX
+	int "IPVS maglev hashing table index of size (the prime numbers)"
+	range 8 17
+	default 12
+	---help---
+	  The maglev hashing scheduler maps source IPs to destinations
+	  stored in a hash table. This table is assigned by a preference
+	  list of the positions to each destination until all slots in
+	  the table are filled. The index determines the prime for size of
+	  the table as 251, 509, 1021, 2039, 4093, 8191, 16381, 32749,
+	  65521 or 131071. When using weights to allow destinations to
+	  receive more connections, the table is assigned an amount
+	  proportional to the weights specified. The table needs to be large
+	  enough to effectively fit all the destinations multiplied by their
+	  respective weights.
+
 comment 'IPVS application helper'
 
 config	IP_VS_FTP
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
index c552993fa4b9..bfce2677fda2 100644
--- a/net/netfilter/ipvs/Makefile
+++ b/net/netfilter/ipvs/Makefile
@@ -33,6 +33,7 @@ obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
 obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
 obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
 obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
+obj-$(CONFIG_IP_VS_MH) += ip_vs_mh.o
 obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
 obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH 03/51] netfilter: ipvs: Add Maglev hashing scheduler
From: Pablo Neira Ayuso @ 2018-05-06 22:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180506224709.29100-1-pablo@netfilter.org>

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset=UTF-8, Size: 15318 bytes --]

From: Inju Song <inju.song@navercorp.com>

Implements the Google's Maglev hashing algorithm as a IPVS scheduler.

Basically it provides consistent hashing butÂ offers some special
features about disruption and load balancing.

 1)Â minimal disruption: when the set of destinations changes,
    a connection will likely be sent to the same destination
    as it was before.

 2)Â load balancing: each destination will receive an almost
    equal number of connections.

Seel also for detail: [3.4 Consistent Hasing] in
https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf

Signed-off-by: Inju Song <inju.song@navercorp.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_mh.c | 540 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 540 insertions(+)
 create mode 100644 net/netfilter/ipvs/ip_vs_mh.c

diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c
new file mode 100644
index 000000000000..0f795b186eb3
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_mh.c
@@ -0,0 +1,540 @@
+// SPDX-License-Identifier: GPL-2.0
+/* IPVS:	Maglev Hashing scheduling module
+ *
+ * Authors:	Inju Song <inju.song@navercorp.com>
+ *
+ */
+
+/* The mh algorithm is to assign a preference list of all the lookup
+ * table positions to each destination and populate the table with
+ * the most-preferred position of destinations. Then it is to select
+ * destination with the hash key of source IP address through looking
+ * up a the lookup table.
+ *
+ * The algorithm is detailed in:
+ * [3.4 Consistent Hasing]
+https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include <net/ip_vs.h>
+
+#include <linux/siphash.h>
+#include <linux/bitops.h>
+#include <linux/gcd.h>
+
+#define IP_VS_SVC_F_SCHED_MH_FALLBACK	IP_VS_SVC_F_SCHED1 /* MH fallback */
+#define IP_VS_SVC_F_SCHED_MH_PORT	IP_VS_SVC_F_SCHED2 /* MH use port */
+
+struct ip_vs_mh_lookup {
+	struct ip_vs_dest __rcu	*dest;	/* real server (cache) */
+};
+
+struct ip_vs_mh_dest_setup {
+	unsigned int	offset; /* starting offset */
+	unsigned int	skip;	/* skip */
+	unsigned int	perm;	/* next_offset */
+	int		turns;	/* weight / gcd() and rshift */
+};
+
+/* Available prime numbers for MH table */
+static int primes[] = {251, 509, 1021, 2039, 4093,
+		       8191, 16381, 32749, 65521, 131071};
+
+/* For IPVS MH entry hash table */
+#ifndef CONFIG_IP_VS_MH_TAB_INDEX
+#define CONFIG_IP_VS_MH_TAB_INDEX	12
+#endif
+#define IP_VS_MH_TAB_BITS		(CONFIG_IP_VS_MH_TAB_INDEX / 2)
+#define IP_VS_MH_TAB_INDEX		(CONFIG_IP_VS_MH_TAB_INDEX - 8)
+#define IP_VS_MH_TAB_SIZE               primes[IP_VS_MH_TAB_INDEX]
+
+struct ip_vs_mh_state {
+	struct rcu_head			rcu_head;
+	struct ip_vs_mh_lookup		*lookup;
+	struct ip_vs_mh_dest_setup	*dest_setup;
+	hsiphash_key_t			hash1, hash2;
+	int				gcd;
+	int				rshift;
+};
+
+static inline void generate_hash_secret(hsiphash_key_t *hash1,
+					hsiphash_key_t *hash2)
+{
+	hash1->key[0] = 2654435761UL;
+	hash1->key[1] = 2654435761UL;
+
+	hash2->key[0] = 2654446892UL;
+	hash2->key[1] = 2654446892UL;
+}
+
+/* Helper function to determine if server is unavailable */
+static inline bool is_unavailable(struct ip_vs_dest *dest)
+{
+	return atomic_read(&dest->weight) <= 0 ||
+	       dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+/* Returns hash value for IPVS MH entry */
+static inline unsigned int
+ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr,
+		 __be16 port, hsiphash_key_t *key, unsigned int offset)
+{
+	unsigned int v;
+	__be32 addr_fold = addr->ip;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		addr_fold = addr->ip6[0] ^ addr->ip6[1] ^
+			    addr->ip6[2] ^ addr->ip6[3];
+#endif
+	v = (offset + ntohs(port) + ntohl(addr_fold));
+	return hsiphash(&v, sizeof(v), key);
+}
+
+/* Reset all the hash buckets of the specified table. */
+static void ip_vs_mh_reset(struct ip_vs_mh_state *s)
+{
+	int i;
+	struct ip_vs_mh_lookup *l;
+	struct ip_vs_dest *dest;
+
+	l = &s->lookup[0];
+	for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
+		dest = rcu_dereference_protected(l->dest, 1);
+		if (dest) {
+			ip_vs_dest_put(dest);
+			RCU_INIT_POINTER(l->dest, NULL);
+		}
+		l++;
+	}
+}
+
+static int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
+			      struct ip_vs_service *svc)
+{
+	struct list_head *p;
+	struct ip_vs_mh_dest_setup *ds;
+	struct ip_vs_dest *dest;
+	int lw;
+
+	/* If gcd is smaller then 1, number of dests or
+	 * all last_weight of dests are zero. So, skip
+	 * permutation for the dests.
+	 */
+	if (s->gcd < 1)
+		return 0;
+
+	/* Set dest_setup for the dests permutation */
+	p = &svc->destinations;
+	ds = &s->dest_setup[0];
+	while ((p = p->next) != &svc->destinations) {
+		dest = list_entry(p, struct ip_vs_dest, n_list);
+
+		ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr,
+					      dest->port, &s->hash1, 0) %
+					      IP_VS_MH_TAB_SIZE;
+		ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
+					    dest->port, &s->hash2, 0) %
+					    (IP_VS_MH_TAB_SIZE - 1) + 1;
+		ds->perm = ds->offset;
+
+		lw = atomic_read(&dest->last_weight);
+		ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0);
+		ds++;
+	}
+
+	return 0;
+}
+
+static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
+			     struct ip_vs_service *svc)
+{
+	int n, c, dt_count;
+	unsigned long *table;
+	struct list_head *p;
+	struct ip_vs_mh_dest_setup *ds;
+	struct ip_vs_dest *dest, *new_dest;
+
+	/* If gcd is smaller then 1, number of dests or
+	 * all last_weight of dests are zero. So, skip
+	 * the population for the dests and reset lookup table.
+	 */
+	if (s->gcd < 1) {
+		ip_vs_mh_reset(s);
+		return 0;
+	}
+
+	table =  kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE),
+			 sizeof(unsigned long), GFP_KERNEL);
+	if (!table)
+		return -ENOMEM;
+
+	p = &svc->destinations;
+	n = 0;
+	dt_count = 0;
+	while (n < IP_VS_MH_TAB_SIZE) {
+		if (p == &svc->destinations)
+			p = p->next;
+
+		ds = &s->dest_setup[0];
+		while (p != &svc->destinations) {
+			/* Ignore added server with zero weight */
+			if (ds->turns < 1) {
+				p = p->next;
+				ds++;
+				continue;
+			}
+
+			c = ds->perm;
+			while (test_bit(c, table)) {
+				/* Add skip, mod IP_VS_MH_TAB_SIZE */
+				ds->perm += ds->skip;
+				if (ds->perm >= IP_VS_MH_TAB_SIZE)
+					ds->perm -= IP_VS_MH_TAB_SIZE;
+				c = ds->perm;
+			}
+
+			__set_bit(c, table);
+
+			dest = rcu_dereference_protected(s->lookup[c].dest, 1);
+			new_dest = list_entry(p, struct ip_vs_dest, n_list);
+			if (dest != new_dest) {
+				if (dest)
+					ip_vs_dest_put(dest);
+				ip_vs_dest_hold(new_dest);
+				RCU_INIT_POINTER(s->lookup[c].dest, new_dest);
+			}
+
+			if (++n == IP_VS_MH_TAB_SIZE)
+				goto out;
+
+			if (++dt_count >= ds->turns) {
+				dt_count = 0;
+				p = p->next;
+				ds++;
+			}
+		}
+	}
+
+out:
+	kfree(table);
+	return 0;
+}
+
+/* Get ip_vs_dest associated with supplied parameters. */
+static inline struct ip_vs_dest *
+ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
+	     const union nf_inet_addr *addr, __be16 port)
+{
+	unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0)
+					     % IP_VS_MH_TAB_SIZE;
+	struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest);
+
+	return (!dest || is_unavailable(dest)) ? NULL : dest;
+}
+
+/* As ip_vs_mh_get, but with fallback if selected server is unavailable */
+static inline struct ip_vs_dest *
+ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
+		      const union nf_inet_addr *addr, __be16 port)
+{
+	unsigned int offset, roffset;
+	unsigned int hash, ihash;
+	struct ip_vs_dest *dest;
+
+	/* First try the dest it's supposed to go to */
+	ihash = ip_vs_mh_hashkey(svc->af, addr, port,
+				 &s->hash1, 0) % IP_VS_MH_TAB_SIZE;
+	dest = rcu_dereference(s->lookup[ihash].dest);
+	if (!dest)
+		return NULL;
+	if (!is_unavailable(dest))
+		return dest;
+
+	IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u, reselecting",
+		      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));
+
+	/* If the original dest is unavailable, loop around the table
+	 * starting from ihash to find a new dest
+	 */
+	for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) {
+		roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE;
+		hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1,
+					roffset) % IP_VS_MH_TAB_SIZE;
+		dest = rcu_dereference(s->lookup[hash].dest);
+		if (!dest)
+			break;
+		if (!is_unavailable(dest))
+			return dest;
+		IP_VS_DBG_BUF(6,
+			      "MH: selected unavailable server %s:%u (offset %u), reselecting",
+			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
+			      ntohs(dest->port), roffset);
+	}
+
+	return NULL;
+}
+
+/* Assign all the hash buckets of the specified table with the service. */
+static int ip_vs_mh_reassign(struct ip_vs_mh_state *s,
+			     struct ip_vs_service *svc)
+{
+	int ret;
+
+	if (svc->num_dests > IP_VS_MH_TAB_SIZE)
+		return -EINVAL;
+
+	if (svc->num_dests >= 1) {
+		s->dest_setup = kcalloc(svc->num_dests,
+					sizeof(struct ip_vs_mh_dest_setup),
+					GFP_KERNEL);
+		if (!s->dest_setup)
+			return -ENOMEM;
+	}
+
+	ip_vs_mh_permutate(s, svc);
+
+	ret = ip_vs_mh_populate(s, svc);
+	if (ret < 0)
+		goto out;
+
+	IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%u\n",
+		      IP_VS_DBG_ADDR(svc->af, &svc->addr),
+		      ntohs(svc->port));
+
+out:
+	if (svc->num_dests >= 1) {
+		kfree(s->dest_setup);
+		s->dest_setup = NULL;
+	}
+	return ret;
+}
+
+static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *dest;
+	int weight;
+	int g = 0;
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		weight = atomic_read(&dest->last_weight);
+		if (weight > 0) {
+			if (g > 0)
+				g = gcd(weight, g);
+			else
+				g = weight;
+		}
+	}
+	return g;
+}
+
+/* To avoid assigning huge weight for the MH table,
+ * calculate shift value with gcd.
+ */
+static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd)
+{
+	struct ip_vs_dest *dest;
+	int new_weight, weight = 0;
+	int mw, shift;
+
+	/* If gcd is smaller then 1, number of dests or
+	 * all last_weight of dests are zero. So, return
+	 * shift value as zero.
+	 */
+	if (gcd < 1)
+		return 0;
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		new_weight = atomic_read(&dest->last_weight);
+		if (new_weight > weight)
+			weight = new_weight;
+	}
+
+	/* Because gcd is greater than zero,
+	 * the maximum weight and gcd are always greater than zero
+	 */
+	mw = weight / gcd;
+
+	/* shift = occupied bits of weight/gcd - MH highest bits */
+	shift = fls(mw) - IP_VS_MH_TAB_BITS;
+	return (shift >= 0) ? shift : 0;
+}
+
+static void ip_vs_mh_state_free(struct rcu_head *head)
+{
+	struct ip_vs_mh_state *s;
+
+	s = container_of(head, struct ip_vs_mh_state, rcu_head);
+	kfree(s->lookup);
+	kfree(s);
+}
+
+static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
+{
+	int ret;
+	struct ip_vs_mh_state *s;
+
+	/* Allocate the MH table for this service */
+	s = kzalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup),
+			    GFP_KERNEL);
+	if (!s->lookup) {
+		kfree(s);
+		return -ENOMEM;
+	}
+
+	generate_hash_secret(&s->hash1, &s->hash2);
+	s->gcd = ip_vs_mh_gcd_weight(svc);
+	s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
+
+	IP_VS_DBG(6,
+		  "MH lookup table (memory=%zdbytes) allocated for current service\n",
+		  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
+
+	/* Assign the lookup table with current dests */
+	ret = ip_vs_mh_reassign(s, svc);
+	if (ret < 0) {
+		ip_vs_mh_reset(s);
+		ip_vs_mh_state_free(&s->rcu_head);
+		return ret;
+	}
+
+	/* No more failures, attach state */
+	svc->sched_data = s;
+	return 0;
+}
+
+static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_mh_state *s = svc->sched_data;
+
+	/* Got to clean up lookup entry here */
+	ip_vs_mh_reset(s);
+
+	call_rcu(&s->rcu_head, ip_vs_mh_state_free);
+	IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n",
+		  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
+}
+
+static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
+				 struct ip_vs_dest *dest)
+{
+	struct ip_vs_mh_state *s = svc->sched_data;
+
+	s->gcd = ip_vs_mh_gcd_weight(svc);
+	s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
+
+	/* Assign the lookup table with the updated service */
+	return ip_vs_mh_reassign(s, svc);
+}
+
+/* Helper function to get port number */
+static inline __be16
+ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
+{
+	__be16 _ports[2], *ports;
+
+	/* At this point we know that we have a valid packet of some kind.
+	 * Because ICMP packets are only guaranteed to have the first 8
+	 * bytes, let's just grab the ports.  Fortunately they're in the
+	 * same position for all three of the protocols we care about.
+	 */
+	switch (iph->protocol) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_SCTP:
+		ports = skb_header_pointer(skb, iph->len, sizeof(_ports),
+					   &_ports);
+		if (unlikely(!ports))
+			return 0;
+
+		if (likely(!ip_vs_iph_inverse(iph)))
+			return ports[0];
+		else
+			return ports[1];
+	default:
+		return 0;
+	}
+}
+
+/* Maglev Hashing scheduling */
+static struct ip_vs_dest *
+ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+		  struct ip_vs_iphdr *iph)
+{
+	struct ip_vs_dest *dest;
+	struct ip_vs_mh_state *s;
+	__be16 port = 0;
+	const union nf_inet_addr *hash_addr;
+
+	hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr;
+
+	IP_VS_DBG(6, "%s : Scheduling...\n", __func__);
+
+	if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT)
+		port = ip_vs_mh_get_port(skb, iph);
+
+	s = (struct ip_vs_mh_state *)svc->sched_data;
+
+	if (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK)
+		dest = ip_vs_mh_get_fallback(svc, s, hash_addr, port);
+	else
+		dest = ip_vs_mh_get(svc, s, hash_addr, port);
+
+	if (!dest) {
+		ip_vs_scheduler_err(svc, "no destination available");
+		return NULL;
+	}
+
+	IP_VS_DBG_BUF(6, "MH: source IP address %s:%u --> server %s:%u\n",
+		      IP_VS_DBG_ADDR(svc->af, hash_addr),
+		      ntohs(port),
+		      IP_VS_DBG_ADDR(dest->af, &dest->addr),
+		      ntohs(dest->port));
+
+	return dest;
+}
+
+/* IPVS MH Scheduler structure */
+static struct ip_vs_scheduler ip_vs_mh_scheduler = {
+	.name =			"mh",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list	 =		LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list),
+	.init_service =		ip_vs_mh_init_svc,
+	.done_service =		ip_vs_mh_done_svc,
+	.add_dest =		ip_vs_mh_dest_changed,
+	.del_dest =		ip_vs_mh_dest_changed,
+	.upd_dest =		ip_vs_mh_dest_changed,
+	.schedule =		ip_vs_mh_schedule,
+};
+
+static int __init ip_vs_mh_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_mh_scheduler);
+}
+
+static void __exit ip_vs_mh_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_mh_scheduler);
+	rcu_barrier();
+}
+
+module_init(ip_vs_mh_init);
+module_exit(ip_vs_mh_cleanup);
+MODULE_DESCRIPTION("Maglev hashing ipvs scheduler");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Inju Song <inju.song@navercorp.com>");
-- 
2.11.0

^ permalink raw reply related

* [PATCH 02/51] netfilter: ipvs: Keep latest weight of destination
From: Pablo Neira Ayuso @ 2018-05-06 22:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180506224709.29100-1-pablo@netfilter.org>

From: Inju Song <inju.song@navercorp.com>

The hashing table in scheduler such as source hash or maglev hash
should ignore the changed weight to 0 and allow changing the weight
from/to non-0 values. So, struct ip_vs_dest needs to keep weight
with latest non-0 weight.

Signed-off-by: Inju Song <inju.song@navercorp.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h            | 1 +
 net/netfilter/ipvs/ip_vs_ctl.c | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index eb0bec043c96..0ac795b41ab8 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -668,6 +668,7 @@ struct ip_vs_dest {
 	volatile unsigned int	flags;		/* dest status flags */
 	atomic_t		conn_flags;	/* flags to copy to conn */
 	atomic_t		weight;		/* server weight */
+	atomic_t		last_weight;	/* server latest weight */
 
 	refcount_t		refcnt;		/* reference counter */
 	struct ip_vs_stats      stats;          /* statistics */
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 5ebde4b15810..b91bb70ece92 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -821,6 +821,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
 	if (add && udest->af != svc->af)
 		ipvs->mixed_address_family_dests++;
 
+	/* keep the last_weight with latest non-0 weight */
+	if (add || udest->weight != 0)
+		atomic_set(&dest->last_weight, udest->weight);
+
 	/* set the weight and the flags */
 	atomic_set(&dest->weight, udest->weight);
 	conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
-- 
2.11.0

^ permalink raw reply related

* [PATCH 01/51] netfilter: ipvs: Fix space before '[' error.
From: Pablo Neira Ayuso @ 2018-05-06 22:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180506224709.29100-1-pablo@netfilter.org>

From: Arvind Yadav <arvind.yadav.cs@gmail.com>

Fix checkpatch.pl error:
ERROR: space prohibited before open square bracket '['.

Signed-off-by: Arvind Yadav <arvind.yadav.cs@gmail.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_proto_tcp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index bcd9b7bde4ee..569631d2b2a1 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -436,7 +436,7 @@ static bool tcp_state_active(int state)
 	return tcp_state_active_table[state];
 }
 
-static struct tcp_states_t tcp_states [] = {
+static struct tcp_states_t tcp_states[] = {
 /*	INPUT */
 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
@@ -459,7 +459,7 @@ static struct tcp_states_t tcp_states [] = {
 /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
 };
 
-static struct tcp_states_t tcp_states_dos [] = {
+static struct tcp_states_t tcp_states_dos[] = {
 /*	INPUT */
 /*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
 /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
-- 
2.11.0

^ permalink raw reply related

* [PATCH 00/51] Netfilter/IPVS updates for net-next
From: Pablo Neira Ayuso @ 2018-05-06 22:46 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev

Hi David,

The following patchset contains Netfilter/IPVS updates for your net-next
tree, more relevant updates in this batch are:

1) Add Maglev support to IPVS. Moreover, store lastest server weight in
   IPVS since this is needed by maglev, patches from from Inju Song.

2) Preparation works to add iptables flowtable support, patches
   from Felix Fietkau.

3) Hand over flows back to conntrack slow path in case of TCP RST/FIN
   packet is seen via new teardown state, also from Felix.

4) Add support for extended netlink error reporting for nf_tables.

5) Support for larger timeouts that 23 days in nf_tables, patch from
   Florian Westphal.

6) Always set an upper limit to dynamic sets, also from Florian.

7) Allow number generator to make map lookups, from Laura Garcia.

8) Use hash_32() instead of opencode hashing in IPVS, from Vicent Bernat.

9) Extend ip6tables SRH match to support previous, next and last SID,
   from Ahmed Abdelsalam.

10) Move Passive OS fingerprint nf_osf.c, from Fernando Fernandez.

11) Expose nf_conntrack_max through ctnetlink, from Florent Fourcot.

12) Several housekeeping patches for xt_NFLOG, x_tables and ebtables,
   from Taehee Yoo.

13) Unify meta bridge with core nft_meta, then make nft_meta built-in.
   Make rt and exthdr built-in too, again from Florian.

14) Missing initialization of tbl->entries in IPVS, from Cong Wang.

You can pull these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git

Thanks.

----------------------------------------------------------------

The following changes since commit 415787d7799f4fccbe8d49cb0b8e5811be6b0389:

  ipv6: frags: fix a lockdep false positive (2018-04-18 23:19:39 -0400)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git HEAD

for you to fetch changes up to b13468dc577498002cf4e62978359ff97ffcd187:

  netfilter: nft_dynset: fix timeout updates on 32bit (2018-05-07 00:05:22 +0200)

----------------------------------------------------------------
Ahmed Abdelsalam (1):
      netfilter: ip6t_srh: extend SRH matching for previous, next and last SID

Arvind Yadav (1):
      netfilter: ipvs: Fix space before '[' error.

Cong Wang (2):
      ipvs: initialize tbl->entries after allocation
      ipvs: initialize tbl->entries in ip_vs_lblc_init_svc()

Felix Fietkau (19):
      netfilter: nf_flow_table: use IP_CT_DIR_* values for FLOW_OFFLOAD_DIR_*
      netfilter: nf_flow_table: clean up flow_offload_alloc
      ipv6: make ip6_dst_mtu_forward inline
      netfilter: nf_flow_table: cache mtu in struct flow_offload_tuple
      netfilter: nf_flow_table: rename nf_flow_table.c to nf_flow_table_core.c
      netfilter: nf_flow_table: move ipv4 offload hook code to nf_flow_table
      netfilter: nf_flow_table: move ip header check out of nf_flow_exceeds_mtu
      netfilter: nf_flow_table: move ipv6 offload hook code to nf_flow_table
      netfilter: nf_flow_table: relax mixed ipv4/ipv6 flowtable dependencies
      netfilter: nf_flow_table: move init code to nf_flow_table_core.c
      netfilter: nf_flow_table: fix priv pointer for netdev hook
      netfilter: nf_flow_table: track flow tables in nf_flow_table directly
      netfilter: nf_flow_table: make flow_offload_dead inline
      netfilter: nf_flow_table: add a new flow state for tearing down offloading
      netfilter: nf_flow_table: in flow_offload_lookup, skip entries being deleted
      netfilter: nf_flow_table: add support for sending flows back to the slow path
      netfilter: nf_flow_table: tear down TCP flows if RST or FIN was seen
      netfilter: nf_flow_table: add missing condition for TCP state check
      netfilter: nf_flow_table: fix offloading connections with SNAT+DNAT

Fernando Fernandez Mancera (1):
      netfilter: extract Passive OS fingerprint infrastructure from xt_osf

Florent Fourcot (1):
      netfilter: ctnetlink: export nf_conntrack_max

Florian Westphal (8):
      netfilter: nf_tables: support timeouts larger than 23 days
      netfilter: nf_tables: always use an upper set size for dynsets
      netfilter: merge meta_bridge into nft_meta
      netfilter: nf_tables: make meta expression builtin
      netfilter: nf_tables: merge rt expression into nft core
      netfilter: nf_tables: merge exthdr expression into nft core
      netfilter: nf_nat: remove unused ct arg from lookup functions
      netfilter: nft_dynset: fix timeout updates on 32bit

Inju Song (3):
      netfilter: ipvs: Keep latest weight of destination
      netfilter: ipvs: Add Maglev hashing scheduler
      netfilter: ipvs: Add configurations of Maglev hashing

Laura Garcia Liebana (2):
      netfilter: nft_numgen: add map lookups for numgen statements
      netfilter: nft_numgen: enable hashing of one element

Pablo Neira Ayuso (3):
      netfilter: nf_tables: simplify lookup functions
      netfilter: nf_tables: initial support for extended ACK reporting
      Merge tag 'ipvs-for-v4.18' of http://git.kernel.org/.../horms/ipvs-next

Phil Sutter (2):
      netfilter: nf_tables: Simplify set backend selection
      netfilter: nf_tables: Provide NFT_{RT,CT}_MAX for userspace

Taehee Yoo (7):
      netfilter: xt_NFLOG: use nf_log_packet instead of nfulnl_log_packet.
      netfilter: add __exit mark to helper modules
      netfilter: ebtables: add ebt_free_table_info function
      netfilter: ebtables: remove EBT_MATCH and EBT_NOMATCH
      netfilter: x_tables: remove duplicate ip6t_get_target function call
      netfilter: ebtables: add ebt_get_target and ebt_get_target_c
      netfilter: xtables: use ipt_get_target_c instead of ipt_get_target

Thierry Du Tre (1):
      netfilter: add NAT support for shifted portmap ranges

Vincent Bernat (1):
      ipvs: fix multiplicative hashing in sh/dh/lblc/lblcr algorithms

 include/linux/netfilter/nf_osf.h                   |  27 +
 include/linux/netfilter_bridge/ebtables.h          |   4 -
 include/net/ip6_route.h                            |  21 +
 include/net/ip_vs.h                                |   1 +
 include/net/ipv6.h                                 |   2 -
 include/net/netfilter/ipv4/nf_nat_masquerade.h     |   2 +-
 include/net/netfilter/ipv6/nf_nat_masquerade.h     |   2 +-
 include/net/netfilter/nf_flow_table.h              |  24 +-
 include/net/netfilter/nf_nat.h                     |   2 +-
 include/net/netfilter/nf_nat_l3proto.h             |  28 +-
 include/net/netfilter/nf_nat_l4proto.h             |   8 +-
 include/net/netfilter/nf_nat_redirect.h            |   2 +-
 include/net/netfilter/nf_tables.h                  |  53 +-
 include/net/netfilter/nf_tables_core.h             |   3 +
 include/net/netfilter/nfnetlink_log.h              |  17 -
 include/net/netfilter/nft_meta.h                   |  44 --
 include/uapi/linux/netfilter/nf_nat.h              |  12 +-
 include/uapi/linux/netfilter/nf_osf.h              |  90 +++
 include/uapi/linux/netfilter/nf_tables.h           |   8 +
 include/uapi/linux/netfilter/nfnetlink_conntrack.h |   1 +
 include/uapi/linux/netfilter/xt_osf.h              | 106 +---
 include/uapi/linux/netfilter_bridge/ebtables.h     |   6 +
 include/uapi/linux/netfilter_ipv6/ip6t_srh.h       |  43 +-
 net/bridge/netfilter/Kconfig                       |   7 -
 net/bridge/netfilter/Makefile                      |   1 -
 net/bridge/netfilter/ebtables.c                    |  63 +--
 net/bridge/netfilter/nft_meta_bridge.c             | 135 -----
 net/ipv4/netfilter/ip_tables.c                     |   2 +-
 net/ipv4/netfilter/ipt_MASQUERADE.c                |   2 +-
 net/ipv4/netfilter/iptable_nat.c                   |   3 +-
 net/ipv4/netfilter/nf_flow_table_ipv4.c            | 255 +--------
 net/ipv4/netfilter/nf_nat_h323.c                   |   4 +-
 net/ipv4/netfilter/nf_nat_l3proto_ipv4.c           |  18 +-
 net/ipv4/netfilter/nf_nat_masquerade_ipv4.c        |   4 +-
 net/ipv4/netfilter/nf_nat_pptp.c                   |   2 +-
 net/ipv4/netfilter/nf_nat_proto_gre.c              |   2 +-
 net/ipv4/netfilter/nf_nat_proto_icmp.c             |   2 +-
 net/ipv4/netfilter/nft_chain_nat_ipv4.c            |   3 +-
 net/ipv4/netfilter/nft_masq_ipv4.c                 |   2 +-
 net/ipv6/ip6_output.c                              |  22 -
 net/ipv6/netfilter/ip6_tables.c                    |   1 -
 net/ipv6/netfilter/ip6t_MASQUERADE.c               |   2 +-
 net/ipv6/netfilter/ip6t_srh.c                      | 173 +++++-
 net/ipv6/netfilter/ip6table_nat.c                  |   3 +-
 net/ipv6/netfilter/nf_flow_table_ipv6.c            | 246 +-------
 net/ipv6/netfilter/nf_nat_l3proto_ipv6.c           |  18 +-
 net/ipv6/netfilter/nf_nat_masquerade_ipv6.c        |   4 +-
 net/ipv6/netfilter/nf_nat_proto_icmpv6.c           |   2 +-
 net/ipv6/netfilter/nft_chain_nat_ipv6.c            |   3 +-
 net/ipv6/netfilter/nft_masq_ipv6.c                 |   2 +-
 net/ipv6/netfilter/nft_redir_ipv6.c                |   2 +-
 net/netfilter/Kconfig                              |  25 +-
 net/netfilter/Makefile                             |   8 +-
 net/netfilter/ipvs/Kconfig                         |  37 ++
 net/netfilter/ipvs/Makefile                        |   1 +
 net/netfilter/ipvs/ip_vs_ctl.c                     |   4 +
 net/netfilter/ipvs/ip_vs_dh.c                      |   3 +-
 net/netfilter/ipvs/ip_vs_lblc.c                    |   4 +-
 net/netfilter/ipvs/ip_vs_lblcr.c                   |   4 +-
 net/netfilter/ipvs/ip_vs_mh.c                      | 540 ++++++++++++++++++
 net/netfilter/ipvs/ip_vs_proto_tcp.c               |   4 +-
 net/netfilter/ipvs/ip_vs_sh.c                      |   3 +-
 net/netfilter/nf_conntrack_core.c                  |   1 +
 net/netfilter/nf_conntrack_ftp.c                   |   3 +-
 net/netfilter/nf_conntrack_irc.c                   |   6 +-
 net/netfilter/nf_conntrack_netlink.c               |   3 +
 net/netfilter/nf_conntrack_sane.c                  |   3 +-
 net/netfilter/nf_conntrack_sip.c                   |   2 +-
 net/netfilter/nf_conntrack_tftp.c                  |   2 +-
 .../{nf_flow_table.c => nf_flow_table_core.c}      | 309 ++++++----
 net/netfilter/nf_flow_table_inet.c                 |   3 +-
 net/netfilter/nf_flow_table_ip.c                   | 487 ++++++++++++++++
 net/netfilter/nf_nat_core.c                        |  27 +-
 net/netfilter/nf_nat_helper.c                      |   2 +-
 net/netfilter/nf_nat_proto_common.c                |   9 +-
 net/netfilter/nf_nat_proto_dccp.c                  |   2 +-
 net/netfilter/nf_nat_proto_sctp.c                  |   2 +-
 net/netfilter/nf_nat_proto_tcp.c                   |   2 +-
 net/netfilter/nf_nat_proto_udp.c                   |   4 +-
 net/netfilter/nf_nat_proto_unknown.c               |   2 +-
 net/netfilter/nf_nat_redirect.c                    |   6 +-
 net/netfilter/nf_nat_sip.c                         |   2 +-
 net/netfilter/nf_osf.c                             | 218 +++++++
 net/netfilter/nf_tables_api.c                      | 624 +++++++++++----------
 net/netfilter/nf_tables_core.c                     |   3 +
 net/netfilter/nfnetlink_log.c                      |   8 +-
 net/netfilter/nft_dynset.c                         |   7 +-
 net/netfilter/nft_exthdr.c                         |  23 +-
 net/netfilter/nft_flow_offload.c                   |   5 +-
 net/netfilter/nft_hash.c                           |   2 +-
 net/netfilter/nft_meta.c                           | 112 ++--
 net/netfilter/nft_nat.c                            |   2 +-
 net/netfilter/nft_numgen.c                         |  85 ++-
 net/netfilter/nft_objref.c                         |   4 +-
 net/netfilter/nft_rt.c                             |  22 +-
 net/netfilter/nft_set_bitmap.c                     |  34 +-
 net/netfilter/nft_set_hash.c                       | 153 ++---
 net/netfilter/nft_set_rbtree.c                     |  36 +-
 net/netfilter/xt_NETMAP.c                          |   8 +-
 net/netfilter/xt_NFLOG.c                           |  15 +-
 net/netfilter/xt_REDIRECT.c                        |   2 +-
 net/netfilter/xt_nat.c                             |  72 ++-
 net/netfilter/xt_osf.c                             | 202 +------
 net/openvswitch/conntrack.c                        |   4 +-
 104 files changed, 2753 insertions(+), 1887 deletions(-)
 create mode 100644 include/linux/netfilter/nf_osf.h
 delete mode 100644 include/net/netfilter/nft_meta.h
 create mode 100644 include/uapi/linux/netfilter/nf_osf.h
 delete mode 100644 net/bridge/netfilter/nft_meta_bridge.c
 create mode 100644 net/netfilter/ipvs/ip_vs_mh.c
 rename net/netfilter/{nf_flow_table.c => nf_flow_table_core.c} (67%)
 create mode 100644 net/netfilter/nf_flow_table_ip.c
 create mode 100644 net/netfilter/nf_osf.c

^ permalink raw reply

* Re: [net-next PATCH v2 4/8] udp: Do not pass checksum as a parameter to GSO segmentation
From: Alexander Duyck @ 2018-05-06 22:29 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Network Development, Willem de Bruijn, David Miller
In-Reply-To: <CAF=yD-KXbq1ag5JSovtWDNWjPFr_A28DjkK_buc8J4P6ae4ifA@mail.gmail.com>

On Sun, May 6, 2018 at 10:17 AM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
> On Sat, May 5, 2018 at 7:39 PM, Alexander Duyck
> <alexander.duyck@gmail.com> wrote:
>> On Sat, May 5, 2018 at 3:01 AM, Willem de Bruijn
>> <willemdebruijn.kernel@gmail.com> wrote:
>>> On Fri, May 4, 2018 at 8:30 PM, Alexander Duyck
>>> <alexander.duyck@gmail.com> wrote:
>>>> From: Alexander Duyck <alexander.h.duyck@intel.com>
>>>>
>>>> This patch is meant to allow us to avoid having to recompute the checksum
>>>> from scratch and have it passed as a parameter.
>>>>
>>>> Instead of taking that approach we can take advantage of the fact that the
>>>> length that was used to compute the existing checksum is included in the
>>>> UDP header. If we cancel that out by adding the value XOR with 0xFFFF we
>>>> can then just add the new length in and fold that into the new result.
>>>>
>>>> I think this may be fixing a checksum bug in the original code as well
>>>> since the checksum that was passed included the UDP header in the checksum
>>>> computation, but then excluded it for the adjustment on the last frame. I
>>>> believe this may have an effect on things in the cases where the two differ
>>>> by bits that would result in things crossing the byte boundaries.
>>>
>>> The replacement code, below, subtracts original payload size then adds
>>> the new payload size. mss here excludes the udp header size.
>>>
>>>>                 /* last packet can be partial gso_size */
>>>> -               if (!seg->next)
>>>> -                       csum_replace2(&uh->check, htons(mss),
>>>> -                                     htons(seg->len - hdrlen - sizeof(*uh)));
>>
>> That is my point. When you calculated your checksum you included the
>> UDP header in the calculation.
>>
>> -       return __udp_gso_segment(gso_skb, features,
>> -                                udp_v4_check(sizeof(struct udphdr) + mss,
>> -                                             iph->saddr, iph->daddr, 0));
>>
>> Basically the problem is in one spot you are adding the sizeof(struct
>> udphdr) + mss and then in another you are cancelling it out as mss and
>> trying to account for it by also dropping the UDP header from the
>> payload length of the value you are adding. That works in the cases
>> where the effect doesn't cause any issues with the byte ordering,
>> however I think when mss + 8 crosses a byte boundary it can lead to
>> issues since the calculation is done on a byte swapped value.
>
> Do you mean that the issue is that the arithmetic operations
> on a __be16 in csum_replace2 may be incorrect if they exceed
> the least significant byte?
>
> csum_replace2 is used in many locations in the stack to adjust a network
> byte order csum when the payload length changes (e.g., iph->tot_len in
> inet_gro_complete).
>
> Or am I missing something specific about the udphdr calculations?

Actually it looks like the math I was applying to test this was off.

Basically the part I wasn't a fan of is the fact that we account for
the UDP header in the first calculation but not in the next. I guess
in the grand scheme of things though you are just dropping it from
both the value being removed and the value being added so it works out
do to the fact that the checksum can be associative.

I guess I was just being too literal in my thinking. Still an
expensive way of doing this though. I'll update the patch description.

Thanks.

- Alex

^ permalink raw reply

* Re: [PATCH 8/8] rhashtable: don't hold lock on first table throughout insertion.
From: NeilBrown @ 2018-05-06 22:24 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Thomas Graf, netdev, linux-kernel
In-Reply-To: <20180506052000.7yehd5lke3smccoj@gondor.apana.org.au>

[-- Attachment #1: Type: text/plain, Size: 1019 bytes --]

On Sun, May 06 2018, Herbert Xu wrote:

> On Sun, May 06, 2018 at 08:00:49AM +1000, NeilBrown wrote:
>>
>> The insert function must (and does) take the lock on the bucket before
>> testing if there is a "next" table.
>> If one inserter finds that it has locked the "last" table (because there
>> is no next) and successfully inserts, then the other inserter cannot
>> have locked that table yet, else it would have inserted.  When it does,
>> it will find what the first inserter inserted. 
>
> If you release the lock to the first table then it may be deleted
> by the resize thread.  Hence the other inserter may not have even
> started from the same place.

This is true, but I don't see how it is relevant.
At some point, each thread will find that the table they have just
locked for their search key, has a NULL 'future_tbl' pointer.
At the point, the thread can know that the key is not in any table,
and that no other thread can add the key until the lock is released.

Thanks,
NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* Re: [PATCH 7/8] rhashtable: add rhashtable_walk_prev()
From: NeilBrown @ 2018-05-06 22:16 UTC (permalink / raw)
  To: Tom Herbert, Herbert Xu
  Cc: Thomas Graf, Linux Kernel Network Developers, LKML
In-Reply-To: <CAPDqMeoPjypXCioHwy9CPiT_XZfWZk8_A6cErQuxVMAJqkQEqQ@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 3738 bytes --]

On Sat, May 05 2018, Tom Herbert wrote:

> On Sat, May 5, 2018 at 2:43 AM, Herbert Xu <herbert@gondor.apana.org.au> wrote:
>> On Fri, May 04, 2018 at 01:54:14PM +1000, NeilBrown wrote:
>>> rhashtable_walk_prev() returns the object returned by
>>> the previous rhashtable_walk_next(), providing it is still in the
>>> table (or was during this grace period).
>>> This works even if rhashtable_walk_stop() and rhashtable_talk_start()
>>> have been called since the last rhashtable_walk_next().
>>>
>>> If there have been no calls to rhashtable_walk_next(), or if the
>>> object is gone from the table, then NULL is returned.
>>>
>>> This can usefully be used in a seq_file ->start() function.
>>> If the pos is the same as was returned by the last ->next() call,
>>> then rhashtable_walk_prev() can be used to re-establish the
>>> current location in the table.  If it returns NULL, then
>>> rhashtable_walk_next() should be used.
>>>
>>> Signed-off-by: NeilBrown <neilb@suse.com>
>>
>> I will ack this if Tom is OK with replacing peek with it.
>>
> I'm not following why this is any better than peek. The point of
> having rhashtable_walk_peek is to to allow the caller to get then
> current element not the next one. This is needed when table is read in
> multiple parts and we need to pick up with what was returned from the
> last call to rhashtable_walk_next (which apparently is what this patch
> is also trying to do).
>
> There is one significant difference in that peek will return the
> element in the table regardless of where the iterator is at (this is
> why peek can move the iterator) and only returns NULL at end of table.
> As mentioned above rhashtable_walk_prev can return NULL so then caller
> needs and so rhashtable_walk_next "should be used" to get the previous
> element. Doing a peek is a lot cleaner and more straightforward API in
> this regard.

Thanks for the review.  I agree with a lot of what you say about the
behavior of the different implementations.
One important difference is the documentation.  The current
documentation for rhashtable_walk_peek() is wrong.   For example it says
that the function doesn't change the iterator, but sometimes it does.
The first rhashtable patch I submitted tried to fix this up, but it is
hard to document the function clearly because it really is doing one of
two different things.  It returns the previous element if it still
exists, or it returns the next one.  With my rhashtable_walk_prev(),
that can be done with
  rhashtable_walk_prev() ?: rhashtable_walk_next();

Both of these functions can easily be documented clearly.
We could combine the two as you have done, but "peek" does seem like a
good name.  "prev_or_next" is more honest, but somewhat clumsy.
Whether that is a good thing or not is partly a matter of taste, and we
seem to be on opposite sides of that fence.
There is a practical aspect to it though.

Lustre has a debugfs seq_file which shows all the cached pages of all
the cached object.  The objects are in a hashtable (which I want to
change to an rhashtable).  So the seq_file iterator has both an
rhashtable iterator an offset in the object.

When we restart a walk, we might be in the middle of some object - but
that object might have been removed from the cache, so we would need to
go on to the first page of the next object.
Using my interface I can do

 obj = rhashtable_walk_prev(&iter.rhash_iter);
 offset = iter.offset;
 if (!obj) {
    obj = rhashtable_walk_next(&iter.rhash_iter)
    offset = 0;
 }

I could achieve something similar with your interface if I kept an extra
copy of the previous object and compared with the value returned by
rhashtable_walk_peek(), but (to me) that seems like double handling.

Thanks,
NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* Re: [net-next PATCH v2 6/8] udp: Add support for software checksum and GSO_PARTIAL with GSO offload
From: Alexander Duyck @ 2018-05-06 22:13 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: Network Development, Willem de Bruijn, David Miller
In-Reply-To: <CAF=yD-L74w1isLi4JSnG4Om8Wrfen=16g-yedtuamNJoDsGEDw@mail.gmail.com>

On Sun, May 6, 2018 at 2:50 PM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
> On Sat, May 5, 2018 at 3:31 AM, Alexander Duyck
> <alexander.duyck@gmail.com> wrote:
>> From: Alexander Duyck <alexander.h.duyck@intel.com>
>>
>> This patch adds support for a software provided checksum and GSO_PARTIAL
>> segmentation support. With this we can offload UDP segmentation on devices
>> that only have partial support for tunnels.
>>
>> Since we are no longer needing the hardware checksum we can drop the checks
>> in the segmentation code that were verifying if it was present.
>>
>> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
>> ---
>>  net/ipv4/udp_offload.c |   28 ++++++++++++++++++----------
>>  net/ipv6/udp_offload.c |   11 +----------
>>  2 files changed, 19 insertions(+), 20 deletions(-)
>>
>> diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
>> index 946d06d2aa0c..fd94bbb369b2 100644
>> --- a/net/ipv4/udp_offload.c
>> +++ b/net/ipv4/udp_offload.c
>> @@ -217,6 +217,13 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
>>                 return segs;
>>         }
>>
>> +       /* GSO partial and frag_list segmentation only requires splitting
>> +        * the frame into an MSS multiple and possibly a remainder, both
>> +        * cases return a GSO skb. So update the mss now.
>> +        */
>> +       if (skb_is_gso(segs))
>> +               mss *= skb_shinfo(segs)->gso_segs;
>> +
>>         seg = segs;
>>         uh = udp_hdr(seg);
>>
>> @@ -237,6 +244,11 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
>>                 uh->len = newlen;
>>                 uh->check = check;
>>
>> +               if (seg->ip_summed == CHECKSUM_PARTIAL)
>> +                       gso_reset_checksum(seg, ~check);
>> +               else
>> +                       uh->check = gso_make_checksum(seg, ~check);
>
> Here and below, this needs
>
>   if (uh->check == 0)
>     uh->check = CSUM_MANGLED_0;
>
> similar to __skb_udp_tunnel_segment?

Good call, though I think I might change this up a bit and do something like:
uh->check = gso_make_checksum(seg, ~check) ? : CSUM_MANGLED_0;

That way I can avoid the extra read.

Thanks.

- Alex

^ permalink raw reply

* Re: [PATCH 4/8] rhashtable: fix race in nested_table_alloc()
From: NeilBrown @ 2018-05-06 22:02 UTC (permalink / raw)
  To: Herbert Xu; +Cc: Thomas Graf, netdev, linux-kernel
In-Reply-To: <20180506051859.w7evarps5jdcplz6@gondor.apana.org.au>

[-- Attachment #1: Type: text/plain, Size: 937 bytes --]

On Sun, May 06 2018, Herbert Xu wrote:

> On Sun, May 06, 2018 at 07:48:20AM +1000, NeilBrown wrote:
>>
>> The spinlock protects 2 or more buckets.  The nested table contains at
>> least 512 buckets, maybe more.
>> It is quite possible for two insertions into 2 different buckets to both
>> get their spinlock and both try to instantiate the same nested table.
>
> I think you missed the fact that when we use nested tables the spin
> lock table is limited to just a single page and hence corresponds
> to the first level in the nested table.  Therefore it's always safe.

Yes I had missed that - thanks for pointing it out.
In fact the lock table is limited to the number of nested_tables
in the second level.
And it is the same low-order bits that choose both the lock
and the set of nested tables.
So there isn't a bug here.  So we don't need this patch. (I still like
it though - it seems more obviously correct).

Thanks,
NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox