Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 34/50] netfilter: nft_quota: add stateful object type
From: Pablo Neira Ayuso @ 2016-12-07 21:52 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1481147576-5690-1-git-send-email-pablo@netfilter.org>

Register a new quota stateful object type into the new stateful object
infrastructure.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  1 +
 net/netfilter/nft_quota.c                | 96 +++++++++++++++++++++++++++-----
 2 files changed, 84 insertions(+), 13 deletions(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index e352ef65d753..ad0577ba5d2a 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1187,6 +1187,7 @@ enum nft_fib_flags {
 
 #define NFT_OBJECT_UNSPEC	0
 #define NFT_OBJECT_COUNTER	1
+#define NFT_OBJECT_QUOTA	2
 
 /**
  * enum nft_object_attributes - nf_tables stateful object netlink attributes
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index c00104c07095..09ce72b1d6bf 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -27,12 +27,10 @@ static inline bool nft_overquota(struct nft_quota *priv,
 	return atomic64_sub_return(pkt->skb->len, &priv->remain) < 0;
 }
 
-static void nft_quota_eval(const struct nft_expr *expr,
-			   struct nft_regs *regs,
-			   const struct nft_pktinfo *pkt)
+static inline void nft_quota_do_eval(struct nft_quota *priv,
+				     struct nft_regs *regs,
+				     const struct nft_pktinfo *pkt)
 {
-	struct nft_quota *priv = nft_expr_priv(expr);
-
 	if (nft_overquota(priv, pkt) ^ priv->invert)
 		regs->verdict.code = NFT_BREAK;
 }
@@ -42,11 +40,18 @@ static const struct nla_policy nft_quota_policy[NFTA_QUOTA_MAX + 1] = {
 	[NFTA_QUOTA_FLAGS]	= { .type = NLA_U32 },
 };
 
-static int nft_quota_init(const struct nft_ctx *ctx,
-			  const struct nft_expr *expr,
-			  const struct nlattr * const tb[])
+static void nft_quota_obj_eval(struct nft_object *obj,
+			       struct nft_regs *regs,
+			       const struct nft_pktinfo *pkt)
+{
+	struct nft_quota *priv = nft_obj_data(obj);
+
+	nft_quota_do_eval(priv, regs, pkt);
+}
+
+static int nft_quota_do_init(const struct nlattr * const tb[],
+			     struct nft_quota *priv)
 {
-	struct nft_quota *priv = nft_expr_priv(expr);
 	u32 flags = 0;
 	u64 quota;
 
@@ -70,9 +75,16 @@ static int nft_quota_init(const struct nft_ctx *ctx,
 	return 0;
 }
 
-static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_quota_obj_init(const struct nlattr * const tb[],
+			      struct nft_object *obj)
+{
+	struct nft_quota *priv = nft_obj_data(obj);
+
+	return nft_quota_do_init(tb, priv);
+}
+
+static int nft_quota_do_dump(struct sk_buff *skb, const struct nft_quota *priv)
 {
-	const struct nft_quota *priv = nft_expr_priv(expr);
 	u32 flags = priv->invert ? NFT_QUOTA_F_INV : 0;
 
 	if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(priv->quota),
@@ -85,6 +97,49 @@ static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr)
 	return -1;
 }
 
+static int nft_quota_obj_dump(struct sk_buff *skb, const struct nft_object *obj)
+{
+	struct nft_quota *priv = nft_obj_data(obj);
+
+	return nft_quota_do_dump(skb, priv);
+}
+
+static struct nft_object_type nft_quota_obj __read_mostly = {
+	.type		= NFT_OBJECT_QUOTA,
+	.size		= sizeof(struct nft_quota),
+	.maxattr	= NFTA_QUOTA_MAX,
+	.policy		= nft_quota_policy,
+	.init		= nft_quota_obj_init,
+	.eval		= nft_quota_obj_eval,
+	.dump		= nft_quota_obj_dump,
+	.owner		= THIS_MODULE,
+};
+
+static void nft_quota_eval(const struct nft_expr *expr,
+			   struct nft_regs *regs,
+			   const struct nft_pktinfo *pkt)
+{
+	struct nft_quota *priv = nft_expr_priv(expr);
+
+	nft_quota_do_eval(priv, regs, pkt);
+}
+
+static int nft_quota_init(const struct nft_ctx *ctx,
+			  const struct nft_expr *expr,
+			  const struct nlattr * const tb[])
+{
+	struct nft_quota *priv = nft_expr_priv(expr);
+
+	return nft_quota_do_init(tb, priv);
+}
+
+static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_quota *priv = nft_expr_priv(expr);
+
+	return nft_quota_do_dump(skb, priv);
+}
+
 static struct nft_expr_type nft_quota_type;
 static const struct nft_expr_ops nft_quota_ops = {
 	.type		= &nft_quota_type,
@@ -105,12 +160,26 @@ static struct nft_expr_type nft_quota_type __read_mostly = {
 
 static int __init nft_quota_module_init(void)
 {
-        return nft_register_expr(&nft_quota_type);
+	int err;
+
+	err = nft_register_obj(&nft_quota_obj);
+	if (err < 0)
+		return err;
+
+	err = nft_register_expr(&nft_quota_type);
+	if (err < 0)
+		goto err1;
+
+	return 0;
+err1:
+	nft_unregister_obj(&nft_quota_obj);
+	return err;
 }
 
 static void __exit nft_quota_module_exit(void)
 {
-        nft_unregister_expr(&nft_quota_type);
+	nft_unregister_expr(&nft_quota_type);
+	nft_unregister_obj(&nft_quota_obj);
 }
 
 module_init(nft_quota_module_init);
@@ -119,3 +188,4 @@ module_exit(nft_quota_module_exit);
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
 MODULE_ALIAS_NFT_EXPR("quota");
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_QUOTA);
-- 
2.1.4

^ permalink raw reply related

* [PATCH 32/50] netfilter: nf_tables: add stateful objects
From: Pablo Neira Ayuso @ 2016-12-07 21:52 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1481147576-5690-1-git-send-email-pablo@netfilter.org>

This patch augments nf_tables to support stateful objects. This new
infrastructure allows you to create, dump and delete stateful objects,
that are identified by a user-defined name.

This patch adds the generic infrastructure, follow up patches add
support for two stateful objects: counters and quotas.

This patch provides a native infrastructure for nf_tables to replace
nfacct, the extended accounting infrastructure for iptables.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h        |  79 +++++
 include/uapi/linux/netfilter/nf_tables.h |  29 ++
 net/netfilter/nf_tables_api.c            | 516 +++++++++++++++++++++++++++++++
 3 files changed, 624 insertions(+)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 32970cba184a..903cd618f50e 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -875,6 +875,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv);
  *	@list: used internally
  *	@chains: chains in the table
  *	@sets: sets in the table
+ *	@objects: stateful objects in the table
  *	@hgenerator: handle generator state
  *	@use: number of chain references to this table
  *	@flags: table flag (see enum nft_table_flags)
@@ -885,6 +886,7 @@ struct nft_table {
 	struct list_head		list;
 	struct list_head		chains;
 	struct list_head		sets;
+	struct list_head		objects;
 	u64				hgenerator;
 	u32				use;
 	u16				flags:14,
@@ -935,6 +937,73 @@ int nft_verdict_dump(struct sk_buff *skb, int type,
 		     const struct nft_verdict *v);
 
 /**
+ *	struct nft_object - nf_tables stateful object
+ *
+ *	@list: table stateful object list node
+ *	@type: pointer to object type
+ *	@data: pointer to object data
+ *	@name: name of this stateful object
+ *	@genmask: generation mask
+ *	@use: number of references to this stateful object
+ * 	@data: object data, layout depends on type
+ */
+struct nft_object {
+	struct list_head		list;
+	char				name[NFT_OBJ_MAXNAMELEN];
+	u32				genmask:2,
+					use:30;
+	/* runtime data below here */
+	const struct nft_object_type	*type ____cacheline_aligned;
+	unsigned char			data[]
+		__attribute__((aligned(__alignof__(u64))));
+};
+
+static inline void *nft_obj_data(const struct nft_object *obj)
+{
+	return (void *)obj->data;
+}
+
+#define nft_expr_obj(expr)	*((struct nft_object **)nft_expr_priv(expr))
+
+struct nft_object *nf_tables_obj_lookup(const struct nft_table *table,
+					const struct nlattr *nla, u32 objtype,
+					u8 genmask);
+
+/**
+ *	struct nft_object_type - stateful object type
+ *
+ *	@eval: stateful object evaluation function
+ *	@list: list node in list of object types
+ *	@type: stateful object numeric type
+ *	@size: stateful object size
+ *	@owner: module owner
+ *	@maxattr: maximum netlink attribute
+ *	@policy: netlink attribute policy
+ *	@init: initialize object from netlink attributes
+ *	@destroy: release existing stateful object
+ *	@dump: netlink dump stateful object
+ */
+struct nft_object_type {
+	void				(*eval)(struct nft_object *obj,
+						struct nft_regs *regs,
+						const struct nft_pktinfo *pkt);
+	struct list_head		list;
+	u32				type;
+	unsigned int			size;
+	unsigned int			maxattr;
+	struct module			*owner;
+	const struct nla_policy		*policy;
+	int				(*init)(const struct nlattr * const tb[],
+						struct nft_object *obj);
+	void				(*destroy)(struct nft_object *obj);
+	int				(*dump)(struct sk_buff *skb,
+						const struct nft_object *obj);
+};
+
+int nft_register_obj(struct nft_object_type *obj_type);
+void nft_unregister_obj(struct nft_object_type *obj_type);
+
+/**
  *	struct nft_traceinfo - nft tracing information and state
  *
  *	@pkt: pktinfo currently processed
@@ -981,6 +1050,9 @@ void nft_trace_notify(struct nft_traceinfo *info);
 #define MODULE_ALIAS_NFT_SET() \
 	MODULE_ALIAS("nft-set")
 
+#define MODULE_ALIAS_NFT_OBJ(type) \
+	MODULE_ALIAS("nft-obj-" __stringify(type))
+
 /*
  * The gencursor defines two generations, the currently active and the
  * next one. Objects contain a bitmask of 2 bits specifying the generations
@@ -1157,4 +1229,11 @@ struct nft_trans_elem {
 #define nft_trans_elem(trans)	\
 	(((struct nft_trans_elem *)trans->data)->elem)
 
+struct nft_trans_obj {
+	struct nft_object		*obj;
+};
+
+#define nft_trans_obj(trans)	\
+	(((struct nft_trans_obj *)trans->data)->obj)
+
 #endif /* _NET_NF_TABLES_H */
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index f030e59aa2ec..18e30dbc8c3f 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -4,6 +4,7 @@
 #define NFT_TABLE_MAXNAMELEN	32
 #define NFT_CHAIN_MAXNAMELEN	32
 #define NFT_SET_MAXNAMELEN	32
+#define NFT_OBJ_MAXNAMELEN	32
 #define NFT_USERDATA_MAXLEN	256
 
 /**
@@ -85,6 +86,9 @@ enum nft_verdicts {
  * @NFT_MSG_NEWGEN: announce a new generation, only for events (enum nft_gen_attributes)
  * @NFT_MSG_GETGEN: get the rule-set generation (enum nft_gen_attributes)
  * @NFT_MSG_TRACE: trace event (enum nft_trace_attributes)
+ * @NFT_MSG_NEWOBJ: create a stateful object (enum nft_obj_attributes)
+ * @NFT_MSG_GETOBJ: get a stateful object (enum nft_obj_attributes)
+ * @NFT_MSG_DELOBJ: delete a stateful object (enum nft_obj_attributes)
  */
 enum nf_tables_msg_types {
 	NFT_MSG_NEWTABLE,
@@ -105,6 +109,9 @@ enum nf_tables_msg_types {
 	NFT_MSG_NEWGEN,
 	NFT_MSG_GETGEN,
 	NFT_MSG_TRACE,
+	NFT_MSG_NEWOBJ,
+	NFT_MSG_GETOBJ,
+	NFT_MSG_DELOBJ,
 	NFT_MSG_MAX,
 };
 
@@ -1178,6 +1185,28 @@ enum nft_fib_flags {
 	NFTA_FIB_F_OIF		= 1 << 4,	/* restrict to oif */
 };
 
+#define NFT_OBJECT_UNSPEC	0
+
+/**
+ * enum nft_object_attributes - nf_tables stateful object netlink attributes
+ *
+ * @NFTA_OBJ_TABLE: name of the table containing the expression (NLA_STRING)
+ * @NFTA_OBJ_NAME: name of this expression type (NLA_STRING)
+ * @NFTA_OBJ_TYPE: stateful object type (NLA_U32)
+ * @NFTA_OBJ_DATA: stateful object data (NLA_NESTED)
+ * @NFTA_OBJ_USE: number of references to this expression (NLA_U32)
+ */
+enum nft_object_attributes {
+	NFTA_OBJ_UNSPEC,
+	NFTA_OBJ_TABLE,
+	NFTA_OBJ_NAME,
+	NFTA_OBJ_TYPE,
+	NFTA_OBJ_DATA,
+	NFTA_OBJ_USE,
+	__NFTA_OBJ_MAX
+};
+#define NFTA_OBJ_MAX		(__NFTA_OBJ_MAX - 1)
+
 /**
  * enum nft_trace_attributes - nf_tables trace netlink attributes
  *
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index e5194f6f906c..2ae717c5dcb8 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -22,6 +22,7 @@
 #include <net/sock.h>
 
 static LIST_HEAD(nf_tables_expressions);
+static LIST_HEAD(nf_tables_objects);
 
 /**
  *	nft_register_afinfo - register nf_tables address family info
@@ -304,6 +305,38 @@ static int nft_delset(struct nft_ctx *ctx, struct nft_set *set)
 	return err;
 }
 
+static int nft_trans_obj_add(struct nft_ctx *ctx, int msg_type,
+			     struct nft_object *obj)
+{
+	struct nft_trans *trans;
+
+	trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_obj));
+	if (trans == NULL)
+		return -ENOMEM;
+
+	if (msg_type == NFT_MSG_NEWOBJ)
+		nft_activate_next(ctx->net, obj);
+
+	nft_trans_obj(trans) = obj;
+	list_add_tail(&trans->list, &ctx->net->nft.commit_list);
+
+	return 0;
+}
+
+static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj)
+{
+	int err;
+
+	err = nft_trans_obj_add(ctx, NFT_MSG_DELOBJ, obj);
+	if (err < 0)
+		return err;
+
+	nft_deactivate_next(ctx->net, obj);
+	ctx->table->use--;
+
+	return err;
+}
+
 /*
  * Tables
  */
@@ -688,6 +721,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 	nla_strlcpy(table->name, name, NFT_TABLE_MAXNAMELEN);
 	INIT_LIST_HEAD(&table->chains);
 	INIT_LIST_HEAD(&table->sets);
+	INIT_LIST_HEAD(&table->objects);
 	table->flags = flags;
 
 	nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
@@ -709,6 +743,7 @@ static int nft_flush_table(struct nft_ctx *ctx)
 {
 	int err;
 	struct nft_chain *chain, *nc;
+	struct nft_object *obj, *ne;
 	struct nft_set *set, *ns;
 
 	list_for_each_entry(chain, &ctx->table->chains, list) {
@@ -735,6 +770,12 @@ static int nft_flush_table(struct nft_ctx *ctx)
 			goto out;
 	}
 
+	list_for_each_entry_safe(obj, ne, &ctx->table->objects, list) {
+		err = nft_delobj(ctx, obj);
+		if (err < 0)
+			goto out;
+	}
+
 	list_for_each_entry_safe(chain, nc, &ctx->table->chains, list) {
 		if (!nft_is_active_next(ctx->net, chain))
 			continue;
@@ -3838,6 +3879,434 @@ struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
 }
 EXPORT_SYMBOL_GPL(nft_set_gc_batch_alloc);
 
+/*
+ * Stateful objects
+ */
+
+/**
+ *	nft_register_obj- register nf_tables stateful object type
+ *	@obj: object type
+ *
+ *	Registers the object type for use with nf_tables. Returns zero on
+ *	success or a negative errno code otherwise.
+ */
+int nft_register_obj(struct nft_object_type *obj_type)
+{
+	if (obj_type->type == NFT_OBJECT_UNSPEC)
+		return -EINVAL;
+
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_add_rcu(&obj_type->list, &nf_tables_objects);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nft_register_obj);
+
+/**
+ *	nft_unregister_obj - unregister nf_tables object type
+ *	@obj: object type
+ *
+ * 	Unregisters the object type for use with nf_tables.
+ */
+void nft_unregister_obj(struct nft_object_type *obj_type)
+{
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_del_rcu(&obj_type->list);
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+}
+EXPORT_SYMBOL_GPL(nft_unregister_obj);
+
+struct nft_object *nf_tables_obj_lookup(const struct nft_table *table,
+					const struct nlattr *nla,
+					u32 objtype, u8 genmask)
+{
+	struct nft_object *obj;
+
+	list_for_each_entry(obj, &table->objects, list) {
+		if (!nla_strcmp(nla, obj->name) &&
+		    objtype == obj->type->type &&
+		    nft_active_genmask(obj, genmask))
+			return obj;
+	}
+	return ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL_GPL(nf_tables_obj_lookup);
+
+static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = {
+	[NFTA_OBJ_TABLE]	= { .type = NLA_STRING },
+	[NFTA_OBJ_NAME]		= { .type = NLA_STRING },
+	[NFTA_OBJ_TYPE]		= { .type = NLA_U32 },
+	[NFTA_OBJ_DATA]		= { .type = NLA_NESTED },
+};
+
+static struct nft_object *nft_obj_init(const struct nft_object_type *type,
+				       const struct nlattr *attr)
+{
+	struct nlattr *tb[type->maxattr + 1];
+	struct nft_object *obj;
+	int err;
+
+	if (attr) {
+		err = nla_parse_nested(tb, type->maxattr, attr, type->policy);
+		if (err < 0)
+			goto err1;
+	} else {
+		memset(tb, 0, sizeof(tb[0]) * (type->maxattr + 1));
+	}
+
+	err = -ENOMEM;
+	obj = kzalloc(sizeof(struct nft_object) + type->size, GFP_KERNEL);
+	if (obj == NULL)
+		goto err1;
+
+	err = type->init((const struct nlattr * const *)tb, obj);
+	if (err < 0)
+		goto err2;
+
+	obj->type = type;
+	return obj;
+err2:
+	kfree(obj);
+err1:
+	return ERR_PTR(err);
+}
+
+static int nft_object_dump(struct sk_buff *skb, unsigned int attr,
+			   const struct nft_object *obj)
+{
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, attr);
+	if (!nest)
+		goto nla_put_failure;
+	if (obj->type->dump(skb, obj) < 0)
+		goto nla_put_failure;
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static const struct nft_object_type *__nft_obj_type_get(u32 objtype)
+{
+	const struct nft_object_type *type;
+
+	list_for_each_entry(type, &nf_tables_objects, list) {
+		if (objtype == type->type)
+			return type;
+	}
+	return NULL;
+}
+
+static const struct nft_object_type *nft_obj_type_get(u32 objtype)
+{
+	const struct nft_object_type *type;
+
+	type = __nft_obj_type_get(objtype);
+	if (type != NULL && try_module_get(type->owner))
+		return type;
+
+#ifdef CONFIG_MODULES
+	if (type == NULL) {
+		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+		request_module("nft-obj-%u", objtype);
+		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		if (__nft_obj_type_get(objtype))
+			return ERR_PTR(-EAGAIN);
+	}
+#endif
+	return ERR_PTR(-ENOENT);
+}
+
+static int nf_tables_newobj(struct net *net, struct sock *nlsk,
+			    struct sk_buff *skb, const struct nlmsghdr *nlh,
+			    const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	const struct nft_object_type *type;
+	u8 genmask = nft_genmask_next(net);
+	int family = nfmsg->nfgen_family;
+	struct nft_af_info *afi;
+	struct nft_table *table;
+	struct nft_object *obj;
+	struct nft_ctx ctx;
+	u32 objtype;
+	int err;
+
+	if (!nla[NFTA_OBJ_TYPE] ||
+	    !nla[NFTA_OBJ_NAME] ||
+	    !nla[NFTA_OBJ_DATA])
+		return -EINVAL;
+
+	afi = nf_tables_afinfo_lookup(net, family, true);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+	obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
+	if (IS_ERR(obj)) {
+		err = PTR_ERR(obj);
+		if (err != -ENOENT)
+			return err;
+
+		obj = NULL;
+	}
+
+	if (obj != NULL) {
+		if (nlh->nlmsg_flags & NLM_F_EXCL)
+			return -EEXIST;
+
+		return 0;
+	}
+
+	nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
+
+	type = nft_obj_type_get(objtype);
+	if (IS_ERR(type))
+		return PTR_ERR(type);
+
+	obj = nft_obj_init(type, nla[NFTA_OBJ_DATA]);
+	if (IS_ERR(obj)) {
+		err = PTR_ERR(obj);
+		goto err1;
+	}
+	nla_strlcpy(obj->name, nla[NFTA_OBJ_NAME], NFT_OBJ_MAXNAMELEN);
+
+	err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj);
+	if (err < 0)
+		goto err2;
+
+	list_add_tail_rcu(&obj->list, &table->objects);
+	table->use++;
+	return 0;
+err2:
+	if (obj->type->destroy)
+		obj->type->destroy(obj);
+	kfree(obj);
+err1:
+	module_put(type->owner);
+	return err;
+}
+
+static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net,
+				   u32 portid, u32 seq, int event, u32 flags,
+				   int family, const struct nft_table *table,
+				   const struct nft_object *obj)
+{
+	struct nfgenmsg *nfmsg;
+	struct nlmsghdr *nlh;
+
+	event |= NFNL_SUBSYS_NFTABLES << 8;
+	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags);
+	if (nlh == NULL)
+		goto nla_put_failure;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family	= family;
+	nfmsg->version		= NFNETLINK_V0;
+	nfmsg->res_id		= htons(net->nft.base_seq & 0xffff);
+
+	if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) ||
+	    nla_put_string(skb, NFTA_OBJ_NAME, obj->name) ||
+	    nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->type->type)) ||
+	    nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) ||
+	    nft_object_dump(skb, NFTA_OBJ_DATA, obj))
+		goto nla_put_failure;
+
+	nlmsg_end(skb, nlh);
+	return 0;
+
+nla_put_failure:
+	nlmsg_trim(skb, nlh);
+	return -1;
+}
+
+static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+	const struct nft_object *obj;
+	unsigned int idx = 0, s_idx = cb->args[0];
+	struct net *net = sock_net(skb->sk);
+	int family = nfmsg->nfgen_family;
+
+	rcu_read_lock();
+	cb->seq = net->nft.base_seq;
+
+	list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
+		if (family != NFPROTO_UNSPEC && family != afi->family)
+			continue;
+
+		list_for_each_entry_rcu(table, &afi->tables, list) {
+			list_for_each_entry_rcu(obj, &table->objects, list) {
+				if (!nft_is_active(net, obj))
+					goto cont;
+				if (idx < s_idx)
+					goto cont;
+				if (idx > s_idx)
+					memset(&cb->args[1], 0,
+					       sizeof(cb->args) - sizeof(cb->args[0]));
+				if (nf_tables_fill_obj_info(skb, net, NETLINK_CB(cb->skb).portid,
+							    cb->nlh->nlmsg_seq,
+							    NFT_MSG_NEWOBJ,
+							    NLM_F_MULTI | NLM_F_APPEND,
+							    afi->family, table, obj) < 0)
+					goto done;
+
+				nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+cont:
+				idx++;
+			}
+		}
+	}
+done:
+	rcu_read_unlock();
+
+	cb->args[0] = idx;
+	return skb->len;
+}
+
+static int nf_tables_getobj(struct net *net, struct sock *nlsk,
+			    struct sk_buff *skb, const struct nlmsghdr *nlh,
+			    const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u8 genmask = nft_genmask_cur(net);
+	int family = nfmsg->nfgen_family;
+	const struct nft_af_info *afi;
+	const struct nft_table *table;
+	struct nft_object *obj;
+	struct sk_buff *skb2;
+	u32 objtype;
+	int err;
+
+	if (nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.dump = nf_tables_dump_obj,
+		};
+		return netlink_dump_start(nlsk, skb, nlh, &c);
+	}
+
+	if (!nla[NFTA_OBJ_NAME] ||
+	    !nla[NFTA_OBJ_TYPE])
+		return -EINVAL;
+
+	afi = nf_tables_afinfo_lookup(net, family, false);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+	obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
+	if (IS_ERR(obj))
+		return PTR_ERR(obj);
+
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb2)
+		return -ENOMEM;
+
+	err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid,
+				      nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
+				      family, table, obj);
+	if (err < 0)
+		goto err;
+
+	return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid);
+err:
+	kfree_skb(skb2);
+	return err;
+
+	return 0;
+}
+
+static void nft_obj_destroy(struct nft_object *obj)
+{
+	if (obj->type->destroy)
+		obj->type->destroy(obj);
+
+	module_put(obj->type->owner);
+	kfree(obj);
+}
+
+static int nf_tables_delobj(struct net *net, struct sock *nlsk,
+			      struct sk_buff *skb, const struct nlmsghdr *nlh,
+			      const struct nlattr * const nla[])
+{
+	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u8 genmask = nft_genmask_next(net);
+	int family = nfmsg->nfgen_family;
+	struct nft_af_info *afi;
+	struct nft_table *table;
+	struct nft_object *obj;
+	struct nft_ctx ctx;
+	u32 objtype;
+
+	if (!nla[NFTA_OBJ_TYPE] ||
+	    !nla[NFTA_OBJ_NAME])
+		return -EINVAL;
+
+	afi = nf_tables_afinfo_lookup(net, family, true);
+	if (IS_ERR(afi))
+		return PTR_ERR(afi);
+
+	table = nf_tables_table_lookup(afi, nla[NFTA_OBJ_TABLE], genmask);
+	if (IS_ERR(table))
+		return PTR_ERR(table);
+
+	objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+	obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
+	if (IS_ERR(obj))
+		return PTR_ERR(obj);
+	if (obj->use > 0)
+		return -EBUSY;
+
+	nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
+
+	return nft_delobj(&ctx, obj);
+}
+
+static int nf_tables_obj_notify(const struct nft_ctx *ctx,
+				struct nft_object *obj, int event)
+{
+	struct sk_buff *skb;
+	int err;
+
+	if (!ctx->report &&
+	    !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
+		return 0;
+
+	err = -ENOBUFS;
+	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (skb == NULL)
+		goto err;
+
+	err = nf_tables_fill_obj_info(skb, ctx->net, ctx->portid, ctx->seq,
+				      event, 0, ctx->afi->family, ctx->table,
+				      obj);
+	if (err < 0) {
+		kfree_skb(skb);
+		goto err;
+	}
+
+	err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+			     ctx->report, GFP_KERNEL);
+err:
+	if (err < 0) {
+		nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES,
+				  err);
+	}
+	return err;
+}
+
 static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
 				   u32 portid, u32 seq)
 {
@@ -3998,6 +4467,21 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 	[NFT_MSG_GETGEN] = {
 		.call		= nf_tables_getgen,
 	},
+	[NFT_MSG_NEWOBJ] = {
+		.call_batch	= nf_tables_newobj,
+		.attr_count	= NFTA_OBJ_MAX,
+		.policy		= nft_obj_policy,
+	},
+	[NFT_MSG_GETOBJ] = {
+		.call		= nf_tables_getobj,
+		.attr_count	= NFTA_OBJ_MAX,
+		.policy		= nft_obj_policy,
+	},
+	[NFT_MSG_DELOBJ] = {
+		.call_batch	= nf_tables_delobj,
+		.attr_count	= NFTA_OBJ_MAX,
+		.policy		= nft_obj_policy,
+	},
 };
 
 static void nft_chain_commit_update(struct nft_trans *trans)
@@ -4040,6 +4524,9 @@ static void nf_tables_commit_release(struct nft_trans *trans)
 		nft_set_elem_destroy(nft_trans_elem_set(trans),
 				     nft_trans_elem(trans).priv, true);
 		break;
+	case NFT_MSG_DELOBJ:
+		nft_obj_destroy(nft_trans_obj(trans));
+		break;
 	}
 	kfree(trans);
 }
@@ -4147,6 +4634,17 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 			atomic_dec(&te->set->nelems);
 			te->set->ndeact--;
 			break;
+		case NFT_MSG_NEWOBJ:
+			nft_clear(net, nft_trans_obj(trans));
+			nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans),
+					     NFT_MSG_NEWOBJ);
+			nft_trans_destroy(trans);
+			break;
+		case NFT_MSG_DELOBJ:
+			list_del_rcu(&nft_trans_obj(trans)->list);
+			nf_tables_obj_notify(&trans->ctx, nft_trans_obj(trans),
+					     NFT_MSG_DELOBJ);
+			break;
 		}
 	}
 
@@ -4181,6 +4679,9 @@ static void nf_tables_abort_release(struct nft_trans *trans)
 		nft_set_elem_destroy(nft_trans_elem_set(trans),
 				     nft_trans_elem(trans).priv, true);
 		break;
+	case NFT_MSG_NEWOBJ:
+		nft_obj_destroy(nft_trans_obj(trans));
+		break;
 	}
 	kfree(trans);
 }
@@ -4261,6 +4762,15 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
 
 			nft_trans_destroy(trans);
 			break;
+		case NFT_MSG_NEWOBJ:
+			trans->ctx.table->use--;
+			list_del_rcu(&nft_trans_obj(trans)->list);
+			break;
+		case NFT_MSG_DELOBJ:
+			trans->ctx.table->use++;
+			nft_clear(trans->ctx.net, nft_trans_obj(trans));
+			nft_trans_destroy(trans);
+			break;
 		}
 	}
 
@@ -4807,6 +5317,7 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
 {
 	struct nft_table *table, *nt;
 	struct nft_chain *chain, *nc;
+	struct nft_object *obj, *ne;
 	struct nft_rule *rule, *nr;
 	struct nft_set *set, *ns;
 	struct nft_ctx ctx = {
@@ -4833,6 +5344,11 @@ static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
 			table->use--;
 			nft_set_destroy(set);
 		}
+		list_for_each_entry_safe(obj, ne, &table->objects, list) {
+			list_del(&obj->list);
+			table->use--;
+			nft_obj_destroy(obj);
+		}
 		list_for_each_entry_safe(chain, nc, &table->chains, list) {
 			list_del(&chain->list);
 			table->use--;
-- 
2.1.4

^ permalink raw reply related

* [PATCH 36/50] netfilter: nft_quota: dump consumed quota
From: Pablo Neira Ayuso @ 2016-12-07 21:52 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1481147576-5690-1-git-send-email-pablo@netfilter.org>

Add a new attribute NFTA_QUOTA_CONSUMED that displays the amount of
quota that has been already consumed. This allows us to restore the
internal state of the quota object between reboots as well as to monitor
how wasted it is.

This patch changes the logic to account for the consumed bytes, instead
of the bytes that remain to be consumed.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  2 ++
 net/netfilter/nft_quota.c                | 21 ++++++++++++++++-----
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 1043ce4250c5..3d47582caa80 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -988,12 +988,14 @@ enum nft_quota_flags {
  *
  * @NFTA_QUOTA_BYTES: quota in bytes (NLA_U16)
  * @NFTA_QUOTA_FLAGS: flags (NLA_U32)
+ * @NFTA_QUOTA_CONSUMED: quota already consumed in bytes (NLA_U64)
  */
 enum nft_quota_attributes {
 	NFTA_QUOTA_UNSPEC,
 	NFTA_QUOTA_BYTES,
 	NFTA_QUOTA_FLAGS,
 	NFTA_QUOTA_PAD,
+	NFTA_QUOTA_CONSUMED,
 	__NFTA_QUOTA_MAX
 };
 #define NFTA_QUOTA_MAX		(__NFTA_QUOTA_MAX - 1)
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index 09ce72b1d6bf..0d344209803a 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -18,20 +18,20 @@
 struct nft_quota {
 	u64		quota;
 	bool		invert;
-	atomic64_t	remain;
+	atomic64_t	consumed;
 };
 
 static inline bool nft_overquota(struct nft_quota *priv,
-				 const struct nft_pktinfo *pkt)
+				 const struct sk_buff *skb)
 {
-	return atomic64_sub_return(pkt->skb->len, &priv->remain) < 0;
+	return atomic64_add_return(skb->len, &priv->consumed) >= priv->quota;
 }
 
 static inline void nft_quota_do_eval(struct nft_quota *priv,
 				     struct nft_regs *regs,
 				     const struct nft_pktinfo *pkt)
 {
-	if (nft_overquota(priv, pkt) ^ priv->invert)
+	if (nft_overquota(priv, pkt->skb) ^ priv->invert)
 		regs->verdict.code = NFT_BREAK;
 }
 
@@ -70,7 +70,7 @@ static int nft_quota_do_init(const struct nlattr * const tb[],
 
 	priv->quota = quota;
 	priv->invert = (flags & NFT_QUOTA_F_INV) ? true : false;
-	atomic64_set(&priv->remain, quota);
+	atomic64_set(&priv->consumed, 0);
 
 	return 0;
 }
@@ -86,9 +86,20 @@ static int nft_quota_obj_init(const struct nlattr * const tb[],
 static int nft_quota_do_dump(struct sk_buff *skb, const struct nft_quota *priv)
 {
 	u32 flags = priv->invert ? NFT_QUOTA_F_INV : 0;
+	u64 consumed;
+
+	consumed = atomic64_read(&priv->consumed);
+	/* Since we inconditionally increment consumed quota for each packet
+	 * that we see, don't go over the quota boundary in what we send to
+	 * userspace.
+	 */
+	if (consumed > priv->quota)
+		consumed = priv->quota;
 
 	if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(priv->quota),
 			 NFTA_QUOTA_PAD) ||
+	    nla_put_be64(skb, NFTA_QUOTA_CONSUMED, cpu_to_be64(consumed),
+			 NFTA_QUOTA_PAD) ||
 	    nla_put_be32(skb, NFTA_QUOTA_FLAGS, htonl(flags)))
 		goto nla_put_failure;
 	return 0;
-- 
2.1.4

^ permalink raw reply related

* [PATCH 38/50] netfilter: nf_tables: notify internal updates of stateful objects
From: Pablo Neira Ayuso @ 2016-12-07 21:52 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1481147576-5690-1-git-send-email-pablo@netfilter.org>

Introduce nf_tables_obj_notify() to notify internal state changes in
stateful objects. This is used by the quota object to report depletion
in a follow up patch.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  4 ++++
 net/netfilter/nf_tables_api.c     | 31 +++++++++++++++++++------------
 2 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 6f7d6a1dc09c..339e374c28b5 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -969,6 +969,10 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table,
 					const struct nlattr *nla, u32 objtype,
 					u8 genmask);
 
+int nft_obj_notify(struct net *net, struct nft_table *table,
+		   struct nft_object *obj, u32 portid, u32 seq,
+		   int event, int family, int report, gfp_t gfp);
+
 /**
  *	struct nft_object_type - stateful object type
  *
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index bfc015af366a..9d2ed3f520ef 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4282,38 +4282,45 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk,
 	return nft_delobj(&ctx, obj);
 }
 
-static int nf_tables_obj_notify(const struct nft_ctx *ctx,
-				struct nft_object *obj, int event)
+int nft_obj_notify(struct net *net, struct nft_table *table,
+		   struct nft_object *obj, u32 portid, u32 seq, int event,
+		   int family, int report, gfp_t gfp)
 {
 	struct sk_buff *skb;
 	int err;
 
-	if (!ctx->report &&
-	    !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
+	if (!report &&
+	    !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
 		return 0;
 
 	err = -ENOBUFS;
-	skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	skb = nlmsg_new(NLMSG_GOODSIZE, gfp);
 	if (skb == NULL)
 		goto err;
 
-	err = nf_tables_fill_obj_info(skb, ctx->net, ctx->portid, ctx->seq,
-				      event, 0, ctx->afi->family, ctx->table,
-				      obj, false);
+	err = nf_tables_fill_obj_info(skb, net, portid, seq, event, 0, family,
+				      table, obj, false);
 	if (err < 0) {
 		kfree_skb(skb);
 		goto err;
 	}
 
-	err = nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES,
-			     ctx->report, GFP_KERNEL);
+	err = nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, gfp);
 err:
 	if (err < 0) {
-		nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES,
-				  err);
+		nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, err);
 	}
 	return err;
 }
+EXPORT_SYMBOL_GPL(nft_obj_notify);
+
+static int nf_tables_obj_notify(const struct nft_ctx *ctx,
+				struct nft_object *obj, int event)
+{
+	return nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid,
+			      ctx->seq, event, ctx->afi->family, ctx->report,
+			      GFP_KERNEL);
+}
 
 static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
 				   u32 portid, u32 seq)
-- 
2.1.4

^ permalink raw reply related

* [PATCH 39/50] netfilter: nft_quota: add depleted flag for objects
From: Pablo Neira Ayuso @ 2016-12-07 21:52 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1481147576-5690-1-git-send-email-pablo@netfilter.org>

Notify on depleted quota objects. The NFT_QUOTA_F_DEPLETED flag
indicates we have reached overquota.

Add pointer to table from nft_object, so we can use it when sending the
depletion notification to userspace.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h        |  2 ++
 include/uapi/linux/netfilter/nf_tables.h |  1 +
 net/netfilter/nf_tables_api.c            |  1 +
 net/netfilter/nft_quota.c                | 36 +++++++++++++++++++++++++-------
 4 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 339e374c28b5..ce6fb6e83b32 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -940,6 +940,7 @@ int nft_verdict_dump(struct sk_buff *skb, int type,
  *	struct nft_object - nf_tables stateful object
  *
  *	@list: table stateful object list node
+ *	@table: table this object belongs to
  *	@type: pointer to object type
  *	@data: pointer to object data
  *	@name: name of this stateful object
@@ -950,6 +951,7 @@ int nft_verdict_dump(struct sk_buff *skb, int type,
 struct nft_object {
 	struct list_head		list;
 	char				name[NFT_OBJ_MAXNAMELEN];
+	struct nft_table		*table;
 	u32				genmask:2,
 					use:30;
 	/* runtime data below here */
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 399eac1eee91..4864caca1e8e 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -983,6 +983,7 @@ enum nft_queue_attributes {
 
 enum nft_quota_flags {
 	NFT_QUOTA_F_INV		= (1 << 0),
+	NFT_QUOTA_F_DEPLETED	= (1 << 1),
 };
 
 /**
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 9d2ed3f520ef..c5419701ca79 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4075,6 +4075,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
 		err = PTR_ERR(obj);
 		goto err1;
 	}
+	obj->table = table;
 	nla_strlcpy(obj->name, nla[NFTA_OBJ_NAME], NFT_OBJ_MAXNAMELEN);
 
 	err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj);
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index 5d25f57497cb..7f27ebdce7ab 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -17,7 +17,7 @@
 
 struct nft_quota {
 	u64		quota;
-	bool		invert;
+	unsigned long	flags;
 	atomic64_t	consumed;
 };
 
@@ -27,11 +27,16 @@ static inline bool nft_overquota(struct nft_quota *priv,
 	return atomic64_add_return(skb->len, &priv->consumed) >= priv->quota;
 }
 
+static inline bool nft_quota_invert(struct nft_quota *priv)
+{
+	return priv->flags & NFT_QUOTA_F_INV;
+}
+
 static inline void nft_quota_do_eval(struct nft_quota *priv,
 				     struct nft_regs *regs,
 				     const struct nft_pktinfo *pkt)
 {
-	if (nft_overquota(priv, pkt->skb) ^ priv->invert)
+	if (nft_overquota(priv, pkt->skb) ^ nft_quota_invert(priv))
 		regs->verdict.code = NFT_BREAK;
 }
 
@@ -40,19 +45,29 @@ static const struct nla_policy nft_quota_policy[NFTA_QUOTA_MAX + 1] = {
 	[NFTA_QUOTA_FLAGS]	= { .type = NLA_U32 },
 };
 
+#define NFT_QUOTA_DEPLETED_BIT	1	/* From NFT_QUOTA_F_DEPLETED. */
+
 static void nft_quota_obj_eval(struct nft_object *obj,
 			       struct nft_regs *regs,
 			       const struct nft_pktinfo *pkt)
 {
 	struct nft_quota *priv = nft_obj_data(obj);
+	bool overquota;
 
-	nft_quota_do_eval(priv, regs, pkt);
+	overquota = nft_overquota(priv, pkt->skb);
+	if (overquota ^ nft_quota_invert(priv))
+		regs->verdict.code = NFT_BREAK;
+
+	if (overquota &&
+	    !test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags))
+		nft_obj_notify(nft_net(pkt), obj->table, obj, 0, 0,
+			       NFT_MSG_NEWOBJ, nft_pf(pkt), 0, GFP_ATOMIC);
 }
 
 static int nft_quota_do_init(const struct nlattr * const tb[],
 			     struct nft_quota *priv)
 {
-	u32 flags = 0;
+	unsigned long flags = 0;
 	u64 quota;
 
 	if (!tb[NFTA_QUOTA_BYTES])
@@ -66,10 +81,12 @@ static int nft_quota_do_init(const struct nlattr * const tb[],
 		flags = ntohl(nla_get_be32(tb[NFTA_QUOTA_FLAGS]));
 		if (flags & ~NFT_QUOTA_F_INV)
 			return -EINVAL;
+		if (flags & NFT_QUOTA_F_DEPLETED)
+			return -EOPNOTSUPP;
 	}
 
 	priv->quota = quota;
-	priv->invert = (flags & NFT_QUOTA_F_INV) ? true : false;
+	priv->flags = flags;
 	atomic64_set(&priv->consumed, 0);
 
 	return 0;
@@ -86,13 +103,16 @@ static int nft_quota_obj_init(const struct nlattr * const tb[],
 static int nft_quota_do_dump(struct sk_buff *skb, struct nft_quota *priv,
 			     bool reset)
 {
-	u32 flags = priv->invert ? NFT_QUOTA_F_INV : 0;
+	u32 flags = priv->flags;
 	u64 consumed;
 
-	if (reset)
+	if (reset) {
 		consumed = atomic64_xchg(&priv->consumed, 0);
-	else
+		if (test_and_clear_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags))
+			flags |= NFT_QUOTA_F_DEPLETED;
+	} else {
 		consumed = atomic64_read(&priv->consumed);
+	}
 
 	/* Since we inconditionally increment consumed quota for each packet
 	 * that we see, don't go over the quota boundary in what we send to
-- 
2.1.4

^ permalink raw reply related

* [PATCH 40/50] netfilter: nf_tables: add stateful object reference to set elements
From: Pablo Neira Ayuso @ 2016-12-07 21:52 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1481147576-5690-1-git-send-email-pablo@netfilter.org>

This patch allows you to refer to stateful objects from set elements.
This provides the infrastructure to create maps where the right hand
side of the mapping is a stateful object.

This allows us to build dictionaries of stateful objects, that you can
use to perform fast lookups using any arbitrary key combination.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h        |  9 ++++
 include/uapi/linux/netfilter/nf_tables.h |  8 ++++
 net/netfilter/nf_tables_api.c            | 72 +++++++++++++++++++++++++++-----
 3 files changed, 79 insertions(+), 10 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index ce6fb6e83b32..85f0f03f1e87 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -326,6 +326,7 @@ void nft_unregister_set(struct nft_set_ops *ops);
  * 	@name: name of the set
  * 	@ktype: key type (numeric type defined by userspace, not used in the kernel)
  * 	@dtype: data type (verdict or numeric type defined by userspace)
+ * 	@objtype: object type (see NFT_OBJECT_* definitions)
  * 	@size: maximum set size
  * 	@nelems: number of elements
  * 	@ndeact: number of deactivated elements queued for removal
@@ -347,6 +348,7 @@ struct nft_set {
 	char				name[NFT_SET_MAXNAMELEN];
 	u32				ktype;
 	u32				dtype;
+	u32				objtype;
 	u32				size;
 	atomic_t			nelems;
 	u32				ndeact;
@@ -416,6 +418,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
  *	@NFT_SET_EXT_EXPIRATION: element expiration time
  *	@NFT_SET_EXT_USERDATA: user data associated with the element
  *	@NFT_SET_EXT_EXPR: expression assiociated with the element
+ *	@NFT_SET_EXT_OBJREF: stateful object reference associated with element
  *	@NFT_SET_EXT_NUM: number of extension types
  */
 enum nft_set_extensions {
@@ -426,6 +429,7 @@ enum nft_set_extensions {
 	NFT_SET_EXT_EXPIRATION,
 	NFT_SET_EXT_USERDATA,
 	NFT_SET_EXT_EXPR,
+	NFT_SET_EXT_OBJREF,
 	NFT_SET_EXT_NUM
 };
 
@@ -554,6 +558,11 @@ static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set,
 	return elem + set->ops->elemsize;
 }
 
+static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext)
+{
+	return nft_set_ext(ext, NFT_SET_EXT_OBJREF);
+}
+
 void *nft_set_elem_init(const struct nft_set *set,
 			const struct nft_set_ext_tmpl *tmpl,
 			const u32 *key, const u32 *data,
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 4864caca1e8e..a6b52dbff08c 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -255,6 +255,7 @@ enum nft_rule_compat_attributes {
  * @NFT_SET_MAP: set is used as a dictionary
  * @NFT_SET_TIMEOUT: set uses timeouts
  * @NFT_SET_EVAL: set contains expressions for evaluation
+ * @NFT_SET_OBJECT: set contains stateful objects
  */
 enum nft_set_flags {
 	NFT_SET_ANONYMOUS		= 0x1,
@@ -263,6 +264,7 @@ enum nft_set_flags {
 	NFT_SET_MAP			= 0x8,
 	NFT_SET_TIMEOUT			= 0x10,
 	NFT_SET_EVAL			= 0x20,
+	NFT_SET_OBJECT			= 0x40,
 };
 
 /**
@@ -304,6 +306,7 @@ enum nft_set_desc_attributes {
  * @NFTA_SET_TIMEOUT: default timeout value (NLA_U64)
  * @NFTA_SET_GC_INTERVAL: garbage collection interval (NLA_U32)
  * @NFTA_SET_USERDATA: user data (NLA_BINARY)
+ * @NFTA_SET_OBJ_TYPE: stateful object type (NLA_U32: NFT_OBJECT_*)
  */
 enum nft_set_attributes {
 	NFTA_SET_UNSPEC,
@@ -321,6 +324,7 @@ enum nft_set_attributes {
 	NFTA_SET_GC_INTERVAL,
 	NFTA_SET_USERDATA,
 	NFTA_SET_PAD,
+	NFTA_SET_OBJ_TYPE,
 	__NFTA_SET_MAX
 };
 #define NFTA_SET_MAX		(__NFTA_SET_MAX - 1)
@@ -344,6 +348,7 @@ enum nft_set_elem_flags {
  * @NFTA_SET_ELEM_EXPIRATION: expiration time (NLA_U64)
  * @NFTA_SET_ELEM_USERDATA: user data (NLA_BINARY)
  * @NFTA_SET_ELEM_EXPR: expression (NLA_NESTED: nft_expr_attributes)
+ * @NFTA_SET_ELEM_OBJREF: stateful object reference (NLA_STRING)
  */
 enum nft_set_elem_attributes {
 	NFTA_SET_ELEM_UNSPEC,
@@ -355,6 +360,7 @@ enum nft_set_elem_attributes {
 	NFTA_SET_ELEM_USERDATA,
 	NFTA_SET_ELEM_EXPR,
 	NFTA_SET_ELEM_PAD,
+	NFTA_SET_ELEM_OBJREF,
 	__NFTA_SET_ELEM_MAX
 };
 #define NFTA_SET_ELEM_MAX	(__NFTA_SET_ELEM_MAX - 1)
@@ -1207,6 +1213,8 @@ enum nft_fib_flags {
 #define NFT_OBJECT_UNSPEC	0
 #define NFT_OBJECT_COUNTER	1
 #define NFT_OBJECT_QUOTA	2
+#define __NFT_OBJECT_MAX	3
+#define NFT_OBJECT_MAX		(__NFT_OBJECT_MAX - 1)
 
 /**
  * enum nft_object_attributes - nf_tables stateful object netlink attributes
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index c5419701ca79..8228714c42d5 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2452,6 +2452,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
 	[NFTA_SET_GC_INTERVAL]		= { .type = NLA_U32 },
 	[NFTA_SET_USERDATA]		= { .type = NLA_BINARY,
 					    .len  = NFT_USERDATA_MAXLEN },
+	[NFTA_SET_OBJ_TYPE]		= { .type = NLA_U32 },
 };
 
 static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
@@ -2609,6 +2610,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
 		if (nla_put_be32(skb, NFTA_SET_DATA_LEN, htonl(set->dlen)))
 			goto nla_put_failure;
 	}
+	if (set->flags & NFT_SET_OBJECT &&
+	    nla_put_be32(skb, NFTA_SET_OBJ_TYPE, htonl(set->objtype)))
+		goto nla_put_failure;
 
 	if (set->timeout &&
 	    nla_put_be64(skb, NFTA_SET_TIMEOUT,
@@ -2838,7 +2842,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 	unsigned int size;
 	bool create;
 	u64 timeout;
-	u32 ktype, dtype, flags, policy, gc_int;
+	u32 ktype, dtype, flags, policy, gc_int, objtype;
 	struct nft_set_desc desc;
 	unsigned char *udata;
 	u16 udlen;
@@ -2868,11 +2872,12 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 		flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
 		if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT |
 			      NFT_SET_INTERVAL | NFT_SET_TIMEOUT |
-			      NFT_SET_MAP | NFT_SET_EVAL))
+			      NFT_SET_MAP | NFT_SET_EVAL |
+			      NFT_SET_OBJECT))
 			return -EINVAL;
-		/* Only one of both operations is supported */
-		if ((flags & (NFT_SET_MAP | NFT_SET_EVAL)) ==
-			     (NFT_SET_MAP | NFT_SET_EVAL))
+		/* Only one of these operations is supported */
+		if ((flags & (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT)) ==
+			     (NFT_SET_MAP | NFT_SET_EVAL | NFT_SET_OBJECT))
 			return -EOPNOTSUPP;
 	}
 
@@ -2897,6 +2902,19 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 	} else if (flags & NFT_SET_MAP)
 		return -EINVAL;
 
+	if (nla[NFTA_SET_OBJ_TYPE] != NULL) {
+		if (!(flags & NFT_SET_OBJECT))
+			return -EINVAL;
+
+		objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE]));
+		if (objtype == NFT_OBJECT_UNSPEC ||
+		    objtype > NFT_OBJECT_MAX)
+			return -EINVAL;
+	} else if (flags & NFT_SET_OBJECT)
+		return -EINVAL;
+	else
+		objtype = NFT_OBJECT_UNSPEC;
+
 	timeout = 0;
 	if (nla[NFTA_SET_TIMEOUT] != NULL) {
 		if (!(flags & NFT_SET_TIMEOUT))
@@ -2984,6 +3002,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 	set->ktype = ktype;
 	set->klen  = desc.klen;
 	set->dtype = dtype;
+	set->objtype = objtype;
 	set->dlen  = desc.dlen;
 	set->flags = flags;
 	set->size  = desc.size;
@@ -3126,6 +3145,10 @@ const struct nft_set_ext_type nft_set_ext_types[] = {
 	[NFT_SET_EXT_EXPR]		= {
 		.align	= __alignof__(struct nft_expr),
 	},
+	[NFT_SET_EXT_OBJREF]		= {
+		.len	= sizeof(struct nft_object *),
+		.align	= __alignof__(struct nft_object *),
+	},
 	[NFT_SET_EXT_FLAGS]		= {
 		.len	= sizeof(u8),
 		.align	= __alignof__(u8),
@@ -3214,6 +3237,11 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
 	    nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, nft_set_ext_expr(ext)) < 0)
 		goto nla_put_failure;
 
+	if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) &&
+	    nla_put_string(skb, NFTA_SET_ELEM_OBJREF,
+			   (*nft_set_ext_obj(ext))->name) < 0)
+		goto nla_put_failure;
+
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
 	    nla_put_be32(skb, NFTA_SET_ELEM_FLAGS,
 		         htonl(*nft_set_ext_flags(ext))))
@@ -3508,7 +3536,8 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
 		nft_data_uninit(nft_set_ext_data(ext), set->dtype);
 	if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
 		nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext));
-
+	if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
+		(*nft_set_ext_obj(ext))->use--;
 	kfree(elem);
 }
 EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
@@ -3533,11 +3562,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 			    const struct nlattr *attr, u32 nlmsg_flags)
 {
 	struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
+	u8 genmask = nft_genmask_next(ctx->net);
 	struct nft_data_desc d1, d2;
 	struct nft_set_ext_tmpl tmpl;
 	struct nft_set_ext *ext, *ext2;
 	struct nft_set_elem elem;
 	struct nft_set_binding *binding;
+	struct nft_object *obj = NULL;
 	struct nft_userdata *udata;
 	struct nft_data data;
 	enum nft_registers dreg;
@@ -3600,6 +3631,20 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 			nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
 	}
 
+	if (nla[NFTA_SET_ELEM_OBJREF] != NULL) {
+		if (!(set->flags & NFT_SET_OBJECT)) {
+			err = -EINVAL;
+			goto err2;
+		}
+		obj = nf_tables_obj_lookup(ctx->table, nla[NFTA_SET_ELEM_OBJREF],
+					   set->objtype, genmask);
+		if (IS_ERR(obj)) {
+			err = PTR_ERR(obj);
+			goto err2;
+		}
+		nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF);
+	}
+
 	if (nla[NFTA_SET_ELEM_DATA] != NULL) {
 		err = nft_data_init(ctx, &data, sizeof(data), &d2,
 				    nla[NFTA_SET_ELEM_DATA]);
@@ -3658,6 +3703,10 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 		udata->len = ulen - 1;
 		nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen);
 	}
+	if (obj) {
+		*nft_set_ext_obj(ext) = obj;
+		obj->use++;
+	}
 
 	trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set);
 	if (trans == NULL)
@@ -3667,10 +3716,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	err = set->ops->insert(ctx->net, set, &elem, &ext2);
 	if (err) {
 		if (err == -EEXIST) {
-			if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
-			    nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) &&
-			    memcmp(nft_set_ext_data(ext),
-				   nft_set_ext_data(ext2), set->dlen) != 0)
+			if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
+			     nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) &&
+			     memcmp(nft_set_ext_data(ext),
+				    nft_set_ext_data(ext2), set->dlen) != 0) ||
+			    (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) &&
+			     nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF) &&
+			     *nft_set_ext_obj(ext) != *nft_set_ext_obj(ext2)))
 				err = -EBUSY;
 			else if (!(nlmsg_flags & NLM_F_EXCL))
 				err = 0;
-- 
2.1.4

^ permalink raw reply related

* [PATCH 41/50] netfilter: nft_objref: support for stateful object maps
From: Pablo Neira Ayuso @ 2016-12-07 21:52 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1481147576-5690-1-git-send-email-pablo@netfilter.org>

This patch allows us to refer to stateful object dictionaries, the
source register indicates the key data to be used to look up for the
corresponding state object. We can refer to these maps through names or,
alternatively, the map transaction id. This allows us to refer to both
anonymous and named maps.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |   6 ++
 net/netfilter/nf_tables_api.c            |   4 ++
 net/netfilter/nft_objref.c               | 116 ++++++++++++++++++++++++++++++-
 3 files changed, 125 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index a6b52dbff08c..881d49e94569 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1153,11 +1153,17 @@ enum nft_fwd_attributes {
  *
  * @NFTA_OBJREF_IMM_TYPE: object type for immediate reference (NLA_U32: nft_register)
  * @NFTA_OBJREF_IMM_NAME: object name for immediate reference (NLA_STRING)
+ * @NFTA_OBJREF_SET_SREG: source register of the data to look for (NLA_U32: nft_registers)
+ * @NFTA_OBJREF_SET_NAME: name of the set where to look for (NLA_STRING)
+ * @NFTA_OBJREF_SET_ID: id of the set where to look for in this transaction (NLA_U32)
  */
 enum nft_objref_attributes {
 	NFTA_OBJREF_UNSPEC,
 	NFTA_OBJREF_IMM_TYPE,
 	NFTA_OBJREF_IMM_NAME,
+	NFTA_OBJREF_SET_SREG,
+	NFTA_OBJREF_SET_NAME,
+	NFTA_OBJREF_SET_ID,
 	__NFTA_OBJREF_MAX
 };
 #define NFTA_OBJREF_MAX	(__NFTA_OBJREF_MAX - 1)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 8228714c42d5..b4db5bf4c135 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2504,6 +2504,7 @@ struct nft_set *nf_tables_set_lookup(const struct nft_table *table,
 	}
 	return ERR_PTR(-ENOENT);
 }
+EXPORT_SYMBOL_GPL(nf_tables_set_lookup);
 
 struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
 					  const struct nlattr *nla,
@@ -2522,6 +2523,7 @@ struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
 	}
 	return ERR_PTR(-ENOENT);
 }
+EXPORT_SYMBOL_GPL(nf_tables_set_lookup_byid);
 
 static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
 				    const char *name)
@@ -3124,6 +3126,7 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
 	list_add_tail_rcu(&binding->list, &set->bindings);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(nf_tables_bind_set);
 
 void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
 			  struct nft_set_binding *binding)
@@ -3134,6 +3137,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
 	    nft_is_active(ctx->net, set))
 		nf_tables_set_destroy(ctx, set);
 }
+EXPORT_SYMBOL_GPL(nf_tables_unbind_set);
 
 const struct nft_set_ext_type nft_set_ext_types[] = {
 	[NFT_SET_EXT_KEY]		= {
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index 23820f796aad..415a65ba2b85 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -81,14 +81,128 @@ static const struct nft_expr_ops nft_objref_ops = {
 	.dump		= nft_objref_dump,
 };
 
+struct nft_objref_map {
+	struct nft_set		*set;
+	enum nft_registers	sreg:8;
+	struct nft_set_binding	binding;
+};
+
+static void nft_objref_map_eval(const struct nft_expr *expr,
+				struct nft_regs *regs,
+				const struct nft_pktinfo *pkt)
+{
+	struct nft_objref_map *priv = nft_expr_priv(expr);
+	const struct nft_set *set = priv->set;
+	const struct nft_set_ext *ext;
+	struct nft_object *obj;
+	bool found;
+
+	found = set->ops->lookup(nft_net(pkt), set, &regs->data[priv->sreg],
+				 &ext);
+	if (!found) {
+		regs->verdict.code = NFT_BREAK;
+		return;
+	}
+	obj = *nft_set_ext_obj(ext);
+	obj->type->eval(obj, regs, pkt);
+}
+
+static int nft_objref_map_init(const struct nft_ctx *ctx,
+			       const struct nft_expr *expr,
+			       const struct nlattr * const tb[])
+{
+	struct nft_objref_map *priv = nft_expr_priv(expr);
+	u8 genmask = nft_genmask_next(ctx->net);
+	struct nft_set *set;
+	int err;
+
+	set = nf_tables_set_lookup(ctx->table, tb[NFTA_OBJREF_SET_NAME], genmask);
+	if (IS_ERR(set)) {
+		if (tb[NFTA_OBJREF_SET_ID]) {
+			set = nf_tables_set_lookup_byid(ctx->net,
+							tb[NFTA_OBJREF_SET_ID],
+							genmask);
+		}
+		if (IS_ERR(set))
+			return PTR_ERR(set);
+	}
+
+	if (!(set->flags & NFT_SET_OBJECT))
+		return -EINVAL;
+
+	priv->sreg = nft_parse_register(tb[NFTA_OBJREF_SET_SREG]);
+	err = nft_validate_register_load(priv->sreg, set->klen);
+	if (err < 0)
+		return err;
+
+	priv->binding.flags = set->flags & NFT_SET_OBJECT;
+
+	err = nf_tables_bind_set(ctx, set, &priv->binding);
+	if (err < 0)
+		return err;
+
+	priv->set = set;
+	return 0;
+}
+
+static int nft_objref_map_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_objref_map *priv = nft_expr_priv(expr);
+
+	if (nft_dump_register(skb, NFTA_OBJREF_SET_SREG, priv->sreg) ||
+	    nla_put_string(skb, NFTA_OBJREF_SET_NAME, priv->set->name))
+		goto nla_put_failure;
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
+static void nft_objref_map_destroy(const struct nft_ctx *ctx,
+				   const struct nft_expr *expr)
+{
+	struct nft_objref_map *priv = nft_expr_priv(expr);
+
+	nf_tables_unbind_set(ctx, priv->set, &priv->binding);
+}
+
+static struct nft_expr_type nft_objref_type;
+static const struct nft_expr_ops nft_objref_map_ops = {
+	.type		= &nft_objref_type,
+	.size		= NFT_EXPR_SIZE(sizeof(struct nft_objref_map)),
+	.eval		= nft_objref_map_eval,
+	.init		= nft_objref_map_init,
+	.destroy	= nft_objref_map_destroy,
+	.dump		= nft_objref_map_dump,
+};
+
+static const struct nft_expr_ops *
+nft_objref_select_ops(const struct nft_ctx *ctx,
+                      const struct nlattr * const tb[])
+{
+	if (tb[NFTA_OBJREF_SET_SREG] &&
+	    (tb[NFTA_OBJREF_SET_NAME] ||
+	     tb[NFTA_OBJREF_SET_ID]))
+		return &nft_objref_map_ops;
+	else if (tb[NFTA_OBJREF_IMM_NAME] &&
+		 tb[NFTA_OBJREF_IMM_TYPE])
+		return &nft_objref_ops;
+
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
 static const struct nla_policy nft_objref_policy[NFTA_OBJREF_MAX + 1] = {
 	[NFTA_OBJREF_IMM_NAME]	= { .type = NLA_STRING },
 	[NFTA_OBJREF_IMM_TYPE]	= { .type = NLA_U32 },
+	[NFTA_OBJREF_SET_SREG]	= { .type = NLA_U32 },
+	[NFTA_OBJREF_SET_NAME]	= { .type = NLA_STRING },
+	[NFTA_OBJREF_SET_ID]	= { .type = NLA_U32 },
 };
 
 static struct nft_expr_type nft_objref_type __read_mostly = {
 	.name		= "objref",
-	.ops		= &nft_objref_ops,
+	.select_ops	= nft_objref_select_ops,
 	.policy		= nft_objref_policy,
 	.maxattr	= NFTA_OBJREF_MAX,
 	.owner		= THIS_MODULE,
-- 
2.1.4

^ permalink raw reply related

* [PATCH 44/50] netfilter: nat: skip checksum on offload SCTP packets
From: Pablo Neira Ayuso @ 2016-12-07 21:52 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1481147576-5690-1-git-send-email-pablo@netfilter.org>

From: Davide Caratti <dcaratti@redhat.com>

SCTP GSO and hardware can do CRC32c computation after netfilter processing,
so we can avoid calling sctp_compute_checksum() on skb if skb->ip_summed
is equal to CHECKSUM_PARTIAL. Moreover, set skb->ip_summed to CHECKSUM_NONE
when the NAT code computes the CRC, to prevent offloaders from computing
it again (on ixgbe this resulted in a transmission with wrong L4 checksum).

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_nat_proto_sctp.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c
index 2e14108ff697..31d358691af0 100644
--- a/net/netfilter/nf_nat_proto_sctp.c
+++ b/net/netfilter/nf_nat_proto_sctp.c
@@ -47,7 +47,10 @@ sctp_manip_pkt(struct sk_buff *skb,
 		hdr->dest = tuple->dst.u.sctp.port;
 	}
 
-	hdr->checksum = sctp_compute_cksum(skb, hdroff);
+	if (skb->ip_summed != CHECKSUM_PARTIAL) {
+		hdr->checksum = sctp_compute_cksum(skb, hdroff);
+		skb->ip_summed = CHECKSUM_NONE;
+	}
 
 	return true;
 }
-- 
2.1.4

^ permalink raw reply related

* [PATCH 46/50] netfilter: nft_set: introduce nft_{hash, rbtree}_deactivate_one()
From: Pablo Neira Ayuso @ 2016-12-07 21:52 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1481147576-5690-1-git-send-email-pablo@netfilter.org>

This new function allows us to deactivate one single element, this is
required by the set flush command that comes in a follow up patch.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_set_hash.c   | 24 +++++++++++++++++-------
 net/netfilter/nft_set_rbtree.c | 11 ++++++++++-
 2 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index a3dface3e6e6..73f7687c5656 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -167,6 +167,19 @@ static void nft_hash_activate(const struct net *net, const struct nft_set *set,
 	nft_set_elem_clear_busy(&he->ext);
 }
 
+static bool nft_hash_deactivate_one(const struct net *net,
+				    const struct nft_set *set, void *priv)
+{
+	struct nft_hash_elem *he = priv;
+
+	if (!nft_set_elem_mark_busy(&he->ext) ||
+	    !nft_is_active(net, &he->ext)) {
+		nft_set_elem_change_active(net, set, &he->ext);
+		return true;
+	}
+	return false;
+}
+
 static void *nft_hash_deactivate(const struct net *net,
 				 const struct nft_set *set,
 				 const struct nft_set_elem *elem)
@@ -181,13 +194,10 @@ static void *nft_hash_deactivate(const struct net *net,
 
 	rcu_read_lock();
 	he = rhashtable_lookup_fast(&priv->ht, &arg, nft_hash_params);
-	if (he != NULL) {
-		if (!nft_set_elem_mark_busy(&he->ext) ||
-		    !nft_is_active(net, &he->ext))
-			nft_set_elem_change_active(net, set, &he->ext);
-		else
-			he = NULL;
-	}
+	if (he != NULL &&
+	    !nft_hash_deactivate_one(net, set, he))
+		he = NULL;
+
 	rcu_read_unlock();
 
 	return he;
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 36493a7cae88..5580bb64dc0f 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -171,6 +171,15 @@ static void nft_rbtree_activate(const struct net *net,
 	nft_set_elem_change_active(net, set, &rbe->ext);
 }
 
+static bool nft_rbtree_deactivate_one(const struct net *net,
+				      const struct nft_set *set, void *priv)
+{
+	struct nft_rbtree_elem *rbe = priv;
+
+	nft_set_elem_change_active(net, set, &rbe->ext);
+	return true;
+}
+
 static void *nft_rbtree_deactivate(const struct net *net,
 				   const struct nft_set *set,
 				   const struct nft_set_elem *elem)
@@ -204,7 +213,7 @@ static void *nft_rbtree_deactivate(const struct net *net,
 				parent = parent->rb_right;
 				continue;
 			}
-			nft_set_elem_change_active(net, set, &rbe->ext);
+			nft_rbtree_deactivate_one(net, set, rbe);
 			return rbe;
 		}
 	}
-- 
2.1.4

^ permalink raw reply related

* [PATCH 45/50] netfilter: nf_tables: constify struct nft_ctx * parameter in nft_trans_alloc()
From: Pablo Neira Ayuso @ 2016-12-07 21:52 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1481147576-5690-1-git-send-email-pablo@netfilter.org>

Context is not modified by nft_trans_alloc(), so constify it.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index b04d4ee1d533..b42059795819 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -111,8 +111,8 @@ static void nft_ctx_init(struct nft_ctx *ctx,
 	ctx->seq	= nlh->nlmsg_seq;
 }
 
-static struct nft_trans *nft_trans_alloc(struct nft_ctx *ctx, int msg_type,
-					 u32 size)
+static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx,
+					 int msg_type, u32 size)
 {
 	struct nft_trans *trans;
 
-- 
2.1.4

^ permalink raw reply related

* [PATCH 48/50] netfilter: x_tables: avoid warn and OOM killer on vmalloc call
From: Pablo Neira Ayuso @ 2016-12-07 21:52 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1481147576-5690-1-git-send-email-pablo@netfilter.org>

From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>

Andrey Konovalov reported that this vmalloc call is based on an
userspace request and that it's spewing traces, which may flood the logs
and cause DoS if abused.

Florian Westphal also mentioned that this call should not trigger OOM
killer.

This patch brings the vmalloc call in sync to kmalloc and disables the
warn trace on allocation failure and also disable OOM killer invocation.

Note, however, that under such stress situation, other places may
trigger OOM killer invocation.

Reported-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Florian Westphal <fw@strlen.de>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/x_tables.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index f6ce4a7036e6..2ff499680cc6 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -959,7 +959,9 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
 	if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
 		info = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
 	if (!info) {
-		info = vmalloc(sz);
+		info = __vmalloc(sz, GFP_KERNEL | __GFP_NOWARN |
+				     __GFP_NORETRY | __GFP_HIGHMEM,
+				 PAGE_KERNEL);
 		if (!info)
 			return NULL;
 	}
-- 
2.1.4

^ permalink raw reply related

* [PATCH 50/50] netfilter: nft_quota: allow to restore consumed quota
From: Pablo Neira Ayuso @ 2016-12-07 21:52 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <1481147576-5690-1-git-send-email-pablo@netfilter.org>

Allow to restore consumed quota, this is useful to restore the quota
state across reboots.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_quota.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index 7f27ebdce7ab..bd6efc53f26d 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -43,6 +43,7 @@ static inline void nft_quota_do_eval(struct nft_quota *priv,
 static const struct nla_policy nft_quota_policy[NFTA_QUOTA_MAX + 1] = {
 	[NFTA_QUOTA_BYTES]	= { .type = NLA_U64 },
 	[NFTA_QUOTA_FLAGS]	= { .type = NLA_U32 },
+	[NFTA_QUOTA_CONSUMED]	= { .type = NLA_U64 },
 };
 
 #define NFT_QUOTA_DEPLETED_BIT	1	/* From NFT_QUOTA_F_DEPLETED. */
@@ -68,7 +69,7 @@ static int nft_quota_do_init(const struct nlattr * const tb[],
 			     struct nft_quota *priv)
 {
 	unsigned long flags = 0;
-	u64 quota;
+	u64 quota, consumed = 0;
 
 	if (!tb[NFTA_QUOTA_BYTES])
 		return -EINVAL;
@@ -77,6 +78,12 @@ static int nft_quota_do_init(const struct nlattr * const tb[],
 	if (quota > S64_MAX)
 		return -EOVERFLOW;
 
+	if (tb[NFTA_QUOTA_CONSUMED]) {
+		consumed = be64_to_cpu(nla_get_be64(tb[NFTA_QUOTA_CONSUMED]));
+		if (consumed > quota)
+			return -EINVAL;
+	}
+
 	if (tb[NFTA_QUOTA_FLAGS]) {
 		flags = ntohl(nla_get_be32(tb[NFTA_QUOTA_FLAGS]));
 		if (flags & ~NFT_QUOTA_F_INV)
@@ -87,7 +94,7 @@ static int nft_quota_do_init(const struct nlattr * const tb[],
 
 	priv->quota = quota;
 	priv->flags = flags;
-	atomic64_set(&priv->consumed, 0);
+	atomic64_set(&priv->consumed, consumed);
 
 	return 0;
 }
-- 
2.1.4

^ permalink raw reply related

* RE: [net-next 20/20] i40e: don't allow i40e_vsi_(add|kill)_vlan to operate when VID<1
From: Keller, Jacob E @ 2016-12-07 22:00 UTC (permalink / raw)
  To: Kirsher, Jeffrey T, Sergei Shtylyov, davem@davemloft.net
  Cc: netdev@vger.kernel.org, nhorman@redhat.com, sassmann@redhat.com,
	jogreene@redhat.com, guru.anbalagane@oracle.com
In-Reply-To: <1481147594.2404.22.camel@intel.com>



> -----Original Message-----
> From: Kirsher, Jeffrey T
> Sent: Wednesday, December 07, 2016 1:53 PM
> To: Keller, Jacob E <jacob.e.keller@intel.com>; Sergei Shtylyov
> <sergei.shtylyov@cogentembedded.com>; davem@davemloft.net
> Cc: netdev@vger.kernel.org; nhorman@redhat.com; sassmann@redhat.com;
> jogreene@redhat.com; guru.anbalagane@oracle.com
> Subject: Re: [net-next 20/20] i40e: don't allow i40e_vsi_(add|kill)_vlan to operate
> when VID<1
> 
> On Wed, 2016-12-07 at 13:50 -0800, Keller, Jacob E wrote:
> > > -----Original Message-----
> > > From: Sergei Shtylyov [mailto:sergei.shtylyov@cogentembedded.com]
> > > Sent: Wednesday, December 07, 2016 2:11 AM
> > > To: Kirsher, Jeffrey T <jeffrey.t.kirsher@intel.com>; davem@davemloft.n
> > > et
> > > Cc: Keller, Jacob E <jacob.e.keller@intel.com>; netdev@vger.kernel.org;
> > > nhorman@redhat.com; sassmann@redhat.com; jogreene@redhat.com;
> > > guru.anbalagane@oracle.com
> > > Subject: Re: [net-next 20/20] i40e: don't allow
> > > i40e_vsi_(add|kill)_vlan to operate
> > > when VID<1
> > >
> > > Hello!
> > > > +   if (!(vid > 0) || vsi->info.pvid)
> > >
> > >      Why not just '!vid'?
> >
> > Left over artifact of this previously being a signed value. We can fix
> > this.
> >
> > Thanks,
> > Jake
> >
> > > > -void i40e_vsi_kill_vlan(struct i40e_vsi *vsi, s16 vid)
> > > > +void i40e_vsi_kill_vlan(struct i40e_vsi *vsi, u16 vid)
> > > >   {
> > > > +   if (!(vid > 0) || vsi->info.pvid)
> > >
> > >      Likewise.
> >
> > Same here. Can get this fixed.
> 
> While you are fixing this up and sending me a new version of this patch, I
> will just drop this from the series and re-send.

Yes, since it's the last patch that's fine.

Thanks,
Jake

^ permalink raw reply

* Re: [patch] ser_gigaset: return -ENOMEM on error instead of success
From: Tilman Schmidt @ 2016-12-07 22:04 UTC (permalink / raw)
  To: Paul Bolle, Dan Carpenter
  Cc: Karsten Keil, David S. Miller, gigaset307x-common, netdev,
	kernel-janitors
In-Reply-To: <1481144894.11167.7.camel@tiscali.nl>


[-- Attachment #1.1: Type: text/plain, Size: 1173 bytes --]

Hi Paul,

Am 07.12.2016 um 22:08 schrieb Paul Bolle:
> On Wed, 2016-12-07 at 21:57 +0100, Tilman Schmidt wrote:
>> Not much of a mess, I reckon. Everything that has been allocated and
>> registered up to that point is properly deallocated and unregistered.
>> The code just fails to tell the kernel that module initialization has
>> failed, so the module remains loaded even though it can never be
>> called because it isn't hooked anywhere. That's a nuisance and a
>> waste of RAM, but not much more.
> 
> Yes.
> 
> But then the removal of the module, which is the only reasonable thing to do
> after all this has happened, seems to trigger a WARN in driver_unregister().
> And it's that WARN that I think requires the entire stable song and dance.

Ah, yes, of course, because driver_unregister() has already been run
in the failure path of module_init and is now called a second time.
Not sure how much evil that does beyond the WARN, but I agree it's
worth investigating.

Best regards,
Tilman

-- 
Tilman Schmidt                              E-Mail: tilman@imap.cc
Bonn, Germany
Nous, on a des fleurs et des bougies pour nous protéger.


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 473 bytes --]

^ permalink raw reply

* Re: [PATCH] net: pch_gbe: Fix TX RX descriptor accesses for big endian systems
From: Francois Romieu @ 2016-12-07 22:05 UTC (permalink / raw)
  To: Hassan Naveed
  Cc: netdev, Paul Burton, Matt Redfearn, David S. Miller,
	Florian Westphal
In-Reply-To: <1481133534-26224-1-git-send-email-hassan.naveed@imgtec.com>

Hassan Naveed <hassan.naveed@imgtec.com> :
> Fix pch_gbe driver for ethernet operations for a big endian CPU.
> Values written to and read from transmit and receive descriptors
> in the pch_gbe driver are byte swapped from the perspective of a
> big endian CPU, since the ethernet controller always operates in
> little endian mode. Rectify this by appropriately byte swapping
> these descriptor field values in the driver software.

You should also use __le{16/32} types in struct pch_gbe_{rx/tx}_desc.

-- 
Ueimor

^ permalink raw reply

* [net-next v2 00/19][pull request] 40GbE Intel Wired LAN Driver Updates 2016-12-07
From: Jeff Kirsher @ 2016-12-07 22:18 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, nhorman, sassmann, jogreene,
	guru.anbalagane

This series contains updates to i40e and i40evf only.

Filip modifies the i40e to log link speed change and when the link is
brought up and down.

Mitch replaces i40e_txd_use_count() with a new function which is slightly
faster and better documented so the dim witted can better follow the
code.  Fixes the locking of the service task so that it is actually
done in the service task and not in the scheduling function which calls
the service task.

Jacob, being the busy little beaver he is, provides most of the changes
starting restores a workaround that is still needed in some configurations,
specifically the Ethernet Controller XL710 for 40GbE QSFP+.  Removes
duplicate code and simplifies the i40e_vsi_add_vlan() and
i40e_vsi_kill_vlan() functions.  Removes detection of PTP frames over L4
(UDP) on the XL710 MAC, since there was a product decision to defeature
it.  Fixed a previous refactor of active filters which caused issues in
the accounting of active_filters.  Remaining work was done in the VLAN
filters to improve readability and simplify code as much as possible
to reduce inconsistencies.

Alex fixes foul budget accounting in core code by returning actual
work done, capped to budget-1.

Henry fixes the "ethtool -p" function for 1G BaseT PHYs.

Carolyn adds support for 25G devices for i40e and i40evf.

Michal adds functions to apply the correct access method for external PHYs
which could use Clause22 or Clause45 depending on the PHY.

v2: dropped last patch from previous series, since changes are needed based
    on feedback from Sergei Shtylyov

The following are changes since commit d4aea20d889e05575bb331a3dadf176176f7d631:
  tun: Use netif_receive_skb instead of netif_rx
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 40GbE

Alexander Duyck (1):
  i40e/i40evf: napi_poll must return the work done

Bimmy Pujari (1):
  Changed version from 1.6.21 to 1.6.25

Carolyn Wyborny (2):
  i40e: Add support for 25G devices
  i40e: Add FEC for 25g

Filip Sadowski (1):
  i40e: Driver prints log message on link speed change

Henry Tieman (1):
  i40e: Blink LED on 1G BaseT boards

Jacob Keller (10):
  i40e: restore workaround for removing default MAC filter
  i40e: remove code to handle dev_addr specially
  i40e: use unsigned printf format specifier for active_filters count
  i40e: defeature support for PTP L4 frame detection on XL710
  i40e: recalculate vsi->active_filters from hash contents
  i40e: refactor i40e_update_filter_state to avoid passing aq_err
  i40e: delete filter after adding its replacement when converting
  i40e: factor out addition/deletion of VLAN per each MAC address
  i40e: use (add|rm)_vlan_all_mac helper functions when changing PVID
  i40e: move all updates for VLAN mode into i40e_sync_vsi_filters

Michal Kosiarz (1):
  i40e: Add functions which apply correct PHY access method for read and
    write operation

Mitch Williams (2):
  i40e: simplify txd use count calculation
  i40e: lock service task correctly

 drivers/net/ethernet/intel/i40e/i40e.h             |   6 +-
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h  |  51 ++-
 drivers/net/ethernet/intel/i40e/i40e_common.c      |  85 +++-
 drivers/net/ethernet/intel/i40e/i40e_debugfs.c     |   2 +-
 drivers/net/ethernet/intel/i40e/i40e_devids.h      |   2 +
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c     |  51 ++-
 drivers/net/ethernet/intel/i40e/i40e_main.c        | 485 ++++++++++++---------
 drivers/net/ethernet/intel/i40e/i40e_prototype.h   |   4 +
 drivers/net/ethernet/intel/i40e/i40e_ptp.c         |  21 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c        |   2 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.h        |  45 +-
 drivers/net/ethernet/intel/i40e/i40e_type.h        |  82 ++--
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c |  46 +-
 .../net/ethernet/intel/i40evf/i40e_adminq_cmd.h    |  51 ++-
 drivers/net/ethernet/intel/i40evf/i40e_common.c    |   2 +
 drivers/net/ethernet/intel/i40evf/i40e_devids.h    |   2 +
 drivers/net/ethernet/intel/i40evf/i40e_prototype.h |   4 +
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c      |   2 +-
 drivers/net/ethernet/intel/i40evf/i40e_txrx.h      |  45 +-
 drivers/net/ethernet/intel/i40evf/i40e_type.h      |  82 ++--
 drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c |   8 +
 drivers/net/ethernet/intel/i40evf/i40evf_main.c    |   2 +-
 .../net/ethernet/intel/i40evf/i40evf_virtchnl.c    |   3 +
 23 files changed, 723 insertions(+), 360 deletions(-)

-- 
2.9.3

^ permalink raw reply

* [net-next v2 01/19] i40e: Driver prints log message on link speed change
From: Jeff Kirsher @ 2016-12-07 22:19 UTC (permalink / raw)
  To: davem
  Cc: Filip Sadowski, netdev, nhorman, sassmann, jogreene,
	guru.anbalagane, Jeff Kirsher
In-Reply-To: <20161207221918.57932-1-jeffrey.t.kirsher@intel.com>

From: Filip Sadowski <filip.sadowski@intel.com>

This patch makes the driver log link speed change. Before applying the
patch link messages were printed only on state change. Now message is
printed when link is brought up or down and when speed changes.

Change-ID: Ifbee14b4b16c24967450b3cecac6e8351dcc8f74
Signed-off-by: Filip Sadowski <filip.sadowski@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h      | 1 +
 drivers/net/ethernet/intel/i40e/i40e_main.c | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index 4cb8fb3..06e3c23 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -596,6 +596,7 @@ struct i40e_vsi {
 	u16 veb_idx;		/* index of VEB parent */
 	struct kobject *kobj;	/* sysfs object */
 	bool current_isup;	/* Sync 'link up' logging */
+	enum i40e_aq_link_speed current_speed;	/* Sync link speed logging */
 
 	void *priv;	/* client driver data reference. */
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 5777e49..4534d41 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -5225,12 +5225,16 @@ static int i40e_init_pf_dcb(struct i40e_pf *pf)
  */
 void i40e_print_link_message(struct i40e_vsi *vsi, bool isup)
 {
+	enum i40e_aq_link_speed new_speed;
 	char *speed = "Unknown";
 	char *fc = "Unknown";
 
-	if (vsi->current_isup == isup)
+	new_speed = vsi->back->hw.phy.link_info.link_speed;
+
+	if ((vsi->current_isup == isup) && (vsi->current_speed == new_speed))
 		return;
 	vsi->current_isup = isup;
+	vsi->current_speed = new_speed;
 	if (!isup) {
 		netdev_info(vsi->netdev, "NIC Link is Down\n");
 		return;
-- 
2.9.3

^ permalink raw reply related

* [net-next v2 02/19] i40e: simplify txd use count calculation
From: Jeff Kirsher @ 2016-12-07 22:19 UTC (permalink / raw)
  To: davem
  Cc: Mitch Williams, netdev, nhorman, sassmann, jogreene,
	guru.anbalagane, Alexander Duyck, Jeff Kirsher
In-Reply-To: <20161207221918.57932-1-jeffrey.t.kirsher@intel.com>

From: Mitch Williams <mitch.a.williams@intel.com>

The i40e_txd_use_count function was fast but confusing. In the comments,
it even admits that it's ugly. So replace it with a new function that is
(very) slightly faster and has extensive commenting to help the thicker
among us (including the author, who will forget in a week) understand
how it works.

Change-ID: Ifb533f13786a0bf39cb29f77969a5be2c83d9a87
Signed-off-by: Mitch Williams <mitch.a.williams@intel.com>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.h   | 45 +++++++++++++++++----------
 drivers/net/ethernet/intel/i40evf/i40e_txrx.h | 45 +++++++++++++++++----------
 2 files changed, 56 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index de8550f..e065321 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -173,26 +173,37 @@ static inline bool i40e_test_staterr(union i40e_rx_desc *rx_desc,
 #define I40E_MAX_DATA_PER_TXD_ALIGNED \
 	(I40E_MAX_DATA_PER_TXD & ~(I40E_MAX_READ_REQ_SIZE - 1))
 
-/* This ugly bit of math is equivalent to DIV_ROUNDUP(size, X) where X is
- * the value I40E_MAX_DATA_PER_TXD_ALIGNED.  It is needed due to the fact
- * that 12K is not a power of 2 and division is expensive.  It is used to
- * approximate the number of descriptors used per linear buffer.  Note
- * that this will overestimate in some cases as it doesn't account for the
- * fact that we will add up to 4K - 1 in aligning the 12K buffer, however
- * the error should not impact things much as large buffers usually mean
- * we will use fewer descriptors then there are frags in an skb.
+/**
+ * i40e_txd_use_count  - estimate the number of descriptors needed for Tx
+ * @size: transmit request size in bytes
+ *
+ * Due to hardware alignment restrictions (4K alignment), we need to
+ * assume that we can have no more than 12K of data per descriptor, even
+ * though each descriptor can take up to 16K - 1 bytes of aligned memory.
+ * Thus, we need to divide by 12K. But division is slow! Instead,
+ * we decompose the operation into shifts and one relatively cheap
+ * multiply operation.
+ *
+ * To divide by 12K, we first divide by 4K, then divide by 3:
+ *     To divide by 4K, shift right by 12 bits
+ *     To divide by 3, multiply by 85, then divide by 256
+ *     (Divide by 256 is done by shifting right by 8 bits)
+ * Finally, we add one to round up. Because 256 isn't an exact multiple of
+ * 3, we'll underestimate near each multiple of 12K. This is actually more
+ * accurate as we have 4K - 1 of wiggle room that we can fit into the last
+ * segment.  For our purposes this is accurate out to 1M which is orders of
+ * magnitude greater than our largest possible GSO size.
+ *
+ * This would then be implemented as:
+ *     return (((size >> 12) * 85) >> 8) + 1;
+ *
+ * Since multiplication and division are commutative, we can reorder
+ * operations into:
+ *     return ((size * 85) >> 20) + 1;
  */
 static inline unsigned int i40e_txd_use_count(unsigned int size)
 {
-	const unsigned int max = I40E_MAX_DATA_PER_TXD_ALIGNED;
-	const unsigned int reciprocal = ((1ull << 32) - 1 + (max / 2)) / max;
-	unsigned int adjust = ~(u32)0;
-
-	/* if we rounded up on the reciprocal pull down the adjustment */
-	if ((max * reciprocal) > adjust)
-		adjust = ~(u32)(reciprocal - 1);
-
-	return (u32)((((u64)size * reciprocal) + adjust) >> 32);
+	return ((size * 85) >> 20) + 1;
 }
 
 /* Tx Descriptors needed, worst case */
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.h b/drivers/net/ethernet/intel/i40evf/i40e_txrx.h
index a586e19..a5fc789 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.h
@@ -173,26 +173,37 @@ static inline bool i40e_test_staterr(union i40e_rx_desc *rx_desc,
 #define I40E_MAX_DATA_PER_TXD_ALIGNED \
 	(I40E_MAX_DATA_PER_TXD & ~(I40E_MAX_READ_REQ_SIZE - 1))
 
-/* This ugly bit of math is equivalent to DIV_ROUNDUP(size, X) where X is
- * the value I40E_MAX_DATA_PER_TXD_ALIGNED.  It is needed due to the fact
- * that 12K is not a power of 2 and division is expensive.  It is used to
- * approximate the number of descriptors used per linear buffer.  Note
- * that this will overestimate in some cases as it doesn't account for the
- * fact that we will add up to 4K - 1 in aligning the 12K buffer, however
- * the error should not impact things much as large buffers usually mean
- * we will use fewer descriptors then there are frags in an skb.
+/**
+ * i40e_txd_use_count  - estimate the number of descriptors needed for Tx
+ * @size: transmit request size in bytes
+ *
+ * Due to hardware alignment restrictions (4K alignment), we need to
+ * assume that we can have no more than 12K of data per descriptor, even
+ * though each descriptor can take up to 16K - 1 bytes of aligned memory.
+ * Thus, we need to divide by 12K. But division is slow! Instead,
+ * we decompose the operation into shifts and one relatively cheap
+ * multiply operation.
+ *
+ * To divide by 12K, we first divide by 4K, then divide by 3:
+ *     To divide by 4K, shift right by 12 bits
+ *     To divide by 3, multiply by 85, then divide by 256
+ *     (Divide by 256 is done by shifting right by 8 bits)
+ * Finally, we add one to round up. Because 256 isn't an exact multiple of
+ * 3, we'll underestimate near each multiple of 12K. This is actually more
+ * accurate as we have 4K - 1 of wiggle room that we can fit into the last
+ * segment.  For our purposes this is accurate out to 1M which is orders of
+ * magnitude greater than our largest possible GSO size.
+ *
+ * This would then be implemented as:
+ *     return (((size >> 12) * 85) >> 8) + 1;
+ *
+ * Since multiplication and division are commutative, we can reorder
+ * operations into:
+ *     return ((size * 85) >> 20) + 1;
  */
 static inline unsigned int i40e_txd_use_count(unsigned int size)
 {
-	const unsigned int max = I40E_MAX_DATA_PER_TXD_ALIGNED;
-	const unsigned int reciprocal = ((1ull << 32) - 1 + (max / 2)) / max;
-	unsigned int adjust = ~(u32)0;
-
-	/* if we rounded up on the reciprocal pull down the adjustment */
-	if ((max * reciprocal) > adjust)
-		adjust = ~(u32)(reciprocal - 1);
-
-	return (u32)((((u64)size * reciprocal) + adjust) >> 32);
+	return ((size * 85) >> 20) + 1;
 }
 
 /* Tx Descriptors needed, worst case */
-- 
2.9.3

^ permalink raw reply related

* [net-next v2 05/19] i40e: remove code to handle dev_addr specially
From: Jeff Kirsher @ 2016-12-07 22:19 UTC (permalink / raw)
  To: davem
  Cc: Jacob Keller, netdev, nhorman, sassmann, jogreene,
	guru.anbalagane, Jeff Kirsher
In-Reply-To: <20161207221918.57932-1-jeffrey.t.kirsher@intel.com>

From: Jacob Keller <jacob.e.keller@intel.com>

The netdev->dev_addr MAC filter already exists in the
MAC/VLAN hash table, as it is added when we configure
the netdev in i40e_configure_netdev. Because we already
know that this address will be updated in the
hash_for_each loops, we do not need to handle it
specially. This removes duplicate code and simplifies
the i40e_vsi_add_vlan and i40e_vsi_kill_vlan functions.
Because we know these filters must be part of the
MAC/VLAN hash table, this should not have any functional
impact on what filters are included and is merely a code
simplification.

Change-ID: I5e648302dbdd7cc29efc6d203b7019c11f0b5705
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 43 +++++------------------------
 1 file changed, 7 insertions(+), 36 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index c467cc4..ae4a2b2 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -2515,17 +2515,6 @@ int i40e_vsi_add_vlan(struct i40e_vsi *vsi, s16 vid)
 	/* Locked once because all functions invoked below iterates list*/
 	spin_lock_bh(&vsi->mac_filter_hash_lock);
 
-	if (vsi->netdev) {
-		add_f = i40e_add_filter(vsi, vsi->netdev->dev_addr, vid);
-		if (!add_f) {
-			dev_info(&vsi->back->pdev->dev,
-				 "Could not add vlan filter %d for %pM\n",
-				 vid, vsi->netdev->dev_addr);
-			spin_unlock_bh(&vsi->mac_filter_hash_lock);
-			return -ENOMEM;
-		}
-	}
-
 	hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) {
 		if (f->state == I40E_FILTER_REMOVE)
 			continue;
@@ -2539,28 +2528,14 @@ int i40e_vsi_add_vlan(struct i40e_vsi *vsi, s16 vid)
 		}
 	}
 
-	/* Now if we add a vlan tag, make sure to check if it is the first
-	 * tag (i.e. a "tag" -1 does exist) and if so replace the -1 "tag"
-	 * with 0, so we now accept untagged and specified tagged traffic
-	 * (and not all tags along with untagged)
+	/* When we add a new VLAN filter, we need to make sure that all existing
+	 * filters which are marked as vid=-1 (I40E_VLAN_ANY) are converted to
+	 * vid=0. The simplest way is just search for all filters marked as
+	 * vid=-1 and replace them with vid=0. This converts all filters that
+	 * were marked to receive all traffic (tagged or untagged) into
+	 * filters to receive only untagged traffic, so that we don't receive
+	 * tagged traffic for VLANs which we have not configured.
 	 */
-	if (vid > 0 && vsi->netdev) {
-		del_f = i40e_find_filter(vsi, vsi->netdev->dev_addr,
-					 I40E_VLAN_ANY);
-		if (del_f) {
-			__i40e_del_filter(vsi, del_f);
-			add_f = i40e_add_filter(vsi, vsi->netdev->dev_addr, 0);
-			if (!add_f) {
-				dev_info(&vsi->back->pdev->dev,
-					 "Could not add filter 0 for %pM\n",
-					 vsi->netdev->dev_addr);
-				spin_unlock_bh(&vsi->mac_filter_hash_lock);
-				return -ENOMEM;
-			}
-		}
-	}
-
-	/* Do not assume that I40E_VLAN_ANY should be reset to VLAN 0 */
 	if (vid > 0 && !vsi->info.pvid) {
 		hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) {
 			if (f->state == I40E_FILTER_REMOVE)
@@ -2597,7 +2572,6 @@ int i40e_vsi_add_vlan(struct i40e_vsi *vsi, s16 vid)
  **/
 void i40e_vsi_kill_vlan(struct i40e_vsi *vsi, s16 vid)
 {
-	struct net_device *netdev = vsi->netdev;
 	struct i40e_mac_filter *f;
 	struct hlist_node *h;
 	int bkt;
@@ -2605,9 +2579,6 @@ void i40e_vsi_kill_vlan(struct i40e_vsi *vsi, s16 vid)
 	/* Locked once because all functions invoked below iterates list */
 	spin_lock_bh(&vsi->mac_filter_hash_lock);
 
-	if (vsi->netdev)
-		i40e_del_filter(vsi, netdev->dev_addr, vid);
-
 	hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) {
 		if (f->vlan == vid)
 			__i40e_del_filter(vsi, f);
-- 
2.9.3

^ permalink raw reply related

* [net-next v2 03/19] i40e: restore workaround for removing default MAC filter
From: Jeff Kirsher @ 2016-12-07 22:19 UTC (permalink / raw)
  To: davem
  Cc: Jacob Keller, netdev, nhorman, sassmann, jogreene,
	guru.anbalagane, Jeff Kirsher
In-Reply-To: <20161207221918.57932-1-jeffrey.t.kirsher@intel.com>

From: Jacob Keller <jacob.e.keller@intel.com>

A previous commit 53cb6e9e8949 ("i40e: Removal of workaround for simple
MAC address filter deletion") removed a workaround for some
firmware versions which was reported to not be necessary in production
NICs. Unfortunately this workaround is necessary in some configurations,
specifically the Ethernet Controller XL710 for 40GbE QSFP+ (8086:1583).

Without this patch, the mentioned NICs with current firmware exhibit
issues when adding VLANs, as outlined by the following reproduction:

  $modprobe i40e
  $ip link set <device> up
  $ip link add link <device> vlan100 type vlan id 100
  $dmesg | tail
  <snip>
  kernel: i40e 0000:82:00.0: Error I40E_AQ_RC_EINVAL adding RX
filters on PF, promiscuous mode forced on

This results in filters being marked as FAILED and setting the device in
promiscuous mode.

The root cause of receiving the -EINVAL error response appears to be due
to a conflict with the default MAC filter which still exists on the
default firmware for this device. Attempting to add a new VLAN filter on
the default MAC address conflicts with the IGNORE_VLAN setting on the
default rule.

Change-ID: I4d8f6d48ac5f60cfe981b3baad30eb4d7c170d61
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 41 +++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 4534d41..c467cc4 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1226,6 +1226,39 @@ bool i40e_is_vsi_in_vlan(struct i40e_vsi *vsi)
 }
 
 /**
+ * i40e_rm_default_mac_filter - Remove the default MAC filter set by NVM
+ * @vsi: the PF Main VSI - inappropriate for any other VSI
+ * @macaddr: the MAC address
+ *
+ * Remove whatever filter the firmware set up so the driver can manage
+ * its own filtering intelligently.
+ **/
+static void i40e_rm_default_mac_filter(struct i40e_vsi *vsi, u8 *macaddr)
+{
+	struct i40e_aqc_remove_macvlan_element_data element;
+	struct i40e_pf *pf = vsi->back;
+
+	/* Only appropriate for the PF main VSI */
+	if (vsi->type != I40E_VSI_MAIN)
+		return;
+
+	memset(&element, 0, sizeof(element));
+	ether_addr_copy(element.mac_addr, macaddr);
+	element.vlan_tag = 0;
+	/* Ignore error returns, some firmware does it this way... */
+	element.flags = I40E_AQC_MACVLAN_DEL_PERFECT_MATCH;
+	i40e_aq_remove_macvlan(&pf->hw, vsi->seid, &element, 1, NULL);
+
+	memset(&element, 0, sizeof(element));
+	ether_addr_copy(element.mac_addr, macaddr);
+	element.vlan_tag = 0;
+	/* ...and some firmware does it this way. */
+	element.flags = I40E_AQC_MACVLAN_DEL_PERFECT_MATCH |
+			I40E_AQC_MACVLAN_DEL_IGNORE_VLAN;
+	i40e_aq_remove_macvlan(&pf->hw, vsi->seid, &element, 1, NULL);
+}
+
+/**
  * i40e_add_filter - Add a mac/vlan filter to the VSI
  * @vsi: the VSI to be searched
  * @macaddr: the MAC address
@@ -9295,6 +9328,12 @@ static int i40e_config_netdev(struct i40e_vsi *vsi)
 	if (vsi->type == I40E_VSI_MAIN) {
 		SET_NETDEV_DEV(netdev, &pf->pdev->dev);
 		ether_addr_copy(mac_addr, hw->mac.perm_addr);
+		/* The following steps are necessary to prevent reception
+		 * of tagged packets - some older NVM configurations load a
+		 * default a MAC-VLAN filter that accepts any tagged packet
+		 * which must be replaced by a normal filter.
+		 */
+		i40e_rm_default_mac_filter(vsi, mac_addr);
 		spin_lock_bh(&vsi->mac_filter_hash_lock);
 		i40e_add_filter(vsi, mac_addr, I40E_VLAN_ANY);
 		spin_unlock_bh(&vsi->mac_filter_hash_lock);
@@ -9828,6 +9867,8 @@ static struct i40e_vsi *i40e_vsi_reinit_setup(struct i40e_vsi *vsi)
 	pf->vsi[pf->lan_vsi]->tc_config.enabled_tc = 0;
 	pf->vsi[pf->lan_vsi]->seid = pf->main_vsi_seid;
 	i40e_vsi_config_tc(pf->vsi[pf->lan_vsi], enabled_tc);
+	if (vsi->type == I40E_VSI_MAIN)
+		i40e_rm_default_mac_filter(vsi, pf->hw.mac.perm_addr);
 
 	/* assign it some queues */
 	ret = i40e_alloc_rings(vsi);
-- 
2.9.3

^ permalink raw reply related

* [net-next v2 04/19] i40e/i40evf: napi_poll must return the work done
From: Jeff Kirsher @ 2016-12-07 22:19 UTC (permalink / raw)
  To: davem
  Cc: Alexander Duyck, netdev, nhorman, sassmann, jogreene,
	guru.anbalagane, Jeff Kirsher
In-Reply-To: <20161207221918.57932-1-jeffrey.t.kirsher@intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>

Currently the function i40e_napi-poll() returns 0 when it clean completely
the Rx rings, but this foul budget accounting in core code.

Fix this by returning the actual work done, capped to budget - 1, since
the core doesn't allow to return the full budget when the driver modifies
the NAPI status

This is based on a similar change that was made for the ixgbe driver by
Paolo Abeni.

Change-ID: Ic3d93ad2fa2fc8ce3164bc461e69367da0f9173b
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 2 +-
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 5544b50..352cf7c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2027,7 +2027,7 @@ int i40e_napi_poll(struct napi_struct *napi, int budget)
 	else
 		i40e_update_enable_itr(vsi, q_vector);
 
-	return 0;
+	return min(work_done, budget - 1);
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index c4b174a..df67ef3 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -1490,7 +1490,7 @@ int i40evf_napi_poll(struct napi_struct *napi, int budget)
 	else
 		i40e_update_enable_itr(vsi, q_vector);
 
-	return 0;
+	return min(work_done, budget - 1);
 }
 
 /**
-- 
2.9.3

^ permalink raw reply related

* [net-next v2 06/19] i40e: Blink LED on 1G BaseT boards
From: Jeff Kirsher @ 2016-12-07 22:19 UTC (permalink / raw)
  To: davem
  Cc: Henry Tieman, netdev, nhorman, sassmann, jogreene,
	guru.anbalagane, Harshitha Ramamurthy, Jeff Kirsher
In-Reply-To: <20161207221918.57932-1-jeffrey.t.kirsher@intel.com>

From: Henry Tieman <henry.w.tieman@intel.com>

Before this patch "ethtool -p" was not blinking the LEDs on boards
with 1G BaseT PHYs.

This commit identifies 1G BaseT boards as having the LEDs connected
to the MAC. Also, renamed the flag to be more descriptive of usage.
The flag is now I40E_FLAG_PHY_CONTROLS_LEDS.

Change-ID: I4eb741da9780da7849ddf2dc4c0cb27ffa42a801
Signed-off-by: Henry Tieman <henry.w.tieman@intel.com>
Signed-off-by: Harshitha Ramamurthy <harshitha.ramamurthy@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e.h         |  2 +-
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 10 +++++-----
 drivers/net/ethernet/intel/i40e/i40e_main.c    |  4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h
index 06e3c23..b8f2978 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -356,7 +356,7 @@ struct i40e_pf {
 #define I40E_FLAG_NO_DCB_SUPPORT		BIT_ULL(45)
 #define I40E_FLAG_USE_SET_LLDP_MIB		BIT_ULL(46)
 #define I40E_FLAG_STOP_FW_LLDP			BIT_ULL(47)
-#define I40E_FLAG_HAVE_10GBASET_PHY		BIT_ULL(48)
+#define I40E_FLAG_PHY_CONTROLS_LEDS		BIT_ULL(48)
 #define I40E_FLAG_PF_MAC			BIT_ULL(50)
 #define I40E_FLAG_TRUE_PROMISC_SUPPORT		BIT_ULL(51)
 #define I40E_FLAG_HAVE_CRT_RETIMER		BIT_ULL(52)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 76753e1..6ba0035 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -1890,7 +1890,7 @@ static int i40e_set_phys_id(struct net_device *netdev,
 
 	switch (state) {
 	case ETHTOOL_ID_ACTIVE:
-		if (!(pf->flags & I40E_FLAG_HAVE_10GBASET_PHY)) {
+		if (!(pf->flags & I40E_FLAG_PHY_CONTROLS_LEDS)) {
 			pf->led_status = i40e_led_get(hw);
 		} else {
 			i40e_aq_set_phy_debug(hw, I40E_PHY_DEBUG_ALL, NULL);
@@ -1900,20 +1900,20 @@ static int i40e_set_phys_id(struct net_device *netdev,
 		}
 		return blink_freq;
 	case ETHTOOL_ID_ON:
-		if (!(pf->flags & I40E_FLAG_HAVE_10GBASET_PHY))
+		if (!(pf->flags & I40E_FLAG_PHY_CONTROLS_LEDS))
 			i40e_led_set(hw, 0xf, false);
 		else
 			ret = i40e_led_set_phy(hw, true, pf->led_status, 0);
 		break;
 	case ETHTOOL_ID_OFF:
-		if (!(pf->flags & I40E_FLAG_HAVE_10GBASET_PHY))
+		if (!(pf->flags & I40E_FLAG_PHY_CONTROLS_LEDS))
 			i40e_led_set(hw, 0x0, false);
 		else
 			ret = i40e_led_set_phy(hw, false, pf->led_status, 0);
 		break;
 	case ETHTOOL_ID_INACTIVE:
-		if (!(pf->flags & I40E_FLAG_HAVE_10GBASET_PHY)) {
-			i40e_led_set(hw, false, pf->led_status);
+		if (!(pf->flags & I40E_FLAG_PHY_CONTROLS_LEDS)) {
+			i40e_led_set(hw, pf->led_status, false);
 		} else {
 			ret = i40e_led_set_phy(hw, false, pf->led_status,
 					       (pf->phy_led_val |
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index ae4a2b2..dbb854b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -11380,8 +11380,8 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 						       pf->main_vsi_seid);
 
 	if ((pf->hw.device_id == I40E_DEV_ID_10G_BASE_T) ||
-	    (pf->hw.device_id == I40E_DEV_ID_10G_BASE_T4))
-		pf->flags |= I40E_FLAG_HAVE_10GBASET_PHY;
+		(pf->hw.device_id == I40E_DEV_ID_10G_BASE_T4))
+		pf->flags |= I40E_FLAG_PHY_CONTROLS_LEDS;
 	if (pf->hw.device_id == I40E_DEV_ID_SFP_I_X722)
 		pf->flags |= I40E_FLAG_HAVE_CRT_RETIMER;
 	/* print a string summarizing features */
-- 
2.9.3

^ permalink raw reply related

* [net-next v2 07/19] Changed version from 1.6.21 to 1.6.25
From: Jeff Kirsher @ 2016-12-07 22:19 UTC (permalink / raw)
  To: davem
  Cc: Bimmy Pujari, netdev, nhorman, sassmann, jogreene,
	guru.anbalagane, Jeff Kirsher
In-Reply-To: <20161207221918.57932-1-jeffrey.t.kirsher@intel.com>

From: Bimmy Pujari <bimmy.pujari@intel.com>

Signed-off-by: Bimmy Pujari <bimmy.pujari@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c     | 2 +-
 drivers/net/ethernet/intel/i40evf/i40evf_main.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index dbb854b..aecf63b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -41,7 +41,7 @@ static const char i40e_driver_string[] =
 
 #define DRV_VERSION_MAJOR 1
 #define DRV_VERSION_MINOR 6
-#define DRV_VERSION_BUILD 21
+#define DRV_VERSION_BUILD 25
 #define DRV_VERSION __stringify(DRV_VERSION_MAJOR) "." \
 	     __stringify(DRV_VERSION_MINOR) "." \
 	     __stringify(DRV_VERSION_BUILD)    DRV_KERN
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index ca85021..c0fc533 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -38,7 +38,7 @@ static const char i40evf_driver_string[] =
 
 #define DRV_VERSION_MAJOR 1
 #define DRV_VERSION_MINOR 6
-#define DRV_VERSION_BUILD 21
+#define DRV_VERSION_BUILD 25
 #define DRV_VERSION __stringify(DRV_VERSION_MAJOR) "." \
 	     __stringify(DRV_VERSION_MINOR) "." \
 	     __stringify(DRV_VERSION_BUILD) \
-- 
2.9.3

^ permalink raw reply related

* [net-next v2 08/19] i40e: use unsigned printf format specifier for active_filters count
From: Jeff Kirsher @ 2016-12-07 22:19 UTC (permalink / raw)
  To: davem
  Cc: Jacob Keller, netdev, nhorman, sassmann, jogreene,
	guru.anbalagane, Jeff Kirsher
In-Reply-To: <20161207221918.57932-1-jeffrey.t.kirsher@intel.com>

From: Jacob Keller <jacob.e.keller@intel.com>

Replace the %d specifier used for printing vsi->active_filters and
vsi->promisc_threshold with an unsigned %u format specifier. While it is
unlikely in practice that these values will ever reach such a large
number they are unsigned values and thus should not be interpreted as
negative numbers.

Change-ID: Iff050fad5a1c8537c4c57fcd527441cd95cfc0d4
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_debugfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
index b8a03a0..f1f41f1 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
@@ -172,7 +172,7 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int seid)
 			 f->macaddr, f->vlan,
 			 i40e_filter_state_string[f->state]);
 	}
-	dev_info(&pf->pdev->dev, "    active_filters %d, promisc_threshold %d, overflow promisc %s\n",
+	dev_info(&pf->pdev->dev, "    active_filters %u, promisc_threshold %u, overflow promisc %s\n",
 		 vsi->active_filters, vsi->promisc_threshold,
 		 (test_bit(__I40E_FILTER_OVERFLOW_PROMISC, &vsi->state) ?
 		  "ON" : "OFF"));
-- 
2.9.3

^ permalink raw reply related

* [net-next v2 12/19] i40e: lock service task correctly
From: Jeff Kirsher @ 2016-12-07 22:19 UTC (permalink / raw)
  To: davem
  Cc: Mitch Williams, netdev, nhorman, sassmann, jogreene,
	guru.anbalagane, Jeff Kirsher
In-Reply-To: <20161207221918.57932-1-jeffrey.t.kirsher@intel.com>

From: Mitch Williams <mitch.a.williams@intel.com>

The service task lock was being set in the scheduling function, not the
actual service task. This would potentially leave the bit set for a long
time before the task actually ran. Furthermore, if the service task
takes too long, it calls the schedule function to reschedule itself -
which would fail to take the lock and do nothing.

Instead, set and clear the lock bit in the service task itself. In the
process, get rid of the i40e_service_event_complete() function, which is
really just two lines of code that can be put right in the service task
itself.

Change-ID: I83155e682b686121e2897f4429eb7d3f7c669168
Signed-off-by: Mitch Williams <mitch.a.williams@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index b0486c9..c47e9c5 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -288,8 +288,7 @@ struct i40e_vsi *i40e_find_vsi_from_id(struct i40e_pf *pf, u16 id)
 void i40e_service_event_schedule(struct i40e_pf *pf)
 {
 	if (!test_bit(__I40E_DOWN, &pf->state) &&
-	    !test_bit(__I40E_RESET_RECOVERY_PENDING, &pf->state) &&
-	    !test_and_set_bit(__I40E_SERVICE_SCHED, &pf->state))
+	    !test_bit(__I40E_RESET_RECOVERY_PENDING, &pf->state))
 		queue_work(i40e_wq, &pf->service_task);
 }
 
@@ -5955,19 +5954,6 @@ static void i40e_handle_lan_overflow_event(struct i40e_pf *pf,
 }
 
 /**
- * i40e_service_event_complete - Finish up the service event
- * @pf: board private structure
- **/
-static void i40e_service_event_complete(struct i40e_pf *pf)
-{
-	WARN_ON(!test_bit(__I40E_SERVICE_SCHED, &pf->state));
-
-	/* flush memory to make sure state is correct before next watchog */
-	smp_mb__before_atomic();
-	clear_bit(__I40E_SERVICE_SCHED, &pf->state);
-}
-
-/**
  * i40e_get_cur_guaranteed_fd_count - Get the consumed guaranteed FD filters
  * @pf: board private structure
  **/
@@ -7276,10 +7262,12 @@ static void i40e_service_task(struct work_struct *work)
 
 	/* don't bother with service tasks if a reset is in progress */
 	if (test_bit(__I40E_RESET_RECOVERY_PENDING, &pf->state)) {
-		i40e_service_event_complete(pf);
 		return;
 	}
 
+	if (test_and_set_bit(__I40E_SERVICE_SCHED, &pf->state))
+		return;
+
 	i40e_detect_recover_hung(pf);
 	i40e_sync_filters_subtask(pf);
 	i40e_reset_subtask(pf);
@@ -7292,7 +7280,9 @@ static void i40e_service_task(struct work_struct *work)
 	i40e_sync_udp_filters_subtask(pf);
 	i40e_clean_adminq_subtask(pf);
 
-	i40e_service_event_complete(pf);
+	/* flush memory to make sure state is correct before next watchdog */
+	smp_mb__before_atomic();
+	clear_bit(__I40E_SERVICE_SCHED, &pf->state);
 
 	/* If the tasks have taken longer than one timer cycle or there
 	 * is more work to be done, reschedule the service task now
-- 
2.9.3

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox