Netdev List
 help / color / mirror / Atom feed
* [patch net-next v5 06/10] net: sched: allow ingress and clsact qdiscs to share filter blocks
From: Jiri Pirko @ 2017-12-26 14:16 UTC (permalink / raw)
  To: netdev
  Cc: davem, jhs, xiyou.wangcong, mlxsw, andrew, vivien.didelot,
	f.fainelli, michael.chan, ganeshgr, saeedm, matanb, leonro,
	idosch, jakub.kicinski, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, ogerlitz, john.fastabend, daniel,
	dsahern
In-Reply-To: <20171226141604.1605-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Benefit from the previously introduced shared filter blocks
infrastructure and allow ingress and clsact qdisc instances to share
filter blocks. The block index is coming from userspace as qdisc option.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
v3->v4:
- rebased on top of the current net-next
v2->v3:
- removed "p_" prefix from block index function args
---
 include/uapi/linux/pkt_sched.h | 11 ++++++
 net/sched/sch_ingress.c        | 89 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 99 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 37b5096..8cc554a 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -934,4 +934,15 @@ enum {
 
 #define TCA_CBS_MAX (__TCA_CBS_MAX - 1)
 
+/* Ingress/clsact */
+
+enum {
+	TCA_CLSACT_UNSPEC,
+	TCA_CLSACT_INGRESS_BLOCK,
+	TCA_CLSACT_EGRESS_BLOCK,
+	__TCA_CLSACT_MAX
+};
+
+#define TCA_CLSACT_MAX	(__TCA_CLSACT_MAX - 1)
+
 #endif
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index 7ca2be2..debcb8f 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -61,6 +61,29 @@ static void clsact_chain_head_change(struct tcf_proto *tp_head, void *priv)
 	struct mini_Qdisc_pair *miniqp = priv;
 
 	mini_qdisc_pair_swap(miniqp, tp_head);
+};
+
+static const struct nla_policy ingress_policy[TCA_CLSACT_MAX + 1] = {
+	[TCA_CLSACT_INGRESS_BLOCK]	= { .type = NLA_U32 },
+};
+
+static int ingress_parse_opt(struct nlattr *opt, u32 *ingress_block_index)
+{
+	struct nlattr *tb[TCA_CLSACT_MAX + 1];
+	int err;
+
+	*ingress_block_index = 0;
+
+	if (!opt)
+		return 0;
+	err = nla_parse_nested(tb, TCA_CLSACT_MAX, opt, ingress_policy, NULL);
+	if (err)
+		return err;
+
+	if (tb[TCA_CLSACT_INGRESS_BLOCK])
+		*ingress_block_index =
+			nla_get_u32(tb[TCA_CLSACT_INGRESS_BLOCK]);
+	return 0;
 }
 
 static int ingress_init(struct Qdisc *sch, struct nlattr *opt,
@@ -74,6 +97,11 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt,
 
 	mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress);
 
+	err = ingress_parse_opt(opt, &q->block_info.block_index);
+	if (err)
+		return err;
+
+	q->block_info.shareable = true;
 	q->block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
 	q->block_info.chain_head_change = clsact_chain_head_change;
 	q->block_info.chain_head_change_priv = &q->miniqp;
@@ -97,11 +125,14 @@ static void ingress_destroy(struct Qdisc *sch)
 
 static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
+	struct ingress_sched_data *q = qdisc_priv(sch);
 	struct nlattr *nest;
 
 	nest = nla_nest_start(skb, TCA_OPTIONS);
 	if (nest == NULL)
 		goto nla_put_failure;
+	if (nla_put_u32(skb, TCA_CLSACT_INGRESS_BLOCK, q->block->index))
+		goto nla_put_failure;
 
 	return nla_nest_end(skb, nest);
 
@@ -170,6 +201,35 @@ static struct tcf_block *clsact_tcf_block(struct Qdisc *sch, unsigned long cl,
 	}
 }
 
+static const struct nla_policy clsact_policy[TCA_CLSACT_MAX + 1] = {
+	[TCA_CLSACT_INGRESS_BLOCK]	= { .type = NLA_U32 },
+	[TCA_CLSACT_EGRESS_BLOCK]	= { .type = NLA_U32 },
+};
+
+static int clsact_parse_opt(struct nlattr *opt, u32 *ingress_block_index,
+			    u32 *egress_block_index)
+{
+	struct nlattr *tb[TCA_CLSACT_MAX + 1];
+	int err;
+
+	*ingress_block_index = 0;
+	*egress_block_index = 0;
+
+	if (!opt)
+		return 0;
+	err = nla_parse_nested(tb, TCA_CLSACT_MAX, opt, clsact_policy, NULL);
+	if (err)
+		return err;
+
+	if (tb[TCA_CLSACT_INGRESS_BLOCK])
+		*ingress_block_index =
+			nla_get_u32(tb[TCA_CLSACT_INGRESS_BLOCK]);
+	if (tb[TCA_CLSACT_EGRESS_BLOCK])
+		*egress_block_index =
+			nla_get_u32(tb[TCA_CLSACT_EGRESS_BLOCK]);
+	return 0;
+}
+
 static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
 		       struct netlink_ext_ack *extack)
 {
@@ -182,6 +242,12 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
 
 	mini_qdisc_pair_init(&q->miniqp_ingress, sch, &dev->miniq_ingress);
 
+	err = clsact_parse_opt(opt, &q->ingress_block_info.block_index,
+			       &q->egress_block_info.block_index);
+	if (err)
+		return err;
+
+	q->ingress_block_info.shareable = true;
 	q->ingress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
 	q->ingress_block_info.chain_head_change = clsact_chain_head_change;
 	q->ingress_block_info.chain_head_change_priv = &q->miniqp_ingress;
@@ -193,6 +259,7 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt,
 
 	mini_qdisc_pair_init(&q->miniqp_egress, sch, &dev->miniq_egress);
 
+	q->egress_block_info.shareable = true;
 	q->egress_block_info.binder_type = TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS;
 	q->egress_block_info.chain_head_change = clsact_chain_head_change;
 	q->egress_block_info.chain_head_change_priv = &q->miniqp_egress;
@@ -218,6 +285,26 @@ static void clsact_destroy(struct Qdisc *sch)
 	net_dec_egress_queue();
 }
 
+static int clsact_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct clsact_sched_data *q = qdisc_priv(sch);
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, TCA_OPTIONS);
+	if (!nest)
+		goto nla_put_failure;
+	if (nla_put_u32(skb, TCA_CLSACT_INGRESS_BLOCK, q->ingress_block->index))
+		goto nla_put_failure;
+	if (nla_put_u32(skb, TCA_CLSACT_EGRESS_BLOCK, q->egress_block->index))
+		goto nla_put_failure;
+
+	return nla_nest_end(skb, nest);
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -1;
+}
+
 static const struct Qdisc_class_ops clsact_class_ops = {
 	.leaf		=	ingress_leaf,
 	.find		=	clsact_find,
@@ -233,7 +320,7 @@ static struct Qdisc_ops clsact_qdisc_ops __read_mostly = {
 	.priv_size	=	sizeof(struct clsact_sched_data),
 	.init		=	clsact_init,
 	.destroy	=	clsact_destroy,
-	.dump		=	ingress_dump,
+	.dump		=	clsact_dump,
 	.owner		=	THIS_MODULE,
 };
 
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v5 05/10] net: sched: keep track of offloaded filters and check tc offload feature
From: Jiri Pirko @ 2017-12-26 14:15 UTC (permalink / raw)
  To: netdev
  Cc: davem, jhs, xiyou.wangcong, mlxsw, andrew, vivien.didelot,
	f.fainelli, michael.chan, ganeshgr, saeedm, matanb, leonro,
	idosch, jakub.kicinski, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, ogerlitz, john.fastabend, daniel,
	dsahern
In-Reply-To: <20171226141604.1605-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

During block bind, we need to check tc offload feature. If it is
disabled yet still the block contains offloaded filters, forbid the
bind. Also forbid to register callback for a block that already
containes offloaded filters, as the play back is not supported now.
For keeping track of offloaded filters there is a new counter
introduced, alongside with couple of helpers called from cls_* code.
These helpers set and clear TCA_CLS_FLAGS_IN_HW flag.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
v4->v5:
- add tracking of binding of devs that are unable to offload and check
  that before block cbs call.
v3->v4:
- propagate netdev_ops->ndo_setup_tc error up to tcf_block_offload_bind
  caller
v2->v3:
- new patch

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/sch_generic.h | 18 +++++++++++
 net/sched/cls_api.c       | 79 ++++++++++++++++++++++++++++++++++++++---------
 net/sched/cls_bpf.c       |  5 ++-
 net/sched/cls_flower.c    |  3 +-
 net/sched/cls_matchall.c  |  3 +-
 net/sched/cls_u32.c       | 13 ++++----
 6 files changed, 98 insertions(+), 23 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index dba2214..ab86b64 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -289,8 +289,26 @@ struct tcf_block {
 	struct list_head cb_list;
 	struct list_head owner_list;
 	bool keep_dst;
+	unsigned int offloadcnt; /* Number of oddloaded filters */
+	unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */
 };
 
+static inline void tcf_block_offload_inc(struct tcf_block *block, u32 *flags)
+{
+	if (*flags & TCA_CLS_FLAGS_IN_HW)
+		return;
+	*flags |= TCA_CLS_FLAGS_IN_HW;
+	block->offloadcnt++;
+}
+
+static inline void tcf_block_offload_dec(struct tcf_block *block, u32 *flags)
+{
+	if (!(*flags & TCA_CLS_FLAGS_IN_HW))
+		return;
+	*flags &= ~TCA_CLS_FLAGS_IN_HW;
+	block->offloadcnt--;
+}
+
 static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
 {
 	struct qdisc_skb_cb *qcb;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 07767b4..1ea856b 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -265,31 +265,66 @@ void tcf_chain_put(struct tcf_chain *chain)
 }
 EXPORT_SYMBOL(tcf_chain_put);
 
-static void tcf_block_offload_cmd(struct tcf_block *block, struct Qdisc *q,
-				  struct tcf_block_ext_info *ei,
-				  enum tc_block_command command)
+static bool tcf_block_offload_in_use(struct tcf_block *block)
+{
+	return block->offloadcnt;
+}
+
+static int tcf_block_offload_cmd(struct tcf_block *block,
+				 struct net_device *dev,
+				 struct tcf_block_ext_info *ei,
+				 enum tc_block_command command)
 {
-	struct net_device *dev = q->dev_queue->dev;
 	struct tc_block_offload bo = {};
 
-	if (!dev->netdev_ops->ndo_setup_tc)
-		return;
 	bo.command = command;
 	bo.binder_type = ei->binder_type;
 	bo.block = block;
-	dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
+	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
 }
 
-static void tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
-				   struct tcf_block_ext_info *ei)
+static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
+				  struct tcf_block_ext_info *ei)
 {
-	tcf_block_offload_cmd(block, q, ei, TC_BLOCK_BIND);
+	struct net_device *dev = q->dev_queue->dev;
+	int err;
+
+	if (!dev->netdev_ops->ndo_setup_tc)
+		goto no_offload_dev_inc;
+
+	/* If tc offload feature is disabled and the block we try to bind
+	 * to already has some offloaded filters, forbid to bind.
+	 */
+	if (!tc_can_offload(dev) && tcf_block_offload_in_use(block))
+		return -EOPNOTSUPP;
+
+	err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND);
+	if (err == -EOPNOTSUPP)
+		goto no_offload_dev_inc;
+	return err;
+
+no_offload_dev_inc:
+	if (tcf_block_offload_in_use(block))
+		return -EOPNOTSUPP;
+	block->nooffloaddevcnt++;
+	return 0;
 }
 
 static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
 				     struct tcf_block_ext_info *ei)
 {
-	tcf_block_offload_cmd(block, q, ei, TC_BLOCK_UNBIND);
+	struct net_device *dev = q->dev_queue->dev;
+	int err;
+
+	if (!dev->netdev_ops->ndo_setup_tc)
+		goto no_offload_dev_dec;
+	err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND);
+	if (err == -EOPNOTSUPP)
+		goto no_offload_dev_dec;
+	return;
+
+no_offload_dev_dec:
+	WARN_ON(block->nooffloaddevcnt-- == 0);
 }
 
 static int
@@ -510,10 +545,15 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
 	if (err)
 		goto err_chain_head_change_cb_add;
 
-	tcf_block_offload_bind(block, q, ei);
+	err = tcf_block_offload_bind(block, q, ei);
+	if (err)
+		goto err_block_offload_bind;
+
 	*p_block = block;
 	return 0;
 
+err_block_offload_bind:
+	tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei);
 err_chain_head_change_cb_add:
 	tcf_block_owner_del(block, q, ei->binder_type);
 err_block_owner_add:
@@ -643,9 +683,16 @@ struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block,
 {
 	struct tcf_block_cb *block_cb;
 
+	/* At this point, playback of previous block cb calls is not supported,
+	 * so forbid to register to block which already has some offloaded
+	 * filters present.
+	 */
+	if (tcf_block_offload_in_use(block))
+		return ERR_PTR(-EOPNOTSUPP);
+
 	block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL);
 	if (!block_cb)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 	block_cb->cb = cb;
 	block_cb->cb_ident = cb_ident;
 	block_cb->cb_priv = cb_priv;
@@ -661,7 +708,7 @@ int tcf_block_cb_register(struct tcf_block *block,
 	struct tcf_block_cb *block_cb;
 
 	block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv);
-	return block_cb ? 0 : -ENOMEM;
+	return IS_ERR(block_cb) ? PTR_ERR(block_cb) : 0;
 }
 EXPORT_SYMBOL(tcf_block_cb_register);
 
@@ -691,6 +738,10 @@ static int tcf_block_cb_call(struct tcf_block *block, enum tc_setup_type type,
 	int ok_count = 0;
 	int err;
 
+	/* Make sure all netdevs sharing this block are offload-capable. */
+	if (block->nooffloaddevcnt && err_stop)
+		return -EOPNOTSUPP;
+
 	list_for_each_entry(block_cb, &block->cb_list, list) {
 		err = block_cb->cb(type, type_data, block_cb->cb_priv);
 		if (err) {
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index d79cc50..cf72aef 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -167,13 +167,16 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 	cls_bpf.exts_integrated = obj->exts_integrated;
 	cls_bpf.gen_flags = obj->gen_flags;
 
+	if (oldprog)
+		tcf_block_offload_dec(block, &oldprog->gen_flags);
+
 	err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSBPF, &cls_bpf, skip_sw);
 	if (prog) {
 		if (err < 0) {
 			cls_bpf_offload_cmd(tp, oldprog, prog);
 			return err;
 		} else if (err > 0) {
-			prog->gen_flags |= TCA_CLS_FLAGS_IN_HW;
+			tcf_block_offload_inc(block, &prog->gen_flags);
 		}
 	}
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 6132a73..f61df19 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -229,6 +229,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f)
 
 	tc_setup_cb_call(block, &f->exts, TC_SETUP_CLSFLOWER,
 			 &cls_flower, false);
+	tcf_block_offload_dec(block, &f->flags);
 }
 
 static int fl_hw_replace_filter(struct tcf_proto *tp,
@@ -256,7 +257,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 		fl_hw_destroy_filter(tp, f);
 		return err;
 	} else if (err > 0) {
-		f->flags |= TCA_CLS_FLAGS_IN_HW;
+		tcf_block_offload_inc(block, &f->flags);
 	}
 
 	if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW))
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 66d4e00..d0e57c8 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -81,6 +81,7 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp,
 	cls_mall.cookie = cookie;
 
 	tc_setup_cb_call(block, NULL, TC_SETUP_CLSMATCHALL, &cls_mall, false);
+	tcf_block_offload_dec(block, &head->flags);
 }
 
 static int mall_replace_hw_filter(struct tcf_proto *tp,
@@ -103,7 +104,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
 		mall_destroy_hw_filter(tp, head, cookie);
 		return err;
 	} else if (err > 0) {
-		head->flags |= TCA_CLS_FLAGS_IN_HW;
+		tcf_block_offload_inc(block, &head->flags);
 	}
 
 	if (skip_sw && !(head->flags & TCA_CLS_FLAGS_IN_HW))
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 507859c..020d328 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -529,16 +529,17 @@ static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h,
 	return 0;
 }
 
-static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
+static void u32_remove_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
 {
 	struct tcf_block *block = tp->chain->block;
 	struct tc_cls_u32_offload cls_u32 = {};
 
 	tc_cls_common_offload_init(&cls_u32.common, tp);
 	cls_u32.command = TC_CLSU32_DELETE_KNODE;
-	cls_u32.knode.handle = handle;
+	cls_u32.knode.handle = n->handle;
 
 	tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, false);
+	tcf_block_offload_dec(block, &n->flags);
 }
 
 static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
@@ -567,10 +568,10 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
 
 	err = tc_setup_cb_call(block, NULL, TC_SETUP_CLSU32, &cls_u32, skip_sw);
 	if (err < 0) {
-		u32_remove_hw_knode(tp, n->handle);
+		u32_remove_hw_knode(tp, n);
 		return err;
 	} else if (err > 0) {
-		n->flags |= TCA_CLS_FLAGS_IN_HW;
+		tcf_block_offload_inc(block, &n->flags);
 	}
 
 	if (skip_sw && !(n->flags & TCA_CLS_FLAGS_IN_HW))
@@ -589,7 +590,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
 			RCU_INIT_POINTER(ht->ht[h],
 					 rtnl_dereference(n->next));
 			tcf_unbind_filter(tp, &n->res);
-			u32_remove_hw_knode(tp, n->handle);
+			u32_remove_hw_knode(tp, n);
 			idr_remove_ext(&ht->handle_idr, n->handle);
 			if (tcf_exts_get_net(&n->exts))
 				call_rcu(&n->rcu, u32_delete_key_freepf_rcu);
@@ -682,7 +683,7 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last)
 		goto out;
 
 	if (TC_U32_KEY(ht->handle)) {
-		u32_remove_hw_knode(tp, ht->handle);
+		u32_remove_hw_knode(tp, (struct tc_u_knode *)ht);
 		ret = u32_delete_key(tp, (struct tc_u_knode *)ht);
 		goto out;
 	}
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v5 03/10] net: sched: introduce block mechanism to handle netif_keep_dst calls
From: Jiri Pirko @ 2017-12-26 14:15 UTC (permalink / raw)
  To: netdev
  Cc: davem, jhs, xiyou.wangcong, mlxsw, andrew, vivien.didelot,
	f.fainelli, michael.chan, ganeshgr, saeedm, matanb, leonro,
	idosch, jakub.kicinski, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, ogerlitz, john.fastabend, daniel,
	dsahern
In-Reply-To: <20171226141604.1605-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Couple of classifiers call netif_keep_dst directly on q->dev. That is
not possible to do directly for shared blocke where multiple qdiscs are
owning the block. So introduce a infrastructure to keep track of the
block owners in list and use this list to implement block variant of
netif_keep_dst.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
v3->v4:
- rebased on top of the current net-next
v1->v2:
- fix binder_type to check "egress" as well as pointed out by Daniel
---
 include/net/pkt_cls.h     |  1 +
 include/net/sch_generic.h |  2 ++
 net/sched/cls_api.c       | 69 +++++++++++++++++++++++++++++++++++++++++++++++
 net/sched/cls_bpf.c       |  4 +--
 net/sched/cls_flow.c      |  2 +-
 net/sched/cls_route.c     |  2 +-
 6 files changed, 76 insertions(+), 4 deletions(-)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 35ab7c9..4837f4a 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -40,6 +40,7 @@ bool tcf_queue_work(struct work_struct *work);
 struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
 				bool create);
 void tcf_chain_put(struct tcf_chain *chain);
+void tcf_block_netif_keep_dst(struct tcf_block *block);
 int tcf_block_get(struct tcf_block **p_block,
 		  struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q,
 		  struct netlink_ext_ack *extack);
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 5cc4d71..df97c3e 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -289,6 +289,8 @@ struct tcf_block {
 	struct net *net;
 	struct Qdisc *q;
 	struct list_head cb_list;
+	struct list_head owner_list;
+	bool keep_dst;
 };
 
 static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index b93eca8..8a7d77a 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -383,6 +383,7 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
 	}
 	INIT_LIST_HEAD(&block->chain_list);
 	INIT_LIST_HEAD(&block->cb_list);
+	INIT_LIST_HEAD(&block->owner_list);
 
 	/* Create chain 0 by default, it has to be always present. */
 	chain = tcf_chain_create(block, 0);
@@ -414,6 +415,65 @@ static struct tcf_chain *tcf_block_chain_zero(struct tcf_block *block)
 	return list_first_entry(&block->chain_list, struct tcf_chain, list);
 }
 
+struct tcf_block_owner_item {
+	struct list_head list;
+	struct Qdisc *q;
+	enum tcf_block_binder_type binder_type;
+};
+
+static void
+tcf_block_owner_netif_keep_dst(struct tcf_block *block,
+			       struct Qdisc *q,
+			       enum tcf_block_binder_type binder_type)
+{
+	if (block->keep_dst &&
+	    binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS &&
+	    binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
+		netif_keep_dst(qdisc_dev(q));
+}
+
+void tcf_block_netif_keep_dst(struct tcf_block *block)
+{
+	struct tcf_block_owner_item *item;
+
+	block->keep_dst = true;
+	list_for_each_entry(item, &block->owner_list, list)
+		tcf_block_owner_netif_keep_dst(block, item->q,
+					       item->binder_type);
+}
+EXPORT_SYMBOL(tcf_block_netif_keep_dst);
+
+static int tcf_block_owner_add(struct tcf_block *block,
+			       struct Qdisc *q,
+			       enum tcf_block_binder_type binder_type)
+{
+	struct tcf_block_owner_item *item;
+
+	item = kmalloc(sizeof(*item), GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+	item->q = q;
+	item->binder_type = binder_type;
+	list_add(&item->list, &block->owner_list);
+	return 0;
+}
+
+static void tcf_block_owner_del(struct tcf_block *block,
+				struct Qdisc *q,
+				enum tcf_block_binder_type binder_type)
+{
+	struct tcf_block_owner_item *item;
+
+	list_for_each_entry(item, &block->owner_list, list) {
+		if (item->q == q && item->binder_type == binder_type) {
+			list_del(&item->list);
+			kfree(item);
+			return;
+		}
+	}
+	WARN_ON(1);
+}
+
 int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
 		      struct tcf_block_ext_info *ei,
 		      struct netlink_ext_ack *extack)
@@ -442,6 +502,12 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
 		}
 	}
 
+	err = tcf_block_owner_add(block, q, ei->binder_type);
+	if (err)
+		goto err_block_owner_add;
+
+	tcf_block_owner_netif_keep_dst(block, q, ei->binder_type);
+
 	err = tcf_chain_head_change_cb_add(tcf_block_chain_zero(block),
 					   ei, extack);
 	if (err)
@@ -452,6 +518,8 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
 	return 0;
 
 err_chain_head_change_cb_add:
+	tcf_block_owner_del(block, q, ei->binder_type);
+err_block_owner_add:
 	if (created) {
 		if (ei->shareable)
 			tcf_block_remove(block, net);
@@ -497,6 +565,7 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
 		return;
 
 	tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei);
+	tcf_block_owner_del(block, q, ei->binder_type);
 
 	if (--block->refcnt == 0) {
 		if (ei->shareable)
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 8d78e7f..d79cc50 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -392,8 +392,8 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
 	prog->bpf_name = name;
 	prog->filter = fp;
 
-	if (fp->dst_needed && !(tp->q->flags & TCQ_F_INGRESS))
-		netif_keep_dst(qdisc_dev(tp->q));
+	if (fp->dst_needed)
+		tcf_block_netif_keep_dst(tp->chain->block);
 
 	return 0;
 }
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 25c2a88..28cd6fb 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -526,7 +526,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
 
 	timer_setup(&fnew->perturb_timer, flow_perturbation, TIMER_DEFERRABLE);
 
-	netif_keep_dst(qdisc_dev(tp->q));
+	tcf_block_netif_keep_dst(tp->chain->block);
 
 	if (tb[TCA_FLOW_KEYS]) {
 		fnew->keymask = keymask;
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index ac9a5b8..a1f2b1b 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -527,7 +527,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
 		if (f->handle < f1->handle)
 			break;
 
-	netif_keep_dst(qdisc_dev(tp->q));
+	tcf_block_netif_keep_dst(tp->chain->block);
 	rcu_assign_pointer(f->next, f1);
 	rcu_assign_pointer(*fp, f);
 
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v5 04/10] net: sched: remove classid and q fields from tcf_proto
From: Jiri Pirko @ 2017-12-26 14:15 UTC (permalink / raw)
  To: netdev
  Cc: davem, jhs, xiyou.wangcong, mlxsw, andrew, vivien.didelot,
	f.fainelli, michael.chan, ganeshgr, saeedm, matanb, leonro,
	idosch, jakub.kicinski, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, ogerlitz, john.fastabend, daniel,
	dsahern
In-Reply-To: <20171226141604.1605-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Both are no longer used, so remove them.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/sch_generic.h | 2 --
 net/sched/cls_api.c       | 7 ++-----
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index df97c3e..dba2214 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -255,8 +255,6 @@ struct tcf_proto {
 
 	/* All the rest */
 	u32			prio;
-	u32			classid;
-	struct Qdisc		*q;
 	void			*data;
 	const struct tcf_proto_ops	*ops;
 	struct tcf_chain	*chain;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 8a7d77a..07767b4 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -122,8 +122,7 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp)
 }
 
 static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
-					  u32 prio, u32 parent, struct Qdisc *q,
-					  struct tcf_chain *chain)
+					  u32 prio, struct tcf_chain *chain)
 {
 	struct tcf_proto *tp;
 	int err;
@@ -157,8 +156,6 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
 	tp->classify = tp->ops->classify;
 	tp->protocol = protocol;
 	tp->prio = prio;
-	tp->classid = parent;
-	tp->q = q;
 	tp->chain = chain;
 
 	err = tp->ops->init(tp);
@@ -1075,7 +1072,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
 			prio = tcf_auto_prio(tcf_chain_tp_prev(&chain_info));
 
 		tp = tcf_proto_create(nla_data(tca[TCA_KIND]),
-				      protocol, prio, parent, q, chain);
+				      protocol, prio, chain);
 		if (IS_ERR(tp)) {
 			err = PTR_ERR(tp);
 			goto errout;
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v5 02/10] net: sched: avoid usage of tp->q in tcf_classify
From: Jiri Pirko @ 2017-12-26 14:15 UTC (permalink / raw)
  To: netdev
  Cc: davem, jhs, xiyou.wangcong, mlxsw, andrew, vivien.didelot,
	f.fainelli, michael.chan, ganeshgr, saeedm, matanb, leonro,
	idosch, jakub.kicinski, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, ogerlitz, john.fastabend, daniel,
	dsahern
In-Reply-To: <20171226141604.1605-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

Use block index in the messages instead.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 net/sched/cls_api.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 9b1b4fa..b93eca8 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -678,8 +678,9 @@ int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 #ifdef CONFIG_NET_CLS_ACT
 reset:
 	if (unlikely(limit++ >= max_reclassify_loop)) {
-		net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n",
-				       tp->q->ops->id, tp->prio & 0xffff,
+		net_notice_ratelimited("%u: reclassify loop, rule prio %u, protocol %02x\n",
+				       tp->chain->block->index,
+				       tp->prio & 0xffff,
 				       ntohs(tp->protocol));
 		return TC_ACT_SHOT;
 	}
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v5 01/10] net: sched: introduce support for multiple filter chain pointers registration
From: Jiri Pirko @ 2017-12-26 14:15 UTC (permalink / raw)
  To: netdev
  Cc: davem, jhs, xiyou.wangcong, mlxsw, andrew, vivien.didelot,
	f.fainelli, michael.chan, ganeshgr, saeedm, matanb, leonro,
	idosch, jakub.kicinski, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, ogerlitz, john.fastabend, daniel,
	dsahern
In-Reply-To: <20171226141604.1605-1-jiri@resnulli.us>

From: Jiri Pirko <jiri@mellanox.com>

So far, there was possible only to register a single filter chain
pointer to block->chain[0]. However, when the blocks will get shareable,
we need to allow multiple filter chain pointers registration.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
v3->v4:
- rebased on top of the current net-next
- added some extack strings
v2->v3:
- rebased on top of the current net-next
---
 include/net/pkt_cls.h     |   3 +
 include/net/sch_generic.h |   5 +-
 net/sched/cls_api.c       | 236 +++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 216 insertions(+), 28 deletions(-)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 31574c9..35ab7c9 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -29,6 +29,8 @@ struct tcf_block_ext_info {
 	enum tcf_block_binder_type binder_type;
 	tcf_chain_head_change_t *chain_head_change;
 	void *chain_head_change_priv;
+	bool shareable;
+	u32 block_index;
 };
 
 struct tcf_block_cb;
@@ -50,6 +52,7 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
 
 static inline struct Qdisc *tcf_block_q(struct tcf_block *block)
 {
+	WARN_ON(block->refcnt != 1);
 	return block->q;
 }
 
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index ac029d5..5cc4d71 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -275,8 +275,7 @@ typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv);
 
 struct tcf_chain {
 	struct tcf_proto __rcu *filter_chain;
-	tcf_chain_head_change_t *chain_head_change;
-	void *chain_head_change_priv;
+	struct list_head filter_chain_list;
 	struct list_head list;
 	struct tcf_block *block;
 	u32 index; /* chain index */
@@ -285,6 +284,8 @@ struct tcf_chain {
 
 struct tcf_block {
 	struct list_head chain_list;
+	u32 index; /* block index for shared blocks */
+	unsigned int refcnt;
 	struct net *net;
 	struct Qdisc *q;
 	struct list_head cb_list;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 4591b87..9b1b4fa 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -24,6 +24,7 @@
 #include <linux/init.h>
 #include <linux/kmod.h>
 #include <linux/slab.h>
+#include <linux/idr.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
 #include <net/netlink.h>
@@ -179,6 +180,12 @@ static void tcf_proto_destroy(struct tcf_proto *tp)
 	kfree_rcu(tp, rcu);
 }
 
+struct tcf_filter_chain_list_item {
+	struct list_head list;
+	tcf_chain_head_change_t *chain_head_change;
+	void *chain_head_change_priv;
+};
+
 static struct tcf_chain *tcf_chain_create(struct tcf_block *block,
 					  u32 chain_index)
 {
@@ -187,6 +194,7 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block,
 	chain = kzalloc(sizeof(*chain), GFP_KERNEL);
 	if (!chain)
 		return NULL;
+	INIT_LIST_HEAD(&chain->filter_chain_list);
 	list_add_tail(&chain->list, &block->chain_list);
 	chain->block = block;
 	chain->index = chain_index;
@@ -194,12 +202,19 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block,
 	return chain;
 }
 
+static void tcf_chain_head_change_item(struct tcf_filter_chain_list_item *item,
+				       struct tcf_proto *tp_head)
+{
+	if (item->chain_head_change)
+		item->chain_head_change(tp_head, item->chain_head_change_priv);
+}
 static void tcf_chain_head_change(struct tcf_chain *chain,
 				  struct tcf_proto *tp_head)
 {
-	if (chain->chain_head_change)
-		chain->chain_head_change(tp_head,
-					 chain->chain_head_change_priv);
+	struct tcf_filter_chain_list_item *item;
+
+	list_for_each_entry(item, &chain->filter_chain_list, list)
+		tcf_chain_head_change_item(item, tp_head);
 }
 
 static void tcf_chain_flush(struct tcf_chain *chain)
@@ -280,17 +295,91 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
 	tcf_block_offload_cmd(block, q, ei, TC_BLOCK_UNBIND);
 }
 
-int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
-		      struct tcf_block_ext_info *ei,
-		      struct netlink_ext_ack *extack)
+static int
+tcf_chain_head_change_cb_add(struct tcf_chain *chain,
+			     struct tcf_block_ext_info *ei,
+			     struct netlink_ext_ack *extack)
 {
-	struct tcf_block *block = kzalloc(sizeof(*block), GFP_KERNEL);
+	struct tcf_filter_chain_list_item *item;
+
+	item = kmalloc(sizeof(*item), GFP_KERNEL);
+	if (!item) {
+		NL_SET_ERR_MSG(extack, "Memory allocation for head change callback item failed");
+		return -ENOMEM;
+	}
+	item->chain_head_change = ei->chain_head_change;
+	item->chain_head_change_priv = ei->chain_head_change_priv;
+	if (chain->filter_chain)
+		tcf_chain_head_change_item(item, chain->filter_chain);
+	list_add(&item->list, &chain->filter_chain_list);
+	return 0;
+}
+
+static void
+tcf_chain_head_change_cb_del(struct tcf_chain *chain,
+			     struct tcf_block_ext_info *ei)
+{
+	struct tcf_filter_chain_list_item *item;
+
+	list_for_each_entry(item, &chain->filter_chain_list, list) {
+		if ((!ei->chain_head_change && !ei->chain_head_change_priv) ||
+		    (item->chain_head_change == ei->chain_head_change &&
+		     item->chain_head_change_priv == ei->chain_head_change_priv)) {
+			tcf_chain_head_change_item(item, NULL);
+			list_del(&item->list);
+			kfree(item);
+			return;
+		}
+	}
+	WARN_ON(1);
+}
+
+struct tcf_net {
+	struct idr idr;
+};
+
+static unsigned int tcf_net_id;
+
+static int tcf_block_insert(struct tcf_block *block, struct net *net,
+			    u32 block_index, struct netlink_ext_ack *extack)
+{
+	struct tcf_net *tn = net_generic(net, tcf_net_id);
+	int idr_start;
+	int idr_end;
+	int index;
+
+	if (block_index >= INT_MAX) {
+		NL_SET_ERR_MSG(extack, "Invalid block index value (>= INT_MAX)");
+		return -EINVAL;
+	}
+	idr_start = block_index ? block_index : 1;
+	idr_end = block_index ? block_index + 1 : INT_MAX;
+
+	index = idr_alloc(&tn->idr, block, idr_start, idr_end, GFP_KERNEL);
+	if (index < 0)
+		return index;
+	block->index = index;
+	return 0;
+}
+
+static void tcf_block_remove(struct tcf_block *block, struct net *net)
+{
+	struct tcf_net *tn = net_generic(net, tcf_net_id);
+
+	idr_remove(&tn->idr, block->index);
+}
+
+static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
+					  struct netlink_ext_ack *extack)
+{
+	struct tcf_block *block;
 	struct tcf_chain *chain;
 	int err;
 
+	block = kzalloc(sizeof(*block), GFP_KERNEL);
 	if (!block) {
 		NL_SET_ERR_MSG(extack, "Memory allocation for block failed");
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 	}
 	INIT_LIST_HEAD(&block->chain_list);
 	INIT_LIST_HEAD(&block->cb_list);
@@ -302,17 +391,75 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
 		err = -ENOMEM;
 		goto err_chain_create;
 	}
-	WARN_ON(!ei->chain_head_change);
-	chain->chain_head_change = ei->chain_head_change;
-	chain->chain_head_change_priv = ei->chain_head_change_priv;
 	block->net = qdisc_net(q);
+	block->refcnt = 1;
+	block->net = net;
 	block->q = q;
+	return block;
+
+err_chain_create:
+	kfree(block);
+	return ERR_PTR(err);
+}
+
+static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index)
+{
+	struct tcf_net *tn = net_generic(net, tcf_net_id);
+
+	return idr_find(&tn->idr, block_index);
+}
+
+static struct tcf_chain *tcf_block_chain_zero(struct tcf_block *block)
+{
+	return list_first_entry(&block->chain_list, struct tcf_chain, list);
+}
+
+int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
+		      struct tcf_block_ext_info *ei,
+		      struct netlink_ext_ack *extack)
+{
+	struct net *net = qdisc_net(q);
+	struct tcf_block *block = NULL;
+	bool created = false;
+	int err;
+
+	if (ei->shareable) {
+		block = tcf_block_lookup(net, ei->block_index);
+		if (block)
+			block->refcnt++;
+	}
+
+	if (!block) {
+		block = tcf_block_create(net, q, extack);
+		if (IS_ERR(block))
+			return PTR_ERR(block);
+		created = true;
+		if (ei->shareable) {
+			err = tcf_block_insert(block, net,
+					       ei->block_index, extack);
+			if (err)
+				goto err_block_insert;
+		}
+	}
+
+	err = tcf_chain_head_change_cb_add(tcf_block_chain_zero(block),
+					   ei, extack);
+	if (err)
+		goto err_chain_head_change_cb_add;
+
 	tcf_block_offload_bind(block, q, ei);
 	*p_block = block;
 	return 0;
 
-err_chain_create:
-	kfree(block);
+err_chain_head_change_cb_add:
+	if (created) {
+		if (ei->shareable)
+			tcf_block_remove(block, net);
+err_block_insert:
+		kfree(block);
+	} else {
+		block->refcnt--;
+	}
 	return err;
 }
 EXPORT_SYMBOL(tcf_block_get_ext);
@@ -346,26 +493,35 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
 {
 	struct tcf_chain *chain, *tmp;
 
-	/* Hold a refcnt for all chains, so that they don't disappear
-	 * while we are iterating.
-	 */
 	if (!block)
 		return;
-	list_for_each_entry(chain, &block->chain_list, list)
-		tcf_chain_hold(chain);
 
-	list_for_each_entry(chain, &block->chain_list, list)
-		tcf_chain_flush(chain);
+	tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei);
+
+	if (--block->refcnt == 0) {
+		if (ei->shareable)
+			tcf_block_remove(block, block->net);
+
+		/* Hold a refcnt for all chains, so that they don't disappear
+		 * while we are iterating.
+		 */
+		list_for_each_entry(chain, &block->chain_list, list)
+			tcf_chain_hold(chain);
+
+		list_for_each_entry(chain, &block->chain_list, list)
+			tcf_chain_flush(chain);
+	}
 
 	tcf_block_offload_unbind(block, q, ei);
 
-	/* At this point, all the chains should have refcnt >= 1. */
-	list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
-		tcf_chain_put(chain);
+	if (block->refcnt == 0) {
+		/* At this point, all the chains should have refcnt >= 1. */
+		list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
+			tcf_chain_put(chain);
 
-	/* Finally, put chain 0 and allow block to be freed. */
-	chain = list_first_entry(&block->chain_list, struct tcf_chain, list);
-	tcf_chain_put(chain);
+		/* Finally, put chain 0 and allow block to be freed. */
+		tcf_chain_put(tcf_block_chain_zero(block));
+	}
 }
 EXPORT_SYMBOL(tcf_block_put_ext);
 
@@ -1250,12 +1406,40 @@ int tc_setup_cb_call(struct tcf_block *block, struct tcf_exts *exts,
 }
 EXPORT_SYMBOL(tc_setup_cb_call);
 
+static __net_init int tcf_net_init(struct net *net)
+{
+	struct tcf_net *tn = net_generic(net, tcf_net_id);
+
+	idr_init(&tn->idr);
+	return 0;
+}
+
+static void __net_exit tcf_net_exit(struct net *net)
+{
+	struct tcf_net *tn = net_generic(net, tcf_net_id);
+
+	idr_destroy(&tn->idr);
+}
+
+static struct pernet_operations tcf_net_ops = {
+	.init = tcf_net_init,
+	.exit = tcf_net_exit,
+	.id   = &tcf_net_id,
+	.size = sizeof(struct tcf_net),
+};
+
 static int __init tc_filter_init(void)
 {
+	int err;
+
 	tc_filter_wq = alloc_ordered_workqueue("tc_filter_workqueue", 0);
 	if (!tc_filter_wq)
 		return -ENOMEM;
 
+	err = register_pernet_subsys(&tcf_net_ops);
+	if (err)
+		return err;
+
 	rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, 0);
 	rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, 0);
 	rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter,
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v5 00/10] net: sched: allow qdiscs to share filter block instances
From: Jiri Pirko @ 2017-12-26 14:15 UTC (permalink / raw)
  To: netdev
  Cc: davem, jhs, xiyou.wangcong, mlxsw, andrew, vivien.didelot,
	f.fainelli, michael.chan, ganeshgr, saeedm, matanb, leonro,
	idosch, jakub.kicinski, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, ogerlitz, john.fastabend, daniel,
	dsahern

From: Jiri Pirko <jiri@mellanox.com>

Currently the filters added to qdiscs are independent. So for example if you
have 2 netdevices and you create ingress qdisc on both and you want to add
identical filter rules both, you need to add them twice. This patchset
makes this easier and mainly saves resources allowing to share all filters
within a qdisc - I call it a "filter block". Also this helps to save
resources when we do offload to hw for example to expensive TCAM.

So back to the example. First, we create 2 qdiscs. Both will share
block number 22. "22" is just an identification. If we don't pass any
block number, a new one will be generated by kernel:

$ tc qdisc add dev ens7 ingress block 22
                                ^^^^^^^^
$ tc qdisc add dev ens8 ingress block 22
                                ^^^^^^^^

Now if we list the qdiscs, we will see the block index in the output:

$ tc qdisc
qdisc ingress ffff: dev ens7 parent ffff:fff1 block 22
qdisc ingress ffff: dev ens8 parent ffff:fff1 block 22

To make is more visual, the situation looks like this:

   ens7 ingress qdisc                 ens7 ingress qdisc
          |                                  |
          |                                  |
          +---------->  block 22  <----------+

Unlimited number of qdiscs may share the same block.

Now we can add filter to any of qdiscs sharing the same block:

$ tc filter add dev ens7 ingress protocol ip pref 25 flower dst_ip 192.168.0.0/16 action drop


We will see the same output if we list filters for ens7 and ens8, including stats:

$ tc -s filter show dev ens7 ingress
filter protocol ip pref 25 flower chain 0
filter protocol ip pref 25 flower chain 0 handle 0x1
  eth_type ipv4
  dst_ip 192.168.0.0/16
  not_in_hw
        action order 1: gact action drop
         random type none pass val 0
         index 1 ref 1 bind 1 installed 39 sec used 2 sec
        Action statistics:
        Sent 3108 bytes 37 pkt (dropped 37, overlimits 0 requeues 0)
        backlog 0b 0p requeues 0

$ tc -s filter show dev ens8 ingress
filter protocol ip pref 25 flower chain 0
filter protocol ip pref 25 flower chain 0 handle 0x1
  eth_type ipv4
  dst_ip 192.168.0.0/16
  not_in_hw
        action order 1: gact action drop
         random type none pass val 0
         index 1 ref 1 bind 1 installed 40 sec used 3 sec
        Action statistics:
        Sent 3108 bytes 37 pkt (dropped 37, overlimits 0 requeues 0)
        backlog 0b 0p requeues 0

---
v4->v5:
- patch 5:
 - add tracking of binding of devs that are unable to offload and check
   that before block cbs call.

v3->v4:
- patch 1:
 - rebased on top of the current net-next
 - added some extack strings
- patch 3:
 - rebased on top of the current net-next
- patch 5:
 - propagate netdev_ops->ndo_setup_tc error up to tcf_block_offload_bind
   caller
- patch 6:
 - rebased on top of the current net-next

v2->v3:
- removed original patch 1, removing tp->q cls_bpf dependency. Fixed by
  Jakub in the meantime.
- patch 1:
 - rebased on top of the current net-next
- patch 5:
 - new patch
- patch 6:
 - removed "p_" prefix from block index function args
- patch 9:
 - add tc offload feature handling

Jiri Pirko (10):
  net: sched: introduce support for multiple filter chain pointers
    registration
  net: sched: avoid usage of tp->q in tcf_classify
  net: sched: introduce block mechanism to handle netif_keep_dst calls
  net: sched: remove classid and q fields from tcf_proto
  net: sched: keep track of offloaded filters and check tc offload
    feature
  net: sched: allow ingress and clsact qdiscs to share filter blocks
  mlxsw: spectrum_acl: Reshuffle code around
    mlxsw_sp_acl_ruleset_create/destroy
  mlxsw: spectrum_acl: Don't store netdev and ingress for ruleset unbind
  mlxsw: spectrum_acl: Implement TC block sharing
  mlxsw: spectrum_acl: Pass mlxsw_sp_port down to ruleset bind/unbind
    ops

 drivers/net/ethernet/mellanox/mlxsw/spectrum.c     | 182 ++++++++--
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h     |  44 ++-
 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c | 302 ++++++++++++----
 .../ethernet/mellanox/mlxsw/spectrum_acl_tcam.c    |  44 +--
 .../net/ethernet/mellanox/mlxsw/spectrum_flower.c  |  41 +--
 include/net/pkt_cls.h                              |   4 +
 include/net/sch_generic.h                          |  27 +-
 include/uapi/linux/pkt_sched.h                     |  11 +
 net/sched/cls_api.c                                | 396 ++++++++++++++++++---
 net/sched/cls_bpf.c                                |   9 +-
 net/sched/cls_flow.c                               |   2 +-
 net/sched/cls_flower.c                             |   3 +-
 net/sched/cls_matchall.c                           |   3 +-
 net/sched/cls_route.c                              |   2 +-
 net/sched/cls_u32.c                                |  13 +-
 net/sched/sch_ingress.c                            |  89 ++++-
 16 files changed, 948 insertions(+), 224 deletions(-)

-- 
2.9.5

^ permalink raw reply

* [PATCH net] macvlan: Fix one possible double free
From: gfree.wind @ 2017-12-26 13:44 UTC (permalink / raw)
  To: davem, netdev; +Cc: Gao Feng

From: Gao Feng <gfree.wind@vip.163.com>

Because the macvlan_uninit would free the macvlan port, so there is one
double free case in macvlan_common_newlink. When the macvlan port is just
created, then register_netdevice or netdev_upper_dev_link failed and they
would invoke macvlan_uninit. Then it would reach the macvlan_port_destroy
which triggers the double free.

Signed-off-by: Gao Feng <gfree.wind@vip.163.com>
---
 drivers/net/macvlan.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index a178c5e..a0f2be8 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1444,9 +1444,14 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
 	return 0;
 
 unregister_netdev:
+	/* macvlan_uninit would free the macvlan port */
 	unregister_netdevice(dev);
+	return err;
 destroy_macvlan_port:
-	if (create)
+	/* the macvlan port may be freed by macvlan_uninit when fail to register.
+	 * so we destroy the macvlan port only when it's valid.
+	 */
+	if (create && macvlan_port_get_rtnl(dev))
 		macvlan_port_destroy(port->dev);
 	return err;
 }
-- 
1.9.1

^ permalink raw reply related

* FUND INVESTMENT PROJECT
From: hon odik @ 2017-12-26 13:37 UTC (permalink / raw)


Hello

My name is Barr. Odik Blaise kasibayo a lawyer and Financial
Consultant based here in East African continent and also operating out
of Panama and Cayman Island through a proxy.  We provide discrete
financial services for High Net worth Individuals, CEO's/Executives of
Corporations and Senior Government officials in around Africa and
Middle East. What we do is legal and within the ambient of the law,
but following the leakage of the infamous Panama Papers and subsequent
treat by the International Consortium of Investigative Journalists
(ICIJ) to publish more details and release more list not included in
the first one.

Practically all our clients  are no longer comfortable, they wants
their accounts closed and transfer their assets and cash out from this
Offshore locations  to  be invested in the real sector of an economy
in any part of the world that have good prospects.  So we are looking
for Manufacturers, Real estate developers, Construction Companies, and
Oil Exploration Companies, Mining Corporations that will be vetted to
qualify for this fund injection.

These funds will be given at a 0% interest as a structured loan,
rather it will be given as a tenured operational fund, and the
investor is only interested on any agreed percentage of profit
declared, either quarterly or yearly. A minimum of ROI of 8% is
expected.

Please write me only if you believe you or your organization can be
assessed for qualification and consideration to participate in this
program.

Yours Sincerely
……………………….
Barr. Odik Blaise kasibayo
Email: honodikblaisekasibayo@gmail.com

^ permalink raw reply

* Re: [PATCHv4 net-next 02/14] net: sched: sch_api: handle generic qdisc errors
From: Jiri Pirko @ 2017-12-26 11:53 UTC (permalink / raw)
  To: Alexander Aring; +Cc: jhs, xiyou.wangcong, davem, netdev, kernel, David Ahern
In-Reply-To: <20171220173524.25874-3-aring@mojatatu.com>

Wed, Dec 20, 2017 at 06:35:12PM CET, aring@mojatatu.com wrote:
>This patch adds extack support for generic qdisc handling. The extack
>will be set deeper to each called function which is not part of netdev
>core api.
>
>Cc: David Ahern <dsahern@gmail.com>
>Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
>Signed-off-by: Alexander Aring <aring@mojatatu.com>
>---

[...]


>@@ -1349,21 +1387,33 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
> 
> 		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
> 			if (tcm->tcm_handle) {
>-				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
>+				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
>+					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
> 					return -EEXIST;
>-				if (TC_H_MIN(tcm->tcm_handle))
>+				}
>+				if (TC_H_MIN(tcm->tcm_handle)) {
>+					NL_SET_ERR_MSG(extack, "Invalid minor handle");
> 					return -EINVAL;
>+				}
> 				q = qdisc_lookup(dev, tcm->tcm_handle);
>-				if (!q)
>+				if (!q) {
>+					NL_SET_ERR_MSG(extack, "No qdisc found for specified handle");

This is incorrect. This is hit on successpath as well, confusing user
with the message. I will send fix shortly.


> 					goto create_n_graft;
>-				if (n->nlmsg_flags & NLM_F_EXCL)
>+				}

^ permalink raw reply

* [patch net-next v2 10/10] mlxsw: core: Add support for reload
From: Jiri Pirko @ 2017-12-26 11:23 UTC (permalink / raw)
  To: netdev
  Cc: davem, arkadis, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
	jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
	yuvalm, ogerlitz, dsa, roopa
In-Reply-To: <20171226112359.5313-1-jiri@resnulli.us>

From: Arkadi Sharshevsky <arkadis@mellanox.com>

Add support for hot reload. First, all the driver/core resources are
released but the PCI and devlink instances, then reset is performed
through the PCI interface. Finally the driver performs initialization.

In case of reload failure the driver is left in a partially initialized
state. Special care is taken during the driver removal in order to
properly handle this state.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core.c | 64 +++++++++++++++++++++++-------
 drivers/net/ethernet/mellanox/mlxsw/core.h |  5 ++-
 drivers/net/ethernet/mellanox/mlxsw/i2c.c  |  5 ++-
 drivers/net/ethernet/mellanox/mlxsw/pci.c  |  5 ++-
 4 files changed, 59 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 9fe25b1..4b33919 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -113,6 +113,7 @@ struct mlxsw_core {
 	struct mlxsw_thermal *thermal;
 	struct mlxsw_core_port *ports;
 	unsigned int max_ports;
+	bool reload_fail;
 	unsigned long driver_priv[0];
 	/* driver_priv has to be always the last item */
 };
@@ -962,7 +963,28 @@ mlxsw_devlink_sb_occ_tc_port_bind_get(struct devlink_port *devlink_port,
 						     pool_type, p_cur, p_max);
 }
 
+static int mlxsw_devlink_core_bus_device_reload(struct devlink *devlink)
+{
+	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
+	const struct mlxsw_bus *mlxsw_bus = mlxsw_core->bus;
+	int err;
+
+	if (!mlxsw_bus->reset)
+		return -EOPNOTSUPP;
+
+	mlxsw_core_bus_device_unregister(mlxsw_core, true);
+	mlxsw_bus->reset(mlxsw_core->bus_priv);
+	err = mlxsw_core_bus_device_register(mlxsw_core->bus_info,
+					     mlxsw_core->bus,
+					     mlxsw_core->bus_priv, true,
+					     devlink);
+	if (err)
+		mlxsw_core->reload_fail = true;
+	return err;
+}
+
 static const struct devlink_ops mlxsw_devlink_ops = {
+	.reload				= mlxsw_devlink_core_bus_device_reload,
 	.port_type_set			= mlxsw_devlink_port_type_set,
 	.port_split			= mlxsw_devlink_port_split,
 	.port_unsplit			= mlxsw_devlink_port_unsplit,
@@ -980,23 +1002,26 @@ static const struct devlink_ops mlxsw_devlink_ops = {
 
 int mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
 				   const struct mlxsw_bus *mlxsw_bus,
-				   void *bus_priv)
+				   void *bus_priv, bool reload,
+				   struct devlink *devlink)
 {
 	const char *device_kind = mlxsw_bus_info->device_kind;
 	struct mlxsw_core *mlxsw_core;
 	struct mlxsw_driver *mlxsw_driver;
-	struct devlink *devlink;
 	size_t alloc_size;
 	int err;
 
 	mlxsw_driver = mlxsw_core_driver_get(device_kind);
 	if (!mlxsw_driver)
 		return -EINVAL;
-	alloc_size = sizeof(*mlxsw_core) + mlxsw_driver->priv_size;
-	devlink = devlink_alloc(&mlxsw_devlink_ops, alloc_size);
-	if (!devlink) {
-		err = -ENOMEM;
-		goto err_devlink_alloc;
+
+	if (!reload) {
+		alloc_size = sizeof(*mlxsw_core) + mlxsw_driver->priv_size;
+		devlink = devlink_alloc(&mlxsw_devlink_ops, alloc_size);
+		if (!devlink) {
+			err = -ENOMEM;
+			goto err_devlink_alloc;
+		}
 	}
 
 	mlxsw_core = devlink_priv(devlink);
@@ -1012,7 +1037,7 @@ int mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
 	if (err)
 		goto err_bus_init;
 
-	if (mlxsw_driver->resources_register) {
+	if (mlxsw_driver->resources_register && !reload) {
 		err = mlxsw_driver->resources_register(mlxsw_core);
 		if (err)
 			goto err_register_resources;
@@ -1038,9 +1063,11 @@ int mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
 	if (err)
 		goto err_emad_init;
 
-	err = devlink_register(devlink, mlxsw_bus_info->dev);
-	if (err)
-		goto err_devlink_register;
+	if (!reload) {
+		err = devlink_register(devlink, mlxsw_bus_info->dev);
+		if (err)
+			goto err_devlink_register;
+	}
 
 	err = mlxsw_hwmon_init(mlxsw_core, mlxsw_bus_info, &mlxsw_core->hwmon);
 	if (err)
@@ -1082,20 +1109,29 @@ int mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
 }
 EXPORT_SYMBOL(mlxsw_core_bus_device_register);
 
-void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core)
+void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core,
+				      bool reload)
 {
 	const char *device_kind = mlxsw_core->bus_info->device_kind;
 	struct devlink *devlink = priv_to_devlink(mlxsw_core);
 
+	if (mlxsw_core->reload_fail)
+		goto out;
+
 	if (mlxsw_core->driver->fini)
 		mlxsw_core->driver->fini(mlxsw_core);
 	mlxsw_thermal_fini(mlxsw_core->thermal);
-	devlink_unregister(devlink);
+	if (!reload)
+		devlink_unregister(devlink);
 	mlxsw_emad_fini(mlxsw_core);
 	kfree(mlxsw_core->lag.mapping);
 	mlxsw_ports_fini(mlxsw_core);
-	devlink_resources_unregister(devlink, NULL);
+	if (!reload)
+		devlink_resources_unregister(devlink, NULL);
 	mlxsw_core->bus->fini(mlxsw_core->bus_priv);
+	if (reload)
+		return;
+out:
 	devlink_free(devlink);
 	mlxsw_core_driver_put(device_kind);
 }
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index e44061d..5ddafd7 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -66,8 +66,9 @@ void mlxsw_core_driver_unregister(struct mlxsw_driver *mlxsw_driver);
 
 int mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
 				   const struct mlxsw_bus *mlxsw_bus,
-				   void *bus_priv);
-void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core);
+				   void *bus_priv, bool reload,
+				   struct devlink *devlink);
+void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core, bool reload);
 
 struct mlxsw_tx_info {
 	u8 local_port;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/i2c.c b/drivers/net/ethernet/mellanox/mlxsw/i2c.c
index c0dcfa0..25f9915 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/i2c.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/i2c.c
@@ -539,7 +539,8 @@ static int mlxsw_i2c_probe(struct i2c_client *client,
 	mlxsw_i2c->dev = &client->dev;
 
 	err = mlxsw_core_bus_device_register(&mlxsw_i2c->bus_info,
-					     &mlxsw_i2c_bus, mlxsw_i2c);
+					     &mlxsw_i2c_bus, mlxsw_i2c, false,
+					     NULL);
 	if (err) {
 		dev_err(&client->dev, "Fail to register core bus\n");
 		return err;
@@ -557,7 +558,7 @@ static int mlxsw_i2c_remove(struct i2c_client *client)
 {
 	struct mlxsw_i2c *mlxsw_i2c = i2c_get_clientdata(client);
 
-	mlxsw_core_bus_device_unregister(mlxsw_i2c->core);
+	mlxsw_core_bus_device_unregister(mlxsw_i2c->core, false);
 	mutex_destroy(&mlxsw_i2c->cmd.lock);
 
 	return 0;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index 6468cf2..ceb8785 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -1734,7 +1734,8 @@ static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	mlxsw_pci->id = id;
 
 	err = mlxsw_core_bus_device_register(&mlxsw_pci->bus_info,
-					     &mlxsw_pci_bus, mlxsw_pci);
+					     &mlxsw_pci_bus, mlxsw_pci, false,
+					     NULL);
 	if (err) {
 		dev_err(&pdev->dev, "cannot register bus device\n");
 		goto err_bus_device_register;
@@ -1762,7 +1763,7 @@ static void mlxsw_pci_remove(struct pci_dev *pdev)
 {
 	struct mlxsw_pci *mlxsw_pci = pci_get_drvdata(pdev);
 
-	mlxsw_core_bus_device_unregister(mlxsw_pci->core);
+	mlxsw_core_bus_device_unregister(mlxsw_pci->core, false);
 	mlxsw_pci_free_irq_vectors(mlxsw_pci);
 	iounmap(mlxsw_pci->hw_addr);
 	pci_release_regions(mlxsw_pci->pdev);
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v2 09/10] mlxsw: pci: Add support for getting resource through devlink
From: Jiri Pirko @ 2017-12-26 11:23 UTC (permalink / raw)
  To: netdev
  Cc: davem, arkadis, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
	jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
	yuvalm, ogerlitz, dsa, roopa
In-Reply-To: <20171226112359.5313-1-jiri@resnulli.us>

From: Arkadi Sharshevsky <arkadis@mellanox.com>

Up until now the KVD partition was static. This patch introduces the
ability to get the resource sizes via devlink. In case the resource is not
available the default configuration is used.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core.c     | 16 ++++++++
 drivers/net/ethernet/mellanox/mlxsw/core.h     |  9 ++++
 drivers/net/ethernet/mellanox/mlxsw/pci.c      | 40 +++++-------------
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 57 ++++++++++++++++++++++++++
 4 files changed, 92 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 2488662..9fe25b1 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -1800,6 +1800,22 @@ void mlxsw_core_flush_owq(void)
 }
 EXPORT_SYMBOL(mlxsw_core_flush_owq);
 
+int mlxsw_core_kvd_sizes_get(struct mlxsw_core *mlxsw_core,
+			     const struct mlxsw_config_profile *profile,
+			     u64 *p_single_size, u64 *p_double_size,
+			     u64 *p_linear_size)
+{
+	struct mlxsw_driver *driver =  mlxsw_core->driver;
+
+	if (!driver->kvd_sizes_get)
+		return -EINVAL;
+
+	return driver->kvd_sizes_get(mlxsw_core, profile,
+				     p_single_size, p_double_size,
+				     p_linear_size);
+}
+EXPORT_SYMBOL(mlxsw_core_kvd_sizes_get);
+
 static int __init mlxsw_core_module_init(void)
 {
 	int err;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index e23f83b..e44061d 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -309,10 +309,19 @@ struct mlxsw_driver {
 	void (*txhdr_construct)(struct sk_buff *skb,
 				const struct mlxsw_tx_info *tx_info);
 	int (*resources_register)(struct mlxsw_core *mlxsw_core);
+	int (*kvd_sizes_get)(struct mlxsw_core *mlxsw_core,
+			     const struct mlxsw_config_profile *profile,
+			     u64 *p_single_size, u64 *p_double_size,
+			     u64 *p_linear_size);
 	u8 txhdr_len;
 	const struct mlxsw_config_profile *profile;
 };
 
+int mlxsw_core_kvd_sizes_get(struct mlxsw_core *mlxsw_core,
+			     const struct mlxsw_config_profile *profile,
+			     u64 *p_single_size, u64 *p_double_size,
+			     u64 *p_linear_size);
+
 bool mlxsw_core_res_valid(struct mlxsw_core *mlxsw_core,
 			  enum mlxsw_res_id res_id);
 
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index 39872fb..6468cf2 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -1053,38 +1053,18 @@ static int mlxsw_pci_resources_query(struct mlxsw_pci *mlxsw_pci, char *mbox,
 }
 
 static int
-mlxsw_pci_profile_get_kvd_sizes(const struct mlxsw_config_profile *profile,
+mlxsw_pci_profile_get_kvd_sizes(const struct mlxsw_pci *mlxsw_pci,
+				const struct mlxsw_config_profile *profile,
 				struct mlxsw_res *res)
 {
-	u32 single_size, double_size, linear_size;
-
-	if (!MLXSW_RES_VALID(res, KVD_SINGLE_MIN_SIZE) ||
-	    !MLXSW_RES_VALID(res, KVD_DOUBLE_MIN_SIZE) ||
-	    !profile->used_kvd_split_data)
-		return -EIO;
-
-	linear_size = profile->kvd_linear_size;
+	u64 single_size, double_size, linear_size;
+	int err;
 
-	/* The hash part is what left of the kvd without the
-	 * linear part. It is split to the single size and
-	 * double size by the parts ratio from the profile.
-	 * Both sizes must be a multiplications of the
-	 * granularity from the profile.
-	 */
-	double_size = MLXSW_RES_GET(res, KVD_SIZE) - linear_size;
-	double_size *= profile->kvd_hash_double_parts;
-	double_size /= profile->kvd_hash_double_parts +
-		       profile->kvd_hash_single_parts;
-	double_size /= profile->kvd_hash_granularity;
-	double_size *= profile->kvd_hash_granularity;
-	single_size = MLXSW_RES_GET(res, KVD_SIZE) - double_size -
-		      linear_size;
-
-	/* Check results are legal. */
-	if (single_size < MLXSW_RES_GET(res, KVD_SINGLE_MIN_SIZE) ||
-	    double_size < MLXSW_RES_GET(res, KVD_DOUBLE_MIN_SIZE) ||
-	    MLXSW_RES_GET(res, KVD_SIZE) < linear_size)
-		return -EIO;
+	err = mlxsw_core_kvd_sizes_get(mlxsw_pci->core, profile,
+				       &single_size, &double_size,
+				       &linear_size);
+	if (err)
+		return err;
 
 	MLXSW_RES_SET(res, KVD_SINGLE_SIZE, single_size);
 	MLXSW_RES_SET(res, KVD_DOUBLE_SIZE, double_size);
@@ -1185,7 +1165,7 @@ static int mlxsw_pci_config_profile(struct mlxsw_pci *mlxsw_pci, char *mbox,
 			mbox, profile->adaptive_routing_group_cap);
 	}
 	if (MLXSW_RES_VALID(res, KVD_SIZE)) {
-		err = mlxsw_pci_profile_get_kvd_sizes(profile, res);
+		err = mlxsw_pci_profile_get_kvd_sizes(mlxsw_pci, profile, res);
 		if (err)
 			return err;
 
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 6569e55..ef5f052 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -4126,6 +4126,62 @@ static int mlxsw_sp_resources_register(struct mlxsw_core *mlxsw_core)
 	return 0;
 }
 
+static int mlxsw_sp_kvd_sizes_get(struct mlxsw_core *mlxsw_core,
+				  const struct mlxsw_config_profile *profile,
+				  u64 *p_single_size, u64 *p_double_size,
+				  u64 *p_linear_size)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_core);
+	u32 double_size;
+	int err;
+
+	if (!MLXSW_CORE_RES_VALID(mlxsw_core, KVD_SINGLE_MIN_SIZE) ||
+	    !MLXSW_CORE_RES_VALID(mlxsw_core, KVD_DOUBLE_MIN_SIZE) ||
+	    !profile->used_kvd_split_data)
+		return -EIO;
+
+	/* The hash part is what left of the kvd without the
+	 * linear part. It is split to the single size and
+	 * double size by the parts ratio from the profile.
+	 * Both sizes must be a multiplications of the
+	 * granularity from the profile. In case the user
+	 * provided the sizes they are obtained via devlink.
+	 */
+	err = devlink_resource_size_get(devlink,
+					MLXSW_SP_RESOURCE_KVD_LINEAR,
+					p_linear_size);
+	if (err)
+		*p_linear_size = profile->kvd_linear_size;
+
+	err = devlink_resource_size_get(devlink,
+					MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE,
+					p_double_size);
+	if (err) {
+		double_size = MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE) -
+			      *p_linear_size;
+		double_size *= profile->kvd_hash_double_parts;
+		double_size /= profile->kvd_hash_double_parts +
+			       profile->kvd_hash_single_parts;
+		*p_double_size = rounddown(double_size,
+					   profile->kvd_hash_granularity);
+	}
+
+	err = devlink_resource_size_get(devlink,
+					MLXSW_SP_RESOURCE_KVD_HASH_SINGLE,
+					p_single_size);
+	if (err)
+		*p_single_size = MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE) -
+				 *p_double_size - *p_linear_size;
+
+	/* Check results are legal. */
+	if (*p_single_size < MLXSW_CORE_RES_GET(mlxsw_core, KVD_SINGLE_MIN_SIZE) ||
+	    *p_double_size < MLXSW_CORE_RES_GET(mlxsw_core, KVD_DOUBLE_MIN_SIZE) ||
+	    MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE) < *p_linear_size)
+		return -EIO;
+
+	return 0;
+}
+
 static struct mlxsw_driver mlxsw_sp_driver = {
 	.kind				= mlxsw_sp_driver_name,
 	.priv_size			= sizeof(struct mlxsw_sp),
@@ -4146,6 +4202,7 @@ static struct mlxsw_driver mlxsw_sp_driver = {
 	.sb_occ_tc_port_bind_get	= mlxsw_sp_sb_occ_tc_port_bind_get,
 	.txhdr_construct		= mlxsw_sp_txhdr_construct,
 	.resources_register		= mlxsw_sp_resources_register,
+	.kvd_sizes_get			= mlxsw_sp_kvd_sizes_get,
 	.txhdr_len			= MLXSW_TXHDR_LEN,
 	.profile			= &mlxsw_sp_config_profile,
 };
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v2 07/10] mlxsw: spectrum_dpipe: Connect dpipe tables to resources
From: Jiri Pirko @ 2017-12-26 11:23 UTC (permalink / raw)
  To: netdev
  Cc: davem, arkadis, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
	jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
	yuvalm, ogerlitz, dsa, roopa
In-Reply-To: <20171226112359.5313-1-jiri@resnulli.us>

From: Arkadi Sharshevsky <arkadis@mellanox.com>

Connect current dpipe tables to resources. The tables are connected
in the following fashion:
1. IPv4 host - KVD hash single
2. IPv6 host - KVD hash double
3. Adjacency - KVD linear

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_dpipe.c   | 72 ++++++++++++++++++----
 1 file changed, 60 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c
index 96fdba7..282fe82 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_dpipe.c
@@ -774,11 +774,27 @@ static struct devlink_dpipe_table_ops mlxsw_sp_host4_ops = {
 static int mlxsw_sp_dpipe_host4_table_init(struct mlxsw_sp *mlxsw_sp)
 {
 	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+	int err;
 
-	return devlink_dpipe_table_register(devlink,
-					    MLXSW_SP_DPIPE_TABLE_NAME_HOST4,
-					    &mlxsw_sp_host4_ops,
-					    mlxsw_sp, false);
+	err = devlink_dpipe_table_register(devlink,
+					   MLXSW_SP_DPIPE_TABLE_NAME_HOST4,
+					   &mlxsw_sp_host4_ops,
+					   mlxsw_sp, false);
+	if (err)
+		return err;
+
+	err = devlink_dpipe_table_resource_set(devlink,
+					       MLXSW_SP_DPIPE_TABLE_NAME_HOST4,
+					       MLXSW_SP_RESOURCE_KVD_HASH_SINGLE);
+	if (err)
+		goto err_resource_set;
+
+	return 0;
+
+err_resource_set:
+	devlink_dpipe_table_unregister(devlink,
+				       MLXSW_SP_DPIPE_TABLE_NAME_HOST4);
+	return err;
 }
 
 static void mlxsw_sp_dpipe_host4_table_fini(struct mlxsw_sp *mlxsw_sp)
@@ -832,11 +848,27 @@ static struct devlink_dpipe_table_ops mlxsw_sp_host6_ops = {
 static int mlxsw_sp_dpipe_host6_table_init(struct mlxsw_sp *mlxsw_sp)
 {
 	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+	int err;
 
-	return devlink_dpipe_table_register(devlink,
-					    MLXSW_SP_DPIPE_TABLE_NAME_HOST6,
-					    &mlxsw_sp_host6_ops,
-					    mlxsw_sp, false);
+	err = devlink_dpipe_table_register(devlink,
+					   MLXSW_SP_DPIPE_TABLE_NAME_HOST6,
+					   &mlxsw_sp_host6_ops,
+					   mlxsw_sp, false);
+	if (err)
+		return err;
+
+	err = devlink_dpipe_table_resource_set(devlink,
+					       MLXSW_SP_DPIPE_TABLE_NAME_HOST6,
+					       MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE);
+	if (err)
+		goto err_resource_set;
+
+	return 0;
+
+err_resource_set:
+	devlink_dpipe_table_unregister(devlink,
+				       MLXSW_SP_DPIPE_TABLE_NAME_HOST6);
+	return err;
 }
 
 static void mlxsw_sp_dpipe_host6_table_fini(struct mlxsw_sp *mlxsw_sp)
@@ -1216,11 +1248,27 @@ static struct devlink_dpipe_table_ops mlxsw_sp_dpipe_table_adj_ops = {
 static int mlxsw_sp_dpipe_adj_table_init(struct mlxsw_sp *mlxsw_sp)
 {
 	struct devlink *devlink = priv_to_devlink(mlxsw_sp->core);
+	int err;
 
-	return devlink_dpipe_table_register(devlink,
-					    MLXSW_SP_DPIPE_TABLE_NAME_ADJ,
-					    &mlxsw_sp_dpipe_table_adj_ops,
-					    mlxsw_sp, false);
+	err = devlink_dpipe_table_register(devlink,
+					   MLXSW_SP_DPIPE_TABLE_NAME_ADJ,
+					   &mlxsw_sp_dpipe_table_adj_ops,
+					   mlxsw_sp, false);
+	if (err)
+		return err;
+
+	err = devlink_dpipe_table_resource_set(devlink,
+					       MLXSW_SP_DPIPE_TABLE_NAME_ADJ,
+					       MLXSW_SP_RESOURCE_KVD_LINEAR);
+	if (err)
+		goto err_resource_set;
+
+	return 0;
+
+err_resource_set:
+	devlink_dpipe_table_unregister(devlink,
+				       MLXSW_SP_DPIPE_TABLE_NAME_ADJ);
+	return err;
 }
 
 static void mlxsw_sp_dpipe_adj_table_fini(struct mlxsw_sp *mlxsw_sp)
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v2 08/10] mlxsw: spectrum: Add support for getting kvdl occupancy
From: Jiri Pirko @ 2017-12-26 11:23 UTC (permalink / raw)
  To: netdev
  Cc: davem, arkadis, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
	jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
	yuvalm, ogerlitz, dsa, roopa
In-Reply-To: <20171226112359.5313-1-jiri@resnulli.us>

From: Arkadi Sharshevsky <arkadis@mellanox.com>

Add support for getting the kvdl occupancy through the resource interface.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c     |  9 ++++++++
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h     |  1 +
 .../net/ethernet/mellanox/mlxsw/spectrum_kvdl.c    | 26 ++++++++++++++++++++++
 3 files changed, 36 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 56d0e07..6569e55 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -4043,12 +4043,21 @@ mlxsw_sp_resource_kvd_hash_double_size_validate(struct devlink *devlink, u64 siz
 	return 0;
 }
 
+static u64 mlxsw_sp_resource_kvd_linear_occ_get(struct devlink *devlink)
+{
+	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
+	struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core);
+
+	return mlxsw_sp_kvdl_occ_get(mlxsw_sp);
+}
+
 static struct devlink_resource_ops mlxsw_sp_resource_kvd_ops = {
 	.size_validate = mlxsw_sp_resource_kvd_size_validate,
 };
 
 static struct devlink_resource_ops mlxsw_sp_resource_kvd_linear_ops = {
 	.size_validate = mlxsw_sp_resource_kvd_linear_size_validate,
+	.occ_get = mlxsw_sp_resource_kvd_linear_occ_get,
 };
 
 static struct devlink_resource_ops mlxsw_sp_resource_kvd_hash_single_ops = {
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index 5abc8c5..ff8d32b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -469,6 +469,7 @@ void mlxsw_sp_kvdl_free(struct mlxsw_sp *mlxsw_sp, int entry_index);
 int mlxsw_sp_kvdl_alloc_size_query(struct mlxsw_sp *mlxsw_sp,
 				   unsigned int entry_count,
 				   unsigned int *p_alloc_size);
+u64 mlxsw_sp_kvdl_occ_get(const struct mlxsw_sp *mlxsw_sp);
 
 struct mlxsw_sp_acl_rule_info {
 	unsigned int priority;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
index 310c382..cfacc17 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c
@@ -286,6 +286,32 @@ static void mlxsw_sp_kvdl_parts_fini(struct mlxsw_sp *mlxsw_sp)
 		mlxsw_sp_kvdl_part_fini(mlxsw_sp, i);
 }
 
+u64 mlxsw_sp_kvdl_part_occ(struct mlxsw_sp_kvdl_part *part)
+{
+	unsigned int nr_entries;
+	int bit = -1;
+	u64 occ = 0;
+
+	nr_entries = (part->info->end_index -
+		      part->info->start_index + 1) /
+		      part->info->alloc_size;
+	while ((bit = find_next_bit(part->usage, nr_entries, bit + 1))
+		< nr_entries)
+		occ += part->info->alloc_size;
+	return occ;
+}
+
+u64 mlxsw_sp_kvdl_occ_get(const struct mlxsw_sp *mlxsw_sp)
+{
+	struct mlxsw_sp_kvdl_part *part;
+	u64 occ = 0;
+
+	list_for_each_entry(part, &mlxsw_sp->kvdl->parts_list, list)
+		occ += mlxsw_sp_kvdl_part_occ(part);
+
+	return occ;
+}
+
 int mlxsw_sp_kvdl_init(struct mlxsw_sp *mlxsw_sp)
 {
 	struct mlxsw_sp_kvdl *kvdl;
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v2 06/10] mlxsw: spectrum: Register KVD resources with devlink
From: Jiri Pirko @ 2017-12-26 11:23 UTC (permalink / raw)
  To: netdev
  Cc: davem, arkadis, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
	jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
	yuvalm, ogerlitz, dsa, roopa
In-Reply-To: <20171226112359.5313-1-jiri@resnulli.us>

From: Arkadi Sharshevsky <arkadis@mellanox.com>

Register the KVD resources with devlink. The KVD is a memory resource
which is subdivided into three partitions which are the linear, hash
single and hash double.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
v1->v2
- Add resource unit of double word for all the resources.
---
 drivers/net/ethernet/mellanox/mlxsw/core.c     |   9 ++
 drivers/net/ethernet/mellanox/mlxsw/core.h     |   1 +
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 139 +++++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h |  12 +++
 4 files changed, 161 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index f3315bc..2488662 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -1012,6 +1012,12 @@ int mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
 	if (err)
 		goto err_bus_init;
 
+	if (mlxsw_driver->resources_register) {
+		err = mlxsw_driver->resources_register(mlxsw_core);
+		if (err)
+			goto err_register_resources;
+	}
+
 	err = mlxsw_ports_init(mlxsw_core);
 	if (err)
 		goto err_ports_init;
@@ -1067,6 +1073,8 @@ int mlxsw_core_bus_device_register(const struct mlxsw_bus_info *mlxsw_bus_info,
 err_ports_init:
 	mlxsw_bus->fini(bus_priv);
 err_bus_init:
+	devlink_resources_unregister(devlink, NULL);
+err_register_resources:
 	devlink_free(devlink);
 err_devlink_alloc:
 	mlxsw_core_driver_put(device_kind);
@@ -1086,6 +1094,7 @@ void mlxsw_core_bus_device_unregister(struct mlxsw_core *mlxsw_core)
 	mlxsw_emad_fini(mlxsw_core);
 	kfree(mlxsw_core->lag.mapping);
 	mlxsw_ports_fini(mlxsw_core);
+	devlink_resources_unregister(devlink, NULL);
 	mlxsw_core->bus->fini(mlxsw_core->bus_priv);
 	devlink_free(devlink);
 	mlxsw_core_driver_put(device_kind);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index 34dda96..e23f83b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -308,6 +308,7 @@ struct mlxsw_driver {
 				       u32 *p_cur, u32 *p_max);
 	void (*txhdr_construct)(struct sk_buff *skb,
 				const struct mlxsw_tx_info *tx_info);
+	int (*resources_register)(struct mlxsw_core *mlxsw_core);
 	u8 txhdr_len;
 	const struct mlxsw_config_profile *profile;
 };
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index d373df7..56d0e07 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -3979,6 +3979,144 @@ static const struct mlxsw_config_profile mlxsw_sp_config_profile = {
 	.resource_query_enable		= 1,
 };
 
+static bool
+mlxsw_sp_resource_kvd_granularity_validate(struct netlink_ext_ack *extack,
+					   u64 size)
+{
+	const struct mlxsw_config_profile *profile;
+
+	profile = &mlxsw_sp_config_profile;
+	if (size % profile->kvd_hash_granularity) {
+		NL_SET_ERR_MSG_MOD(extack, "resource set with wrong granularity");
+		return false;
+	}
+	return true;
+}
+
+static int
+mlxsw_sp_resource_kvd_size_validate(struct devlink *devlink, u64 size,
+				    struct netlink_ext_ack *extack)
+{
+	NL_SET_ERR_MSG_MOD(extack, "kvd size cannot be changed");
+	return -EINVAL;
+}
+
+static int
+mlxsw_sp_resource_kvd_linear_size_validate(struct devlink *devlink, u64 size,
+					   struct netlink_ext_ack *extack)
+{
+	if (!mlxsw_sp_resource_kvd_granularity_validate(extack, size))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int
+mlxsw_sp_resource_kvd_hash_single_size_validate(struct devlink *devlink, u64 size,
+						struct netlink_ext_ack *extack)
+{
+	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
+
+	if (!mlxsw_sp_resource_kvd_granularity_validate(extack, size))
+		return -EINVAL;
+
+	if (size < MLXSW_CORE_RES_GET(mlxsw_core, KVD_SINGLE_MIN_SIZE)) {
+		NL_SET_ERR_MSG_MOD(extack, "hash single size is smaller than minimum");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int
+mlxsw_sp_resource_kvd_hash_double_size_validate(struct devlink *devlink, u64 size,
+						struct netlink_ext_ack *extack)
+{
+	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
+
+	if (!mlxsw_sp_resource_kvd_granularity_validate(extack, size))
+		return -EINVAL;
+
+	if (size < MLXSW_CORE_RES_GET(mlxsw_core, KVD_DOUBLE_MIN_SIZE)) {
+		NL_SET_ERR_MSG_MOD(extack, "hash double size is smaller than minimum");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static struct devlink_resource_ops mlxsw_sp_resource_kvd_ops = {
+	.size_validate = mlxsw_sp_resource_kvd_size_validate,
+};
+
+static struct devlink_resource_ops mlxsw_sp_resource_kvd_linear_ops = {
+	.size_validate = mlxsw_sp_resource_kvd_linear_size_validate,
+};
+
+static struct devlink_resource_ops mlxsw_sp_resource_kvd_hash_single_ops = {
+	.size_validate = mlxsw_sp_resource_kvd_hash_single_size_validate,
+};
+
+static struct devlink_resource_ops mlxsw_sp_resource_kvd_hash_double_ops = {
+	.size_validate = mlxsw_sp_resource_kvd_hash_double_size_validate,
+};
+
+static int mlxsw_sp_resources_register(struct mlxsw_core *mlxsw_core)
+{
+	struct devlink *devlink = priv_to_devlink(mlxsw_core);
+	u32 kvd_size, single_size, double_size, linear_size;
+	const struct mlxsw_config_profile *profile;
+	int err;
+
+	profile = &mlxsw_sp_config_profile;
+	if (!MLXSW_CORE_RES_VALID(mlxsw_core, KVD_SIZE))
+		return -EIO;
+
+	kvd_size = MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE);
+	err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD,
+					true, kvd_size,
+					MLXSW_SP_RESOURCE_KVD,
+					DEVLINK_RESOURCE_ID_PARENT_TOP,
+					DEVLINK_RESOURCE_UNIT_DOUBLE_WORD,
+					&mlxsw_sp_resource_kvd_ops);
+	if (err)
+		return err;
+
+	linear_size = profile->kvd_linear_size;
+	err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_LINEAR,
+					false, linear_size,
+					MLXSW_SP_RESOURCE_KVD_LINEAR,
+					MLXSW_SP_RESOURCE_KVD,
+					DEVLINK_RESOURCE_UNIT_DOUBLE_WORD,
+					&mlxsw_sp_resource_kvd_linear_ops);
+	if (err)
+		return err;
+
+	double_size = kvd_size - linear_size;
+	double_size *= profile->kvd_hash_double_parts;
+	double_size /= profile->kvd_hash_double_parts +
+		       profile->kvd_hash_single_parts;
+	double_size = rounddown(double_size, profile->kvd_hash_granularity);
+	err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_HASH_DOUBLE,
+					false, double_size,
+					MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE,
+					MLXSW_SP_RESOURCE_KVD,
+					DEVLINK_RESOURCE_UNIT_DOUBLE_WORD,
+					&mlxsw_sp_resource_kvd_hash_double_ops);
+	if (err)
+		return err;
+
+	single_size = kvd_size - double_size - linear_size;
+	err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_HASH_SINGLE,
+					false, single_size,
+					MLXSW_SP_RESOURCE_KVD_HASH_SINGLE,
+					MLXSW_SP_RESOURCE_KVD,
+					DEVLINK_RESOURCE_UNIT_DOUBLE_WORD,
+					&mlxsw_sp_resource_kvd_hash_single_ops);
+	if (err)
+		return err;
+
+	return 0;
+}
+
 static struct mlxsw_driver mlxsw_sp_driver = {
 	.kind				= mlxsw_sp_driver_name,
 	.priv_size			= sizeof(struct mlxsw_sp),
@@ -3998,6 +4136,7 @@ static struct mlxsw_driver mlxsw_sp_driver = {
 	.sb_occ_port_pool_get		= mlxsw_sp_sb_occ_port_pool_get,
 	.sb_occ_tc_port_bind_get	= mlxsw_sp_sb_occ_tc_port_bind_get,
 	.txhdr_construct		= mlxsw_sp_txhdr_construct,
+	.resources_register		= mlxsw_sp_resources_register,
 	.txhdr_len			= MLXSW_TXHDR_LEN,
 	.profile			= &mlxsw_sp_config_profile,
 };
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index a0adcd8..5abc8c5 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -66,6 +66,18 @@
 #define MLXSW_SP_KVD_LINEAR_SIZE 98304 /* entries */
 #define MLXSW_SP_KVD_GRANULARITY 128
 
+#define MLXSW_SP_RESOURCE_NAME_KVD "kvd"
+#define MLXSW_SP_RESOURCE_NAME_KVD_LINEAR "linear"
+#define MLXSW_SP_RESOURCE_NAME_KVD_HASH_SINGLE "hash_single"
+#define MLXSW_SP_RESOURCE_NAME_KVD_HASH_DOUBLE "hash_double"
+
+enum mlxsw_sp_resource_id {
+	MLXSW_SP_RESOURCE_KVD,
+	MLXSW_SP_RESOURCE_KVD_LINEAR,
+	MLXSW_SP_RESOURCE_KVD_HASH_SINGLE,
+	MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE,
+};
+
 struct mlxsw_sp_port;
 struct mlxsw_sp_rif;
 
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v2 05/10] mlxsw: pci: Add support for performing bus reset
From: Jiri Pirko @ 2017-12-26 11:23 UTC (permalink / raw)
  To: netdev
  Cc: davem, arkadis, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
	jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
	yuvalm, ogerlitz, dsa, roopa
In-Reply-To: <20171226112359.5313-1-jiri@resnulli.us>

From: Arkadi Sharshevsky <arkadis@mellanox.com>

This is a preparation stage before introducing hot reload. During the
reload process the ASIC should be resetted by accessing the PCI BAR due
to unavailability of the mailbox/emad interfaces.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core.h |  1 +
 drivers/net/ethernet/mellanox/mlxsw/pci.c  | 53 ++++++++++++++++++++++--------
 2 files changed, 41 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index 6e966af..34dda96 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -332,6 +332,7 @@ struct mlxsw_bus {
 		    const struct mlxsw_config_profile *profile,
 		    struct mlxsw_res *res);
 	void (*fini)(void *bus_priv);
+	void (*reset)(void *bus_priv);
 	bool (*skb_transmit_busy)(void *bus_priv,
 				  const struct mlxsw_tx_info *tx_info);
 	int (*skb_transmit)(void *bus_priv, struct sk_buff *skb,
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index 23f7d82..39872fb 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -154,6 +154,7 @@ struct mlxsw_pci {
 		} comp;
 	} cmd;
 	struct mlxsw_bus_info bus_info;
+	const struct pci_device_id *id;
 };
 
 static void mlxsw_pci_queue_tasklet_schedule(struct mlxsw_pci_queue *q)
@@ -1622,16 +1623,6 @@ static int mlxsw_pci_cmd_exec(void *bus_priv, u16 opcode, u8 opcode_mod,
 	return err;
 }
 
-static const struct mlxsw_bus mlxsw_pci_bus = {
-	.kind			= "pci",
-	.init			= mlxsw_pci_init,
-	.fini			= mlxsw_pci_fini,
-	.skb_transmit_busy	= mlxsw_pci_skb_transmit_busy,
-	.skb_transmit		= mlxsw_pci_skb_transmit,
-	.cmd_exec		= mlxsw_pci_cmd_exec,
-	.features		= MLXSW_BUS_F_TXRX,
-};
-
 static int mlxsw_pci_sw_reset(struct mlxsw_pci *mlxsw_pci,
 			      const struct pci_device_id *id)
 {
@@ -1655,6 +1646,41 @@ static int mlxsw_pci_sw_reset(struct mlxsw_pci *mlxsw_pci,
 	return 0;
 }
 
+static void mlxsw_pci_free_irq_vectors(struct mlxsw_pci *mlxsw_pci)
+{
+	pci_free_irq_vectors(mlxsw_pci->pdev);
+}
+
+static int mlxsw_pci_alloc_irq_vectors(struct mlxsw_pci *mlxsw_pci)
+{
+	int err;
+
+	err = pci_alloc_irq_vectors(mlxsw_pci->pdev, 1, 1, PCI_IRQ_MSIX);
+	if (err < 0)
+		dev_err(&mlxsw_pci->pdev->dev, "MSI-X init failed\n");
+	return err;
+}
+
+static void mlxsw_pci_reset(void *bus_priv)
+{
+	struct mlxsw_pci *mlxsw_pci = bus_priv;
+
+	mlxsw_pci_free_irq_vectors(mlxsw_pci);
+	mlxsw_pci_sw_reset(mlxsw_pci, mlxsw_pci->id);
+	mlxsw_pci_alloc_irq_vectors(mlxsw_pci);
+}
+
+static const struct mlxsw_bus mlxsw_pci_bus = {
+	.kind			= "pci",
+	.init			= mlxsw_pci_init,
+	.fini			= mlxsw_pci_fini,
+	.skb_transmit_busy	= mlxsw_pci_skb_transmit_busy,
+	.skb_transmit		= mlxsw_pci_skb_transmit,
+	.cmd_exec		= mlxsw_pci_cmd_exec,
+	.features		= MLXSW_BUS_F_TXRX,
+	.reset			= mlxsw_pci_reset,
+};
+
 static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	const char *driver_name = pdev->driver->name;
@@ -1716,7 +1742,7 @@ static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto err_sw_reset;
 	}
 
-	err = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSIX);
+	err = mlxsw_pci_alloc_irq_vectors(mlxsw_pci);
 	if (err < 0) {
 		dev_err(&pdev->dev, "MSI-X init failed\n");
 		goto err_msix_init;
@@ -1725,6 +1751,7 @@ static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	mlxsw_pci->bus_info.device_kind = driver_name;
 	mlxsw_pci->bus_info.device_name = pci_name(mlxsw_pci->pdev);
 	mlxsw_pci->bus_info.dev = &pdev->dev;
+	mlxsw_pci->id = id;
 
 	err = mlxsw_core_bus_device_register(&mlxsw_pci->bus_info,
 					     &mlxsw_pci_bus, mlxsw_pci);
@@ -1736,7 +1763,7 @@ static int mlxsw_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	return 0;
 
 err_bus_device_register:
-	pci_free_irq_vectors(mlxsw_pci->pdev);
+	mlxsw_pci_free_irq_vectors(mlxsw_pci);
 err_msix_init:
 err_sw_reset:
 	iounmap(mlxsw_pci->hw_addr);
@@ -1756,7 +1783,7 @@ static void mlxsw_pci_remove(struct pci_dev *pdev)
 	struct mlxsw_pci *mlxsw_pci = pci_get_drvdata(pdev);
 
 	mlxsw_core_bus_device_unregister(mlxsw_pci->core);
-	pci_free_irq_vectors(mlxsw_pci->pdev);
+	mlxsw_pci_free_irq_vectors(mlxsw_pci);
 	iounmap(mlxsw_pci->hw_addr);
 	pci_release_regions(mlxsw_pci->pdev);
 	pci_disable_device(mlxsw_pci->pdev);
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v2 04/10] devlink: Add relation between dpipe and resource
From: Jiri Pirko @ 2017-12-26 11:23 UTC (permalink / raw)
  To: netdev
  Cc: davem, arkadis, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
	jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
	yuvalm, ogerlitz, dsa, roopa
In-Reply-To: <20171226112359.5313-1-jiri@resnulli.us>

From: Arkadi Sharshevsky <arkadis@mellanox.com>

The hardware processes which are modeled via dpipe commonly use some
internal hardware resources. Such relation can improve the understanding
of hardware limitations.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/devlink.h        | 13 +++++++++++++
 include/uapi/linux/devlink.h |  1 +
 net/core/devlink.c           | 31 +++++++++++++++++++++++++++++++
 3 files changed, 45 insertions(+)

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 7a06ec6..5cb061c 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -183,6 +183,8 @@ struct devlink_dpipe_table_ops;
  * @counters_enabled: indicates if counters are active
  * @counter_control_extern: indicates if counter control is in dpipe or
  *			    external tool
+ * @resource_valid: Indicate that the resource id is valid
+ * @resource_id: relative resource this table is related to
  * @table_ops: table operations
  * @rcu: rcu
  */
@@ -192,6 +194,8 @@ struct devlink_dpipe_table {
 	const char *name;
 	bool counters_enabled;
 	bool counter_control_extern;
+	bool resource_valid;
+	u64 resource_id;
 	struct devlink_dpipe_table_ops *table_ops;
 	struct rcu_head rcu;
 };
@@ -389,6 +393,8 @@ void devlink_resources_unregister(struct devlink *devlink,
 int devlink_resource_size_get(struct devlink *devlink,
 			      u64 resource_id,
 			      u64 *p_resource_size);
+int devlink_dpipe_table_resource_set(struct devlink *devlink,
+				     const char *table_name, u64 resource_id);
 
 #else
 
@@ -552,6 +558,13 @@ devlink_resource_size_get(struct devlink *devlink, u64 resource_id,
 	return -EOPNOTSUPP;
 }
 
+static inline int
+devlink_dpipe_table_resource_set(struct devlink *devlink,
+				 const char *table_name, u64 resource_id)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif
 
 #endif /* _NET_DEVLINK_H_ */
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index fcb86a6..1b474d69 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -218,6 +218,7 @@ enum devlink_attr {
 	DEVLINK_ATTR_RESOURCE_UNIT,		/* u8 */
 	DEVLINK_ATTR_RESOURCE_OCC,		/* u64 */
 	DEVLINK_ATTR_RESOURCE_ID,		/* u64 */
+	DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,	/* u64 */
 
 	/* add new attributes above here, update the policy in devlink.c */
 
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 40ffa07..a257e88 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1694,6 +1694,9 @@ static int devlink_dpipe_table_put(struct sk_buff *skb,
 		       table->counters_enabled))
 		goto nla_put_failure;
 
+	if (table->resource_valid)
+		nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,
+				  table->resource_id, DEVLINK_ATTR_PAD);
 	if (devlink_dpipe_matches_put(table, skb))
 		goto nla_put_failure;
 
@@ -3239,6 +3242,34 @@ int devlink_resource_size_get(struct devlink *devlink,
 }
 EXPORT_SYMBOL_GPL(devlink_resource_size_get);
 
+/**
+ *	devlink_dpipe_table_resource_set - set the resource id
+ *
+ *	@devlink: devlink
+ *	@table_name: table name
+ *	@resource_id: resource id
+ */
+int devlink_dpipe_table_resource_set(struct devlink *devlink,
+				     const char *table_name, u64 resource_id)
+{
+	struct devlink_dpipe_table *table;
+	int err = 0;
+
+	mutex_lock(&devlink->lock);
+	table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
+					 table_name);
+	if (!table) {
+		err = -EINVAL;
+		goto out;
+	}
+	table->resource_id = resource_id;
+	table->resource_valid = true;
+out:
+	mutex_unlock(&devlink->lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_table_resource_set);
+
 static int __init devlink_module_init(void)
 {
 	return genl_register_family(&devlink_nl_family);
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v2 03/10] devlink: Add support for reload
From: Jiri Pirko @ 2017-12-26 11:23 UTC (permalink / raw)
  To: netdev
  Cc: davem, arkadis, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
	jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
	yuvalm, ogerlitz, dsa, roopa
In-Reply-To: <20171226112359.5313-1-jiri@resnulli.us>

From: Arkadi Sharshevsky <arkadis@mellanox.com>

Add support for performing driver hot reload.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
v1->v2
- Use the NO_LOCK flag instead of releasing the lock manually.
---
 include/net/devlink.h        |  1 +
 include/uapi/linux/devlink.h |  5 +++++
 net/core/devlink.c           | 48 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+)

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 76ab373..7a06ec6 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -267,6 +267,7 @@ struct devlink_resource {
 #define DEVLINK_RESOURCE_ID_PARENT_TOP 0
 
 struct devlink_ops {
+	int (*reload)(struct devlink *devlink);
 	int (*port_type_set)(struct devlink_port *devlink_port,
 			     enum devlink_port_type port_type);
 	int (*port_split)(struct devlink *devlink, unsigned int port_index,
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 1d59d8e..fcb86a6 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -73,6 +73,11 @@ enum devlink_command {
 	DEVLINK_CMD_RESOURCE_SET,
 	DEVLINK_CMD_RESOURCE_DUMP,
 
+	/* Hot driver reload, makes configuration changes take place. The
+	 * devlink instance is not released during the process.
+	 */
+	DEVLINK_CMD_RELOAD,
+
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
 	DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 733a9c4..40ffa07 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2499,6 +2499,46 @@ static int devlink_nl_cmd_resource_dump(struct sk_buff *skb,
 	return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0);
 }
 
+static int
+devlink_resources_validate(struct devlink *devlink,
+			   struct devlink_resource *resource,
+			   struct genl_info *info)
+{
+	struct list_head *resource_list;
+	int err = 0;
+
+	if (resource)
+		resource_list = &resource->resource_list;
+	else
+		resource_list = &devlink->resource_list;
+
+	list_for_each_entry(resource, resource_list, list) {
+		if (!resource->size_valid)
+			return -EINVAL;
+		err = devlink_resources_validate(devlink, resource, info);
+		if (err)
+			return err;
+	}
+	return err;
+}
+
+static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	int err;
+
+	if (!devlink->ops->reload)
+		return -EOPNOTSUPP;
+
+	err = devlink_resources_validate(devlink, NULL, info);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(info->extack,
+				   "resources size validation failed");
+		return err;
+	}
+	return devlink->ops->reload(devlink);
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -2693,6 +2733,14 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
+	{
+		.cmd = DEVLINK_CMD_RELOAD,
+		.doit = devlink_nl_cmd_reload,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+				  DEVLINK_NL_FLAG_NO_LOCK,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v2 02/10] devlink: Add support for resource abstraction
From: Jiri Pirko @ 2017-12-26 11:23 UTC (permalink / raw)
  To: netdev
  Cc: davem, arkadis, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
	jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
	yuvalm, ogerlitz, dsa, roopa
In-Reply-To: <20171226112359.5313-1-jiri@resnulli.us>

From: Arkadi Sharshevsky <arkadis@mellanox.com>

Add support for hardware resource abstraction over devlink. Each resource
is identified via id, furthermore it contains information regarding its
size and its related sub resources. Each resource can also provide its
current occupancy.

In some cases the sizes of some resources can be changed, yet for those
changes to take place a hot driver reload may be needed. The reload
capability will be introduced in the next patch.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
v1->v2
- Add resource unit.
---
 include/net/devlink.h        |  82 ++++++++++
 include/uapi/linux/devlink.h |  15 ++
 net/core/devlink.c           | 358 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 455 insertions(+)

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 4d2c6fc..76ab373 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -26,6 +26,7 @@ struct devlink {
 	struct list_head port_list;
 	struct list_head sb_list;
 	struct list_head dpipe_table_list;
+	struct list_head resource_list;
 	struct devlink_dpipe_headers *dpipe_headers;
 	const struct devlink_ops *ops;
 	struct device *dev;
@@ -224,6 +225,47 @@ struct devlink_dpipe_headers {
 	unsigned int headers_count;
 };
 
+/**
+ * struct devlink_resource_ops - resource ops
+ * @occ_get: get the occupied size
+ * @size_validate: validate the size of the resource before update, reload
+ *                 is needed for changes to take place
+ */
+struct devlink_resource_ops {
+	u64 (*occ_get)(struct devlink *devlink);
+	int (*size_validate)(struct devlink *devlink, u64 size,
+			     struct netlink_ext_ack *extack);
+};
+
+/**
+ * struct devlink_resource - devlink resource
+ * @name: name of the resource
+ * @size: size of the resource
+ * @size_new: updated size of the resource, reload is needed
+ * @size_valid: valid in case the total size of the resource is valid
+ *              including its children
+ * @id: id, per devlink instance
+ * @unit: resource's unit
+ * @parent: parent resource
+ * @list: parent list
+ * @resource_list: list of child resources
+ * @resource_ops: resource ops
+ */
+struct devlink_resource {
+	const char *name;
+	u64 size;
+	u64 size_new;
+	bool size_valid;
+	u64 id;
+	enum devlink_resource_unit unit;
+	struct devlink_resource *parent;
+	struct list_head list;
+	struct list_head resource_list;
+	const struct devlink_resource_ops *resource_ops;
+};
+
+#define DEVLINK_RESOURCE_ID_PARENT_TOP 0
+
 struct devlink_ops {
 	int (*port_type_set)(struct devlink_port *devlink_port,
 			     enum devlink_port_type port_type);
@@ -333,6 +375,20 @@ extern struct devlink_dpipe_header devlink_dpipe_header_ethernet;
 extern struct devlink_dpipe_header devlink_dpipe_header_ipv4;
 extern struct devlink_dpipe_header devlink_dpipe_header_ipv6;
 
+int devlink_resource_register(struct devlink *devlink,
+			      const char *resource_name,
+			      bool top_hierarchy,
+			      u64 resource_size,
+			      u64 resource_id,
+			      u64 parent_resource_id,
+			      enum devlink_resource_unit unit,
+			      const struct devlink_resource_ops *resource_ops);
+void devlink_resources_unregister(struct devlink *devlink,
+				  struct devlink_resource *resource);
+int devlink_resource_size_get(struct devlink *devlink,
+			      u64 resource_id,
+			      u64 *p_resource_size);
+
 #else
 
 static inline struct devlink *devlink_alloc(const struct devlink_ops *ops,
@@ -469,6 +525,32 @@ devlink_dpipe_match_put(struct sk_buff *skb,
 	return 0;
 }
 
+static inline int
+devlink_resource_register(struct devlink *devlink,
+			  const char *resource_name,
+			  bool top_hierarchy,
+			  u64 resource_size,
+			  u64 resource_id,
+			  u64 parent_resource_id,
+			  enum devlink_resource_unit unit,
+			  const struct devlink_resource_ops *resource_ops)
+{
+	return 0;
+}
+
+static inline void
+devlink_resources_unregister(struct devlink *devlink,
+			     struct devlink_resource *resource)
+{
+}
+
+static inline int
+devlink_resource_size_get(struct devlink *devlink, u64 resource_id,
+			  u64 *p_resource_size)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif
 
 #endif /* _NET_DEVLINK_H_ */
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 6665df6..1d59d8e 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -70,6 +70,8 @@ enum devlink_command {
 	DEVLINK_CMD_DPIPE_ENTRIES_GET,
 	DEVLINK_CMD_DPIPE_HEADERS_GET,
 	DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET,
+	DEVLINK_CMD_RESOURCE_SET,
+	DEVLINK_CMD_RESOURCE_DUMP,
 
 	/* add new commands above here */
 	__DEVLINK_CMD_MAX,
@@ -202,6 +204,15 @@ enum devlink_attr {
 	DEVLINK_ATTR_PAD,
 
 	DEVLINK_ATTR_ESWITCH_ENCAP_MODE,	/* u8 */
+	DEVLINK_ATTR_RESOURCE_LIST,		/* nested */
+	DEVLINK_ATTR_RESOURCE,			/* nested */
+	DEVLINK_ATTR_RESOURCE_NAME,		/* string */
+	DEVLINK_ATTR_RESOURCE_SIZE,		/* u64 */
+	DEVLINK_ATTR_RESOURCE_SIZE_NEW,		/* u64 */
+	DEVLINK_ATTR_RESOURCE_SIZE_VALID,	/* u8 */
+	DEVLINK_ATTR_RESOURCE_UNIT,		/* u8 */
+	DEVLINK_ATTR_RESOURCE_OCC,		/* u64 */
+	DEVLINK_ATTR_RESOURCE_ID,		/* u64 */
 
 	/* add new attributes above here, update the policy in devlink.c */
 
@@ -245,4 +256,8 @@ enum devlink_dpipe_header_id {
 	DEVLINK_DPIPE_HEADER_IPV6,
 };
 
+enum devlink_resource_unit {
+	DEVLINK_RESOURCE_UNIT_DOUBLE_WORD,
+};
+
 #endif /* _UAPI_LINUX_DEVLINK_H_ */
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 2f71734..733a9c4 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2288,6 +2288,217 @@ static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb,
 						counters_enable);
 }
 
+struct devlink_resource *
+devlink_resource_find(struct devlink *devlink,
+		      struct devlink_resource *resource, u64 resource_id)
+{
+	struct list_head *resource_list;
+
+	if (resource)
+		resource_list = &resource->resource_list;
+	else
+		resource_list = &devlink->resource_list;
+
+	list_for_each_entry(resource, resource_list, list) {
+		struct devlink_resource *__resource;
+
+		if (resource->id == resource_id)
+			return resource;
+
+		__resource = devlink_resource_find(devlink, resource,
+						   resource_id);
+		if (__resource)
+			return __resource;
+	}
+	return NULL;
+}
+
+void devlink_resource_validate_children(struct devlink_resource *resource)
+{
+	struct devlink_resource *__resource;
+	bool size_valid = true;
+	u64 parts_size = 0;
+
+	if (list_empty(&resource->resource_list))
+		goto out;
+
+	list_for_each_entry(__resource, &resource->resource_list, list)
+		parts_size += __resource->size_new;
+
+	if (parts_size > resource->size)
+		size_valid = false;
+out:
+	resource->size_valid = size_valid;
+}
+
+static int devlink_nl_cmd_resource_set(struct sk_buff *skb,
+				       struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_resource *resource;
+	u64 resource_id;
+	u64 size;
+	int err;
+
+	if (!info->attrs[DEVLINK_ATTR_RESOURCE_ID] ||
+	    !info->attrs[DEVLINK_ATTR_RESOURCE_SIZE])
+		return -EINVAL;
+	resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]);
+
+	resource = devlink_resource_find(devlink, NULL, resource_id);
+	if (!resource)
+		return -EINVAL;
+
+	if (!resource->resource_ops->size_validate)
+		return -EINVAL;
+
+	size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]);
+	err = resource->resource_ops->size_validate(devlink, size,
+						    info->extack);
+	if (err)
+		return err;
+
+	resource->size_new = size;
+	devlink_resource_validate_children(resource);
+	if (resource->parent)
+		devlink_resource_validate_children(resource->parent);
+	return 0;
+}
+
+static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb,
+				struct devlink_resource *resource)
+{
+	struct devlink_resource *__resource;
+	struct nlattr *__resource_attr;
+	struct nlattr *resource_attr;
+
+	resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE);
+	if (!resource_attr)
+		return -EMSGSIZE;
+
+	if (nla_put_string(skb, DEVLINK_ATTR_RESOURCE_NAME, resource->name) ||
+	    nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE, resource->size,
+			      DEVLINK_ATTR_PAD) ||
+	    nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id,
+			      DEVLINK_ATTR_PAD) ||
+	    nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, resource->unit))
+		goto nla_put_failure;
+	if (resource->size != resource->size_new)
+		nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW,
+				  resource->size_new, DEVLINK_ATTR_PAD);
+	if (resource->resource_ops && resource->resource_ops->occ_get)
+		nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC,
+				  resource->resource_ops->occ_get(devlink),
+				  DEVLINK_ATTR_PAD);
+	if (list_empty(&resource->resource_list))
+		goto out;
+
+	if (nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_SIZE_VALID,
+		       resource->size_valid))
+		goto nla_put_failure;
+
+	__resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST);
+	if (!__resource_attr)
+		goto nla_put_failure;
+
+	list_for_each_entry(__resource, &resource->resource_list, list) {
+		if (devlink_resource_put(devlink, skb, __resource))
+			goto resource_put_failure;
+	}
+
+	nla_nest_end(skb, __resource_attr);
+out:
+	nla_nest_end(skb, resource_attr);
+	return 0;
+
+resource_put_failure:
+	nla_nest_cancel(skb, __resource_attr);
+nla_put_failure:
+	nla_nest_cancel(skb, resource_attr);
+	return -EMSGSIZE;
+}
+
+static int devlink_resource_fill(struct genl_info *info,
+				 enum devlink_command cmd, int flags)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_resource *resource;
+	struct nlattr *resources_attr;
+	struct sk_buff *skb = NULL;
+	struct nlmsghdr *nlh;
+	bool incomplete;
+	void *hdr;
+	int i;
+	int err;
+
+	resource = list_first_entry(&devlink->resource_list,
+				    struct devlink_resource, list);
+start_again:
+	err = devlink_dpipe_send_and_alloc_skb(&skb, info);
+	if (err)
+		return err;
+
+	hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
+			  &devlink_nl_family, NLM_F_MULTI, cmd);
+	if (!hdr) {
+		nlmsg_free(skb);
+		return -EMSGSIZE;
+	}
+
+	if (devlink_nl_put_handle(skb, devlink))
+		goto nla_put_failure;
+
+	resources_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST);
+	if (!resources_attr)
+		goto nla_put_failure;
+
+	incomplete = false;
+	i = 0;
+	list_for_each_entry_from(resource, &devlink->resource_list, list) {
+		err = devlink_resource_put(devlink, skb, resource);
+		if (err) {
+			if (!i)
+				goto err_resource_put;
+			incomplete = true;
+			break;
+		}
+		i++;
+	}
+	nla_nest_end(skb, resources_attr);
+	genlmsg_end(skb, hdr);
+	if (incomplete)
+		goto start_again;
+send_done:
+	nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
+			NLMSG_DONE, 0, flags | NLM_F_MULTI);
+	if (!nlh) {
+		err = devlink_dpipe_send_and_alloc_skb(&skb, info);
+		if (err)
+			goto err_skb_send_alloc;
+		goto send_done;
+	}
+	return genlmsg_reply(skb, info);
+
+nla_put_failure:
+	err = -EMSGSIZE;
+err_resource_put:
+err_skb_send_alloc:
+	genlmsg_cancel(skb, hdr);
+	nlmsg_free(skb);
+	return err;
+}
+
+static int devlink_nl_cmd_resource_dump(struct sk_buff *skb,
+					struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+
+	if (list_empty(&devlink->resource_list))
+		return -EOPNOTSUPP;
+
+	return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0);
+}
+
 static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -2306,6 +2517,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 	[DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 },
 	[DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING },
 	[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 },
+	[DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64},
+	[DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64},
 };
 
 static const struct genl_ops devlink_nl_ops[] = {
@@ -2466,6 +2679,20 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
 	},
+	{
+		.cmd = DEVLINK_CMD_RESOURCE_SET,
+		.doit = devlink_nl_cmd_resource_set,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
+	{
+		.cmd = DEVLINK_CMD_RESOURCE_DUMP,
+		.doit = devlink_nl_cmd_resource_dump,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+	},
 };
 
 static struct genl_family devlink_nl_family __ro_after_init = {
@@ -2503,6 +2730,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
 	INIT_LIST_HEAD(&devlink->port_list);
 	INIT_LIST_HEAD(&devlink->sb_list);
 	INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
+	INIT_LIST_HEAD(&devlink->resource_list);
 	mutex_init(&devlink->lock);
 	return devlink;
 }
@@ -2833,6 +3061,136 @@ void devlink_dpipe_table_unregister(struct devlink *devlink,
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister);
 
+/**
+ *	devlink_resource_register - devlink resource register
+ *
+ *	@devlink: devlink
+ *	@resource_name: resource's name
+ *	@top_hierarchy: top hierarchy
+ *	@reload_required: reload is required for new configuration to
+ *			  apply
+ *	@resource_size: resource's size
+ *	@resource_id: resource's id
+ *	@parent_reosurce_id: resource's parent id
+ *	@unit: resource's unit type
+ *	@resource_ops: resource ops
+ */
+int devlink_resource_register(struct devlink *devlink,
+			      const char *resource_name,
+			      bool top_hierarchy,
+			      u64 resource_size,
+			      u64 resource_id,
+			      u64 parent_resource_id,
+			      enum devlink_resource_unit unit,
+			      const struct devlink_resource_ops *resource_ops)
+{
+	struct devlink_resource *resource;
+	struct list_head *resource_list;
+	int err = 0;
+
+	mutex_lock(&devlink->lock);
+	resource = devlink_resource_find(devlink, NULL, resource_id);
+	if (resource) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	resource = kzalloc(sizeof(*resource), GFP_KERNEL);
+	if (!resource) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	if (top_hierarchy) {
+		resource_list = &devlink->resource_list;
+	} else {
+		struct devlink_resource *parent_resource;
+
+		parent_resource = devlink_resource_find(devlink, NULL,
+							parent_resource_id);
+		if (parent_resource) {
+			resource_list = &parent_resource->resource_list;
+			resource->parent = parent_resource;
+		} else {
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+	resource->name = resource_name;
+	resource->size = resource_size;
+	resource->size_new = resource_size;
+	resource->id = resource_id;
+	resource->unit = unit;
+	resource->resource_ops = resource_ops;
+	resource->size_valid = true;
+	INIT_LIST_HEAD(&resource->resource_list);
+	list_add_tail(&resource->list, resource_list);
+out:
+	mutex_unlock(&devlink->lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devlink_resource_register);
+
+/**
+ *	devlink_resources_unregister - free all resources
+ *
+ *	@devlink: devlink
+ *	@resource: resource
+ */
+void devlink_resources_unregister(struct devlink *devlink,
+				  struct devlink_resource *resource)
+{
+	struct devlink_resource *tmp, *__resource;
+	struct list_head *resource_list;
+
+	if (resource)
+		resource_list = &resource->resource_list;
+	else
+		resource_list = &devlink->resource_list;
+
+	if (!resource)
+		mutex_lock(&devlink->lock);
+
+	list_for_each_entry_safe(__resource, tmp, resource_list, list) {
+		devlink_resources_unregister(devlink, __resource);
+		list_del(&__resource->list);
+		kfree(__resource);
+	}
+
+	if (!resource)
+		mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_resources_unregister);
+
+/**
+ *	devlink_resource_size_get - get and update size
+ *
+ *	@devlink: devlink
+ *	@resource_id: the requested resource id
+ *	@p_resource_size: ptr to update
+ */
+int devlink_resource_size_get(struct devlink *devlink,
+			      u64 resource_id,
+			      u64 *p_resource_size)
+{
+	struct devlink_resource *resource;
+	int err = 0;
+
+	mutex_lock(&devlink->lock);
+	resource = devlink_resource_find(devlink, NULL, resource_id);
+	if (!resource) {
+		err = -EINVAL;
+		goto out;
+	}
+	*p_resource_size = resource->size_new;
+	resource->size = resource->size_new;
+out:
+	mutex_unlock(&devlink->lock);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devlink_resource_size_get);
+
 static int __init devlink_module_init(void)
 {
 	return genl_register_family(&devlink_nl_family);
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v2 01/10] devlink: Add per devlink instance lock
From: Jiri Pirko @ 2017-12-26 11:23 UTC (permalink / raw)
  To: netdev
  Cc: davem, arkadis, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
	jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
	yuvalm, ogerlitz, dsa, roopa
In-Reply-To: <20171226112359.5313-1-jiri@resnulli.us>

From: Arkadi Sharshevsky <arkadis@mellanox.com>

This is a preparation before introducing resources and hot reload support.
Currently there are two global lock where one protects all devlink access,
and the second one protects devlink port access. This patch adds per devlink
instance lock which protects the internal members which are the sb/dpipe/
resource/ports. By introducing this lock the global devlink port lock can
be discarded.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
v1->v2
- Add flag for indication about not locking the devlink per-instance
  lock. It is currently used for port split commands, and will be used
  for reload as well.
---
 include/net/devlink.h |   1 +
 net/core/devlink.c    | 136 ++++++++++++++++++++++++++++----------------------
 2 files changed, 78 insertions(+), 59 deletions(-)

diff --git a/include/net/devlink.h b/include/net/devlink.h
index b9654e1..4d2c6fc 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -30,6 +30,7 @@ struct devlink {
 	const struct devlink_ops *ops;
 	struct device *dev;
 	possible_net_t _net;
+	struct mutex lock;
 	char priv[0] __aligned(NETDEV_ALIGN);
 };
 
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 7d430c1..2f71734 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -92,12 +92,6 @@ static LIST_HEAD(devlink_list);
  */
 static DEFINE_MUTEX(devlink_mutex);
 
-/* devlink_port_mutex
- *
- * Shared lock to guard lists of ports in all devlink devices.
- */
-static DEFINE_MUTEX(devlink_port_mutex);
-
 static struct net *devlink_net(const struct devlink *devlink)
 {
 	return read_pnet(&devlink->_net);
@@ -335,15 +329,18 @@ devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
 #define DEVLINK_NL_FLAG_NEED_DEVLINK	BIT(0)
 #define DEVLINK_NL_FLAG_NEED_PORT	BIT(1)
 #define DEVLINK_NL_FLAG_NEED_SB		BIT(2)
-#define DEVLINK_NL_FLAG_LOCK_PORTS	BIT(3)
-	/* port is not needed but we need to ensure they don't
-	 * change in the middle of command
-	 */
+
+/* The per devlink instance lock is taken by default in the pre-doit
+ * operation, yet several commands do not require this. The global
+ * devlink lock is taken and protects from disruption by user-calls.
+ */
+#define DEVLINK_NL_FLAG_NO_LOCK		BIT(3)
 
 static int devlink_nl_pre_doit(const struct genl_ops *ops,
 			       struct sk_buff *skb, struct genl_info *info)
 {
 	struct devlink *devlink;
+	int err;
 
 	mutex_lock(&devlink_mutex);
 	devlink = devlink_get_from_info(info);
@@ -351,44 +348,47 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops,
 		mutex_unlock(&devlink_mutex);
 		return PTR_ERR(devlink);
 	}
+	if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
+		mutex_lock(&devlink->lock);
 	if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) {
 		info->user_ptr[0] = devlink;
 	} else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) {
 		struct devlink_port *devlink_port;
 
-		mutex_lock(&devlink_port_mutex);
 		devlink_port = devlink_port_get_from_info(devlink, info);
 		if (IS_ERR(devlink_port)) {
-			mutex_unlock(&devlink_port_mutex);
-			mutex_unlock(&devlink_mutex);
-			return PTR_ERR(devlink_port);
+			err = PTR_ERR(devlink_port);
+			goto unlock;
 		}
 		info->user_ptr[0] = devlink_port;
 	}
-	if (ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS) {
-		mutex_lock(&devlink_port_mutex);
-	}
 	if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_SB) {
 		struct devlink_sb *devlink_sb;
 
 		devlink_sb = devlink_sb_get_from_info(devlink, info);
 		if (IS_ERR(devlink_sb)) {
-			if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT)
-				mutex_unlock(&devlink_port_mutex);
-			mutex_unlock(&devlink_mutex);
-			return PTR_ERR(devlink_sb);
+			err = PTR_ERR(devlink_sb);
+			goto unlock;
 		}
 		info->user_ptr[1] = devlink_sb;
 	}
 	return 0;
+
+unlock:
+	if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
+		mutex_unlock(&devlink->lock);
+	mutex_unlock(&devlink_mutex);
+	return err;
 }
 
 static void devlink_nl_post_doit(const struct genl_ops *ops,
 				 struct sk_buff *skb, struct genl_info *info)
 {
-	if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT ||
-	    ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS)
-		mutex_unlock(&devlink_port_mutex);
+	struct devlink *devlink;
+
+	devlink = devlink_get_from_info(info);
+	if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
+		mutex_unlock(&devlink->lock);
 	mutex_unlock(&devlink_mutex);
 }
 
@@ -614,10 +614,10 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg,
 	int err;
 
 	mutex_lock(&devlink_mutex);
-	mutex_lock(&devlink_port_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
 		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
 			continue;
+		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_port, &devlink->port_list, list) {
 			if (idx < start) {
 				idx++;
@@ -628,13 +628,15 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg,
 						   NETLINK_CB(cb->skb).portid,
 						   cb->nlh->nlmsg_seq,
 						   NLM_F_MULTI);
-			if (err)
+			if (err) {
+				mutex_unlock(&devlink->lock);
 				goto out;
+			}
 			idx++;
 		}
+		mutex_unlock(&devlink->lock);
 	}
 out:
-	mutex_unlock(&devlink_port_mutex);
 	mutex_unlock(&devlink_mutex);
 
 	cb->args[0] = idx;
@@ -801,6 +803,7 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg,
 	list_for_each_entry(devlink, &devlink_list, list) {
 		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
 			continue;
+		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
 			if (idx < start) {
 				idx++;
@@ -811,10 +814,13 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg,
 						 NETLINK_CB(cb->skb).portid,
 						 cb->nlh->nlmsg_seq,
 						 NLM_F_MULTI);
-			if (err)
+			if (err) {
+				mutex_unlock(&devlink->lock);
 				goto out;
+			}
 			idx++;
 		}
+		mutex_unlock(&devlink->lock);
 	}
 out:
 	mutex_unlock(&devlink_mutex);
@@ -935,14 +941,18 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg,
 		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
 		    !devlink->ops || !devlink->ops->sb_pool_get)
 			continue;
+		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
 			err = __sb_pool_get_dumpit(msg, start, &idx, devlink,
 						   devlink_sb,
 						   NETLINK_CB(cb->skb).portid,
 						   cb->nlh->nlmsg_seq);
-			if (err && err != -EOPNOTSUPP)
+			if (err && err != -EOPNOTSUPP) {
+				mutex_unlock(&devlink->lock);
 				goto out;
+			}
 		}
+		mutex_unlock(&devlink->lock);
 	}
 out:
 	mutex_unlock(&devlink_mutex);
@@ -1123,22 +1133,24 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg,
 	int err;
 
 	mutex_lock(&devlink_mutex);
-	mutex_lock(&devlink_port_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
 		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
 		    !devlink->ops || !devlink->ops->sb_port_pool_get)
 			continue;
+		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
 			err = __sb_port_pool_get_dumpit(msg, start, &idx,
 							devlink, devlink_sb,
 							NETLINK_CB(cb->skb).portid,
 							cb->nlh->nlmsg_seq);
-			if (err && err != -EOPNOTSUPP)
+			if (err && err != -EOPNOTSUPP) {
+				mutex_unlock(&devlink->lock);
 				goto out;
+			}
 		}
+		mutex_unlock(&devlink->lock);
 	}
 out:
-	mutex_unlock(&devlink_port_mutex);
 	mutex_unlock(&devlink_mutex);
 
 	cb->args[0] = idx;
@@ -1347,23 +1359,26 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,
 	int err;
 
 	mutex_lock(&devlink_mutex);
-	mutex_lock(&devlink_port_mutex);
 	list_for_each_entry(devlink, &devlink_list, list) {
 		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
 		    !devlink->ops || !devlink->ops->sb_tc_pool_bind_get)
 			continue;
+
+		mutex_lock(&devlink->lock);
 		list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
 			err = __sb_tc_pool_bind_get_dumpit(msg, start, &idx,
 							   devlink,
 							   devlink_sb,
 							   NETLINK_CB(cb->skb).portid,
 							   cb->nlh->nlmsg_seq);
-			if (err && err != -EOPNOTSUPP)
+			if (err && err != -EOPNOTSUPP) {
+				mutex_unlock(&devlink->lock);
 				goto out;
+			}
 		}
+		mutex_unlock(&devlink->lock);
 	}
 out:
-	mutex_unlock(&devlink_port_mutex);
 	mutex_unlock(&devlink_mutex);
 
 	cb->args[0] = idx;
@@ -2322,14 +2337,16 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.doit = devlink_nl_cmd_port_split_doit,
 		.policy = devlink_nl_policy,
 		.flags = GENL_ADMIN_PERM,
-		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+				  DEVLINK_NL_FLAG_NO_LOCK,
 	},
 	{
 		.cmd = DEVLINK_CMD_PORT_UNSPLIT,
 		.doit = devlink_nl_cmd_port_unsplit_doit,
 		.policy = devlink_nl_policy,
 		.flags = GENL_ADMIN_PERM,
-		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+				  DEVLINK_NL_FLAG_NO_LOCK,
 	},
 	{
 		.cmd = DEVLINK_CMD_SB_GET,
@@ -2397,8 +2414,7 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.policy = devlink_nl_policy,
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
-				  DEVLINK_NL_FLAG_NEED_SB |
-				  DEVLINK_NL_FLAG_LOCK_PORTS,
+				  DEVLINK_NL_FLAG_NEED_SB,
 	},
 	{
 		.cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR,
@@ -2406,8 +2422,7 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.policy = devlink_nl_policy,
 		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
-				  DEVLINK_NL_FLAG_NEED_SB |
-				  DEVLINK_NL_FLAG_LOCK_PORTS,
+				  DEVLINK_NL_FLAG_NEED_SB,
 	},
 	{
 		.cmd = DEVLINK_CMD_ESWITCH_GET,
@@ -2488,6 +2503,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
 	INIT_LIST_HEAD(&devlink->port_list);
 	INIT_LIST_HEAD(&devlink->sb_list);
 	INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
+	mutex_init(&devlink->lock);
 	return devlink;
 }
 EXPORT_SYMBOL_GPL(devlink_alloc);
@@ -2550,16 +2566,16 @@ int devlink_port_register(struct devlink *devlink,
 			  struct devlink_port *devlink_port,
 			  unsigned int port_index)
 {
-	mutex_lock(&devlink_port_mutex);
+	mutex_lock(&devlink->lock);
 	if (devlink_port_index_exists(devlink, port_index)) {
-		mutex_unlock(&devlink_port_mutex);
+		mutex_unlock(&devlink->lock);
 		return -EEXIST;
 	}
 	devlink_port->devlink = devlink;
 	devlink_port->index = port_index;
 	devlink_port->registered = true;
 	list_add_tail(&devlink_port->list, &devlink->port_list);
-	mutex_unlock(&devlink_port_mutex);
+	mutex_unlock(&devlink->lock);
 	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
 	return 0;
 }
@@ -2572,10 +2588,12 @@ EXPORT_SYMBOL_GPL(devlink_port_register);
  */
 void devlink_port_unregister(struct devlink_port *devlink_port)
 {
+	struct devlink *devlink = devlink_port->devlink;
+
 	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
-	mutex_lock(&devlink_port_mutex);
+	mutex_lock(&devlink->lock);
 	list_del(&devlink_port->list);
-	mutex_unlock(&devlink_port_mutex);
+	mutex_unlock(&devlink->lock);
 }
 EXPORT_SYMBOL_GPL(devlink_port_unregister);
 
@@ -2651,7 +2669,7 @@ int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 	struct devlink_sb *devlink_sb;
 	int err = 0;
 
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	if (devlink_sb_index_exists(devlink, sb_index)) {
 		err = -EEXIST;
 		goto unlock;
@@ -2670,7 +2688,7 @@ int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
 	devlink_sb->egress_tc_count = egress_tc_count;
 	list_add_tail(&devlink_sb->list, &devlink->sb_list);
 unlock:
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 	return err;
 }
 EXPORT_SYMBOL_GPL(devlink_sb_register);
@@ -2679,11 +2697,11 @@ void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index)
 {
 	struct devlink_sb *devlink_sb;
 
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	devlink_sb = devlink_sb_get_by_index(devlink, sb_index);
 	WARN_ON(!devlink_sb);
 	list_del(&devlink_sb->list);
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 	kfree(devlink_sb);
 }
 EXPORT_SYMBOL_GPL(devlink_sb_unregister);
@@ -2699,9 +2717,9 @@ EXPORT_SYMBOL_GPL(devlink_sb_unregister);
 int devlink_dpipe_headers_register(struct devlink *devlink,
 				   struct devlink_dpipe_headers *dpipe_headers)
 {
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	devlink->dpipe_headers = dpipe_headers;
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register);
@@ -2715,9 +2733,9 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register);
  */
 void devlink_dpipe_headers_unregister(struct devlink *devlink)
 {
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	devlink->dpipe_headers = NULL;
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_headers_unregister);
 
@@ -2783,9 +2801,9 @@ int devlink_dpipe_table_register(struct devlink *devlink,
 	table->priv = priv;
 	table->counter_control_extern = counter_control_extern;
 
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	list_add_tail_rcu(&table->list, &devlink->dpipe_table_list);
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_table_register);
@@ -2801,17 +2819,17 @@ void devlink_dpipe_table_unregister(struct devlink *devlink,
 {
 	struct devlink_dpipe_table *table;
 
-	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink->lock);
 	table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
 					 table_name);
 	if (!table)
 		goto unlock;
 	list_del_rcu(&table->list);
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 	kfree_rcu(table, rcu);
 	return;
 unlock:
-	mutex_unlock(&devlink_mutex);
+	mutex_unlock(&devlink->lock);
 }
 EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister);
 
-- 
2.9.5

^ permalink raw reply related

* [patch net-next v2 00/10] Add support for resource abstraction
From: Jiri Pirko @ 2017-12-26 11:23 UTC (permalink / raw)
  To: netdev
  Cc: davem, arkadis, mlxsw, andrew, vivien.didelot, f.fainelli,
	michael.chan, ganeshgr, saeedm, matanb, leonro, idosch,
	jakub.kicinski, ast, daniel, simon.horman, pieter.jansenvanvuuren,
	john.hurley, alexander.h.duyck, linville, gospo, steven.lin1,
	yuvalm, ogerlitz, dsa, roopa

From: Jiri Pirko <jiri@mellanox.com>

Many of the ASIC's internal resources are limited and are shared between
several hardware procedures. For example, unified hash-based memory can
be used for many lookup purposes, like FDB and LPM. In many cases the user
can provide a partitioning scheme for such a resource in order to perform
fine tuning for his application. In such cases performing driver reload is
needed for the changes to take place, thus this patchset also adds support
for hot reload.

Such an abstraction can be coupled with devlink's dpipe interface, which
models the ASIC's pipeline as a graph of match/action tables. By modeling
the hardware resource object, and by coupling it to several dpipe tables,
further visibility can be achieved in order to debug ASIC-wide issues.

The proposed interface will provide the user the ability to understand the
limitations of the hardware, and receive notification regarding its occupancy.
Furthermore, monitoring the resource occupancy can be done in real-time and
can be useful in many cases.
---
Userspace part prototype can be found at https://github.com/arkadis/iproute2/
at resource_dev branch.

v1->v2
- Add resource size attribute.
- Fix split bug.

Arkadi Sharshevsky (10):
  devlink: Add per devlink instance lock
  devlink: Add support for resource abstraction
  devlink: Add support for reload
  devlink: Add relation between dpipe and resource
  mlxsw: pci: Add support for performing bus reset
  mlxsw: spectrum: Register KVD resources with devlink
  mlxsw: spectrum_dpipe: Connect dpipe tables to resources
  mlxsw: spectrum: Add support for getting kvdl occupancy
  mlxsw: pci: Add support for getting resource through devlink
  mlxsw: core: Add support for reload

 drivers/net/ethernet/mellanox/mlxsw/core.c         |  85 ++-
 drivers/net/ethernet/mellanox/mlxsw/core.h         |  16 +-
 drivers/net/ethernet/mellanox/mlxsw/i2c.c          |   5 +-
 drivers/net/ethernet/mellanox/mlxsw/pci.c          |  98 ++--
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c     | 205 ++++++++
 drivers/net/ethernet/mellanox/mlxsw/spectrum.h     |  13 +
 .../net/ethernet/mellanox/mlxsw/spectrum_dpipe.c   |  72 ++-
 .../net/ethernet/mellanox/mlxsw/spectrum_kvdl.c    |  26 +
 include/net/devlink.h                              |  97 ++++
 include/uapi/linux/devlink.h                       |  21 +
 net/core/devlink.c                                 | 573 ++++++++++++++++++---
 11 files changed, 1079 insertions(+), 132 deletions(-)

-- 
2.9.5

^ permalink raw reply

* Re: [bpf-next V2 PATCH 11/14] virtio_net: setup xdp_rxq_info
From: Jesper Dangaard Brouer @ 2017-12-26 11:20 UTC (permalink / raw)
  To: Daniel Borkmann, Alexei Starovoitov
  Cc: Michael S. Tsirkin, netdev, Jason Wang, dsahern, virtualization,
	gospo, bjorn.topel, michael.chan, brouer
In-Reply-To: <151396275093.20006.9684557344807016870.stgit@firesoul>

On Fri, 22 Dec 2017 18:12:31 +0100
Jesper Dangaard Brouer <brouer@redhat.com> wrote:

> The virtio_net driver doesn't dynamically change the RX-ring queue
> layout and backing pages, but instead reject XDP setup if all the
> conditions for XDP is not meet.  Thus, the xdp_rxq_info also remains
> fairly static.  This allow us to simply add the reg/unreg to
> net_device open/close functions.
> 
> Driver hook points for xdp_rxq_info:
>  * reg  : virtnet_open
>  * unreg: virtnet_close
> 
> Cc: "Michael S. Tsirkin" <mst@redhat.com>
> Cc: Jason Wang <jasowang@redhat.com>
> Cc: virtualization@lists.linux-foundation.org
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> ---
>  drivers/net/virtio_net.c |   12 +++++++++++-

Found a bug while testing this... I forgot to update receive_mergeable() too.

I'll send a V3

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH iproute] qdisc: Print offload indication
From: Jiri Pirko @ 2017-12-26 11:16 UTC (permalink / raw)
  To: Yuval Mintz; +Cc: netdev, stephen, mlxsw
In-Reply-To: <1514281725-47488-1-git-send-email-yuvalm@mellanox.com>

Tue, Dec 26, 2017 at 10:48:45AM CET, yuvalm@mellanox.com wrote:
>Use the newly added TCA_HW_OFFLOAD indication from kernel
>to print a consistent 'offloaded' message to user when listing qdiscs.
>
>Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>

Reviewed-by: Jiri Pirko <jiri@mellanox.com>

^ permalink raw reply

* [bisected] Forwarded packets occasionally has loopback output interface in Netfilter
From: Anders K. Pedersen | Cohaesio @ 2017-12-26 11:05 UTC (permalink / raw)
  To: weiwan@google.com; +Cc: netdev@vger.kernel.org, netfilter-devel@vger.kernel.org

Hello,

On one of our border routers, Netfilter is occasionally logging packets
with "OUT=lo" (output interface lo) even though the packet should be
going out via a regular interface. This behavior is present on Linux
4.13.0 to 4.14.9, and a bisection of the problem points to

[95c47f9cf5e028d1ae77dc6c767c1edc8a18025b] ipv4: call dst_dev_put() properly

as the first bad commit. This commit adds dst_dev_put() calls before
some dst_release() calls, and dst_dev_put() does

    dst->dev = dev_net(dst->dev)->loopback_dev;

(among other things), which fits the problem we're seeing.

The essential part of our nftables rule set that shows this behavior is

chain forward {
                type filter hook forward priority 0;

                meta oif { $internal_interfaces } accept

                meta oif lo ip daddr != 127.0.0.0/8 \
                        log group 0 snaplen 80 prefix "oif-lo" counter

                ip saddr { $our_ip_series } \
                        flow table acct_out \
                        { meta oif . rt nexthop . ip saddr timeout 12m counter } \
                        accept

                log group 0 snaplen 80 prefix "DROP" counter drop
}

The router only does stateless packet filtering and no redirection or
rewriting of the packets (connection tracking, NAT, ipvs etc. are not
even compiled for this kernel).

As a result of this problem we see packets that should be going to an
internal interface (and thus accepted by the first rule above) being
logged and dropped by the last rule. Some examples:

Dec 22 11:57:02 cix4 oif-lo IN=eth10 OUT=lo MAC=90:e2:ba:5c:b6:95:10:f3:11:38:06:77:08:00 SRC=81.170.163.118 DST=212.97.158.33 LEN=1500 TOS=00 PREC=0x00 TTL=116 ID=25932 DF PROTO=TCP SPT=35118 DPT=8443 SEQ=604358330 ACK=1182278705 WINDOW=3295 ACK URGP=0 MARK=0
Dec 22 11:57:02 cix4 DROP IN=eth10 OUT=lo MAC=90:e2:ba:5c:b6:95:10:f3:11:38:06:77:08:00 SRC=81.170.163.118 DST=212.97.158.33 LEN=1500 TOS=00 PREC=0x00 TTL=116 ID=25932 DF PROTO=TCP SPT=35118 DPT=8443 SEQ=604358330 ACK=1182278705 WINDOW=3295 ACK URGP=0 MARK=0

Dec 22 12:47:07 cix4 oif-lo IN=eth10 OUT=lo MAC=90:e2:ba:5c:b6:95:0e:86:10:27:99:f3:08:00 SRC=40.101.30.18 DST=212.97.130.32 LEN=245 TOS=00 PREC=0x00 TTL=118 ID=10370 DF PROTO=TCP SPT=443 DPT=44988 SEQ=1141545913 ACK=3844573103 WINDOW=65535 ACK PSH URGP=0 MARK=0
Dec 22 12:47:07 cix4 DROP IN=eth10 OUT=lo MAC=90:e2:ba:5c:b6:95:0e:86:10:27:99:f3:08:00 SRC=40.101.30.18 DST=212.97.130.32 LEN=245 TOS=00 PREC=0x00 TTL=118 ID=10370 DF PROTO=TCP SPT=443 DPT=44988 SEQ=1141545913 ACK=3844573103 WINDOW=65535 ACK PSH URGP=0 MARK=0

Dec 22 12:53:56 cix4 oif-lo IN=eth10 OUT=lo MAC=90:e2:ba:5c:b6:95:0e:86:10:27:99:f3:08:00 SRC=40.101.12.34 DST=212.97.130.32 LEN=245 TOS=00 PREC=0x00 TTL=115 ID=27728 DF PROTO=TCP SPT=443 DPT=39724 SEQ=3797156404 ACK=3944234612 WINDOW=65535 ACK PSH URGP=0 MARK=0
Dec 22 12:53:56 cix4 DROP IN=eth10 OUT=lo MAC=90:e2:ba:5c:b6:95:0e:86:10:27:99:f3:08:00 SRC=40.101.12.34 DST=212.97.130.32 LEN=245 TOS=00 PREC=0x00 TTL=115 ID=27728 DF PROTO=TCP SPT=443 DPT=39724 SEQ=3797156404 ACK=3944234612 WINDOW=65535 ACK PSH URGP=0 MARK=0

It also happens for outbound traffic, where the packets are logged and
counted in the acct_out flow table with "meta oif" = "lo", but a
correct "rt nexthop" - an example:

Dec 22 12:29:13 cix4 oif-lo IN=team0.20 OUT=lo MAC=3c:fd:fe:15:db:a8:00:24:a8:ff:f0:00:08:00 SRC=212.97.129.25 DST=95.166.119.129 LEN=40 TOS=00 PREC=0x00 TTL=62 ID=19481 DF PROTO=TCP SPT=443 DPT=52560 SEQ=3034827396 ACK=2862814901 WINDOW=12618 ACK URGP=0 MARK=0

# nft list flow table filter acct_out|tr ',' '\n'|grep lo
        flow table acct_out {
 "lo" . 94.101.208.217 . 212.97.129.25 expires 3m17s : counter packets 1 bytes 40

I don't know if these packets are actually sent out on the correct
outbound interface thanks to the proper nexthop (the MAC= information
in the Netfilter log is from the received packet and thus not useful
here).

I tried running a tcpdump on the lo interface to see if these packets
would show up there, but during the three days I had it running, it
only logged one such packet, while Netfilter logs 20+ outbound packets
every day, and the one packet logged by tcpdump was *not* logged by
Netfilter.

If it helps, I can try adding each of the six dst_dev_put() calls from
the first bad commit individually to see which of them that has an
effect on this.

Regards,
Anders K. Pedersen

^ permalink raw reply

* [PATCH iproute] qdisc: Print offload indication
From: Yuval Mintz @ 2017-12-26  9:48 UTC (permalink / raw)
  To: netdev, stephen; +Cc: mlxsw, Yuval Mintz

Use the newly added TCA_HW_OFFLOAD indication from kernel
to print a consistent 'offloaded' message to user when listing qdiscs.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
---
 include/uapi/linux/rtnetlink.h | 1 +
 tc/tc_qdisc.c                  | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index b8335b5..13bf56f 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -557,6 +557,7 @@ enum {
 	TCA_PAD,
 	TCA_DUMP_INVISIBLE,
 	TCA_CHAIN,
+	TCA_HW_OFFLOAD,
 	__TCA_MAX
 };
 
diff --git a/tc/tc_qdisc.c b/tc/tc_qdisc.c
index 4431d5f..70279b9 100644
--- a/tc/tc_qdisc.c
+++ b/tc/tc_qdisc.c
@@ -266,6 +266,10 @@ int print_qdisc(const struct sockaddr_nl *who,
 	if (t->tcm_info != 1)
 		print_uint(PRINT_ANY, "refcnt", "refcnt %u ", t->tcm_info);
 
+	if (tb[TCA_HW_OFFLOAD] &&
+	    (rta_getattr_u8(tb[TCA_HW_OFFLOAD])))
+		print_bool(PRINT_ANY, "offloaded", "offloaded ", true);
+
 	/* pfifo_fast is generic enough to warrant the hardcoding --JHS */
 	if (strcmp("pfifo_fast", RTA_DATA(tb[TCA_KIND])) == 0)
 		q = get_qdisc_kind("prio");
-- 
2.4.3

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox