Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 7/7] net: sched: call reoffload op on block callback reg
From: Jakub Kicinski @ 2018-06-25  4:34 UTC (permalink / raw)
  To: davem, jiri
  Cc: xiyou.wangcong, jhs, gerlitz.or, netdev, oss-drivers, John Hurley,
	Jakub Kicinski
In-Reply-To: <20180625043431.13413-1-jakub.kicinski@netronome.com>

From: John Hurley <john.hurley@netronome.com>

Call the reoffload tcf_proto_op on all tcf_proto nodes in all chains of a
block when a callback tries to register to a block that already has
offloaded rules. If any of the existing rules cannot be offloaded then the
registration is rejected. This replaces the previous policy of rejecting
such callback registrations outright.

On unregistration of a callback, the rules are flushed for that given cb.
The implementation of block sharing in the NFP driver, for example,
duplicates shared rules to all devs bound to a block. Previously, this
meant that rules could still exist in hw even after a device was unbound
from a block (assuming the block remained active).

Signed-off-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 .../net/ethernet/mellanox/mlxsw/spectrum.c    |  4 +-
 include/net/pkt_cls.h                         |  6 ++-
 net/sched/cls_api.c                           | 54 ++++++++++++++++---
 3 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index d2bc335dda11..52437363766a 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -1542,7 +1542,7 @@ mlxsw_sp_setup_tc_block_flower_bind(struct mlxsw_sp_port *mlxsw_sp_port,
 
 err_block_bind:
 	if (!tcf_block_cb_decref(block_cb)) {
-		__tcf_block_cb_unregister(block_cb);
+		__tcf_block_cb_unregister(block, block_cb);
 err_cb_register:
 		mlxsw_sp_acl_block_destroy(acl_block);
 	}
@@ -1572,7 +1572,7 @@ mlxsw_sp_setup_tc_block_flower_unbind(struct mlxsw_sp_port *mlxsw_sp_port,
 	err = mlxsw_sp_acl_block_unbind(mlxsw_sp, acl_block,
 					mlxsw_sp_port, ingress);
 	if (!err && !tcf_block_cb_decref(block_cb)) {
-		__tcf_block_cb_unregister(block_cb);
+		__tcf_block_cb_unregister(block, block_cb);
 		mlxsw_sp_acl_block_destroy(acl_block);
 	}
 }
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index a2c6d35ba057..4070b8eb6d14 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -78,7 +78,8 @@ struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block,
 int tcf_block_cb_register(struct tcf_block *block,
 			  tc_setup_cb_t *cb, void *cb_ident,
 			  void *cb_priv, struct netlink_ext_ack *extack);
-void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb);
+void __tcf_block_cb_unregister(struct tcf_block *block,
+			       struct tcf_block_cb *block_cb);
 void tcf_block_cb_unregister(struct tcf_block *block,
 			     tc_setup_cb_t *cb, void *cb_ident);
 
@@ -177,7 +178,8 @@ int tcf_block_cb_register(struct tcf_block *block,
 }
 
 static inline
-void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb)
+void __tcf_block_cb_unregister(struct tcf_block *block,
+			       struct tcf_block_cb *block_cb)
 {
 }
 
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 8c9fb4b827a1..3e5132e41b3a 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -751,19 +751,53 @@ unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb)
 }
 EXPORT_SYMBOL(tcf_block_cb_decref);
 
+static int
+tcf_block_playback_offloads(struct tcf_block *block, tc_setup_cb_t *cb,
+			    void *cb_priv, bool add, bool offload_in_use,
+			    struct netlink_ext_ack *extack)
+{
+	struct tcf_chain *chain;
+	struct tcf_proto *tp;
+	int err;
+
+	list_for_each_entry(chain, &block->chain_list, list) {
+		for (tp = rtnl_dereference(chain->filter_chain); tp;
+		     tp = rtnl_dereference(tp->next)) {
+			if (tp->ops->reoffload) {
+				err = tp->ops->reoffload(tp, add, cb, cb_priv,
+							 extack);
+				if (err && add)
+					goto err_playback_remove;
+			} else if (add && offload_in_use) {
+				err = -EOPNOTSUPP;
+				NL_SET_ERR_MSG(extack, "Filter replay failed - a filters doesn't support re-offloading");
+				goto err_playback_remove;
+			}
+		}
+	}
+
+	return 0;
+
+err_playback_remove:
+	tcf_block_playback_offloads(block, cb, cb_priv, false, offload_in_use,
+				    extack);
+	return err;
+}
+
 struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block,
 					     tc_setup_cb_t *cb, void *cb_ident,
 					     void *cb_priv,
 					     struct netlink_ext_ack *extack)
 {
 	struct tcf_block_cb *block_cb;
+	int err;
 
-	/* At this point, playback of previous block cb calls is not supported,
-	 * so forbid to register to block which already has some offloaded
-	 * filters present.
-	 */
-	if (tcf_block_offload_in_use(block))
-		return ERR_PTR(-EOPNOTSUPP);
+	/* Replay any already present rules */
+	err = tcf_block_playback_offloads(block, cb, cb_priv, true,
+					  tcf_block_offload_in_use(block),
+					  extack);
+	if (err)
+		return ERR_PTR(err);
 
 	block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL);
 	if (!block_cb)
@@ -788,8 +822,12 @@ int tcf_block_cb_register(struct tcf_block *block,
 }
 EXPORT_SYMBOL(tcf_block_cb_register);
 
-void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb)
+void __tcf_block_cb_unregister(struct tcf_block *block,
+			       struct tcf_block_cb *block_cb)
 {
+	tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv,
+				    false, tcf_block_offload_in_use(block),
+				    NULL);
 	list_del(&block_cb->list);
 	kfree(block_cb);
 }
@@ -803,7 +841,7 @@ void tcf_block_cb_unregister(struct tcf_block *block,
 	block_cb = tcf_block_cb_lookup(block, cb, cb_ident);
 	if (!block_cb)
 		return;
-	__tcf_block_cb_unregister(block_cb);
+	__tcf_block_cb_unregister(block, block_cb);
 }
 EXPORT_SYMBOL(tcf_block_cb_unregister);
 
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 6/7] net: sched: cls_bpf: implement offload tcf_proto_op
From: Jakub Kicinski @ 2018-06-25  4:34 UTC (permalink / raw)
  To: davem, jiri
  Cc: xiyou.wangcong, jhs, gerlitz.or, netdev, oss-drivers, John Hurley,
	Jakub Kicinski
In-Reply-To: <20180625043431.13413-1-jakub.kicinski@netronome.com>

From: John Hurley <john.hurley@netronome.com>

Add the reoffload tcf_proto_op in cls_bpf to generate an offload message
for each bpf prog in the given tcf_proto. Call the specified callback with
this new offload message. The function only returns an error if the
callback rejects adding a 'hardware only' prog.

A prog contains a flag to indicate if it is in hardware or not. To
ensure the reoffload function properly maintains this flag, keep a
reference counter for the number of instances of the prog that are in
hardware. Only update the flag when this counter changes from or to 0.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 net/sched/cls_bpf.c | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 1aa7f6511065..79857639a140 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -43,6 +43,7 @@ struct cls_bpf_prog {
 	struct tcf_result res;
 	bool exts_integrated;
 	u32 gen_flags;
+	u32 in_hw_count;
 	struct tcf_exts exts;
 	u32 handle;
 	u16 bpf_num_ops;
@@ -174,6 +175,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog,
 			cls_bpf_offload_cmd(tp, oldprog, prog, extack);
 			return err;
 		} else if (err > 0) {
+			prog->in_hw_count = err;
 			tcf_block_offload_inc(block, &prog->gen_flags);
 		}
 	}
@@ -652,6 +654,42 @@ static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg)
 	}
 }
 
+static int cls_bpf_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+			     void *cb_priv, struct netlink_ext_ack *extack)
+{
+	struct cls_bpf_head *head = rtnl_dereference(tp->root);
+	struct tcf_block *block = tp->chain->block;
+	struct tc_cls_bpf_offload cls_bpf = {};
+	struct cls_bpf_prog *prog;
+	int err;
+
+	list_for_each_entry(prog, &head->plist, link) {
+		if (tc_skip_hw(prog->gen_flags))
+			continue;
+
+		tc_cls_common_offload_init(&cls_bpf.common, tp, prog->gen_flags,
+					   extack);
+		cls_bpf.command = TC_CLSBPF_OFFLOAD;
+		cls_bpf.exts = &prog->exts;
+		cls_bpf.prog = add ? prog->filter : NULL;
+		cls_bpf.oldprog = add ? NULL : prog->filter;
+		cls_bpf.name = prog->bpf_name;
+		cls_bpf.exts_integrated = prog->exts_integrated;
+
+		err = cb(TC_SETUP_CLSBPF, &cls_bpf, cb_priv);
+		if (err) {
+			if (add && tc_skip_sw(prog->gen_flags))
+				return err;
+			continue;
+		}
+
+		tc_cls_offload_cnt_update(block, &prog->in_hw_count,
+					  &prog->gen_flags, add);
+	}
+
+	return 0;
+}
+
 static struct tcf_proto_ops cls_bpf_ops __read_mostly = {
 	.kind		=	"bpf",
 	.owner		=	THIS_MODULE,
@@ -662,6 +700,7 @@ static struct tcf_proto_ops cls_bpf_ops __read_mostly = {
 	.change		=	cls_bpf_change,
 	.delete		=	cls_bpf_delete,
 	.walk		=	cls_bpf_walk,
+	.reoffload	=	cls_bpf_reoffload,
 	.dump		=	cls_bpf_dump,
 	.bind_class	=	cls_bpf_bind_class,
 };
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 5/7] net: sched: cls_u32: implement offload tcf_proto_op
From: Jakub Kicinski @ 2018-06-25  4:34 UTC (permalink / raw)
  To: davem, jiri
  Cc: xiyou.wangcong, jhs, gerlitz.or, netdev, oss-drivers, John Hurley,
	Jakub Kicinski
In-Reply-To: <20180625043431.13413-1-jakub.kicinski@netronome.com>

From: John Hurley <john.hurley@netronome.com>

Add the reoffload tcf_proto_op in cls_u32 to generate an offload message
for each filter and the hashtable in the given tcf_proto. Call the
specified callback with this new offload message. The function only
returns an error if the callback rejects adding a 'hardware only' rule.

A filter contains a flag to indicate if it is in hardware or not. To
ensure the reoffload function properly maintains this flag, keep a
reference counter for the number of instances of the filter that are in
hardware. Only update the flag when this counter changes from or to 0.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 net/sched/cls_u32.c | 111 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index fb861f90fde6..b45489ed137d 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -62,6 +62,7 @@ struct tc_u_knode {
 	struct tc_u32_pcnt __percpu *pf;
 #endif
 	u32			flags;
+	u32			in_hw_count;
 #ifdef CONFIG_CLS_U32_MARK
 	u32			val;
 	u32			mask;
@@ -571,6 +572,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
 		u32_remove_hw_knode(tp, n, NULL);
 		return err;
 	} else if (err > 0) {
+		n->in_hw_count = err;
 		tcf_block_offload_inc(block, &n->flags);
 	}
 
@@ -1199,6 +1201,114 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
 	}
 }
 
+static int u32_reoffload_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
+			       bool add, tc_setup_cb_t *cb, void *cb_priv,
+			       struct netlink_ext_ack *extack)
+{
+	struct tc_cls_u32_offload cls_u32 = {};
+	int err;
+
+	tc_cls_common_offload_init(&cls_u32.common, tp, ht->flags, extack);
+	cls_u32.command = add ? TC_CLSU32_NEW_HNODE : TC_CLSU32_DELETE_HNODE;
+	cls_u32.hnode.divisor = ht->divisor;
+	cls_u32.hnode.handle = ht->handle;
+	cls_u32.hnode.prio = ht->prio;
+
+	err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv);
+	if (err && add && tc_skip_sw(ht->flags))
+		return err;
+
+	return 0;
+}
+
+static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n,
+			       bool add, tc_setup_cb_t *cb, void *cb_priv,
+			       struct netlink_ext_ack *extack)
+{
+	struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
+	struct tcf_block *block = tp->chain->block;
+	struct tc_cls_u32_offload cls_u32 = {};
+	int err;
+
+	tc_cls_common_offload_init(&cls_u32.common, tp, n->flags, extack);
+	cls_u32.command = add ?
+		TC_CLSU32_REPLACE_KNODE : TC_CLSU32_DELETE_KNODE;
+	cls_u32.knode.handle = n->handle;
+
+	if (add) {
+		cls_u32.knode.fshift = n->fshift;
+#ifdef CONFIG_CLS_U32_MARK
+		cls_u32.knode.val = n->val;
+		cls_u32.knode.mask = n->mask;
+#else
+		cls_u32.knode.val = 0;
+		cls_u32.knode.mask = 0;
+#endif
+		cls_u32.knode.sel = &n->sel;
+		cls_u32.knode.exts = &n->exts;
+		if (n->ht_down)
+			cls_u32.knode.link_handle = ht->handle;
+	}
+
+	err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv);
+	if (err) {
+		if (add && tc_skip_sw(n->flags))
+			return err;
+		return 0;
+	}
+
+	tc_cls_offload_cnt_update(block, &n->in_hw_count, &n->flags, add);
+
+	return 0;
+}
+
+static int u32_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+			 void *cb_priv, struct netlink_ext_ack *extack)
+{
+	struct tc_u_common *tp_c = tp->data;
+	struct tc_u_hnode *ht;
+	struct tc_u_knode *n;
+	unsigned int h;
+	int err;
+
+	for (ht = rtnl_dereference(tp_c->hlist);
+	     ht;
+	     ht = rtnl_dereference(ht->next)) {
+		if (ht->prio != tp->prio)
+			continue;
+
+		/* When adding filters to a new dev, try to offload the
+		 * hashtable first. When removing, do the filters before the
+		 * hashtable.
+		 */
+		if (add && !tc_skip_hw(ht->flags)) {
+			err = u32_reoffload_hnode(tp, ht, add, cb, cb_priv,
+						  extack);
+			if (err)
+				return err;
+		}
+
+		for (h = 0; h <= ht->divisor; h++) {
+			for (n = rtnl_dereference(ht->ht[h]);
+			     n;
+			     n = rtnl_dereference(n->next)) {
+				if (tc_skip_hw(n->flags))
+					continue;
+
+				err = u32_reoffload_knode(tp, n, add, cb,
+							  cb_priv, extack);
+				if (err)
+					return err;
+			}
+		}
+
+		if (!add && !tc_skip_hw(ht->flags))
+			u32_reoffload_hnode(tp, ht, add, cb, cb_priv, extack);
+	}
+
+	return 0;
+}
+
 static void u32_bind_class(void *fh, u32 classid, unsigned long cl)
 {
 	struct tc_u_knode *n = fh;
@@ -1336,6 +1446,7 @@ static struct tcf_proto_ops cls_u32_ops __read_mostly = {
 	.change		=	u32_change,
 	.delete		=	u32_delete,
 	.walk		=	u32_walk,
+	.reoffload	=	u32_reoffload,
 	.dump		=	u32_dump,
 	.bind_class	=	u32_bind_class,
 	.owner		=	THIS_MODULE,
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 4/7] net: sched: cls_matchall: implement offload tcf_proto_op
From: Jakub Kicinski @ 2018-06-25  4:34 UTC (permalink / raw)
  To: davem, jiri
  Cc: xiyou.wangcong, jhs, gerlitz.or, netdev, oss-drivers, John Hurley,
	Jakub Kicinski
In-Reply-To: <20180625043431.13413-1-jakub.kicinski@netronome.com>

From: John Hurley <john.hurley@netronome.com>

Add the reoffload tcf_proto_op in matchall to generate an offload message
for each filter in the given tcf_proto. Call the specified callback with
this new offload message. The function only returns an error if the
callback rejects adding a 'hardware only' rule.

Ensure matchall flags correctly report if the rule is in hw by keeping a
reference counter for the number of instances of the rule offloaded. Only
update the flag when this counter changes from or to 0.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 net/sched/cls_matchall.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 47b207ef7762..6279e95bee2e 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -21,6 +21,7 @@ struct cls_mall_head {
 	struct tcf_result res;
 	u32 handle;
 	u32 flags;
+	u32 in_hw_count;
 	struct rcu_work rwork;
 };
 
@@ -95,6 +96,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp,
 		mall_destroy_hw_filter(tp, head, cookie, NULL);
 		return err;
 	} else if (err > 0) {
+		head->in_hw_count = err;
 		tcf_block_offload_inc(block, &head->flags);
 	}
 
@@ -235,6 +237,35 @@ static void mall_walk(struct tcf_proto *tp, struct tcf_walker *arg)
 	arg->count++;
 }
 
+static int mall_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+			  void *cb_priv, struct netlink_ext_ack *extack)
+{
+	struct cls_mall_head *head = rtnl_dereference(tp->root);
+	struct tc_cls_matchall_offload cls_mall = {};
+	struct tcf_block *block = tp->chain->block;
+	int err;
+
+	if (tc_skip_hw(head->flags))
+		return 0;
+
+	tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, extack);
+	cls_mall.command = add ?
+		TC_CLSMATCHALL_REPLACE : TC_CLSMATCHALL_DESTROY;
+	cls_mall.exts = &head->exts;
+	cls_mall.cookie = (unsigned long)head;
+
+	err = cb(TC_SETUP_CLSMATCHALL, &cls_mall, cb_priv);
+	if (err) {
+		if (add && tc_skip_sw(head->flags))
+			return err;
+		return 0;
+	}
+
+	tc_cls_offload_cnt_update(block, &head->in_hw_count, &head->flags, add);
+
+	return 0;
+}
+
 static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh,
 		     struct sk_buff *skb, struct tcmsg *t)
 {
@@ -289,6 +320,7 @@ static struct tcf_proto_ops cls_mall_ops __read_mostly = {
 	.change		= mall_change,
 	.delete		= mall_delete,
 	.walk		= mall_walk,
+	.reoffload	= mall_reoffload,
 	.dump		= mall_dump,
 	.bind_class	= mall_bind_class,
 	.owner		= THIS_MODULE,
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 3/7] net: sched: cls_flower: implement offload tcf_proto_op
From: Jakub Kicinski @ 2018-06-25  4:34 UTC (permalink / raw)
  To: davem, jiri
  Cc: xiyou.wangcong, jhs, gerlitz.or, netdev, oss-drivers, John Hurley,
	Jakub Kicinski
In-Reply-To: <20180625043431.13413-1-jakub.kicinski@netronome.com>

From: John Hurley <john.hurley@netronome.com>

Add the reoffload tcf_proto_op in flower to generate an offload message
for each filter in the given tcf_proto. Call the specified callback with
this new offload message. The function only returns an error if the
callback rejects adding a 'hardware only' rule.

A filter contains a flag to indicate if it is in hardware or not. To
ensure the reoffload function properly maintains this flag, keep a
reference counter for the number of instances of the filter that are in
hardware. Only update the flag when this counter changes from or to 0. Add
a generic helper function to implement this behaviour.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 include/net/sch_generic.h | 15 +++++++++++++
 net/sched/cls_flower.c    | 44 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 88ed64f60056..c0bd11a928ed 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -336,6 +336,21 @@ static inline void tcf_block_offload_dec(struct tcf_block *block, u32 *flags)
 	block->offloadcnt--;
 }
 
+static inline void
+tc_cls_offload_cnt_update(struct tcf_block *block, u32 *cnt, u32 *flags,
+			  bool add)
+{
+	if (add) {
+		if (!*cnt)
+			tcf_block_offload_inc(block, flags);
+		(*cnt)++;
+	} else {
+		(*cnt)--;
+		if (!*cnt)
+			tcf_block_offload_dec(block, flags);
+	}
+}
+
 static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
 {
 	struct qdisc_skb_cb *qcb;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 9e8b26a80fb3..919bbcfd629b 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -87,6 +87,7 @@ struct cls_fl_filter {
 	struct list_head list;
 	u32 handle;
 	u32 flags;
+	u32 in_hw_count;
 	struct rcu_work rwork;
 	struct net_device *hw_dev;
 };
@@ -289,6 +290,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
 		fl_hw_destroy_filter(tp, f, NULL);
 		return err;
 	} else if (err > 0) {
+		f->in_hw_count = err;
 		tcf_block_offload_inc(block, &f->flags);
 	}
 
@@ -1087,6 +1089,47 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg)
 	}
 }
 
+static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
+			void *cb_priv, struct netlink_ext_ack *extack)
+{
+	struct cls_fl_head *head = rtnl_dereference(tp->root);
+	struct tc_cls_flower_offload cls_flower = {};
+	struct tcf_block *block = tp->chain->block;
+	struct fl_flow_mask *mask;
+	struct cls_fl_filter *f;
+	int err;
+
+	list_for_each_entry(mask, &head->masks, list) {
+		list_for_each_entry(f, &mask->filters, list) {
+			if (tc_skip_hw(f->flags))
+				continue;
+
+			tc_cls_common_offload_init(&cls_flower.common, tp,
+						   f->flags, extack);
+			cls_flower.command = add ?
+				TC_CLSFLOWER_REPLACE : TC_CLSFLOWER_DESTROY;
+			cls_flower.cookie = (unsigned long)f;
+			cls_flower.dissector = &mask->dissector;
+			cls_flower.mask = &f->mkey;
+			cls_flower.key = &f->key;
+			cls_flower.exts = &f->exts;
+			cls_flower.classid = f->res.classid;
+
+			err = cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv);
+			if (err) {
+				if (add && tc_skip_sw(f->flags))
+					return err;
+				continue;
+			}
+
+			tc_cls_offload_cnt_update(block, &f->in_hw_count,
+						  &f->flags, add);
+		}
+	}
+
+	return 0;
+}
+
 static int fl_dump_key_val(struct sk_buff *skb,
 			   void *val, int val_type,
 			   void *mask, int mask_type, int len)
@@ -1438,6 +1481,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = {
 	.change		= fl_change,
 	.delete		= fl_delete,
 	.walk		= fl_walk,
+	.reoffload	= fl_reoffload,
 	.dump		= fl_dump,
 	.bind_class	= fl_bind_class,
 	.owner		= THIS_MODULE,
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 2/7] net: sched: add tcf_proto_op to offload a rule
From: Jakub Kicinski @ 2018-06-25  4:34 UTC (permalink / raw)
  To: davem, jiri
  Cc: xiyou.wangcong, jhs, gerlitz.or, netdev, oss-drivers, John Hurley,
	Jakub Kicinski
In-Reply-To: <20180625043431.13413-1-jakub.kicinski@netronome.com>

From: John Hurley <john.hurley@netronome.com>

Create a new tcf_proto_op called 'reoffload' that generates a new offload
message for each node in a tcf_proto. The op takes a pointer to the
tcf_proto and a flag indicating whether the offload request is to add or
delete the node. It also takes a callback function to send the offload
message to and optional priv data to pass along with the cb.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 include/net/act_api.h     | 3 ---
 include/net/sch_generic.h | 6 ++++++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 9e59ebfded62..5ff11adbe2a6 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -190,9 +190,6 @@ static inline void tcf_action_stats_update(struct tc_action *a, u64 bytes,
 #endif
 }
 
-typedef int tc_setup_cb_t(enum tc_setup_type type,
-			  void *type_data, void *cb_priv);
-
 #ifdef CONFIG_NET_CLS_ACT
 int tc_setup_cb_egdev_register(const struct net_device *dev,
 			       tc_setup_cb_t *cb, void *cb_priv);
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 6488daa32f82..88ed64f60056 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -20,6 +20,9 @@ struct qdisc_walker;
 struct tcf_walker;
 struct module;
 
+typedef int tc_setup_cb_t(enum tc_setup_type type,
+			  void *type_data, void *cb_priv);
+
 struct qdisc_rate_table {
 	struct tc_ratespec rate;
 	u32		data[256];
@@ -256,6 +259,9 @@ struct tcf_proto_ops {
 					  bool *last,
 					  struct netlink_ext_ack *);
 	void			(*walk)(struct tcf_proto*, struct tcf_walker *arg);
+	int			(*reoffload)(struct tcf_proto *, bool,
+					     tc_setup_cb_t *, void *,
+					     struct netlink_ext_ack *);
 	void			(*bind_class)(void *, u32, unsigned long);
 
 	/* rtnetlink specific */
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 1/7] net: sched: pass extack pointer to block binds and cb registration
From: Jakub Kicinski @ 2018-06-25  4:34 UTC (permalink / raw)
  To: davem, jiri
  Cc: xiyou.wangcong, jhs, gerlitz.or, netdev, oss-drivers, John Hurley,
	Jakub Kicinski
In-Reply-To: <20180625043431.13413-1-jakub.kicinski@netronome.com>

From: John Hurley <john.hurley@netronome.com>

Pass the extack struct from a tc qdisc add to the block bind function and,
in turn, to the setup_tc ndo of the binding device via the
tc_block_offload struct. Pass this back to any block callback
registrations to allow netlink logging of failures in the bind process.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c     |  2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c |  2 +-
 .../net/ethernet/chelsio/cxgb4/cxgb4_main.c   |  2 +-
 drivers/net/ethernet/intel/i40e/i40e_main.c   |  2 +-
 .../net/ethernet/intel/i40evf/i40evf_main.c   |  2 +-
 drivers/net/ethernet/intel/igb/igb_main.c     |  2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  2 +-
 .../net/ethernet/mellanox/mlx5/core/en_main.c |  2 +-
 .../net/ethernet/mellanox/mlx5/core/en_rep.c  |  2 +-
 .../net/ethernet/mellanox/mlxsw/spectrum.c    | 10 +++++---
 drivers/net/ethernet/netronome/nfp/bpf/main.c |  2 +-
 .../ethernet/netronome/nfp/flower/offload.c   |  2 +-
 .../net/ethernet/stmicro/stmmac/stmmac_main.c |  2 +-
 drivers/net/netdevsim/netdev.c                |  2 +-
 include/net/pkt_cls.h                         | 11 +++++---
 net/dsa/slave.c                               |  2 +-
 net/sched/cls_api.c                           | 25 ++++++++++++-------
 17 files changed, 43 insertions(+), 31 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 176fc9f4d7de..b5fc6414a951 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -7984,7 +7984,7 @@ static int bnxt_setup_tc_block(struct net_device *dev,
 	switch (f->command) {
 	case TC_BLOCK_BIND:
 		return tcf_block_cb_register(f->block, bnxt_setup_tc_block_cb,
-					     bp, bp);
+					     bp, bp, f->extack);
 	case TC_BLOCK_UNBIND:
 		tcf_block_cb_unregister(f->block, bnxt_setup_tc_block_cb, bp);
 		return 0;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c
index 05d405905906..0745f2dfc80c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c
@@ -173,7 +173,7 @@ static int bnxt_vf_rep_setup_tc_block(struct net_device *dev,
 	case TC_BLOCK_BIND:
 		return tcf_block_cb_register(f->block,
 					     bnxt_vf_rep_setup_tc_block_cb,
-					     vf_rep, vf_rep);
+					     vf_rep, vf_rep, f->extack);
 	case TC_BLOCK_UNBIND:
 		tcf_block_cb_unregister(f->block,
 					bnxt_vf_rep_setup_tc_block_cb, vf_rep);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index dd04a2f89ce6..84eca1d45ad1 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -3016,7 +3016,7 @@ static int cxgb_setup_tc_block(struct net_device *dev,
 	switch (f->command) {
 	case TC_BLOCK_BIND:
 		return tcf_block_cb_register(f->block, cxgb_setup_tc_block_cb,
-					     pi, dev);
+					     pi, dev, f->extack);
 	case TC_BLOCK_UNBIND:
 		tcf_block_cb_unregister(f->block, cxgb_setup_tc_block_cb, pi);
 		return 0;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 95e9dfbe9839..9e5abf5eeef8 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -7554,7 +7554,7 @@ static int i40e_setup_tc_block(struct net_device *dev,
 	switch (f->command) {
 	case TC_BLOCK_BIND:
 		return tcf_block_cb_register(f->block, i40e_setup_tc_block_cb,
-					     np, np);
+					     np, np, f->extack);
 	case TC_BLOCK_UNBIND:
 		tcf_block_cb_unregister(f->block, i40e_setup_tc_block_cb, np);
 		return 0;
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index a7b87f935411..3f8bb0d61f63 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -2926,7 +2926,7 @@ static int i40evf_setup_tc_block(struct net_device *dev,
 	switch (f->command) {
 	case TC_BLOCK_BIND:
 		return tcf_block_cb_register(f->block, i40evf_setup_tc_block_cb,
-					     adapter, adapter);
+					     adapter, adapter, f->extack);
 	case TC_BLOCK_UNBIND:
 		tcf_block_cb_unregister(f->block, i40evf_setup_tc_block_cb,
 					adapter);
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index f707709969ac..cd9dd4250a8b 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -2728,7 +2728,7 @@ static int igb_setup_tc_block(struct igb_adapter *adapter,
 	switch (f->command) {
 	case TC_BLOCK_BIND:
 		return tcf_block_cb_register(f->block, igb_setup_tc_block_cb,
-					     adapter, adapter);
+					     adapter, adapter, f->extack);
 	case TC_BLOCK_UNBIND:
 		tcf_block_cb_unregister(f->block, igb_setup_tc_block_cb,
 					adapter);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 3e87dbbc9024..d29bd8fc3ff3 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -9325,7 +9325,7 @@ static int ixgbe_setup_tc_block(struct net_device *dev,
 	switch (f->command) {
 	case TC_BLOCK_BIND:
 		return tcf_block_cb_register(f->block, ixgbe_setup_tc_block_cb,
-					     adapter, adapter);
+					     adapter, adapter, f->extack);
 	case TC_BLOCK_UNBIND:
 		tcf_block_cb_unregister(f->block, ixgbe_setup_tc_block_cb,
 					adapter);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 56c1b6f5593e..134f20a182b5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3371,7 +3371,7 @@ static int mlx5e_setup_tc_block(struct net_device *dev,
 	switch (f->command) {
 	case TC_BLOCK_BIND:
 		return tcf_block_cb_register(f->block, mlx5e_setup_tc_block_cb,
-					     priv, priv);
+					     priv, priv, f->extack);
 	case TC_BLOCK_UNBIND:
 		tcf_block_cb_unregister(f->block, mlx5e_setup_tc_block_cb,
 					priv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 57987f6546e8..3f2fe95e01d9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -797,7 +797,7 @@ static int mlx5e_rep_setup_tc_block(struct net_device *dev,
 	switch (f->command) {
 	case TC_BLOCK_BIND:
 		return tcf_block_cb_register(f->block, mlx5e_rep_setup_tc_cb,
-					     priv, priv);
+					     priv, priv, f->extack);
 	case TC_BLOCK_UNBIND:
 		tcf_block_cb_unregister(f->block, mlx5e_rep_setup_tc_cb, priv);
 		return 0;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 968b88af2ef5..d2bc335dda11 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -1503,7 +1503,8 @@ static int mlxsw_sp_setup_tc_block_cb_flower(enum tc_setup_type type,
 
 static int
 mlxsw_sp_setup_tc_block_flower_bind(struct mlxsw_sp_port *mlxsw_sp_port,
-				    struct tcf_block *block, bool ingress)
+				    struct tcf_block *block, bool ingress,
+				    struct netlink_ext_ack *extack)
 {
 	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
 	struct mlxsw_sp_acl_block *acl_block;
@@ -1518,7 +1519,7 @@ mlxsw_sp_setup_tc_block_flower_bind(struct mlxsw_sp_port *mlxsw_sp_port,
 			return -ENOMEM;
 		block_cb = __tcf_block_cb_register(block,
 						   mlxsw_sp_setup_tc_block_cb_flower,
-						   mlxsw_sp, acl_block);
+						   mlxsw_sp, acl_block, extack);
 		if (IS_ERR(block_cb)) {
 			err = PTR_ERR(block_cb);
 			goto err_cb_register;
@@ -1596,11 +1597,12 @@ static int mlxsw_sp_setup_tc_block(struct mlxsw_sp_port *mlxsw_sp_port,
 	switch (f->command) {
 	case TC_BLOCK_BIND:
 		err = tcf_block_cb_register(f->block, cb, mlxsw_sp_port,
-					    mlxsw_sp_port);
+					    mlxsw_sp_port, f->extack);
 		if (err)
 			return err;
 		err = mlxsw_sp_setup_tc_block_flower_bind(mlxsw_sp_port,
-							  f->block, ingress);
+							  f->block, ingress,
+							  f->extack);
 		if (err) {
 			tcf_block_cb_unregister(f->block, cb, mlxsw_sp_port);
 			return err;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index fcdfb8e7fdea..bf46f7bff912 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -206,7 +206,7 @@ static int nfp_bpf_setup_tc_block(struct net_device *netdev,
 	case TC_BLOCK_BIND:
 		return tcf_block_cb_register(f->block,
 					     nfp_bpf_setup_tc_block_cb,
-					     nn, nn);
+					     nn, nn, f->extack);
 	case TC_BLOCK_UNBIND:
 		tcf_block_cb_unregister(f->block,
 					nfp_bpf_setup_tc_block_cb,
diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c
index c42e64f32333..7abefed1efe9 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
@@ -627,7 +627,7 @@ static int nfp_flower_setup_tc_block(struct net_device *netdev,
 	case TC_BLOCK_BIND:
 		return tcf_block_cb_register(f->block,
 					     nfp_flower_setup_tc_block_cb,
-					     repr, repr);
+					     repr, repr, f->extack);
 	case TC_BLOCK_UNBIND:
 		tcf_block_cb_unregister(f->block,
 					nfp_flower_setup_tc_block_cb,
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index cba46b62a1cd..2354e30caa78 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3776,7 +3776,7 @@ static int stmmac_setup_tc_block(struct stmmac_priv *priv,
 	switch (f->command) {
 	case TC_BLOCK_BIND:
 		return tcf_block_cb_register(f->block, stmmac_setup_tc_block_cb,
-				priv, priv);
+				priv, priv, f->extack);
 	case TC_BLOCK_UNBIND:
 		tcf_block_cb_unregister(f->block, stmmac_setup_tc_block_cb, priv);
 		return 0;
diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
index ec68f38213d9..c9dacc6fcd59 100644
--- a/drivers/net/netdevsim/netdev.c
+++ b/drivers/net/netdevsim/netdev.c
@@ -260,7 +260,7 @@ nsim_setup_tc_block(struct net_device *dev, struct tc_block_offload *f)
 	switch (f->command) {
 	case TC_BLOCK_BIND:
 		return tcf_block_cb_register(f->block, nsim_setup_tc_block_cb,
-					     ns, ns);
+					     ns, ns, f->extack);
 	case TC_BLOCK_UNBIND:
 		tcf_block_cb_unregister(f->block, nsim_setup_tc_block_cb, ns);
 		return 0;
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index a3c1a2c47cd4..a2c6d35ba057 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -73,10 +73,11 @@ void tcf_block_cb_incref(struct tcf_block_cb *block_cb);
 unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb);
 struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block,
 					     tc_setup_cb_t *cb, void *cb_ident,
-					     void *cb_priv);
+					     void *cb_priv,
+					     struct netlink_ext_ack *extack);
 int tcf_block_cb_register(struct tcf_block *block,
 			  tc_setup_cb_t *cb, void *cb_ident,
-			  void *cb_priv);
+			  void *cb_priv, struct netlink_ext_ack *extack);
 void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb);
 void tcf_block_cb_unregister(struct tcf_block *block,
 			     tc_setup_cb_t *cb, void *cb_ident);
@@ -161,7 +162,8 @@ unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb)
 static inline
 struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block,
 					     tc_setup_cb_t *cb, void *cb_ident,
-					     void *cb_priv)
+					     void *cb_priv,
+					     struct netlink_ext_ack *extack)
 {
 	return NULL;
 }
@@ -169,7 +171,7 @@ struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block,
 static inline
 int tcf_block_cb_register(struct tcf_block *block,
 			  tc_setup_cb_t *cb, void *cb_ident,
-			  void *cb_priv)
+			  void *cb_priv, struct netlink_ext_ack *extack)
 {
 	return 0;
 }
@@ -596,6 +598,7 @@ struct tc_block_offload {
 	enum tc_block_command command;
 	enum tcf_block_binder_type binder_type;
 	struct tcf_block *block;
+	struct netlink_ext_ack *extack;
 };
 
 struct tc_cls_common_offload {
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 1e3b6a6d8a40..71536c435132 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -900,7 +900,7 @@ static int dsa_slave_setup_tc_block(struct net_device *dev,
 
 	switch (f->command) {
 	case TC_BLOCK_BIND:
-		return tcf_block_cb_register(f->block, cb, dev, dev);
+		return tcf_block_cb_register(f->block, cb, dev, dev, f->extack);
 	case TC_BLOCK_UNBIND:
 		tcf_block_cb_unregister(f->block, cb, dev);
 		return 0;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index cdc3c87c53e6..8c9fb4b827a1 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -277,18 +277,21 @@ static bool tcf_block_offload_in_use(struct tcf_block *block)
 static int tcf_block_offload_cmd(struct tcf_block *block,
 				 struct net_device *dev,
 				 struct tcf_block_ext_info *ei,
-				 enum tc_block_command command)
+				 enum tc_block_command command,
+				 struct netlink_ext_ack *extack)
 {
 	struct tc_block_offload bo = {};
 
 	bo.command = command;
 	bo.binder_type = ei->binder_type;
 	bo.block = block;
+	bo.extack = extack;
 	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
 }
 
 static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
-				  struct tcf_block_ext_info *ei)
+				  struct tcf_block_ext_info *ei,
+				  struct netlink_ext_ack *extack)
 {
 	struct net_device *dev = q->dev_queue->dev;
 	int err;
@@ -299,10 +302,12 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q,
 	/* If tc offload feature is disabled and the block we try to bind
 	 * to already has some offloaded filters, forbid to bind.
 	 */
-	if (!tc_can_offload(dev) && tcf_block_offload_in_use(block))
+	if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) {
+		NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled");
 		return -EOPNOTSUPP;
+	}
 
-	err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND);
+	err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND, extack);
 	if (err == -EOPNOTSUPP)
 		goto no_offload_dev_inc;
 	return err;
@@ -322,7 +327,7 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q,
 
 	if (!dev->netdev_ops->ndo_setup_tc)
 		goto no_offload_dev_dec;
-	err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND);
+	err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND, NULL);
 	if (err == -EOPNOTSUPP)
 		goto no_offload_dev_dec;
 	return;
@@ -612,7 +617,7 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
 	if (err)
 		goto err_chain_head_change_cb_add;
 
-	err = tcf_block_offload_bind(block, q, ei);
+	err = tcf_block_offload_bind(block, q, ei, extack);
 	if (err)
 		goto err_block_offload_bind;
 
@@ -748,7 +753,8 @@ EXPORT_SYMBOL(tcf_block_cb_decref);
 
 struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block,
 					     tc_setup_cb_t *cb, void *cb_ident,
-					     void *cb_priv)
+					     void *cb_priv,
+					     struct netlink_ext_ack *extack)
 {
 	struct tcf_block_cb *block_cb;
 
@@ -772,11 +778,12 @@ EXPORT_SYMBOL(__tcf_block_cb_register);
 
 int tcf_block_cb_register(struct tcf_block *block,
 			  tc_setup_cb_t *cb, void *cb_ident,
-			  void *cb_priv)
+			  void *cb_priv, struct netlink_ext_ack *extack)
 {
 	struct tcf_block_cb *block_cb;
 
-	block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv);
+	block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv,
+					   extack);
 	return IS_ERR(block_cb) ? PTR_ERR(block_cb) : 0;
 }
 EXPORT_SYMBOL(tcf_block_cb_register);
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 0/7] net: sched: support replay of filter offload when binding to block
From: Jakub Kicinski @ 2018-06-25  4:34 UTC (permalink / raw)
  To: davem, jiri
  Cc: xiyou.wangcong, jhs, gerlitz.or, netdev, oss-drivers,
	Jakub Kicinski

Hi!

This series from John adds the ability to replay filter offload requests
when a new offload callback is being registered on a TC block.  This is most
likely to take place for shared blocks today, when a block which already
has rules is bound to another interface.  Prior to this patch set if any
of the rules were offloaded the block bind would fail.

A new tcf_proto_op is added to generate a filter-specific offload request.
The new 'offload' op is supporting extack from day 0, hence we need to
propagate extack to .ndo_setup_tc TC_BLOCK_BIND/TC_BLOCK_UNBIND and
through tcf_block_cb_register() to tcf_block_playback_offloads().

The immediate use of this patch set is to simplify life of drivers which
require duplicating rules when sharing blocks.  Switch drivers (mlxsw)
can bind ports to rule lists dynamically, NIC drivers generally don't
have that ability and need the rules to be duplicated for each ingress
they match on.  In code terms this means that switch drivers don't
register multiple callbacks for each port.  NIC drivers do, and get a
separate request and hence rule per-port, as if the block was not shared.
The registration fails today, however, if some rules were already present.

As John notes in description of patch 2, drivers which register multiple
callbacks to shared blocks will likely need to flush the rules on block
unbind.  This set makes the core not only replay the offload add
requests but also offload remove requests when callback is unregistered.

John Hurley (7):
  net: sched: pass extack pointer to block binds and cb registration
  net: sched: add tcf_proto_op to offload a rule
  net: sched: cls_flower: implement offload tcf_proto_op
  net: sched: cls_matchall: implement offload tcf_proto_op
  net: sched: cls_u32: implement offload tcf_proto_op
  net: sched: cls_bpf: implement offload tcf_proto_op
  net: sched: call reoffload op on block callback reg

 drivers/net/ethernet/broadcom/bnxt/bnxt.c     |   2 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c |   2 +-
 .../net/ethernet/chelsio/cxgb4/cxgb4_main.c   |   2 +-
 drivers/net/ethernet/intel/i40e/i40e_main.c   |   2 +-
 .../net/ethernet/intel/i40evf/i40evf_main.c   |   2 +-
 drivers/net/ethernet/intel/igb/igb_main.c     |   2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   2 +-
 .../net/ethernet/mellanox/mlx5/core/en_main.c |   2 +-
 .../net/ethernet/mellanox/mlx5/core/en_rep.c  |   2 +-
 .../net/ethernet/mellanox/mlxsw/spectrum.c    |  14 ++-
 drivers/net/ethernet/netronome/nfp/bpf/main.c |   2 +-
 .../ethernet/netronome/nfp/flower/offload.c   |   2 +-
 .../net/ethernet/stmicro/stmmac/stmmac_main.c |   2 +-
 drivers/net/netdevsim/netdev.c                |   2 +-
 include/net/act_api.h                         |   3 -
 include/net/pkt_cls.h                         |  17 ++-
 include/net/sch_generic.h                     |  21 ++++
 net/dsa/slave.c                               |   2 +-
 net/sched/cls_api.c                           |  79 ++++++++++---
 net/sched/cls_bpf.c                           |  39 ++++++
 net/sched/cls_flower.c                        |  44 +++++++
 net/sched/cls_matchall.c                      |  32 +++++
 net/sched/cls_u32.c                           | 111 ++++++++++++++++++
 23 files changed, 342 insertions(+), 46 deletions(-)

-- 
2.17.1

^ permalink raw reply

* [PATCH bpf-next 7/7] nfp: bpf: migrate to advanced reciprocal divide in reciprocal_div.h
From: Jakub Kicinski @ 2018-06-25  3:54 UTC (permalink / raw)
  To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, Jiong Wang
In-Reply-To: <20180625035421.2991-1-jakub.kicinski@netronome.com>

From: Jiong Wang <jiong.wang@netronome.com>

As we are doing JIT, we want to use the advanced version of the reciprocal
divide (reciprocal_value_adv), trading extra work on the host at JIT time
for better performance of the generated code.

We could reduce the required ALU instructions from 4 to 2 or 1.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 38 ++++++++++++++-----
 .../net/ethernet/netronome/nfp/bpf/verifier.c | 16 ++++++--
 2 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index d732b6cfc356..f99ac00bd649 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -1498,8 +1498,9 @@ static int wrp_div_imm(struct nfp_prog *nfp_prog, u8 dst, u64 imm)
 {
 	swreg tmp_both = imm_both(nfp_prog), dst_both = reg_both(dst);
 	swreg dst_a = reg_a(dst), dst_b = reg_a(dst);
-	struct reciprocal_value rvalue;
+	struct reciprocal_value_adv rvalue;
 	swreg tmp_b = imm_b(nfp_prog);
+	u8 pre_shift, exp;
 	swreg magic;
 
 	if (imm > U32_MAX) {
@@ -1507,15 +1508,34 @@ static int wrp_div_imm(struct nfp_prog *nfp_prog, u8 dst, u64 imm)
 		return 0;
 	}
 
-	rvalue = reciprocal_value(imm);
+	rvalue = reciprocal_value_adv(imm, 32);
+	exp = rvalue.exp;
+	if (rvalue.is_wide_m && !(imm & 1)) {
+		pre_shift = fls(imm & -imm) - 1;
+		rvalue = reciprocal_value_adv(imm >> pre_shift, 32 - pre_shift);
+	} else {
+		pre_shift = 0;
+	}
 	magic = re_load_imm_any(nfp_prog, rvalue.m, imm_b(nfp_prog));
-	wrp_mul_u32(nfp_prog, tmp_both, tmp_both, dst_a, magic, true);
-	emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_SUB, tmp_b);
-	emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
-		 SHF_SC_R_SHF, rvalue.sh1);
-	emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_ADD, tmp_b);
-	emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
-		 SHF_SC_R_SHF, rvalue.sh2);
+	if (imm == 1 << exp) {
+		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
+			 SHF_SC_R_SHF, exp);
+	} else if (rvalue.is_wide_m) {
+		wrp_mul_u32(nfp_prog, tmp_both, tmp_both, dst_a, magic, true);
+		emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_SUB, tmp_b);
+		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
+			 SHF_SC_R_SHF, 1);
+		emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_ADD, tmp_b);
+		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
+			 SHF_SC_R_SHF, rvalue.sh - 1);
+	} else {
+		if (pre_shift)
+			emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE,
+				 dst_b, SHF_SC_R_SHF, pre_shift);
+		wrp_mul_u32(nfp_prog, dst_both, dst_both, dst_a, magic, true);
+		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE,
+			 dst_b, SHF_SC_R_SHF, rvalue.sh);
+	}
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index f0f07e988c46..39c2c24fea11 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -561,12 +561,22 @@ nfp_bpf_check_alu(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 	/* NFP doesn't have divide instructions, we support divide by constant
 	 * through reciprocal multiplication. Given NFP support multiplication
 	 * no bigger than u32, we'd require divisor and dividend no bigger than
-	 * that as well.
+	 * that as well. There is a further range requirement on dividend,
+	 * please see the NOTE below.
 	 *
 	 * Also eBPF doesn't support signed divide and has enforced this on C
 	 * language level by failing compilation. However LLVM assembler hasn't
 	 * enforced this, so it is possible for negative constant to leak in as
 	 * a BPF_K operand through assembly code, we reject such cases as well.
+	 *
+	 * NOTE: because we are using "reciprocal_value_adv" which doesn't
+	 * support dividend with MSB set, so we need to JIT separate NFP
+	 * sequence to handle such case. It could be a simple sequence if there
+	 * is conditional move, however there isn't for NFP. So, we don't bother
+	 * generating compare-if-set-branch sequence by rejecting the program
+	 * straight away when the u32 dividend has MSB set. Divide by such a
+	 * large constant would be rare in practice. Also, the programmer could
+	 * simply rewrite it as "result = divisor >= the_const".
 	 */
 	if (is_mbpf_div(meta)) {
 		if (meta->umax_dst > U32_MAX) {
@@ -578,8 +588,8 @@ nfp_bpf_check_alu(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 				pr_vlog(env, "dividend is not constant\n");
 				return -EINVAL;
 			}
-			if (meta->umax_src > U32_MAX) {
-				pr_vlog(env, "dividend is not within u32 value range\n");
+			if (meta->umax_src > U32_MAX / 2) {
+				pr_vlog(env, "dividend is bigger than U32_MAX/2\n");
 				return -EINVAL;
 			}
 		}
-- 
2.17.1

^ permalink raw reply related

* [PATCH bpf-next 6/7] nfp: bpf: support u32 divide using reciprocal_div.h
From: Jakub Kicinski @ 2018-06-25  3:54 UTC (permalink / raw)
  To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, Jiong Wang
In-Reply-To: <20180625035421.2991-1-jakub.kicinski@netronome.com>

From: Jiong Wang <jiong.wang@netronome.com>

NFP doesn't have an integer divide instruction; this patch uses the
reciprocal algorithm (the basic one, reciprocal_div) to emulate it.

For each u32 divide, we would need 11 instructions to finish the operation.

  7 (for multiplication) + 4 (various ALUs) = 11

Given NFP only supports multiplication no bigger than u32, we'd require
divisor and dividend no bigger than that as well.

Also eBPF doesn't support signed divide and has enforced this on C language
level by failing compilation. However LLVM assembler hasn't enforced this,
so it is possible for negative constant to leak in as a BPF_K operand
through assembly code, we reject such cases as well.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 58 ++++++++++++++++++-
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  5 ++
 .../net/ethernet/netronome/nfp/bpf/verifier.c | 31 ++++++++++
 3 files changed, 93 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 7d7061d93358..d732b6cfc356 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -34,10 +34,11 @@
 #define pr_fmt(fmt)	"NFP net bpf: " fmt
 
 #include <linux/bug.h>
-#include <linux/kernel.h>
 #include <linux/bpf.h>
 #include <linux/filter.h>
+#include <linux/kernel.h>
 #include <linux/pkt_cls.h>
+#include <linux/reciprocal_div.h>
 #include <linux/unistd.h>
 
 #include "main.h"
@@ -1493,6 +1494,32 @@ wrp_mul(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 	return 0;
 }
 
+static int wrp_div_imm(struct nfp_prog *nfp_prog, u8 dst, u64 imm)
+{
+	swreg tmp_both = imm_both(nfp_prog), dst_both = reg_both(dst);
+	swreg dst_a = reg_a(dst), dst_b = reg_a(dst);
+	struct reciprocal_value rvalue;
+	swreg tmp_b = imm_b(nfp_prog);
+	swreg magic;
+
+	if (imm > U32_MAX) {
+		wrp_immed(nfp_prog, dst_both, 0);
+		return 0;
+	}
+
+	rvalue = reciprocal_value(imm);
+	magic = re_load_imm_any(nfp_prog, rvalue.m, imm_b(nfp_prog));
+	wrp_mul_u32(nfp_prog, tmp_both, tmp_both, dst_a, magic, true);
+	emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_SUB, tmp_b);
+	emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
+		 SHF_SC_R_SHF, rvalue.sh1);
+	emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_ADD, tmp_b);
+	emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
+		 SHF_SC_R_SHF, rvalue.sh2);
+
+	return 0;
+}
+
 static int adjust_head(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	swreg tmp = imm_a(nfp_prog), tmp_len = imm_b(nfp_prog);
@@ -1807,6 +1834,21 @@ static int mul_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	return wrp_mul(nfp_prog, meta, true, false);
 }
 
+static int div_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	const struct bpf_insn *insn = &meta->insn;
+
+	return wrp_div_imm(nfp_prog, insn->dst_reg * 2, insn->imm);
+}
+
+static int div_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	/* NOTE: verifier hook has rejected cases for which verifier doesn't
+	 * know whether the source operand is constant or not.
+	 */
+	return wrp_div_imm(nfp_prog, meta->insn.dst_reg * 2, meta->umin_src);
+}
+
 static int neg_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	const struct bpf_insn *insn = &meta->insn;
@@ -2230,6 +2272,16 @@ static int mul_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	return wrp_mul(nfp_prog, meta, false, false);
 }
 
+static int div_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	return div_reg64(nfp_prog, meta);
+}
+
+static int div_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	return div_imm64(nfp_prog, meta);
+}
+
 static int neg_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	u8 dst = meta->insn.dst_reg * 2;
@@ -2983,6 +3035,8 @@ static const instr_cb_t instr_cb[256] = {
 	[BPF_ALU64 | BPF_SUB | BPF_K] =	sub_imm64,
 	[BPF_ALU64 | BPF_MUL | BPF_X] =	mul_reg64,
 	[BPF_ALU64 | BPF_MUL | BPF_K] =	mul_imm64,
+	[BPF_ALU64 | BPF_DIV | BPF_X] =	div_reg64,
+	[BPF_ALU64 | BPF_DIV | BPF_K] =	div_imm64,
 	[BPF_ALU64 | BPF_NEG] =		neg_reg64,
 	[BPF_ALU64 | BPF_LSH | BPF_X] =	shl_reg64,
 	[BPF_ALU64 | BPF_LSH | BPF_K] =	shl_imm64,
@@ -3004,6 +3058,8 @@ static const instr_cb_t instr_cb[256] = {
 	[BPF_ALU | BPF_SUB | BPF_K] =	sub_imm,
 	[BPF_ALU | BPF_MUL | BPF_X] =	mul_reg,
 	[BPF_ALU | BPF_MUL | BPF_K] =	mul_imm,
+	[BPF_ALU | BPF_DIV | BPF_X] =	div_reg,
+	[BPF_ALU | BPF_DIV | BPF_K] =	div_imm,
 	[BPF_ALU | BPF_NEG] =		neg_reg,
 	[BPF_ALU | BPF_LSH | BPF_K] =	shl_imm,
 	[BPF_ALU | BPF_END | BPF_X] =	end_reg32,
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index c10079b1a312..9845c1a2d4c2 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -399,6 +399,11 @@ static inline bool is_mbpf_mul(const struct nfp_insn_meta *meta)
 	return is_mbpf_alu(meta) && mbpf_op(meta) == BPF_MUL;
 }
 
+static inline bool is_mbpf_div(const struct nfp_insn_meta *meta)
+{
+	return is_mbpf_alu(meta) && mbpf_op(meta) == BPF_DIV;
+}
+
 /**
  * struct nfp_prog - nfp BPF program
  * @bpf: backpointer to the bpf app priv structure
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index 30d4f1580693..f0f07e988c46 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -558,6 +558,37 @@ nfp_bpf_check_alu(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 		}
 	}
 
+	/* NFP doesn't have divide instructions, we support divide by constant
+	 * through reciprocal multiplication. Given NFP support multiplication
+	 * no bigger than u32, we'd require divisor and dividend no bigger than
+	 * that as well.
+	 *
+	 * Also eBPF doesn't support signed divide and has enforced this on C
+	 * language level by failing compilation. However LLVM assembler hasn't
+	 * enforced this, so it is possible for negative constant to leak in as
+	 * a BPF_K operand through assembly code, we reject such cases as well.
+	 */
+	if (is_mbpf_div(meta)) {
+		if (meta->umax_dst > U32_MAX) {
+			pr_vlog(env, "divisor is not within u32 value range\n");
+			return -EINVAL;
+		}
+		if (mbpf_src(meta) == BPF_X) {
+			if (meta->umin_src != meta->umax_src) {
+				pr_vlog(env, "dividend is not constant\n");
+				return -EINVAL;
+			}
+			if (meta->umax_src > U32_MAX) {
+				pr_vlog(env, "dividend is not within u32 value range\n");
+				return -EINVAL;
+			}
+		}
+		if (mbpf_src(meta) == BPF_K && meta->insn.imm < 0) {
+			pr_vlog(env, "divide by negative constant is not supported\n");
+			return -EINVAL;
+		}
+	}
+
 	return 0;
 }
 
-- 
2.17.1

^ permalink raw reply related

* [PATCH bpf-next 5/7] nfp: bpf: support u16 and u32 multiplications
From: Jakub Kicinski @ 2018-06-25  3:54 UTC (permalink / raw)
  To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, Jiong Wang
In-Reply-To: <20180625035421.2991-1-jakub.kicinski@netronome.com>

From: Jiong Wang <jiong.wang@netronome.com>

NFP supports u16 and u32 multiplication. Multiplication is done 8-bits per
step, therefore we need 2 steps for u16 and 4 steps for u32.

We also need one start instruction to initialize the sequence and one or
two instructions to fetch the result, depending on whether you need the high
half of the u32 multiplication.

For ALU64, if either operand is beyond u32's value range, we reject it. One
thing to note, if the source operand is BPF_K, then we need to check "imm"
field directly, and we'd reject it if it is negative.  Because for ALU64,
"imm" (with s32 type) is expected to be sign extended to s64 which NFP mul
doesn't support. For ALU32, it is fine for "imm" to be negative though,
because the result is 32 bits and there is no difference in the low half
of the result for signed/unsigned mul, so we will get the correct result.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 137 ++++++++++++++++++
 drivers/net/ethernet/netronome/nfp/bpf/main.h |   5 +
 .../net/ethernet/netronome/nfp/bpf/verifier.c |  58 ++++++--
 drivers/net/ethernet/netronome/nfp/nfp_asm.h  |  28 ++++
 4 files changed, 217 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 4a629e9b5c0f..7d7061d93358 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -415,6 +415,60 @@ emit_alu(struct nfp_prog *nfp_prog, swreg dst,
 		   reg.dst_lmextn, reg.src_lmextn);
 }
 
+static void
+__emit_mul(struct nfp_prog *nfp_prog, enum alu_dst_ab dst_ab, u16 areg,
+	   enum mul_type type, enum mul_step step, u16 breg, bool swap,
+	   bool wr_both, bool dst_lmextn, bool src_lmextn)
+{
+	u64 insn;
+
+	insn = OP_MUL_BASE |
+		FIELD_PREP(OP_MUL_A_SRC, areg) |
+		FIELD_PREP(OP_MUL_B_SRC, breg) |
+		FIELD_PREP(OP_MUL_STEP, step) |
+		FIELD_PREP(OP_MUL_DST_AB, dst_ab) |
+		FIELD_PREP(OP_MUL_SW, swap) |
+		FIELD_PREP(OP_MUL_TYPE, type) |
+		FIELD_PREP(OP_MUL_WR_AB, wr_both) |
+		FIELD_PREP(OP_MUL_SRC_LMEXTN, src_lmextn) |
+		FIELD_PREP(OP_MUL_DST_LMEXTN, dst_lmextn);
+
+	nfp_prog_push(nfp_prog, insn);
+}
+
+static void
+emit_mul(struct nfp_prog *nfp_prog, swreg lreg, enum mul_type type,
+	 enum mul_step step, swreg rreg)
+{
+	struct nfp_insn_ur_regs reg;
+	u16 areg;
+	int err;
+
+	if (type == MUL_TYPE_START && step != MUL_STEP_NONE) {
+		nfp_prog->error = -EINVAL;
+		return;
+	}
+
+	if (step == MUL_LAST || step == MUL_LAST_2) {
+		/* When type is step and step Number is LAST or LAST2, left
+		 * source is used as destination.
+		 */
+		err = swreg_to_unrestricted(lreg, reg_none(), rreg, &reg);
+		areg = reg.dst;
+	} else {
+		err = swreg_to_unrestricted(reg_none(), lreg, rreg, &reg);
+		areg = reg.areg;
+	}
+
+	if (err) {
+		nfp_prog->error = err;
+		return;
+	}
+
+	__emit_mul(nfp_prog, reg.dst_ab, areg, type, step, reg.breg, reg.swap,
+		   reg.wr_both, reg.dst_lmextn, reg.src_lmextn);
+}
+
 static void
 __emit_ld_field(struct nfp_prog *nfp_prog, enum shf_sc sc,
 		u8 areg, u8 bmask, u8 breg, u8 shift, bool imm8,
@@ -1380,6 +1434,65 @@ static void wrp_end32(struct nfp_prog *nfp_prog, swreg reg_in, u8 gpr_out)
 		      SHF_SC_R_ROT, 16);
 }
 
+static void
+wrp_mul_u32(struct nfp_prog *nfp_prog, swreg dst_hi, swreg dst_lo, swreg lreg,
+	    swreg rreg, bool gen_high_half)
+{
+	emit_mul(nfp_prog, lreg, MUL_TYPE_START, MUL_STEP_NONE, rreg);
+	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_1, rreg);
+	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_2, rreg);
+	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_3, rreg);
+	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_4, rreg);
+	emit_mul(nfp_prog, dst_lo, MUL_TYPE_STEP_32x32, MUL_LAST, reg_none());
+	if (gen_high_half)
+		emit_mul(nfp_prog, dst_hi, MUL_TYPE_STEP_32x32, MUL_LAST_2,
+			 reg_none());
+	else
+		wrp_immed(nfp_prog, dst_hi, 0);
+}
+
+static void
+wrp_mul_u16(struct nfp_prog *nfp_prog, swreg dst_hi, swreg dst_lo, swreg lreg,
+	    swreg rreg)
+{
+	emit_mul(nfp_prog, lreg, MUL_TYPE_START, MUL_STEP_NONE, rreg);
+	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_16x16, MUL_STEP_1, rreg);
+	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_16x16, MUL_STEP_2, rreg);
+	emit_mul(nfp_prog, dst_lo, MUL_TYPE_STEP_16x16, MUL_LAST, reg_none());
+}
+
+static int
+wrp_mul(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
+	bool gen_high_half, bool ropnd_from_reg)
+{
+	swreg multiplier, multiplicand, dst_hi, dst_lo;
+	const struct bpf_insn *insn = &meta->insn;
+	u32 lopnd_max, ropnd_max;
+	u8 dst_reg;
+
+	dst_reg = insn->dst_reg;
+	multiplicand = reg_a(dst_reg * 2);
+	dst_hi = reg_both(dst_reg * 2 + 1);
+	dst_lo = reg_both(dst_reg * 2);
+	lopnd_max = meta->umax_dst;
+	if (ropnd_from_reg) {
+		multiplier = reg_b(insn->src_reg * 2);
+		ropnd_max = meta->umax_src;
+	} else {
+		u32 imm = insn->imm;
+
+		multiplier = re_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
+		ropnd_max = imm;
+	}
+	if (lopnd_max > U16_MAX || ropnd_max > U16_MAX)
+		wrp_mul_u32(nfp_prog, dst_hi, dst_lo, multiplicand, multiplier,
+			    gen_high_half);
+	else
+		wrp_mul_u16(nfp_prog, dst_hi, dst_lo, multiplicand, multiplier);
+
+	return 0;
+}
+
 static int adjust_head(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	swreg tmp = imm_a(nfp_prog), tmp_len = imm_b(nfp_prog);
@@ -1684,6 +1797,16 @@ static int sub_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	return 0;
 }
 
+static int mul_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	return wrp_mul(nfp_prog, meta, true, true);
+}
+
+static int mul_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	return wrp_mul(nfp_prog, meta, true, false);
+}
+
 static int neg_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	const struct bpf_insn *insn = &meta->insn;
@@ -2097,6 +2220,16 @@ static int sub_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_SUB, !meta->insn.imm);
 }
 
+static int mul_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	return wrp_mul(nfp_prog, meta, false, true);
+}
+
+static int mul_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	return wrp_mul(nfp_prog, meta, false, false);
+}
+
 static int neg_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	u8 dst = meta->insn.dst_reg * 2;
@@ -2848,6 +2981,8 @@ static const instr_cb_t instr_cb[256] = {
 	[BPF_ALU64 | BPF_ADD | BPF_K] =	add_imm64,
 	[BPF_ALU64 | BPF_SUB | BPF_X] =	sub_reg64,
 	[BPF_ALU64 | BPF_SUB | BPF_K] =	sub_imm64,
+	[BPF_ALU64 | BPF_MUL | BPF_X] =	mul_reg64,
+	[BPF_ALU64 | BPF_MUL | BPF_K] =	mul_imm64,
 	[BPF_ALU64 | BPF_NEG] =		neg_reg64,
 	[BPF_ALU64 | BPF_LSH | BPF_X] =	shl_reg64,
 	[BPF_ALU64 | BPF_LSH | BPF_K] =	shl_imm64,
@@ -2867,6 +3002,8 @@ static const instr_cb_t instr_cb[256] = {
 	[BPF_ALU | BPF_ADD | BPF_K] =	add_imm,
 	[BPF_ALU | BPF_SUB | BPF_X] =	sub_reg,
 	[BPF_ALU | BPF_SUB | BPF_K] =	sub_imm,
+	[BPF_ALU | BPF_MUL | BPF_X] =	mul_reg,
+	[BPF_ALU | BPF_MUL | BPF_K] =	mul_imm,
 	[BPF_ALU | BPF_NEG] =		neg_reg,
 	[BPF_ALU | BPF_LSH | BPF_K] =	shl_imm,
 	[BPF_ALU | BPF_END | BPF_X] =	end_reg32,
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index c985d0ac61a3..c10079b1a312 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -394,6 +394,11 @@ static inline bool is_mbpf_xadd(const struct nfp_insn_meta *meta)
 	return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_XADD);
 }
 
+static inline bool is_mbpf_mul(const struct nfp_insn_meta *meta)
+{
+	return is_mbpf_alu(meta) && mbpf_op(meta) == BPF_MUL;
+}
+
 /**
  * struct nfp_prog - nfp BPF program
  * @bpf: backpointer to the bpf app priv structure
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index 7bd9666bd8ff..30d4f1580693 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -516,6 +516,51 @@ nfp_bpf_check_xadd(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 	return nfp_bpf_check_ptr(nfp_prog, meta, env, meta->insn.dst_reg);
 }
 
+static int
+nfp_bpf_check_alu(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
+		  struct bpf_verifier_env *env)
+{
+	const struct bpf_reg_state *sreg =
+		cur_regs(env) + meta->insn.src_reg;
+	const struct bpf_reg_state *dreg =
+		cur_regs(env) + meta->insn.dst_reg;
+
+	meta->umin_src = min(meta->umin_src, sreg->umin_value);
+	meta->umax_src = max(meta->umax_src, sreg->umax_value);
+	meta->umin_dst = min(meta->umin_dst, dreg->umin_value);
+	meta->umax_dst = max(meta->umax_dst, dreg->umax_value);
+
+	/* NFP supports u16 and u32 multiplication.
+	 *
+	 * For ALU64, if either operand is beyond u32's value range, we reject
+	 * it. One thing to note, if the source operand is BPF_K, then we need
+	 * to check "imm" field directly, and we'd reject it if it is negative.
+	 * Because for ALU64, "imm" (with s32 type) is expected to be sign
+	 * extended to s64 which NFP mul doesn't support.
+	 *
+	 * For ALU32, it is fine for "imm" be negative though, because the
+	 * result is 32-bits and there is no difference on the low halve of
+	 * the result for signed/unsigned mul, so we will get correct result.
+	 */
+	if (is_mbpf_mul(meta)) {
+		if (meta->umax_dst > U32_MAX) {
+			pr_vlog(env, "multiplier is not within u32 value range\n");
+			return -EINVAL;
+		}
+		if (mbpf_src(meta) == BPF_X && meta->umax_src > U32_MAX) {
+			pr_vlog(env, "multiplicand is not within u32 value range\n");
+			return -EINVAL;
+		}
+		if (mbpf_class(meta) == BPF_ALU64 &&
+		    mbpf_src(meta) == BPF_K && meta->insn.imm < 0) {
+			pr_vlog(env, "sign extended multiplicand won't be within u32 value range\n");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
 static int
 nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
 {
@@ -551,17 +596,8 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
 	if (is_mbpf_xadd(meta))
 		return nfp_bpf_check_xadd(nfp_prog, meta, env);
 
-	if (is_mbpf_alu(meta)) {
-		const struct bpf_reg_state *sreg =
-			cur_regs(env) + meta->insn.src_reg;
-		const struct bpf_reg_state *dreg =
-			cur_regs(env) + meta->insn.dst_reg;
-
-		meta->umin_src = min(meta->umin_src, sreg->umin_value);
-		meta->umax_src = max(meta->umax_src, sreg->umax_value);
-		meta->umin_dst = min(meta->umin_dst, dreg->umin_value);
-		meta->umax_dst = max(meta->umax_dst, dreg->umax_value);
-	}
+	if (is_mbpf_alu(meta))
+		return nfp_bpf_check_alu(nfp_prog, meta, env);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.h b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
index f6677bc9875a..cdc4e065f6f5 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_asm.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
@@ -426,4 +426,32 @@ static inline u32 nfp_get_ind_csr_ctx_ptr_offs(u32 read_offset)
 	return (read_offset & ~NFP_IND_ME_CTX_PTR_BASE_MASK) | NFP_CSR_CTX_PTR;
 }
 
+enum mul_type {
+	MUL_TYPE_START		= 0x00,
+	MUL_TYPE_STEP_24x8	= 0x01,
+	MUL_TYPE_STEP_16x16	= 0x02,
+	MUL_TYPE_STEP_32x32	= 0x03,
+};
+
+enum mul_step {
+	MUL_STEP_1		= 0x00,
+	MUL_STEP_NONE		= MUL_STEP_1,
+	MUL_STEP_2		= 0x01,
+	MUL_STEP_3		= 0x02,
+	MUL_STEP_4		= 0x03,
+	MUL_LAST		= 0x04,
+	MUL_LAST_2		= 0x05,
+};
+
+#define OP_MUL_BASE		0x0f800000000ULL
+#define OP_MUL_A_SRC		0x000000003ffULL
+#define OP_MUL_B_SRC		0x000000ffc00ULL
+#define OP_MUL_STEP		0x00000700000ULL
+#define OP_MUL_DST_AB		0x00000800000ULL
+#define OP_MUL_SW		0x00040000000ULL
+#define OP_MUL_TYPE		0x00180000000ULL
+#define OP_MUL_WR_AB		0x20000000000ULL
+#define OP_MUL_SRC_LMEXTN	0x40000000000ULL
+#define OP_MUL_DST_LMEXTN	0x80000000000ULL
+
 #endif
-- 
2.17.1

^ permalink raw reply related

* [PATCH bpf-next 4/7] nfp: bpf: copy range info for all operands of all ALU operations
From: Jakub Kicinski @ 2018-06-25  3:54 UTC (permalink / raw)
  To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, Jiong Wang
In-Reply-To: <20180625035421.2991-1-jakub.kicinski@netronome.com>

From: Jiong Wang <jiong.wang@netronome.com>

The NFP verifier hook is copying range information of the shift amount for
indirect shift operations so that optimized shift sequences can be generated.

We want to use range info to do more things. For example, to decide whether
multiplication and divide are supported on the given range.

This patch simply lets the NFP verifier hook copy range info for all operands
of all ALU operations.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.h | 33 +++++++------------
 .../net/ethernet/netronome/nfp/bpf/offload.c  |  4 ++-
 .../net/ethernet/netronome/nfp/bpf/verifier.c |  6 +++-
 3 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 5975a19c28cb..c985d0ac61a3 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -265,6 +265,8 @@ struct nfp_bpf_reg_state {
  * @arg2: arg2 for call instructions
  * @umin_src: copy of core verifier umin_value for src opearnd.
  * @umax_src: copy of core verifier umax_value for src operand.
+ * @umin_dst: copy of core verifier umin_value for dst opearnd.
+ * @umax_dst: copy of core verifier umax_value for dst operand.
  * @off: index of first generated machine instruction (in nfp_prog.prog)
  * @n: eBPF instruction number
  * @flags: eBPF instruction extra optimization flags
@@ -300,12 +302,15 @@ struct nfp_insn_meta {
 			struct bpf_reg_state arg1;
 			struct nfp_bpf_reg_state arg2;
 		};
-		/* We are interested in range info for some operands,
-		 * for example, the shift amount which is kept in src operand.
+		/* We are interested in range info for operands of ALU
+		 * operations. For example, shift amount, multiplicand and
+		 * multiplier etc.
 		 */
 		struct {
 			u64 umin_src;
 			u64 umax_src;
+			u64 umin_dst;
+			u64 umax_dst;
 		};
 	};
 	unsigned int off;
@@ -339,6 +344,11 @@ static inline u8 mbpf_mode(const struct nfp_insn_meta *meta)
 	return BPF_MODE(meta->insn.code);
 }
 
+static inline bool is_mbpf_alu(const struct nfp_insn_meta *meta)
+{
+	return mbpf_class(meta) == BPF_ALU64 || mbpf_class(meta) == BPF_ALU;
+}
+
 static inline bool is_mbpf_load(const struct nfp_insn_meta *meta)
 {
 	return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_LDX | BPF_MEM);
@@ -384,25 +394,6 @@ static inline bool is_mbpf_xadd(const struct nfp_insn_meta *meta)
 	return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_XADD);
 }
 
-static inline bool is_mbpf_indir_shift(const struct nfp_insn_meta *meta)
-{
-	u8 code = meta->insn.code;
-	bool is_alu, is_shift;
-	u8 opclass, opcode;
-
-	opclass = BPF_CLASS(code);
-	is_alu = opclass == BPF_ALU64 || opclass == BPF_ALU;
-	if (!is_alu)
-		return false;
-
-	opcode = BPF_OP(code);
-	is_shift = opcode == BPF_LSH || opcode == BPF_RSH || opcode == BPF_ARSH;
-	if (!is_shift)
-		return false;
-
-	return BPF_SRC(code) == BPF_X;
-}
-
 /**
  * struct nfp_prog - nfp BPF program
  * @bpf: backpointer to the bpf app priv structure
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 856a0003bb75..78f44c4d95b4 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -190,8 +190,10 @@ nfp_prog_prepare(struct nfp_prog *nfp_prog, const struct bpf_insn *prog,
 
 		meta->insn = prog[i];
 		meta->n = i;
-		if (is_mbpf_indir_shift(meta))
+		if (is_mbpf_alu(meta)) {
 			meta->umin_src = U64_MAX;
+			meta->umin_dst = U64_MAX;
+		}
 
 		list_add_tail(&meta->l, &nfp_prog->insns);
 	}
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index e862b739441f..7bd9666bd8ff 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -551,12 +551,16 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
 	if (is_mbpf_xadd(meta))
 		return nfp_bpf_check_xadd(nfp_prog, meta, env);
 
-	if (is_mbpf_indir_shift(meta)) {
+	if (is_mbpf_alu(meta)) {
 		const struct bpf_reg_state *sreg =
 			cur_regs(env) + meta->insn.src_reg;
+		const struct bpf_reg_state *dreg =
+			cur_regs(env) + meta->insn.dst_reg;
 
 		meta->umin_src = min(meta->umin_src, sreg->umin_value);
 		meta->umax_src = max(meta->umax_src, sreg->umax_value);
+		meta->umin_dst = min(meta->umin_dst, dreg->umin_value);
+		meta->umax_dst = max(meta->umax_dst, dreg->umax_value);
 	}
 
 	return 0;
-- 
2.17.1

^ permalink raw reply related

* [PATCH bpf-next 3/7] nfp: bpf: rename umin/umax to umin_src/umax_src
From: Jakub Kicinski @ 2018-06-25  3:54 UTC (permalink / raw)
  To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, Jiong Wang
In-Reply-To: <20180625035421.2991-1-jakub.kicinski@netronome.com>

From: Jiong Wang <jiong.wang@netronome.com>

The two fields are a copy of umin and umax info of bpf_insn->src_reg
generated by verifier.

Rename to make their meaning clear.

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c      | 12 ++++++------
 drivers/net/ethernet/netronome/nfp/bpf/main.h     | 10 +++++-----
 drivers/net/ethernet/netronome/nfp/bpf/offload.c  |  2 +-
 drivers/net/ethernet/netronome/nfp/bpf/verifier.c |  4 ++--
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 33111739b210..4a629e9b5c0f 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -1772,8 +1772,8 @@ static int shl_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	u8 dst, src;
 
 	dst = insn->dst_reg * 2;
-	umin = meta->umin;
-	umax = meta->umax;
+	umin = meta->umin_src;
+	umax = meta->umax_src;
 	if (umin == umax)
 		return __shl_imm64(nfp_prog, dst, umin);
 
@@ -1881,8 +1881,8 @@ static int shr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	u8 dst, src;
 
 	dst = insn->dst_reg * 2;
-	umin = meta->umin;
-	umax = meta->umax;
+	umin = meta->umin_src;
+	umax = meta->umax_src;
 	if (umin == umax)
 		return __shr_imm64(nfp_prog, dst, umin);
 
@@ -1995,8 +1995,8 @@ static int ashr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	u8 dst, src;
 
 	dst = insn->dst_reg * 2;
-	umin = meta->umin;
-	umax = meta->umax;
+	umin = meta->umin_src;
+	umax = meta->umax_src;
 	if (umin == umax)
 		return __ashr_imm64(nfp_prog, dst, umin);
 
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 654fe7823e5e..5975a19c28cb 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -263,8 +263,8 @@ struct nfp_bpf_reg_state {
  * @func_id: function id for call instructions
  * @arg1: arg1 for call instructions
  * @arg2: arg2 for call instructions
- * @umin: copy of core verifier umin_value.
- * @umax: copy of core verifier umax_value.
+ * @umin_src: copy of core verifier umin_value for src opearnd.
+ * @umax_src: copy of core verifier umax_value for src operand.
  * @off: index of first generated machine instruction (in nfp_prog.prog)
  * @n: eBPF instruction number
  * @flags: eBPF instruction extra optimization flags
@@ -301,11 +301,11 @@ struct nfp_insn_meta {
 			struct nfp_bpf_reg_state arg2;
 		};
 		/* We are interested in range info for some operands,
-		 * for example, the shift amount.
+		 * for example, the shift amount which is kept in src operand.
 		 */
 		struct {
-			u64 umin;
-			u64 umax;
+			u64 umin_src;
+			u64 umax_src;
 		};
 	};
 	unsigned int off;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 7eae4c0266f8..856a0003bb75 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -191,7 +191,7 @@ nfp_prog_prepare(struct nfp_prog *nfp_prog, const struct bpf_insn *prog,
 		meta->insn = prog[i];
 		meta->n = i;
 		if (is_mbpf_indir_shift(meta))
-			meta->umin = U64_MAX;
+			meta->umin_src = U64_MAX;
 
 		list_add_tail(&meta->l, &nfp_prog->insns);
 	}
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index 4bfeba7b21b2..e862b739441f 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -555,8 +555,8 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
 		const struct bpf_reg_state *sreg =
 			cur_regs(env) + meta->insn.src_reg;
 
-		meta->umin = min(meta->umin, sreg->umin_value);
-		meta->umax = max(meta->umax, sreg->umax_value);
+		meta->umin_src = min(meta->umin_src, sreg->umin_value);
+		meta->umax_src = max(meta->umax_src, sreg->umax_value);
 	}
 
 	return 0;
-- 
2.17.1

^ permalink raw reply related

* [PATCH bpf-next 2/7] lib: reciprocal_div: implement the improved algorithm on the paper mentioned
From: Jakub Kicinski @ 2018-06-25  3:54 UTC (permalink / raw)
  To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, Jiong Wang
In-Reply-To: <20180625035421.2991-1-jakub.kicinski@netronome.com>

From: Jiong Wang <jiong.wang@netronome.com>

The newly added "reciprocal_value_adv" implements the advanced version of the
algorithm described in Figure 4.2 of the paper, except when the divisor has its
MSB set, which would require a u128 divide on the host; that case can easily be
handled before calling the new "reciprocal_value_adv".

The advanced version requires more complex calculation to get the
reciprocal multiplier and other control variables, but then could reduce
the required emulation operations.

It makes no sense to use this advanced version for host divide emulation:
the extra complexity of calculating the multiplier etc. could completely
cancel out our saving on emulation operations.

However, it makes sense to use it for JIT divide code generation (for
example eBPF JIT backends), where we are willing to spend extra time on the
host to get faster JITed code. As shown by the following pseudo code, the
required emulation operations could go down from 6 (the basic version) to 3
or 4.

To use the result of "reciprocal_value_adv", suppose we want to calculate
n/d. The C-style pseudo code is shown below; it can easily be turned into
real code generation for other JIT targets.

  struct reciprocal_value_adv rvalue;
  u8 pre_shift, exp;

  if (d >= (1u << 31)) {
    result = n >= d;
    return;
  }
  rvalue = reciprocal_value_adv(d, 32)
  exp = rvalue.exp;
  if (rvalue.is_wide_m && !(d & 1)) {
    pre_shift = fls(d & -d) - 1;
    rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
  } else {
    pre_shift = 0;
  }

  // code generation starts.
  if (d == 1 << exp) {
    result = n >> exp;
  } else if (rvalue.is_wide_m) {
    // pre_shift must be zero when reached here.
    t = (n * rvalue.m) >> 32;
    result = n - t;
    result >>= 1;
    result += t;
    result >>= rvalue.sh - 1;
  } else {
    if (pre_shift)
      result = n >> pre_shift;
    result = ((u64)result * rvalue.m) >> 32;
    result >>= rvalue.sh;
  }

Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 include/linux/reciprocal_div.h | 65 ++++++++++++++++++++++++++++++++++
 lib/reciprocal_div.c           | 37 +++++++++++++++++++
 2 files changed, 102 insertions(+)

diff --git a/include/linux/reciprocal_div.h b/include/linux/reciprocal_div.h
index e031e9f2f9d8..5a695e4697d3 100644
--- a/include/linux/reciprocal_div.h
+++ b/include/linux/reciprocal_div.h
@@ -25,6 +25,9 @@ struct reciprocal_value {
 	u8 sh1, sh2;
 };
 
+/* "reciprocal_value" and "reciprocal_divide" together implement the basic
+ * version of the algorithm described in Figure 4.1 of the paper.
+ */
 struct reciprocal_value reciprocal_value(u32 d);
 
 static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
@@ -33,4 +36,66 @@ static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
 	return (t + ((a - t) >> R.sh1)) >> R.sh2;
 }
 
+struct reciprocal_value_adv {
+	u32 m;
+	u8 sh, exp;
+	bool is_wide_m;
+};
+
+/* "reciprocal_value_adv" implements the advanced version of the algorithm
+ * described in Figure 4.2 of the paper except when dividend has MSB set which
+ * would require u128 divide on host and actually could be easily handled before
+ * calling "reciprocal_value_adv".
+ *
+ * The advanced version requires more complex calculation to get the reciprocal
+ * multiplier and other control variables, but then could reduce the required
+ * emulation operations.
+ *
+ * It makes no sense to use this advanced version for host divide emulation,
+ * those extra complexities for calculating multiplier etc could completely
+ * waive our saving on emulation operations.
+ *
+ * However, it makes sense to use it for JIT divide code generation for which
+ * we are willing to trade performance of JITed code with that of host. As shown
+ * by the following pseudo code, the required emulation operations could go down
+ * from 6 (the basic version) to 3 or 4.
+ *
+ * To use the result of "reciprocal_value_adv", suppose we want to calculate
+ * n/d:
+ *
+ *   struct reciprocal_value_adv rvalue;
+ *   u8 pre_shift, exp;
+ *
+ *   if (d >= (1u << 31)) {
+ *     result = n >= d;
+ *     return;
+ *   }
+ *   rvalue = reciprocal_value_adv(d, 32)
+ *   exp = rvalue.exp;
+ *   if (rvalue.is_wide_m && !(d & 1)) {
+ *     pre_shift = fls(d & -d) - 1;
+ *     rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
+ *   } else {
+ *     pre_shift = 0;
+ *   }
+ *
+ *   // code generation starts.
+ *   if (imm == 1 << exp) {
+ *     result = n >> exp;
+ *   } else if (rvalue.is_wide_m) {
+ *     // pre_shift must be zero when reached here.
+ *     t = (n * rvalue.m) >> 32;
+ *     result = n - t;
+ *     result >>= 1;
+ *     result += t;
+ *     result >>= rvalue.sh - 1;
+ *   } else {
+ *     if (pre_shift)
+ *       result = n >> pre_shift;
+ *     result = ((u64)result * rvalue.m) >> 32;
+ *     result >>= rvalue.sh;
+ *   }
+ */
+struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec);
+
 #endif /* _LINUX_RECIPROCAL_DIV_H */
diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c
index fcb4ce682c6f..a41501ebad7c 100644
--- a/lib/reciprocal_div.c
+++ b/lib/reciprocal_div.c
@@ -26,3 +26,40 @@ struct reciprocal_value reciprocal_value(u32 d)
 	return R;
 }
 EXPORT_SYMBOL(reciprocal_value);
+
+struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec)
+{
+	struct reciprocal_value_adv R;
+	u32 l, post_shift;
+	u64 mhigh, mlow;
+
+	l = fls(d - 1);
+	post_shift = l;
+	/* NOTE: mlow/mhigh could overflow u64 when l == 32 which means d has
+	 * MSB set. This case needs to be handled before calling
+	 * "reciprocal_value_adv", please see the comment at
+	 * include/linux/reciprocal_div.h.
+	 */
+	mlow = 1ULL << (32 + l);
+	do_div(mlow, d);
+	mhigh = (1ULL << (32 + l)) + (1ULL << (32 + l - prec));
+	do_div(mhigh, d);
+
+	for (; post_shift > 0; post_shift--) {
+		u64 lo = mlow >> 1, hi = mhigh >> 1;
+
+		if (lo >= hi)
+			break;
+
+		mlow = lo;
+		mhigh = hi;
+	}
+
+	R.m = (u32)mhigh;
+	R.sh = post_shift;
+	R.exp = l;
+	R.is_wide_m = mhigh > U32_MAX;
+
+	return R;
+}
+EXPORT_SYMBOL(reciprocal_value_adv);
-- 
2.17.1

^ permalink raw reply related

* [PATCH bpf-next 1/7] nfp: bpf: allow source ptr type be map ptr in memcpy optimization
From: Jakub Kicinski @ 2018-06-25  3:54 UTC (permalink / raw)
  To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, Jiong Wang
In-Reply-To: <20180625035421.2991-1-jakub.kicinski@netronome.com>

From: Jiong Wang <jiong.wang@netronome.com>

Map reads are already supported on NFP; this patch enables the memcpy
optimization for copies from a map to the packet.

This patch also fixes a latent bug which would cause copying from an
unexpected address once memcpy for map pointers is enabled.

Reported-by: Mary Pham <mary.pham@netronome.com>
Reported-by: David Beckett <david.beckett@netronome.com>
Signed-off-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/bpf/jit.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 8a92088df0d7..33111739b210 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -670,7 +670,7 @@ static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	xfer_num = round_up(len, 4) / 4;
 
 	if (src_40bit_addr)
-		addr40_offset(nfp_prog, meta->insn.src_reg, off, &src_base,
+		addr40_offset(nfp_prog, meta->insn.src_reg * 2, off, &src_base,
 			      &off);
 
 	/* Setup PREV_ALU fields to override memory read length. */
@@ -3299,7 +3299,8 @@ curr_pair_is_memcpy(struct nfp_insn_meta *ld_meta,
 	if (!is_mbpf_load(ld_meta) || !is_mbpf_store(st_meta))
 		return false;
 
-	if (ld_meta->ptr.type != PTR_TO_PACKET)
+	if (ld_meta->ptr.type != PTR_TO_PACKET &&
+	    ld_meta->ptr.type != PTR_TO_MAP_VALUE)
 		return false;
 
 	if (st_meta->ptr.type != PTR_TO_PACKET)
-- 
2.17.1

^ permalink raw reply related

* [PATCH bpf-next 0/7] nfp: bpf: add multiplication, divide and memcpy from maps
From: Jakub Kicinski @ 2018-06-25  3:54 UTC (permalink / raw)
  To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, Jakub Kicinski

Hi!

This set enables memcpy optimization when the source is a map pointer.
The rest adds multiplication and divide support, which Jiong describes
as follows:

NFP supports u16 and u32 multiplication. Multiplication is done 8-bits per
step, therefore we need 2 steps for u16 and 4 steps for u32.

We also need one start instruction to initialize the sequence and one or
two instructions to fetch the result, depending on whether you need the high
half of the u32 multiplication result.

For ALU64, if either operand is beyond u32's value range, we reject it. One
thing to note: if the source operand is BPF_K, then we need to check the "imm"
field directly, and we reject it if it is negative, because for ALU64, "imm"
(with s32 type) is expected to be sign extended to s64, which NFP mul doesn't
support. For ALU32, it is fine for "imm" to be negative though, because the
result is 32 bits and there is no difference in the low half of the result
between signed and unsigned mul, so we will get the correct result.

NFP doesn't have integer divide instruction, this patch set uses reciprocal
algorithm (the basic one, reciprocal_div) to emulate it.

For each u32 divide, we would need 11 instructions to finish the operation.

  7 (for multiplication) + 4 (various ALUs) = 11

Given NFP only supports multiplication no bigger than u32, we'd require
divisor and dividend no bigger than that as well.

Also, eBPF doesn't support signed divide and enforces this at the C language
level by failing compilation. However, the LLVM assembler doesn't enforce this,
so it is possible for a negative constant to leak in as a BPF_K operand
through assembly code; we reject such cases as well.

Meanwhile reciprocal_div.h only implemented the basic version of:

  "Division by Invariant Integers Using Multiplication"
                        - Torbjörn Granlund and Peter L. Montgomery

This patch set further implements the optimized version (Figure 4.2 in the
paper) inside the existing reciprocal_div.h. When the divisor is even and the
calculated reciprocal magic number doesn't fit in u32, we can reduce the
required ALU instructions from 4 to 2 or 1 in some cases.

The advanced version requires more complex calculation to get the
reciprocal multiplier and other control variables, but can then reduce
the required emulation operations. It makes sense to use it for JIT divide
code generation (for example eBPF JIT backends), where we are willing to
spend extra time on the host to get faster JITed code.

Jiong Wang (7):
  nfp: bpf: allow source ptr type be map ptr in memcpy optimization
  lib: reciprocal_div: implement the improved algorithm on the paper
    mentioned
  nfp: bpf: rename umin/umax to umin_src/umax_src
  nfp: bpf: copy range info for all operands of all ALU operations
  nfp: bpf: support u16 and u32 multiplications
  nfp: bpf: support u32 divide using reciprocal_div.h
  nfp: bpf: migrate to advanced reciprocal divide in reciprocal_div.h

 drivers/net/ethernet/netronome/nfp/bpf/jit.c  | 232 +++++++++++++++++-
 drivers/net/ethernet/netronome/nfp/bpf/main.h |  43 ++--
 .../net/ethernet/netronome/nfp/bpf/offload.c  |   6 +-
 .../net/ethernet/netronome/nfp/bpf/verifier.c |  95 ++++++-
 drivers/net/ethernet/netronome/nfp/nfp_asm.h  |  28 +++
 include/linux/reciprocal_div.h                |  65 +++++
 lib/reciprocal_div.c                          |  37 +++
 7 files changed, 467 insertions(+), 39 deletions(-)

-- 
2.17.1

^ permalink raw reply

* Re: [patch net-next 2/3] nfp: handle cls_flower command default case
From: Jakub Kicinski @ 2018-06-25  3:11 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, simon.horman, john.hurley, pieter.jansenvanvuuren,
	oss-drivers, michael.chan, intel-wired-lan, mlxsw
In-Reply-To: <20180624083839.1692-3-jiri@resnulli.us>

On Sun, 24 Jun 2018 10:38:38 +0200, Jiri Pirko wrote:
> From: Jiri Pirko <jiri@mellanox.com>
> 
> Currently the default case is not handled, which with future command
> introductions would introduce a warning. So handle it.
> 
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>

Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>

^ permalink raw reply

* Re: [PATCH net-next] route: add support for directed broadcast forwarding
From: Xin Long @ 2018-06-25  2:47 UTC (permalink / raw)
  To: network dev; +Cc: davem, David Ahern
In-Reply-To: <671e900d14a124f1de7785ee44c437d0826b9e2a.1529894708.git.lucien.xin@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 5443 bytes --]

On Mon, Jun 25, 2018 at 10:45 AM, Xin Long <lucien.xin@gmail.com> wrote:
> This patch implements the feature described in rfc1812#section-5.3.5.2
> and rfc2644. It allows the router to forward directed broadcasts when
> sysctl bc_forwarding is enabled.
>
> Note that this feature could be done by iptables -j TEE, but it would
> cause some problems:
>   - target TEE's gateway param has to be set to a specific address,
>     and it's not flexible, especially when the router wants to forward all
>     directed broadcasts.
>   - this duplicates the directed broadcasts, so this may cause side
>     effects for applications.
>
> Besides, to keep consistent with other os router like BSD, it's also
> necessary to implement it in the route rx path.
>
> Signed-off-by: Xin Long <lucien.xin@gmail.com>
> ---
>  include/linux/inetdevice.h   | 1 +
>  include/uapi/linux/ip.h      | 1 +
>  include/uapi/linux/netconf.h | 1 +
>  net/ipv4/devinet.c           | 7 +++++++
>  net/ipv4/route.c             | 6 +++++-
>  5 files changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
> index 27650f1..c759d1c 100644
> --- a/include/linux/inetdevice.h
> +++ b/include/linux/inetdevice.h
> @@ -93,6 +93,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
>
>  #define IN_DEV_FORWARD(in_dev)         IN_DEV_CONF_GET((in_dev), FORWARDING)
>  #define IN_DEV_MFORWARD(in_dev)                IN_DEV_ANDCONF((in_dev), MC_FORWARDING)
> +#define IN_DEV_BFORWARD(in_dev)                IN_DEV_ANDCONF((in_dev), BC_FORWARDING)
>  #define IN_DEV_RPFILTER(in_dev)                IN_DEV_MAXCONF((in_dev), RP_FILTER)
>  #define IN_DEV_SRC_VMARK(in_dev)       IN_DEV_ORCONF((in_dev), SRC_VMARK)
>  #define IN_DEV_SOURCE_ROUTE(in_dev)    IN_DEV_ANDCONF((in_dev), \
> diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h
> index b24a742..2b756b5 100644
> --- a/include/uapi/linux/ip.h
> +++ b/include/uapi/linux/ip.h
> @@ -139,6 +139,7 @@ enum
>  {
>         IPV4_DEVCONF_FORWARDING=1,
>         IPV4_DEVCONF_MC_FORWARDING,
> +       IPV4_DEVCONF_BC_FORWARDING,
>         IPV4_DEVCONF_PROXY_ARP,
>         IPV4_DEVCONF_ACCEPT_REDIRECTS,
>         IPV4_DEVCONF_SECURE_REDIRECTS,
> diff --git a/include/uapi/linux/netconf.h b/include/uapi/linux/netconf.h
> index c84fcdf..a5cd70e 100644
> --- a/include/uapi/linux/netconf.h
> +++ b/include/uapi/linux/netconf.h
> @@ -15,6 +15,7 @@ enum {
>         NETCONFA_FORWARDING,
>         NETCONFA_RP_FILTER,
>         NETCONFA_MC_FORWARDING,
> +       NETCONFA_BC_FORWARDING,
>         NETCONFA_PROXY_NEIGH,
>         NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
>         NETCONFA_INPUT,
> diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
> index d7585ab..ea30ab6 100644
> --- a/net/ipv4/devinet.c
> +++ b/net/ipv4/devinet.c
> @@ -1827,6 +1827,8 @@ static int inet_netconf_msgsize_devconf(int type)
>                 size += nla_total_size(4);
>         if (all || type == NETCONFA_MC_FORWARDING)
>                 size += nla_total_size(4);
> +       if (all || type == NETCONFA_BC_FORWARDING)
> +               size += nla_total_size(4);
>         if (all || type == NETCONFA_PROXY_NEIGH)
>                 size += nla_total_size(4);
>         if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
> @@ -1873,6 +1875,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
>             nla_put_s32(skb, NETCONFA_MC_FORWARDING,
>                         IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
>                 goto nla_put_failure;
> +       if ((all || type == NETCONFA_BC_FORWARDING) &&
> +           nla_put_s32(skb, NETCONFA_BC_FORWARDING,
> +                       IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0)
> +               goto nla_put_failure;
>         if ((all || type == NETCONFA_PROXY_NEIGH) &&
>             nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
>                         IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
> @@ -2259,6 +2265,7 @@ static struct devinet_sysctl_table {
>                 DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
>                                              devinet_sysctl_forward),
>                 DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
> +               DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"),
>
>                 DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
>                 DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 1df6e97..b678466 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -1996,8 +1996,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
>                 goto no_route;
>         }
>
> -       if (res->type == RTN_BROADCAST)
> +       if (res->type == RTN_BROADCAST) {
> +               if (IN_DEV_BFORWARD(in_dev))
> +                       goto make_route;
>                 goto brd_input;
> +       }
>
>         if (res->type == RTN_LOCAL) {
>                 err = fib_validate_source(skb, saddr, daddr, tos,
> @@ -2014,6 +2017,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
>         if (res->type != RTN_UNICAST)
>                 goto martian_destination;
>
> +make_route:
>         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
>  out:   return err;
>
> --
> 2.1.0
>
Attached are some testing scripts.

[-- Attachment #2: bc_fwd.sh --]
[-- Type: text/x-sh, Size: 2555 bytes --]

#!/bin/bash
#
# Builds a five-namespace test bed (three hosts, two routers) out of veth
# pairs, turns on the bc_forwarding sysctl on both routers, and sends one
# ping from host1 to a subnet broadcast address behind RTR2.
# Requires the ip, brctl, ifconfig, sysctl and ping commands used below.

# TOPO:
# host1 172.16.1.1/24 <-> .254/24 RTR  .254/24 <-> 192.168.1.1/24 host2
#                     <-> .253/24 RTR2 .254/24 <-> 192.168.2.1/24 host3

# Delete any namespaces left over from a previous run (errors are discarded),
# then create them afresh.
netns_list="host1 host2 host3 RTR RTR2"
for i in $netns_list; do
	ip netns del $i > /dev/null 2>&1
done
for i in $netns_list; do
	ip netns add $i
done
# One veth pair per link in the topology; each end is then moved into the
# namespace its name is prefixed with.
ip link add host1_eth1 type veth peer name RTR_eth1
ip link add host1_eth2 type veth peer name RTR2_eth1
ip link add host2_eth1 type veth peer name RTR_eth2
ip link add host3_eth1 type veth peer name RTR2_eth2
ip link set RTR_eth1 netns RTR
ip link set RTR2_eth1 netns RTR2
ip link set RTR_eth2 netns RTR
ip link set RTR2_eth2 netns RTR2
ip link set host1_eth1 netns host1
ip link set host1_eth2 netns host1
ip link set host2_eth1 netns host2
ip link set host3_eth1 netns host3

# host1 bridges its two uplinks and carries its address on the bridge, so
# both routers sit on the same 172.16.1.0/24 segment as host1.
ip netns exec host1 brctl addbr host1_br0
ip netns exec host1 brctl addif host1_br0 host1_eth1
ip netns exec host1 brctl addif host1_br0 host1_eth2
ip netns exec host1 ifconfig host1_br0 172.16.1.1/24 up
ip netns exec host1 ip link set host1_eth1 up
ip netns exec host1 ip link set host1_eth2 up

# Address the router interfaces and enable IPv4 forwarding on both routers.
ip netns exec RTR ifconfig RTR_eth1 172.16.1.254/24 up
ip netns exec RTR2 ifconfig RTR2_eth1 172.16.1.253/24 up
ip netns exec RTR ifconfig RTR_eth2 192.168.1.254/24 up
ip netns exec RTR2 ifconfig RTR2_eth2 192.168.2.254/24 up
ip netns exec RTR sysctl -w net.ipv4.conf.all.forwarding=1
ip netns exec RTR2 sysctl -w net.ipv4.conf.all.forwarding=1

# Set bc_forwarding on each router's host1-facing interface and on "all".
# NOTE(review): presumably both are required for forwarding to take effect;
# confirm against the kernel's handling of this sysctl.
ip netns exec RTR sysctl -w net.ipv4.conf.RTR_eth1.bc_forwarding=1
ip netns exec RTR2 sysctl -w net.ipv4.conf.RTR2_eth1.bc_forwarding=1
ip netns exec RTR sysctl -w net.ipv4.conf.all.bc_forwarding=1
ip netns exec RTR2 sysctl -w net.ipv4.conf.all.bc_forwarding=1

ip netns exec host2 ifconfig host2_eth1 192.168.1.1/24 up
ip netns exec host3 ifconfig host3_eth1 192.168.2.1/24 up

# host1 gets a multipath default route via both routers; host2 and host3
# each default to their own router.
ip netns exec host1 ip route add default nexthop via 172.16.1.254 nexthop via 172.16.1.253
ip netns exec host2 ip route add default via 192.168.1.254
ip netns exec host3 ip route add default via 192.168.2.254

# Clear icmp_echo_ignore_broadcasts on the far-side hosts and the routers.
ip netns exec host2 sysctl -w net.ipv4.icmp_echo_ignore_broadcasts=0
ip netns exec host3 sysctl -w net.ipv4.icmp_echo_ignore_broadcasts=0
ip netns exec RTR sysctl -w net.ipv4.icmp_echo_ignore_broadcasts=0
ip netns exec RTR2 sysctl -w net.ipv4.icmp_echo_ignore_broadcasts=0

# Active test: one echo request to the broadcast address of host3's /24.
# The commented-out commands below are optional manual checks (packet capture
# on the target host, the host2 subnet broadcast, and the limited broadcast).
ip netns exec host1 ping 192.168.2.255 -c 1
# ip netns exec host3 tcpdump -i host3_eth1 -p icmp -nn

# ip netns exec host1 ping 192.168.1.255 -c 1
# ip netns exec host2 tcpdump -i host2_eth1 -p icmp -nn

# ip netns exec host1 ping 255.255.255.255 -c 1


^ permalink raw reply

* [PATCH net-next] route: add support for directed broadcast forwarding
From: Xin Long @ 2018-06-25  2:45 UTC (permalink / raw)
  To: network dev; +Cc: davem, David Ahern

This patch implements the feature described in rfc1812#section-5.3.5.2
and rfc2644. It allows the router to forward directed broadcast when
sysctl bc_forwarding is enabled.

Note that this feature could be done by iptables -j TEE, but it would
cause some problems:
  - target TEE's gateway param has to be set with a specific address,
    and it's not flexible especially when the router wants to forward all
    directed broadcasts.
  - this duplicates the directed broadcasts so this may cause side
    effects to applications.

Besides, to stay consistent with other OS routers such as BSD, it's also
necessary to implement it in the route rx path.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 include/linux/inetdevice.h   | 1 +
 include/uapi/linux/ip.h      | 1 +
 include/uapi/linux/netconf.h | 1 +
 net/ipv4/devinet.c           | 7 +++++++
 net/ipv4/route.c             | 6 +++++-
 5 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 27650f1..c759d1c 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -93,6 +93,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
 
 #define IN_DEV_FORWARD(in_dev)		IN_DEV_CONF_GET((in_dev), FORWARDING)
 #define IN_DEV_MFORWARD(in_dev)		IN_DEV_ANDCONF((in_dev), MC_FORWARDING)
+#define IN_DEV_BFORWARD(in_dev)		IN_DEV_ANDCONF((in_dev), BC_FORWARDING)
 #define IN_DEV_RPFILTER(in_dev)		IN_DEV_MAXCONF((in_dev), RP_FILTER)
 #define IN_DEV_SRC_VMARK(in_dev)    	IN_DEV_ORCONF((in_dev), SRC_VMARK)
 #define IN_DEV_SOURCE_ROUTE(in_dev)	IN_DEV_ANDCONF((in_dev), \
diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h
index b24a742..2b756b5 100644
--- a/include/uapi/linux/ip.h
+++ b/include/uapi/linux/ip.h
@@ -139,6 +139,7 @@ enum
 {
 	IPV4_DEVCONF_FORWARDING=1,
 	IPV4_DEVCONF_MC_FORWARDING,
+	IPV4_DEVCONF_BC_FORWARDING,
 	IPV4_DEVCONF_PROXY_ARP,
 	IPV4_DEVCONF_ACCEPT_REDIRECTS,
 	IPV4_DEVCONF_SECURE_REDIRECTS,
diff --git a/include/uapi/linux/netconf.h b/include/uapi/linux/netconf.h
index c84fcdf..a5cd70e 100644
--- a/include/uapi/linux/netconf.h
+++ b/include/uapi/linux/netconf.h
@@ -15,6 +15,7 @@ enum {
 	NETCONFA_FORWARDING,
 	NETCONFA_RP_FILTER,
 	NETCONFA_MC_FORWARDING,
+	NETCONFA_BC_FORWARDING,
 	NETCONFA_PROXY_NEIGH,
 	NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
 	NETCONFA_INPUT,
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d7585ab..ea30ab6 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1827,6 +1827,8 @@ static int inet_netconf_msgsize_devconf(int type)
 		size += nla_total_size(4);
 	if (all || type == NETCONFA_MC_FORWARDING)
 		size += nla_total_size(4);
+	if (all || type == NETCONFA_BC_FORWARDING)
+		size += nla_total_size(4);
 	if (all || type == NETCONFA_PROXY_NEIGH)
 		size += nla_total_size(4);
 	if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
@@ -1873,6 +1875,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
 	    nla_put_s32(skb, NETCONFA_MC_FORWARDING,
 			IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
 		goto nla_put_failure;
+	if ((all || type == NETCONFA_BC_FORWARDING) &&
+	    nla_put_s32(skb, NETCONFA_BC_FORWARDING,
+			IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0)
+		goto nla_put_failure;
 	if ((all || type == NETCONFA_PROXY_NEIGH) &&
 	    nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
 			IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
@@ -2259,6 +2265,7 @@ static struct devinet_sysctl_table {
 		DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
 					     devinet_sysctl_forward),
 		DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
+		DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"),
 
 		DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
 		DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1df6e97..b678466 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1996,8 +1996,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		goto no_route;
 	}
 
-	if (res->type == RTN_BROADCAST)
+	if (res->type == RTN_BROADCAST) {
+		if (IN_DEV_BFORWARD(in_dev))
+			goto make_route;
 		goto brd_input;
+	}
 
 	if (res->type == RTN_LOCAL) {
 		err = fib_validate_source(skb, saddr, daddr, tos,
@@ -2014,6 +2017,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	if (res->type != RTN_UNICAST)
 		goto martian_destination;
 
+make_route:
 	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
 out:	return err;
 
-- 
2.1.0

^ permalink raw reply related

* [PATCH net-next 5/5] sctp: check for ipv6_pinfo legal sndflow with flowlabel in sctp_v6_get_dst
From: Xin Long @ 2018-06-25  2:14 UTC (permalink / raw)
  To: network dev, linux-sctp; +Cc: Marcelo Ricardo Leitner, Neil Horman, davem
In-Reply-To: <cover.1529892764.git.lucien.xin@gmail.com>

A transport with an illegal flowlabel should not be allowed to send
packets. Other transport protocols already deny this.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 net/sctp/ipv6.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 772513d..d83ddc4 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -262,6 +262,15 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
 	if (t->flowlabel & SCTP_FLOWLABEL_SET_MASK)
 		fl6->flowlabel = htonl(t->flowlabel & SCTP_FLOWLABEL_VAL_MASK);
 
+	if (np->sndflow && (fl6->flowlabel & IPV6_FLOWLABEL_MASK)) {
+		struct ip6_flowlabel *flowlabel;
+
+		flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
+		if (!flowlabel)
+			goto out;
+		fl6_sock_release(flowlabel);
+	}
+
 	pr_debug("%s: dst=%pI6 ", __func__, &fl6->daddr);
 
 	if (asoc)
-- 
2.1.0

^ permalink raw reply related

* [PATCH net-next 4/5] sctp: add support for setting flowlabel when adding a transport
From: Xin Long @ 2018-06-25  2:14 UTC (permalink / raw)
  To: network dev, linux-sctp; +Cc: Marcelo Ricardo Leitner, Neil Horman, davem
In-Reply-To: <cover.1529892764.git.lucien.xin@gmail.com>

Struct sockaddr_in6 has the member sin6_flowinfo that includes the
ipv6 flowlabel, so setting the flowlabel should also be supported when
adding a transport whose ipaddr is from userspace.

Note that addrinfo in sctp_sendmsg is using struct in6_addr for
the secondary addrs, which doesn't contain sin6_flowinfo, and
it needs to copy sin6_flowinfo from the primary addr.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 net/sctp/associola.c | 12 ++++++++++--
 net/sctp/socket.c    |  5 +++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 16ecfbc..297d9cf 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -650,8 +650,16 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 	peer->sackdelay = asoc->sackdelay;
 	peer->sackfreq = asoc->sackfreq;
 
-	if (addr->sa.sa_family == AF_INET6)
-		peer->flowlabel = asoc->flowlabel;
+	if (addr->sa.sa_family == AF_INET6) {
+		__be32 info = addr->v6.sin6_flowinfo;
+
+		if (info) {
+			peer->flowlabel = ntohl(info & IPV6_FLOWLABEL_MASK);
+			peer->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+		} else {
+			peer->flowlabel = asoc->flowlabel;
+		}
+	}
 	peer->dscp = asoc->dscp;
 
 	/* Enable/disable heartbeat, SACK delay, and path MTU discovery
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 857de62..1df5d07 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1697,6 +1697,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
 	struct sctp_association *asoc;
 	enum sctp_scope scope;
 	struct cmsghdr *cmsg;
+	__be32 flowinfo = 0;
 	struct sctp_af *af;
 	int err;
 
@@ -1781,6 +1782,9 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
 	if (!cmsgs->addrs_msg)
 		return 0;
 
+	if (daddr->sa.sa_family == AF_INET6)
+		flowinfo = daddr->v6.sin6_flowinfo;
+
 	/* sendv addr list parse */
 	for_each_cmsghdr(cmsg, cmsgs->addrs_msg) {
 		struct sctp_transport *transport;
@@ -1813,6 +1817,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
 			}
 
 			dlen = sizeof(struct in6_addr);
+			daddr->v6.sin6_flowinfo = flowinfo;
 			daddr->v6.sin6_family = AF_INET6;
 			daddr->v6.sin6_port = htons(asoc->peer.port);
 			memcpy(&daddr->v6.sin6_addr, CMSG_DATA(cmsg), dlen);
-- 
2.1.0

^ permalink raw reply related

* [PATCH net-next 3/5] sctp: add spp_ipv6_flowlabel and spp_dscp for sctp_paddrparams
From: Xin Long @ 2018-06-25  2:14 UTC (permalink / raw)
  To: network dev, linux-sctp; +Cc: Marcelo Ricardo Leitner, Neil Horman, davem
In-Reply-To: <cover.1529892764.git.lucien.xin@gmail.com>

spp_ipv6_flowlabel and spp_dscp are added in sctp_paddrparams in
this patch so that users could set sctp_sock/asoc/transport dscp
and flowlabel with spp_flags SPP_IPV6_FLOWLABEL or SPP_DSCP by
SCTP_PEER_ADDR_PARAMS, as described in section 8.1.12 of RFC6458.

As said in last patch, it uses '| 0x100000' or '|0x1' to mark
flowlabel or dscp is set,  so that their values could be set
to 0.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 include/uapi/linux/sctp.h |   4 ++
 net/sctp/socket.c         | 152 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 156 insertions(+)

diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index c02986a..b479db5 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -763,6 +763,8 @@ enum  sctp_spp_flags {
 	SPP_SACKDELAY_DISABLE = 1<<6,	/*Disable SACK*/
 	SPP_SACKDELAY = SPP_SACKDELAY_ENABLE | SPP_SACKDELAY_DISABLE,
 	SPP_HB_TIME_IS_ZERO = 1<<7,	/* Set HB delay to 0 */
+	SPP_IPV6_FLOWLABEL = 1<<8,
+	SPP_DSCP = 1<<9,
 };
 
 struct sctp_paddrparams {
@@ -773,6 +775,8 @@ struct sctp_paddrparams {
 	__u32			spp_pathmtu;
 	__u32			spp_sackdelay;
 	__u32			spp_flags;
+	__u32			spp_ipv6_flowlabel;
+	__u8			spp_dscp;
 } __attribute__((packed, aligned(4)));
 
 /*
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index bf11f9c..857de62 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2393,6 +2393,8 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
  *     uint32_t                spp_pathmtu;
  *     uint32_t                spp_sackdelay;
  *     uint32_t                spp_flags;
+ *     uint32_t                spp_ipv6_flowlabel;
+ *     uint8_t                 spp_dscp;
  * };
  *
  *   spp_assoc_id    - (one-to-many style socket) This is filled in the
@@ -2472,6 +2474,45 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
  *                     also that this field is mutually exclusive to
  *                     SPP_SACKDELAY_ENABLE, setting both will have undefined
  *                     results.
+ *
+ *                     SPP_IPV6_FLOWLABEL:  Setting this flag enables the
+ *                     setting of the IPV6 flow label value.  The value is
+ *                     contained in the spp_ipv6_flowlabel field.
+ *                     Upon retrieval, this flag will be set to indicate that
+ *                     the spp_ipv6_flowlabel field has a valid value returned.
+ *                     If a specific destination address is set (in the
+ *                     spp_address field), then the value returned is that of
+ *                     the address.  If just an association is specified (and
+ *                     no address), then the association's default flow label
+ *                     is returned.  If neither an association nor a destination
+ *                     is specified, then the socket's default flow label is
+ *                     returned.  For non-IPv6 sockets, this flag will be left
+ *                     cleared.
+ *
+ *                     SPP_DSCP:  Setting this flag enables the setting of the
+ *                     Differentiated Services Code Point (DSCP) value
+ *                     associated with either the association or a specific
+ *                     address.  The value is obtained in the spp_dscp field.
+ *                     Upon retrieval, this flag will be set to indicate that
+ *                     the spp_dscp field has a valid value returned.  If a
+ *                     specific destination address is set when called (in the
+ *                     spp_address field), then that specific destination
+ *                     address's DSCP value is returned.  If just an association
+ *                     is specified, then the association's default DSCP is
+ *                     returned.  If neither an association nor a destination is
+ *                     specified, then the socket's default DSCP is returned.
+ *
+ *   spp_ipv6_flowlabel
+ *                   - This field is used in conjunction with the
+ *                     SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label.
+ *                     The 20 least significant bits are used for the flow
+ *                     label.  This setting has precedence over any IPv6-layer
+ *                     setting.
+ *
+ *   spp_dscp        - This field is used in conjunction with the SPP_DSCP flag
+ *                     and contains the DSCP.  The 6 most significant bits are
+ *                     used for the DSCP.  This setting has precedence over any
+ *                     IPv4- or IPv6- layer setting.
  */
 static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
 				       struct sctp_transport   *trans,
@@ -2611,6 +2652,51 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params,
 		}
 	}
 
+	if (params->spp_flags & SPP_IPV6_FLOWLABEL) {
+		if (trans && trans->ipaddr.sa.sa_family == AF_INET6) {
+			trans->flowlabel = params->spp_ipv6_flowlabel &
+					   SCTP_FLOWLABEL_VAL_MASK;
+			trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+		} else if (asoc) {
+			list_for_each_entry(trans,
+					    &asoc->peer.transport_addr_list,
+					    transports) {
+				if (trans->ipaddr.sa.sa_family != AF_INET6)
+					continue;
+				trans->flowlabel = params->spp_ipv6_flowlabel &
+						   SCTP_FLOWLABEL_VAL_MASK;
+				trans->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+			}
+			asoc->flowlabel = params->spp_ipv6_flowlabel &
+					  SCTP_FLOWLABEL_VAL_MASK;
+			asoc->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+		} else if (sctp_opt2sk(sp)->sk_family == AF_INET6) {
+			sp->flowlabel = params->spp_ipv6_flowlabel &
+					SCTP_FLOWLABEL_VAL_MASK;
+			sp->flowlabel |= SCTP_FLOWLABEL_SET_MASK;
+		}
+	}
+
+	if (params->spp_flags & SPP_DSCP) {
+		if (trans) {
+			trans->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK;
+			trans->dscp |= SCTP_DSCP_SET_MASK;
+		} else if (asoc) {
+			list_for_each_entry(trans,
+					    &asoc->peer.transport_addr_list,
+					    transports) {
+				trans->dscp = params->spp_dscp &
+					      SCTP_DSCP_VAL_MASK;
+				trans->dscp |= SCTP_DSCP_SET_MASK;
+			}
+			asoc->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK;
+			asoc->dscp |= SCTP_DSCP_SET_MASK;
+		} else {
+			sp->dscp = params->spp_dscp & SCTP_DSCP_VAL_MASK;
+			sp->dscp |= SCTP_DSCP_SET_MASK;
+		}
+	}
+
 	return 0;
 }
 
@@ -5453,6 +5539,45 @@ static int sctp_getsockopt_peeloff_flags(struct sock *sk, int len,
  *                     also that this field is mutually exclusive to
  *                     SPP_SACKDELAY_ENABLE, setting both will have undefined
  *                     results.
+ *
+ *                     SPP_IPV6_FLOWLABEL:  Setting this flag enables the
+ *                     setting of the IPV6 flow label value.  The value is
+ *                     contained in the spp_ipv6_flowlabel field.
+ *                     Upon retrieval, this flag will be set to indicate that
+ *                     the spp_ipv6_flowlabel field has a valid value returned.
+ *                     If a specific destination address is set (in the
+ *                     spp_address field), then the value returned is that of
+ *                     the address.  If just an association is specified (and
+ *                     no address), then the association's default flow label
+ *                     is returned.  If neither an association nor a destination
+ *                     is specified, then the socket's default flow label is
+ *                     returned.  For non-IPv6 sockets, this flag will be left
+ *                     cleared.
+ *
+ *                     SPP_DSCP:  Setting this flag enables the setting of the
+ *                     Differentiated Services Code Point (DSCP) value
+ *                     associated with either the association or a specific
+ *                     address.  The value is obtained in the spp_dscp field.
+ *                     Upon retrieval, this flag will be set to indicate that
+ *                     the spp_dscp field has a valid value returned.  If a
+ *                     specific destination address is set when called (in the
+ *                     spp_address field), then that specific destination
+ *                     address's DSCP value is returned.  If just an association
+ *                     is specified, then the association's default DSCP is
+ *                     returned.  If neither an association nor a destination is
+ *                     specified, then the socket's default DSCP is returned.
+ *
+ *   spp_ipv6_flowlabel
+ *                   - This field is used in conjunction with the
+ *                     SPP_IPV6_FLOWLABEL flag and contains the IPv6 flow label.
+ *                     The 20 least significant bits are used for the flow
+ *                     label.  This setting has precedence over any IPv6-layer
+ *                     setting.
+ *
+ *   spp_dscp        - This field is used in conjunction with the SPP_DSCP flag
+ *                     and contains the DSCP.  The 6 most significant bits are
+ *                     used for the DSCP.  This setting has precedence over any
+ *                     IPv4- or IPv6- layer setting.
  */
 static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
 					    char __user *optval, int __user *optlen)
@@ -5499,6 +5624,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
 
 		/*draft-11 doesn't say what to return in spp_flags*/
 		params.spp_flags      = trans->param_flags;
+		if (trans->flowlabel & SCTP_FLOWLABEL_SET_MASK) {
+			params.spp_ipv6_flowlabel = trans->flowlabel &
+						    SCTP_FLOWLABEL_VAL_MASK;
+			params.spp_flags |= SPP_IPV6_FLOWLABEL;
+		}
+		if (trans->dscp & SCTP_DSCP_SET_MASK) {
+			params.spp_dscp	= trans->dscp & SCTP_DSCP_VAL_MASK;
+			params.spp_flags |= SPP_DSCP;
+		}
 	} else if (asoc) {
 		/* Fetch association values. */
 		params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval);
@@ -5508,6 +5642,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
 
 		/*draft-11 doesn't say what to return in spp_flags*/
 		params.spp_flags      = asoc->param_flags;
+		if (asoc->flowlabel & SCTP_FLOWLABEL_SET_MASK) {
+			params.spp_ipv6_flowlabel = asoc->flowlabel &
+						    SCTP_FLOWLABEL_VAL_MASK;
+			params.spp_flags |= SPP_IPV6_FLOWLABEL;
+		}
+		if (asoc->dscp & SCTP_DSCP_SET_MASK) {
+			params.spp_dscp	= asoc->dscp & SCTP_DSCP_VAL_MASK;
+			params.spp_flags |= SPP_DSCP;
+		}
 	} else {
 		/* Fetch socket values. */
 		params.spp_hbinterval = sp->hbinterval;
@@ -5517,6 +5660,15 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len,
 
 		/*draft-11 doesn't say what to return in spp_flags*/
 		params.spp_flags      = sp->param_flags;
+		if (sp->flowlabel & SCTP_FLOWLABEL_SET_MASK) {
+			params.spp_ipv6_flowlabel = sp->flowlabel &
+						    SCTP_FLOWLABEL_VAL_MASK;
+			params.spp_flags |= SPP_IPV6_FLOWLABEL;
+		}
+		if (sp->dscp & SCTP_DSCP_SET_MASK) {
+			params.spp_dscp	= sp->dscp & SCTP_DSCP_VAL_MASK;
+			params.spp_flags |= SPP_DSCP;
+		}
 	}
 
 	if (copy_to_user(optval, &params, len))
-- 
2.1.0

^ permalink raw reply related

* [PATCH net-next 2/5] sctp: add support for dscp and flowlabel per transport
From: Xin Long @ 2018-06-25  2:14 UTC (permalink / raw)
  To: network dev, linux-sctp; +Cc: Marcelo Ricardo Leitner, Neil Horman, davem
In-Reply-To: <cover.1529892764.git.lucien.xin@gmail.com>

Like some other per transport params, flowlabel and dscp are added
in transport, asoc and sctp_sock. By default, transport sets its
value from asoc's, and asoc does it from sctp_sock. flowlabel
only works for ipv6 transport.

Other than that they need to be passed down in sctp_xmit, flow4/6
also needs to set them before looking up route in get_dst.

Note that it uses '& 0x100000' to check if flowlabel is set and
'& 0x1' (tos 1st bit is unused) to check if dscp is set by users,
so that they could be set to 0 by sockopt in next patch.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 include/linux/sctp.h       |  7 +++++++
 include/net/sctp/structs.h |  9 +++++++++
 net/sctp/associola.c       |  7 +++++++
 net/sctp/ipv6.c            | 11 +++++++++--
 net/sctp/protocol.c        | 16 ++++++++++++----
 5 files changed, 44 insertions(+), 6 deletions(-)

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index b36c766..83d9434 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -801,4 +801,11 @@ struct sctp_strreset_resptsn {
 	__be32 receivers_next_tsn;
 };
 
+enum {
+	SCTP_DSCP_SET_MASK = 0x1,
+	SCTP_DSCP_VAL_MASK = 0xfc,
+	SCTP_FLOWLABEL_SET_MASK = 0x100000,
+	SCTP_FLOWLABEL_VAL_MASK = 0xfffff
+};
+
 #endif /* __LINUX_SCTP_H__ */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 701a517..ab869e0 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -193,6 +193,9 @@ struct sctp_sock {
 	/* This is the max_retrans value for new associations. */
 	__u16 pathmaxrxt;
 
+	__u32 flowlabel;
+	__u8  dscp;
+
 	/* The initial Path MTU to use for new associations. */
 	__u32 pathmtu;
 
@@ -895,6 +898,9 @@ struct sctp_transport {
 	 */
 	__u16 pathmaxrxt;
 
+	__u32 flowlabel;
+	__u8  dscp;
+
 	/* This is the partially failed retrans value for the transport
 	 * and will be initialized from the assocs value.  This can be changed
 	 * using the SCTP_PEER_ADDR_THLDS socket option
@@ -1772,6 +1778,9 @@ struct sctp_association {
 	 */
 	__u16 pathmaxrxt;
 
+	__u32 flowlabel;
+	__u8  dscp;
+
 	/* Flag that path mtu update is pending */
 	__u8   pmtu_pending;
 
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5d5a162..16ecfbc 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -115,6 +115,9 @@ static struct sctp_association *sctp_association_init(
 	/* Initialize path max retrans value. */
 	asoc->pathmaxrxt = sp->pathmaxrxt;
 
+	asoc->flowlabel = sp->flowlabel;
+	asoc->dscp = sp->dscp;
+
 	/* Initialize default path MTU. */
 	asoc->pathmtu = sp->pathmtu;
 
@@ -647,6 +650,10 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 	peer->sackdelay = asoc->sackdelay;
 	peer->sackfreq = asoc->sackfreq;
 
+	if (addr->sa.sa_family == AF_INET6)
+		peer->flowlabel = asoc->flowlabel;
+	peer->dscp = asoc->dscp;
+
 	/* Enable/disable heartbeat, SACK delay, and path MTU discovery
 	 * based on association setting.
 	 */
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 7339918..772513d 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -209,12 +209,17 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
 	struct sock *sk = skb->sk;
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct flowi6 *fl6 = &transport->fl.u.ip6;
+	__u8 tclass = np->tclass;
 	int res;
 
 	pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb,
 		 skb->len, &fl6->saddr, &fl6->daddr);
 
-	IP6_ECN_flow_xmit(sk, fl6->flowlabel);
+	if (transport->dscp & SCTP_DSCP_SET_MASK)
+		tclass = transport->dscp & SCTP_DSCP_VAL_MASK;
+
+	if (INET_ECN_is_capable(tclass))
+		IP6_ECN_flow_xmit(sk, fl6->flowlabel);
 
 	if (!(transport->param_flags & SPP_PMTUD_ENABLE))
 		skb->ignore_df = 1;
@@ -223,7 +228,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
 
 	rcu_read_lock();
 	res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt),
-		       np->tclass);
+		       tclass);
 	rcu_read_unlock();
 	return res;
 }
@@ -254,6 +259,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
 		fl6->flowi6_oif = daddr->v6.sin6_scope_id;
 	else if (asoc)
 		fl6->flowi6_oif = asoc->base.sk->sk_bound_dev_if;
+	if (t->flowlabel & SCTP_FLOWLABEL_SET_MASK)
+		fl6->flowlabel = htonl(t->flowlabel & SCTP_FLOWLABEL_VAL_MASK);
 
 	pr_debug("%s: dst=%pI6 ", __func__, &fl6->daddr);
 
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 5dffbc4..d57fd30 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -426,13 +426,16 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
 	struct dst_entry *dst = NULL;
 	union sctp_addr *daddr = &t->ipaddr;
 	union sctp_addr dst_saddr;
+	__u8 tos = inet_sk(sk)->tos;
 
+	if (t->dscp & SCTP_DSCP_SET_MASK)
+		tos = t->dscp & SCTP_DSCP_VAL_MASK;
 	memset(fl4, 0x0, sizeof(struct flowi4));
 	fl4->daddr  = daddr->v4.sin_addr.s_addr;
 	fl4->fl4_dport = daddr->v4.sin_port;
 	fl4->flowi4_proto = IPPROTO_SCTP;
 	if (asoc) {
-		fl4->flowi4_tos = RT_CONN_FLAGS(asoc->base.sk);
+		fl4->flowi4_tos = RT_CONN_FLAGS_TOS(asoc->base.sk, tos);
 		fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if;
 		fl4->fl4_sport = htons(asoc->base.bind_addr.port);
 	}
@@ -495,7 +498,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
 		fl4->fl4_sport = laddr->a.v4.sin_port;
 		flowi4_update_output(fl4,
 				     asoc->base.sk->sk_bound_dev_if,
-				     RT_CONN_FLAGS(asoc->base.sk),
+				     RT_CONN_FLAGS_TOS(asoc->base.sk, tos),
 				     daddr->v4.sin_addr.s_addr,
 				     laddr->a.v4.sin_addr.s_addr);
 
@@ -971,16 +974,21 @@ static inline int sctp_v4_xmit(struct sk_buff *skb,
 			       struct sctp_transport *transport)
 {
 	struct inet_sock *inet = inet_sk(skb->sk);
+	__u8 dscp = inet->tos;
 
 	pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb,
-		 skb->len, &transport->fl.u.ip4.saddr, &transport->fl.u.ip4.daddr);
+		 skb->len, &transport->fl.u.ip4.saddr,
+		 &transport->fl.u.ip4.daddr);
+
+	if (transport->dscp & SCTP_DSCP_SET_MASK)
+		dscp = transport->dscp & SCTP_DSCP_VAL_MASK;
 
 	inet->pmtudisc = transport->param_flags & SPP_PMTUD_ENABLE ?
 			 IP_PMTUDISC_DO : IP_PMTUDISC_DONT;
 
 	SCTP_INC_STATS(sock_net(&inet->sk), SCTP_MIB_OUTSCTPPACKS);
 
-	return ip_queue_xmit(&inet->sk, skb, &transport->fl);
+	return __ip_queue_xmit(&inet->sk, skb, &transport->fl, dscp);
 }
 
 static struct sctp_af sctp_af_inet;
-- 
2.1.0

^ permalink raw reply related

* [PATCH net-next 1/5] ipv4: add __ip_queue_xmit() that supports tos param
From: Xin Long @ 2018-06-25  2:14 UTC (permalink / raw)
  To: network dev, linux-sctp; +Cc: Marcelo Ricardo Leitner, Neil Horman, davem
In-Reply-To: <cover.1529892764.git.lucien.xin@gmail.com>

This patch introduces __ip_queue_xmit(), through which the callers
can pass tos param into it without having to set inet->tos. For
ipv6, ip6_xmit() already allows passing tclass parameter.

It's needed when some transport protocol doesn't use inet->tos,
like sctp's per transport dscp, which will be added in next patch.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 include/net/ip.h     |  2 ++
 net/ipv4/ip_output.c | 13 ++++++++++---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index 0d2281b..ca05b77 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -148,6 +148,8 @@ void ip_send_check(struct iphdr *ip);
 int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
 int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
 
+int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
+		    __u8 tos);
 int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl);
 void ip_init(void);
 int ip_append_data(struct sock *sk, struct flowi4 *fl4,
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index b3308e9..107d37f 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -423,7 +423,8 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
 }
 
 /* Note: skb->sk can be different from sk, in case of tunnels */
-int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
+int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
+		    __u8 tos)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	struct net *net = sock_net(sk);
@@ -462,7 +463,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
 					   inet->inet_dport,
 					   inet->inet_sport,
 					   sk->sk_protocol,
-					   RT_CONN_FLAGS(sk),
+					   RT_CONN_FLAGS_TOS(sk, tos),
 					   sk->sk_bound_dev_if);
 		if (IS_ERR(rt))
 			goto no_route;
@@ -478,7 +479,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
 	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
 	skb_reset_network_header(skb);
 	iph = ip_hdr(skb);
-	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
+	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff));
 	if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
 		iph->frag_off = htons(IP_DF);
 	else
@@ -511,6 +512,12 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
 	kfree_skb(skb);
 	return -EHOSTUNREACH;
 }
+EXPORT_SYMBOL(__ip_queue_xmit);
+
+int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
+{
+	return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
+}
 EXPORT_SYMBOL(ip_queue_xmit);
 
 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
-- 
2.1.0

^ permalink raw reply related

* [PATCH net-next 0/5] sctp: fully support for dscp and flowlabel per transport
From: Xin Long @ 2018-06-25  2:14 UTC (permalink / raw)
  To: network dev, linux-sctp; +Cc: Marcelo Ricardo Leitner, Neil Horman, davem

Currently dscp and flowlabel are taken from the sock when sending packets,
but, being multi-homed, sctp also supports setting dscp and flowlabel
per transport, as described in section 8.1.12 of RFC 6458.

Xin Long (5):
  ipv4: add __ip_queue_xmit() that supports tos param
  sctp: add support for dscp and flowlabel per transport
  sctp: add spp_ipv6_flowlabel and spp_dscp for sctp_paddrparams
  sctp: add support for setting flowlabel when adding a transport
  sctp: check for ipv6_pinfo legal sndflow with flowlabel in
    sctp_v6_get_dst

 include/linux/sctp.h       |   7 ++
 include/net/ip.h           |   2 +
 include/net/sctp/structs.h |   9 +++
 include/uapi/linux/sctp.h  |   4 ++
 net/ipv4/ip_output.c       |  13 +++-
 net/sctp/associola.c       |  15 +++++
 net/sctp/ipv6.c            |  20 +++++-
 net/sctp/protocol.c        |  16 +++--
 net/sctp/socket.c          | 157 +++++++++++++++++++++++++++++++++++++++++++++
 9 files changed, 234 insertions(+), 9 deletions(-)

-- 
2.1.0

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox