Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 1/7] net: sched: add an offload dump helper
From: Jakub Kicinski @ 2018-11-08  1:33 UTC (permalink / raw)
  To: davem
  Cc: netdev, oss-drivers, jiri, xiyou.wangcong, jhs, nogah.frankel,
	yuvalm, Jakub Kicinski
In-Reply-To: <20181108013340.20983-1-jakub.kicinski@netronome.com>

Qdisc dump operation of offload-capable qdiscs performs a few
extra steps which are identical among all the qdiscs.  Add
a helper to share this code.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
---
 include/net/sch_generic.h | 12 ++++++++++++
 net/sched/sch_api.c       | 21 +++++++++++++++++++++
 net/sched/sch_prio.c      | 16 +---------------
 net/sched/sch_red.c       | 17 +----------------
 4 files changed, 35 insertions(+), 31 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 4d736427a4cb..af55c1c4edb1 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -579,6 +579,18 @@ void qdisc_put(struct Qdisc *qdisc);
 void qdisc_put_unlocked(struct Qdisc *qdisc);
 void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, unsigned int n,
 			       unsigned int len);
+#ifdef CONFIG_NET_SCHED
+int qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type,
+			      void *type_data);
+#else
+static inline int
+qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type,
+			  void *type_data)
+{
+	q->flags &= ~TCQ_F_OFFLOADED;
+	return 0;
+}
+#endif
 struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 			  const struct Qdisc_ops *ops,
 			  struct netlink_ext_ack *extack);
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index ca3b0f46de53..e534825d3d3a 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -810,6 +810,27 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
 }
 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
 
+int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
+			      void *type_data)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	int err;
+
+	sch->flags &= ~TCQ_F_OFFLOADED;
+	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+		return 0;
+
+	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
+	if (err == -EOPNOTSUPP)
+		return 0;
+
+	if (!err)
+		sch->flags |= TCQ_F_OFFLOADED;
+
+	return err;
+}
+EXPORT_SYMBOL(qdisc_offload_dump_helper);
+
 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 			 u32 portid, u32 seq, u16 flags, int event)
 {
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index f8af98621179..4bdd04c30ead 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -251,7 +251,6 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt,
 
 static int prio_dump_offload(struct Qdisc *sch)
 {
-	struct net_device *dev = qdisc_dev(sch);
 	struct tc_prio_qopt_offload hw_stats = {
 		.command = TC_PRIO_STATS,
 		.handle = sch->handle,
@@ -263,21 +262,8 @@ static int prio_dump_offload(struct Qdisc *sch)
 			},
 		},
 	};
-	int err;
-
-	sch->flags &= ~TCQ_F_OFFLOADED;
-	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
-		return 0;
-
-	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO,
-					    &hw_stats);
-	if (err == -EOPNOTSUPP)
-		return 0;
-
-	if (!err)
-		sch->flags |= TCQ_F_OFFLOADED;
 
-	return err;
+	return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_PRIO, &hw_stats);
 }
 
 static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 3ce6c0a2c493..d5e441194397 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -281,7 +281,6 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt,
 
 static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt)
 {
-	struct net_device *dev = qdisc_dev(sch);
 	struct tc_red_qopt_offload hw_stats = {
 		.command = TC_RED_STATS,
 		.handle = sch->handle,
@@ -291,22 +290,8 @@ static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt)
 			.stats.qstats = &sch->qstats,
 		},
 	};
-	int err;
-
-	sch->flags &= ~TCQ_F_OFFLOADED;
-
-	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
-		return 0;
-
-	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_RED,
-					    &hw_stats);
-	if (err == -EOPNOTSUPP)
-		return 0;
-
-	if (!err)
-		sch->flags |= TCQ_F_OFFLOADED;
 
-	return err;
+	return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_RED, &hw_stats);
 }
 
 static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 2/7] net: sched: red: remove unnecessary red_dump_offload_stats parameter
From: Jakub Kicinski @ 2018-11-08  1:33 UTC (permalink / raw)
  To: davem
  Cc: netdev, oss-drivers, jiri, xiyou.wangcong, jhs, nogah.frankel,
	yuvalm, Jakub Kicinski
In-Reply-To: <20181108013340.20983-1-jakub.kicinski@netronome.com>

Offload dump helper does not use opt parameter, remove it.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
---
 net/sched/sch_red.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index d5e441194397..2bf1d2fabc48 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -279,7 +279,7 @@ static int red_init(struct Qdisc *sch, struct nlattr *opt,
 	return red_change(sch, opt, extack);
 }
 
-static int red_dump_offload_stats(struct Qdisc *sch, struct tc_red_qopt *opt)
+static int red_dump_offload_stats(struct Qdisc *sch)
 {
 	struct tc_red_qopt_offload hw_stats = {
 		.command = TC_RED_STATS,
@@ -309,7 +309,7 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
 	};
 	int err;
 
-	err = red_dump_offload_stats(sch, &opt);
+	err = red_dump_offload_stats(sch);
 	if (err)
 		goto nla_put_failure;
 
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 3/7] net: sched: set TCQ_F_OFFLOADED flag for MQ
From: Jakub Kicinski @ 2018-11-08  1:33 UTC (permalink / raw)
  To: davem
  Cc: netdev, oss-drivers, jiri, xiyou.wangcong, jhs, nogah.frankel,
	yuvalm, Jakub Kicinski
In-Reply-To: <20181108013340.20983-1-jakub.kicinski@netronome.com>

PRIO and RED mark the qdisc with TCQ_F_OFFLOADED upon successful offload,
make MQ do the same.  The consistency will help with consistent
graft callback behaviour.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
---
 net/sched/sch_mq.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index f20f3a0f8424..1db5c1bf6ddd 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -38,9 +38,8 @@ static int mq_offload(struct Qdisc *sch, enum tc_mq_command cmd)
 	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt);
 }
 
-static void mq_offload_stats(struct Qdisc *sch)
+static int mq_offload_stats(struct Qdisc *sch)
 {
-	struct net_device *dev = qdisc_dev(sch);
 	struct tc_mq_qopt_offload opt = {
 		.command = TC_MQ_STATS,
 		.handle = sch->handle,
@@ -50,8 +49,7 @@ static void mq_offload_stats(struct Qdisc *sch)
 		},
 	};
 
-	if (tc_can_offload(dev) && dev->netdev_ops->ndo_setup_tc)
-		dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQ, &opt);
+	return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_MQ, &opt);
 }
 
 static void mq_destroy(struct Qdisc *sch)
@@ -171,9 +169,8 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
 
 		spin_unlock_bh(qdisc_lock(qdisc));
 	}
-	mq_offload_stats(sch);
 
-	return 0;
+	return mq_offload_stats(sch);
 }
 
 static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl)
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 4/7] net: sched: add an offload graft helper
From: Jakub Kicinski @ 2018-11-08  1:33 UTC (permalink / raw)
  To: davem
  Cc: netdev, oss-drivers, jiri, xiyou.wangcong, jhs, nogah.frankel,
	yuvalm, Jakub Kicinski
In-Reply-To: <20181108013340.20983-1-jakub.kicinski@netronome.com>

Qdisc graft operation of offload-capable qdiscs performs a few
extra steps which are identical among all the qdiscs.  Add
a helper to share this code.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
---
 include/net/sch_generic.h | 12 ++++++++++++
 net/sched/sch_api.c       | 29 +++++++++++++++++++++++++++++
 net/sched/sch_prio.c      | 27 +++------------------------
 3 files changed, 44 insertions(+), 24 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index af55c1c4edb1..a8dd1fc141b6 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -582,6 +582,10 @@ void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, unsigned int n,
 #ifdef CONFIG_NET_SCHED
 int qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type,
 			      void *type_data);
+void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
+				struct Qdisc *new, struct Qdisc *old,
+				enum tc_setup_type type, void *type_data,
+				struct netlink_ext_ack *extack);
 #else
 static inline int
 qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type,
@@ -590,6 +594,14 @@ qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type,
 	q->flags &= ~TCQ_F_OFFLOADED;
 	return 0;
 }
+
+static inline void
+qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
+			   struct Qdisc *new, struct Qdisc *old,
+			   enum tc_setup_type type, void *type_data,
+			   struct netlink_ext_ack *extack)
+{
+}
 #endif
 struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 			  const struct Qdisc_ops *ops,
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index e534825d3d3a..4b3af41cc1d7 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -831,6 +831,35 @@ int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
 }
 EXPORT_SYMBOL(qdisc_offload_dump_helper);
 
+void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
+				struct Qdisc *new, struct Qdisc *old,
+				enum tc_setup_type type, void *type_data,
+				struct netlink_ext_ack *extack)
+{
+	bool any_qdisc_is_offloaded;
+	int err;
+
+	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
+		return;
+
+	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
+
+	/* Don't report error if the graft is part of destroy operation. */
+	if (!err || !new || new == &noop_qdisc)
+		return;
+
+	/* Don't report error if the parent, the old child and the new
+	 * one are not offloaded.
+	 */
+	any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
+	any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
+	any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
+
+	if (any_qdisc_is_offloaded)
+		NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
+}
+EXPORT_SYMBOL(qdisc_offload_graft_helper);
+
 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
 			 u32 portid, u32 seq, u16 flags, int event)
 {
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 4bdd04c30ead..63a90c5055ee 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -295,43 +295,22 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 {
 	struct prio_sched_data *q = qdisc_priv(sch);
 	struct tc_prio_qopt_offload graft_offload;
-	struct net_device *dev = qdisc_dev(sch);
 	unsigned long band = arg - 1;
-	bool any_qdisc_is_offloaded;
-	int err;
 
 	if (new == NULL)
 		new = &noop_qdisc;
 
 	*old = qdisc_replace(sch, new, &q->queues[band]);
 
-	if (!tc_can_offload(dev))
-		return 0;
-
 	graft_offload.handle = sch->handle;
 	graft_offload.parent = sch->parent;
 	graft_offload.graft_params.band = band;
 	graft_offload.graft_params.child_handle = new->handle;
 	graft_offload.command = TC_PRIO_GRAFT;
 
-	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO,
-					    &graft_offload);
-
-	/* Don't report error if the graft is part of destroy operation. */
-	if (err && new != &noop_qdisc) {
-		/* Don't report error if the parent, the old child and the new
-		 * one are not offloaded.
-		 */
-		any_qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
-		any_qdisc_is_offloaded |= new->flags & TCQ_F_OFFLOADED;
-		if (*old)
-			any_qdisc_is_offloaded |= (*old)->flags &
-						   TCQ_F_OFFLOADED;
-
-		if (any_qdisc_is_offloaded)
-			NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
-	}
-
+	qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, *old,
+				   TC_SETUP_QDISC_PRIO, &graft_offload,
+				   extack);
 	return 0;
 }
 
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 5/7] net: sched: refactor grafting Qdiscs with a parent
From: Jakub Kicinski @ 2018-11-08  1:33 UTC (permalink / raw)
  To: davem
  Cc: netdev, oss-drivers, jiri, xiyou.wangcong, jhs, nogah.frankel,
	yuvalm, Jakub Kicinski
In-Reply-To: <20181108013340.20983-1-jakub.kicinski@netronome.com>

The code for grafting Qdiscs when there is a parent has two needless
indentation levels, and breaks the "keep the success path unindented"
guideline.  Refactor.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
---
 net/sched/sch_api.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 4b3af41cc1d7..f55bc50cd0a9 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1007,7 +1007,6 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 {
 	struct Qdisc *q = old;
 	struct net *net = dev_net(dev);
-	int err = 0;
 
 	if (parent == NULL) {
 		unsigned int i, num_q, ingress;
@@ -1062,28 +1061,29 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
 			dev_activate(dev);
 	} else {
 		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
+		unsigned long cl;
+		int err;
 
 		/* Only support running class lockless if parent is lockless */
 		if (new && (new->flags & TCQ_F_NOLOCK) &&
 		    parent && !(parent->flags & TCQ_F_NOLOCK))
 			new->flags &= ~TCQ_F_NOLOCK;
 
-		err = -EOPNOTSUPP;
-		if (cops && cops->graft) {
-			unsigned long cl = cops->find(parent, classid);
+		if (!cops || !cops->graft)
+			return -EOPNOTSUPP;
 
-			if (cl) {
-				err = cops->graft(parent, cl, new, &old,
-						  extack);
-			} else {
-				NL_SET_ERR_MSG(extack, "Specified class not found");
-				err = -ENOENT;
-			}
+		cl = cops->find(parent, classid);
+		if (!cl) {
+			NL_SET_ERR_MSG(extack, "Specified class not found");
+			return -ENOENT;
 		}
-		if (!err)
-			notify_and_destroy(net, skb, n, classid, old, new);
+
+		err = cops->graft(parent, cl, new, &old, extack);
+		if (err)
+			return err;
+		notify_and_destroy(net, skb, n, classid, old, new);
 	}
-	return err;
+	return 0;
 }
 
 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 6/7] net: sched: red: delay destroying child qdisc on replace
From: Jakub Kicinski @ 2018-11-08  1:33 UTC (permalink / raw)
  To: davem
  Cc: netdev, oss-drivers, jiri, xiyou.wangcong, jhs, nogah.frankel,
	yuvalm, Jakub Kicinski
In-Reply-To: <20181108013340.20983-1-jakub.kicinski@netronome.com>

Move destroying of the old child qdisc outside of the sch_tree_lock()
section.  This should improve the software qdisc replace but is even
more important for offloads.  Firstly calling offloads under a spin
lock is best avoided.  Secondly the destroy event of existing child
would have been sent to the offload device before the replace, causing
confusion.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
---
 net/sched/sch_red.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 2bf1d2fabc48..7682f7a618a1 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -193,10 +193,10 @@ static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {
 static int red_change(struct Qdisc *sch, struct nlattr *opt,
 		      struct netlink_ext_ack *extack)
 {
+	struct Qdisc *old_child = NULL, *child = NULL;
 	struct red_sched_data *q = qdisc_priv(sch);
 	struct nlattr *tb[TCA_RED_MAX + 1];
 	struct tc_red_qopt *ctl;
-	struct Qdisc *child = NULL;
 	int err;
 	u32 max_P;
 
@@ -233,7 +233,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
 	if (child) {
 		qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
 					  q->qdisc->qstats.backlog);
-		qdisc_put(q->qdisc);
+		old_child = q->qdisc;
 		q->qdisc = child;
 	}
 
@@ -252,7 +252,11 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt,
 		red_start_of_idle_period(&q->vars);
 
 	sch_tree_unlock(sch);
+
 	red_offload(sch, true);
+
+	if (old_child)
+		qdisc_put(old_child);
 	return 0;
 }
 
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 7/7] net: sched: prio: delay destroying child qdiscs on change
From: Jakub Kicinski @ 2018-11-08  1:33 UTC (permalink / raw)
  To: davem
  Cc: netdev, oss-drivers, jiri, xiyou.wangcong, jhs, nogah.frankel,
	yuvalm, Jakub Kicinski
In-Reply-To: <20181108013340.20983-1-jakub.kicinski@netronome.com>

Move destroying of the old child qdiscs outside of the sch_tree_lock()
section.  This should improve the software qdisc replace but is even
more important for offloads.  Calling offloads under a spin lock is
best avoided, and child's destroy would be called under sch_tree_lock().

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
---
 net/sched/sch_prio.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 63a90c5055ee..cdf68706e40f 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -220,7 +220,6 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
 
 		qdisc_tree_reduce_backlog(child, child->q.qlen,
 					  child->qstats.backlog);
-		qdisc_put(child);
 	}
 
 	for (i = oldbands; i < q->bands; i++) {
@@ -230,6 +229,9 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
 	}
 
 	sch_tree_unlock(sch);
+
+	for (i = q->bands; i < oldbands; i++)
+		qdisc_put(q->queues[i]);
 	return 0;
 }
 
-- 
2.17.1

^ permalink raw reply related

* Re: [Kernel][NET] Bug report on packet defragmenting
From: Eric Dumazet @ 2018-11-08  1:43 UTC (permalink / raw)
  To: soukjin.bae, netdev@vger.kernel.org
In-Reply-To: <20181108012927epcms1p47f719c1908da64a378690362901644ee@epcms1p4>

On 11/07/2018 05:29 PM, 배석진 wrote:

> If ipv6_defrag hook is not excuted simultaneously, then it's ok.
> ipv6_defrag hook can handle that. [exam 3]

This seems wrong.

This is the root cause, we should not try to work around it but fix it.

There is no guarantee that RSS/RPS/RFS can help here, packets can sit in per-cpu
backlogs long enough to reproduce the issue, if RX queues interrupts are spread
over many cpus.

^ permalink raw reply

* Re: [PATCH bpf] tools/bpftool: copy uapi/linux/tc_act/tc_bpf.h to tools directory
From: Yonghong Song @ 2018-11-08  1:47 UTC (permalink / raw)
  To: Li Zhijian, Alexei Starovoitov, Martin Lau, daniel@iogearbox.net,
	netdev@vger.kernel.org, rong.a.chen@intel.com
  Cc: Kernel Team
In-Reply-To: <149eb435-a9b5-da31-9f72-52168a9058f4@intel.com>



On 11/7/18 5:15 PM, Li Zhijian wrote:
> On 11/8/2018 9:00 AM, Yonghong Song wrote:
>> Commit f6f3bac08ff9 ("tools/bpf: bpftool: add net support")
>> added certain networking support to bpftool.
>> The implementation relies on a relatively recent uapi header file
>> linux/tc_act/tc_bpf.h on the host which contains the marco
>> definition of TCA_ACT_BPF_ID.
>>
>> Unfortunately, this is not the case for all distributions.
>> See the email message below where rhel-7.2 does not have
>> an up-to-date linux/tc_act/tc_bpf.h.
>>    
>> https://urldefense.proofpoint.com/v2/url?u=https-3A__www.mail-2Darchive.com_linux-2Dkernel-40vger.kernel.org_msg1799211.html&d=DwICaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=DA8e1B5r073vIqRrFz7MRA&m=BQIJZUzQfMmUkyvlAHMs7zygIFBysR_MlkAyHN59-7E&s=pVed3MjGDG_PMeBrhrqb3m57NRinnlcdL1QjAOj4iLA&e= 
>>
> 
> i have not tested this patch, but basing on the early commit
> 6f3bac08ff9 ("tools/bpf: bpftool: add net support")
> i cooked up similar patch locally, but i noticed that it also requires an
> up-to-date linux/pkt_cls.h as well to avoid compiling errors:

Thanks for testing. I will add linux/pkt_cls.h as well for the
version 2.

> 
> root@lkp-bdw-ep3 ~/linux-f6f3bac08f/tools/bpf/bpftool# make V=1
> [...snip...]
> gcc -O2 -W -Wall -Wextra -Wno-unused-parameter -Wshadow 
> -Wno-missing-field-initializers -DPACKAGE='"bpftool"' 
> -D__EXPORTED_HEADERS__ -I/root/linux-f6f3bac08f/kernel/bpf/ 
> -I/root/linux-f6f3bac08f/tools/include 
> -I/root/linux-f6f3bac08f/tools/include/uapi 
> -I/root/linux-f6f3bac08f/tools/lib/bpf 
> -I/root/linux-f6f3bac08f/tools/perf -DBPFTOOL_VERSION='"4.19.0-rc2"' 
> -DCOMPAT_NEED_REALLOCARRAY   -c -MMD -o netlink_dumper.o netlink_dumper.c
> make -C /root/linux-f6f3bac08f/tools/lib/bpf/ OUTPUT= libbpf.a
> make[1]: Entering directory '/root/linux-f6f3bac08f/tools/lib/bpf'
> netlink_dumper.c: In function 'do_bpf_filter_dump':
> netlink_dumper.c:153:9: error: 'TCA_BPF_ID' undeclared (first use in 
> this function)
>    if (tb[TCA_BPF_ID])
>           ^~~~~~~~~~
> netlink_dumper.c:153:9: note: each undeclared identifier is reported 
> only once for each function it appears in
> netlink_dumper.c:155:9: error: 'TCA_BPF_TAG' undeclared (first use in 
> this function)
>    if (tb[TCA_BPF_TAG])
>           ^~~~~~~~~~~
> Makefile:96: recipe for target 'netlink_dumper.o' failed
> make: *** [netlink_dumper.o] Error 1
> make: *** Waiting for unfinished jobs....
> make -f /root/linux-f6f3bac08f/tools/build/Makefile.build dir=. obj=libbpf
> make[1]: Leaving directory '/root/linux-f6f3bac08f/tools/lib/bpf'
> 
> Thanks
> Zhijian
> 
>>
>> This patch fixed the issue by copying linux/tc_act/tc_bpf.h from
>> kernel include/uapi directory to tools/include/uapi directory so
>> building the bpftool does not depend on host system for this file.
>>
>> Fixes: f6f3bac08ff9 ("tools/bpf: bpftool: add net support")
>> Reported-by: kernel test robot <rong.a.chen@intel.com>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   tools/include/uapi/linux/tc_act/tc_bpf.h | 37 ++++++++++++++++++++++++
>>   1 file changed, 37 insertions(+)
>>   create mode 100644 tools/include/uapi/linux/tc_act/tc_bpf.h
>>
>> diff --git a/tools/include/uapi/linux/tc_act/tc_bpf.h 
>> b/tools/include/uapi/linux/tc_act/tc_bpf.h
>> new file mode 100644
>> index 000000000000..6e89a5df49a4
>> --- /dev/null
>> +++ b/tools/include/uapi/linux/tc_act/tc_bpf.h
>> @@ -0,0 +1,37 @@
>> +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
>> +/*
>> + * Copyright (c) 2015 Jiri Pirko <jiri@resnulli.us>
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + */
>> +
>> +#ifndef __LINUX_TC_BPF_H
>> +#define __LINUX_TC_BPF_H
>> +
>> +#include <linux/pkt_cls.h>
>> +
>> +#define TCA_ACT_BPF 13
>> +
>> +struct tc_act_bpf {
>> +    tc_gen;
>> +};
>> +
>> +enum {
>> +    TCA_ACT_BPF_UNSPEC,
>> +    TCA_ACT_BPF_TM,
>> +    TCA_ACT_BPF_PARMS,
>> +    TCA_ACT_BPF_OPS_LEN,
>> +    TCA_ACT_BPF_OPS,
>> +    TCA_ACT_BPF_FD,
>> +    TCA_ACT_BPF_NAME,
>> +    TCA_ACT_BPF_PAD,
>> +    TCA_ACT_BPF_TAG,
>> +    TCA_ACT_BPF_ID,
>> +    __TCA_ACT_BPF_MAX,
>> +};
>> +#define TCA_ACT_BPF_MAX (__TCA_ACT_BPF_MAX - 1)
>> +
>> +#endif

^ permalink raw reply

* Re: [net-next PATCH] net: sched: cls_flower: Classify packets using port ranges
From: Nambiar, Amritha @ 2018-11-08  1:52 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, jakub.kicinski, sridhar.samudrala, jhs,
	xiyou.wangcong
In-Reply-To: <20181019085253.GF4558@nanopsycho.orion>

On 10/19/2018 1:52 AM, Jiri Pirko wrote:
> Thu, Oct 18, 2018 at 08:24:44PM CEST, amritha.nambiar@intel.com wrote:
>> On 10/18/2018 5:17 AM, Jiri Pirko wrote:
>>> Fri, Oct 12, 2018 at 03:53:30PM CEST, amritha.nambiar@intel.com wrote:
>>>> Added support in tc flower for filtering based on port ranges.
>>>> This is a rework of the RFC patch at:
>>>> https://patchwork.ozlabs.org/patch/969595/
>>>>
>>>> Example:
>>>> 1. Match on a port range:
>>>> -------------------------
>>>> $ tc filter add dev enp4s0 protocol ip parent ffff:\
>>>>  prio 1 flower ip_proto tcp dst_port range 20-30 skip_hw\
>>>>  action drop
>>>>
>>>> $ tc -s filter show dev enp4s0 parent ffff:
>>>> filter protocol ip pref 1 flower chain 0
>>>> filter protocol ip pref 1 flower chain 0 handle 0x1
>>>>  eth_type ipv4
>>>>  ip_proto tcp
>>>>  dst_port_min 20
>>>>  dst_port_max 30
>>>>  skip_hw
>>>>  not_in_hw
>>>>        action order 1: gact action drop
>>>>         random type none pass val 0
>>>>         index 1 ref 1 bind 1 installed 181 sec used 5 sec
>>>>        Action statistics:
>>>>        Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0)
>>>>        backlog 0b 0p requeues 0
>>>>
>>>> 2. Match on IP address and port range:
>>>> --------------------------------------
>>>> $ tc filter add dev enp4s0 protocol ip parent ffff:\
>>>>  prio 1 flower dst_ip 192.168.1.1 ip_proto tcp dst_port range 100-200\
>>>>  skip_hw action drop
>>>>
>>>> $ tc -s filter show dev enp4s0 parent ffff:
>>>> filter protocol ip pref 1 flower chain 0 handle 0x2
>>>>  eth_type ipv4
>>>>  ip_proto tcp
>>>>  dst_ip 192.168.1.1
>>>>  dst_port_min 100
>>>>  dst_port_max 200
>>>>  skip_hw
>>>>  not_in_hw
>>>>        action order 1: gact action drop
>>>>         random type none pass val 0
>>>>         index 2 ref 1 bind 1 installed 28 sec used 6 sec
>>>>        Action statistics:
>>>>        Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0)
>>>>        backlog 0b 0p requeues 0
>>>>
>>>> Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
>>>> ---
>>>> include/uapi/linux/pkt_cls.h |    5 ++
>>>> net/sched/cls_flower.c       |  134 ++++++++++++++++++++++++++++++++++++++++--
>>>> 2 files changed, 132 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
>>>> index 401d0c1..b569308 100644
>>>> --- a/include/uapi/linux/pkt_cls.h
>>>> +++ b/include/uapi/linux/pkt_cls.h
>>>> @@ -405,6 +405,11 @@ enum {
>>>> 	TCA_FLOWER_KEY_UDP_SRC,		/* be16 */
>>>> 	TCA_FLOWER_KEY_UDP_DST,		/* be16 */
>>>>
>>>> +	TCA_FLOWER_KEY_PORT_SRC_MIN,	/* be16 */
>>>> +	TCA_FLOWER_KEY_PORT_SRC_MAX,	/* be16 */
>>>> +	TCA_FLOWER_KEY_PORT_DST_MIN,	/* be16 */
>>>> +	TCA_FLOWER_KEY_PORT_DST_MAX,	/* be16 */
>>>> +
>>>> 	TCA_FLOWER_FLAGS,
>>>> 	TCA_FLOWER_KEY_VLAN_ID,		/* be16 */
>>>> 	TCA_FLOWER_KEY_VLAN_PRIO,	/* u8   */
>>>> diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
>>>> index 9aada2d..5f135f0 100644
>>>> --- a/net/sched/cls_flower.c
>>>> +++ b/net/sched/cls_flower.c
>>>> @@ -55,6 +55,9 @@ struct fl_flow_key {
>>>> 	struct flow_dissector_key_ip ip;
>>>> 	struct flow_dissector_key_ip enc_ip;
>>>> 	struct flow_dissector_key_enc_opts enc_opts;
>>>> +
>>>> +	struct flow_dissector_key_ports tp_min;
>>>> +	struct flow_dissector_key_ports tp_max;
>>>> } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
>>>>
>>>> struct fl_flow_mask_range {
>>>> @@ -103,6 +106,11 @@ struct cls_fl_filter {
>>>> 	struct net_device *hw_dev;
>>>> };
>>>>
>>>> +enum fl_endpoint {
>>>> +	FLOWER_ENDPOINT_DST,
>>>> +	FLOWER_ENDPOINT_SRC
>>>> +};
>>>> +
>>>> static const struct rhashtable_params mask_ht_params = {
>>>> 	.key_offset = offsetof(struct fl_flow_mask, key),
>>>> 	.key_len = sizeof(struct fl_flow_key),
>>>> @@ -179,11 +187,86 @@ static void fl_clear_masked_range(struct fl_flow_key *key,
>>>> 	memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask));
>>>> }
>>>>
>>>> +static int fl_range_compare_params(struct cls_fl_filter *filter,
>>>> +				   struct fl_flow_key *key,
>>>> +				   struct fl_flow_key *mkey,
>>>> +				   enum fl_endpoint endpoint)
>>>> +{
>>>> +	__be16 min_mask, max_mask, min_val, max_val;
>>>> +
>>>> +	if (endpoint == FLOWER_ENDPOINT_DST) {
>>>> +		min_mask = htons(filter->mask->key.tp_min.dst);
>>>> +		max_mask = htons(filter->mask->key.tp_max.dst);
>>>> +		min_val = htons(filter->key.tp_min.dst);
>>>> +		max_val = htons(filter->key.tp_max.dst);
>>>> +
>>>> +		if (min_mask && max_mask) {
>>>> +			if (htons(key->tp.dst) < min_val ||
>>>> +			    htons(key->tp.dst) > max_val)
>>>> +				return -1;
>>>> +
>>>> +			/* skb does not have min and max values */
>>>> +			mkey->tp_min.dst = filter->mkey.tp_min.dst;
>>>> +			mkey->tp_max.dst = filter->mkey.tp_max.dst;
>>>> +		}
>>>> +	} else {
>>>> +		min_mask = htons(filter->mask->key.tp_min.src);
>>>> +		max_mask = htons(filter->mask->key.tp_max.src);
>>>> +		min_val = htons(filter->key.tp_min.src);
>>>> +		max_val = htons(filter->key.tp_max.src);
>>>> +
>>>> +		if (min_mask && max_mask) {
>>>> +			if (htons(key->tp.src) < min_val ||
>>>> +			    htons(key->tp.src) > max_val)
>>>> +				return -1;
>>>> +
>>>> +			/* skb does not have min and max values */
>>>> +			mkey->tp_min.src = filter->mkey.tp_min.src;
>>>> +			mkey->tp_max.src = filter->mkey.tp_max.src;
>>>> +		}
>>>
>>> You basically have 2 functions in 1 here. Just have 2 functions:
>>> fl_port_range_dst_cmp()
>>> and
>>> fl_port_range_src_cmp()
>>>
>>> And avoid the "endpoint enum.
>>> Also, as you return -1 or 0, just make it bool.
>>>
>>
>> Makes sense. Will do.
>>
>>>
>>>> +	}
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +static struct cls_fl_filter *fl_lookup_range(struct fl_flow_mask *mask,
>>>> +					     struct fl_flow_key *mkey,
>>>> +					     struct fl_flow_key *key)
>>>> +{
>>>> +	struct cls_fl_filter *filter, *f;
>>>> +	int ret;
>>>> +
>>>> +	list_for_each_entry_rcu(filter, &mask->filters, list) {
>>>> +		ret = fl_range_compare_params(filter, key, mkey,
>>>> +					      FLOWER_ENDPOINT_DST);
>>>> +		if (ret < 0)
>>>> +			continue;
>>>> +
>>>> +		ret = fl_range_compare_params(filter, key, mkey,
>>>> +					      FLOWER_ENDPOINT_SRC);
>>>> +		if (ret < 0)
>>>> +			continue;
>>>> +
>>>> +		f = rhashtable_lookup_fast(&mask->ht,
>>>> +					   fl_key_get_start(mkey, mask),
>>>> +					   mask->filter_ht_params);
>>>> +		if (f)
>>>> +			return f;
>>>> +	}
>>>> +	return NULL;
>>>> +}
>>>> +
>>>> static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
>>>> -				       struct fl_flow_key *mkey)
>>>> +				       struct fl_flow_key *mkey,
>>>> +				       struct fl_flow_key *key, bool is_skb)
>>>> {
>>>> -	return rhashtable_lookup_fast(&mask->ht, fl_key_get_start(mkey, mask),
>>>> -				      mask->filter_ht_params);
>>>> +	if ((!(mask->key.tp_min.dst && mask->key.tp_max.dst) &&
>>>> +	     !(mask->key.tp_min.src && mask->key.tp_max.src)) || !is_skb) {
>>>
>>> Would be probably good to have a dedicated bit to check for and decide
>>> if you do normal/range lookup. This is fast path. 
>>>
>>
>> Will fix in v2.
>>
>>>
>>>> +		return  rhashtable_lookup_fast(&mask->ht,
>>>
>>> Remove double space   ^^
>>>
>>
>> Will fix in v2.
>>
>>>
>>>> +					       fl_key_get_start(mkey, mask),
>>>> +					       mask->filter_ht_params);
>>>> +	}
>>>> +	/* Classify based on range */
>>>> +	return fl_lookup_range(mask, mkey, key);
>>>> }
>>>>
>>>> static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
>>>> @@ -207,8 +290,8 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
>>>> 		skb_flow_dissect(skb, &mask->dissector, &skb_key, 0);
>>>>
>>>> 		fl_set_masked_key(&skb_mkey, &skb_key, mask);
>>>> +		f = fl_lookup(mask, &skb_mkey, &skb_key, true);
>>>>
>>>> -		f = fl_lookup(mask, &skb_mkey);
>>>> 		if (f && !tc_skip_sw(f->flags)) {
>>>> 			*res = f->res;
>>>> 			return tcf_exts_exec(skb, &f->exts, res);
>>>> @@ -909,6 +992,23 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
>>>> 			       sizeof(key->arp.tha));
>>>> 	}
>>>>
>>>> +	if (key->basic.ip_proto == IPPROTO_TCP ||
>>>> +	    key->basic.ip_proto == IPPROTO_UDP ||
>>>> +	    key->basic.ip_proto == IPPROTO_SCTP) {
>>>> +		fl_set_key_val(tb, &key->tp_min.dst,
>>>> +			       TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_min.dst,
>>>> +			       TCA_FLOWER_UNSPEC, sizeof(key->tp_min.dst));
>>>> +		fl_set_key_val(tb, &key->tp_max.dst,
>>>> +			       TCA_FLOWER_KEY_PORT_DST_MAX, &mask->tp_max.dst,
>>>> +			       TCA_FLOWER_UNSPEC, sizeof(key->tp_max.dst));
>>>> +		fl_set_key_val(tb, &key->tp_min.src,
>>>> +			       TCA_FLOWER_KEY_PORT_SRC_MIN, &mask->tp_min.src,
>>>> +			       TCA_FLOWER_UNSPEC, sizeof(key->tp_min.src));
>>>> +		fl_set_key_val(tb, &key->tp_max.src,
>>>> +			       TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_max.src,
>>>> +			       TCA_FLOWER_UNSPEC, sizeof(key->tp_max.src));
>>>> +	}
>>>> +
>>>> 	if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] ||
>>>> 	    tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) {
>>>> 		key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
>>>> @@ -1026,8 +1126,7 @@ static void fl_init_dissector(struct flow_dissector *dissector,
>>>> 			     FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4);
>>>> 	FL_KEY_SET_IF_MASKED(mask, keys, cnt,
>>>> 			     FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6);
>>>> -	FL_KEY_SET_IF_MASKED(mask, keys, cnt,
>>>> -			     FLOW_DISSECTOR_KEY_PORTS, tp);
>>>> +	FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp);
>>>> 	FL_KEY_SET_IF_MASKED(mask, keys, cnt,
>>>> 			     FLOW_DISSECTOR_KEY_IP, ip);
>>>> 	FL_KEY_SET_IF_MASKED(mask, keys, cnt,
>>>> @@ -1227,7 +1326,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
>>>> 		goto errout_idr;
>>>>
>>>> 	if (!tc_skip_sw(fnew->flags)) {
>>>> -		if (!fold && fl_lookup(fnew->mask, &fnew->mkey)) {
>>>> +		if (!fold && fl_lookup(fnew->mask, &fnew->mkey, NULL, false)) {
>>>
>>>
>>> I don't undestand why do you need the "is_skb" arg here. Could you
>>> please explain?
>>>
>>> Thanks!
>>>
>>
>> The reason to keep the 'is_skb' arg is because, fl_lookup is called in
>> two cases, one for skb classification and another for checking if a
>> filter exists every-time a new filter is added. In case of skb
>> classification, we need to go through the range-comparator to decide if
>> the skb's port-value falls within the range-filter's min and max limits.
>> In case of filter validation, the range-filter that we are trying to add
>> will have min and max values, and we are validating it against other
>> range-filters with min and max values. So, rhashtable lookup will
>> suffice here and there is no need to go through the range-comparator in
>> this case. In the above code, we are validating if a range-filter
>> exists, so 'is_skb' is false.
> 
> Got it. In that case, please just have a helper:
> static struct cls_fl_filter *__fl_lookup(struct fl_flow_mask *mask,
> 					 struct fl_flow_key *mkey)
> {
> 	return rhashtable_lookup_fast(&mask->ht, fl_key_get_start(mkey, mask),
> 				      mask->filter_ht_params);
> }
> 
> Call this helper from both fl_lookup() and fl_change()
> 

Alright. Will fix.

> 
>>
>>>
>>>> 			err = -EEXIST;
>>>> 			goto errout_mask;
>>>> 		}
>>>> @@ -1800,6 +1899,27 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net,
>>>> 				  sizeof(key->arp.tha))))
>>>> 		goto nla_put_failure;
>>>>
>>>> +	if ((key->basic.ip_proto == IPPROTO_TCP ||
>>>> +	     key->basic.ip_proto == IPPROTO_UDP ||
> 
>>>> +	     key->basic.ip_proto == IPPROTO_SCTP) &&
>>>> +	     (fl_dump_key_val(skb, &key->tp_min.dst,
>>>> +			      TCA_FLOWER_KEY_PORT_DST_MIN,
>>>> +			      &mask->tp_min.dst, TCA_FLOWER_UNSPEC,
>>>> +			      sizeof(key->tp_min.dst)) ||
>>>> +	      fl_dump_key_val(skb, &key->tp_max.dst,
>>>> +			      TCA_FLOWER_KEY_PORT_DST_MAX,
>>>> +			      &mask->tp_max.dst, TCA_FLOWER_UNSPEC,
>>>> +			      sizeof(key->tp_max.dst)) ||
>>>> +	      fl_dump_key_val(skb, &key->tp_min.src,
>>>> +			      TCA_FLOWER_KEY_PORT_SRC_MIN,
>>>> +			      &mask->tp_min.src, TCA_FLOWER_UNSPEC,
>>>> +			      sizeof(key->tp_min.src)) ||
>>>> +	      fl_dump_key_val(skb, &key->tp_max.src,
>>>> +			      TCA_FLOWER_KEY_PORT_SRC_MAX,
>>>> +			      &mask->tp_max.src, TCA_FLOWER_UNSPEC,
>>>> +			      sizeof(key->tp_max.src))))
>>>> +		goto nla_put_failure;
>>>> +
>>>> 	if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
>>>> 	    (fl_dump_key_val(skb, &key->enc_ipv4.src,
>>>> 			    TCA_FLOWER_KEY_ENC_IPV4_SRC, &mask->enc_ipv4.src,
>>>>

^ permalink raw reply

* [PATCH net-next] SUNRPC: drop pointless static qualifier in xdr_get_next_encode_buffer()
From: YueHaibing @ 2018-11-08  2:04 UTC (permalink / raw)
  To: Trond Myklebust, Anna Schumaker, J. Bruce Fields, David Miller,
	Jeff Layton
  Cc: YueHaibing, linux-nfs, netdev, kernel-janitors

There is no need to have the '__be32 *p' variable static since new value
always be assigned before use it.

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
---
 net/sunrpc/xdr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 2bbb8d3..d80b156 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -546,7 +546,7 @@ void xdr_commit_encode(struct xdr_stream *xdr)
 static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr,
 		size_t nbytes)
 {
-	static __be32 *p;
+	__be32 *p;
 	int space_left;
 	int frag1bytes, frag2bytes;

^ permalink raw reply related

* Re: [PATCH 2/5] VSOCK: support fill data to mergeable rx buffer in host
From: jiangyiwen @ 2018-11-08  1:56 UTC (permalink / raw)
  To: Jason Wang, stefanha, stefanha; +Cc: netdev, kvm, virtualization
In-Reply-To: <76732898-8e4f-910a-aac5-ea4b635a1c15@redhat.com>

On 2018/11/7 21:32, Jason Wang wrote:
> 
> On 2018/11/7 下午3:11, jiangyiwen wrote:
>> On 2018/11/7 14:18, Jason Wang wrote:
>>> On 2018/11/6 下午2:30, jiangyiwen wrote:
>>>>> Seems duplicated with the one used by vhost-net.
>>>>>
>>>>> In packed virtqueue implementation, I plan to move this to vhost.c.
>>>>>
>>>> Yes, this code is full copied from vhost-net, if it can be packed into
>>>> vhost.c, it would be great.
>>>>
>>> If you try to reuse vhost-net, you don't even need to care about this:)
>>>
>>> Thanks
>>>
>>>
>>> .
>>>
>> Hi Jason,
>>
>> Thank your advice, I will consider your idea. But I don't know
>> what's stefan's suggestion? It seems that he doesn't care much
>> about this community.:(
> 
> 
> I think not. He is probably busy these days.
> 
> 
>>
>> I still hope this community can have some vitality.
>>
> 
> Let's wait for few more days for the possible comments from Stefan or Michael. But I do prefer to unify the virtio networking datapath which will be easier to be extended and maintained.
> 
> Thanks
> 
> 
> .
> 

Hi Jason,

Actually vsock use virtio-net as transport path should be a better idea,
I will try to consider the new idea.

Thanks,
Yiwen.

^ permalink raw reply

* RE:(2) [Kernel][NET] Bug report on packet defragmenting
From: 배석진 @ 2018-11-08  2:05 UTC (permalink / raw)
  To: Eric Dumazet, netdev@vger.kernel.org
In-Reply-To: <91b43bec-cb19-b94b-8ee3-26979e3a19d1@gmail.com>

 --------- Original Message ---------
Sender : Eric Dumazet <eric.dumazet@gmail.com>
Date   : 2018-11-08 10:44 (GMT+9)
Title  : Re: [Kernel][NET] Bug report on packet defragmenting
 
> On 11/07/2018 05:29 PM, 배석진 wrote:
>  
> > If ipv6_defrag hook is not excuted simultaneously, then it's ok.
> > ipv6_defrag hook can handle that. [exam 3]
>  
> This seems wrong.
>  
> This is the root cause, we should not try to work around it but fix it.
>  
> There is no guarantee that RSS/RPS/RFS can help here, packets can sit in per-cpu
> backlogs long enough to reproduce the issue, if RX queues interrupts are spread
> over many cpus.
 

Dear Dumazet,

Even if rx irq be spread to overal cpu, hash will be made by src/des address.
then they'll have a same hash and cpu. is it not enough?
Did you mean that we need total solution for all steering method? not just only RPS?

Best regards.



^ permalink raw reply

* Re: [PATCH net-next v2 3/5] virtio_ring: add packed ring support
From: Tiwei Bie @ 2018-11-08 11:51 UTC (permalink / raw)
  To: Jason Wang
  Cc: Michael S. Tsirkin, virtualization, linux-kernel, netdev,
	virtio-dev, wexu, jfreimann
In-Reply-To: <2d46a41e-bc00-276a-e19a-105c9dffc75a@redhat.com>

On Thu, Nov 08, 2018 at 04:18:25PM +0800, Jason Wang wrote:
> 
> On 2018/11/8 上午9:38, Tiwei Bie wrote:
> > > > +
> > > > +	if (vq->vq.num_free < descs_used) {
> > > > +		pr_debug("Can't add buf len %i - avail = %i\n",
> > > > +			 descs_used, vq->vq.num_free);
> > > > +		/* FIXME: for historical reasons, we force a notify here if
> > > > +		 * there are outgoing parts to the buffer.  Presumably the
> > > > +		 * host should service the ring ASAP. */
> > > I don't think we have a reason to do this for packed ring.
> > > No historical baggage there, right?
> > Based on the original commit log, it seems that the notify here
> > is just an "optimization". But I don't quite understand what does
> > the "the heuristics which KVM uses" refer to. If it's safe to drop
> > this in packed ring, I'd like to do it.
> 
> 
> According to the commit log, it seems like a workaround of lguest networking
> backend.

Do you know why removing this notify in Tx will break "the
heuristics which KVM uses"? Or what does "the heuristics
which KVM uses" refer to?


> I agree to drop it, we should not have such burden.
> 
> But we should notice that, with this removed, the compare between packed vs
> split is kind of unfair. Consider the removal of lguest support recently,
> maybe we can drop this for split ring as well?
> 
> Thanks
> 
> 
> > 
> > commit 44653eae1407f79dff6f52fcf594ae84cb165ec4
> > Author: Rusty Russell<rusty@rustcorp.com.au>
> > Date:   Fri Jul 25 12:06:04 2008 -0500
> > 
> >      virtio: don't always force a notification when ring is full
> >      We force notification when the ring is full, even if the host has
> >      indicated it doesn't want to know.  This seemed like a good idea at
> >      the time: if we fill the transmit ring, we should tell the host
> >      immediately.
> >      Unfortunately this logic also applies to the receiving ring, which is
> >      refilled constantly.  We should introduce real notification thesholds
> >      to replace this logic.  Meanwhile, removing the logic altogether breaks
> >      the heuristics which KVM uses, so we use a hack: only notify if there are
> >      outgoing parts of the new buffer.
> >      Here are the number of exits with lguest's crappy network implementation:
> >      Before:
> >              network xmit 7859051 recv 236420
> >      After:
> >              network xmit 7858610 recv 118136
> >      Signed-off-by: Rusty Russell<rusty@rustcorp.com.au>
> > 
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index 72bf8bc09014..21d9a62767af 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -87,8 +87,11 @@ static int vring_add_buf(struct virtqueue *_vq,
> >   	if (vq->num_free < out + in) {
> >   		pr_debug("Can't add buf len %i - avail = %i\n",
> >   			 out + in, vq->num_free);
> > -		/* We notify*even if*  VRING_USED_F_NO_NOTIFY is set here. */
> > -		vq->notify(&vq->vq);
> > +		/* FIXME: for historical reasons, we force a notify here if
> > +		 * there are outgoing parts to the buffer.  Presumably the
> > +		 * host should service the ring ASAP. */
> > +		if (out)
> > +			vq->notify(&vq->vq);
> >   		END_USE(vq);
> >   		return -ENOSPC;
> >   	}
> > 
> > 

^ permalink raw reply

* [net-next PATCH v2] net: sched: cls_flower: Classify packets using port ranges
From: Amritha Nambiar @ 2018-11-07 21:22 UTC (permalink / raw)
  To: netdev, davem
  Cc: jakub.kicinski, amritha.nambiar, sridhar.samudrala, jhs,
	xiyou.wangcong, jiri

Added support in tc flower for filtering based on port ranges.

Example:
1. Match on a port range:
-------------------------
$ tc filter add dev enp4s0 protocol ip parent ffff:\
  prio 1 flower ip_proto tcp dst_port range 20-30 skip_hw\
  action drop

$ tc -s filter show dev enp4s0 parent ffff:
filter protocol ip pref 1 flower chain 0
filter protocol ip pref 1 flower chain 0 handle 0x1
  eth_type ipv4
  ip_proto tcp
  dst_port range 20-30
  skip_hw
  not_in_hw
        action order 1: gact action drop
         random type none pass val 0
         index 1 ref 1 bind 1 installed 85 sec used 3 sec
        Action statistics:
        Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0)
        backlog 0b 0p requeues 0

2. Match on IP address and port range:
--------------------------------------
$ tc filter add dev enp4s0 protocol ip parent ffff:\
  prio 1 flower dst_ip 192.168.1.1 ip_proto tcp dst_port range 100-200\
  skip_hw action drop

$ tc -s filter show dev enp4s0 parent ffff:
filter protocol ip pref 1 flower chain 0 handle 0x2
  eth_type ipv4
  ip_proto tcp
  dst_ip 192.168.1.1
  dst_port range 100-200
  skip_hw
  not_in_hw
        action order 1: gact action drop
         random type none pass val 0
         index 2 ref 1 bind 1 installed 58 sec used 2 sec
        Action statistics:
        Sent 920 bytes 20 pkt (dropped 20, overlimits 0 requeues 0)
        backlog 0b 0p requeues 0

v2:
Addressed Jiri's comments:
1. Added separate functions for dst and src comparisons.
2. Removed endpoint enum.
3. Added new bit TCA_FLOWER_FLAGS_RANGE to decide normal/range
  lookup.
4. Cleaned up fl_lookup function.

Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
---
 include/uapi/linux/pkt_cls.h |    7 ++
 net/sched/cls_flower.c       |  133 ++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 134 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 401d0c1..b63c3cf 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -405,6 +405,11 @@ enum {
 	TCA_FLOWER_KEY_UDP_SRC,		/* be16 */
 	TCA_FLOWER_KEY_UDP_DST,		/* be16 */
 
+	TCA_FLOWER_KEY_PORT_SRC_MIN,	/* be16 */
+	TCA_FLOWER_KEY_PORT_SRC_MAX,	/* be16 */
+	TCA_FLOWER_KEY_PORT_DST_MIN,	/* be16 */
+	TCA_FLOWER_KEY_PORT_DST_MAX,	/* be16 */
+
 	TCA_FLOWER_FLAGS,
 	TCA_FLOWER_KEY_VLAN_ID,		/* be16 */
 	TCA_FLOWER_KEY_VLAN_PRIO,	/* u8   */
@@ -518,6 +523,8 @@ enum {
 	TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1),
 };
 
+#define TCA_FLOWER_MASK_FLAGS_RANGE	(1 << 0) /* Range-based match */
+
 /* Match-all classifier */
 
 enum {
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 9aada2d..9d2582d 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -55,6 +55,9 @@ struct fl_flow_key {
 	struct flow_dissector_key_ip ip;
 	struct flow_dissector_key_ip enc_ip;
 	struct flow_dissector_key_enc_opts enc_opts;
+
+	struct flow_dissector_key_ports tp_min;
+	struct flow_dissector_key_ports tp_max;
 } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
 
 struct fl_flow_mask_range {
@@ -65,6 +68,7 @@ struct fl_flow_mask_range {
 struct fl_flow_mask {
 	struct fl_flow_key key;
 	struct fl_flow_mask_range range;
+	u32 flags;
 	struct rhash_head ht_node;
 	struct rhashtable ht;
 	struct rhashtable_params filter_ht_params;
@@ -179,13 +183,89 @@ static void fl_clear_masked_range(struct fl_flow_key *key,
 	memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask));
 }
 
-static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
-				       struct fl_flow_key *mkey)
+static bool fl_range_port_dst_cmp(struct cls_fl_filter *filter,
+				  struct fl_flow_key *key,
+				  struct fl_flow_key *mkey)
+{
+	__be16 min_mask, max_mask, min_val, max_val;
+
+	min_mask = htons(filter->mask->key.tp_min.dst);
+	max_mask = htons(filter->mask->key.tp_max.dst);
+	min_val = htons(filter->key.tp_min.dst);
+	max_val = htons(filter->key.tp_max.dst);
+
+	if (min_mask && max_mask) {
+		if (htons(key->tp.dst) < min_val ||
+		    htons(key->tp.dst) > max_val)
+			return false;
+
+		/* skb does not have min and max values */
+		mkey->tp_min.dst = filter->mkey.tp_min.dst;
+		mkey->tp_max.dst = filter->mkey.tp_max.dst;
+	}
+	return true;
+}
+
+static bool fl_range_port_src_cmp(struct cls_fl_filter *filter,
+				  struct fl_flow_key *key,
+				  struct fl_flow_key *mkey)
+{
+	__be16 min_mask, max_mask, min_val, max_val;
+
+	min_mask = htons(filter->mask->key.tp_min.src);
+	max_mask = htons(filter->mask->key.tp_max.src);
+	min_val = htons(filter->key.tp_min.src);
+	max_val = htons(filter->key.tp_max.src);
+
+	if (min_mask && max_mask) {
+		if (htons(key->tp.src) < min_val ||
+		    htons(key->tp.src) > max_val)
+			return false;
+
+		/* skb does not have min and max values */
+		mkey->tp_min.src = filter->mkey.tp_min.src;
+		mkey->tp_max.src = filter->mkey.tp_max.src;
+	}
+	return true;
+}
+
+static struct cls_fl_filter *__fl_lookup(struct fl_flow_mask *mask,
+					 struct fl_flow_key *mkey)
 {
 	return rhashtable_lookup_fast(&mask->ht, fl_key_get_start(mkey, mask),
 				      mask->filter_ht_params);
 }
 
+static struct cls_fl_filter *fl_lookup_range(struct fl_flow_mask *mask,
+					     struct fl_flow_key *mkey,
+					     struct fl_flow_key *key)
+{
+	struct cls_fl_filter *filter, *f;
+
+	list_for_each_entry_rcu(filter, &mask->filters, list) {
+		if (!fl_range_port_dst_cmp(filter, key, mkey))
+			continue;
+
+		if (!fl_range_port_src_cmp(filter, key, mkey))
+			continue;
+
+		f = __fl_lookup(mask, mkey);
+		if (f)
+			return f;
+	}
+	return NULL;
+}
+
+static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask,
+				       struct fl_flow_key *mkey,
+				       struct fl_flow_key *key)
+{
+	if ((mask->flags & TCA_FLOWER_MASK_FLAGS_RANGE))
+		return fl_lookup_range(mask, mkey, key);
+
+	return __fl_lookup(mask, mkey);
+}
+
 static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		       struct tcf_result *res)
 {
@@ -207,8 +287,8 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		skb_flow_dissect(skb, &mask->dissector, &skb_key, 0);
 
 		fl_set_masked_key(&skb_mkey, &skb_key, mask);
+		f = fl_lookup(mask, &skb_mkey, &skb_key);
 
-		f = fl_lookup(mask, &skb_mkey);
 		if (f && !tc_skip_sw(f->flags)) {
 			*res = f->res;
 			return tcf_exts_exec(skb, &f->exts, res);
@@ -909,6 +989,23 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
 			       sizeof(key->arp.tha));
 	}
 
+	if (key->basic.ip_proto == IPPROTO_TCP ||
+	    key->basic.ip_proto == IPPROTO_UDP ||
+	    key->basic.ip_proto == IPPROTO_SCTP) {
+		fl_set_key_val(tb, &key->tp_min.dst,
+			       TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_min.dst,
+			       TCA_FLOWER_UNSPEC, sizeof(key->tp_min.dst));
+		fl_set_key_val(tb, &key->tp_max.dst,
+			       TCA_FLOWER_KEY_PORT_DST_MAX, &mask->tp_max.dst,
+			       TCA_FLOWER_UNSPEC, sizeof(key->tp_max.dst));
+		fl_set_key_val(tb, &key->tp_min.src,
+			       TCA_FLOWER_KEY_PORT_SRC_MIN, &mask->tp_min.src,
+			       TCA_FLOWER_UNSPEC, sizeof(key->tp_min.src));
+		fl_set_key_val(tb, &key->tp_max.src,
+			       TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_max.src,
+			       TCA_FLOWER_UNSPEC, sizeof(key->tp_max.src));
+	}
+
 	if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] ||
 	    tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) {
 		key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
@@ -1026,8 +1123,7 @@ static void fl_init_dissector(struct flow_dissector *dissector,
 			     FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4);
 	FL_KEY_SET_IF_MASKED(mask, keys, cnt,
 			     FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6);
-	FL_KEY_SET_IF_MASKED(mask, keys, cnt,
-			     FLOW_DISSECTOR_KEY_PORTS, tp);
+	FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp);
 	FL_KEY_SET_IF_MASKED(mask, keys, cnt,
 			     FLOW_DISSECTOR_KEY_IP, ip);
 	FL_KEY_SET_IF_MASKED(mask, keys, cnt,
@@ -1074,6 +1170,10 @@ static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head,
 
 	fl_mask_copy(newmask, mask);
 
+	if ((newmask->key.tp_min.dst && newmask->key.tp_max.dst) ||
+	    (newmask->key.tp_min.src && newmask->key.tp_max.src))
+		newmask->flags |= TCA_FLOWER_MASK_FLAGS_RANGE;
+
 	err = fl_init_mask_hashtable(newmask);
 	if (err)
 		goto errout_free;
@@ -1227,7 +1327,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 		goto errout_idr;
 
 	if (!tc_skip_sw(fnew->flags)) {
-		if (!fold && fl_lookup(fnew->mask, &fnew->mkey)) {
+		if (!fold && __fl_lookup(fnew->mask, &fnew->mkey)) {
 			err = -EEXIST;
 			goto errout_mask;
 		}
@@ -1800,6 +1900,27 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net,
 				  sizeof(key->arp.tha))))
 		goto nla_put_failure;
 
+	if ((key->basic.ip_proto == IPPROTO_TCP ||
+	     key->basic.ip_proto == IPPROTO_UDP ||
+	     key->basic.ip_proto == IPPROTO_SCTP) &&
+	     (fl_dump_key_val(skb, &key->tp_min.dst,
+			      TCA_FLOWER_KEY_PORT_DST_MIN,
+			      &mask->tp_min.dst, TCA_FLOWER_UNSPEC,
+			      sizeof(key->tp_min.dst)) ||
+	      fl_dump_key_val(skb, &key->tp_max.dst,
+			      TCA_FLOWER_KEY_PORT_DST_MAX,
+			      &mask->tp_max.dst, TCA_FLOWER_UNSPEC,
+			      sizeof(key->tp_max.dst)) ||
+	      fl_dump_key_val(skb, &key->tp_min.src,
+			      TCA_FLOWER_KEY_PORT_SRC_MIN,
+			      &mask->tp_min.src, TCA_FLOWER_UNSPEC,
+			      sizeof(key->tp_min.src)) ||
+	      fl_dump_key_val(skb, &key->tp_max.src,
+			      TCA_FLOWER_KEY_PORT_SRC_MAX,
+			      &mask->tp_max.src, TCA_FLOWER_UNSPEC,
+			      sizeof(key->tp_max.src))))
+		goto nla_put_failure;
+
 	if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
 	    (fl_dump_key_val(skb, &key->enc_ipv4.src,
 			    TCA_FLOWER_KEY_ENC_IPV4_SRC, &mask->enc_ipv4.src,

^ permalink raw reply related

* [iproute2 PATCH v2] tc: flower: Classify packets based port ranges
From: Amritha Nambiar @ 2018-11-07 21:22 UTC (permalink / raw)
  To: stephen, netdev
  Cc: jakub.kicinski, amritha.nambiar, sridhar.samudrala, jhs,
	xiyou.wangcong, jiri

Added support for filtering based on port ranges.

Example:
1. Match on a port range:
-------------------------
$ tc filter add dev enp4s0 protocol ip parent ffff:\
  prio 1 flower ip_proto tcp dst_port range 20-30 skip_hw\
  action drop

$ tc -s filter show dev enp4s0 parent ffff:
filter protocol ip pref 1 flower chain 0
filter protocol ip pref 1 flower chain 0 handle 0x1
  eth_type ipv4
  ip_proto tcp
  dst_port range 20-30
  skip_hw
  not_in_hw
        action order 1: gact action drop
         random type none pass val 0
         index 1 ref 1 bind 1 installed 85 sec used 3 sec
        Action statistics:
        Sent 460 bytes 10 pkt (dropped 10, overlimits 0 requeues 0)
        backlog 0b 0p requeues 0

2. Match on IP address and port range:
--------------------------------------
$ tc filter add dev enp4s0 protocol ip parent ffff:\
  prio 1 flower dst_ip 192.168.1.1 ip_proto tcp dst_port range 100-200\
  skip_hw action drop

$ tc -s filter show dev enp4s0 parent ffff:
filter protocol ip pref 1 flower chain 0 handle 0x2
  eth_type ipv4
  ip_proto tcp
  dst_ip 192.168.1.1
  dst_port range 100-200
  skip_hw
  not_in_hw
        action order 1: gact action drop
         random type none pass val 0
         index 2 ref 1 bind 1 installed 58 sec used 2 sec
        Action statistics:
        Sent 920 bytes 20 pkt (dropped 20, overlimits 0 requeues 0)
        backlog 0b 0p requeues 0

v2:
Addressed Jiri's comment to sync output format with input

Signed-off-by: Amritha Nambiar <amritha.nambiar@intel.com>
---
 include/uapi/linux/pkt_cls.h |    7 ++
 tc/f_flower.c                |  145 +++++++++++++++++++++++++++++++++++++++---
 2 files changed, 142 insertions(+), 10 deletions(-)

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 401d0c1..b63c3cf 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -405,6 +405,11 @@ enum {
 	TCA_FLOWER_KEY_UDP_SRC,		/* be16 */
 	TCA_FLOWER_KEY_UDP_DST,		/* be16 */
 
+	TCA_FLOWER_KEY_PORT_SRC_MIN,	/* be16 */
+	TCA_FLOWER_KEY_PORT_SRC_MAX,	/* be16 */
+	TCA_FLOWER_KEY_PORT_DST_MIN,	/* be16 */
+	TCA_FLOWER_KEY_PORT_DST_MAX,	/* be16 */
+
 	TCA_FLOWER_FLAGS,
 	TCA_FLOWER_KEY_VLAN_ID,		/* be16 */
 	TCA_FLOWER_KEY_VLAN_PRIO,	/* u8   */
@@ -518,6 +523,8 @@ enum {
 	TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1),
 };
 
+#define TCA_FLOWER_MASK_FLAGS_RANGE	(1 << 0) /* Range-based match */
+
 /* Match-all classifier */
 
 enum {
diff --git a/tc/f_flower.c b/tc/f_flower.c
index 65fca04..7724a1d 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -494,6 +494,66 @@ static int flower_parse_port(char *str, __u8 ip_proto,
 	return 0;
 }
 
+static int flower_port_range_attr_type(__u8 ip_proto, enum flower_endpoint type,
+				       __be16 *min_port_type,
+				       __be16 *max_port_type)
+{
+	if (ip_proto == IPPROTO_TCP || ip_proto == IPPROTO_UDP ||
+	    ip_proto == IPPROTO_SCTP) {
+		if (type == FLOWER_ENDPOINT_SRC) {
+			*min_port_type = TCA_FLOWER_KEY_PORT_SRC_MIN;
+			*max_port_type = TCA_FLOWER_KEY_PORT_SRC_MAX;
+		} else {
+			*min_port_type = TCA_FLOWER_KEY_PORT_DST_MIN;
+			*max_port_type = TCA_FLOWER_KEY_PORT_DST_MAX;
+		}
+	} else {
+		return -1;
+	}
+
+	return 0;
+}
+
+static int flower_parse_port_range(__be16 *min, __be16 *max, __u8 ip_proto,
+				   enum flower_endpoint endpoint,
+				   struct nlmsghdr *n)
+{
+	__be16 min_port_type, max_port_type;
+
+	flower_port_range_attr_type(ip_proto, endpoint, &min_port_type,
+				    &max_port_type);
+	addattr16(n, MAX_MSG, min_port_type, *min);
+	addattr16(n, MAX_MSG, max_port_type, *max);
+
+	return 0;
+}
+
+static int get_range(__be16 *min, __be16 *max, char *argv)
+{
+	char *r;
+
+	r = strchr(argv, '-');
+	if (r) {
+		*r = '\0';
+		if (get_be16(min, argv, 10)) {
+			fprintf(stderr, "invalid min range\n");
+			return -1;
+		}
+		if (get_be16(max, r + 1, 10)) {
+			fprintf(stderr, "invalid max range\n");
+			return -1;
+		}
+		if (htons(*max) <= htons(*min)) {
+			fprintf(stderr, "max value should be greater than min value\n");
+			return -1;
+		}
+	} else {
+		fprintf(stderr, "Illegal range format\n");
+		return -1;
+	}
+	return 0;
+}
+
 #define TCP_FLAGS_MAX_MASK 0xfff
 
 static int flower_parse_tcp_flags(char *str, int flags_type, int mask_type,
@@ -1061,20 +1121,54 @@ static int flower_parse_opt(struct filter_util *qu, char *handle,
 				return -1;
 			}
 		} else if (matches(*argv, "dst_port") == 0) {
+			__be16 min, max;
+
 			NEXT_ARG();
-			ret = flower_parse_port(*argv, ip_proto,
-						FLOWER_ENDPOINT_DST, n);
-			if (ret < 0) {
-				fprintf(stderr, "Illegal \"dst_port\"\n");
-				return -1;
+			if (matches(*argv, "range") == 0) {
+				NEXT_ARG();
+				ret = get_range(&min, &max, *argv);
+				if (ret < 0)
+					return -1;
+				ret = flower_parse_port_range(&min, &max,
+							      ip_proto,
+							      FLOWER_ENDPOINT_DST,
+							      n);
+				if (ret < 0) {
+					fprintf(stderr, "Illegal \"dst_port range\"\n");
+					return -1;
+				}
+			} else {
+				ret = flower_parse_port(*argv, ip_proto,
+							FLOWER_ENDPOINT_DST, n);
+				if (ret < 0) {
+					fprintf(stderr, "Illegal \"dst_port\"\n");
+					return -1;
+				}
 			}
 		} else if (matches(*argv, "src_port") == 0) {
+			__be16 min, max;
+
 			NEXT_ARG();
-			ret = flower_parse_port(*argv, ip_proto,
-						FLOWER_ENDPOINT_SRC, n);
-			if (ret < 0) {
-				fprintf(stderr, "Illegal \"src_port\"\n");
-				return -1;
+			if (matches(*argv, "range") == 0) {
+				NEXT_ARG();
+				ret = get_range(&min, &max, *argv);
+				if (ret < 0)
+					return -1;
+				ret = flower_parse_port_range(&min, &max,
+							      ip_proto,
+							      FLOWER_ENDPOINT_SRC,
+							      n);
+				if (ret < 0) {
+					fprintf(stderr, "Illegal \"src_port range\"\n");
+					return -1;
+				}
+			} else {
+				ret = flower_parse_port(*argv, ip_proto,
+							FLOWER_ENDPOINT_SRC, n);
+				if (ret < 0) {
+					fprintf(stderr, "Illegal \"src_port\"\n");
+					return -1;
+				}
 			}
 		} else if (matches(*argv, "tcp_flags") == 0) {
 			NEXT_ARG();
@@ -1490,6 +1584,22 @@ static void flower_print_port(char *name, struct rtattr *attr)
 	print_hu(PRINT_ANY, name, namefrm, rta_getattr_be16(attr));
 }
 
+static void flower_print_port_range(char *name, struct rtattr *min_attr,
+				    struct rtattr *max_attr)
+{
+	SPRINT_BUF(namefrm);
+	SPRINT_BUF(out);
+	size_t done;
+
+	if (!min_attr || !max_attr)
+		return;
+
+	done = sprintf(out, "%u", rta_getattr_be16(min_attr));
+	sprintf(out + done, "-%u", rta_getattr_be16(max_attr));
+	sprintf(namefrm, "\n  %s %%s", name);
+	print_string(PRINT_ANY, name, namefrm, out);
+}
+
 static void flower_print_tcp_flags(const char *name, struct rtattr *flags_attr,
 				   struct rtattr *mask_attr)
 {
@@ -1678,6 +1788,7 @@ static int flower_print_opt(struct filter_util *qu, FILE *f,
 			    struct rtattr *opt, __u32 handle)
 {
 	struct rtattr *tb[TCA_FLOWER_MAX + 1];
+	__be16 min_port_type, max_port_type;
 	int nl_type, nl_mask_type;
 	__be16 eth_type = 0;
 	__u8 ip_proto = 0xff;
@@ -1796,6 +1907,20 @@ static int flower_print_opt(struct filter_util *qu, FILE *f,
 	if (nl_type >= 0)
 		flower_print_port("src_port", tb[nl_type]);
 
+	if (flower_port_range_attr_type(ip_proto, FLOWER_ENDPOINT_DST,
+					&min_port_type, &max_port_type)
+	    == 0) {
+		flower_print_port_range("dst_port range",
+					tb[min_port_type], tb[max_port_type]);
+	}
+
+	if (flower_port_range_attr_type(ip_proto, FLOWER_ENDPOINT_SRC,
+					&min_port_type, &max_port_type)
+	    == 0) {
+		flower_print_port_range("src_port range",
+					tb[min_port_type], tb[max_port_type]);
+	}
+
 	flower_print_tcp_flags("tcp_flags", tb[TCA_FLOWER_KEY_TCP_FLAGS],
 			       tb[TCA_FLOWER_KEY_TCP_FLAGS_MASK]);
 

^ permalink raw reply related

* [PATCH net-next v3 1/6] net/ncsi: Don't enable all channels when HWA available
From: Samuel Mendoza-Jonas @ 2018-11-08  2:49 UTC (permalink / raw)
  To: netdev
  Cc: Samuel Mendoza-Jonas, David S . Miller, Justin.Lee1, linux-kernel,
	openbmc
In-Reply-To: <20181108024909.9897-1-sam@mendozajonas.com>

NCSI hardware arbitration allows multiple packages to be enabled at once
and share the same wiring. If the NCSI driver recognises that HWA is
available it unconditionally enables all packages and channels; but that
is a configuration decision rather than something required by HWA.
Additionally the current implementation will not failover on link events
which can cause connectivity to be lost unless the interface is manually
bounced.

Retain basic HWA support but remove the separate configuration path to
enable all channels, leaving this to be handled by a later
implementation.

Signed-off-by: Samuel Mendoza-Jonas <sam@mendozajonas.com>
---
 net/ncsi/ncsi-aen.c    |  3 +--
 net/ncsi/ncsi-manage.c | 50 ++++--------------------------------------
 2 files changed, 5 insertions(+), 48 deletions(-)

diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c
index 25e483e8278b..65f47a648be3 100644
--- a/net/ncsi/ncsi-aen.c
+++ b/net/ncsi/ncsi-aen.c
@@ -86,8 +86,7 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
 	    !(state == NCSI_CHANNEL_ACTIVE && !(data & 0x1)))
 		return 0;
 
-	if (!(ndp->flags & NCSI_DEV_HWA) &&
-	    state == NCSI_CHANNEL_ACTIVE)
+	if (state == NCSI_CHANNEL_ACTIVE)
 		ndp->flags |= NCSI_DEV_RESHUFFLE;
 
 	ncsi_stop_channel_monitor(nc);
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index bfc43b28c7a6..d4e6e0f99097 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -113,10 +113,8 @@ static void ncsi_channel_monitor(struct timer_list *t)
 	default:
 		netdev_err(ndp->ndev.dev, "NCSI Channel %d timed out!\n",
 			   nc->id);
-		if (!(ndp->flags & NCSI_DEV_HWA)) {
-			ncsi_report_link(ndp, true);
-			ndp->flags |= NCSI_DEV_RESHUFFLE;
-		}
+		ncsi_report_link(ndp, true);
+		ndp->flags |= NCSI_DEV_RESHUFFLE;
 
 		ncsi_stop_channel_monitor(nc);
 
@@ -1050,35 +1048,6 @@ static bool ncsi_check_hwa(struct ncsi_dev_priv *ndp)
 	return false;
 }
 
-static int ncsi_enable_hwa(struct ncsi_dev_priv *ndp)
-{
-	struct ncsi_package *np;
-	struct ncsi_channel *nc;
-	unsigned long flags;
-
-	/* Move all available channels to processing queue */
-	spin_lock_irqsave(&ndp->lock, flags);
-	NCSI_FOR_EACH_PACKAGE(ndp, np) {
-		NCSI_FOR_EACH_CHANNEL(np, nc) {
-			WARN_ON_ONCE(nc->state != NCSI_CHANNEL_INACTIVE ||
-				     !list_empty(&nc->link));
-			ncsi_stop_channel_monitor(nc);
-			list_add_tail_rcu(&nc->link, &ndp->channel_queue);
-		}
-	}
-	spin_unlock_irqrestore(&ndp->lock, flags);
-
-	/* We can have no channels in extremely case */
-	if (list_empty(&ndp->channel_queue)) {
-		netdev_err(ndp->ndev.dev,
-			   "NCSI: No available channels for HWA\n");
-		ncsi_report_link(ndp, false);
-		return -ENOENT;
-	}
-
-	return ncsi_process_next_channel(ndp);
-}
-
 static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
 {
 	struct ncsi_dev *nd = &ndp->ndev;
@@ -1156,10 +1125,7 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp)
 		 */
 		if (!ndp->active_package) {
 			ndp->flags |= NCSI_DEV_PROBED;
-			if (ncsi_check_hwa(ndp))
-				ncsi_enable_hwa(ndp);
-			else
-				ncsi_choose_active_channel(ndp);
+			ncsi_choose_active_channel(ndp);
 			return;
 		}
 
@@ -1592,7 +1558,6 @@ EXPORT_SYMBOL_GPL(ncsi_register_dev);
 int ncsi_start_dev(struct ncsi_dev *nd)
 {
 	struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd);
-	int ret;
 
 	if (nd->state != ncsi_dev_state_registered &&
 	    nd->state != ncsi_dev_state_functional)
@@ -1604,14 +1569,7 @@ int ncsi_start_dev(struct ncsi_dev *nd)
 		return 0;
 	}
 
-	if (ndp->flags & NCSI_DEV_HWA) {
-		netdev_info(ndp->ndev.dev, "NCSI: Enabling HWA mode\n");
-		ret = ncsi_enable_hwa(ndp);
-	} else {
-		ret = ncsi_choose_active_channel(ndp);
-	}
-
-	return ret;
+	return ncsi_choose_active_channel(ndp);
 }
 EXPORT_SYMBOL_GPL(ncsi_start_dev);
 
-- 
2.19.1

^ permalink raw reply related

* [PATCH net-next v3 4/6] net/ncsi: Don't mark configured channels inactive
From: Samuel Mendoza-Jonas @ 2018-11-08  2:49 UTC (permalink / raw)
  To: netdev
  Cc: Samuel Mendoza-Jonas, David S . Miller, Justin.Lee1, linux-kernel,
	openbmc
In-Reply-To: <20181108024909.9897-1-sam@mendozajonas.com>

The concepts of a channel being 'active' and it having link are slightly
muddled in the NCSI driver. Tweak this slightly so that
NCSI_CHANNEL_ACTIVE represents a channel that has been configured and
enabled, and NCSI_CHANNEL_INACTIVE represents a de-configured channel.
This distinction is important because a channel can be 'active' but have
its link down; in this case the channel may still need to be configured
so that it may receive AEN link-state-change packets.

Signed-off-by: Samuel Mendoza-Jonas <sam@mendozajonas.com>
---
 net/ncsi/ncsi-aen.c    | 17 +++++++++++------
 net/ncsi/ncsi-manage.c |  3 +--
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c
index 65f47a648be3..57f77e5d381a 100644
--- a/net/ncsi/ncsi-aen.c
+++ b/net/ncsi/ncsi-aen.c
@@ -57,6 +57,7 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
 	int state;
 	unsigned long old_data, data;
 	unsigned long flags;
+	bool had_link, has_link;
 
 	/* Find the NCSI channel */
 	ncsi_find_package_and_channel(ndp, h->common.channel, NULL, &nc);
@@ -73,6 +74,9 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
 	ncm->data[2] = data;
 	ncm->data[4] = ntohl(lsc->oem_status);
 
+	had_link = !!(old_data & 0x1);
+	has_link = !!(data & 0x1);
+
 	netdev_dbg(ndp->ndev.dev, "NCSI: LSC AEN - channel %u state %s\n",
 		   nc->id, data & 0x1 ? "up" : "down");
 
@@ -80,15 +84,16 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp,
 	state = nc->state;
 	spin_unlock_irqrestore(&nc->lock, flags);
 
-	if (!((old_data ^ data) & 0x1) || chained)
-		return 0;
-	if (!(state == NCSI_CHANNEL_INACTIVE && (data & 0x1)) &&
-	    !(state == NCSI_CHANNEL_ACTIVE && !(data & 0x1)))
+	if (state == NCSI_CHANNEL_INACTIVE)
+		netdev_warn(ndp->ndev.dev,
+			    "NCSI: Inactive channel %u received AEN!\n",
+			    nc->id);
+
+	if ((had_link == has_link) || chained)
 		return 0;
 
-	if (state == NCSI_CHANNEL_ACTIVE)
+	if (had_link)
 		ndp->flags |= NCSI_DEV_RESHUFFLE;
-
 	ncsi_stop_channel_monitor(nc);
 	spin_lock_irqsave(&ndp->lock, flags);
 	list_add_tail_rcu(&nc->link, &ndp->channel_queue);
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index b8b4e765a04c..b9de5b78c4e9 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -916,12 +916,11 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp)
 			break;
 		}
 
+		nc->state = NCSI_CHANNEL_ACTIVE;
 		if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) {
 			hot_nc = nc;
-			nc->state = NCSI_CHANNEL_ACTIVE;
 		} else {
 			hot_nc = NULL;
-			nc->state = NCSI_CHANNEL_INACTIVE;
 			netdev_dbg(ndp->ndev.dev,
 				   "NCSI: channel %u link down after config\n",
 				   nc->id);
-- 
2.19.1

^ permalink raw reply related

* [PATCH][net-next] openvswitch: remove BUG_ON from get_dpdev
From: Li RongQing @ 2018-11-08 12:40 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA, pshelar-LZ6Gd1LRuIk,
	dev-yBygre7rU0TnMu66kgdUjQ

if local is NULL pointer, and the following access of local's
dev will trigger panic, which is same as BUG_ON

Signed-off-by: Li RongQing <lirongqing-h1bp6feCCZcAvxtiuMwx3w@public.gmane.org>
---
 net/openvswitch/vport-netdev.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index 2e5e7a41d8ef..9bec22e3e9e8 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -84,7 +84,6 @@ static struct net_device *get_dpdev(const struct datapath *dp)
 	struct vport *local;
 
 	local = ovs_vport_ovsl(dp, OVSP_LOCAL);
-	BUG_ON(!local);
 	return local->dev;
 }
 
-- 
2.16.2

^ permalink raw reply related

* Re: [PATCH v2 2/3] dt-bindings: can: rcar_can: Add r8a774a1 support
From: Simon Horman @ 2018-11-08 12:46 UTC (permalink / raw)
  To: Fabrizio Castro
  Cc: Wolfgang Grandegger, Marc Kleine-Budde, Rob Herring, Mark Rutland,
	David S. Miller, Sergei Shtylyov, linux-can@vger.kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	devicetree@vger.kernel.org, Geert Uytterhoeven, Chris Paterson,
	Biju Das, linux-renesas-soc@vger.kernel.org
In-Reply-To: <TY1PR01MB17709C158FEAC53383231CB7C0C50@TY1PR01MB1770.jpnprd01.prod.outlook.com>

On Thu, Nov 08, 2018 at 11:25:23AM +0000, Fabrizio Castro wrote:
> Dear All,
> 
> Who is the best person to take this patch?

I believe this one is for Marc.

> Thanks,
> Fab
> 
> > From: Fabrizio Castro <fabrizio.castro@bp.renesas.com>
> > Sent: 10 September 2018 11:43
> > Subject: [PATCH v2 2/3] dt-bindings: can: rcar_can: Add r8a774a1 support
> >
> > Document RZ/G2M (r8a774a1) SoC specific bindings.
> >
> > Signed-off-by: Fabrizio Castro <fabrizio.castro@bp.renesas.com>
> > Signed-off-by: Chris Paterson <Chris.Paterson2@renesas.com>
> > Reviewed-by: Biju Das <biju.das@bp.renesas.com>
> > ---
> > v1->v2:
> > * dropped "renesas,rzg-gen2-can" and fixed "clocks" property description
> >   as per Geert's comments.
> >
> > This patch applies on top of next-20180910.
> >
> >  Documentation/devicetree/bindings/net/can/rcar_can.txt | 18 +++++++++++++-----
> >  1 file changed, 13 insertions(+), 5 deletions(-)
> >
> > diff --git a/Documentation/devicetree/bindings/net/can/rcar_can.txt b/Documentation/devicetree/bindings/net/can/rcar_can.txt
> > index 94a7f33..f3b160c 100644
> > --- a/Documentation/devicetree/bindings/net/can/rcar_can.txt
> > +++ b/Documentation/devicetree/bindings/net/can/rcar_can.txt
> > @@ -4,6 +4,7 @@ Renesas R-Car CAN controller Device Tree Bindings
> >  Required properties:
> >  - compatible: "renesas,can-r8a7743" if CAN controller is a part of R8A7743 SoC.
> >        "renesas,can-r8a7745" if CAN controller is a part of R8A7745 SoC.
> > +      "renesas,can-r8a774a1" if CAN controller is a part of R8A774A1 SoC.
> >        "renesas,can-r8a7778" if CAN controller is a part of R8A7778 SoC.
> >        "renesas,can-r8a7779" if CAN controller is a part of R8A7779 SoC.
> >        "renesas,can-r8a7790" if CAN controller is a part of R8A7790 SoC.
> > @@ -16,15 +17,21 @@ Required properties:
> >        "renesas,rcar-gen1-can" for a generic R-Car Gen1 compatible device.
> >        "renesas,rcar-gen2-can" for a generic R-Car Gen2 or RZ/G1
> >        compatible device.
> > -      "renesas,rcar-gen3-can" for a generic R-Car Gen3 compatible device.
> > +      "renesas,rcar-gen3-can" for a generic R-Car Gen3 or RZ/G2
> > +      compatible device.
> >        When compatible with the generic version, nodes must list the
> >        SoC-specific version corresponding to the platform first
> >        followed by the generic version.
> >
> >  - reg: physical base address and size of the R-Car CAN register map.
> >  - interrupts: interrupt specifier for the sole interrupt.
> > -- clocks: phandles and clock specifiers for 3 CAN clock inputs.
> > -- clock-names: 3 clock input name strings: "clkp1", "clkp2", "can_clk".
> > +- clocks: phandles and clock specifiers for 2 CAN clock inputs for RZ/G2
> > +  devices.
> > +  phandles and clock specifiers for 3 CAN clock inputs for every other
> > +  SoC.
> > +- clock-names: 2 clock input name strings for RZ/G2: "clkp1", "can_clk".
> > +       3 clock input name strings for every other SoC: "clkp1", "clkp2",
> > +       "can_clk".
> >  - pinctrl-0: pin control group to be used for this controller.
> >  - pinctrl-names: must be "default".
> >
> > @@ -41,8 +48,9 @@ using the below properties:
> >  Optional properties:
> >  - renesas,can-clock-select: R-Car CAN Clock Source Select. Valid values are:
> >      <0x0> (default) : Peripheral clock (clkp1)
> > -    <0x1> : Peripheral clock (clkp2)
> > -    <0x3> : Externally input clock
> > +    <0x1> : Peripheral clock (clkp2) (not supported by
> > +    RZ/G2 devices)
> > +    <0x3> : External input clock
> >
> >  Example
> >  -------
> > --
> > 2.7.4
> 
> 
> 
> 
> Renesas Electronics Europe Ltd, Dukes Meadow, Millboard Road, Bourne End, Buckinghamshire, SL8 5FH, UK. Registered in England & Wales under Registered No. 04586709.
> 

^ permalink raw reply

* Re: [PATCH net-next] SUNRPC: drop pointless static qualifier in xdr_get_next_encode_buffer()
From: Trond Myklebust @ 2018-11-08  3:13 UTC (permalink / raw)
  To: yuehaibing@huawei.com, anna.schumaker@netapp.com,
	davem@davemloft.net, bfields@fieldses.org, jlayton@kernel.org
  Cc: netdev@vger.kernel.org, linux-nfs@vger.kernel.org,
	kernel-janitors@vger.kernel.org
In-Reply-To: <1541642697-33860-1-git-send-email-yuehaibing@huawei.com>

On Thu, 2018-11-08 at 02:04 +0000, YueHaibing wrote:
> There is no need to have the '__be32 *p' variable static since new
> value
> always be assigned before use it.
> 
> Signed-off-by: YueHaibing <yuehaibing@huawei.com>
> ---
>  net/sunrpc/xdr.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
> index 2bbb8d3..d80b156 100644
> --- a/net/sunrpc/xdr.c
> +++ b/net/sunrpc/xdr.c
> @@ -546,7 +546,7 @@ void xdr_commit_encode(struct xdr_stream *xdr)
>  static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr,
>  		size_t nbytes)
>  {
> -	static __be32 *p;
> +	__be32 *p;
>  	int space_left;
>  	int frag1bytes, frag2bytes;
> 

Ouch, that's a really nasty bug that could definitely cause corruption
if you have 2 threads simultaneously calling this function! This really
deserves to be a stable patch.

Thank you, YueHaibing!

Bruce, do you want to shepherd this one in?

-- 
Trond Myklebust
Linux NFS client maintainer, Hammerspace
trond.myklebust@hammerspace.com



^ permalink raw reply

* Re: (2) [Kernel][NET] Bug report on packet defragmenting
From: Eric Dumazet @ 2018-11-08  3:24 UTC (permalink / raw)
  To: soukjin.bae, Eric Dumazet, netdev@vger.kernel.org
In-Reply-To: <20181108020523epcms1p55a0c28d3e881a079231fe813258602f6@epcms1p5>



On 11/07/2018 06:05 PM, 배석진 wrote:
>  --------- Original Message ---------
> Sender : Eric Dumazet <eric.dumazet@gmail.com>
> Date   : 2018-11-08 10:44 (GMT+9)
> Title  : Re: [Kernel][NET] Bug report on packet defragmenting
>  
>> On 11/07/2018 05:29 PM, 배석진 wrote:
>>  
>>> If ipv6_defrag hook is not excuted simultaneously, then it's ok.
>>> ipv6_defrag hook can handle that. [exam 3]
>>  
>> This seems wrong.
>>  
>> This is the root cause, we should not try to work around it but fix it.
>>  
>> There is no guarantee that RSS/RPS/RFS can help here, packets can sit in per-cpu
>> backlogs long enough to reproduce the issue, if RX queues interrupts are spread
>> over many cpus.
>  
> 
> Dear Dumazet,
> 
> Even if rx irq be spread to overal cpu, hash will be made by src/des address.
> then they'll have a same hash and cpu. is it not enough?
> Did you mean that we need total solution for all steering method? not just only RPS?
> 

IPv6 defrag unit should work all the times, even if 10 cpus have to feed fragments for the same
datagram at the same time.

RPS is just a hint to spread packets on different cpus.

Basically we could have the following patch and everything must still work properly
(presumably at lower performance, if RPS/RFS is any good, of course)

diff --git a/net/core/dev.c b/net/core/dev.c
index 0ffcbdd55fa9ee545c807f2ed3fc178830e3075a..c1269bb0d6c86b097cfff2d8395d8cbf2d596537 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4036,7 +4036,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
                goto done;
 
        skb_reset_network_header(skb);
-       hash = skb_get_hash(skb);
+       hash = prandom_u32();
        if (!hash)
                goto done;
 


Sure, it is better if RPS is smarter, but if there is a bug in IPv6 defrag unit
we must investigate and root-cause it.

^ permalink raw reply related

* [PATCH bpf v2] tools/bpftool: copy a few net uapi headers to tools directory
From: Yonghong Song @ 2018-11-08  3:55 UTC (permalink / raw)
  To: ast, daniel, netdev, zhijianx.li; +Cc: kernel-team

Commit f6f3bac08ff9 ("tools/bpf: bpftool: add net support")
added certain networking support to bpftool.
The implementation relies on a relatively recent uapi header file
linux/tc_act/tc_bpf.h on the host which contains the marco
definition of TCA_ACT_BPF_ID.

Unfortunately, this is not the case for all distributions.
See the email message below where rhel-7.2 does not have
an up-to-date linux/tc_act/tc_bpf.h.
  https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1799211.html
Further investigation found that linux/pkt_cls.h is also needed for macro
TCA_BPF_TAG.

This patch fixed the issue by copying linux/tc_act/tc_bpf.h
and linux/pkt_cls.h from kernel include/uapi directory to
tools/include/uapi directory so building the bpftool does not depend
on host system for these files.

Fixes: f6f3bac08ff9 ("tools/bpf: bpftool: add net support")
Reported-by: kernel test robot <rong.a.chen@intel.com>
Cc: Li Zhijian <zhijianx.li@intel.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/include/uapi/linux/pkt_cls.h       | 612 +++++++++++++++++++++++
 tools/include/uapi/linux/tc_act/tc_bpf.h |  37 ++
 2 files changed, 649 insertions(+)
 create mode 100644 tools/include/uapi/linux/pkt_cls.h
 create mode 100644 tools/include/uapi/linux/tc_act/tc_bpf.h

Changelogs:
  v1 -> v2:
    . copy linux/pkt_cls.h as well.

diff --git a/tools/include/uapi/linux/pkt_cls.h b/tools/include/uapi/linux/pkt_cls.h
new file mode 100644
index 000000000000..401d0c1e612d
--- /dev/null
+++ b/tools/include/uapi/linux/pkt_cls.h
@@ -0,0 +1,612 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __LINUX_PKT_CLS_H
+#define __LINUX_PKT_CLS_H
+
+#include <linux/types.h>
+#include <linux/pkt_sched.h>
+
+#define TC_COOKIE_MAX_SIZE 16
+
+/* Action attributes */
+enum {
+	TCA_ACT_UNSPEC,
+	TCA_ACT_KIND,
+	TCA_ACT_OPTIONS,
+	TCA_ACT_INDEX,
+	TCA_ACT_STATS,
+	TCA_ACT_PAD,
+	TCA_ACT_COOKIE,
+	__TCA_ACT_MAX
+};
+
+#define TCA_ACT_MAX __TCA_ACT_MAX
+#define TCA_OLD_COMPAT (TCA_ACT_MAX+1)
+#define TCA_ACT_MAX_PRIO 32
+#define TCA_ACT_BIND	1
+#define TCA_ACT_NOBIND	0
+#define TCA_ACT_UNBIND	1
+#define TCA_ACT_NOUNBIND	0
+#define TCA_ACT_REPLACE		1
+#define TCA_ACT_NOREPLACE	0
+
+#define TC_ACT_UNSPEC	(-1)
+#define TC_ACT_OK		0
+#define TC_ACT_RECLASSIFY	1
+#define TC_ACT_SHOT		2
+#define TC_ACT_PIPE		3
+#define TC_ACT_STOLEN		4
+#define TC_ACT_QUEUED		5
+#define TC_ACT_REPEAT		6
+#define TC_ACT_REDIRECT		7
+#define TC_ACT_TRAP		8 /* For hw path, this means "trap to cpu"
+				   * and don't further process the frame
+				   * in hardware. For sw path, this is
+				   * equivalent of TC_ACT_STOLEN - drop
+				   * the skb and act like everything
+				   * is alright.
+				   */
+#define TC_ACT_VALUE_MAX	TC_ACT_TRAP
+
+/* There is a special kind of actions called "extended actions",
+ * which need a value parameter. These have a local opcode located in
+ * the highest nibble, starting from 1. The rest of the bits
+ * are used to carry the value. These two parts together make
+ * a combined opcode.
+ */
+#define __TC_ACT_EXT_SHIFT 28
+#define __TC_ACT_EXT(local) ((local) << __TC_ACT_EXT_SHIFT)
+#define TC_ACT_EXT_VAL_MASK ((1 << __TC_ACT_EXT_SHIFT) - 1)
+#define TC_ACT_EXT_OPCODE(combined) ((combined) & (~TC_ACT_EXT_VAL_MASK))
+#define TC_ACT_EXT_CMP(combined, opcode) (TC_ACT_EXT_OPCODE(combined) == opcode)
+
+#define TC_ACT_JUMP __TC_ACT_EXT(1)
+#define TC_ACT_GOTO_CHAIN __TC_ACT_EXT(2)
+#define TC_ACT_EXT_OPCODE_MAX	TC_ACT_GOTO_CHAIN
+
+/* Action type identifiers*/
+enum {
+	TCA_ID_UNSPEC=0,
+	TCA_ID_POLICE=1,
+	/* other actions go here */
+	__TCA_ID_MAX=255
+};
+
+#define TCA_ID_MAX __TCA_ID_MAX
+
+struct tc_police {
+	__u32			index;
+	int			action;
+#define TC_POLICE_UNSPEC	TC_ACT_UNSPEC
+#define TC_POLICE_OK		TC_ACT_OK
+#define TC_POLICE_RECLASSIFY	TC_ACT_RECLASSIFY
+#define TC_POLICE_SHOT		TC_ACT_SHOT
+#define TC_POLICE_PIPE		TC_ACT_PIPE
+
+	__u32			limit;
+	__u32			burst;
+	__u32			mtu;
+	struct tc_ratespec	rate;
+	struct tc_ratespec	peakrate;
+	int			refcnt;
+	int			bindcnt;
+	__u32			capab;
+};
+
+struct tcf_t {
+	__u64   install;
+	__u64   lastuse;
+	__u64   expires;
+	__u64   firstuse;
+};
+
+struct tc_cnt {
+	int                   refcnt;
+	int                   bindcnt;
+};
+
+#define tc_gen \
+	__u32                 index; \
+	__u32                 capab; \
+	int                   action; \
+	int                   refcnt; \
+	int                   bindcnt
+
+enum {
+	TCA_POLICE_UNSPEC,
+	TCA_POLICE_TBF,
+	TCA_POLICE_RATE,
+	TCA_POLICE_PEAKRATE,
+	TCA_POLICE_AVRATE,
+	TCA_POLICE_RESULT,
+	TCA_POLICE_TM,
+	TCA_POLICE_PAD,
+	__TCA_POLICE_MAX
+#define TCA_POLICE_RESULT TCA_POLICE_RESULT
+};
+
+#define TCA_POLICE_MAX (__TCA_POLICE_MAX - 1)
+
+/* tca flags definitions */
+#define TCA_CLS_FLAGS_SKIP_HW	(1 << 0) /* don't offload filter to HW */
+#define TCA_CLS_FLAGS_SKIP_SW	(1 << 1) /* don't use filter in SW */
+#define TCA_CLS_FLAGS_IN_HW	(1 << 2) /* filter is offloaded to HW */
+#define TCA_CLS_FLAGS_NOT_IN_HW (1 << 3) /* filter isn't offloaded to HW */
+#define TCA_CLS_FLAGS_VERBOSE	(1 << 4) /* verbose logging */
+
+/* U32 filters */
+
+#define TC_U32_HTID(h) ((h)&0xFFF00000)
+#define TC_U32_USERHTID(h) (TC_U32_HTID(h)>>20)
+#define TC_U32_HASH(h) (((h)>>12)&0xFF)
+#define TC_U32_NODE(h) ((h)&0xFFF)
+#define TC_U32_KEY(h) ((h)&0xFFFFF)
+#define TC_U32_UNSPEC	0
+#define TC_U32_ROOT	(0xFFF00000)
+
+enum {
+	TCA_U32_UNSPEC,
+	TCA_U32_CLASSID,
+	TCA_U32_HASH,
+	TCA_U32_LINK,
+	TCA_U32_DIVISOR,
+	TCA_U32_SEL,
+	TCA_U32_POLICE,
+	TCA_U32_ACT,
+	TCA_U32_INDEV,
+	TCA_U32_PCNT,
+	TCA_U32_MARK,
+	TCA_U32_FLAGS,
+	TCA_U32_PAD,
+	__TCA_U32_MAX
+};
+
+#define TCA_U32_MAX (__TCA_U32_MAX - 1)
+
+struct tc_u32_key {
+	__be32		mask;
+	__be32		val;
+	int		off;
+	int		offmask;
+};
+
+struct tc_u32_sel {
+	unsigned char		flags;
+	unsigned char		offshift;
+	unsigned char		nkeys;
+
+	__be16			offmask;
+	__u16			off;
+	short			offoff;
+
+	short			hoff;
+	__be32			hmask;
+	struct tc_u32_key	keys[0];
+};
+
+struct tc_u32_mark {
+	__u32		val;
+	__u32		mask;
+	__u32		success;
+};
+
+struct tc_u32_pcnt {
+	__u64 rcnt;
+	__u64 rhit;
+	__u64 kcnts[0];
+};
+
+/* Flags */
+
+#define TC_U32_TERMINAL		1
+#define TC_U32_OFFSET		2
+#define TC_U32_VAROFFSET	4
+#define TC_U32_EAT		8
+
+#define TC_U32_MAXDEPTH 8
+
+
+/* RSVP filter */
+
+enum {
+	TCA_RSVP_UNSPEC,
+	TCA_RSVP_CLASSID,
+	TCA_RSVP_DST,
+	TCA_RSVP_SRC,
+	TCA_RSVP_PINFO,
+	TCA_RSVP_POLICE,
+	TCA_RSVP_ACT,
+	__TCA_RSVP_MAX
+};
+
+#define TCA_RSVP_MAX (__TCA_RSVP_MAX - 1 )
+
+struct tc_rsvp_gpi {
+	__u32	key;
+	__u32	mask;
+	int	offset;
+};
+
+struct tc_rsvp_pinfo {
+	struct tc_rsvp_gpi dpi;
+	struct tc_rsvp_gpi spi;
+	__u8	protocol;
+	__u8	tunnelid;
+	__u8	tunnelhdr;
+	__u8	pad;
+};
+
+/* ROUTE filter */
+
+enum {
+	TCA_ROUTE4_UNSPEC,
+	TCA_ROUTE4_CLASSID,
+	TCA_ROUTE4_TO,
+	TCA_ROUTE4_FROM,
+	TCA_ROUTE4_IIF,
+	TCA_ROUTE4_POLICE,
+	TCA_ROUTE4_ACT,
+	__TCA_ROUTE4_MAX
+};
+
+#define TCA_ROUTE4_MAX (__TCA_ROUTE4_MAX - 1)
+
+
+/* FW filter */
+
+enum {
+	TCA_FW_UNSPEC,
+	TCA_FW_CLASSID,
+	TCA_FW_POLICE,
+	TCA_FW_INDEV, /*  used by CONFIG_NET_CLS_IND */
+	TCA_FW_ACT, /* used by CONFIG_NET_CLS_ACT */
+	TCA_FW_MASK,
+	__TCA_FW_MAX
+};
+
+#define TCA_FW_MAX (__TCA_FW_MAX - 1)
+
+/* TC index filter */
+
+enum {
+	TCA_TCINDEX_UNSPEC,
+	TCA_TCINDEX_HASH,
+	TCA_TCINDEX_MASK,
+	TCA_TCINDEX_SHIFT,
+	TCA_TCINDEX_FALL_THROUGH,
+	TCA_TCINDEX_CLASSID,
+	TCA_TCINDEX_POLICE,
+	TCA_TCINDEX_ACT,
+	__TCA_TCINDEX_MAX
+};
+
+#define TCA_TCINDEX_MAX     (__TCA_TCINDEX_MAX - 1)
+
+/* Flow filter */
+
+enum {
+	FLOW_KEY_SRC,
+	FLOW_KEY_DST,
+	FLOW_KEY_PROTO,
+	FLOW_KEY_PROTO_SRC,
+	FLOW_KEY_PROTO_DST,
+	FLOW_KEY_IIF,
+	FLOW_KEY_PRIORITY,
+	FLOW_KEY_MARK,
+	FLOW_KEY_NFCT,
+	FLOW_KEY_NFCT_SRC,
+	FLOW_KEY_NFCT_DST,
+	FLOW_KEY_NFCT_PROTO_SRC,
+	FLOW_KEY_NFCT_PROTO_DST,
+	FLOW_KEY_RTCLASSID,
+	FLOW_KEY_SKUID,
+	FLOW_KEY_SKGID,
+	FLOW_KEY_VLAN_TAG,
+	FLOW_KEY_RXHASH,
+	__FLOW_KEY_MAX,
+};
+
+#define FLOW_KEY_MAX	(__FLOW_KEY_MAX - 1)
+
+enum {
+	FLOW_MODE_MAP,
+	FLOW_MODE_HASH,
+};
+
+enum {
+	TCA_FLOW_UNSPEC,
+	TCA_FLOW_KEYS,
+	TCA_FLOW_MODE,
+	TCA_FLOW_BASECLASS,
+	TCA_FLOW_RSHIFT,
+	TCA_FLOW_ADDEND,
+	TCA_FLOW_MASK,
+	TCA_FLOW_XOR,
+	TCA_FLOW_DIVISOR,
+	TCA_FLOW_ACT,
+	TCA_FLOW_POLICE,
+	TCA_FLOW_EMATCHES,
+	TCA_FLOW_PERTURB,
+	__TCA_FLOW_MAX
+};
+
+#define TCA_FLOW_MAX	(__TCA_FLOW_MAX - 1)
+
+/* Basic filter */
+
+enum {
+	TCA_BASIC_UNSPEC,
+	TCA_BASIC_CLASSID,
+	TCA_BASIC_EMATCHES,
+	TCA_BASIC_ACT,
+	TCA_BASIC_POLICE,
+	__TCA_BASIC_MAX
+};
+
+#define TCA_BASIC_MAX (__TCA_BASIC_MAX - 1)
+
+
+/* Cgroup classifier */
+
+enum {
+	TCA_CGROUP_UNSPEC,
+	TCA_CGROUP_ACT,
+	TCA_CGROUP_POLICE,
+	TCA_CGROUP_EMATCHES,
+	__TCA_CGROUP_MAX,
+};
+
+#define TCA_CGROUP_MAX (__TCA_CGROUP_MAX - 1)
+
+/* BPF classifier */
+
+#define TCA_BPF_FLAG_ACT_DIRECT		(1 << 0)
+
+enum {
+	TCA_BPF_UNSPEC,
+	TCA_BPF_ACT,
+	TCA_BPF_POLICE,
+	TCA_BPF_CLASSID,
+	TCA_BPF_OPS_LEN,
+	TCA_BPF_OPS,
+	TCA_BPF_FD,
+	TCA_BPF_NAME,
+	TCA_BPF_FLAGS,
+	TCA_BPF_FLAGS_GEN,
+	TCA_BPF_TAG,
+	TCA_BPF_ID,
+	__TCA_BPF_MAX,
+};
+
+#define TCA_BPF_MAX (__TCA_BPF_MAX - 1)
+
+/* Flower classifier */
+
+enum {
+	TCA_FLOWER_UNSPEC,
+	TCA_FLOWER_CLASSID,
+	TCA_FLOWER_INDEV,
+	TCA_FLOWER_ACT,
+	TCA_FLOWER_KEY_ETH_DST,		/* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_DST_MASK,	/* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_SRC,		/* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_SRC_MASK,	/* ETH_ALEN */
+	TCA_FLOWER_KEY_ETH_TYPE,	/* be16 */
+	TCA_FLOWER_KEY_IP_PROTO,	/* u8 */
+	TCA_FLOWER_KEY_IPV4_SRC,	/* be32 */
+	TCA_FLOWER_KEY_IPV4_SRC_MASK,	/* be32 */
+	TCA_FLOWER_KEY_IPV4_DST,	/* be32 */
+	TCA_FLOWER_KEY_IPV4_DST_MASK,	/* be32 */
+	TCA_FLOWER_KEY_IPV6_SRC,	/* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_SRC_MASK,	/* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_DST,	/* struct in6_addr */
+	TCA_FLOWER_KEY_IPV6_DST_MASK,	/* struct in6_addr */
+	TCA_FLOWER_KEY_TCP_SRC,		/* be16 */
+	TCA_FLOWER_KEY_TCP_DST,		/* be16 */
+	TCA_FLOWER_KEY_UDP_SRC,		/* be16 */
+	TCA_FLOWER_KEY_UDP_DST,		/* be16 */
+
+	TCA_FLOWER_FLAGS,
+	TCA_FLOWER_KEY_VLAN_ID,		/* be16 */
+	TCA_FLOWER_KEY_VLAN_PRIO,	/* u8   */
+	TCA_FLOWER_KEY_VLAN_ETH_TYPE,	/* be16 */
+
+	TCA_FLOWER_KEY_ENC_KEY_ID,	/* be32 */
+	TCA_FLOWER_KEY_ENC_IPV4_SRC,	/* be32 */
+	TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,/* be32 */
+	TCA_FLOWER_KEY_ENC_IPV4_DST,	/* be32 */
+	TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,/* be32 */
+	TCA_FLOWER_KEY_ENC_IPV6_SRC,	/* struct in6_addr */
+	TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,/* struct in6_addr */
+	TCA_FLOWER_KEY_ENC_IPV6_DST,	/* struct in6_addr */
+	TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,/* struct in6_addr */
+
+	TCA_FLOWER_KEY_TCP_SRC_MASK,	/* be16 */
+	TCA_FLOWER_KEY_TCP_DST_MASK,	/* be16 */
+	TCA_FLOWER_KEY_UDP_SRC_MASK,	/* be16 */
+	TCA_FLOWER_KEY_UDP_DST_MASK,	/* be16 */
+	TCA_FLOWER_KEY_SCTP_SRC_MASK,	/* be16 */
+	TCA_FLOWER_KEY_SCTP_DST_MASK,	/* be16 */
+
+	TCA_FLOWER_KEY_SCTP_SRC,	/* be16 */
+	TCA_FLOWER_KEY_SCTP_DST,	/* be16 */
+
+	TCA_FLOWER_KEY_ENC_UDP_SRC_PORT,	/* be16 */
+	TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK,	/* be16 */
+	TCA_FLOWER_KEY_ENC_UDP_DST_PORT,	/* be16 */
+	TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,	/* be16 */
+
+	TCA_FLOWER_KEY_FLAGS,		/* be32 */
+	TCA_FLOWER_KEY_FLAGS_MASK,	/* be32 */
+
+	TCA_FLOWER_KEY_ICMPV4_CODE,	/* u8 */
+	TCA_FLOWER_KEY_ICMPV4_CODE_MASK,/* u8 */
+	TCA_FLOWER_KEY_ICMPV4_TYPE,	/* u8 */
+	TCA_FLOWER_KEY_ICMPV4_TYPE_MASK,/* u8 */
+	TCA_FLOWER_KEY_ICMPV6_CODE,	/* u8 */
+	TCA_FLOWER_KEY_ICMPV6_CODE_MASK,/* u8 */
+	TCA_FLOWER_KEY_ICMPV6_TYPE,	/* u8 */
+	TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,/* u8 */
+
+	TCA_FLOWER_KEY_ARP_SIP,		/* be32 */
+	TCA_FLOWER_KEY_ARP_SIP_MASK,	/* be32 */
+	TCA_FLOWER_KEY_ARP_TIP,		/* be32 */
+	TCA_FLOWER_KEY_ARP_TIP_MASK,	/* be32 */
+	TCA_FLOWER_KEY_ARP_OP,		/* u8 */
+	TCA_FLOWER_KEY_ARP_OP_MASK,	/* u8 */
+	TCA_FLOWER_KEY_ARP_SHA,		/* ETH_ALEN */
+	TCA_FLOWER_KEY_ARP_SHA_MASK,	/* ETH_ALEN */
+	TCA_FLOWER_KEY_ARP_THA,		/* ETH_ALEN */
+	TCA_FLOWER_KEY_ARP_THA_MASK,	/* ETH_ALEN */
+
+	TCA_FLOWER_KEY_MPLS_TTL,	/* u8 - 8 bits */
+	TCA_FLOWER_KEY_MPLS_BOS,	/* u8 - 1 bit */
+	TCA_FLOWER_KEY_MPLS_TC,		/* u8 - 3 bits */
+	TCA_FLOWER_KEY_MPLS_LABEL,	/* be32 - 20 bits */
+
+	TCA_FLOWER_KEY_TCP_FLAGS,	/* be16 */
+	TCA_FLOWER_KEY_TCP_FLAGS_MASK,	/* be16 */
+
+	TCA_FLOWER_KEY_IP_TOS,		/* u8 */
+	TCA_FLOWER_KEY_IP_TOS_MASK,	/* u8 */
+	TCA_FLOWER_KEY_IP_TTL,		/* u8 */
+	TCA_FLOWER_KEY_IP_TTL_MASK,	/* u8 */
+
+	TCA_FLOWER_KEY_CVLAN_ID,	/* be16 */
+	TCA_FLOWER_KEY_CVLAN_PRIO,	/* u8   */
+	TCA_FLOWER_KEY_CVLAN_ETH_TYPE,	/* be16 */
+
+	TCA_FLOWER_KEY_ENC_IP_TOS,	/* u8 */
+	TCA_FLOWER_KEY_ENC_IP_TOS_MASK,	/* u8 */
+	TCA_FLOWER_KEY_ENC_IP_TTL,	/* u8 */
+	TCA_FLOWER_KEY_ENC_IP_TTL_MASK,	/* u8 */
+
+	TCA_FLOWER_KEY_ENC_OPTS,
+	TCA_FLOWER_KEY_ENC_OPTS_MASK,
+
+	TCA_FLOWER_IN_HW_COUNT,
+
+	__TCA_FLOWER_MAX,
+};
+
+#define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1)
+
+enum {
+	TCA_FLOWER_KEY_ENC_OPTS_UNSPEC,
+	TCA_FLOWER_KEY_ENC_OPTS_GENEVE, /* Nested
+					 * TCA_FLOWER_KEY_ENC_OPT_GENEVE_
+					 * attributes
+					 */
+	__TCA_FLOWER_KEY_ENC_OPTS_MAX,
+};
+
+#define TCA_FLOWER_KEY_ENC_OPTS_MAX (__TCA_FLOWER_KEY_ENC_OPTS_MAX - 1)
+
+enum {
+	TCA_FLOWER_KEY_ENC_OPT_GENEVE_UNSPEC,
+	TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS,            /* u16 */
+	TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE,             /* u8 */
+	TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA,             /* 4 to 128 bytes */
+
+	__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX,
+};
+
+#define TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX \
+		(__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX - 1)
+
+enum {
+	TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0),
+	TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1),
+};
+
+/* Match-all classifier */
+
+enum {
+	TCA_MATCHALL_UNSPEC,
+	TCA_MATCHALL_CLASSID,
+	TCA_MATCHALL_ACT,
+	TCA_MATCHALL_FLAGS,
+	__TCA_MATCHALL_MAX,
+};
+
+#define TCA_MATCHALL_MAX (__TCA_MATCHALL_MAX - 1)
+
+/* Extended Matches */
+
+struct tcf_ematch_tree_hdr {
+	__u16		nmatches;
+	__u16		progid;
+};
+
+enum {
+	TCA_EMATCH_TREE_UNSPEC,
+	TCA_EMATCH_TREE_HDR,
+	TCA_EMATCH_TREE_LIST,
+	__TCA_EMATCH_TREE_MAX
+};
+#define TCA_EMATCH_TREE_MAX (__TCA_EMATCH_TREE_MAX - 1)
+
+struct tcf_ematch_hdr {
+	__u16		matchid;
+	__u16		kind;
+	__u16		flags;
+	__u16		pad; /* currently unused */
+};
+
+/*  0                   1
+ *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 
+ * +-----------------------+-+-+---+
+ * |         Unused        |S|I| R |
+ * +-----------------------+-+-+---+
+ *
+ * R(2) ::= relation to next ematch
+ *          where: 0 0 END (last ematch)
+ *                 0 1 AND
+ *                 1 0 OR
+ *                 1 1 Unused (invalid)
+ * I(1) ::= invert result
+ * S(1) ::= simple payload
+ */
+#define TCF_EM_REL_END	0
+#define TCF_EM_REL_AND	(1<<0)
+#define TCF_EM_REL_OR	(1<<1)
+#define TCF_EM_INVERT	(1<<2)
+#define TCF_EM_SIMPLE	(1<<3)
+
+#define TCF_EM_REL_MASK	3
+#define TCF_EM_REL_VALID(v) (((v) & TCF_EM_REL_MASK) != TCF_EM_REL_MASK)
+
+enum {
+	TCF_LAYER_LINK,
+	TCF_LAYER_NETWORK,
+	TCF_LAYER_TRANSPORT,
+	__TCF_LAYER_MAX
+};
+#define TCF_LAYER_MAX (__TCF_LAYER_MAX - 1)
+
+/* Ematch type assignments
+ *   1..32767		Reserved for ematches inside kernel tree
+ *   32768..65535	Free to use, not reliable
+ */
+#define	TCF_EM_CONTAINER	0
+#define	TCF_EM_CMP		1
+#define	TCF_EM_NBYTE		2
+#define	TCF_EM_U32		3
+#define	TCF_EM_META		4
+#define	TCF_EM_TEXT		5
+#define	TCF_EM_VLAN		6
+#define	TCF_EM_CANID		7
+#define	TCF_EM_IPSET		8
+#define	TCF_EM_IPT		9
+#define	TCF_EM_MAX		9
+
+enum {
+	TCF_EM_PROG_TC
+};
+
+enum {
+	TCF_EM_OPND_EQ,
+	TCF_EM_OPND_GT,
+	TCF_EM_OPND_LT
+};
+
+#endif
diff --git a/tools/include/uapi/linux/tc_act/tc_bpf.h b/tools/include/uapi/linux/tc_act/tc_bpf.h
new file mode 100644
index 000000000000..6e89a5df49a4
--- /dev/null
+++ b/tools/include/uapi/linux/tc_act/tc_bpf.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/*
+ * Copyright (c) 2015 Jiri Pirko <jiri@resnulli.us>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __LINUX_TC_BPF_H
+#define __LINUX_TC_BPF_H
+
+#include <linux/pkt_cls.h>
+
+#define TCA_ACT_BPF 13
+
+struct tc_act_bpf {
+	tc_gen;
+};
+
+enum {
+	TCA_ACT_BPF_UNSPEC,
+	TCA_ACT_BPF_TM,
+	TCA_ACT_BPF_PARMS,
+	TCA_ACT_BPF_OPS_LEN,
+	TCA_ACT_BPF_OPS,
+	TCA_ACT_BPF_FD,
+	TCA_ACT_BPF_NAME,
+	TCA_ACT_BPF_PAD,
+	TCA_ACT_BPF_TAG,
+	TCA_ACT_BPF_ID,
+	__TCA_ACT_BPF_MAX,
+};
+#define TCA_ACT_BPF_MAX (__TCA_ACT_BPF_MAX - 1)
+
+#endif
-- 
2.17.1

^ permalink raw reply related

* Re: (2) [Kernel][NET] Bug report on packet defragmenting
From: Eric Dumazet @ 2018-11-08  3:56 UTC (permalink / raw)
  To: soukjin.bae, netdev@vger.kernel.org
In-Reply-To: <eca6b22e-1c8d-d62a-6632-c7e7ffb180c2@gmail.com>



On 11/07/2018 07:24 PM, Eric Dumazet wrote:

> Sure, it is better if RPS is smarter, but if there is a bug in IPv6 defrag unit
> we must investigate and root-cause it.

BTW, IPv4 defrag seems to have the same issue.

^ permalink raw reply

* [PATCH net-next] net: qca_spi: Add available buffer space verification
From: Stefan Wahren @ 2018-11-08 13:38 UTC (permalink / raw)
  To: David S. Miller; +Cc: Michael Heimpold, netdev, linux-kernel, Stefan Wahren

Interferences on the SPI line could distort the response of
available buffer space. So at least we should check that the
response doesn't exceed the maximum available buffer space.
In error case increase a new error counter and retry it later.
This behavior avoids buffer errors in the QCA7000, which
results in an unnecessary chip reset including packet loss.

Signed-off-by: Stefan Wahren <stefan.wahren@i2se.com>
---
 drivers/net/ethernet/qualcomm/qca_debug.c |  1 +
 drivers/net/ethernet/qualcomm/qca_spi.c   | 16 +++++++++++++++-
 drivers/net/ethernet/qualcomm/qca_spi.h   |  1 +
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/qualcomm/qca_debug.c b/drivers/net/ethernet/qualcomm/qca_debug.c
index a9f1bc0..1450f38 100644
--- a/drivers/net/ethernet/qualcomm/qca_debug.c
+++ b/drivers/net/ethernet/qualcomm/qca_debug.c
@@ -61,6 +61,7 @@ static const char qcaspi_gstrings_stats[][ETH_GSTRING_LEN] = {
 	"Transmit ring full",
 	"SPI errors",
 	"Write verify errors",
+	"Buffer available errors",
 };
 
 #ifdef CONFIG_DEBUG_FS
diff --git a/drivers/net/ethernet/qualcomm/qca_spi.c b/drivers/net/ethernet/qualcomm/qca_spi.c
index d531050..97f9295 100644
--- a/drivers/net/ethernet/qualcomm/qca_spi.c
+++ b/drivers/net/ethernet/qualcomm/qca_spi.c
@@ -289,6 +289,14 @@ qcaspi_transmit(struct qcaspi *qca)
 
 	qcaspi_read_register(qca, SPI_REG_WRBUF_SPC_AVA, &available);
 
+	if (available > QCASPI_HW_BUF_LEN) {
+		/* This could only happen by interferences on the SPI line.
+		 * So retry later ...
+		 */
+		qca->stats.buf_avail_err++;
+		return -1;
+	}
+
 	while (qca->txr.skb[qca->txr.head]) {
 		pkt_len = qca->txr.skb[qca->txr.head]->len + QCASPI_HW_PKT_LEN;
 
@@ -355,7 +363,13 @@ qcaspi_receive(struct qcaspi *qca)
 	netdev_dbg(net_dev, "qcaspi_receive: SPI_REG_RDBUF_BYTE_AVA: Value: %08x\n",
 		   available);
 
-	if (available == 0) {
+	if (available > QCASPI_HW_BUF_LEN) {
+		/* This could only happen by interferences on the SPI line.
+		 * So retry later ...
+		 */
+		qca->stats.buf_avail_err++;
+		return -1;
+	} else if (available == 0) {
 		netdev_dbg(net_dev, "qcaspi_receive called without any data being available!\n");
 		return -1;
 	}
diff --git a/drivers/net/ethernet/qualcomm/qca_spi.h b/drivers/net/ethernet/qualcomm/qca_spi.h
index 2d2c497..eb9af45 100644
--- a/drivers/net/ethernet/qualcomm/qca_spi.h
+++ b/drivers/net/ethernet/qualcomm/qca_spi.h
@@ -74,6 +74,7 @@ struct qcaspi_stats {
 	u64 ring_full;
 	u64 spi_err;
 	u64 write_verify_failed;
+	u64 buf_avail_err;
 };
 
 struct qcaspi {
-- 
2.7.4

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox