Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next v4 1/3] flow_offload: move tc indirect block to flow offload
From: wenxu @ 2019-07-28  6:52 UTC (permalink / raw)
  To: pablo, fw, jakub.kicinski; +Cc: netfilter-devel, netdev
In-Reply-To: <1564296769-32294-1-git-send-email-wenxu@ucloud.cn>

From: wenxu <wenxu@ucloud.cn>

move tc indirect block to flow_offload and rename
it to flow indirect block.The nf_tables can use the
indr block architecture.

Signed-off-by: wenxu <wenxu@ucloud.cn>
---
v3: subsys_initcall for init_flow_indr_rhashtable
v4: no change

 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   |  10 +-
 .../net/ethernet/netronome/nfp/flower/offload.c    |  10 +-
 include/net/flow_offload.h                         |  39 ++++
 include/net/pkt_cls.h                              |  35 ---
 include/net/sch_generic.h                          |   3 -
 net/core/flow_offload.c                            | 179 ++++++++++++++++
 net/sched/cls_api.c                                | 235 ++-------------------
 7 files changed, 247 insertions(+), 264 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 7f747cb..074573b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -785,9 +785,9 @@ static int mlx5e_rep_indr_register_block(struct mlx5e_rep_priv *rpriv,
 {
 	int err;
 
-	err = __tc_indr_block_cb_register(netdev, rpriv,
-					  mlx5e_rep_indr_setup_tc_cb,
-					  rpriv);
+	err = __flow_indr_block_cb_register(netdev, rpriv,
+					    mlx5e_rep_indr_setup_tc_cb,
+					    rpriv);
 	if (err) {
 		struct mlx5e_priv *priv = netdev_priv(rpriv->netdev);
 
@@ -800,8 +800,8 @@ static int mlx5e_rep_indr_register_block(struct mlx5e_rep_priv *rpriv,
 static void mlx5e_rep_indr_unregister_block(struct mlx5e_rep_priv *rpriv,
 					    struct net_device *netdev)
 {
-	__tc_indr_block_cb_unregister(netdev, mlx5e_rep_indr_setup_tc_cb,
-				      rpriv);
+	__flow_indr_block_cb_unregister(netdev, mlx5e_rep_indr_setup_tc_cb,
+					rpriv);
 }
 
 static int mlx5e_nic_rep_netdevice_event(struct notifier_block *nb,
diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c
index e209f15..6a0f034 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
@@ -1479,16 +1479,16 @@ int nfp_flower_reg_indir_block_handler(struct nfp_app *app,
 		return NOTIFY_OK;
 
 	if (event == NETDEV_REGISTER) {
-		err = __tc_indr_block_cb_register(netdev, app,
-						  nfp_flower_indr_setup_tc_cb,
-						  app);
+		err = __flow_indr_block_cb_register(netdev, app,
+						    nfp_flower_indr_setup_tc_cb,
+						    app);
 		if (err)
 			nfp_flower_cmsg_warn(app,
 					     "Indirect block reg failed - %s\n",
 					     netdev->name);
 	} else if (event == NETDEV_UNREGISTER) {
-		__tc_indr_block_cb_unregister(netdev,
-					      nfp_flower_indr_setup_tc_cb, app);
+		__flow_indr_block_cb_unregister(netdev,
+						nfp_flower_indr_setup_tc_cb, app);
 	}
 
 	return NOTIFY_OK;
diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 00b9aab..66f89bc 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -4,6 +4,7 @@
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <net/flow_dissector.h>
+#include <linux/rhashtable.h>
 
 struct flow_match {
 	struct flow_dissector	*dissector;
@@ -366,4 +367,42 @@ static inline void flow_block_init(struct flow_block *flow_block)
 	INIT_LIST_HEAD(&flow_block->cb_list);
 }
 
+typedef int flow_indr_block_bind_cb_t(struct net_device *dev, void *cb_priv,
+				      enum tc_setup_type type, void *type_data);
+
+struct flow_indr_block_cb {
+	struct list_head list;
+	void *cb_priv;
+	flow_indr_block_bind_cb_t *cb;
+	void *cb_ident;
+};
+
+typedef void flow_indr_block_ing_cmd_t(struct net_device *dev,
+				       struct flow_block *flow_block,
+				       struct flow_indr_block_cb *indr_block_cb,
+				       enum flow_block_command command);
+
+struct flow_indr_block_dev {
+	struct rhash_head ht_node;
+	struct net_device *dev;
+	unsigned int refcnt;
+	struct list_head cb_list;
+	flow_indr_block_ing_cmd_t *ing_cmd_cb;
+	struct flow_block *flow_block;
+};
+
+struct flow_indr_block_dev *flow_indr_block_dev_lookup(struct net_device *dev);
+
+int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv,
+				  flow_indr_block_bind_cb_t *cb, void *cb_ident);
+
+void __flow_indr_block_cb_unregister(struct net_device *dev,
+				     flow_indr_block_bind_cb_t *cb, void *cb_ident);
+
+int flow_indr_block_cb_register(struct net_device *dev, void *cb_priv,
+				flow_indr_block_bind_cb_t *cb, void *cb_ident);
+
+void flow_indr_block_cb_unregister(struct net_device *dev,
+				   flow_indr_block_bind_cb_t *cb, void *cb_ident);
+
 #endif /* _NET_FLOW_OFFLOAD_H */
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index e429809..0790a4e 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -70,15 +70,6 @@ static inline struct Qdisc *tcf_block_q(struct tcf_block *block)
 	return block->q;
 }
 
-int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
-				tc_indr_block_bind_cb_t *cb, void *cb_ident);
-int tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
-			      tc_indr_block_bind_cb_t *cb, void *cb_ident);
-void __tc_indr_block_cb_unregister(struct net_device *dev,
-				   tc_indr_block_bind_cb_t *cb, void *cb_ident);
-void tc_indr_block_cb_unregister(struct net_device *dev,
-				 tc_indr_block_bind_cb_t *cb, void *cb_ident);
-
 int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		 struct tcf_result *res, bool compat_mode);
 
@@ -137,32 +128,6 @@ void tc_setup_cb_block_unregister(struct tcf_block *block, flow_setup_cb_t *cb,
 {
 }
 
-static inline
-int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
-				tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
-	return 0;
-}
-
-static inline
-int tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
-			      tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
-	return 0;
-}
-
-static inline
-void __tc_indr_block_cb_unregister(struct net_device *dev,
-				   tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
-}
-
-static inline
-void tc_indr_block_cb_unregister(struct net_device *dev,
-				 tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
-}
-
 static inline int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 			       struct tcf_result *res, bool compat_mode)
 {
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 6b6b012..d9f359a 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -23,9 +23,6 @@
 struct module;
 struct bpf_flow_keys;
 
-typedef int tc_indr_block_bind_cb_t(struct net_device *dev, void *cb_priv,
-				    enum tc_setup_type type, void *type_data);
-
 struct qdisc_rate_table {
 	struct tc_ratespec rate;
 	u32		data[256];
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index d63b970..9f1ae67 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -2,6 +2,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <net/flow_offload.h>
+#include <linux/rtnetlink.h>
 
 struct flow_rule *flow_rule_alloc(unsigned int num_actions)
 {
@@ -280,3 +281,181 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f,
 	}
 }
 EXPORT_SYMBOL(flow_block_cb_setup_simple);
+
+static struct rhashtable indr_setup_block_ht;
+
+static const struct rhashtable_params flow_indr_setup_block_ht_params = {
+	.key_offset	= offsetof(struct flow_indr_block_dev, dev),
+	.head_offset	= offsetof(struct flow_indr_block_dev, ht_node),
+	.key_len	= sizeof(struct net_device *),
+};
+
+struct flow_indr_block_dev *
+flow_indr_block_dev_lookup(struct net_device *dev)
+{
+	return rhashtable_lookup_fast(&indr_setup_block_ht, &dev,
+				      flow_indr_setup_block_ht_params);
+}
+EXPORT_SYMBOL(flow_indr_block_dev_lookup);
+
+static struct flow_indr_block_dev *flow_indr_block_dev_get(struct net_device *dev)
+{
+	struct flow_indr_block_dev *indr_dev;
+
+	indr_dev = flow_indr_block_dev_lookup(dev);
+	if (indr_dev)
+		goto inc_ref;
+
+	indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL);
+	if (!indr_dev)
+		return NULL;
+
+	INIT_LIST_HEAD(&indr_dev->cb_list);
+	indr_dev->dev = dev;
+	if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node,
+				   flow_indr_setup_block_ht_params)) {
+		kfree(indr_dev);
+		return NULL;
+	}
+
+inc_ref:
+	indr_dev->refcnt++;
+	return indr_dev;
+}
+
+static void flow_indr_block_dev_put(struct flow_indr_block_dev *indr_dev)
+{
+	if (--indr_dev->refcnt)
+		return;
+
+	rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node,
+			       flow_indr_setup_block_ht_params);
+	kfree(indr_dev);
+}
+
+static struct flow_indr_block_cb *
+flow_indr_block_cb_lookup(struct flow_indr_block_dev *indr_dev,
+			  flow_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+	struct flow_indr_block_cb *indr_block_cb;
+
+	list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
+		if (indr_block_cb->cb == cb &&
+		    indr_block_cb->cb_ident == cb_ident)
+			return indr_block_cb;
+	return NULL;
+}
+
+static struct flow_indr_block_cb *
+flow_indr_block_cb_add(struct flow_indr_block_dev *indr_dev, void *cb_priv,
+		       flow_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+	struct flow_indr_block_cb *indr_block_cb;
+
+	indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident);
+	if (indr_block_cb)
+		return ERR_PTR(-EEXIST);
+
+	indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL);
+	if (!indr_block_cb)
+		return ERR_PTR(-ENOMEM);
+
+	indr_block_cb->cb_priv = cb_priv;
+	indr_block_cb->cb = cb;
+	indr_block_cb->cb_ident = cb_ident;
+	list_add(&indr_block_cb->list, &indr_dev->cb_list);
+
+	return indr_block_cb;
+}
+
+static void flow_indr_block_cb_del(struct flow_indr_block_cb *indr_block_cb)
+{
+	list_del(&indr_block_cb->list);
+	kfree(indr_block_cb);
+}
+
+int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv,
+				  flow_indr_block_bind_cb_t *cb,
+				  void *cb_ident)
+{
+	struct flow_indr_block_cb *indr_block_cb;
+	struct flow_indr_block_dev *indr_dev;
+	int err;
+
+	indr_dev = flow_indr_block_dev_get(dev);
+	if (!indr_dev)
+		return -ENOMEM;
+
+	indr_block_cb = flow_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident);
+	err = PTR_ERR_OR_ZERO(indr_block_cb);
+	if (err)
+		goto err_dev_put;
+
+	if (indr_dev->ing_cmd_cb)
+		indr_dev->ing_cmd_cb(indr_dev->dev, indr_dev->flow_block, indr_block_cb,
+				     FLOW_BLOCK_BIND);
+
+	return 0;
+
+err_dev_put:
+	flow_indr_block_dev_put(indr_dev);
+	return err;
+}
+EXPORT_SYMBOL_GPL(__flow_indr_block_cb_register);
+
+int flow_indr_block_cb_register(struct net_device *dev, void *cb_priv,
+				flow_indr_block_bind_cb_t *cb,
+				void *cb_ident)
+{
+	int err;
+
+	rtnl_lock();
+	err = __flow_indr_block_cb_register(dev, cb_priv, cb, cb_ident);
+	rtnl_unlock();
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(flow_indr_block_cb_register);
+
+void __flow_indr_block_cb_unregister(struct net_device *dev,
+				     flow_indr_block_bind_cb_t *cb,
+				     void *cb_ident)
+{
+	struct flow_indr_block_cb *indr_block_cb;
+	struct flow_indr_block_dev *indr_dev;
+
+	indr_dev = flow_indr_block_dev_lookup(dev);
+	if (!indr_dev)
+		return;
+
+	indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident);
+	if (!indr_block_cb)
+		return;
+
+	/* Send unbind message if required to free any block cbs. */
+	if (indr_dev->ing_cmd_cb)
+		indr_dev->ing_cmd_cb(indr_dev->dev, indr_dev->flow_block,
+				     indr_block_cb,
+				     FLOW_BLOCK_UNBIND);
+
+	flow_indr_block_cb_del(indr_block_cb);
+	flow_indr_block_dev_put(indr_dev);
+}
+EXPORT_SYMBOL_GPL(__flow_indr_block_cb_unregister);
+
+void flow_indr_block_cb_unregister(struct net_device *dev,
+				   flow_indr_block_bind_cb_t *cb,
+				   void *cb_ident)
+{
+	rtnl_lock();
+	__flow_indr_block_cb_unregister(dev, cb, cb_ident);
+	rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(flow_indr_block_cb_unregister);
+
+static int __init init_flow_indr_rhashtable(void)
+{
+	return rhashtable_init(&indr_setup_block_ht,
+			       &flow_indr_setup_block_ht_params);
+}
+subsys_initcall(init_flow_indr_rhashtable);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 3565d9a..d551c56 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -37,6 +37,7 @@
 #include <net/tc_act/tc_skbedit.h>
 #include <net/tc_act/tc_ct.h>
 #include <net/tc_act/tc_mpls.h>
+#include <net/flow_offload.h>
 
 extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
 
@@ -545,235 +546,43 @@ static void tcf_chain_flush(struct tcf_chain *chain, bool rtnl_held)
 	}
 }
 
-static struct tcf_block *tc_dev_ingress_block(struct net_device *dev)
-{
-	const struct Qdisc_class_ops *cops;
-	struct Qdisc *qdisc;
-
-	if (!dev_ingress_queue(dev))
-		return NULL;
-
-	qdisc = dev_ingress_queue(dev)->qdisc_sleeping;
-	if (!qdisc)
-		return NULL;
-
-	cops = qdisc->ops->cl_ops;
-	if (!cops)
-		return NULL;
-
-	if (!cops->tcf_block)
-		return NULL;
-
-	return cops->tcf_block(qdisc, TC_H_MIN_INGRESS, NULL);
-}
-
-static struct rhashtable indr_setup_block_ht;
-
-struct tc_indr_block_dev {
-	struct rhash_head ht_node;
-	struct net_device *dev;
-	unsigned int refcnt;
-	struct list_head cb_list;
-	struct tcf_block *block;
-};
-
-struct tc_indr_block_cb {
-	struct list_head list;
-	void *cb_priv;
-	tc_indr_block_bind_cb_t *cb;
-	void *cb_ident;
-};
-
-static const struct rhashtable_params tc_indr_setup_block_ht_params = {
-	.key_offset	= offsetof(struct tc_indr_block_dev, dev),
-	.head_offset	= offsetof(struct tc_indr_block_dev, ht_node),
-	.key_len	= sizeof(struct net_device *),
-};
-
-static struct tc_indr_block_dev *
-tc_indr_block_dev_lookup(struct net_device *dev)
-{
-	return rhashtable_lookup_fast(&indr_setup_block_ht, &dev,
-				      tc_indr_setup_block_ht_params);
-}
-
-static struct tc_indr_block_dev *tc_indr_block_dev_get(struct net_device *dev)
-{
-	struct tc_indr_block_dev *indr_dev;
-
-	indr_dev = tc_indr_block_dev_lookup(dev);
-	if (indr_dev)
-		goto inc_ref;
-
-	indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL);
-	if (!indr_dev)
-		return NULL;
-
-	INIT_LIST_HEAD(&indr_dev->cb_list);
-	indr_dev->dev = dev;
-	indr_dev->block = tc_dev_ingress_block(dev);
-	if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node,
-				   tc_indr_setup_block_ht_params)) {
-		kfree(indr_dev);
-		return NULL;
-	}
-
-inc_ref:
-	indr_dev->refcnt++;
-	return indr_dev;
-}
-
-static void tc_indr_block_dev_put(struct tc_indr_block_dev *indr_dev)
-{
-	if (--indr_dev->refcnt)
-		return;
-
-	rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node,
-			       tc_indr_setup_block_ht_params);
-	kfree(indr_dev);
-}
-
-static struct tc_indr_block_cb *
-tc_indr_block_cb_lookup(struct tc_indr_block_dev *indr_dev,
-			tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
-	struct tc_indr_block_cb *indr_block_cb;
-
-	list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
-		if (indr_block_cb->cb == cb &&
-		    indr_block_cb->cb_ident == cb_ident)
-			return indr_block_cb;
-	return NULL;
-}
-
-static struct tc_indr_block_cb *
-tc_indr_block_cb_add(struct tc_indr_block_dev *indr_dev, void *cb_priv,
-		     tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
-	struct tc_indr_block_cb *indr_block_cb;
-
-	indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident);
-	if (indr_block_cb)
-		return ERR_PTR(-EEXIST);
-
-	indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL);
-	if (!indr_block_cb)
-		return ERR_PTR(-ENOMEM);
-
-	indr_block_cb->cb_priv = cb_priv;
-	indr_block_cb->cb = cb;
-	indr_block_cb->cb_ident = cb_ident;
-	list_add(&indr_block_cb->list, &indr_dev->cb_list);
-
-	return indr_block_cb;
-}
-
-static void tc_indr_block_cb_del(struct tc_indr_block_cb *indr_block_cb)
-{
-	list_del(&indr_block_cb->list);
-	kfree(indr_block_cb);
-}
-
 static int tcf_block_setup(struct tcf_block *block,
 			   struct flow_block_offload *bo);
 
-static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev,
-				  struct tc_indr_block_cb *indr_block_cb,
+static void tc_indr_block_ing_cmd(struct net_device *dev,
+				  struct flow_block *flow_block,
+				  struct flow_indr_block_cb *indr_block_cb,
 				  enum flow_block_command command)
 {
+	struct tcf_block *block = flow_block ?
+				  container_of(flow_block,
+					       struct tcf_block,
+					       flow_block) : NULL;
 	struct flow_block_offload bo = {
 		.command	= command,
 		.binder_type	= FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
-		.net		= dev_net(indr_dev->dev),
-		.block_shared	= tcf_block_non_null_shared(indr_dev->block),
+		.net		= dev_net(dev),
+		.block_shared	= tcf_block_non_null_shared(block),
 	};
 	INIT_LIST_HEAD(&bo.cb_list);
 
-	if (!indr_dev->block)
-		return;
-
-	bo.block = &indr_dev->block->flow_block;
-
-	indr_block_cb->cb(indr_dev->dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
-			  &bo);
-	tcf_block_setup(indr_dev->block, &bo);
-}
-
-int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
-				tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
-	struct tc_indr_block_cb *indr_block_cb;
-	struct tc_indr_block_dev *indr_dev;
-	int err;
-
-	indr_dev = tc_indr_block_dev_get(dev);
-	if (!indr_dev)
-		return -ENOMEM;
-
-	indr_block_cb = tc_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident);
-	err = PTR_ERR_OR_ZERO(indr_block_cb);
-	if (err)
-		goto err_dev_put;
-
-	tc_indr_block_ing_cmd(indr_dev, indr_block_cb, FLOW_BLOCK_BIND);
-	return 0;
-
-err_dev_put:
-	tc_indr_block_dev_put(indr_dev);
-	return err;
-}
-EXPORT_SYMBOL_GPL(__tc_indr_block_cb_register);
-
-int tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
-			      tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
-	int err;
-
-	rtnl_lock();
-	err = __tc_indr_block_cb_register(dev, cb_priv, cb, cb_ident);
-	rtnl_unlock();
-
-	return err;
-}
-EXPORT_SYMBOL_GPL(tc_indr_block_cb_register);
-
-void __tc_indr_block_cb_unregister(struct net_device *dev,
-				   tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
-	struct tc_indr_block_cb *indr_block_cb;
-	struct tc_indr_block_dev *indr_dev;
-
-	indr_dev = tc_indr_block_dev_lookup(dev);
-	if (!indr_dev)
+	if (!block)
 		return;
 
-	indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident);
-	if (!indr_block_cb)
-		return;
+	bo.block = flow_block;
 
-	/* Send unbind message if required to free any block cbs. */
-	tc_indr_block_ing_cmd(indr_dev, indr_block_cb, FLOW_BLOCK_UNBIND);
-	tc_indr_block_cb_del(indr_block_cb);
-	tc_indr_block_dev_put(indr_dev);
-}
-EXPORT_SYMBOL_GPL(__tc_indr_block_cb_unregister);
+	indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK, &bo);
 
-void tc_indr_block_cb_unregister(struct net_device *dev,
-				 tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
-	rtnl_lock();
-	__tc_indr_block_cb_unregister(dev, cb, cb_ident);
-	rtnl_unlock();
+	tcf_block_setup(block, &bo);
 }
-EXPORT_SYMBOL_GPL(tc_indr_block_cb_unregister);
 
 static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev,
 			       struct tcf_block_ext_info *ei,
 			       enum flow_block_command command,
 			       struct netlink_ext_ack *extack)
 {
-	struct tc_indr_block_cb *indr_block_cb;
-	struct tc_indr_block_dev *indr_dev;
+	struct flow_indr_block_cb *indr_block_cb;
+	struct flow_indr_block_dev *indr_dev;
 	struct flow_block_offload bo = {
 		.command	= command,
 		.binder_type	= ei->binder_type,
@@ -784,11 +593,12 @@ static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev,
 	};
 	INIT_LIST_HEAD(&bo.cb_list);
 
-	indr_dev = tc_indr_block_dev_lookup(dev);
+	indr_dev = flow_indr_block_dev_lookup(dev);
 	if (!indr_dev)
 		return;
 
-	indr_dev->block = command == FLOW_BLOCK_BIND ? block : NULL;
+	indr_dev->flow_block = command == FLOW_BLOCK_BIND ? &block->flow_block : NULL;
+	indr_dev->ing_cmd_cb = command == FLOW_BLOCK_BIND ? tc_indr_block_ing_cmd : NULL;
 
 	list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
 		indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
@@ -3358,11 +3168,6 @@ static int __init tc_filter_init(void)
 	if (err)
 		goto err_register_pernet_subsys;
 
-	err = rhashtable_init(&indr_setup_block_ht,
-			      &tc_indr_setup_block_ht_params);
-	if (err)
-		goto err_rhash_setup_block_ht;
-
 	rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL,
 		      RTNL_FLAG_DOIT_UNLOCKED);
 	rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL,
@@ -3376,8 +3181,6 @@ static int __init tc_filter_init(void)
 
 	return 0;
 
-err_rhash_setup_block_ht:
-	unregister_pernet_subsys(&tcf_net_ops);
 err_register_pernet_subsys:
 	destroy_workqueue(tc_filter_wq);
 	return err;
-- 
1.8.3.1


^ permalink raw reply related

* Re: [PATCH] rocker: fix memory leaks of fib_work on two error return paths
From: Jiri Pirko @ 2019-07-28  7:46 UTC (permalink / raw)
  To: Colin King
  Cc: David Ahern, David S . Miller, netdev, kernel-janitors,
	linux-kernel
In-Reply-To: <20190727233726.3121-1-colin.king@canonical.com>

Sun, Jul 28, 2019 at 01:37:26AM CEST, colin.king@canonical.com wrote:
>From: Colin Ian King <colin.king@canonical.com>
>
>Currently there are two error return paths that leak memory allocated
>to fib_work. Fix this by kfree'ing fib_work before returning.
>
>Addresses-Coverity: ("Resource leak")
>Fixes: 19a9d136f198 ("ipv4: Flag fib_info with a fib_nh using IPv6 gateway")
>Fixes: dbcc4fa718ee ("rocker: Fail attempts to use routes with nexthop objects")
>Signed-off-by: Colin Ian King <colin.king@canonical.com>

Acked-by: Jiri Pirko <jiri@mellanox.com>

^ permalink raw reply

* Re: INFO: rcu detected stall in vhost_worker
From: Michael S. Tsirkin @ 2019-07-28  8:36 UTC (permalink / raw)
  To: Hillf Danton
  Cc: syzbot, jasowang, kvm, linux-kbuild, linux-kernel, michal.lkml,
	netdev, syzkaller-bugs, torvalds, virtualization, yamada.masahiro
In-Reply-To: <000000000000e87d14058e9728d7@google.com>

On Sat, Jul 27, 2019 at 04:23:23PM +0800, Hillf Danton wrote:
> 
> Fri, 26 Jul 2019 08:26:01 -0700 (PDT)
> > syzbot has bisected this bug to:
> > 
> > commit 0ecfebd2b52404ae0c54a878c872bb93363ada36
> > Author: Linus Torvalds <torvalds@linux-foundation.org>
> > Date:   Sun Jul 7 22:41:56 2019 +0000
> > 
> >      Linux 5.2
> > 
> > bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=118810bfa00000
> > start commit:   13bf6d6a Add linux-next specific files for 20190725
> > git tree:       linux-next
> > kernel config:  https://syzkaller.appspot.com/x/.config?x=8ae987d803395886
> > dashboard link: https://syzkaller.appspot.com/bug?extid=36e93b425cd6eb54fcc1
> > syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=15112f3fa00000
> > C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=131ab578600000
> > 
> > Reported-by: syzbot+36e93b425cd6eb54fcc1@syzkaller.appspotmail.com
> > Fixes: 0ecfebd2b524 ("Linux 5.2")
> > 
> > For information about bisection process see: https://goo.gl/tpsmEJ#bisection
> 
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -787,7 +787,6 @@ static void vhost_setup_uaddr(struct vho
> 			      size_t size, bool write)
> {
> 	struct vhost_uaddr *addr = &vq->uaddrs[index];
> -	spin_lock(&vq->mmu_lock);
> 
> 	addr->uaddr = uaddr;
> 	addr->size = size;
> @@ -797,7 +796,10 @@ static void vhost_setup_uaddr(struct vho
> static void vhost_setup_vq_uaddr(struct vhost_virtqueue *vq)
> {
> 	spin_lock(&vq->mmu_lock);
> -
> +	/*
> +	 * deadlock if managing to take mmu_lock again while
> +	 * setting up uaddr
> +	 */
> 	vhost_setup_uaddr(vq, VHOST_ADDR_DESC,
> 			  (unsigned long)vq->desc,
> 			  vhost_get_desc_size(vq, vq->num),
> --

Thanks!
I reverted this whole commit.

-- 
MST

^ permalink raw reply

* Re: [PATCH v6 rdma-next 1/6] RDMA/core: Create mmap database and cookie helper functions
From: Gal Pressman @ 2019-07-28  8:45 UTC (permalink / raw)
  To: Jason Gunthorpe, Michal Kalderon
  Cc: Kamal Heib, Ariel Elior, dledford@redhat.com,
	linux-rdma@vger.kernel.org, davem@davemloft.net,
	netdev@vger.kernel.org
In-Reply-To: <20190726132316.GA8695@ziepe.ca>

On 26/07/2019 16:23, Jason Gunthorpe wrote:
> On Fri, Jul 26, 2019 at 08:42:07AM +0000, Michal Kalderon wrote:
> 
>>>> But we don't free entires from the xa_array ( only when ucontext is
>>>> destroyed) so how will There be an empty element after we wrap ?
>>>
>>> Oh!
>>>
>>> That should be fixed up too, in the general case if a user is
>>> creating/destroying driver objects in loop we don't want memory usage to
>>> be unbounded.
>>>
>>> The rdma_user_mmap stuff has VMA ops that can refcount the xa entry and
>>> now that this is core code it is easy enough to harmonize the two things and
>>> track the xa side from the struct rdma_umap_priv
>>>
>>> The question is, does EFA or qedr have a use model for this that allows a
>>> userspace verb to create/destroy in a loop? ie do we need to fix this right
>>> now?
> 
>> The mapping occurs for every qp and cq creation. So yes.
>>
>> So do you mean add a ref-cnt to the xarray entry and from umap
>> decrease the refcnt and free?
> 
> Yes, free the entry (release the HW resource) and release the xa_array
> ID.

This is a bit tricky for EFA.
The UAR BAR resources (LLQ for example) aren't cleaned up until the UAR is
deallocated, so many of the entries won't really be freed when the refcount
reaches zero (i.e the HW considers these entries as refcounted as long as the
UAR exists). The best we can do is free the DMA buffers for appropriate entries.

^ permalink raw reply

* [PATCH net-next] r8169: make use of xmit_more
From: Heiner Kallweit @ 2019-07-28  9:25 UTC (permalink / raw)
  To: Realtek linux nic maintainers, David Miller
  Cc: netdev@vger.kernel.org, Sander Eikelenboom, Eric Dumazet

There was a previous attempt to use xmit_more, but the change had to be
reverted because under load sometimes a transmit timeout occurred [0].
Maybe this was caused by a missing memory barrier, the new attempt
keeps the memory barrier before the call to netif_stop_queue like it
is used by the driver as of today. The new attempt also changes the
order of some calls as suggested by Eric.

[0] https://lkml.org/lkml/2019/2/10/39

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
 drivers/net/ethernet/realtek/r8169_main.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index 864ca529d..d9261e68f 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -5637,6 +5637,8 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb,
 	struct device *d = tp_to_dev(tp);
 	dma_addr_t mapping;
 	u32 opts[2], len;
+	bool stop_queue;
+	bool door_bell;
 	int frags;
 
 	if (unlikely(!rtl_tx_slots_avail(tp, skb_shinfo(skb)->nr_frags))) {
@@ -5680,13 +5682,13 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb,
 
 	txd->opts2 = cpu_to_le32(opts[1]);
 
-	netdev_sent_queue(dev, skb->len);
-
 	skb_tx_timestamp(skb);
 
 	/* Force memory writes to complete before releasing descriptor */
 	dma_wmb();
 
+	door_bell = __netdev_sent_queue(dev, skb->len, netdev_xmit_more());
+
 	txd->opts1 = rtl8169_get_txd_opts1(opts[0], len, entry);
 
 	/* Force all memory writes to complete before notifying device */
@@ -5694,14 +5696,19 @@ static netdev_tx_t rtl8169_start_xmit(struct sk_buff *skb,
 
 	tp->cur_tx += frags + 1;
 
-	RTL_W8(tp, TxPoll, NPQ);
-
-	if (!rtl_tx_slots_avail(tp, MAX_SKB_FRAGS)) {
+	stop_queue = !rtl_tx_slots_avail(tp, MAX_SKB_FRAGS);
+	if (unlikely(stop_queue)) {
 		/* Avoid wrongly optimistic queue wake-up: rtl_tx thread must
 		 * not miss a ring update when it notices a stopped queue.
 		 */
 		smp_wmb();
 		netif_stop_queue(dev);
+	}
+
+	if (door_bell)
+		RTL_W8(tp, TxPoll, NPQ);
+
+	if (unlikely(stop_queue)) {
 		/* Sync with rtl_tx:
 		 * - publish queue status and cur_tx ring index (write barrier)
 		 * - refresh dirty_tx ring index (read barrier).
-- 
2.22.0


^ permalink raw reply related

* Re: [PATCH v6 rdma-next 1/6] RDMA/core: Create mmap database and cookie helper functions
From: Kamal Heib @ 2019-07-28  9:30 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Michal Kalderon, ariel.elior, dledford, galpress, linux-rdma,
	davem, netdev
In-Reply-To: <20190725175540.GA18757@ziepe.ca>

On Thu, Jul 25, 2019 at 02:55:40PM -0300, Jason Gunthorpe wrote:
> On Tue, Jul 09, 2019 at 05:17:30PM +0300, Michal Kalderon wrote:
> > Create some common API's for adding entries to a xa_mmap.
> > Searching for an entry and freeing one.
> > 
> > The code was copied from the efa driver almost as is, just renamed
> > function to be generic and not efa specific.
> > 
> > Signed-off-by: Ariel Elior <ariel.elior@marvell.com>
> > Signed-off-by: Michal Kalderon <michal.kalderon@marvell.com>
> >  drivers/infiniband/core/device.c      |   1 +
> >  drivers/infiniband/core/rdma_core.c   |   1 +
> >  drivers/infiniband/core/uverbs_cmd.c  |   1 +
> >  drivers/infiniband/core/uverbs_main.c | 135 ++++++++++++++++++++++++++++++++++
> >  include/rdma/ib_verbs.h               |  46 ++++++++++++
> >  5 files changed, 184 insertions(+)
> > 
> > diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
> > index 8a6ccb936dfe..a830c2c5d691 100644
> > +++ b/drivers/infiniband/core/device.c
> > @@ -2521,6 +2521,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
> >  	SET_DEVICE_OP(dev_ops, map_mr_sg_pi);
> >  	SET_DEVICE_OP(dev_ops, map_phys_fmr);
> >  	SET_DEVICE_OP(dev_ops, mmap);
> > +	SET_DEVICE_OP(dev_ops, mmap_free);
> >  	SET_DEVICE_OP(dev_ops, modify_ah);
> >  	SET_DEVICE_OP(dev_ops, modify_cq);
> >  	SET_DEVICE_OP(dev_ops, modify_device);
> > diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
> > index ccf4d069c25c..1ed01b02401f 100644
> > +++ b/drivers/infiniband/core/rdma_core.c
> > @@ -816,6 +816,7 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile,
> >  
> >  	rdma_restrack_del(&ucontext->res);
> >  
> > +	rdma_user_mmap_entries_remove_free(ucontext);
> >  	ib_dev->ops.dealloc_ucontext(ucontext);
> >  	kfree(ucontext);
> >  
> > diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
> > index 7ddd0e5bc6b3..44c0600245e4 100644
> > +++ b/drivers/infiniband/core/uverbs_cmd.c
> > @@ -254,6 +254,7 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)
> >  
> >  	mutex_init(&ucontext->per_mm_list_lock);
> >  	INIT_LIST_HEAD(&ucontext->per_mm_list);
> > +	xa_init(&ucontext->mmap_xa);
> >  
> >  	ret = get_unused_fd_flags(O_CLOEXEC);
> >  	if (ret < 0)
> > diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
> > index 11c13c1381cf..4b909d7b97de 100644
> > +++ b/drivers/infiniband/core/uverbs_main.c
> > @@ -965,6 +965,141 @@ int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
> >  }
> >  EXPORT_SYMBOL(rdma_user_mmap_io);
> >  
> > +static inline u64
> > +rdma_user_mmap_get_key(const struct rdma_user_mmap_entry *entry)
> > +{
> > +	return (u64)entry->mmap_page << PAGE_SHIFT;
> > +}
> > +
> > +/**
> > + * rdma_user_mmap_entry_get() - Get an entry from the mmap_xa.
> > + *
> > + * @ucontext: associated user context.
> > + * @key: The key received from rdma_user_mmap_entry_insert which
> > + *     is provided by user as the address to map.
> > + * @len: The length the user wants to map
> > + *
> > + * This function is called when a user tries to mmap a key it
> > + * initially received from the driver. They key was created by
> > + * the function rdma_user_mmap_entry_insert.
> > + *
> > + * Return an entry if exists or NULL if there is no match.
> > + */
> > +struct rdma_user_mmap_entry *
> > +rdma_user_mmap_entry_get(struct ib_ucontext *ucontext, u64 key, u64 len)
> > +{
> > +	struct rdma_user_mmap_entry *entry;
> > +	u64 mmap_page;
> > +
> > +	mmap_page = key >> PAGE_SHIFT;
> > +	if (mmap_page > U32_MAX)
> > +		return NULL;
> > +
> > +	entry = xa_load(&ucontext->mmap_xa, mmap_page);
> > +	if (!entry || entry->length != len)
> > +		return NULL;
> > +
> > +	ibdev_dbg(ucontext->device,
> > +		  "mmap: obj[0x%p] key[%#llx] addr[%#llx] len[%#llx] removed\n",
> > +		  entry->obj, key, entry->address, entry->length);
> > +
> > +	return entry;
> > +}
> > +EXPORT_SYMBOL(rdma_user_mmap_entry_get);
> 
> It is a mistake we keep making, and maybe the war is hopelessly lost
> now, but functions called from a driver should not be part of the
> ib_uverbs module - ideally uverbs is an optional module. They should
> be in ib_core.
> 
> Maybe put this in ib_core_uverbs.c ?
> 
> Kamal, you've been tackling various cleanups, maybe making ib_uverbs
> unloadable again is something you'd be keen on?
>

Yes, Could you please give some background on that?


> > +/**
> > + * rdma_user_mmap_entry_insert() - Allocate and insert an entry to the mmap_xa.
> > + *
> > + * @ucontext: associated user context.
> > + * @obj: opaque driver object that will be stored in the entry.
> > + * @address: The address that will be mmapped to the user
> > + * @length: Length of the address that will be mmapped
> > + * @mmap_flag: opaque driver flags related to the address (For
> > + *           example could be used for cachability)
> > + *
> > + * This function should be called by drivers that use the rdma_user_mmap
> > + * interface for handling user mmapped addresses. The database is handled in
> > + * the core and helper functions are provided to insert entries into the
> > + * database and extract entries when the user call mmap with the given key.
> > + * The function returns a unique key that should be provided to user, the user
> > + * will use the key to map the given address.
> > + *
> > + * Note this locking scheme cannot support removal of entries,
> > + * except during ucontext destruction when the core code
> > + * guarentees no concurrency.
> > + *
> > + * Return: unique key or RDMA_USER_MMAP_INVALID if entry was not added.
> > + */
> > +u64 rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext, void *obj,
> > +				u64 address, u64 length, u8 mmap_flag)
> > +{
> > +	struct rdma_user_mmap_entry *entry;
> > +	u32 next_mmap_page;
> > +	int err;
> > +
> > +	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
> > +	if (!entry)
> > +		return RDMA_USER_MMAP_INVALID;
> > +
> > +	entry->obj = obj;
> > +	entry->address = address;
> > +	entry->length = length;
> > +	entry->mmap_flag = mmap_flag;
> > +
> > +	xa_lock(&ucontext->mmap_xa);
> > +	if (check_add_overflow(ucontext->mmap_xa_page,
> > +			       (u32)(length >> PAGE_SHIFT),
> 
> Should this be divide round up ?
> 
> > +			       &next_mmap_page))
> > +		goto err_unlock;
> 
> I still don't like that this algorithm latches into a permanent
> failure when the xa_page wraps.
> 
> It seems worth spending a bit more time here to tidy this.. Keep using
> the mmap_xa_page scheme, but instead do something like
> 
> alloc_cyclic_range():
> 
> while () {
>    // Find first empty element in a cyclic way
>    xa_page_first = mmap_xa_page;
>    xa_find(xa, &xa_page_first, U32_MAX, XA_FREE_MARK)
> 
>    // Is there a enough room to have the range?
>    if (check_add_overflow(xa_page_first, npages, &xa_page_end)) {
>       mmap_xa_page = 0;
>       continue;
>    }
> 
>    // See if the element before intersects 
>    elm = xa_find(xa, &zero, xa_page_end, 0);
>    if (elm && intersects(xa_page_first, xa_page_last, elm->first, elm->last)) {
>       mmap_xa_page = elm->last + 1;
>       continue
>    }
>   
>    // xa_page_first -> xa_page_end should now be free
>    xa_insert(xa, xa_page_start, entry);
>    mmap_xa_page = xa_page_end + 1;
>    return xa_page_start;
> }
> 
> Approximately, please check it.
> 
> > @@ -2199,6 +2201,17 @@ struct iw_cm_conn_param;
> >  
> >  #define DECLARE_RDMA_OBJ_SIZE(ib_struct) size_t size_##ib_struct
> >  
> > +#define RDMA_USER_MMAP_FLAG_SHIFT 56
> > +#define RDMA_USER_MMAP_PAGE_MASK GENMASK(EFA_MMAP_FLAG_SHIFT - 1, 0)
> > +#define RDMA_USER_MMAP_INVALID U64_MAX
> > +struct rdma_user_mmap_entry {
> > +	void *obj;
> > +	u64 address;
> > +	u64 length;
> > +	u32 mmap_page;
> > +	u8 mmap_flag;
> > +};
> > +
> >  /**
> >   * struct ib_device_ops - InfiniBand device operations
> >   * This structure defines all the InfiniBand device operations, providers will
> > @@ -2311,6 +2324,19 @@ struct ib_device_ops {
> >  			      struct ib_udata *udata);
> >  	void (*dealloc_ucontext)(struct ib_ucontext *context);
> >  	int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma);
> > +	/**
> > +	 * Memory that is mapped to the user can only be freed once the
> > +	 * ucontext of the application is destroyed. This is for
> > +	 * security reasons where we don't want an application to have a
> > +	 * mapping to phyiscal memory that is freed and allocated to
> > +	 * another application. For this reason, all the entries are
> > +	 * stored in ucontext and once ucontext is freed mmap_free is
> > +	 * called on each of the entries. They type of the memory that
> 
> They -> the
> 
> > +	 * was mapped may differ between entries and is opaque to the
> > +	 * rdma_user_mmap interface. Therefore needs to be implemented
> > +	 * by the driver in mmap_free.
> > +	 */
> > +	void (*mmap_free)(struct rdma_user_mmap_entry *entry);
> >  	void (*disassociate_ucontext)(struct ib_ucontext *ibcontext);
> >  	int (*alloc_pd)(struct ib_pd *pd, struct ib_udata *udata);
> >  	void (*dealloc_pd)(struct ib_pd *pd, struct ib_udata *udata);
> > @@ -2709,6 +2735,11 @@ void ib_set_device_ops(struct ib_device *device,
> >  #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
> >  int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
> >  		      unsigned long pfn, unsigned long size, pgprot_t prot);
> > +u64 rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext, void *obj,
> > +				u64 address, u64 length, u8 mmap_flag);
> > +struct rdma_user_mmap_entry *
> > +rdma_user_mmap_entry_get(struct ib_ucontext *ucontext, u64 key, u64 len);
> > +void rdma_user_mmap_entries_remove_free(struct ib_ucontext
> > *ucontext);
> 
> Should remove_free should be in the core-priv header?
> 
> Jason

^ permalink raw reply

* RE: [PATCH] net/mlx5e: Fix zero table prio set by user.
From: Paul Blakey @ 2019-07-28 10:04 UTC (permalink / raw)
  To: Marcelo Ricardo Leitner, wenxu
  Cc: Or Gerlitz, Saeed Mahameed, Roi Dayan, Mark Bloch,
	pablo@netfilter.org, netdev@vger.kernel.org
In-Reply-To: <20190726140142.GC4063@localhost.localdomain>


On 7/26/2019 5:01 PM, Marcelo Ricardo Leitner wrote:
> On Fri, Jul 26, 2019 at 08:39:43PM +0800, wenxu wrote:
>>
>> 在 2019/7/26 20:19, Or Gerlitz 写道:
>>> On Fri, Jul 26, 2019 at 12:24 AM Saeed Mahameed <saeedm@mellanox.com> wrote:
>>>> On Thu, 2019-07-25 at 19:24 +0800, wenxu@ucloud.cn wrote:
>>>>> From: wenxu <wenxu@ucloud.cn>
>>>>>
>>>>> The flow_cls_common_offload prio is zero
>>>>>
>>>>> It leads the invalid table prio in hw.
>>>>>
>>>>> Error: Could not process rule: Invalid argument
>>>>>
>>>>> kernel log:
>>>>> mlx5_core 0000:81:00.0: E-Switch: Failed to create FDB Table err -22
>>>>> (table prio: 65535, level: 0, size: 4194304)
>>>>>
>>>>> table_prio = (chain * FDB_MAX_PRIO) + prio - 1;
>>>>> should check (chain * FDB_MAX_PRIO) + prio is not 0
>>>>>
>>>>> Signed-off-by: wenxu <wenxu@ucloud.cn>
>>>>> ---
>>>>>  drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 4 +++-
>>>>>  1 file changed, 3 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git
>>>>> a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
>>>>> b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
>>>>> index 089ae4d..64ca90f 100644
>>>>> --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
>>>>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
>>>>> @@ -970,7 +970,9 @@ static int esw_add_fdb_miss_rule(struct
>>>> this piece of code isn't in this function, weird how it got to the
>>>> diff, patch applies correctly though !
>>>>
>>>>> mlx5_eswitch *esw)
>>>>>               flags |= (MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT |
>>>>>                         MLX5_FLOW_TABLE_TUNNEL_EN_DECAP);
>>>>>
>>>>> -     table_prio = (chain * FDB_MAX_PRIO) + prio - 1;
>>>>> +     table_prio = (chain * FDB_MAX_PRIO) + prio;
>>>>> +     if (table_prio)
>>>>> +             table_prio = table_prio - 1;
>>>>>
>>>> This is black magic, even before this fix.
>>>> this -1 seems to be needed in order to call
>>>> create_next_size_table(table_prio) with the previous "table prio" ?
>>>> (table_prio - 1)  ?
>>>>
>>>> The whole thing looks wrong to me since when prio is 0 and chain is 0,
>>>> there is not such thing table_prio - 1.
>>>>
>>>> mlnx eswitch guys in the cc, please advise.
>>> basically, prio 0 is not something we ever get in the driver, since if
>>> user space
>>> specifies 0, the kernel generates some random non-zero prio, and we support
>>> only prios 1-16 -- Wenxu -- what do you run to get this error?
>>>
>>>
>> I run offload with nfatbles(but not tc), there is no prio for each rule.
>>
>> prio of flow_cls_common_offload init as 0.
>>
>> static void nft_flow_offload_common_init(struct flow_cls_common_offload *common,
>>
>>                      __be16 proto,
>>                     struct netlink_ext_ack *extack)
>> {
>>     common->protocol = proto;
>>     common->extack = extack;
>> }
>>
>>
>> flow_cls_common_offload
>
> Note that on
> [PATCH net-next] netfilter: nf_table_offload: Fix zero prio of flow_cls_common_offload
> I asked Pablo on how nftables should behave on this situation.
>
> It's the same issue as in the patch above but being fixed at a
> different level.

That's better, since the original code relied on not having prio 0 as valid, the suggested fix (net/mlx5e: Fix zero table prio set by user) maps NFT offload prio 0 and tc prio 1 to the same

hardware table. This is wrong and can cause issues.

^ permalink raw reply

* Re: next-20190723: bpf/seccomp - systemd/journald issue?
From: Sedat Dilek @ 2019-07-28 11:09 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Alexei Starovoitov, Alexei Starovoitov, Daniel Borkmann,
	Martin Lau, Song Liu, netdev@vger.kernel.org, bpf@vger.kernel.org,
	Clang-Built-Linux ML, Kees Cook, Nick Desaulniers,
	Nathan Chancellor
In-Reply-To: <934a2a0a-c3fb-fd75-b8a3-c1042d73ca0c@fb.com>

On Sat, Jul 27, 2019 at 7:08 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 7/27/19 12:36 AM, Sedat Dilek wrote:
> > On Sat, Jul 27, 2019 at 4:24 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> >>
> >> On Fri, Jul 26, 2019 at 2:19 PM Sedat Dilek <sedat.dilek@gmail.com> wrote:
> >>>
> >>> On Fri, Jul 26, 2019 at 11:10 PM Yonghong Song <yhs@fb.com> wrote:
> >>>>
> >>>>
> >>>>
> >>>> On 7/26/19 2:02 PM, Sedat Dilek wrote:
> >>>>> On Fri, Jul 26, 2019 at 10:38 PM Sedat Dilek <sedat.dilek@gmail.com> wrote:
> >>>>>>
> >>>>>> Hi Yonghong Song,
> >>>>>>
> >>>>>> On Fri, Jul 26, 2019 at 5:45 PM Yonghong Song <yhs@fb.com> wrote:
> >>>>>>>
> >>>>>>>
> >>>>>>>
> >>>>>>> On 7/26/19 1:26 AM, Sedat Dilek wrote:
> >>>>>>>> Hi,
> >>>>>>>>
> >>>>>>>> I have opened a new issue in the ClangBuiltLinux issue tracker.
> >>>>>>>
> >>>>>>> Glad to know clang 9 has asm goto support and now It can compile
> >>>>>>> kernel again.
> >>>>>>>
> >>>>>>
> >>>>>> Yupp.
> >>>>>>
> >>>>>>>>
> >>>>>>>> I am seeing a problem in the area bpf/seccomp causing
> >>>>>>>> systemd/journald/udevd services to fail.
> >>>>>>>>
> >>>>>>>> [Fri Jul 26 08:08:43 2019] systemd[453]: systemd-udevd.service: Failed
> >>>>>>>> to connect stdout to the journal socket, ignoring: Connection refused
> >>>>>>>>
> >>>>>>>> This happens when I use the (LLVM) LLD ld.lld-9 linker but not with
> >>>>>>>> BFD linker ld.bfd on Debian/buster AMD64.
> >>>>>>>> In both cases I use clang-9 (prerelease).
> >>>>>>>
> >>>>>>> Looks like it is a lld bug.
> >>>>>>>
> >>>>>>> I see the stack trace has __bpf_prog_run32() which is used by
> >>>>>>> kernel bpf interpreter. Could you try to enable bpf jit
> >>>>>>>      sysctl net.core.bpf_jit_enable = 1
> >>>>>>> If this passed, it will prove it is interpreter related.
> >>>>>>>
> >>>>>>
> >>>>>> After...
> >>>>>>
> >>>>>> sysctl -w net.core.bpf_jit_enable=1
> >>>>>>
> >>>>>> I can start all failed systemd services.
> >>>>>>
> >>>>>> systemd-journald.service
> >>>>>> systemd-udevd.service
> >>>>>> haveged.service
> >>>>>>
> >>>>>> This is in maintenance mode.
> >>>>>>
> >>>>>> What is next: Do set a permanent sysctl setting for net.core.bpf_jit_enable?
> >>>>>>
> >>>>>
> >>>>> This is what I did:
> >>>>
> >>>> I probably won't have cycles to debug this potential lld issue.
> >>>> Maybe you already did, I suggest you put enough reproducible
> >>>> details in the bug you filed against lld so they can take a look.
> >>>>
> >>>
> >>> I understand and will put the journalctl-log into the CBL issue
> >>> tracker and update informations.
> >>>
> >>> Thanks for your help understanding the BPF correlations.
> >>>
> >>> Is setting 'net.core.bpf_jit_enable = 2' helpful here?
> >>
> >> jit_enable=1 is enough.
> >> Or use CONFIG_BPF_JIT_ALWAYS_ON to workaround.
> >>
> >> It sounds like clang miscompiles interpreter.
> >> modprobe test_bpf
> >> should be able to point out which part of interpreter is broken.
> >
> > Maybe we need something like...
> >
> > "bpf: Disable GCC -fgcse optimization for ___bpf_prog_run()"
> >
> > ...for clang?
>
> Not sure how do you get conclusion it is gcse causing the problem.
> But anyway, adding such flag in the kernel is not a good idea.
> clang/llvm should be fixed instead. Esp. there is still time
> for 9.0.0 release to fix bugs.
>

To clarify: This is a snapshot release of clang-9 built with tc-build.

Building with -O0 is not possible as I see asm-goto failing.

- Sedat -

[1] https://github.com/ClangBuiltLinux/tc-build

> >
> > - Sedat -
> >
> > [1] https://git.kernel.org/linus/3193c0836f203a91bef96d88c64cccf0be090d9c
> >

^ permalink raw reply

* ip route JSON format is unparseable for "unreachable" routes
From: Michael Ziegler @ 2019-07-28 11:09 UTC (permalink / raw)
  To: netdev

Hi,

I created a couple "unreachable" routes on one of my systems, like such:

> ip route add unreachable 10.0.0.0/8     metric 255
> ip route add unreachable 192.168.0.0/16 metric 255

Unfortunately this results in unparseable JSON output from "ip":

> # ip -j route show  | jq .
> parse error: Objects must consist of key:value pairs at line 1, column 84

The offending JSON objects are these:

> {"unreachable","dst":"10.0.0.0/8","metric":255,"flags":[]}
> {"unreachable","dst":"192.168.0.0/16","metric":255,"flags":[]}
"unreachable" cannot appear on its own here, it needs to be some kind of
field.

The manpage says to report here, thus I do :) I've searched the
archives, but I wasn't able to find any existing bug reports about this.
I'm running version

> ip utility, iproute2-ss190107

on Debian Buster.

Regards,
Michael.

^ permalink raw reply

* Re: next-20190723: bpf/seccomp - systemd/journald issue?
From: Sedat Dilek @ 2019-07-28 11:16 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Alexei Starovoitov, Alexei Starovoitov, Daniel Borkmann,
	Martin Lau, Song Liu, netdev@vger.kernel.org, bpf@vger.kernel.org,
	Clang-Built-Linux ML, Kees Cook, Nick Desaulniers,
	Nathan Chancellor
In-Reply-To: <57169960-35c2-d9d3-94e4-3b5a43d5aca7@fb.com>

On Sat, Jul 27, 2019 at 7:11 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 7/27/19 1:16 AM, Sedat Dilek wrote:
> > On Sat, Jul 27, 2019 at 9:36 AM Sedat Dilek <sedat.dilek@gmail.com> wrote:
> >>
> >> On Sat, Jul 27, 2019 at 4:24 AM Alexei Starovoitov
> >> <alexei.starovoitov@gmail.com> wrote:
> >>>
> >>> On Fri, Jul 26, 2019 at 2:19 PM Sedat Dilek <sedat.dilek@gmail.com> wrote:
> >>>>
> >>>> On Fri, Jul 26, 2019 at 11:10 PM Yonghong Song <yhs@fb.com> wrote:
> >>>>>
> >>>>>
> >>>>>
> >>>>> On 7/26/19 2:02 PM, Sedat Dilek wrote:
> >>>>>> On Fri, Jul 26, 2019 at 10:38 PM Sedat Dilek <sedat.dilek@gmail.com> wrote:
> >>>>>>>
> >>>>>>> Hi Yonghong Song,
> >>>>>>>
> >>>>>>> On Fri, Jul 26, 2019 at 5:45 PM Yonghong Song <yhs@fb.com> wrote:
> >>>>>>>>
> >>>>>>>>
> >>>>>>>>
> >>>>>>>> On 7/26/19 1:26 AM, Sedat Dilek wrote:
> >>>>>>>>> Hi,
> >>>>>>>>>
> >>>>>>>>> I have opened a new issue in the ClangBuiltLinux issue tracker.
> >>>>>>>>
> >>>>>>>> Glad to know clang 9 has asm goto support and now It can compile
> >>>>>>>> kernel again.
> >>>>>>>>
> >>>>>>>
> >>>>>>> Yupp.
> >>>>>>>
> >>>>>>>>>
> >>>>>>>>> I am seeing a problem in the area bpf/seccomp causing
> >>>>>>>>> systemd/journald/udevd services to fail.
> >>>>>>>>>
> >>>>>>>>> [Fri Jul 26 08:08:43 2019] systemd[453]: systemd-udevd.service: Failed
> >>>>>>>>> to connect stdout to the journal socket, ignoring: Connection refused
> >>>>>>>>>
> >>>>>>>>> This happens when I use the (LLVM) LLD ld.lld-9 linker but not with
> >>>>>>>>> BFD linker ld.bfd on Debian/buster AMD64.
> >>>>>>>>> In both cases I use clang-9 (prerelease).
> >>>>>>>>
> >>>>>>>> Looks like it is a lld bug.
> >>>>>>>>
> >>>>>>>> I see the stack trace has __bpf_prog_run32() which is used by
> >>>>>>>> kernel bpf interpreter. Could you try to enable bpf jit
> >>>>>>>>      sysctl net.core.bpf_jit_enable = 1
> >>>>>>>> If this passed, it will prove it is interpreter related.
> >>>>>>>>
> >>>>>>>
> >>>>>>> After...
> >>>>>>>
> >>>>>>> sysctl -w net.core.bpf_jit_enable=1
> >>>>>>>
> >>>>>>> I can start all failed systemd services.
> >>>>>>>
> >>>>>>> systemd-journald.service
> >>>>>>> systemd-udevd.service
> >>>>>>> haveged.service
> >>>>>>>
> >>>>>>> This is in maintenance mode.
> >>>>>>>
> >>>>>>> What is next: Do set a permanent sysctl setting for net.core.bpf_jit_enable?
> >>>>>>>
> >>>>>>
> >>>>>> This is what I did:
> >>>>>
> >>>>> I probably won't have cycles to debug this potential lld issue.
> >>>>> Maybe you already did, I suggest you put enough reproducible
> >>>>> details in the bug you filed against lld so they can take a look.
> >>>>>
> >>>>
> >>>> I understand and will put the journalctl-log into the CBL issue
> >>>> tracker and update informations.
> >>>>
> >>>> Thanks for your help understanding the BPF correlations.
> >>>>
> >>>> Is setting 'net.core.bpf_jit_enable = 2' helpful here?
> >>>
> >>> jit_enable=1 is enough.
> >>> Or use CONFIG_BPF_JIT_ALWAYS_ON to workaround.
> >>>
> >>> It sounds like clang miscompiles interpreter.
> >
> > Just to clarify:
> > This does not happen with clang-9 + ld.bfd (GNU/ld linker).
> >
> >>> modprobe test_bpf
> >>> should be able to point out which part of interpreter is broken.
> >>
> >> Maybe we need something like...
> >>
> >> "bpf: Disable GCC -fgcse optimization for ___bpf_prog_run()"
> >>
> >> ...for clang?
> >>
> >
> > Not sure if something like GCC's...
> >
> > -fgcse
> >
> > Perform a global common subexpression elimination pass. This pass also
> > performs global constant and copy propagation.
> >
> > Note: When compiling a program using computed gotos, a GCC extension,
> > you may get better run-time performance if you disable the global
> > common subexpression elimination pass by adding -fno-gcse to the
> > command line.
> >
> > Enabled at levels -O2, -O3, -Os.
> >
> > ...is available for clang.
> >
> > I tried with hopping to turn off "global common subexpression elimination":
> >
> > diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile
> > index 383c87300b0d..92f934a1e9ff 100644
> > --- a/arch/x86/net/Makefile
> > +++ b/arch/x86/net/Makefile
> > @@ -3,6 +3,8 @@
> >   # Arch-specific network modules
> >   #
> >
> > +KBUILD_CFLAGS += -O0
>
> This won't work. First, you added to the wrong file. The interpreter
> is at kernel/bpf/core.c.
>

Thanks for the clarification.
I mixed up the x86 BPF JIT compiler with the BPF interpreter.

I see no diff in the disassembled kernel/bpf/core.o in my clang9-bfd
and clang9-lld build-dirs.

l$ objdump -M intel -d linux.clang9-bfd/kernel/bpf/core.o >
bpf_core_o_clang9-bfd.txt
$ objdump -M intel -d linux.clang9-lld/kernel/bpf/core.o >
bpf_core_o_clang9-lld.txt

--- bpf_core_o_clang9-bfd.txt   2019-07-28 13:11:59.363552042 +0200
+++ bpf_core_o_clang9-lld.txt   2019-07-28 13:12:09.975535278 +0200
@@ -1,5 +1,5 @@

-linux.clang9-bfd/kernel/bpf/core.o:     file format elf64-x86-64
+linux.clang9-lld/kernel/bpf/core.o:     file format elf64-x86-64


 Disassembly of section .text:

> Second, kernel may have compilation issues with -O0.
>

Confirmed.

- Sedat -

> > +
> >   ifeq ($(CONFIG_X86_32),y)
> >           obj-$(CONFIG_BPF_JIT) += bpf_jit_comp32.o
> >   else
> >
> > Still see...
> > BROKEN: test_bpf: #294 BPF_MAXINSNS: Jump, gap, jump, ... jited:0
> >
> > - Sedat -
> >

^ permalink raw reply

* [patch net] net: fix ifindex collision during namespace removal
From: Jiri Pirko @ 2019-07-28 12:56 UTC (permalink / raw)
  To: netdev
  Cc: davem, xemul, edumazet, pabeni, idosch, petrm, sd, f.fainelli,
	stephen, mlxsw, Jiri Pirko

From: Jiri Pirko <jiri@mellanox.com>

Commit aca51397d014 ("netns: Fix arbitrary net_device-s corruptions
on net_ns stop.") introduced a possibility to hit a BUG in case device
is returning back to init_net and two following conditions are met:
1) dev->ifindex value is used in a name of another "dev%d"
   device in init_net.
2) dev->name is used by another device in init_net.

Under real life circumstances this is hard to get. Therefore this has
been present happily for over 10 years. To reproduce:

$ ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
    inet6 ::1/128 scope host
       valid_lft forever preferred_lft forever
2: dummy0: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 86:89:3f:86:61:29 brd ff:ff:ff:ff:ff:ff
3: enp0s2: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 52:54:00:12:34:56 brd ff:ff:ff:ff:ff:ff
$ ip netns add ns1
$ ip -n ns1 link add dummy1ns1 type dummy
$ ip -n ns1 link add dummy2ns1 type dummy
$ ip link set enp0s2 netns ns1
$ ip -n ns1 link set enp0s2 name dummy0
[  100.858894] virtio_net virtio0 dummy0: renamed from enp0s2
$ ip link add dev4 type dummy
$ ip -n ns1 a
1: lo: <LOOPBACK> mtu 65536 qdisc noop state DOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
2: dummy1ns1: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 16:63:4c:38:3e:ff brd ff:ff:ff:ff:ff:ff
3: dummy2ns1: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether aa:9e:86:dd:6b:5d brd ff:ff:ff:ff:ff:ff
4: dummy0: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 52:54:00:12:34:56 brd ff:ff:ff:ff:ff:ff
$ ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
    inet6 ::1/128 scope host
       valid_lft forever preferred_lft forever
2: dummy0: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 86:89:3f:86:61:29 brd ff:ff:ff:ff:ff:ff
4: dev4: <BROADCAST,NOARP> mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 5a:e1:4a:b6:ec:f8 brd ff:ff:ff:ff:ff:ff
$ ip netns del ns1
[  158.717795] default_device_exit: failed to move dummy0 to init_net: -17
[  158.719316] ------------[ cut here ]------------
[  158.720591] kernel BUG at net/core/dev.c:9824!
[  158.722260] invalid opcode: 0000 [#1] SMP KASAN PTI
[  158.723728] CPU: 0 PID: 56 Comm: kworker/u2:1 Not tainted 5.3.0-rc1+ #18
[  158.725422] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-2.fc30 04/01/2014
[  158.727508] Workqueue: netns cleanup_net
[  158.728915] RIP: 0010:default_device_exit.cold+0x1d/0x1f
[  158.730683] Code: 84 e8 18 c9 3e fe 0f 0b e9 70 90 ff ff e8 36 e4 52 fe 89 d9 4c 89 e2 48 c7 c6 80 d6 25 84 48 c7 c7 20 c0 25 84 e8 f4 c8 3e
[  158.736854] RSP: 0018:ffff8880347e7b90 EFLAGS: 00010282
[  158.738752] RAX: 000000000000003b RBX: 00000000ffffffef RCX: 0000000000000000
[  158.741369] RDX: 0000000000000000 RSI: ffffffff8128013d RDI: ffffed10068fcf64
[  158.743418] RBP: ffff888033550170 R08: 000000000000003b R09: fffffbfff0b94b9c
[  158.745626] R10: fffffbfff0b94b9b R11: ffffffff85ca5cdf R12: ffff888032f28000
[  158.748405] R13: dffffc0000000000 R14: ffff8880335501b8 R15: 1ffff110068fcf72
[  158.750638] FS:  0000000000000000(0000) GS:ffff888036000000(0000) knlGS:0000000000000000
[  158.752944] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  158.755245] CR2: 00007fe8b45d21d0 CR3: 00000000340b4005 CR4: 0000000000360ef0
[  158.757654] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  158.760012] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  158.762758] Call Trace:
[  158.763882]  ? dev_change_net_namespace+0xbb0/0xbb0
[  158.766148]  ? devlink_nl_cmd_set_doit+0x520/0x520
[  158.768034]  ? dev_change_net_namespace+0xbb0/0xbb0
[  158.769870]  ops_exit_list.isra.0+0xa8/0x150
[  158.771544]  cleanup_net+0x446/0x8f0
[  158.772945]  ? unregister_pernet_operations+0x4a0/0x4a0
[  158.775294]  process_one_work+0xa1a/0x1740
[  158.776896]  ? pwq_dec_nr_in_flight+0x310/0x310
[  158.779143]  ? do_raw_spin_lock+0x11b/0x280
[  158.780848]  worker_thread+0x9e/0x1060
[  158.782500]  ? process_one_work+0x1740/0x1740
[  158.784454]  kthread+0x31b/0x420
[  158.786082]  ? __kthread_create_on_node+0x3f0/0x3f0
[  158.788286]  ret_from_fork+0x3a/0x50
[  158.789871] ---[ end trace defd6c657c71f936 ]---
[  158.792273] RIP: 0010:default_device_exit.cold+0x1d/0x1f
[  158.795478] Code: 84 e8 18 c9 3e fe 0f 0b e9 70 90 ff ff e8 36 e4 52 fe 89 d9 4c 89 e2 48 c7 c6 80 d6 25 84 48 c7 c7 20 c0 25 84 e8 f4 c8 3e
[  158.804854] RSP: 0018:ffff8880347e7b90 EFLAGS: 00010282
[  158.807865] RAX: 000000000000003b RBX: 00000000ffffffef RCX: 0000000000000000
[  158.811794] RDX: 0000000000000000 RSI: ffffffff8128013d RDI: ffffed10068fcf64
[  158.816652] RBP: ffff888033550170 R08: 000000000000003b R09: fffffbfff0b94b9c
[  158.820930] R10: fffffbfff0b94b9b R11: ffffffff85ca5cdf R12: ffff888032f28000
[  158.825113] R13: dffffc0000000000 R14: ffff8880335501b8 R15: 1ffff110068fcf72
[  158.829899] FS:  0000000000000000(0000) GS:ffff888036000000(0000) knlGS:0000000000000000
[  158.834923] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  158.838164] CR2: 00007fe8b45d21d0 CR3: 00000000340b4005 CR4: 0000000000360ef0
[  158.841917] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  158.845149] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400

Fix this by checking if a device with the same name exists in init_net
and fallback to original code - dev%d to allocate name - in case it does.

This was found using syzkaller.

Fixes: aca51397d014 ("netns: Fix arbitrary net_device-s corruptions on net_ns stop.")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 net/core/dev.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/core/dev.c b/net/core/dev.c
index 2a3be2b279d3..1a24ba26b098 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -9817,6 +9817,8 @@ static void __net_exit default_device_exit(struct net *net)
 
 		/* Push remaining network devices to init_net */
 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
+		if (__dev_get_by_name(&init_net, fb_name))
+			snprintf(fb_name, IFNAMSIZ, "dev%%d");
 		err = dev_change_net_namespace(dev, &init_net, fb_name);
 		if (err) {
 			pr_emerg("%s: failed to move %s to init_net: %d\n",
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH net] net: hns: fix LED configuration for marvell phy
From: Pavel Machek @ 2019-07-28 13:24 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: liuyonglong, David Miller, netdev, linux-kernel, linuxarm,
	salil.mehta, yisen.zhuang, shiju.jose
In-Reply-To: <20190725042829.GB14276@lunn.ch>

On Thu 2019-07-25 06:28:29, Andrew Lunn wrote:
> On Thu, Jul 25, 2019 at 11:00:08AM +0800, liuyonglong wrote:
> > > Revert "net: hns: fix LED configuration for marvell phy"
> > > This reverts commit f4e5f775db5a4631300dccd0de5eafb50a77c131.
> > >
> > > Andrew Lunn says this should be handled another way.
> > >
> > > Signed-off-by: David S. Miller <davem@davemloft.net>
> > 
> > 
> > Hi Andrew:
> > 
> > I see this patch have been reverted, can you tell me the better way to do this?
> > Thanks very much!
> 
> Please take a look at the work Matthias Kaehlcke is doing. It has not
> got too far yet, but when it is complete, it should define a generic
> way to configure PHY LEDs.

I don't remember PHY LED discussion from LED mailing list. Would you have a pointer?
Would it make sense to coordinate with LED subsystem?

									Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply

* Re: [PATCH] tcp: add new tcp_mtu_probe_floor sysctl
From: Eric Dumazet @ 2019-07-28 13:54 UTC (permalink / raw)
  To: Josh Hunt; +Cc: netdev, David Miller
In-Reply-To: <a9ec9cfd-c381-c02e-7d67-e24373c693d6@akamai.com>

On Sun, Jul 28, 2019 at 1:21 AM Josh Hunt <johunt@akamai.com> wrote:
>
> On 7/27/19 12:05 AM, Eric Dumazet wrote:
> > On Sat, Jul 27, 2019 at 4:23 AM Josh Hunt <johunt@akamai.com> wrote:
> >>
> >> The current implementation of TCP MTU probing can considerably
> >> underestimate the MTU on lossy connections allowing the MSS to get down to
> >> 48. We have found that in almost all of these cases on our networks these
> >> paths can handle much larger MTUs meaning the connections are being
> >> artificially limited. Even though TCP MTU probing can raise the MSS back up
> >> we have seen this not to be the case causing connections to be "stuck" with
> >> an MSS of 48 when heavy loss is present.
> >>
> >> Prior to pushing out this change we could not keep TCP MTU probing enabled
> >> b/c of the above reasons. Now with a reasonble floor set we've had it
> >> enabled for the past 6 months.
> >
> > And what reasonable value have you used ???
>
> Reasonable for some may not be reasonable for others hence the new
> sysctl :) We're currently running with a fairly high value based off of
> the v6 min MTU minus headers and options, etc. We went conservative with
> our setting initially as it seemed a reasonable first step when
> re-enabling TCP MTU probing since with no configurable floor we saw a #
> of cases where connections were using severely reduced mss b/c of loss
> and not b/c of actual path restriction. I plan to reevaluate the setting
> at some point, but since the probing method is still the same it means
> the same clients who got stuck with mss of 48 before will land at
> whatever floor we set. Looking forward we are interested in trying to
> improve TCP MTU probing so it does not penalize clients like this.
>
> A suggestion for a more reasonable floor default would be 512, which is
> the same as the min_pmtu. Given both mechanisms are trying to achieve
> the same goal it seems like they should have a similar min/floor.
>
> >
> >>
> >> The new sysctl will still default to TCP_MIN_SND_MSS (48), but gives
> >> administrators the ability to control the floor of MSS probing.
> >>
> >> Signed-off-by: Josh Hunt <johunt@akamai.com>
> >> ---
> >>   Documentation/networking/ip-sysctl.txt | 6 ++++++
> >>   include/net/netns/ipv4.h               | 1 +
> >>   net/ipv4/sysctl_net_ipv4.c             | 9 +++++++++
> >>   net/ipv4/tcp_ipv4.c                    | 1 +
> >>   net/ipv4/tcp_timer.c                   | 2 +-
> >>   5 files changed, 18 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
> >> index df33674799b5..49e95f438ed7 100644
> >> --- a/Documentation/networking/ip-sysctl.txt
> >> +++ b/Documentation/networking/ip-sysctl.txt
> >> @@ -256,6 +256,12 @@ tcp_base_mss - INTEGER
> >>          Path MTU discovery (MTU probing).  If MTU probing is enabled,
> >>          this is the initial MSS used by the connection.
> >>
> >> +tcp_mtu_probe_floor - INTEGER
> >> +       If MTU probing is enabled this caps the minimum MSS used for search_low
> >> +       for the connection.
> >> +
> >> +       Default : 48
> >> +
> >>   tcp_min_snd_mss - INTEGER
> >>          TCP SYN and SYNACK messages usually advertise an ADVMSS option,
> >>          as described in RFC 1122 and RFC 6691.
> >> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> >> index bc24a8ec1ce5..c0c0791b1912 100644
> >> --- a/include/net/netns/ipv4.h
> >> +++ b/include/net/netns/ipv4.h
> >> @@ -116,6 +116,7 @@ struct netns_ipv4 {
> >>          int sysctl_tcp_l3mdev_accept;
> >>   #endif
> >>          int sysctl_tcp_mtu_probing;
> >> +       int sysctl_tcp_mtu_probe_floor;
> >>          int sysctl_tcp_base_mss;
> >>          int sysctl_tcp_min_snd_mss;
> >>          int sysctl_tcp_probe_threshold;
> >> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> >> index 0b980e841927..59ded25acd04 100644
> >> --- a/net/ipv4/sysctl_net_ipv4.c
> >> +++ b/net/ipv4/sysctl_net_ipv4.c
> >> @@ -820,6 +820,15 @@ static struct ctl_table ipv4_net_table[] = {
> >>                  .extra2         = &tcp_min_snd_mss_max,
> >>          },
> >>          {
> >> +               .procname       = "tcp_mtu_probe_floor",
> >> +               .data           = &init_net.ipv4.sysctl_tcp_mtu_probe_floor,
> >> +               .maxlen         = sizeof(int),
> >> +               .mode           = 0644,
> >> +               .proc_handler   = proc_dointvec_minmax,
> >> +               .extra1         = &tcp_min_snd_mss_min,
> >> +               .extra2         = &tcp_min_snd_mss_max,
> >> +       },
> >> +       {
> >>                  .procname       = "tcp_probe_threshold",
> >>                  .data           = &init_net.ipv4.sysctl_tcp_probe_threshold,
> >>                  .maxlen         = sizeof(int),
> >> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> >> index d57641cb3477..e0a372676329 100644
> >> --- a/net/ipv4/tcp_ipv4.c
> >> +++ b/net/ipv4/tcp_ipv4.c
> >> @@ -2637,6 +2637,7 @@ static int __net_init tcp_sk_init(struct net *net)
> >>          net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
> >>          net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
> >>          net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
> >> +       net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
> >>
> >>          net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
> >>          net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
> >> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
> >> index c801cd37cc2a..dbd9d2d0ee63 100644
> >> --- a/net/ipv4/tcp_timer.c
> >> +++ b/net/ipv4/tcp_timer.c
> >> @@ -154,7 +154,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
> >>          } else {
> >>                  mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
> >>                  mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
> >> -               mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len);
> >> +               mss = max(mss, net->ipv4.sysctl_tcp_mtu_probe_floor);
> >>                  mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss);
> >>                  icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
> >>          }
> >
> >
> > Existing sysctl should be enough ?
>
> I don't think so. Changing tcp_min_snd_mss could impact clients that
> really want/need a small mss. When you added the new sysctl I tried to
> analyze the mss values we're seeing to understand what we could possibly
> raise it to. While not a huge amount, we see more clients than I
> expected announcing mss values in the 180-512 range. Given that I would
> not feel comfortable setting tcp_min_snd_mss to say 512 as I suggested
> above.

If these clients need mss values in 180-512 ranges, how MTU probing
would work for them,
if you set a floor to 512 ?

Are we sure the intent of tcp_base_mss was not to act as a floor ?

diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index c801cd37cc2a9c11f2dd4b9681137755e501a538..6d15895e9dcfb2eff51bbcf3608c7e68c1970a9e
100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -153,7 +153,7 @@ static void tcp_mtu_probing(struct
inet_connection_sock *icsk, struct sock *sk)
                icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
        } else {
                mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
-               mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
+               mss = max(net->ipv4.sysctl_tcp_base_mss, mss);
                mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len);
                mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss);
                icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);



>
> >
> > tcp_min_snd_mss  documentation could be slightly updated.
> >
> > And maybe its default value could be raised a bit.
> >
>
> Thanks
> Josh

^ permalink raw reply

* [PATCH net-next] rt2800usb: Add new rt2800usb device PLANEX GW-USMicroN
From: Masanari Iida @ 2019-07-28 14:07 UTC (permalink / raw)
  To: sgruszka, helmut.schaa, kvalo, davem, linux-wireless, netdev,
	linux-kernel
  Cc: Masanari Iida

This patch add a device ID for PLANEX GW-USMicroN.
Without this patch, I had to echo the device IDs in order to
recognize the device.

# lsusb |grep PLANEX
Bus 002 Device 005: ID 2019:ed14 PLANEX GW-USMicroN

Signed-off-by: Masanari Iida <standby24x7@gmail.com>
---
 drivers/net/wireless/ralink/rt2x00/rt2800usb.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800usb.c b/drivers/net/wireless/ralink/rt2x00/rt2800usb.c
index fdf0504b5f1d..0dfb55c69b73 100644
--- a/drivers/net/wireless/ralink/rt2x00/rt2800usb.c
+++ b/drivers/net/wireless/ralink/rt2x00/rt2800usb.c
@@ -1086,6 +1086,7 @@ static const struct usb_device_id rt2800usb_device_table[] = {
 	{ USB_DEVICE(0x0846, 0x9013) },
 	{ USB_DEVICE(0x0846, 0x9019) },
 	/* Planex */
+	{ USB_DEVICE(0x2019, 0xed14) },
 	{ USB_DEVICE(0x2019, 0xed19) },
 	/* Ralink */
 	{ USB_DEVICE(0x148f, 0x3573) },
-- 
2.22.0.545.g9c9b961d7eb1


^ permalink raw reply related

* Re: [PATCH] gigaset: stop maintaining seperately
From: Tilman Schmidt @ 2019-07-28 14:17 UTC (permalink / raw)
  To: Paul Bolle
  Cc: David Miller, Hansjoerg Lipp, Arnd Bergmann, Karsten Keil, netdev,
	linux-kernel
In-Reply-To: <20190726220541.28783-1-pebolle@tiscali.nl>

Thanks to you, Paul, for all your contributions, and specifically for
keeping the driver maintained for four more years after I had to abandon
it for the same reason.

I had a lot of fun working on that driver and I learned a lot in the
course. Now it's time to move on without regrets.

All the best,
Tilman

Am 27.07.2019 um 00:05 schrieb Paul Bolle:
> The Dutch consumer grade ISDN network will be shut down on September 1,
> 2019. This means I'll be converted to some sort of VOIP shortly. At that
> point it would be unwise to try to maintain the gigaset driver, even for
> odd fixes as I do. So I'll stop maintaining it as a seperate driver and
> bump support to CAPI in staging. De facto this means the driver will be
> unmaintained, since no-one seems to be working on CAPI.
> 
> I've lighty tested the hardware specific modules of this driver (bas-gigaset,
> ser-gigaset, and usb-gigaset) for v5.3-rc1. The basic functionality appears to
> be working. It's unclear whether anyone still cares. I'm aware of only one
> person sort of using the driver a few years ago.
> 
> Thanks to Karsten Keil for the ISDN subsystems gigaset was using (I4L and
> CAPI). And many thanks to Hansjoerg Lipp and Tilman Schmidt for writing and
> upstreaming this driver.
> 
> Signed-off-by: Paul Bolle <pebolle@tiscali.nl>
> ---
>  MAINTAINERS | 7 -------
>  1 file changed, 7 deletions(-)
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 783569e3c4b4..e99afbd13355 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -6822,13 +6822,6 @@ F:	Documentation/filesystems/gfs2*.txt
>  F:	fs/gfs2/
>  F:	include/uapi/linux/gfs2_ondisk.h
>  
> -GIGASET ISDN DRIVERS
> -M:	Paul Bolle <pebolle@tiscali.nl>
> -L:	gigaset307x-common@lists.sourceforge.net
> -W:	http://gigaset307x.sourceforge.net/
> -S:	Odd Fixes
> -F:	drivers/staging/isdn/gigaset/
> -
>  GNSS SUBSYSTEM
>  M:	Johan Hovold <johan@kernel.org>
>  T:	git git://git.kernel.org/pub/scm/linux/kernel/git/johan/gnss.git
> 

^ permalink raw reply

* Re: memory leak in fdb_create
From: syzbot @ 2019-07-28 14:20 UTC (permalink / raw)
  To: bridge, bsingharora, coreteam, davem, duwe, kaber, kadlec,
	linux-kernel, mingo, mpe, netdev, netfilter-devel, nikolay, pablo,
	roopa, rostedt, syzkaller-bugs
In-Reply-To: <0000000000005e6124058c0cbdbe@google.com>

syzbot has bisected this bug to:

commit 04cf31a759ef575f750a63777cee95500e410994
Author: Michael Ellerman <mpe@ellerman.id.au>
Date:   Thu Mar 24 11:04:01 2016 +0000

     ftrace: Make ftrace_location_range() global

bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=1538c778600000
start commit:   abf02e29 Merge tag 'pm-5.2-rc6' of git://git.kernel.org/pu..
git tree:       upstream
final crash:    https://syzkaller.appspot.com/x/report.txt?x=1738c778600000
console output: https://syzkaller.appspot.com/x/log.txt?x=1338c778600000
kernel config:  https://syzkaller.appspot.com/x/.config?x=56f1da14935c3cce
dashboard link: https://syzkaller.appspot.com/bug?extid=88533dc8b582309bf3ee
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=16de5c06a00000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=10546026a00000

Reported-by: syzbot+88533dc8b582309bf3ee@syzkaller.appspotmail.com
Fixes: 04cf31a759ef ("ftrace: Make ftrace_location_range() global")

For information about bisection process see: https://goo.gl/tpsmEJ#bisection

^ permalink raw reply

* Re: [PATCH net-next] mvpp2: document HW checksum behaviour
From: Matteo Croce @ 2019-07-28 14:30 UTC (permalink / raw)
  To: Antoine Tenart, Marcin Wojtas, Stefan Chulski, Maxime Chevallier
  Cc: netdev, LKML, David S . Miller
In-Reply-To: <CAGnkfhycOc8mvqeQDBcnXueUjrFQMC7hdfAOkxr5k0+xc_tnDw@mail.gmail.com>

On Sun, Jul 28, 2019 at 3:36 AM Matteo Croce <mcroce@redhat.com> wrote:
>
> On Fri, Jul 26, 2019 at 2:57 PM Antoine Tenart
> <antoine.tenart@bootlin.com> wrote:
> >
> > Hi Matteo,
> >
> > On Fri, Jul 26, 2019 at 01:15:46AM +0200, Matteo Croce wrote:
> > > The hardware can only offload checksum calculation on first port
> > > due to the Tx FIFO size limitation. Document this in a comment.
> > >
> > > Fixes: 576193f2d579 ("net: mvpp2: jumbo frames support")
> > > Signed-off-by: Matteo Croce <mcroce@redhat.com>
> >
> > Looks good. Please note there's a similar code path in the probe.
> > You could also add a comment there (or move this check/comment in a
> > common place).
> >
> > Thanks!
> > Antoine
> >
>
> Hi Antoine,
>
> I was making a v2, when I looked at the mvpp2_port_probe() which does:
>
> --------------------------------%<------------------------------
> features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
> NETIF_F_TSO;
>
> if (port->pool_long->id == MVPP2_BM_JUMBO && port->id != 0) {
>     dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
>     dev->hw_features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
> }
>
> dev->vlan_features |= features;
> -------------------------------->%------------------------------
>
> Is it ok to remove NETIF_F_IP*_CSUM from dev->features and
> dev->hw_features but keep it in dev->vlan_features?
>
> Regards,
> --
> Matteo Croce
> per aspera ad upstream

Hi all,

probably dev->vlan_features is safe to keep the CSUM features to avoid
unnecessary calculation in some cases, but I have another question.
Does the PP2 hardware support checksumming within any offset? I
replaced 'NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM' with NETIF_F_HW_CSUM and
then stacked 5 VxLANS on top of a mvpp2 device, to have the last IP
header at offset 264:

ip link set $dev up
ip addr add 192.168.0.$last/24 dev $dev

for i in {1..5}; do
	ip link add vx$i type vxlan id $i dstport 4789 remote 192.168.$((i-1)).$other
	ip link set vx$i up
	ip addr add 192.168.$i.$last/24 dev vx$i
done

00:51:82:11:22:00 > 3c:fd:fe:9c:60:6c, ethertype IPv4 (0x0800), length 348: 192.168.0.1.33625 > 192.168.0.2.4789: VXLAN, flags [I] (0x08), vni 1
02:25:60:da:87:03 > 92:20:05:45:3d:d3, ethertype IPv4 (0x0800), length 298: 192.168.1.1.33625 > 192.168.1.2.4789: VXLAN, flags [I] (0x08), vni 2
12:20:97:15:8f:aa > 66:08:23:c7:72:ea, ethertype IPv4 (0x0800), length 248: 192.168.2.1.33625 > 192.168.2.2.4789: VXLAN, flags [I] (0x08), vni 3
c6:1c:b9:fd:9d:28 > 22:ca:cb:6a:ea:68, ethertype IPv4 (0x0800), length 198: 192.168.3.1.33625 > 192.168.3.2.4789: VXLAN, flags [I] (0x08), vni 4
02:34:5f:45:a5:9d > d2:4e:d4:d7:42:31, ethertype IPv4 (0x0800), length 148: 192.168.4.1.34504 > 192.168.4.2.4789: VXLAN, flags [I] (0x08), vni 5
a2:99:fd:9c:1b:05 > 5a:81:3b:fc:6a:07, ethertype IPv4 (0x0800), length 98: 192.168.5.1 > 192.168.5.2: ICMP echo request, id 1654, seq 156, length 64

It seems that the HW is capable of doing it, can someone with a
datasheet confirm this?

Regards,
-- 
Matteo Croce
per aspera ad upstream

^ permalink raw reply

* RE: [EXT] Re: [PATCH net-next] mvpp2: document HW checksum behaviour
From: Stefan Chulski @ 2019-07-28 15:22 UTC (permalink / raw)
  To: Matteo Croce, Antoine Tenart, Marcin Wojtas, Maxime Chevallier
  Cc: netdev, LKML, David S . Miller
In-Reply-To: <CAGnkfhz+PezeLT+gyXdsnyJz2dnKpYkcb2HbqvXJoLdzNxuC6g@mail.gmail.com>

> Hi all,
> 
> probably dev->vlan_features is safe to keep the CSUM features to avoid
> unnecessary calculation in some cases, but I have another question.
> Does the PP2 hardware support checksumming within any offset? I replaced
> 'NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM' with NETIF_F_HW_CSUM and
> then stacked 5 VxLANS on top of a mvpp2 device, to have the last IP header
> at offset 264:
> 
> ip link set $dev up
> ip addr add 192.168.0.$last/24 dev $dev
> 
> for i in {1..5}; do
> 	ip link add vx$i type vxlan id $i dstport 4789 remote 192.168.$((i-
> 1)).$other
> 	ip link set vx$i up
> 	ip addr add 192.168.$i.$last/24 dev vx$i done
> 
> 00:51:82:11:22:00 > 3c:fd:fe:9c:60:6c, ethertype IPv4 (0x0800), length 348:
> 192.168.0.1.33625 > 192.168.0.2.4789: VXLAN, flags [I] (0x08), vni 1
> 02:25:60:da:87:03 > 92:20:05:45:3d:d3, ethertype IPv4 (0x0800), length 298:
> 192.168.1.1.33625 > 192.168.1.2.4789: VXLAN, flags [I] (0x08), vni 2
> 12:20:97:15:8f:aa > 66:08:23:c7:72:ea, ethertype IPv4 (0x0800), length 248:
> 192.168.2.1.33625 > 192.168.2.2.4789: VXLAN, flags [I] (0x08), vni 3
> c6:1c:b9:fd:9d:28 > 22:ca:cb:6a:ea:68, ethertype IPv4 (0x0800), length 198:
> 192.168.3.1.33625 > 192.168.3.2.4789: VXLAN, flags [I] (0x08), vni 4
> 02:34:5f:45:a5:9d > d2:4e:d4:d7:42:31, ethertype IPv4 (0x0800), length 148:
> 192.168.4.1.34504 > 192.168.4.2.4789: VXLAN, flags [I] (0x08), vni 5
> a2:99:fd:9c:1b:05 > 5a:81:3b:fc:6a:07, ethertype IPv4 (0x0800), length 98:
> 192.168.5.1 > 192.168.5.2: ICMP echo request, id 1654, seq 156, length 64
> 
> It seems that the HW is capable of doing it, can someone with a datasheet
> confirm this?

L3_offset in TX descriptor has 7 bits, so beginning of Layer3 should be less than 128 Bytes.

Stefan,
Regards.

^ permalink raw reply

* Re: ip route JSON format is unparseable for "unreachable" routes
From: Stephen Hemminger @ 2019-07-28 16:15 UTC (permalink / raw)
  To: Michael Ziegler; +Cc: netdev
In-Reply-To: <6e88311b-5edc-4c62-1581-0f5b160a5f4e@michaelziegler.name>

On Sun, 28 Jul 2019 13:09:55 +0200
Michael Ziegler <ich@michaelziegler.name> wrote:

> Hi,
> 
> I created a couple "unreachable" routes on one of my systems, like such:
> 
> > ip route add unreachable 10.0.0.0/8     metric 255
> > ip route add unreachable 192.168.0.0/16 metric 255  
> 
> Unfortunately this results in unparseable JSON output from "ip":
> 
> > # ip -j route show  | jq .
> > parse error: Objects must consist of key:value pairs at line 1, column 84  
> 
> The offending JSON objects are these:
> 
> > {"unreachable","dst":"10.0.0.0/8","metric":255,"flags":[]}
> > {"unreachable","dst":"192.168.0.0/16","metric":255,"flags":[]}  
> "unreachable" cannot appear on its own here, it needs to be some kind of
> field.
> 
> The manpage says to report here, thus I do :) I've searched the
> archives, but I wasn't able to find any existing bug reports about this.
> I'm running version
> 
> > ip utility, iproute2-ss190107  
> 
> on Debian Buster.
> 
> Regards,
> Michael.

Already fixed upstream by:

commit 073661773872709518d35d4d093f3a715281f21d
Author: Matteo Croce <mcroce@redhat.com>
Date:   Mon Mar 18 18:19:29 2019 +0100

    ip route: print route type in JSON output
    
    ip route generates an invalid JSON if the route type has to be printed,
    eg. when detailed mode is active, or the type is different that unicast:
    
        $ ip -d -j -p route show
        [ {"unicast",
                "dst": "192.168.122.0/24",
                "dev": "virbr0",
                "protocol": "kernel",
                "scope": "link",
                "prefsrc": "192.168.122.1",
                "flags": [ "linkdown" ]
            } ]
    
        $ ip -j -p route show
        [ {"unreachable",
                "dst": "192.168.23.0/24",
                "flags": [ ]
            },{"prohibit",
                "dst": "192.168.24.0/24",
                "flags": [ ]
            },{"blackhole",
                "dst": "192.168.25.0/24",
                "flags": [ ]
            } ]
    
    Fix it by printing the route type as the "type" attribute:
    
        $ ip -d -j -p route show
        [ {
                "type": "unicast",
                "dst": "default",
                "gateway": "192.168.85.1",
                "dev": "wlp3s0",
                "protocol": "dhcp",
                "scope": "global",
                "metric": 600,
                "flags": [ ]
            },{
                "type": "unreachable",
                "dst": "192.168.23.0/24",
                "protocol": "boot",
                "scope": "global",
                "flags": [ ]
            },{
                "type": "prohibit",
                "dst": "192.168.24.0/24",
                "protocol": "boot",
                "scope": "global",
                "flags": [ ]
            },{
                "type": "blackhole",
                "dst": "192.168.25.0/24",
                "protocol": "boot",
                "scope": "global",
                "flags": [ ]
            } ]
    
    Fixes: 663c3cb23103 ("iproute: implement JSON and color output")
    Acked-by: Phil Sutter <phil@nwl.cc>
    Reviewed-and-tested-by: Andrea Claudi <aclaudi@redhat.com>
    Signed-off-by: Matteo Croce <mcroce@redhat.com>
    Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>

^ permalink raw reply

* Re: memory leak in fdb_create
From: Nikolay Aleksandrov @ 2019-07-28 16:51 UTC (permalink / raw)
  To: syzbot, bridge, bsingharora, coreteam, davem, duwe, kaber, kadlec,
	linux-kernel, mingo, mpe, netdev, netfilter-devel, pablo, roopa,
	rostedt, syzkaller-bugs
In-Reply-To: <0000000000008be1b2058ebe7805@google.com>

On 28/07/2019 17:20, syzbot wrote:
> syzbot has bisected this bug to:
> 
> commit 04cf31a759ef575f750a63777cee95500e410994
> Author: Michael Ellerman <mpe@ellerman.id.au>
> Date:   Thu Mar 24 11:04:01 2016 +0000
> 
>     ftrace: Make ftrace_location_range() global
> 
> bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=1538c778600000
> start commit:   abf02e29 Merge tag 'pm-5.2-rc6' of git://git.kernel.org/pu..
> git tree:       upstream
> final crash:    https://syzkaller.appspot.com/x/report.txt?x=1738c778600000
> console output: https://syzkaller.appspot.com/x/log.txt?x=1338c778600000
> kernel config:  https://syzkaller.appspot.com/x/.config?x=56f1da14935c3cce
> dashboard link: https://syzkaller.appspot.com/bug?extid=88533dc8b582309bf3ee
> syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=16de5c06a00000
> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=10546026a00000
> 
> Reported-by: syzbot+88533dc8b582309bf3ee@syzkaller.appspotmail.com
> Fixes: 04cf31a759ef ("ftrace: Make ftrace_location_range() global")
> 
> For information about bisection process see: https://goo.gl/tpsmEJ#bisection

I see the problem, it'd happen if the multicast stats memory allocation fails on bridge
init then the fdb added due to the default vlan would remain and the bridge kmem cache
would be destroyed while not empty (you can even trigger a BUG because of that).
I'll post a patch shortly after running a few tests.

Thanks,
 Nik


^ permalink raw reply

* [PATCH net v2] mvpp2: refactor the HW checksum setup
From: Matteo Croce @ 2019-07-28 17:35 UTC (permalink / raw)
  To: netdev
  Cc: Antoine Tenart, Maxime Chevallier, Marcin Wojtas, Stefan Chulski,
	LKML, David Miller

The hardware can only offload checksum calculation on first port due to
the Tx FIFO size limitation, and has a maximum L3 offset of 128 bytes.
Document this in a comment and move duplicated code in a function.

Fixes: 576193f2d579 ("net: mvpp2: jumbo frames support")
Signed-off-by: Matteo Croce <mcroce@redhat.com>
---
 .../net/ethernet/marvell/mvpp2/mvpp2_main.c   | 35 ++++++++++++-------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index 937e4b928b94..a99405135046 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -811,6 +811,26 @@ static int mvpp2_swf_bm_pool_init(struct mvpp2_port *port)
 	return 0;
 }
 
+static void mvpp2_set_hw_csum(struct mvpp2_port *port,
+			      enum mvpp2_bm_pool_log_num new_long_pool)
+{
+	const netdev_features_t csums = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+
+	/* Update L4 checksum when jumbo enable/disable on port.
+	 * Only port 0 supports hardware checksum offload due to
+	 * the Tx FIFO size limitation.
+	 * Also, don't set NETIF_F_HW_CSUM because L3_offset in TX descriptor
+	 * has 7 bits, so the maximum L3 offset is 128.
+	 */
+	if (new_long_pool == MVPP2_BM_JUMBO && port->id != 0) {
+		port->dev->features &= ~csums;
+		port->dev->hw_features &= ~csums;
+	} else {
+		port->dev->features |= csums;
+		port->dev->hw_features |= csums;
+	}
+}
+
 static int mvpp2_bm_update_mtu(struct net_device *dev, int mtu)
 {
 	struct mvpp2_port *port = netdev_priv(dev);
@@ -843,15 +863,7 @@ static int mvpp2_bm_update_mtu(struct net_device *dev, int mtu)
 		/* Add port to new short & long pool */
 		mvpp2_swf_bm_pool_init(port);
 
-		/* Update L4 checksum when jumbo enable/disable on port */
-		if (new_long_pool == MVPP2_BM_JUMBO && port->id != 0) {
-			dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
-			dev->hw_features &= ~(NETIF_F_IP_CSUM |
-					      NETIF_F_IPV6_CSUM);
-		} else {
-			dev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
-			dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
-		}
+		mvpp2_set_hw_csum(port, new_long_pool);
 	}
 
 	dev->mtu = mtu;
@@ -5209,10 +5221,7 @@ static int mvpp2_port_probe(struct platform_device *pdev,
 		dev->features |= NETIF_F_NTUPLE;
 	}
 
-	if (port->pool_long->id == MVPP2_BM_JUMBO && port->id != 0) {
-		dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
-		dev->hw_features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
-	}
+	mvpp2_set_hw_csum(port, port->pool_long->id);
 
 	dev->vlan_features |= features;
 	dev->gso_max_segs = MVPP2_MAX_TSO_SEGS;
-- 
2.21.0


^ permalink raw reply related

* [PATCH net] net: bridge: delete local fdbs on device init failure
From: Nikolay Aleksandrov @ 2019-07-28 18:22 UTC (permalink / raw)
  To: netdev
  Cc: davem, roopa, bridge, Nikolay Aleksandrov,
	syzbot+88533dc8b582309bf3ee

On initialization failure we have to delete all local fdbs which were
inserted due to the default pvid. This problem has been present since the
inception of default_pvid. Note that currently there are 2 cases:
1) in br_dev_init() when br_multicast_init() fails
2) if register_netdevice() fails after calling ndo_init()

This patch takes care of both since br_vlan_flush() is called on both
occasions. Also the new fdb delete would be a no-op on normal bridge device
destruction since the local fdbs would've been already flushed by
br_dev_delete(). This is not an issue for ports since nbp_vlan_init() is
called last when adding a port thus nothing can fail after it.

Reported-by: syzbot+88533dc8b582309bf3ee@syzkaller.appspotmail.com
Fixes: 5be5a2df40f0 ("bridge: Add filtering support for default_pvid")
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
---
Tested with the provided reproducer and can no longer trigger the leak.
Also tested the br_multicast_init() failure manually by making it always
return an error.

 net/bridge/br_vlan.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 021cc9f66804..3e6a702e4c21 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -715,6 +715,11 @@ void br_vlan_flush(struct net_bridge *br)
 
 	ASSERT_RTNL();
 
+	/* delete auto-added default pvid local fdbs before flushing vlans
+	 * otherwise these will be leaked on bridge device init failure
+	 */
+	br_fdb_delete_by_port(br, NULL, 0, 1);
+
 	vg = br_vlan_group(br);
 	__vlan_flush(vg);
 	RCU_INIT_POINTER(br->vlgrp, NULL);
-- 
2.21.0


^ permalink raw reply related

* [PATCH net] hv_sock: Fix hang when a connection is closed
From: Dexuan Cui @ 2019-07-28 18:32 UTC (permalink / raw)
  To: Sunil Muthuswamy, David Miller, netdev@vger.kernel.org
  Cc: KY Srinivasan, Haiyang Zhang, Stephen Hemminger,
	sashal@kernel.org, Michael Kelley, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org, olaf@aepfle.de, apw@canonical.com,
	jasowang@redhat.com, vkuznets, marcelo.cerri@canonical.com


hvs_do_close_lock_held() may decrease the reference count to 0 and free the
sk struct completely, and then the following release_sock(sk) may hang.

Fixes: a9eeb998c28d ("hv_sock: Add support for delayed close")
Signed-off-by: Dexuan Cui <decui@microsoft.com>
Cc: stable@vger.kernel.org

---
With the proper kernel debugging options enabled, first a warning can
appear:

kworker/1:0/4467 is freeing memory ..., with a lock still held there!
stack backtrace:
Workqueue: events vmbus_onmessage_work [hv_vmbus]
Call Trace:
 dump_stack+0x67/0x90
 debug_check_no_locks_freed.cold.52+0x78/0x7d
 slab_free_freelist_hook+0x85/0x140
 kmem_cache_free+0xa5/0x380
 __sk_destruct+0x150/0x260
 hvs_close_connection+0x24/0x30 [hv_sock]
 vmbus_onmessage_work+0x1d/0x30 [hv_vmbus]
 process_one_work+0x241/0x600
 worker_thread+0x3c/0x390
 kthread+0x11b/0x140
 ret_from_fork+0x24/0x30

and then the following release_sock(sk) can hang:

watchdog: BUG: soft lockup - CPU#1 stuck for 22s! [kworker/1:0:4467]
...
irq event stamp: 62890
CPU: 1 PID: 4467 Comm: kworker/1:0 Tainted: G        W         5.2.0+ #39
Workqueue: events vmbus_onmessage_work [hv_vmbus]
RIP: 0010:queued_spin_lock_slowpath+0x2b/0x1e0
...
Call Trace:
 do_raw_spin_lock+0xab/0xb0
 release_sock+0x19/0xb0
 vmbus_onmessage_work+0x1d/0x30 [hv_vmbus]
 process_one_work+0x241/0x600
 worker_thread+0x3c/0x390
 kthread+0x11b/0x140
 ret_from_fork+0x24/0x30

 net/vmw_vsock/hyperv_transport.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index f2084e3f7aa4..efbda8ef1eff 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -309,9 +309,16 @@ static void hvs_close_connection(struct vmbus_channel *chan)
 {
 	struct sock *sk = get_per_channel_state(chan);
 
+	/* Grab an extra reference since hvs_do_close_lock_held() may decrease
+	 * the reference count to 0 by calling sock_put(sk).
+	 */
+	sock_hold(sk);
+
 	lock_sock(sk);
 	hvs_do_close_lock_held(vsock_sk(sk), true);
 	release_sock(sk);
+
+	sock_put(sk);
 }
 
 static void hvs_open_connection(struct vmbus_channel *chan)
-- 
2.19.1


^ permalink raw reply related

* Re: [PATCH] net: bridge: Allow bridge to joing multicast groups
From: Allan W. Nielsen @ 2019-07-28 19:15 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Horatiu Vultur, Nikolay Aleksandrov, roopa, davem, bridge, netdev,
	linux-kernel
In-Reply-To: <20190727030223.GA29731@lunn.ch>

The 07/27/2019 05:02, Andrew Lunn wrote:
> > As you properly guessed, this model is quite different from what we are used to.
> 
> Yes, it takes a while to get the idea that the hardware is just an
> accelerator for what the Linux stack can already do. And if the switch
> cannot do some feature, pass the frame to Linux so it can handle it.
This is understood, and not that different from what we are used to.

The surprise was to make all multicast traffic to go to the CPU.

> You need to keep in mind that there could be other ports in the bridge
> than switch ports, and those ports might be interested in the
> multicast traffic. Hence the CPU needs to see the traffic.
This is a good argument, but I was under the impression that not all HW/drivers
supports foreign interfaces (see ocelot_netdevice_dev_check and
mlxsw_sp_port_dev_check).

> But IGMP snooping can be used to optimise this.
Yes, IGMP snooping can limit the multicast storm of multicast IP traffic, but
not for L2 non-IP multicast traffic.

We could really use something similar for non-IP multicast MAC addresses.

Trying to get back to the original problem:

We have a network which implements the ODVA/DLR ring protocol. This protocol
sends out a beacon frame as often as every 3 us (as far as I recall, default I
believe is 400 us) to this MAC address: 01:21:6C:00:00:01.

Try take a quick look at slide 10 in [1].

If we assume that the SwitchDev driver implemented such that all multicast
traffic goes to the CPU, then we should really have a way to install a HW
offload path in the silicon, such that these packets does not go to the CPU (as
they are known not to be use full, and a frame every 3 us is a significant load
on small DMA connections and CPU resources).

If we assume that the SwitchDev driver implemented such that only "needed"
multicast packets goes to the CPU, then we need a way to get these packets in
case we want to implement the DLR protocol.

I'm sure that both models can work, and I do not think that this is the main
issue here.

Our initial attempt was to allow install static L2-MAC entries and append
multiple ports to such an entry in the MAC table. This was rejected, for several
good reasons it seems. But I'm not sure it was clear what we wanted to achieve,
and why we find it to be important. Hopefully this is clear with a real world
use-case.

Any hints or ideas on what would be a better way to solve this problems will be
much appreciated.

/Allan

[1] https://www.odva.org/Portals/0/Library/Conference/2017-ODVA-Conference_Woods_High%20Availability_Guidelines%20for%20Use%20of%20DLR%20in%20EtherNetIP%20Networks_FINAL%20PPT.pdf

^ permalink raw reply

* Linux Plumbers BPF micro-conference CFP (reminder)
From: Alexei Starovoitov @ 2019-07-28 19:24 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: bpf, Network Development
In-Reply-To: <CAADnVQJ0ATngyqo8xjXdDsyFuuov3KRtbHMR1LcV8VnEDUK8Fg@mail.gmail.com>

Hey Folks,

August 2nd deadline to submit a proposal for BPF uconf
is quickly approaching.
If you're attending LPC in Lisbon and interested
in awesome BPF uconf you need to submit a proposal.

Some of you already submitted them to lpc-bpf@vger
per instructions that were sent back on July 12.
Some proposals were sent via website.
We'd like all proposals to be seen in the website.
Could you please re-enter your proposal there?
Please go to:
https://www.linuxplumbersconf.org/event/4/abstracts/
click on 'submit new proposal'
and copy-paste what you've already sent to lpc-bpf@vger.
Much appreciate it and sorry for confusion.

There is still room for few new proposals,
but space is getting very limited.
Please don't delay.

Thanks!

> ---------- Forwarded message ---------
> From: Daniel Borkmann <daniel@iogearbox.net>
> Date: Fri, Jul 12, 2019 at 7:26 AM
> Subject: Linux Plumbers BPF micro-conference CFP (reminder)
> To: <bpf@vger.kernel.org>
> Cc: <netdev@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
> <xdp-newbies@vger.kernel.org>, <iovisor-dev@lists.iovisor.org>,
> <lpc-bpf@vger.kernel.org>, <alexei.starovoitov@gmail.com>
>
>
> This is a call for proposals for the BPF micro-conference at this
> years' Linux Plumbers Conference (LPC) 2019 which will be held in
> Lisbon, Portugal for September 9-11.
>
> The goal of the BPF micro-conference is to bring BPF developers
> together to discuss topics around Linux kernel work related to
> the BPF core infrastructure as well as its many subsystems under
> tracing, networking, security, and BPF user space tooling (LLVM,
> libbpf, bpftool and many others).
>
> The format of the micro-conference has a main focus on discussion,
> therefore each accepted topic will provide a short 1-2 slide
> introduction with subsequent discussion for the rest of the given
> time slot.
>
> The BPF micro-conference is a community-driven event and open to
> all LPC attendees, there is no additional registration required.
>
> Please submit your discussion proposals to the LPC BPF micro-conference
> organizers at:
>
>         lpc-bpf@vger.kernel.org
>
> Proposals must be submitted until August 2nd, and submitters will
> be notified of acceptance at latest by August 9. (Please note that
> proposals must not be sent as html mail as they are otherwise dropped
> by vger.)
>
> The format of the submission and many other details can be found at:
>
>         http://vger.kernel.org/lpc-bpf.html
>
> Looking forward to seeing you all in Lisbon in September!

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox