Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 1/6] cls_api: modify the tc_indr_block_ing_cmd parameters.
From: wenxu @ 2019-07-31  8:12 UTC (permalink / raw)
  To: pablo, fw, jakub.kicinski, jiri; +Cc: netfilter-devel, netdev
In-Reply-To: <1564560753-32603-1-git-send-email-wenxu@ucloud.cn>

From: wenxu <wenxu@ucloud.cn>

This patch make tc_indr_block_ing_cmd can't access struct
tc_indr_block_dev and tc_indr_block_cb.

Signed-off-by: wenxu <wenxu@ucloud.cn>
---
 net/sched/cls_api.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 3565d9a..2e3b58d 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -677,26 +677,28 @@ static void tc_indr_block_cb_del(struct tc_indr_block_cb *indr_block_cb)
 static int tcf_block_setup(struct tcf_block *block,
 			   struct flow_block_offload *bo);
 
-static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev,
-				  struct tc_indr_block_cb *indr_block_cb,
+static void tc_indr_block_ing_cmd(struct net_device *dev,
+				  struct tcf_block *block,
+				  tc_indr_block_bind_cb_t *cb,
+				  void *cb_priv,
 				  enum flow_block_command command)
 {
 	struct flow_block_offload bo = {
 		.command	= command,
 		.binder_type	= FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
-		.net		= dev_net(indr_dev->dev),
-		.block_shared	= tcf_block_non_null_shared(indr_dev->block),
+		.net		= dev_net(dev),
+		.block_shared	= tcf_block_non_null_shared(block),
 	};
 	INIT_LIST_HEAD(&bo.cb_list);
 
-	if (!indr_dev->block)
+	if (!block)
 		return;
 
-	bo.block = &indr_dev->block->flow_block;
+	bo.block = &block->flow_block;
 
-	indr_block_cb->cb(indr_dev->dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
-			  &bo);
-	tcf_block_setup(indr_dev->block, &bo);
+	cb(dev, cb_priv, TC_SETUP_BLOCK, &bo);
+
+	tcf_block_setup(block, &bo);
 }
 
 int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
@@ -715,7 +717,8 @@ int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
 	if (err)
 		goto err_dev_put;
 
-	tc_indr_block_ing_cmd(indr_dev, indr_block_cb, FLOW_BLOCK_BIND);
+	tc_indr_block_ing_cmd(dev, indr_dev->block, cb, cb_priv,
+			      FLOW_BLOCK_BIND);
 	return 0;
 
 err_dev_put:
@@ -752,7 +755,8 @@ void __tc_indr_block_cb_unregister(struct net_device *dev,
 		return;
 
 	/* Send unbind message if required to free any block cbs. */
-	tc_indr_block_ing_cmd(indr_dev, indr_block_cb, FLOW_BLOCK_UNBIND);
+	tc_indr_block_ing_cmd(dev, indr_dev->block, cb, indr_block_cb->cb_priv,
+			      FLOW_BLOCK_UNBIND);
 	tc_indr_block_cb_del(indr_block_cb);
 	tc_indr_block_dev_put(indr_dev);
 }
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH net-next 2/6] cls_api: replace block with flow_block in tc_indr_block_dev
From: wenxu @ 2019-07-31  8:12 UTC (permalink / raw)
  To: pablo, fw, jakub.kicinski, jiri; +Cc: netfilter-devel, netdev
In-Reply-To: <1564560753-32603-1-git-send-email-wenxu@ucloud.cn>

From: wenxu <wenxu@ucloud.cn>

This patch make tc_indr_block_dev can separate from tc subsystem

Signed-off-by: wenxu <wenxu@ucloud.cn>
---
 net/sched/cls_api.c | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 2e3b58d..f9643fa 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -574,7 +574,7 @@ struct tc_indr_block_dev {
 	struct net_device *dev;
 	unsigned int refcnt;
 	struct list_head cb_list;
-	struct tcf_block *block;
+	struct flow_block *flow_block;
 };
 
 struct tc_indr_block_cb {
@@ -597,6 +597,14 @@ struct tc_indr_block_cb {
 				      tc_indr_setup_block_ht_params);
 }
 
+static void tc_indr_get_default_block(struct tc_indr_block_dev *indr_dev)
+{
+	struct tcf_block *block = tc_dev_ingress_block(indr_dev->dev);
+
+	if (block)
+		indr_dev->flow_block = &block->flow_block;
+}
+
 static struct tc_indr_block_dev *tc_indr_block_dev_get(struct net_device *dev)
 {
 	struct tc_indr_block_dev *indr_dev;
@@ -611,7 +619,7 @@ static struct tc_indr_block_dev *tc_indr_block_dev_get(struct net_device *dev)
 
 	INIT_LIST_HEAD(&indr_dev->cb_list);
 	indr_dev->dev = dev;
-	indr_dev->block = tc_dev_ingress_block(dev);
+	tc_indr_get_default_block(indr_dev);
 	if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node,
 				   tc_indr_setup_block_ht_params)) {
 		kfree(indr_dev);
@@ -678,11 +686,14 @@ static int tcf_block_setup(struct tcf_block *block,
 			   struct flow_block_offload *bo);
 
 static void tc_indr_block_ing_cmd(struct net_device *dev,
-				  struct tcf_block *block,
+				  struct flow_block *flow_block,
 				  tc_indr_block_bind_cb_t *cb,
 				  void *cb_priv,
 				  enum flow_block_command command)
 {
+	struct tcf_block *block = flow_block ? container_of(flow_block,
+							    struct tcf_block,
+							    flow_block) : NULL;
 	struct flow_block_offload bo = {
 		.command	= command,
 		.binder_type	= FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
@@ -694,7 +705,7 @@ static void tc_indr_block_ing_cmd(struct net_device *dev,
 	if (!block)
 		return;
 
-	bo.block = &block->flow_block;
+	bo.block = flow_block;
 
 	cb(dev, cb_priv, TC_SETUP_BLOCK, &bo);
 
@@ -717,7 +728,7 @@ int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
 	if (err)
 		goto err_dev_put;
 
-	tc_indr_block_ing_cmd(dev, indr_dev->block, cb, cb_priv,
+	tc_indr_block_ing_cmd(dev, indr_dev->flow_block, cb, cb_priv,
 			      FLOW_BLOCK_BIND);
 	return 0;
 
@@ -750,13 +761,14 @@ void __tc_indr_block_cb_unregister(struct net_device *dev,
 	if (!indr_dev)
 		return;
 
-	indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident);
+	indr_block_cb = tc_indr_block_cb_lookup(indr_dev, indr_block_cb->cb,
+						indr_block_cb->cb_ident);
 	if (!indr_block_cb)
 		return;
 
 	/* Send unbind message if required to free any block cbs. */
-	tc_indr_block_ing_cmd(dev, indr_dev->block, cb, indr_block_cb->cb_priv,
-			      FLOW_BLOCK_UNBIND);
+	tc_indr_block_ing_cmd(dev, indr_dev->flow_block, indr_block_cb->cb,
+			      indr_block_cb->cb_priv, FLOW_BLOCK_UNBIND);
 	tc_indr_block_cb_del(indr_block_cb);
 	tc_indr_block_dev_put(indr_dev);
 }
@@ -792,7 +804,8 @@ static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev,
 	if (!indr_dev)
 		return;
 
-	indr_dev->block = command == FLOW_BLOCK_BIND ? block : NULL;
+	indr_dev->flow_block = command == FLOW_BLOCK_BIND ?
+					  &block->flow_block : NULL;
 
 	list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
 		indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH net-next 6/6] netfilter: nf_tables_offload: support indr block call
From: wenxu @ 2019-07-31  8:12 UTC (permalink / raw)
  To: pablo, fw, jakub.kicinski, jiri; +Cc: netfilter-devel, netdev
In-Reply-To: <1564560753-32603-1-git-send-email-wenxu@ucloud.cn>

From: wenxu <wenxu@ucloud.cn>

nftable support indr-block call. It makes nftable an offload vlan
and tunnel device.

nft add table netdev firewall
nft add chain netdev firewall aclout { type filter hook ingress offload device mlx_pf0vf0 priority - 300 \; }
nft add rule netdev firewall aclout ip daddr 10.0.0.1 fwd to vlan0
nft add chain netdev firewall aclin { type filter hook ingress device vlan0 priority - 300 \; }
nft add rule netdev firewall aclin ip daddr 10.0.0.7 fwd to mlx_pf0vf0

Signed-off-by: wenxu <wenxu@ucloud.cn>
---
 include/net/netfilter/nf_tables_offload.h |   2 +
 net/netfilter/nf_tables_api.c             |   7 ++
 net/netfilter/nf_tables_offload.c         | 156 +++++++++++++++++++++++++-----
 3 files changed, 141 insertions(+), 24 deletions(-)

diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h
index 3196663..ac69087 100644
--- a/include/net/netfilter/nf_tables_offload.h
+++ b/include/net/netfilter/nf_tables_offload.h
@@ -63,6 +63,8 @@ struct nft_flow_rule {
 struct nft_flow_rule *nft_flow_rule_create(const struct nft_rule *rule);
 void nft_flow_rule_destroy(struct nft_flow_rule *flow);
 int nft_flow_rule_offload_commit(struct net *net);
+bool nft_indr_get_default_block(struct net_device *dev,
+				struct flow_indr_block_info *info);
 
 #define NFT_OFFLOAD_MATCH(__key, __base, __field, __len, __reg)		\
 	(__reg)->base_offset	=					\
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 605a7cf..6a1d0b2 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -7593,6 +7593,11 @@ static void __net_exit nf_tables_exit_net(struct net *net)
 	.exit	= nf_tables_exit_net,
 };
 
+static struct flow_indr_get_block_entry get_block_entry = {
+	.get_block_cb = nft_indr_get_default_block,
+	.list = LIST_HEAD_INIT(get_block_entry.list),
+};
+
 static int __init nf_tables_module_init(void)
 {
 	int err;
@@ -7624,6 +7629,7 @@ static int __init nf_tables_module_init(void)
 		goto err5;
 
 	nft_chain_route_init();
+	flow_indr_add_default_block_cb(&get_block_entry);
 	return err;
 err5:
 	rhltable_destroy(&nft_objname_ht);
@@ -7640,6 +7646,7 @@ static int __init nf_tables_module_init(void)
 
 static void __exit nf_tables_module_exit(void)
 {
+	flow_indr_del_default_block_cb(&get_block_entry);
 	nfnetlink_subsys_unregister(&nf_tables_subsys);
 	unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
 	nft_chain_filter_fini();
diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c
index 64f5fd5..59c9629 100644
--- a/net/netfilter/nf_tables_offload.c
+++ b/net/netfilter/nf_tables_offload.c
@@ -171,24 +171,114 @@ static int nft_flow_offload_unbind(struct flow_block_offload *bo,
 	return 0;
 }
 
+static int nft_block_setup(struct nft_base_chain *basechain,
+			   struct flow_block_offload *bo,
+			   enum flow_block_command cmd)
+{
+	int err;
+
+	switch (cmd) {
+	case FLOW_BLOCK_BIND:
+		err = nft_flow_offload_bind(bo, basechain);
+		break;
+	case FLOW_BLOCK_UNBIND:
+		err = nft_flow_offload_unbind(bo, basechain);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		err = -EOPNOTSUPP;
+	}
+
+	return err;
+}
+
+static int nft_block_offload_cmd(struct nft_base_chain *chain,
+				 struct net_device *dev,
+				 enum flow_block_command cmd)
+{
+	struct netlink_ext_ack extack = {};
+	struct flow_block_offload bo = {};
+	int err;
+
+	bo.net = dev_net(dev);
+	bo.block = &chain->flow_block;
+	bo.command = cmd;
+	bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
+	bo.extack = &extack;
+	INIT_LIST_HEAD(&bo.cb_list);
+
+	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
+	if (err < 0)
+		return err;
+
+	return nft_block_setup(chain, &bo, cmd);
+}
+
+static void nft_indr_block_ing_cmd(struct net_device *dev,
+				   struct flow_block *flow_block,
+				   flow_indr_block_bind_cb_t *cb,
+				   void *cb_priv,
+				   enum flow_block_command cmd)
+{
+	struct netlink_ext_ack extack = {};
+	struct flow_block_offload bo = {};
+	struct nft_base_chain *chain;
+
+	if (flow_block)
+		return;
+
+	chain = container_of(flow_block, struct nft_base_chain, flow_block);
+
+	bo.net = dev_net(dev);
+	bo.block = flow_block;
+	bo.command = cmd;
+	bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
+	bo.extack = &extack;
+	INIT_LIST_HEAD(&bo.cb_list);
+
+	cb(dev, cb_priv, TC_SETUP_BLOCK, &bo);
+
+	nft_block_setup(chain, &bo, cmd);
+}
+
+static int nft_indr_block_offload_cmd(struct nft_base_chain *chain,
+				      struct net_device *dev,
+				      enum flow_block_command cmd)
+{
+	struct flow_block_offload bo = {};
+	struct netlink_ext_ack extack = {};
+
+	bo.net = dev_net(dev);
+	bo.block = &chain->flow_block;
+	bo.command = cmd;
+	bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
+	bo.extack = &extack;
+	INIT_LIST_HEAD(&bo.cb_list);
+
+	flow_indr_block_call(&chain->flow_block, dev, nft_indr_block_ing_cmd,
+			     &bo, cmd);
+
+	if (list_empty(&bo.cb_list))
+		return -EOPNOTSUPP;
+
+	return nft_block_setup(chain, &bo, cmd);
+}
+
 #define FLOW_SETUP_BLOCK TC_SETUP_BLOCK
 
 static int nft_flow_offload_chain(struct nft_trans *trans,
 				  enum flow_block_command cmd)
 {
 	struct nft_chain *chain = trans->ctx.chain;
-	struct netlink_ext_ack extack = {};
-	struct flow_block_offload bo = {};
 	struct nft_base_chain *basechain;
 	struct net_device *dev;
-	int err;
 
 	if (!nft_is_base_chain(chain))
 		return -EOPNOTSUPP;
 
 	basechain = nft_base_chain(chain);
 	dev = basechain->ops.dev;
-	if (!dev || !dev->netdev_ops->ndo_setup_tc)
+	if (!dev)
 		return -EOPNOTSUPP;
 
 	/* Only default policy to accept is supported for now. */
@@ -197,26 +287,10 @@ static int nft_flow_offload_chain(struct nft_trans *trans,
 	    nft_trans_chain_policy(trans) != NF_ACCEPT)
 		return -EOPNOTSUPP;
 
-	bo.command = cmd;
-	bo.block = &basechain->flow_block;
-	bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
-	bo.extack = &extack;
-	INIT_LIST_HEAD(&bo.cb_list);
-
-	err = dev->netdev_ops->ndo_setup_tc(dev, FLOW_SETUP_BLOCK, &bo);
-	if (err < 0)
-		return err;
-
-	switch (cmd) {
-	case FLOW_BLOCK_BIND:
-		err = nft_flow_offload_bind(&bo, basechain);
-		break;
-	case FLOW_BLOCK_UNBIND:
-		err = nft_flow_offload_unbind(&bo, basechain);
-		break;
-	}
-
-	return err;
+	if (dev->netdev_ops->ndo_setup_tc)
+		return nft_block_offload_cmd(basechain, dev, cmd);
+	else
+		return nft_indr_block_offload_cmd(basechain, dev, cmd);
 }
 
 int nft_flow_rule_offload_commit(struct net *net)
@@ -266,3 +340,37 @@ int nft_flow_rule_offload_commit(struct net *net)
 
 	return err;
 }
+
+bool nft_indr_get_default_block(struct net_device *dev,
+				struct flow_indr_block_info *info)
+{
+	struct net *net = dev_net(dev);
+	const struct nft_table *table;
+	const struct nft_chain *chain;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(table, &net->nft.tables, list) {
+		if (table->family != NFPROTO_NETDEV)
+			continue;
+
+		list_for_each_entry_rcu(chain, &table->chains, list) {
+			if (nft_is_base_chain(chain)) {
+				struct nft_base_chain *basechain;
+
+				basechain = nft_base_chain(chain);
+				if (!strncmp(basechain->dev_name, dev->name,
+					     IFNAMSIZ)) {
+					info->flow_block = &basechain->flow_block;
+					info->ing_cmd_cb = nft_indr_block_ing_cmd;
+					rcu_read_unlock();
+					return true;
+				}
+			}
+		}
+	}
+
+	rcu_read_unlock();
+
+	return false;
+}
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH net-next 4/6] flow_offload: move tc indirect block to flow offload
From: wenxu @ 2019-07-31  8:12 UTC (permalink / raw)
  To: pablo, fw, jakub.kicinski, jiri; +Cc: netfilter-devel, netdev
In-Reply-To: <1564560753-32603-1-git-send-email-wenxu@ucloud.cn>

From: wenxu <wenxu@ucloud.cn>

move tc indirect block to flow_offload and rename
it to flow indirect block.The nf_tables can use the
indr block architecture.

Signed-off-by: wenxu <wenxu@ucloud.cn>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   |  10 +-
 .../net/ethernet/netronome/nfp/flower/offload.c    |  11 +-
 include/net/flow_offload.h                         |  31 +++
 include/net/pkt_cls.h                              |  35 ---
 include/net/sch_generic.h                          |   3 -
 net/core/flow_offload.c                            | 218 +++++++++++++++++++
 net/sched/cls_api.c                                | 241 +--------------------
 7 files changed, 265 insertions(+), 284 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 7f747cb..074573b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -785,9 +785,9 @@ static int mlx5e_rep_indr_register_block(struct mlx5e_rep_priv *rpriv,
 {
 	int err;
 
-	err = __tc_indr_block_cb_register(netdev, rpriv,
-					  mlx5e_rep_indr_setup_tc_cb,
-					  rpriv);
+	err = __flow_indr_block_cb_register(netdev, rpriv,
+					    mlx5e_rep_indr_setup_tc_cb,
+					    rpriv);
 	if (err) {
 		struct mlx5e_priv *priv = netdev_priv(rpriv->netdev);
 
@@ -800,8 +800,8 @@ static int mlx5e_rep_indr_register_block(struct mlx5e_rep_priv *rpriv,
 static void mlx5e_rep_indr_unregister_block(struct mlx5e_rep_priv *rpriv,
 					    struct net_device *netdev)
 {
-	__tc_indr_block_cb_unregister(netdev, mlx5e_rep_indr_setup_tc_cb,
-				      rpriv);
+	__flow_indr_block_cb_unregister(netdev, mlx5e_rep_indr_setup_tc_cb,
+					rpriv);
 }
 
 static int mlx5e_nic_rep_netdevice_event(struct notifier_block *nb,
diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c
index e209f15..7b490db 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
@@ -1479,16 +1479,17 @@ int nfp_flower_reg_indir_block_handler(struct nfp_app *app,
 		return NOTIFY_OK;
 
 	if (event == NETDEV_REGISTER) {
-		err = __tc_indr_block_cb_register(netdev, app,
-						  nfp_flower_indr_setup_tc_cb,
-						  app);
+		err = __flow_indr_block_cb_register(netdev, app,
+						    nfp_flower_indr_setup_tc_cb,
+						    app);
 		if (err)
 			nfp_flower_cmsg_warn(app,
 					     "Indirect block reg failed - %s\n",
 					     netdev->name);
 	} else if (event == NETDEV_UNREGISTER) {
-		__tc_indr_block_cb_unregister(netdev,
-					      nfp_flower_indr_setup_tc_cb, app);
+		__flow_indr_block_cb_unregister(netdev,
+						nfp_flower_indr_setup_tc_cb,
+						app);
 	}
 
 	return NOTIFY_OK;
diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 00b9aab..c8d60a6 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -4,6 +4,7 @@
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <net/flow_dissector.h>
+#include <linux/rhashtable.h>
 
 struct flow_match {
 	struct flow_dissector	*dissector;
@@ -366,4 +367,34 @@ static inline void flow_block_init(struct flow_block *flow_block)
 	INIT_LIST_HEAD(&flow_block->cb_list);
 }
 
+typedef int flow_indr_block_bind_cb_t(struct net_device *dev, void *cb_priv,
+				      enum tc_setup_type type, void *type_data);
+
+typedef void flow_indr_block_ing_cmd_t(struct net_device *dev,
+				       struct flow_block *flow_block,
+				       flow_indr_block_bind_cb_t *cb,
+				       void *cb_priv,
+				       enum flow_block_command command);
+
+int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv,
+				  flow_indr_block_bind_cb_t *cb,
+				  void *cb_ident);
+
+void __flow_indr_block_cb_unregister(struct net_device *dev,
+				     flow_indr_block_bind_cb_t *cb,
+				     void *cb_ident);
+
+int flow_indr_block_cb_register(struct net_device *dev, void *cb_priv,
+				flow_indr_block_bind_cb_t *cb, void *cb_ident);
+
+void flow_indr_block_cb_unregister(struct net_device *dev,
+				   flow_indr_block_bind_cb_t *cb,
+				   void *cb_ident);
+
+void flow_indr_block_call(struct flow_block *flow_block,
+			  struct net_device *dev,
+			  flow_indr_block_ing_cmd_t *ing_cmd_cb,
+			  struct flow_block_offload *bo,
+			  enum flow_block_command command);
+
 #endif /* _NET_FLOW_OFFLOAD_H */
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index e429809..0790a4e 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -70,15 +70,6 @@ static inline struct Qdisc *tcf_block_q(struct tcf_block *block)
 	return block->q;
 }
 
-int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
-				tc_indr_block_bind_cb_t *cb, void *cb_ident);
-int tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
-			      tc_indr_block_bind_cb_t *cb, void *cb_ident);
-void __tc_indr_block_cb_unregister(struct net_device *dev,
-				   tc_indr_block_bind_cb_t *cb, void *cb_ident);
-void tc_indr_block_cb_unregister(struct net_device *dev,
-				 tc_indr_block_bind_cb_t *cb, void *cb_ident);
-
 int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		 struct tcf_result *res, bool compat_mode);
 
@@ -137,32 +128,6 @@ void tc_setup_cb_block_unregister(struct tcf_block *block, flow_setup_cb_t *cb,
 {
 }
 
-static inline
-int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
-				tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
-	return 0;
-}
-
-static inline
-int tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
-			      tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
-	return 0;
-}
-
-static inline
-void __tc_indr_block_cb_unregister(struct net_device *dev,
-				   tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
-}
-
-static inline
-void tc_indr_block_cb_unregister(struct net_device *dev,
-				 tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
-}
-
 static inline int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 			       struct tcf_result *res, bool compat_mode)
 {
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 6b6b012..d9f359a 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -23,9 +23,6 @@
 struct module;
 struct bpf_flow_keys;
 
-typedef int tc_indr_block_bind_cb_t(struct net_device *dev, void *cb_priv,
-				    enum tc_setup_type type, void *type_data);
-
 struct qdisc_rate_table {
 	struct tc_ratespec rate;
 	u32		data[256];
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index d63b970..a1fdfa4 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -2,6 +2,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <net/flow_offload.h>
+#include <linux/rtnetlink.h>
 
 struct flow_rule *flow_rule_alloc(unsigned int num_actions)
 {
@@ -280,3 +281,220 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f,
 	}
 }
 EXPORT_SYMBOL(flow_block_cb_setup_simple);
+
+static struct rhashtable indr_setup_block_ht;
+
+struct flow_indr_block_cb {
+	struct list_head list;
+	void *cb_priv;
+	flow_indr_block_bind_cb_t *cb;
+	void *cb_ident;
+};
+
+struct flow_indr_block_dev {
+	struct rhash_head ht_node;
+	struct net_device *dev;
+	unsigned int refcnt;
+	struct list_head cb_list;
+	flow_indr_block_ing_cmd_t *ing_cmd_cb;
+	struct flow_block *flow_block;
+};
+
+static const struct rhashtable_params flow_indr_setup_block_ht_params = {
+	.key_offset	= offsetof(struct flow_indr_block_dev, dev),
+	.head_offset	= offsetof(struct flow_indr_block_dev, ht_node),
+	.key_len	= sizeof(struct net_device *),
+};
+
+static struct flow_indr_block_dev *
+flow_indr_block_dev_lookup(struct net_device *dev)
+{
+	return rhashtable_lookup_fast(&indr_setup_block_ht, &dev,
+				      flow_indr_setup_block_ht_params);
+}
+
+static struct flow_indr_block_dev *
+flow_indr_block_dev_get(struct net_device *dev)
+{
+	struct flow_indr_block_dev *indr_dev;
+
+	indr_dev = flow_indr_block_dev_lookup(dev);
+	if (indr_dev)
+		goto inc_ref;
+
+	indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL);
+	if (!indr_dev)
+		return NULL;
+
+	INIT_LIST_HEAD(&indr_dev->cb_list);
+	indr_dev->dev = dev;
+	if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node,
+				   flow_indr_setup_block_ht_params)) {
+		kfree(indr_dev);
+		return NULL;
+	}
+
+inc_ref:
+	indr_dev->refcnt++;
+	return indr_dev;
+}
+
+static void flow_indr_block_dev_put(struct flow_indr_block_dev *indr_dev)
+{
+	if (--indr_dev->refcnt)
+		return;
+
+	rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node,
+			       flow_indr_setup_block_ht_params);
+	kfree(indr_dev);
+}
+
+static struct flow_indr_block_cb *
+flow_indr_block_cb_lookup(struct flow_indr_block_dev *indr_dev,
+			  flow_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+	struct flow_indr_block_cb *indr_block_cb;
+
+	list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
+		if (indr_block_cb->cb == cb &&
+		    indr_block_cb->cb_ident == cb_ident)
+			return indr_block_cb;
+	return NULL;
+}
+
+static struct flow_indr_block_cb *
+flow_indr_block_cb_add(struct flow_indr_block_dev *indr_dev, void *cb_priv,
+		       flow_indr_block_bind_cb_t *cb, void *cb_ident)
+{
+	struct flow_indr_block_cb *indr_block_cb;
+
+	indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident);
+	if (indr_block_cb)
+		return ERR_PTR(-EEXIST);
+
+	indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL);
+	if (!indr_block_cb)
+		return ERR_PTR(-ENOMEM);
+
+	indr_block_cb->cb_priv = cb_priv;
+	indr_block_cb->cb = cb;
+	indr_block_cb->cb_ident = cb_ident;
+	list_add(&indr_block_cb->list, &indr_dev->cb_list);
+
+	return indr_block_cb;
+}
+
+static void flow_indr_block_cb_del(struct flow_indr_block_cb *indr_block_cb)
+{
+	list_del(&indr_block_cb->list);
+	kfree(indr_block_cb);
+}
+
+int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv,
+				  flow_indr_block_bind_cb_t *cb,
+				  void *cb_ident)
+{
+	struct flow_indr_block_cb *indr_block_cb;
+	struct flow_indr_block_dev *indr_dev;
+	int err;
+
+	indr_dev = flow_indr_block_dev_get(dev);
+	if (!indr_dev)
+		return -ENOMEM;
+
+	indr_block_cb = flow_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident);
+	err = PTR_ERR_OR_ZERO(indr_block_cb);
+	if (err)
+		goto err_dev_put;
+
+	if (indr_dev->ing_cmd_cb)
+		indr_dev->ing_cmd_cb(indr_dev->dev, indr_dev->flow_block,
+				     indr_block_cb->cb, indr_block_cb->cb_priv,
+				     FLOW_BLOCK_BIND);
+
+	return 0;
+
+err_dev_put:
+	flow_indr_block_dev_put(indr_dev);
+	return err;
+}
+EXPORT_SYMBOL_GPL(__flow_indr_block_cb_register);
+
+int flow_indr_block_cb_register(struct net_device *dev, void *cb_priv,
+				flow_indr_block_bind_cb_t *cb,
+				void *cb_ident)
+{
+	int err;
+
+	rtnl_lock();
+	err = __flow_indr_block_cb_register(dev, cb_priv, cb, cb_ident);
+	rtnl_unlock();
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(flow_indr_block_cb_register);
+
+void __flow_indr_block_cb_unregister(struct net_device *dev,
+				     flow_indr_block_bind_cb_t *cb,
+				     void *cb_ident)
+{
+	struct flow_indr_block_cb *indr_block_cb;
+	struct flow_indr_block_dev *indr_dev;
+
+	indr_dev = flow_indr_block_dev_lookup(dev);
+	if (!indr_dev)
+		return;
+
+	indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident);
+	if (!indr_block_cb)
+		return;
+
+	/* Send unbind message if required to free any block cbs. */
+	if (indr_dev->ing_cmd_cb)
+		indr_dev->ing_cmd_cb(indr_dev->dev, indr_dev->flow_block,
+				     indr_block_cb->cb, indr_block_cb->cb_priv,
+				     FLOW_BLOCK_UNBIND);
+
+	flow_indr_block_cb_del(indr_block_cb);
+	flow_indr_block_dev_put(indr_dev);
+}
+EXPORT_SYMBOL_GPL(__flow_indr_block_cb_unregister);
+
+void flow_indr_block_cb_unregister(struct net_device *dev,
+				   flow_indr_block_bind_cb_t *cb,
+				   void *cb_ident)
+{
+	rtnl_lock();
+	__flow_indr_block_cb_unregister(dev, cb, cb_ident);
+	rtnl_unlock();
+}
+EXPORT_SYMBOL_GPL(flow_indr_block_cb_unregister);
+
+void flow_indr_block_call(struct flow_block *flow_block,
+			  struct net_device *dev,
+			  flow_indr_block_ing_cmd_t *ing_cmd_cb,
+			  struct flow_block_offload *bo,
+			  enum flow_block_command command)
+{
+	struct flow_indr_block_cb *indr_block_cb;
+	struct flow_indr_block_dev *indr_dev;
+
+	indr_dev = flow_indr_block_dev_lookup(dev);
+	if (!indr_dev)
+		return;
+
+	indr_dev->flow_block = command == FLOW_BLOCK_BIND ? flow_block : NULL;
+	indr_dev->ing_cmd_cb = command == FLOW_BLOCK_BIND ? ing_cmd_cb : NULL;
+
+	list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
+		indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
+				  bo);
+}
+EXPORT_SYMBOL_GPL(flow_indr_block_call);
+
+static int __init init_flow_indr_rhashtable(void)
+{
+	return rhashtable_init(&indr_setup_block_ht,
+			       &flow_indr_setup_block_ht_params);
+}
+subsys_initcall(init_flow_indr_rhashtable);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 617b098..bd5e591 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -37,6 +37,7 @@
 #include <net/tc_act/tc_skbedit.h>
 #include <net/tc_act/tc_ct.h>
 #include <net/tc_act/tc_mpls.h>
+#include <net/flow_offload.h>
 
 extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
 
@@ -545,149 +546,12 @@ static void tcf_chain_flush(struct tcf_chain *chain, bool rtnl_held)
 	}
 }
 
-static struct tcf_block *tc_dev_ingress_block(struct net_device *dev)
-{
-	const struct Qdisc_class_ops *cops;
-	struct Qdisc *qdisc;
-
-	if (!dev_ingress_queue(dev))
-		return NULL;
-
-	qdisc = dev_ingress_queue(dev)->qdisc_sleeping;
-	if (!qdisc)
-		return NULL;
-
-	cops = qdisc->ops->cl_ops;
-	if (!cops)
-		return NULL;
-
-	if (!cops->tcf_block)
-		return NULL;
-
-	return cops->tcf_block(qdisc, TC_H_MIN_INGRESS, NULL);
-}
-
-static struct rhashtable indr_setup_block_ht;
-
-struct tc_indr_block_dev {
-	struct rhash_head ht_node;
-	struct net_device *dev;
-	unsigned int refcnt;
-	struct list_head cb_list;
-	struct flow_block *flow_block;
-};
-
-struct tc_indr_block_cb {
-	struct list_head list;
-	void *cb_priv;
-	tc_indr_block_bind_cb_t *cb;
-	void *cb_ident;
-};
-
-static const struct rhashtable_params tc_indr_setup_block_ht_params = {
-	.key_offset	= offsetof(struct tc_indr_block_dev, dev),
-	.head_offset	= offsetof(struct tc_indr_block_dev, ht_node),
-	.key_len	= sizeof(struct net_device *),
-};
-
-static struct tc_indr_block_dev *
-tc_indr_block_dev_lookup(struct net_device *dev)
-{
-	return rhashtable_lookup_fast(&indr_setup_block_ht, &dev,
-				      tc_indr_setup_block_ht_params);
-}
-
-static void tc_indr_get_default_block(struct tc_indr_block_dev *indr_dev)
-{
-	struct tcf_block *block = tc_dev_ingress_block(indr_dev->dev);
-
-	if (block)
-		indr_dev->flow_block = &block->flow_block;
-}
-
-static struct tc_indr_block_dev *tc_indr_block_dev_get(struct net_device *dev)
-{
-	struct tc_indr_block_dev *indr_dev;
-
-	indr_dev = tc_indr_block_dev_lookup(dev);
-	if (indr_dev)
-		goto inc_ref;
-
-	indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL);
-	if (!indr_dev)
-		return NULL;
-
-	INIT_LIST_HEAD(&indr_dev->cb_list);
-	indr_dev->dev = dev;
-	tc_indr_get_default_block(indr_dev);
-	if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node,
-				   tc_indr_setup_block_ht_params)) {
-		kfree(indr_dev);
-		return NULL;
-	}
-
-inc_ref:
-	indr_dev->refcnt++;
-	return indr_dev;
-}
-
-static void tc_indr_block_dev_put(struct tc_indr_block_dev *indr_dev)
-{
-	if (--indr_dev->refcnt)
-		return;
-
-	rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node,
-			       tc_indr_setup_block_ht_params);
-	kfree(indr_dev);
-}
-
-static struct tc_indr_block_cb *
-tc_indr_block_cb_lookup(struct tc_indr_block_dev *indr_dev,
-			tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
-	struct tc_indr_block_cb *indr_block_cb;
-
-	list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
-		if (indr_block_cb->cb == cb &&
-		    indr_block_cb->cb_ident == cb_ident)
-			return indr_block_cb;
-	return NULL;
-}
-
-static struct tc_indr_block_cb *
-tc_indr_block_cb_add(struct tc_indr_block_dev *indr_dev, void *cb_priv,
-		     tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
-	struct tc_indr_block_cb *indr_block_cb;
-
-	indr_block_cb = tc_indr_block_cb_lookup(indr_dev, cb, cb_ident);
-	if (indr_block_cb)
-		return ERR_PTR(-EEXIST);
-
-	indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL);
-	if (!indr_block_cb)
-		return ERR_PTR(-ENOMEM);
-
-	indr_block_cb->cb_priv = cb_priv;
-	indr_block_cb->cb = cb;
-	indr_block_cb->cb_ident = cb_ident;
-	list_add(&indr_block_cb->list, &indr_dev->cb_list);
-
-	return indr_block_cb;
-}
-
-static void tc_indr_block_cb_del(struct tc_indr_block_cb *indr_block_cb)
-{
-	list_del(&indr_block_cb->list);
-	kfree(indr_block_cb);
-}
-
 static int tcf_block_setup(struct tcf_block *block,
 			   struct flow_block_offload *bo);
 
 static void tc_indr_block_ing_cmd(struct net_device *dev,
 				  struct flow_block *flow_block,
-				  tc_indr_block_bind_cb_t *cb,
+				  flow_indr_block_bind_cb_t *cb,
 				  void *cb_priv,
 				  enum flow_block_command command)
 {
@@ -712,96 +576,6 @@ static void tc_indr_block_ing_cmd(struct net_device *dev,
 	tcf_block_setup(block, &bo);
 }
 
-int __tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
-				tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
-	struct tc_indr_block_cb *indr_block_cb;
-	struct tc_indr_block_dev *indr_dev;
-	int err;
-
-	indr_dev = tc_indr_block_dev_get(dev);
-	if (!indr_dev)
-		return -ENOMEM;
-
-	indr_block_cb = tc_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident);
-	err = PTR_ERR_OR_ZERO(indr_block_cb);
-	if (err)
-		goto err_dev_put;
-
-	tc_indr_block_ing_cmd(dev, indr_dev->flow_block, cb, cb_priv,
-			      FLOW_BLOCK_BIND);
-	return 0;
-
-err_dev_put:
-	tc_indr_block_dev_put(indr_dev);
-	return err;
-}
-EXPORT_SYMBOL_GPL(__tc_indr_block_cb_register);
-
-int tc_indr_block_cb_register(struct net_device *dev, void *cb_priv,
-			      tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
-	int err;
-
-	rtnl_lock();
-	err = __tc_indr_block_cb_register(dev, cb_priv, cb, cb_ident);
-	rtnl_unlock();
-
-	return err;
-}
-EXPORT_SYMBOL_GPL(tc_indr_block_cb_register);
-
-void __tc_indr_block_cb_unregister(struct net_device *dev,
-				   tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
-	struct tc_indr_block_cb *indr_block_cb;
-	struct tc_indr_block_dev *indr_dev;
-
-	indr_dev = tc_indr_block_dev_lookup(dev);
-	if (!indr_dev)
-		return;
-
-	indr_block_cb = tc_indr_block_cb_lookup(indr_dev, indr_block_cb->cb,
-						indr_block_cb->cb_ident);
-	if (!indr_block_cb)
-		return;
-
-	/* Send unbind message if required to free any block cbs. */
-	tc_indr_block_ing_cmd(dev, indr_dev->flow_block, indr_block_cb->cb,
-			      indr_block_cb->cb_priv, FLOW_BLOCK_UNBIND);
-	tc_indr_block_cb_del(indr_block_cb);
-	tc_indr_block_dev_put(indr_dev);
-}
-EXPORT_SYMBOL_GPL(__tc_indr_block_cb_unregister);
-
-void tc_indr_block_cb_unregister(struct net_device *dev,
-				 tc_indr_block_bind_cb_t *cb, void *cb_ident)
-{
-	rtnl_lock();
-	__tc_indr_block_cb_unregister(dev, cb, cb_ident);
-	rtnl_unlock();
-}
-EXPORT_SYMBOL_GPL(tc_indr_block_cb_unregister);
-
-static void flow_indr_block_call(struct flow_block *flow_block,
-				 struct net_device *dev,
-				 struct flow_block_offload *bo,
-				 enum flow_block_command command)
-{
-	struct tc_indr_block_cb *indr_block_cb;
-	struct tc_indr_block_dev *indr_dev;
-
-	indr_dev = tc_indr_block_dev_lookup(dev);
-	if (!indr_dev)
-		return;
-
-	indr_dev->flow_block = command == FLOW_BLOCK_BIND ? flow_block : NULL;
-
-	list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
-		indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
-				  bo);
-}
-
 static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev,
 			       struct tcf_block_ext_info *ei,
 			       enum flow_block_command command,
@@ -817,7 +591,9 @@ static void tc_indr_block_call(struct tcf_block *block, struct net_device *dev,
 	};
 	INIT_LIST_HEAD(&bo.cb_list);
 
-	flow_indr_block_call(&block->flow_block, dev, &bo, command);
+	flow_indr_block_call(&block->flow_block, dev, tc_indr_block_ing_cmd,
+			     &bo, command);
+
 	tcf_block_setup(block, &bo);
 }
 
@@ -3382,11 +3158,6 @@ static int __init tc_filter_init(void)
 	if (err)
 		goto err_register_pernet_subsys;
 
-	err = rhashtable_init(&indr_setup_block_ht,
-			      &tc_indr_setup_block_ht_params);
-	if (err)
-		goto err_rhash_setup_block_ht;
-
 	rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL,
 		      RTNL_FLAG_DOIT_UNLOCKED);
 	rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL,
@@ -3400,8 +3171,6 @@ static int __init tc_filter_init(void)
 
 	return 0;
 
-err_rhash_setup_block_ht:
-	unregister_pernet_subsys(&tcf_net_ops);
 err_register_pernet_subsys:
 	destroy_workqueue(tc_filter_wq);
 	return err;
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH net-next 0/6] flow_offload: add indr-block in nf_table_offload
From: wenxu @ 2019-07-31  8:12 UTC (permalink / raw)
  To: pablo, fw, jakub.kicinski, jiri; +Cc: netfilter-devel, netdev

From: wenxu <wenxu@ucloud.cn>

This series patch make nftables offload support the vlan and
tunnel device offload through indr-block architecture.

The first four patches mv tc indr block to flow offload and
rename to flow-indr-block.
Because the new flow-indr-block can't get the tcf_block
directly. The fifthe patch provide a callback list to get 
flow_block of each subsystem immediately when the device
register and contain a block.
The last patch make nf_tables_offload support flow-indr-block.

wenxu (6):
  cls_api: modify the tc_indr_block_ing_cmd parameters.
  cls_api: replace block with flow_block in tc_indr_block_dev
  cls_api: add flow_indr_block_call function
  flow_offload: move tc indirect block to flow offload
  flow_offload: support get flow_block immediately
  netfilter: nf_tables_offload: support indr block call

 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   |  10 +-
 .../net/ethernet/netronome/nfp/flower/offload.c    |  11 +-
 include/net/flow_offload.h                         |  48 ++++
 include/net/netfilter/nf_tables_offload.h          |   2 +
 include/net/pkt_cls.h                              |  35 ---
 include/net/sch_generic.h                          |   3 -
 net/core/flow_offload.c                            | 251 ++++++++++++++++++++
 net/netfilter/nf_tables_api.c                      |   7 +
 net/netfilter/nf_tables_offload.c                  | 156 +++++++++++--
 net/sched/cls_api.c                                | 255 ++++-----------------
 10 files changed, 497 insertions(+), 281 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* Re: [PATCH v2 bpf-next 1/4] bpf: unprivileged BPF access via /dev/bpf
From: Song Liu @ 2019-07-31  8:10 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Kees Cook, Networking, bpf, Alexei Starovoitov, Daniel Borkmann,
	Kernel Team, Lorenz Bauer, Jann Horn, Greg KH, Linux API,
	LSM List
In-Reply-To: <CALCETrUpVMrk7aaf0trfg9AfZ4fy279uJgZH7V+gZzjFw=hUxA@mail.gmail.com>



> On Jul 30, 2019, at 1:24 PM, Andy Lutomirski <luto@kernel.org> wrote:
> 
> On Mon, Jul 29, 2019 at 10:07 PM Song Liu <songliubraving@fb.com> wrote:
>> 
>> Hi Andy,
>> 
>>> On Jul 27, 2019, at 11:20 AM, Song Liu <songliubraving@fb.com> wrote:
>>> 
>>> Hi Andy,
>>> 
>>> 

[...]

>>> 
>> 
>> I would like more comments on this.
>> 
>> Currently, bpf permission is more or less "root or nothing", which we
>> would like to change.
>> 
>> The short term goal is to separate bpf from root, in other words, it is
>> "all or nothing". Special user space utilities, such as systemd, would
>> benefit from this. Once this is implemented, systemd can call sys_bpf()
>> when it is not running as root.
> 
> As generally nasty as Linux capabilities are, this sounds like a good
> use for CAP_BPF_ADMIN.

I actually agree CAP_BPF_ADMIN makes sense. The hard part is to make 
existing tools (setcap, getcap, etc.) and libraries aware of the new CAP.

> 
> But what do you have in mind?  Isn't non-root systemd mostly just the
> user systemd session?  That should *not* have bpf() privileges until
> bpf() is improved such that you can't use it to compromise the system.

cgroup bpf is the major use case here. A less important use case is to 
run bpf selftests without being root. 

> 
>> 
>> In longer term, it may be useful to provide finer grain permission of
>> sys_bpf(). For example, sys_bpf() should be aware of containers; and
>> user may only have access to certain bpf maps. Let's call this
>> "fine grain" capability.
>> 
>> 
>> Since we are seeing new use cases every year, we will need many
>> iterations to implement the fine grain permission. I think we need an
>> API that is flexible enough to cover different types of permission
>> control.
>> 
>> For example, bpf_with_cap() can be flexible:
>> 
>>        bpf_with_cap(cmd, attr, size, perm_fd);
>> 
>> We can get different types of permission via different combinations of
>> arguments:
>> 
>>    A perm_fd to /dev/bpf gives access to all sys_bpf() commands, so
>>    this is "all or nothing" permission.
>> 
>>    A perm_fd to /sys/fs/cgroup/.../bpf.xxx would only allow some
>>    commands to this specific cgroup.
>> 
> 
> I don't see why you need to invent a whole new mechanism for this.
> The entire cgroup ecosystem outside bpf() does just fine using the
> write permission on files in cgroupfs to control access.  Why can't
> bpf() do the same thing?

It is easier to use write permission for BPF_PROG_ATTACH. But it is 
not easy to do the same for other bpf commands: BPF_PROG_LOAD and 
BPF_MAP_*. A lot of these commands don't have target concept. Maybe 
we should have target concept for all these commands. But that is a 
much bigger project. OTOH, "all or nothing" model allows all these 
commands at once.

Well, that being said, I will look more into using write permission 
in cgroupfs. 

Thanks again for all these comments and suggestions. Please let us 
know your future thoughts and insights. 

Song

^ permalink raw reply

* [PATCH 2/2] net: ag71xx: Use GFP_KERNEL instead of GFP_ATOMIC in 'ag71xx_rings_init()'
From: Christophe JAILLET @ 2019-07-31  8:06 UTC (permalink / raw)
  To: jcliburn, davem, chris.snook
  Cc: netdev, linux-kernel, kernel-janitors, Christophe JAILLET
In-Reply-To: <cover.1564560130.git.christophe.jaillet@wanadoo.fr>

There is no need to use GFP_ATOMIC here, GFP_KERNEL should be enough.
The 'kcalloc()' just a few lines above, already uses GFP_KERNEL.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
---
I've done my best to check if a spinlock can be hold when reaching this
code. Apparently it is never the case.
But double check to be sure that it is not the kcalloc that should use
GFP_ATOMIC.
---
 drivers/net/ethernet/atheros/ag71xx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/atheros/ag71xx.c b/drivers/net/ethernet/atheros/ag71xx.c
index 40a8717f51b1..7548247455d7 100644
--- a/drivers/net/ethernet/atheros/ag71xx.c
+++ b/drivers/net/ethernet/atheros/ag71xx.c
@@ -1141,7 +1141,7 @@ static int ag71xx_rings_init(struct ag71xx *ag)
 
 	tx->descs_cpu = dma_alloc_coherent(&ag->pdev->dev,
 					   ring_size * AG71XX_DESC_SIZE,
-					   &tx->descs_dma, GFP_ATOMIC);
+					   &tx->descs_dma, GFP_KERNEL);
 	if (!tx->descs_cpu) {
 		kfree(tx->buf);
 		tx->buf = NULL;
-- 
2.20.1


^ permalink raw reply related

* [PATCH 1/2] net: ag71xx: Slighly simplify code in 'ag71xx_rings_init()'
From: Christophe JAILLET @ 2019-07-31  8:06 UTC (permalink / raw)
  To: jcliburn, davem, chris.snook
  Cc: netdev, linux-kernel, kernel-janitors, Christophe JAILLET
In-Reply-To: <cover.1564560130.git.christophe.jaillet@wanadoo.fr>

A few lines above, we have:
   tx_size = BIT(tx->order);

So use 'tx_size' directly to be consistent with the way 'rx->descs_cpu' and
'rx->descs_dma' are computed below.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
---
 drivers/net/ethernet/atheros/ag71xx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/atheros/ag71xx.c b/drivers/net/ethernet/atheros/ag71xx.c
index 77542bd1e5bd..40a8717f51b1 100644
--- a/drivers/net/ethernet/atheros/ag71xx.c
+++ b/drivers/net/ethernet/atheros/ag71xx.c
@@ -1148,7 +1148,7 @@ static int ag71xx_rings_init(struct ag71xx *ag)
 		return -ENOMEM;
 	}
 
-	rx->buf = &tx->buf[BIT(tx->order)];
+	rx->buf = &tx->buf[tx_size];
 	rx->descs_cpu = ((void *)tx->descs_cpu) + tx_size * AG71XX_DESC_SIZE;
 	rx->descs_dma = tx->descs_dma + tx_size * AG71XX_DESC_SIZE;
 
-- 
2.20.1


^ permalink raw reply related

* [PATCH 0/2] net: ag71xx: 2 small patches for 'ag71xx_rings_init()'
From: Christophe JAILLET @ 2019-07-31  8:06 UTC (permalink / raw)
  To: jcliburn, davem, chris.snook
  Cc: netdev, linux-kernel, kernel-janitors, Christophe JAILLET

Christophe JAILLET (2):
  net: ag71xx: Slighly simplify code in 'ag71xx_rings_init()'
  net: ag71xx: Use GFP_KERNEL instead of GFP_ATOMIC in
    'ag71xx_rings_init()'

 drivers/net/ethernet/atheros/ag71xx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

-- 
2.20.1


^ permalink raw reply

* Re: [PATCH] net: bridge: Allow bridge to joing multicast groups
From: Allan W. Nielsen @ 2019-07-31  8:01 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Ido Schimmel, Nikolay Aleksandrov, Horatiu Vultur, roopa, davem,
	bridge, netdev, linux-kernel
In-Reply-To: <20190731033156.GE9523@lunn.ch>

The 07/31/2019 05:31, Andrew Lunn wrote:
> The PDF you linked to suggests this as well. But i think you will need
> to make some core changes to the bridge. At the moment, STP is a
> bridge level property. But you are going to need it to be a per-port
> option. You can then use DLR on the ring ports, and optionally STP on
> the other ports.
I think you are right. I have not looked into the details of this. Our plan was
to not implement this to begin with. This will mean that it cannot act as a
gateway until this is done. But it still has good value to be able to implement
a DLR ring node.

> > But what we are looking at here, is to offload a
> > non-aware-(DLR|MRP)-switch which happens to be placed in a network
> > with these protocols running.
> 
> So we need to think about why we are passing traffic to the CPU port,
> and under what conditions can it be blocked.
> 
> 1) The interface is not part of a bridge. In this case, we only need
> the switch to pass to the CPU port MC addresses which have been set
> via set_rx_mode().
Yes. In our HW we are using the MAC table to do this, but this is an
implementation detail.

> I think this case does not apply for what you want. You have two ports
> bridges together as part of the ring.
Correct.

> 2) The interface is part of a bridge. There are a few sub-cases
> 
> a) IGMP snooping is being performed. We can block multicast where
> there is no interest in the group. But this is limited to IP
> multicast.
Agree. And this is done today by installing an explicit offload rule to limit
the flooding of a specific group.

> b) IGMP snooping is not being used and all interfaces in the bridge
> are ports of the switch. IP Multicast can be blocked to the CPU.
Does it matter if IGMP snooping is used or not? A more general statement could
be:

- "All interfaces in the bridge are ports of the switch. IP Multicast can be
  blocked to the CPU."

This assumes that if br0 joins a IP multicast group, then the needed MAC
addresses is added via the set_rx_mode (which I'm pretty sure is the case.

Or much more aggressive (which is where we started this entire discussion):

- "All interfaces in the bridge are ports of the switch. All Multicast (except
  ff:ff:ff:ff:ff:ff, 33:33:xx:xx:xx:xx, 01:80:C2:xx:xx:xx) can be blocked to the
  CPU."

But then we are back to having the need of requesting additional L2-multicast
mac addresses to the CPU. This has been discussed, and I do not want to re-start
the discussion, just wanted to mention it to have the complete picture.

> c) IGMP snooping is not being used and there is a non-switch interface
> in the bridge. Multicast needed is needed, so it can be flooded out
> this port.
Yes. And L2-multicast also always needs to go to the CPU regardless of IGMP.

> d) set_rx_mode() has been called on the br0 interface, indicating
> there is interest in the packets on the host. They must be sent to the
> CPU so they can be delivered locally.
Yes - but today set_rx_mode is not doing anything on br0. This was one problem
we had a few days ago, when we did not forward all traffic to the CPU by
default.

When looking at these cases, I'm not sure it matters if IGMP snooping is running
or not.

Here is how I understand what you say:

              || Foreign interfaces   |  No Foreign interfaces
==============||======================|=========================
IGMP Snoop on || IP-Multicast must    |  IP-Multicast is not needed
              || go to CPU by default |  in the CPU (by default*)
--------------||----------------------|-------------------------
IGMP Snoop off|| IP-Multicast must    |  IP-Multicast is not needed 
              || go to CPU by default |  in the CPU (by default*)

* This assumes that set_rx_mode starts to do something for the br0 interface
  (which does not seem to be the case right now - see br_dev_set_multicast_list).


> e) ????
e) Another case to consider is similar to d), but with VLANs. Assume that
br0.100 joins a IP-Multicast group, which will cause set_rx_mode() to be called.
As I read the code (I have not tried it, so I might have gotten this wrong),
then we loose the VLAN information, because the mc list is not VLAN aware.

This means that if br0.100 joins a group, then we will get that group for all
VLANs.

Maybe it is better (for now) to hold on to the exiting behavioral and let all
multicast go to the CPU by default, leave br_dev_set_multicast_list empty, use
IGMP snooping to optimize the flooding, and hopefully we will have a way to
limit the L2-multicast flooding as an optimization which can be applied to
"noisy" l2 multicast streams.

> Does the Multicast MAC address being used by DLR also map to an IP
> mmulticast address?
No.

> 01:21:6C:00:00:0[123] appear to be the MAC addresses used by DLR.
Yes.

> IPv4 multicast MAC addresses are 01:00:5E:XX:XX:XX. IPv6 multicast MAC
> addresses are 33:33:XX:XX:XX:XX.
Yes, and there are no overlap.

> So one possibility here is to teach the SW bridge about non-IP
> multicast addresses. Initially the switch should forward all MAC
> multicast frames to the CPU. If the frame is not an IPv4 or IPv6
> frame, and there has not been a call to set_rx_mode() for the MAC
> address on the br0 interface, and the bridge only contains switch
> ports, switchdev could be used to block the multicast to the CPU
> frame, but forward it out all other ports of the bridge.
Close, but not exactly (due to the arguments above).

Here is how I see it:

Teach the SW bridge about non-IP multicast addresses. Initially the switch
should forward all MAC multicast frames to the CPU. Today MDB rules can be
installed (either static or dynamic by IGMP), which limit the flooding of IPv4/6
multicast streams. In the same way, we should have a way to install a rule
(FDM/ or MDB) to limit the flooding of L2 multicast frames.

If foreign interfaces (or br0 it self) is part of the destination list, then
traffic also needs to go to the CPU.

By doing this, we can for explicitly configured dst mac address:
- limit the flooding on the on the SW bridge interfaces
- limit the flooding on the on the HW bridge interfaces
- prevent them to go to the CPU if they are not needed


-- 
/Allan

^ permalink raw reply

* Re: [PATCH net-next v4 6/6] net: mscc: PTP Hardware Clock (PHC) support
From: antoine.tenart @ 2019-07-31  7:46 UTC (permalink / raw)
  To: Saeed Mahameed
  Cc: antoine.tenart@bootlin.com, richardcochran@gmail.com,
	davem@davemloft.net, UNGLinuxDriver@microchip.com,
	alexandre.belloni@bootlin.com, netdev@vger.kernel.org,
	thomas.petazzoni@bootlin.com, allan.nielsen@microchip.com
In-Reply-To: <2b70e0fbebf1875c51272f3005271a9aec68f00f.camel@mellanox.com>

Hello Saeed,

On Fri, Jul 26, 2019 at 08:52:10PM +0000, Saeed Mahameed wrote:
> On Thu, 2019-07-25 at 16:27 +0200, Antoine Tenart wrote:
> >  
> >  	dev->stats.tx_packets++;
> >  	dev->stats.tx_bytes += skb->len;
> > -	dev_kfree_skb_any(skb);
> > +
> > +	if (ocelot->ptp && shinfo->tx_flags & SKBTX_HW_TSTAMP &&
> > +	    port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) {
> > +		struct ocelot_skb *oskb =
> > +			kzalloc(sizeof(struct ocelot_skb), GFP_ATOMIC);
> > +
> 
> Device drivers normally pre allocate descriptor info ring array to
> avoid dynamic atomic allocations of private data on data path.

This depends on drivers. It's a good thing to do but I don't think it's
required here for a first version, we can improve this later.

> > +		oskb->skb = skb;
> > +		oskb->id = port->ts_id % 4;
> > +		port->ts_id++;
> 
> missing skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; ?
> see 3.1 Hardware Timestamping Implementation: Device Drivers
> https://www.kernel.org/doc/Documentation/networking/timestamping.txt

Right, I'll add this.

> > +void ocelot_get_hwtimestamp(struct ocelot *ocelot, struct timespec64
> > *ts)
> > +{
> > +	unsigned long flags;
> > +	u32 val;
> > +
> > +	spin_lock_irqsave(&ocelot->ptp_clock_lock, flags);
> > +
> > +	/* Read current PTP time to get seconds */
> > +	val = ocelot_read_rix(ocelot, PTP_PIN_CFG, TOD_ACC_PIN);
> > +
> > +	val &= ~(PTP_PIN_CFG_SYNC | PTP_PIN_CFG_ACTION_MASK |
> > PTP_PIN_CFG_DOM);
> > +	val |= PTP_PIN_CFG_ACTION(PTP_PIN_ACTION_SAVE);
> > +	ocelot_write_rix(ocelot, val, PTP_PIN_CFG, TOD_ACC_PIN);
> > +	ts->tv_sec = ocelot_read_rix(ocelot, PTP_PIN_TOD_SEC_LSB,
> > TOD_ACC_PIN);
> > +
> > +	/* Read packet HW timestamp from FIFO */
> > +	val = ocelot_read(ocelot, SYS_PTP_TXSTAMP);
> > +	ts->tv_nsec = SYS_PTP_TXSTAMP_PTP_TXSTAMP(val);
> > +
> > +	/* Sec has incremented since the ts was registered */
> > +	if ((ts->tv_sec & 0x1) != !!(val &
> > SYS_PTP_TXSTAMP_PTP_TXSTAMP_SEC))
> > +		ts->tv_sec--;
> > +
> > +	spin_unlock_irqrestore(&ocelot->ptp_clock_lock, flags);
> > +}
> > +EXPORT_SYMBOL(ocelot_get_hwtimestamp);
> > +
> 
> Why EXPORT_SYMBOL? this is the last patch and it is touching one
> driver.

Because the function is used in ocelot_board.c, which can be part of
another module (the mscc/ driver provides 2 modules). You can find
other examples of this in this driver.

> > @@ -98,7 +100,11 @@ static irqreturn_t ocelot_xtr_irq_handler(int
> > irq, void *arg)
> >  		int sz, len, buf_len;
> >  		u32 ifh[4];
> >  		u32 val;
> > -		struct frame_info info;
> > +		struct frame_info info = {};
> > +		struct timespec64 ts;
> > +		struct skb_shared_hwtstamps *shhwtstamps;
> > +		u64 tod_in_ns;
> > +		u64 full_ts_in_ns;
> 
> reverse xmas tree.

OK.

> > @@ -145,6 +151,22 @@ static irqreturn_t ocelot_xtr_irq_handler(int
> > irq, void *arg)
> >  			break;
> >  		}
> >  
> > +		if (ocelot->ptp) {
> > +			ocelot_ptp_gettime64(&ocelot->ptp_info, &ts);
> > +
> > +			tod_in_ns = ktime_set(ts.tv_sec, ts.tv_nsec);
> > +			if ((tod_in_ns & 0xffffffff) < info.timestamp)
> > +				full_ts_in_ns = (((tod_in_ns >> 32) -
> > 1) << 32) |
> > +						info.timestamp;
> > +			else
> > +				full_ts_in_ns = (tod_in_ns &
> > GENMASK_ULL(63, 32)) |
> > +						info.timestamp;
> > +
> > +			shhwtstamps = skb_hwtstamps(skb);
> > +			memset(shhwtstamps, 0, sizeof(struct
> > skb_shared_hwtstamps));
> > +			shhwtstamps->hwtstamp = full_ts_in_ns;
> 
> the right way to set the timestamp is by calling: 
> skb_tstamp_tx(skb, &tstamp);

I'll fix this.

> > +static irqreturn_t ocelot_ptp_rdy_irq_handler(int irq, void *arg)
> > +{
> > +	int budget = OCELOT_PTP_QUEUE_SZ;
> > +	struct ocelot *ocelot = arg;
> > +
> > +	do {
> > +		struct skb_shared_hwtstamps shhwtstamps;
> > +		struct list_head *pos, *tmp;
> > +		struct sk_buff *skb = NULL;
> > +		struct ocelot_skb *entry;
> > +		struct ocelot_port *port;
> > +		struct timespec64 ts;
> > +		u32 val, id, txport;
> > +
> > +		/* Prevent from infinite loop */
> > +		if (unlikely(!--budget))
> > +			break;
> 
> when budget gets to 1 you break, while you still have 1 to go :)
> 
> I assume OCELOT_PTP_QUEUE_SZ > 0, just make this the loop condition and
> avoid infinite loops by design.

That's right, I'll fix it :)

> >  static int mscc_ocelot_probe(struct platform_device *pdev)
> >  {
> > -	int err, irq;
> >  	unsigned int i;
> > +	int err, irq_xtr, irq_ptp_rdy;
> >  	struct device_node *np = pdev->dev.of_node;
> >  	struct device_node *ports, *portnp;
> >  	struct ocelot *ocelot;
> 
> reverse xmas tree

Well, the xmas tree isn't there before my addition. Do you want me to
rework the whole variable definitions in this function (which is
completely unrelated to this patch)?

Thanks,
Antoine

-- 
Antoine Ténart, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com

^ permalink raw reply

* Re: [PATCH v2 bpf-next 1/4] bpf: unprivileged BPF access via /dev/bpf
From: Song Liu @ 2019-07-31  7:44 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Kees Cook, linux-security@vger.kernel.org, Networking, bpf,
	Alexei Starovoitov, Daniel Borkmann, Kernel Team, Lorenz Bauer,
	Jann Horn, Greg KH, Linux API
In-Reply-To: <CALCETrVS5FwtmTyspyg-UNoZTHes9wUNbbsvNYwQwXUUfrtaiQ@mail.gmail.com>

Hi Andy,

> On Jul 30, 2019, at 1:20 PM, Andy Lutomirski <luto@kernel.org> wrote:
> 
> On Sat, Jul 27, 2019 at 11:20 AM Song Liu <songliubraving@fb.com> wrote:
>> 
>> Hi Andy,
>> 
>>>>>> 
>>>>> 
>>>>> Well, yes. sys_bpf() is pretty powerful.
>>>>> 
>>>>> The goal of /dev/bpf is to enable special users to call sys_bpf(). In
>>>>> the meanwhile, such users should not take down the whole system easily
>>>>> by accident, e.g., with rm -rf /.
>>>> 
>>>> That’s easy, though — bpftool could learn to read /etc/bpfusers before allowing ruid != 0.
>>> 
>>> This is a great idea! fscaps + /etc/bpfusers should do the trick.
>> 
>> After some discussions and more thinking on this, I have some concerns
>> with the user space only approach.
>> 
>> IIUC, your proposal for user space only approach is like:
>> 
>> 1. bpftool (and other tools) check /etc/bpfusers and only do
>>   setuid for allowed users:
>> 
>>        int main()
>>        {
>>                if (/* uid in /etc/bpfusers */)
>>                        setuid(0);
>>                sys_bpf(...);
>>        }
>> 
>> 2. bpftool (and other tools) is installed with CAP_SETUID:
>> 
>>        setcap cap_setuid=e+p /bin/bpftool
>> 
> 
> You have this a bit backwards.  You wouldn't use CAP_SETUID.  You
> would use the setuid *mode* bit, i.e. chmod 4111 (or 4100 and use ACLs
> to further lock it down).  Or you could use setcap cap_sys_admin=p,
> although the details vary.  It woks a bit like this:
> 
> First, if you are running with elevated privilege due to SUID or
> fscaps, the kernel and glibc offer you a degree of protection: you are
> protected from ptrace(), LD_PRELOAD, etc.  You are *not* protected
> from yourself.  For example, you may be running in a context in which
> an attacker has malicious values in your environment variables, CWD,
> etc.  Do you need to carefully decide whether you are willing to run
> with elevated privilege on behalf of the user, which you learn like
> this:
> 
> uid_t real_uid = getuid();
> 
> Your decision may may depend on command-line arguments as well (i.e.
> you might want to allow tracing but not filtering, say).  Once you've
> made this decision, the details vary:
> 
> For SUID, you either continue to run with euid == 0, or you drop
> privilege using something like:
> 
> if (setresuid(real_uid, real_uid, real_uid) != 0) {
> /* optionally print an error to stderr */
> exit(1);
> }
> 
> For fscaps, if you want to be privileged, you use something like
> capng_update(); capng_apply() to set CAP_SYS_ADMIN to be effective
> when you want privilege.  If you want to be unprivileged (because
> bpfusers says so, for example), you could use capng_update() to drop
> CAP_SYS_ADMIN entirely and see if the calls still work without
> privilege.  But this is a little bit awkward, since you don't directly
> know whether the user that invoked you in the first place had
> CAP_SYS_ADMIN to begin with.
> 
> In general, SUID is a bit easier to work with.

Thanks a lot for these explanations. I learned a lot today via reading
this email and Googling. 

> 
>> This approach is not ideal, because we need to trust the tool to give
>> it CAP_SETUID. A hacked tool could easily bypass /etc/bpfusers check
>> or use other root only sys calls after setuid(0).
> 
> How?  The whole SUID mechanism is designed fairly carefully to prevent
> this.  /bin/sudo is likely to be SUID on your system, but you can't
> just "hack" it to become root.

I guess "hacked" was not the right description. I should have used 
"buggy" instead. 

SUID mechanism is great for small and solid tools, like sudo, passwd,
mount, etc. The user or sys admin could trust the tool to always do the 
right thing. On the other hand, I think SUID is not ideal for complex
tools that are under heavy development. As you mentioned, SUID doesn't 
protect against oneself. Therefore, it won't protect the system from 
buggy tool that uses SUID on something it should not. 

I think SUID is good for bpftool, because it is easy to use SUID 
correctly in bpftool. But, it is not easy to use SUID correctly in 
systemd. 

systemd and similar user space daemons use sys_bpf() to manage cgroup 
bpf programs and maps (BPF_MAP_TYPE_*CGROUP*, BPF_PROG_TYPE_CGROUP_*, 
BPF_CGROUP_*). The daemon runs in the background, and handles user 
requests to attach/detach BPF programs. If the daemon uses SUID or 
similar mechanism, it has to use euid == 0 for sys_bpf(). However, 
such daemons are usually pretty complicated. It is not easy to prove 
the daemon won't misuse euid == 0. 

Does this make sense so far? I will discuss more about cgroup in the 
next email. 

Thanks,
Song

^ permalink raw reply

* [PATCH] net: ethernet: et131x: Use GFP_KERNEL instead of GFP_ATOMIC when allocating tx_ring->tcb_ring
From: Christophe JAILLET @ 2019-07-31  7:38 UTC (permalink / raw)
  To: mark.einon, davem, willy, f.fainelli, andrew
  Cc: netdev, linux-kernel, kernel-janitors, Christophe JAILLET

There is no good reason to use GFP_ATOMIC here. Other memory allocations
are performed with GFP_KERNEL (see other 'dma_alloc_coherent()' below and
'kzalloc()' in 'et131x_rx_dma_memory_alloc()')

Use GFP_KERNEL which should be enough.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
---
 drivers/net/ethernet/agere/et131x.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/agere/et131x.c b/drivers/net/ethernet/agere/et131x.c
index e43d922f043e..174344c450af 100644
--- a/drivers/net/ethernet/agere/et131x.c
+++ b/drivers/net/ethernet/agere/et131x.c
@@ -2362,7 +2362,7 @@ static int et131x_tx_dma_memory_alloc(struct et131x_adapter *adapter)
 
 	/* Allocate memory for the TCB's (Transmit Control Block) */
 	tx_ring->tcb_ring = kcalloc(NUM_TCB, sizeof(struct tcb),
-				    GFP_ATOMIC | GFP_DMA);
+				    GFP_KERNEL | GFP_DMA);
 	if (!tx_ring->tcb_ring)
 		return -ENOMEM;
 
-- 
2.20.1


^ permalink raw reply related

* Re: KASAN: use-after-free Read in nr_rx_frame (2)
From: syzbot @ 2019-07-31  7:30 UTC (permalink / raw)
  To: davem, dvyukov, linux-hams, linux-kernel, netdev, ralf,
	syzkaller-bugs, xiyou.wangcong
In-Reply-To: <000000000000e42667058e554371@google.com>

syzbot has found a reproducer for the following crash on:

HEAD commit:    629f8205 Merge tag 'for-linus-20190730' of git://git.kerne..
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=15856062600000
kernel config:  https://syzkaller.appspot.com/x/.config?x=e397351d2615e10
dashboard link: https://syzkaller.appspot.com/bug?extid=701728447042217b67c1
compiler:       clang version 9.0.0 (/home/glider/llvm/clang  
80fee25776c2fb61e74c1ecb1a523375c2500b69)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=14a6e008600000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=11937d92600000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+701728447042217b67c1@syzkaller.appspotmail.com

==================================================================
BUG: KASAN: use-after-free in atomic_read  
include/asm-generic/atomic-instrumented.h:26 [inline]
BUG: KASAN: use-after-free in refcount_inc_not_zero_checked+0x7c/0x280  
lib/refcount.c:123
Read of size 4 at addr ffff8880893ccec0 by task swapper/0/0

CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.3.0-rc2+ #56
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Call Trace:
  <IRQ>
  __dump_stack lib/dump_stack.c:77 [inline]
  dump_stack+0x1d8/0x2f8 lib/dump_stack.c:113
  print_address_description+0x75/0x5b0 mm/kasan/report.c:351
  __kasan_report+0x14b/0x1c0 mm/kasan/report.c:482
  kasan_report+0x26/0x50 mm/kasan/common.c:612
  check_memory_region_inline mm/kasan/generic.c:182 [inline]
  check_memory_region+0x2cf/0x2e0 mm/kasan/generic.c:192
  __kasan_check_read+0x11/0x20 mm/kasan/common.c:92
  atomic_read include/asm-generic/atomic-instrumented.h:26 [inline]
  refcount_inc_not_zero_checked+0x7c/0x280 lib/refcount.c:123
  refcount_inc_checked+0x15/0x50 lib/refcount.c:156
  sock_hold include/net/sock.h:649 [inline]
  sk_add_node include/net/sock.h:701 [inline]
  nr_insert_socket net/netrom/af_netrom.c:137 [inline]
  nr_rx_frame+0x17bc/0x1e40 net/netrom/af_netrom.c:1023
  nr_loopback_timer+0x6a/0x140 net/netrom/nr_loopback.c:59
  call_timer_fn+0xec/0x200 kernel/time/timer.c:1322
  expire_timers kernel/time/timer.c:1366 [inline]
  __run_timers+0x7cd/0x9c0 kernel/time/timer.c:1685
  run_timer_softirq+0x4a/0x90 kernel/time/timer.c:1698
  __do_softirq+0x333/0x7c4 arch/x86/include/asm/paravirt.h:778
  invoke_softirq kernel/softirq.c:373 [inline]
  irq_exit+0x227/0x230 kernel/softirq.c:413
  exiting_irq arch/x86/include/asm/apic.h:537 [inline]
  smp_apic_timer_interrupt+0x113/0x280 arch/x86/kernel/apic/apic.c:1095
  apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:828
  </IRQ>
RIP: 0010:native_safe_halt+0xe/0x10 arch/x86/include/asm/irqflags.h:61
Code: 01 fa eb ae 89 d9 80 e1 07 80 c1 03 38 c1 7c ba 48 89 df e8 74 3b 01  
fa eb b0 90 90 e9 07 00 00 00 0f 00 2d d6 36 51 00 fb f4 <c3> 90 e9 07 00  
00 00 0f 00 2d c6 36 51 00 f4 c3 90 90 55 48 89 e5
RSP: 0018:ffffffff88c07cd8 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff13
RAX: 1ffffffff11950f3 RBX: ffffffff88c75a00 RCX: dffffc0000000000
RDX: 0000000000000000 RSI: ffffffff812d2b3a RDI: ffffffff87b14d9a
RBP: ffffffff88c07ce0 R08: ffffffff817d8974 R09: fffffbfff118eb41
R10: fffffbfff118eb41 R11: 0000000000000000 R12: 0000000000000000
R13: 1ffffffff118eb40 R14: dffffc0000000000 R15: dffffc0000000000
  arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:571
  default_idle_call+0x59/0xa0 kernel/sched/idle.c:94
  cpuidle_idle_call kernel/sched/idle.c:154 [inline]
  do_idle+0x180/0x780 kernel/sched/idle.c:263
  cpu_startup_entry+0x25/0x30 kernel/sched/idle.c:354
  rest_init+0x29d/0x2b0 init/main.c:451
  arch_call_rest_init+0xe/0x10
  start_kernel+0x751/0x871 init/main.c:785
  x86_64_start_reservations+0x18/0x2e arch/x86/kernel/head64.c:472
  x86_64_start_kernel+0x7a/0x7d arch/x86/kernel/head64.c:453
  secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:241

Allocated by task 0:
  save_stack mm/kasan/common.c:69 [inline]
  set_track mm/kasan/common.c:77 [inline]
  __kasan_kmalloc+0x11c/0x1b0 mm/kasan/common.c:487
  kasan_kmalloc+0x9/0x10 mm/kasan/common.c:501
  __do_kmalloc mm/slab.c:3655 [inline]
  __kmalloc+0x254/0x340 mm/slab.c:3664
  kmalloc include/linux/slab.h:557 [inline]
  sk_prot_alloc+0xb0/0x290 net/core/sock.c:1603
  sk_alloc+0x38/0x950 net/core/sock.c:1657
  nr_make_new net/netrom/af_netrom.c:476 [inline]
  nr_rx_frame+0xabc/0x1e40 net/netrom/af_netrom.c:959
  nr_loopback_timer+0x6a/0x140 net/netrom/nr_loopback.c:59
  call_timer_fn+0xec/0x200 kernel/time/timer.c:1322
  expire_timers kernel/time/timer.c:1366 [inline]
  __run_timers+0x7cd/0x9c0 kernel/time/timer.c:1685
  run_timer_softirq+0x4a/0x90 kernel/time/timer.c:1698
  __do_softirq+0x333/0x7c4 arch/x86/include/asm/paravirt.h:778

Freed by task 23150:
  save_stack mm/kasan/common.c:69 [inline]
  set_track mm/kasan/common.c:77 [inline]
  __kasan_slab_free+0x12a/0x1e0 mm/kasan/common.c:449
  kasan_slab_free+0xe/0x10 mm/kasan/common.c:457
  __cache_free mm/slab.c:3425 [inline]
  kfree+0x115/0x200 mm/slab.c:3756
  sk_prot_free net/core/sock.c:1640 [inline]
  __sk_destruct+0x567/0x660 net/core/sock.c:1726
  sk_destruct net/core/sock.c:1734 [inline]
  __sk_free+0x317/0x3e0 net/core/sock.c:1745
  sk_free net/core/sock.c:1756 [inline]
  sock_put include/net/sock.h:1725 [inline]
  sock_efree+0x60/0x80 net/core/sock.c:2042
  skb_release_head_state+0x100/0x220 net/core/skbuff.c:652
  skb_release_all net/core/skbuff.c:663 [inline]
  __kfree_skb+0x25/0x170 net/core/skbuff.c:679
  kfree_skb+0x6f/0xb0 net/core/skbuff.c:697
  nr_accept+0x4ef/0x650 net/netrom/af_netrom.c:819
  __sys_accept4+0x5bc/0x9a0 net/socket.c:1754
  __do_sys_accept net/socket.c:1795 [inline]
  __se_sys_accept net/socket.c:1792 [inline]
  __x64_sys_accept+0x7d/0x90 net/socket.c:1792
  do_syscall_64+0xfe/0x140 arch/x86/entry/common.c:296
  entry_SYSCALL_64_after_hwframe+0x49/0xbe

The buggy address belongs to the object at ffff8880893cce40
  which belongs to the cache kmalloc-2k of size 2048
The buggy address is located 128 bytes inside of
  2048-byte region [ffff8880893cce40, ffff8880893cd640)
The buggy address belongs to the page:
page:ffffea000224f300 refcount:1 mapcount:0 mapping:ffff8880aa400e00  
index:0x0 compound_mapcount: 0
flags: 0x1fffc0000010200(slab|head)
raw: 01fffc0000010200 ffffea0002924d08 ffffea00027f7288 ffff8880aa400e00
raw: 0000000000000000 ffff8880893cc5c0 0000000100000003 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
  ffff8880893ccd80: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc
  ffff8880893cce00: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb
> ffff8880893cce80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                                            ^
  ffff8880893ccf00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
  ffff8880893ccf80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
==================================================================
------------[ cut here ]------------
ODEBUG: activate not available (active state 0) object type: timer_list  
hint: nr_heartbeat_expiry+0x0/0x420 net/netrom/nr_timer.c:193
WARNING: CPU: 0 PID: 0 at lib/debugobjects.c:484 debug_print_object  
lib/debugobjects.c:481 [inline]
WARNING: CPU: 0 PID: 0 at lib/debugobjects.c:484  
debug_object_activate+0x33d/0x6f0 lib/debugobjects.c:680
Modules linked in:
CPU: 0 PID: 0 Comm: swapper/0 Tainted: G    B             5.3.0-rc2+ #56
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
RIP: 0010:debug_print_object lib/debugobjects.c:481 [inline]
RIP: 0010:debug_object_activate+0x33d/0x6f0 lib/debugobjects.c:680
Code: f7 e8 47 04 4a fe 4d 8b 06 48 c7 c7 ba 55 88 88 48 c7 c6 00 2d a1 88  
48 c7 c2 c3 68 81 88 31 c9 49 89 d9 31 c0 e8 93 6a e0 fd <0f> 0b 48 ba 00  
00 00 00 00 fc ff df ff 05 55 99 95 05 49 83 c6 20
RSP: 0018:ffff8880aea09928 EFLAGS: 00010046
RAX: ec91b22b9b388e00 RBX: ffffffff86dc4cc0 RCX: ffffffff88c75a00
RDX: 0000000000000102 RSI: 0000000000000102 RDI: 0000000000000000
RBP: ffff8880aea09970 R08: ffffffff816068e4 R09: fffffbfff119a975
R10: fffffbfff119a975 R11: 0000000000000000 R12: ffff888218471280
R13: 1ffff1104308e250 R14: ffffffff88cda040 R15: ffff8880893cd108
FS:  0000000000000000(0000) GS:ffff8880aea00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f5db7f62db8 CR3: 00000000a0178000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
  <IRQ>
  debug_timer_activate kernel/time/timer.c:710 [inline]
  __mod_timer+0x960/0x16e0 kernel/time/timer.c:1035
  mod_timer+0x1f/0x30 kernel/time/timer.c:1096
  sk_reset_timer+0x22/0x50 net/core/sock.c:2821
  nr_start_heartbeat+0x54/0x60 net/netrom/nr_timer.c:79
  nr_rx_frame+0x184c/0x1e40 net/netrom/af_netrom.c:1025
  nr_loopback_timer+0x6a/0x140 net/netrom/nr_loopback.c:59
  call_timer_fn+0xec/0x200 kernel/time/timer.c:1322
  expire_timers kernel/time/timer.c:1366 [inline]
  __run_timers+0x7cd/0x9c0 kernel/time/timer.c:1685
  run_timer_softirq+0x4a/0x90 kernel/time/timer.c:1698
  __do_softirq+0x333/0x7c4 arch/x86/include/asm/paravirt.h:778
  invoke_softirq kernel/softirq.c:373 [inline]
  irq_exit+0x227/0x230 kernel/softirq.c:413
  exiting_irq arch/x86/include/asm/apic.h:537 [inline]
  smp_apic_timer_interrupt+0x113/0x280 arch/x86/kernel/apic/apic.c:1095
  apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:828
  </IRQ>
RIP: 0010:native_safe_halt+0xe/0x10 arch/x86/include/asm/irqflags.h:61
Code: 01 fa eb ae 89 d9 80 e1 07 80 c1 03 38 c1 7c ba 48 89 df e8 74 3b 01  
fa eb b0 90 90 e9 07 00 00 00 0f 00 2d d6 36 51 00 fb f4 <c3> 90 e9 07 00  
00 00 0f 00 2d c6 36 51 00 f4 c3 90 90 55 48 89 e5
RSP: 0018:ffffffff88c07cd8 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff13
RAX: 1ffffffff11950f3 RBX: ffffffff88c75a00 RCX: dffffc0000000000
RDX: 0000000000000000 RSI: ffffffff812d2b3a RDI: ffffffff87b14d9a
RBP: ffffffff88c07ce0 R08: ffffffff817d8974 R09: fffffbfff118eb41
R10: fffffbfff118eb41 R11: 0000000000000000 R12: 0000000000000000
R13: 1ffffffff118eb40 R14: dffffc0000000000 R15: dffffc0000000000
  arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:571
  default_idle_call+0x59/0xa0 kernel/sched/idle.c:94
  cpuidle_idle_call kernel/sched/idle.c:154 [inline]
  do_idle+0x180/0x780 kernel/sched/idle.c:263
  cpu_startup_entry+0x25/0x30 kernel/sched/idle.c:354
  rest_init+0x29d/0x2b0 init/main.c:451
  arch_call_rest_init+0xe/0x10
  start_kernel+0x751/0x871 init/main.c:785
  x86_64_start_reservations+0x18/0x2e arch/x86/kernel/head64.c:472
  x86_64_start_kernel+0x7a/0x7d arch/x86/kernel/head64.c:453
  secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:241
irq event stamp: 101048
hardirqs last  enabled at (101047): [<ffffffff816b39fd>]  
tick_nohz_idle_exit+0x2bd/0x3a0 kernel/time/tick-sched.c:1180
hardirqs last disabled at (101048): [<ffffffff87b07f01>]  
__schedule+0x121/0xcd0 kernel/sched/core.c:3821
softirqs last  enabled at (100320): [<ffffffff814a5627>] invoke_softirq  
kernel/softirq.c:373 [inline]
softirqs last  enabled at (100320): [<ffffffff814a5627>]  
irq_exit+0x227/0x230 kernel/softirq.c:413
softirqs last disabled at (100307): [<ffffffff814a5627>] invoke_softirq  
kernel/softirq.c:373 [inline]
softirqs last disabled at (100307): [<ffffffff814a5627>]  
irq_exit+0x227/0x230 kernel/softirq.c:413
---[ end trace c1ceb269b6c91467 ]---


^ permalink raw reply

* Re: [PATCH v2 bpf-next 02/12] libbpf: implement BPF CO-RE offset relocation algorithm
From: Andrii Nakryiko @ 2019-07-31  6:52 UTC (permalink / raw)
  To: Song Liu
  Cc: Andrii Nakryiko, bpf, netdev@vger.kernel.org, Alexei Starovoitov,
	daniel@iogearbox.net, Yonghong Song, Kernel Team
In-Reply-To: <AA9B5489-425E-4FAE-BE01-F0F65679DF00@fb.com>

On Tue, Jul 30, 2019 at 10:19 PM Song Liu <songliubraving@fb.com> wrote:
>
>
>
> > On Jul 30, 2019, at 6:00 PM, Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
> >
> > On Tue, Jul 30, 2019 at 5:39 PM Song Liu <songliubraving@fb.com> wrote:
> >>
> >>
> >>
> >>> On Jul 30, 2019, at 12:53 PM, Andrii Nakryiko <andriin@fb.com> wrote:
> >>>
> >>> This patch implements the core logic for BPF CO-RE offsets relocations.
> >>> Every instruction that needs to be relocated has corresponding
> >>> bpf_offset_reloc as part of BTF.ext. Relocations are performed by trying
> >>> to match recorded "local" relocation spec against potentially many
> >>> compatible "target" types, creating corresponding spec. Details of the
> >>> algorithm are noted in corresponding comments in the code.
> >>>
> >>> Signed-off-by: Andrii Nakryiko <andriin@fb.com>
> >>> ---
> >>> tools/lib/bpf/libbpf.c | 915 ++++++++++++++++++++++++++++++++++++++++-
> >>> tools/lib/bpf/libbpf.h |   1 +
> >>> 2 files changed, 909 insertions(+), 7 deletions(-)
> >>>
> >>> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> >
> > [...]
> >
> > Please trim irrelevant parts. It doesn't matter with desktop Gmail,
> > but pretty much everywhere else is very hard to work with.
>
> This won't be a problem if the patch is shorter. ;)
>
> >
> >>> +
> >>> +     for (i = 1; i < spec->raw_len; i++) {
> >>> +             t = skip_mods_and_typedefs(btf, id, &id);
> >>> +             if (!t)
> >>> +                     return -EINVAL;
> >>> +
> >>> +             access_idx = spec->raw_spec[i];
> >>> +
> >>> +             if (btf_is_composite(t)) {
> >>> +                     const struct btf_member *m = (void *)(t + 1);
> >>
> >> Why (void *) instead of (const struct btf_member *)? There are a few more
> >> in the rest of the patch.
> >>
> >
> > I just picked the most succinct and non-repetitive form. It's
> > immediately apparent which type it's implicitly converted to, so I
> > felt there is no need to repeat it. Also, just (void *) is much
> > shorter. :)
>
> _All_ other code in btf.c converts the pointer to the target type.

Most in libbpf.c doesn't, though. Also, I try to preserve pointer
constness for uses that don't modify BTF types (pretty much all of
them in libbpf), so it becomes really verbose, despite extremely short
variable names:

const struct btf_member *m = (const struct btf_member *)(t + 1);

Add one or two levels of nestedness and you are wrapping this line.

> In some cases, it is not apparent which type it is converted to,
> for example:
>
> +       m = (void *)(targ_type + 1);
>
> I would suggest we do implicit conversion whenever possible.

Implicit conversion (`m = targ_type + 1;`) is a compilation error,
that won't work.

>
> Thanks,
> Song

^ permalink raw reply

* [PATCH net] drop_monitor: Add missing uAPI file to MAINTAINERS file
From: Ido Schimmel @ 2019-07-31  6:38 UTC (permalink / raw)
  To: netdev; +Cc: davem, nhorman, Ido Schimmel

From: Ido Schimmel <idosch@mellanox.com>

Fixes: 6e43650cee64 ("add maintainer for network drop monitor kernel service")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 9f5b8bd4faf9..b540794cbd91 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11137,6 +11137,7 @@ L:	netdev@vger.kernel.org
 S:	Maintained
 W:	https://fedorahosted.org/dropwatch/
 F:	net/core/drop_monitor.c
+F:	include/uapi/linux/net_dropmon.h
 
 NETWORKING DRIVERS
 M:	"David S. Miller" <davem@davemloft.net>
-- 
2.21.0


^ permalink raw reply related

* Re: KASAN: use-after-free Read in release_sock
From: syzbot @ 2019-07-31  6:38 UTC (permalink / raw)
  To: davem, linux-hams, linux-kernel, netdev, ralf, syzkaller-bugs
In-Reply-To: <000000000000de000a058e9025db@google.com>

syzbot has found a reproducer for the following crash on:

HEAD commit:    629f8205 Merge tag 'for-linus-20190730' of git://git.kerne..
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=11a27744600000
kernel config:  https://syzkaller.appspot.com/x/.config?x=4c7b914a2680c9c6
dashboard link: https://syzkaller.appspot.com/bug?extid=107429d62fb1d293602f
compiler:       gcc (GCC) 9.0.0 20181231 (experimental)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=16a7c8dc600000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=1498cfa2600000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+107429d62fb1d293602f@syzkaller.appspotmail.com

==================================================================
BUG: KASAN: use-after-free in debug_spin_lock_before  
kernel/locking/spinlock_debug.c:83 [inline]
BUG: KASAN: use-after-free in do_raw_spin_lock+0x28a/0x2e0  
kernel/locking/spinlock_debug.c:112
Read of size 4 at addr ffff8880a81b540c by task syz-executor143/10205

CPU: 1 PID: 10205 Comm: syz-executor143 Not tainted 5.3.0-rc2+ #90
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Call Trace:
  __dump_stack lib/dump_stack.c:77 [inline]
  dump_stack+0x172/0x1f0 lib/dump_stack.c:113
  print_address_description.cold+0xd4/0x306 mm/kasan/report.c:351
  __kasan_report.cold+0x1b/0x36 mm/kasan/report.c:482
  kasan_report+0x12/0x17 mm/kasan/common.c:612
  __asan_report_load4_noabort+0x14/0x20 mm/kasan/generic_report.c:131
  debug_spin_lock_before kernel/locking/spinlock_debug.c:83 [inline]
  do_raw_spin_lock+0x28a/0x2e0 kernel/locking/spinlock_debug.c:112
  __raw_spin_lock_bh include/linux/spinlock_api_smp.h:136 [inline]
  _raw_spin_lock_bh+0x3b/0x50 kernel/locking/spinlock.c:175
  spin_lock_bh include/linux/spinlock.h:343 [inline]
  release_sock+0x20/0x1c0 net/core/sock.c:2932
  nr_release+0x303/0x3e0 net/netrom/af_netrom.c:553
  __sock_release+0xce/0x280 net/socket.c:590
  sock_close+0x1e/0x30 net/socket.c:1268
  __fput+0x2ff/0x890 fs/file_table.c:280
  ____fput+0x16/0x20 fs/file_table.c:313
  task_work_run+0x145/0x1c0 kernel/task_work.c:113
  exit_task_work include/linux/task_work.h:22 [inline]
  do_exit+0x92f/0x2e50 kernel/exit.c:879
  do_group_exit+0x135/0x360 kernel/exit.c:983
  get_signal+0x47c/0x2500 kernel/signal.c:2729
  do_signal+0x87/0x1700 arch/x86/kernel/signal.c:815
  exit_to_usermode_loop+0x286/0x380 arch/x86/entry/common.c:159
  prepare_exit_to_usermode arch/x86/entry/common.c:194 [inline]
  syscall_return_slowpath arch/x86/entry/common.c:274 [inline]
  do_syscall_64+0x5a9/0x6a0 arch/x86/entry/common.c:299
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x447d09
Code: Bad RIP value.
RSP: 002b:00007fca48b63db8 EFLAGS: 00000246 ORIG_RAX: 000000000000002b
RAX: fffffffffffffe00 RBX: 00000000006ddc58 RCX: 0000000000447d09
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000007
RBP: 00000000006ddc50 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00000000006ddc5c
R13: 00007ffe913f3ebf R14: 00007fca48b649c0 R15: 0000000000000001

Allocated by task 0:
  save_stack+0x23/0x90 mm/kasan/common.c:69
  set_track mm/kasan/common.c:77 [inline]
  __kasan_kmalloc mm/kasan/common.c:487 [inline]
  __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:460
  kasan_kmalloc+0x9/0x10 mm/kasan/common.c:501
  __do_kmalloc mm/slab.c:3655 [inline]
  __kmalloc+0x163/0x770 mm/slab.c:3664
  kmalloc include/linux/slab.h:557 [inline]
  sk_prot_alloc+0x23a/0x310 net/core/sock.c:1603
  sk_alloc+0x39/0xf70 net/core/sock.c:1657
  nr_make_new net/netrom/af_netrom.c:476 [inline]
  nr_rx_frame+0x733/0x1e73 net/netrom/af_netrom.c:959
  nr_loopback_timer+0x7b/0x170 net/netrom/nr_loopback.c:59
  call_timer_fn+0x1ac/0x780 kernel/time/timer.c:1322
  expire_timers kernel/time/timer.c:1366 [inline]
  __run_timers kernel/time/timer.c:1685 [inline]
  __run_timers kernel/time/timer.c:1653 [inline]
  run_timer_softirq+0x697/0x17a0 kernel/time/timer.c:1698
  __do_softirq+0x262/0x98c kernel/softirq.c:292

Freed by task 10205:
  save_stack+0x23/0x90 mm/kasan/common.c:69
  set_track mm/kasan/common.c:77 [inline]
  __kasan_slab_free+0x102/0x150 mm/kasan/common.c:449
  kasan_slab_free+0xe/0x10 mm/kasan/common.c:457
  __cache_free mm/slab.c:3425 [inline]
  kfree+0x10a/0x2c0 mm/slab.c:3756
  sk_prot_free net/core/sock.c:1640 [inline]
  __sk_destruct+0x4f7/0x6e0 net/core/sock.c:1726
  sk_destruct+0x86/0xa0 net/core/sock.c:1734
  __sk_free+0xfb/0x360 net/core/sock.c:1745
  sk_free+0x42/0x50 net/core/sock.c:1756
  sock_put include/net/sock.h:1725 [inline]
  nr_destroy_socket+0x3ea/0x4a0 net/netrom/af_netrom.c:288
  nr_release+0x347/0x3e0 net/netrom/af_netrom.c:530
  __sock_release+0xce/0x280 net/socket.c:590
  sock_close+0x1e/0x30 net/socket.c:1268
  __fput+0x2ff/0x890 fs/file_table.c:280
  ____fput+0x16/0x20 fs/file_table.c:313
  task_work_run+0x145/0x1c0 kernel/task_work.c:113
  exit_task_work include/linux/task_work.h:22 [inline]
  do_exit+0x92f/0x2e50 kernel/exit.c:879
  do_group_exit+0x135/0x360 kernel/exit.c:983
  get_signal+0x47c/0x2500 kernel/signal.c:2729
  do_signal+0x87/0x1700 arch/x86/kernel/signal.c:815
  exit_to_usermode_loop+0x286/0x380 arch/x86/entry/common.c:159
  prepare_exit_to_usermode arch/x86/entry/common.c:194 [inline]
  syscall_return_slowpath arch/x86/entry/common.c:274 [inline]
  do_syscall_64+0x5a9/0x6a0 arch/x86/entry/common.c:299
  entry_SYSCALL_64_after_hwframe+0x49/0xbe

The buggy address belongs to the object at ffff8880a81b5380
  which belongs to the cache kmalloc-2k of size 2048
The buggy address is located 140 bytes inside of
  2048-byte region [ffff8880a81b5380, ffff8880a81b5b80)
The buggy address belongs to the page:
page:ffffea0002a06d00 refcount:1 mapcount:0 mapping:ffff8880aa400e00  
index:0x0 compound_mapcount: 0
flags: 0x1fffc0000010200(slab|head)
raw: 01fffc0000010200 ffffea0002a0ec88 ffffea0002a07708 ffff8880aa400e00
raw: 0000000000000000 ffff8880a81b4280 0000000100000003 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
  ffff8880a81b5300: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
  ffff8880a81b5380: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> ffff8880a81b5400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                       ^
  ffff8880a81b5480: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
  ffff8880a81b5500: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
==================================================================

^ permalink raw reply

* [PATCH net 2/2] mlxsw: spectrum_buffers: Further reduce pool size on Spectrum-2
From: Ido Schimmel @ 2019-07-31  6:33 UTC (permalink / raw)
  To: netdev; +Cc: davem, jiri, petrm, mlxsw, Ido Schimmel
In-Reply-To: <20190731063315.9381-1-idosch@idosch.org>

From: Petr Machata <petrm@mellanox.com>

In commit e891ce1dd2a5 ("mlxsw: spectrum_buffers: Reduce pool size on
Spectrum-2"), pool size was reduced to mitigate a problem in port buffer
usage of ports split four ways. It turns out that this work around does not
solve the issue, and a further reduction is required.

Thus reduce the size of pool 0 by another 2.7 MiB, and round down to the
whole number of cells.

Fixes: e891ce1dd2a5 ("mlxsw: spectrum_buffers: Reduce pool size on Spectrum-2")
Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c
index 1537f70bc26d..888ba4300bcc 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c
@@ -437,8 +437,8 @@ static const struct mlxsw_sp_sb_pr mlxsw_sp1_sb_prs[] = {
 			   MLXSW_SP1_SB_PR_CPU_SIZE, true, false),
 };
 
-#define MLXSW_SP2_SB_PR_INGRESS_SIZE	38128752
-#define MLXSW_SP2_SB_PR_EGRESS_SIZE	38128752
+#define MLXSW_SP2_SB_PR_INGRESS_SIZE	35297568
+#define MLXSW_SP2_SB_PR_EGRESS_SIZE	35297568
 #define MLXSW_SP2_SB_PR_CPU_SIZE	(256 * 1000)
 
 /* Order according to mlxsw_sp2_sb_pool_dess */
-- 
2.21.0


^ permalink raw reply related

* [PATCH net 1/2] mlxsw: spectrum: Fix error path in mlxsw_sp_module_init()
From: Ido Schimmel @ 2019-07-31  6:33 UTC (permalink / raw)
  To: netdev; +Cc: davem, jiri, petrm, mlxsw, Ido Schimmel
In-Reply-To: <20190731063315.9381-1-idosch@idosch.org>

From: Jiri Pirko <jiri@mellanox.com>

In case of sp2 pci driver registration fail, fix the error path to
start with sp1 pci driver unregister.

Fixes: c3ab435466d5 ("mlxsw: spectrum: Extend to support Spectrum-2 ASIC")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 650638152bbc..eda9c23e87b2 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -6330,7 +6330,7 @@ static int __init mlxsw_sp_module_init(void)
 	return 0;
 
 err_sp2_pci_driver_register:
-	mlxsw_pci_driver_unregister(&mlxsw_sp2_pci_driver);
+	mlxsw_pci_driver_unregister(&mlxsw_sp1_pci_driver);
 err_sp1_pci_driver_register:
 	mlxsw_core_driver_unregister(&mlxsw_sp2_driver);
 err_sp2_core_driver_register:
-- 
2.21.0


^ permalink raw reply related

* [PATCH net 0/2] mlxsw: Two small fixes
From: Ido Schimmel @ 2019-07-31  6:33 UTC (permalink / raw)
  To: netdev; +Cc: davem, jiri, petrm, mlxsw, Ido Schimmel

From: Ido Schimmel <idosch@mellanox.com>

Patch #1 from Jiri fixes the error path of the module initialization
function. Found during manual code inspection.

Patch #2 from Petr further reduces the default shared buffer pool sizes
in order to work around a problem that was originally described in
commit e891ce1dd2a5 ("mlxsw: spectrum_buffers: Reduce pool size on
Spectrum-2").

Jiri Pirko (1):
  mlxsw: spectrum: Fix error path in mlxsw_sp_module_init()

Petr Machata (1):
  mlxsw: spectrum_buffers: Further reduce pool size on Spectrum-2

 drivers/net/ethernet/mellanox/mlxsw/spectrum.c         | 2 +-
 drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

-- 
2.21.0

^ permalink raw reply

* Re: [PATCH v4 3/3] net/xdp: convert put_page() to put_user_page*()
From: Björn Töpel @ 2019-07-31  6:08 UTC (permalink / raw)
  To: john.hubbard, Andrew Morton
  Cc: Al Viro, Christian Benvenuti, Christoph Hellwig, Dan Williams,
	Darrick J . Wong, Dave Chinner, Ira Weiny, Jan Kara,
	Jason Gunthorpe, Jens Axboe, Jerome Glisse, Kirill A . Shutemov,
	Matthew Wilcox, Michal Hocko, Mike Marciniszyn, Mike Rapoport,
	linux-block, linux-fsdevel, linux-mm, linux-xfs, LKML,
	John Hubbard, Magnus Karlsson, David S . Miller, netdev
In-Reply-To: <20190730205705.9018-4-jhubbard@nvidia.com>

On 2019-07-30 22:57, john.hubbard@gmail.com wrote:
> From: John Hubbard <jhubbard@nvidia.com>
> 
> For pages that were retained via get_user_pages*(), release those pages
> via the new put_user_page*() routines, instead of via put_page() or
> release_pages().
> 
> This is part a tree-wide conversion, as described in commit fc1d8e7cca2d
> ("mm: introduce put_user_page*(), placeholder versions").
> 
> Cc: Björn Töpel <bjorn.topel@intel.com>
> Cc: Magnus Karlsson <magnus.karlsson@intel.com>
> Cc: David S. Miller <davem@davemloft.net>
> Cc: netdev@vger.kernel.org
> Signed-off-by: John Hubbard <jhubbard@nvidia.com>

Acked-by: Björn Töpel <bjorn.topel@intel.com>

> ---
>   net/xdp/xdp_umem.c | 9 +--------
>   1 file changed, 1 insertion(+), 8 deletions(-)
> 
> diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> index 83de74ca729a..17c4b3d3dc34 100644
> --- a/net/xdp/xdp_umem.c
> +++ b/net/xdp/xdp_umem.c
> @@ -166,14 +166,7 @@ void xdp_umem_clear_dev(struct xdp_umem *umem)
>   
>   static void xdp_umem_unpin_pages(struct xdp_umem *umem)
>   {
> -	unsigned int i;
> -
> -	for (i = 0; i < umem->npgs; i++) {
> -		struct page *page = umem->pgs[i];
> -
> -		set_page_dirty_lock(page);
> -		put_page(page);
> -	}
> +	put_user_pages_dirty_lock(umem->pgs, umem->npgs, true);
>   
>   	kfree(umem->pgs);
>   	umem->pgs = NULL;
> 

^ permalink raw reply

* Re: [PATCH net-next 1/2] net: phy: broadcom: set features explicitly for BCM54616S
From: Tao Ren @ 2019-07-31  6:00 UTC (permalink / raw)
  To: Heiner Kallweit, Andrew Lunn
  Cc: Florian Fainelli, David S . Miller, Arun Parameswaran,
	Justin Chen, Vladimir Oltean, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, Andrew Jeffery,
	openbmc@lists.ozlabs.org
In-Reply-To: <41c1f898-aee8-d73a-386d-c3ce280c5a1b@gmail.com>

On 7/30/19 10:53 PM, Heiner Kallweit wrote:
> On 31.07.2019 02:12, Tao Ren wrote:
>> On 7/29/19 11:00 PM, Heiner Kallweit wrote:
>>> On 30.07.2019 07:05, Tao Ren wrote:
>>>> On 7/29/19 8:35 PM, Andrew Lunn wrote:
>>>>> On Mon, Jul 29, 2019 at 05:25:32PM -0700, Tao Ren wrote:
>>>>>> BCM54616S feature "PHY_GBIT_FEATURES" was removed by commit dcdecdcfe1fc
>>>>>> ("net: phy: switch drivers to use dynamic feature detection"). As dynamic
>>>>>> feature detection doesn't work when BCM54616S is working in RGMII-Fiber
>>>>>> mode (different sets of MII Control/Status registers being used), let's
>>>>>> set "PHY_GBIT_FEATURES" for BCM54616S explicitly.
>>>>>
>>>>> Hi Tao
>>>>>
>>>>> What exactly does it get wrong?
>>>>>
>>>>>      Thanks
>>>>> 	Andrew
>>>>
>>>> Hi Andrew,
>>>>
>>>> BCM54616S is set to RGMII-Fiber (1000Base-X) mode on my platform, and none of the features (1000BaseT/100BaseT/10BaseT) can be detected by genphy_read_abilities(), because the PHY only reports 1000BaseX_Full|Half ability in this mode.
>>>>
>>> Are you going to use the PHY in copper or fibre mode?
>>> In case you use fibre mode, why do you need the copper modes set as supported?
>>> Or does the PHY just start in fibre mode and you want to switch it to copper mode?
>>
>> Hi Heiner,
>>
>> The phy starts in fiber mode and that's the mode I want.
>> My observation is: phydev->link is always 0 (Link status bit is never set in MII_BMSR) by using dynamic ability detection on my machine. I checked phydev->supported and it's set to "AutoNeg | TP | MII | Pause | Asym_Pause" by dynamic ability detection. Is it normal/expected? Or maybe the fix should go to different places? Thank you for your help.
>>
> 
> Not sure whether you stated already which kernel version you're using.
> There's a brand-new extension to auto-detect 1000BaseX:
> f30e33bcdab9 ("net: phy: Add more 1000BaseX support detection")
> It's included in the 5.3-rc series.

I'm running kernel 5.2.0. Thank you for the sharing and I didn't know the patch. Let me check it out.

> If a feature can be read from a vendor-specific register only,
> then the preferred way is: Implement callback get_features in
> the PHY driver, call genphy_read_abilities for the basic features
> and complement it with reading the vendor-specific register(s).

Got it. Let me update my code and will come back soon.


Thanks,

Tao

^ permalink raw reply

* Re: [RFC] net: phy: read link status twice when phy_check_link_status()
From: liuyonglong @ 2019-07-31  5:58 UTC (permalink / raw)
  To: Heiner Kallweit, andrew, davem
  Cc: netdev, linux-kernel, linuxarm, salil.mehta, yisen.zhuang,
	shiju.jose
In-Reply-To: <5634113b-f5b5-6fa8-851d-1402e046c3df@gmail.com>



On 2019/7/31 13:44, Heiner Kallweit wrote:
> On 31.07.2019 05:33, liuyonglong wrote:
>>
>>
>> On 2019/7/31 3:04, Heiner Kallweit wrote:
>>> On 30.07.2019 08:35, liuyonglong wrote:
>>>> :/sys/kernel/debug/tracing$ cat trace
>>>> # tracer: nop
>>>> #
>>>> # entries-in-buffer/entries-written: 45/45   #P:128
>>>> #
>>>> #                              _-----=> irqs-off
>>>> #                             / _----=> need-resched
>>>> #                            | / _---=> hardirq/softirq
>>>> #                            || / _--=> preempt-depth
>>>> #                            ||| /     delay
>>>> #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
>>>> #              | |       |   ||||       |         |
>>>>     kworker/64:2-1028  [064] ....   172.295687: mdio_access: mii-0000:bd:00.0 read  phy:0x01 reg:0x02 val:0x001c
>>>>     kworker/64:2-1028  [064] ....   172.295726: mdio_access: mii-0000:bd:00.0 read  phy:0x01 reg:0x03 val:0xc916
>>>>     kworker/64:2-1028  [064] ....   172.296902: mdio_access: mii-0000:bd:00.0 read  phy:0x01 reg:0x01 val:0x79ad
>>>>     kworker/64:2-1028  [064] ....   172.296938: mdio_access: mii-0000:bd:00.0 read  phy:0x01 reg:0x0f val:0x2000
>>>>     kworker/64:2-1028  [064] ....   172.321213: mdio_access: mii-0000:bd:00.0 read  phy:0x01 reg:0x00 val:0x1040
>>>>     kworker/64:2-1028  [064] ....   172.343209: mdio_access: mii-0000:bd:00.1 read  phy:0x03 reg:0x02 val:0x001c
>>>>     kworker/64:2-1028  [064] ....   172.343245: mdio_access: mii-0000:bd:00.1 read  phy:0x03 reg:0x03 val:0xc916
>>>>     kworker/64:2-1028  [064] ....   172.343882: mdio_access: mii-0000:bd:00.1 read  phy:0x03 reg:0x01 val:0x79ad
>>>>     kworker/64:2-1028  [064] ....   172.343918: mdio_access: mii-0000:bd:00.1 read  phy:0x03 reg:0x0f val:0x2000
>>>>     kworker/64:2-1028  [064] ....   172.362658: mdio_access: mii-0000:bd:00.1 read  phy:0x03 reg:0x00 val:0x1040
>>>>     kworker/64:2-1028  [064] ....   172.385961: mdio_access: mii-0000:bd:00.2 read  phy:0x05 reg:0x02 val:0x001c
>>>>     kworker/64:2-1028  [064] ....   172.385996: mdio_access: mii-0000:bd:00.2 read  phy:0x05 reg:0x03 val:0xc916
>>>>     kworker/64:2-1028  [064] ....   172.386646: mdio_access: mii-0000:bd:00.2 read  phy:0x05 reg:0x01 val:0x79ad
>>>>     kworker/64:2-1028  [064] ....   172.386681: mdio_access: mii-0000:bd:00.2 read  phy:0x05 reg:0x0f val:0x2000
>>>>     kworker/64:2-1028  [064] ....   172.411286: mdio_access: mii-0000:bd:00.2 read  phy:0x05 reg:0x00 val:0x1040
>>>>     kworker/64:2-1028  [064] ....   172.433225: mdio_access: mii-0000:bd:00.3 read  phy:0x07 reg:0x02 val:0x001c
>>>>     kworker/64:2-1028  [064] ....   172.433260: mdio_access: mii-0000:bd:00.3 read  phy:0x07 reg:0x03 val:0xc916
>>>>     kworker/64:2-1028  [064] ....   172.433887: mdio_access: mii-0000:bd:00.3 read  phy:0x07 reg:0x01 val:0x79ad
>>>>     kworker/64:2-1028  [064] ....   172.433922: mdio_access: mii-0000:bd:00.3 read  phy:0x07 reg:0x0f val:0x2000
>>>>     kworker/64:2-1028  [064] ....   172.452862: mdio_access: mii-0000:bd:00.3 read  phy:0x07 reg:0x00 val:0x1040
>>>>         ifconfig-1324  [011] ....   177.325585: mdio_access: mii-0000:bd:00.1 read  phy:0x03 reg:0x00 val:0x1040
>>>>   kworker/u257:0-8     [012] ....   177.325642: mdio_access: mii-0000:bd:00.1 read  phy:0x03 reg:0x04 val:0x01e1
>>>>   kworker/u257:0-8     [012] ....   177.325654: mdio_access: mii-0000:bd:00.1 write phy:0x03 reg:0x04 val:0x05e1
>>>>   kworker/u257:0-8     [012] ....   177.325708: mdio_access: mii-0000:bd:00.1 read  phy:0x03 reg:0x01 val:0x79ad
>>>>   kworker/u257:0-8     [012] ....   177.325744: mdio_access: mii-0000:bd:00.1 read  phy:0x03 reg:0x09 val:0x0200
>>>>   kworker/u257:0-8     [012] ....   177.325779: mdio_access: mii-0000:bd:00.1 read  phy:0x03 reg:0x00 val:0x1040
>>>>   kworker/u257:0-8     [012] ....   177.325788: mdio_access: mii-0000:bd:00.1 write phy:0x03 reg:0x00 val:0x1240
>>>>   kworker/u257:0-8     [012] ....   177.325843: mdio_access: mii-0000:bd:00.1 read  phy:0x03 reg:0x01 val:0x798d
>>>
>>> What I think that happens here:
>>> Writing 0x1240 to BMCR starts aneg. When reading BMSR immediately after that then the PHY seems to have cleared
>>> the "aneg complete" bit already, but not yet the "link up" bit. This results in the false "link up" notification.
>>> The following patch is based on the fact that in case of enabled aneg we can't have a valid link if aneg isn't
>>> finished. Could you please test whether this works for you?
>>>
>>> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
>>> index 6b5cb87f3..7ddd91df9 100644
>>> --- a/drivers/net/phy/phy_device.c
>>> +++ b/drivers/net/phy/phy_device.c
>>> @@ -1774,6 +1774,12 @@ int genphy_update_link(struct phy_device *phydev)
>>>  	phydev->link = status & BMSR_LSTATUS ? 1 : 0;
>>>  	phydev->autoneg_complete = status & BMSR_ANEGCOMPLETE ? 1 : 0;
>>>  
>>> +	/* Consider the case that autoneg was started and "aneg complete"
>>> +	 * bit has been reset, but "link up" bit not yet.
>>> +	 */
>>> +	if (phydev->autoneg == AUTONEG_ENABLE && !phydev->autoneg_complete)
>>> +		phydev->link = 0;
>>> +
>>>  	return 0;
>>>  }
>>>  EXPORT_SYMBOL(genphy_update_link);
>>>
>>
>> This patch can solve the issue! Will it be upstream?
>>
> I'll check for side effects, but in general: yes.
> 
>> So it's nothing to do with the bios, and just the PHY's own behavior,
>> the "link up" bit can not reset immediately，right?
>>
> Yes, it's the PHY's own behavior, and to a certain extent it may depend on speed
> of the MDIO bus. At least few network chips require a delay of several microseconds
> after each MDIO bus access. This may be sufficient for the PHY to reset the
> link-up bit in time.
> 
>> ps: It will take 1 second more to make the link up for RTL8211F when 0x798d happend.
>>
> In polling mode link-up is detected up to 1s after it happened.
> You could switch to interrupt mode to reduce the aneg time.
> 
>>
>>
> Heiner
> 
> .
> 

Thanks very much!


^ permalink raw reply

* [PATCH] can: flexcan: add LPSR mode support for i.MX7D
From: Joakim Zhang @ 2019-07-31  5:56 UTC (permalink / raw)
  To: mkl@pengutronix.de, linux-can@vger.kernel.org
  Cc: wg@grandegger.com, netdev@vger.kernel.org, dl-linux-imx,
	Joakim Zhang

For i.MX7D LPSR mode, the controller will lost power and got the
configuration state lost after system resume back. (coming i.MX8QM/QXP
will also completely power off the domain, the controller state will be
lost and needs restore).
So we need to set pinctrl state again and re-start chip to do
re-configuration after resume.

For wakeup case, it should not set pinctrl to sleep state by
pinctrl_pm_select_sleep_state.
For interface is not up before suspend case, we don't need
re-configure as it will be configured by user later by interface up.

Signed-off-by: Joakim Zhang <qiangqing.zhang@nxp.com>
---
 drivers/net/can/flexcan.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c
index c21b3507123e..228d07e84ddc 100644
--- a/drivers/net/can/flexcan.c
+++ b/drivers/net/can/flexcan.c
@@ -26,6 +26,7 @@
 #include <linux/platform_device.h>
 #include <linux/pm_runtime.h>
 #include <linux/regulator/consumer.h>
+#include <linux/pinctrl/consumer.h>
 #include <linux/regmap.h>
 
 #define DRV_NAME			"flexcan"
@@ -1889,7 +1890,7 @@ static int __maybe_unused flexcan_suspend(struct device *device)
 {
 	struct net_device *dev = dev_get_drvdata(device);
 	struct flexcan_priv *priv = netdev_priv(dev);
-	int err = 0;
+	int err;
 
 	if (netif_running(dev)) {
 		/* if wakeup is enabled, enter stop mode
@@ -1899,25 +1900,27 @@ static int __maybe_unused flexcan_suspend(struct device *device)
 			enable_irq_wake(dev->irq);
 			flexcan_enter_stop_mode(priv);
 		} else {
-			err = flexcan_chip_disable(priv);
+			flexcan_chip_stop(dev);
+
+			err = pm_runtime_force_suspend(device);
 			if (err)
 				return err;
 
-			err = pm_runtime_force_suspend(device);
+			pinctrl_pm_select_sleep_state(device);
 		}
 		netif_stop_queue(dev);
 		netif_device_detach(dev);
 	}
 	priv->can.state = CAN_STATE_SLEEPING;
 
-	return err;
+	return 0;
 }
 
 static int __maybe_unused flexcan_resume(struct device *device)
 {
 	struct net_device *dev = dev_get_drvdata(device);
 	struct flexcan_priv *priv = netdev_priv(dev);
-	int err = 0;
+	int err;
 
 	priv->can.state = CAN_STATE_ERROR_ACTIVE;
 	if (netif_running(dev)) {
@@ -1926,15 +1929,19 @@ static int __maybe_unused flexcan_resume(struct device *device)
 		if (device_may_wakeup(device)) {
 			disable_irq_wake(dev->irq);
 		} else {
+			pinctrl_pm_select_default_state(device);
+
 			err = pm_runtime_force_resume(device);
 			if (err)
 				return err;
 
-			err = flexcan_chip_enable(priv);
+			err = flexcan_chip_start(dev);
+			if (err)
+				return err;
 		}
 	}
 
-	return err;
+	return 0;
 }
 
 static int __maybe_unused flexcan_runtime_suspend(struct device *device)
-- 
2.17.1


^ permalink raw reply related

* Re: [PATCH net-next 2/2] net: phy: broadcom: add 1000Base-X support for BCM54616S
From: Tao Ren @ 2019-07-31  5:55 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Vladimir Oltean, Florian Fainelli, Heiner Kallweit,
	David S . Miller, Arun Parameswaran, Justin Chen, netdev, lkml,
	Andrew Jeffery, openbmc@lists.ozlabs.org
In-Reply-To: <20190731023417.GD9523@lunn.ch>

On 7/30/19 7:34 PM, Andrew Lunn wrote:
>> Hi Andrew,
>>
>> The BCM54616S PHY on my machine is connected to a BCM5396 switch chip over backplane (1000Base-KX).
> 
> Ah, that is different. So the board is using it for RGMII to 1000Base-KX?
> 
> phy-mode is about the MAC-PHY link. So in this case RGMII.

Yes. It's RGMII to 1000Base-KX.

> There is no DT way to configure the PHY-Switch link. However, it
> sounds like you have the PHY strapped so it is doing 1000BaseX on the
> PHY-Switch link. So do you actually need to configure this?

The PHY is strapped in RGMII-Fiber Mode (the term used in datasheet), but besides 1000BaseX, 100Base-FX is also supported in this mode.
The datasheet doesn't say which link type (1000BaseX or 100Base-FX) is active after reset and I cannot find a way to auto-detect the link type, either.

Below are a few more details about 1000Base-X and 100Base-FX in BCM54616S datasheet.

- 1000BaseX: 
  The 1000BaseX register set (MII registers 00-0F) needs to be enabled by setting bit 0 of Mode Control Register.
  Although the register address is the same between 1000BaseX and 1000BaseT/100BaseT/10BaseT registers, some bit fields in 1000BaseX registers are different: for example, speed field is not available in 1000BaseX status register.

- 100Base-FX:
  100Base-FX registers need to be enabled by writing 1 to SerDes 100FX Control Register.
  100Base-FX Control and Status registers are in shadow register 1Ch, instead of MII register 00h and 01h.

> You report you never see link up? So maybe the problem is actually in
> read_status? When in 1000BaseX mode, do you need to read the link
> status from some other register? The Marvell PHYs use a second page
> for 1000BaseX.

read_status callback needs to be updated to report correct link speed in my case. But as I cannot tell which link type (1000BaseX or 100Base-FX) is active, it becomes hard to access registers in read_status method. Any suggestions?

BTW, link-never-up issue seems to be caused by static/dynamic feature detection. I'm still tracing down the issue and it's being tracked in patch #1 of the series.

Thanks,

Tao

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox