Netdev List

Netdev List
 help / color / mirror / Atom feed

* [for-next V3 08/11] net/mlx5e: E-Switch, Move send-to-vport rule struct to en_rep
From: Saeed Mahameed @ 2017-12-28 23:23 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev, linux-rdma, Leon Romanovsky, Mark Bloch, Saeed Mahameed
In-Reply-To: <20171228232314.13678-1-saeedm@mellanox.com>

From: Mark Bloch <markb@mellanox.com>

Move struct mlx5_esw_sq, which keeps the send-to-vport rule, from the
eswitch code to mlx5e and rename it to better reflect where it belongs.

Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c  | 22 +++++++++++-----------
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.h  |  5 +++++
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h |  5 -----
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 5b2b673c0b13..c6a77f8e99a4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -193,17 +193,17 @@ int mlx5e_attr_get(struct net_device *dev, struct switchdev_attr *attr)
 static void mlx5e_sqs2vport_stop(struct mlx5_eswitch *esw,
 				 struct mlx5_eswitch_rep *rep)
 {
-	struct mlx5_esw_sq *esw_sq, *tmp;
+	struct mlx5e_rep_sq *rep_sq, *tmp;
 	struct mlx5e_rep_priv *rpriv;
 
 	if (esw->mode != SRIOV_OFFLOADS)
 		return;
 
 	rpriv = mlx5e_rep_to_rep_priv(rep);
-	list_for_each_entry_safe(esw_sq, tmp, &rpriv->vport_sqs_list, list) {
-		mlx5_eswitch_del_send_to_vport_rule(esw_sq->send_to_vport_rule);
-		list_del(&esw_sq->list);
-		kfree(esw_sq);
+	list_for_each_entry_safe(rep_sq, tmp, &rpriv->vport_sqs_list, list) {
+		mlx5_eswitch_del_send_to_vport_rule(rep_sq->send_to_vport_rule);
+		list_del(&rep_sq->list);
+		kfree(rep_sq);
 	}
 }
 
@@ -213,7 +213,7 @@ static int mlx5e_sqs2vport_start(struct mlx5_eswitch *esw,
 {
 	struct mlx5_flow_handle *flow_rule;
 	struct mlx5e_rep_priv *rpriv;
-	struct mlx5_esw_sq *esw_sq;
+	struct mlx5e_rep_sq *rep_sq;
 	int err;
 	int i;
 
@@ -222,8 +222,8 @@ static int mlx5e_sqs2vport_start(struct mlx5_eswitch *esw,
 
 	rpriv = mlx5e_rep_to_rep_priv(rep);
 	for (i = 0; i < sqns_num; i++) {
-		esw_sq = kzalloc(sizeof(*esw_sq), GFP_KERNEL);
-		if (!esw_sq) {
+		rep_sq = kzalloc(sizeof(*rep_sq), GFP_KERNEL);
+		if (!rep_sq) {
 			err = -ENOMEM;
 			goto out_err;
 		}
@@ -234,11 +234,11 @@ static int mlx5e_sqs2vport_start(struct mlx5_eswitch *esw,
 								sqns_array[i]);
 		if (IS_ERR(flow_rule)) {
 			err = PTR_ERR(flow_rule);
-			kfree(esw_sq);
+			kfree(rep_sq);
 			goto out_err;
 		}
-		esw_sq->send_to_vport_rule = flow_rule;
-		list_add(&esw_sq->list, &rpriv->vport_sqs_list);
+		rep_sq->send_to_vport_rule = flow_rule;
+		list_add(&rep_sq->list, &rpriv->vport_sqs_list);
 	}
 	return 0;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
index e4473a9ebd50..b9b481f2833a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
@@ -133,6 +133,11 @@ struct mlx5e_encap_entry {
 	int encap_size;
 };
 
+struct mlx5e_rep_sq {
+	struct mlx5_flow_handle	*send_to_vport_rule;
+	struct list_head	 list;
+};
+
 void *mlx5e_alloc_nic_rep_priv(struct mlx5_core_dev *mdev);
 void mlx5e_register_vport_reps(struct mlx5e_priv *priv);
 void mlx5e_unregister_vport_reps(struct mlx5e_priv *priv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 91175965df7f..3b481182f13a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -138,11 +138,6 @@ struct mlx5_eswitch_fdb {
 	};
 };
 
-struct mlx5_esw_sq {
-	struct mlx5_flow_handle	*send_to_vport_rule;
-	struct list_head	 list;
-};
-
 struct mlx5_eswitch_rep;
 struct mlx5_eswitch_rep_if {
 	int		       (*load)(struct mlx5_core_dev *dev,
-- 
2.13.0

^ permalink raw reply related

* [for-next V3 11/11] net/mlx5: Separate ingress/egress namespaces for each vport
From: Saeed Mahameed @ 2017-12-28 23:23 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev, linux-rdma, Leon Romanovsky, Gal Pressman, Saeed Mahameed
In-Reply-To: <20171228232314.13678-1-saeedm@mellanox.com>

From: Gal Pressman <galp@mellanox.com>

Each vport has its own root flow table for the ACL flow tables, and the
root flow table is per namespace; therefore we should create a namespace
for each vport.

Fixes: efdc810ba39d ("net/mlx5: Flow steering, Add vport ACL support")
Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c |  10 +-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 145 ++++++++++++++++++----
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h |   4 +-
 include/linux/mlx5/fs.h                           |   4 +
 4 files changed, 133 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index cdf65ed8714c..7649e36653d9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -867,9 +867,10 @@ static int esw_vport_enable_egress_acl(struct mlx5_eswitch *esw,
 	esw_debug(dev, "Create vport[%d] egress ACL log_max_size(%d)\n",
 		  vport->vport, MLX5_CAP_ESW_EGRESS_ACL(dev, log_max_ft_size));
 
-	root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_ESW_EGRESS);
+	root_ns = mlx5_get_flow_vport_acl_namespace(dev, MLX5_FLOW_NAMESPACE_ESW_EGRESS,
+						    vport->vport);
 	if (!root_ns) {
-		esw_warn(dev, "Failed to get E-Switch egress flow namespace\n");
+		esw_warn(dev, "Failed to get E-Switch egress flow namespace for vport (%d)\n", vport->vport);
 		return -EOPNOTSUPP;
 	}
 
@@ -984,9 +985,10 @@ static int esw_vport_enable_ingress_acl(struct mlx5_eswitch *esw,
 	esw_debug(dev, "Create vport[%d] ingress ACL log_max_size(%d)\n",
 		  vport->vport, MLX5_CAP_ESW_INGRESS_ACL(dev, log_max_ft_size));
 
-	root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_ESW_INGRESS);
+	root_ns = mlx5_get_flow_vport_acl_namespace(dev, MLX5_FLOW_NAMESPACE_ESW_INGRESS,
+						    vport->vport);
 	if (!root_ns) {
-		esw_warn(dev, "Failed to get E-Switch ingress flow namespace\n");
+		esw_warn(dev, "Failed to get E-Switch ingress flow namespace for vport (%d)\n", vport->vport);
 		return -EOPNOTSUPP;
 	}
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 5e786e29f93a..45e75b1010f7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -2014,16 +2014,6 @@ struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev,
 			return &steering->fdb_root_ns->ns;
 		else
 			return NULL;
-	case MLX5_FLOW_NAMESPACE_ESW_EGRESS:
-		if (steering->esw_egress_root_ns)
-			return &steering->esw_egress_root_ns->ns;
-		else
-			return NULL;
-	case MLX5_FLOW_NAMESPACE_ESW_INGRESS:
-		if (steering->esw_ingress_root_ns)
-			return &steering->esw_ingress_root_ns->ns;
-		else
-			return NULL;
 	case MLX5_FLOW_NAMESPACE_SNIFFER_RX:
 		if (steering->sniffer_rx_root_ns)
 			return &steering->sniffer_rx_root_ns->ns;
@@ -2054,6 +2044,33 @@ struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev,
 }
 EXPORT_SYMBOL(mlx5_get_flow_namespace);
 
+struct mlx5_flow_namespace *mlx5_get_flow_vport_acl_namespace(struct mlx5_core_dev *dev,
+							      enum mlx5_flow_namespace_type type,
+							      int vport)
+{
+	struct mlx5_flow_steering *steering = dev->priv.steering;
+
+	if (!steering || vport >= MLX5_TOTAL_VPORTS(dev))
+		return NULL;
+
+	switch (type) {
+	case MLX5_FLOW_NAMESPACE_ESW_EGRESS:
+		if (steering->esw_egress_root_ns &&
+		    steering->esw_egress_root_ns[vport])
+			return &steering->esw_egress_root_ns[vport]->ns;
+		else
+			return NULL;
+	case MLX5_FLOW_NAMESPACE_ESW_INGRESS:
+		if (steering->esw_ingress_root_ns &&
+		    steering->esw_ingress_root_ns[vport])
+			return &steering->esw_ingress_root_ns[vport]->ns;
+		else
+			return NULL;
+	default:
+		return NULL;
+	}
+}
+
 static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns,
 				      unsigned int prio, int num_levels)
 {
@@ -2331,13 +2348,41 @@ static void cleanup_root_ns(struct mlx5_flow_root_namespace *root_ns)
 	clean_tree(&root_ns->ns.node);
 }
 
+static void cleanup_egress_acls_root_ns(struct mlx5_core_dev *dev)
+{
+	struct mlx5_flow_steering *steering = dev->priv.steering;
+	int i;
+
+	if (!steering->esw_egress_root_ns)
+		return;
+
+	for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++)
+		cleanup_root_ns(steering->esw_egress_root_ns[i]);
+
+	kfree(steering->esw_egress_root_ns);
+}
+
+static void cleanup_ingress_acls_root_ns(struct mlx5_core_dev *dev)
+{
+	struct mlx5_flow_steering *steering = dev->priv.steering;
+	int i;
+
+	if (!steering->esw_ingress_root_ns)
+		return;
+
+	for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++)
+		cleanup_root_ns(steering->esw_ingress_root_ns[i]);
+
+	kfree(steering->esw_ingress_root_ns);
+}
+
 void mlx5_cleanup_fs(struct mlx5_core_dev *dev)
 {
 	struct mlx5_flow_steering *steering = dev->priv.steering;
 
 	cleanup_root_ns(steering->root_ns);
-	cleanup_root_ns(steering->esw_egress_root_ns);
-	cleanup_root_ns(steering->esw_ingress_root_ns);
+	cleanup_egress_acls_root_ns(dev);
+	cleanup_ingress_acls_root_ns(dev);
 	cleanup_root_ns(steering->fdb_root_ns);
 	cleanup_root_ns(steering->sniffer_rx_root_ns);
 	cleanup_root_ns(steering->sniffer_tx_root_ns);
@@ -2406,34 +2451,86 @@ static int init_fdb_root_ns(struct mlx5_flow_steering *steering)
 	return PTR_ERR(prio);
 }
 
-static int init_egress_acl_root_ns(struct mlx5_flow_steering *steering)
+static int init_egress_acl_root_ns(struct mlx5_flow_steering *steering, int vport)
 {
 	struct fs_prio *prio;
 
-	steering->esw_egress_root_ns = create_root_ns(steering, FS_FT_ESW_EGRESS_ACL);
-	if (!steering->esw_egress_root_ns)
+	steering->esw_egress_root_ns[vport] = create_root_ns(steering, FS_FT_ESW_EGRESS_ACL);
+	if (!steering->esw_egress_root_ns[vport])
 		return -ENOMEM;
 
 	/* create 1 prio*/
-	prio = fs_create_prio(&steering->esw_egress_root_ns->ns, 0,
-			      MLX5_TOTAL_VPORTS(steering->dev));
+	prio = fs_create_prio(&steering->esw_egress_root_ns[vport]->ns, 0, 1);
 	return PTR_ERR_OR_ZERO(prio);
 }
 
-static int init_ingress_acl_root_ns(struct mlx5_flow_steering *steering)
+static int init_ingress_acl_root_ns(struct mlx5_flow_steering *steering, int vport)
 {
 	struct fs_prio *prio;
 
-	steering->esw_ingress_root_ns = create_root_ns(steering, FS_FT_ESW_INGRESS_ACL);
-	if (!steering->esw_ingress_root_ns)
+	steering->esw_ingress_root_ns[vport] = create_root_ns(steering, FS_FT_ESW_INGRESS_ACL);
+	if (!steering->esw_ingress_root_ns[vport])
 		return -ENOMEM;
 
 	/* create 1 prio*/
-	prio = fs_create_prio(&steering->esw_ingress_root_ns->ns, 0,
-			      MLX5_TOTAL_VPORTS(steering->dev));
+	prio = fs_create_prio(&steering->esw_ingress_root_ns[vport]->ns, 0, 1);
 	return PTR_ERR_OR_ZERO(prio);
 }
 
+static int init_egress_acls_root_ns(struct mlx5_core_dev *dev)
+{
+	struct mlx5_flow_steering *steering = dev->priv.steering;
+	int err;
+	int i;
+
+	steering->esw_egress_root_ns = kcalloc(MLX5_TOTAL_VPORTS(dev),
+					       sizeof(*steering->esw_egress_root_ns),
+					       GFP_KERNEL);
+	if (!steering->esw_egress_root_ns)
+		return -ENOMEM;
+
+	for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++) {
+		err = init_egress_acl_root_ns(steering, i);
+		if (err)
+			goto cleanup_root_ns;
+	}
+
+	return 0;
+
+cleanup_root_ns:
+	for (i--; i >= 0; i--)
+		cleanup_root_ns(steering->esw_egress_root_ns[i]);
+	kfree(steering->esw_egress_root_ns);
+	return err;
+}
+
+static int init_ingress_acls_root_ns(struct mlx5_core_dev *dev)
+{
+	struct mlx5_flow_steering *steering = dev->priv.steering;
+	int err;
+	int i;
+
+	steering->esw_ingress_root_ns = kcalloc(MLX5_TOTAL_VPORTS(dev),
+						sizeof(*steering->esw_ingress_root_ns),
+						GFP_KERNEL);
+	if (!steering->esw_ingress_root_ns)
+		return -ENOMEM;
+
+	for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++) {
+		err = init_ingress_acl_root_ns(steering, i);
+		if (err)
+			goto cleanup_root_ns;
+	}
+
+	return 0;
+
+cleanup_root_ns:
+	for (i--; i >= 0; i--)
+		cleanup_root_ns(steering->esw_ingress_root_ns[i]);
+	kfree(steering->esw_ingress_root_ns);
+	return err;
+}
+
 int mlx5_init_fs(struct mlx5_core_dev *dev)
 {
 	struct mlx5_flow_steering *steering;
@@ -2476,12 +2573,12 @@ int mlx5_init_fs(struct mlx5_core_dev *dev)
 				goto err;
 		}
 		if (MLX5_CAP_ESW_EGRESS_ACL(dev, ft_support)) {
-			err = init_egress_acl_root_ns(steering);
+			err = init_egress_acls_root_ns(dev);
 			if (err)
 				goto err;
 		}
 		if (MLX5_CAP_ESW_INGRESS_ACL(dev, ft_support)) {
-			err = init_ingress_acl_root_ns(steering);
+			err = init_ingress_acls_root_ns(dev);
 			if (err)
 				goto err;
 		}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 397d24a621a4..3e571045626f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -71,8 +71,8 @@ struct mlx5_flow_steering {
 	struct kmem_cache               *ftes_cache;
 	struct mlx5_flow_root_namespace *root_ns;
 	struct mlx5_flow_root_namespace *fdb_root_ns;
-	struct mlx5_flow_root_namespace *esw_egress_root_ns;
-	struct mlx5_flow_root_namespace *esw_ingress_root_ns;
+	struct mlx5_flow_root_namespace **esw_egress_root_ns;
+	struct mlx5_flow_root_namespace **esw_ingress_root_ns;
 	struct mlx5_flow_root_namespace	*sniffer_tx_root_ns;
 	struct mlx5_flow_root_namespace	*sniffer_rx_root_ns;
 };
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index b25e7baa273e..a0b48afcb422 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -95,6 +95,10 @@ struct mlx5_flow_destination {
 struct mlx5_flow_namespace *
 mlx5_get_flow_namespace(struct mlx5_core_dev *dev,
 			enum mlx5_flow_namespace_type type);
+struct mlx5_flow_namespace *
+mlx5_get_flow_vport_acl_namespace(struct mlx5_core_dev *dev,
+				  enum mlx5_flow_namespace_type type,
+				  int vport);
 
 struct mlx5_flow_table *
 mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns,
-- 
2.13.0

^ permalink raw reply related

* [for-next V3 07/11] net/mlx5: E-Switch, Create generic header struct to be used by representors
From: Saeed Mahameed @ 2017-12-28 23:23 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev, linux-rdma, Leon Romanovsky, Mark Bloch, Saeed Mahameed
In-Reply-To: <20171228232314.13678-1-saeedm@mellanox.com>

From: Mark Bloch <markb@mellanox.com>

Now that we don't store type-dependent data in struct mlx5_eswitch_rep,
we can create a generic interface and representor type.

struct mlx5_eswitch_rep will store an array of interfaces, each
interface is used by a different representor type.

Once we move to a more generic interface, RDMA driver representors can
be added and utilize the same mechanism that the Ethernet driver
representors use.

Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   | 29 ++++-----
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.h   |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |  9 +--
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  | 22 +++++--
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 70 +++++++++++++++-------
 5 files changed, 88 insertions(+), 44 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 3c74f0599ad3..5b2b673c0b13 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -1086,7 +1086,7 @@ mlx5e_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 
 	rpriv->netdev = netdev;
 	rpriv->rep = rep;
-	rep->priv = rpriv;
+	rep->rep_if[REP_ETH].priv = rpriv;
 	INIT_LIST_HEAD(&rpriv->vport_sqs_list);
 
 	err = mlx5e_attach_netdev(netdev_priv(netdev));
@@ -1103,7 +1103,7 @@ mlx5e_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 		goto err_detach_netdev;
 	}
 
-	uplink_rpriv = mlx5_eswitch_get_uplink_priv(dev->priv.eswitch);
+	uplink_rpriv = mlx5_eswitch_get_uplink_priv(dev->priv.eswitch, REP_ETH);
 	upriv = netdev_priv(uplink_rpriv->netdev);
 	err = tc_setup_cb_egdev_register(netdev, mlx5e_setup_tc_block_cb,
 					 upriv);
@@ -1146,7 +1146,8 @@ mlx5e_vport_rep_unload(struct mlx5_eswitch_rep *rep)
 	struct mlx5e_priv *upriv;
 
 	unregister_netdev(netdev);
-	uplink_rpriv = mlx5_eswitch_get_uplink_priv(priv->mdev->priv.eswitch);
+	uplink_rpriv = mlx5_eswitch_get_uplink_priv(priv->mdev->priv.eswitch,
+						    REP_ETH);
 	upriv = netdev_priv(uplink_rpriv->netdev);
 	tc_setup_cb_egdev_unregister(netdev, mlx5e_setup_tc_block_cb,
 				     upriv);
@@ -1164,11 +1165,11 @@ static void mlx5e_rep_register_vf_vports(struct mlx5e_priv *priv)
 	int vport;
 
 	for (vport = 1; vport < total_vfs; vport++) {
-		struct mlx5_eswitch_rep rep = {};
+		struct mlx5_eswitch_rep_if rep_if = {};
 
-		rep.load = mlx5e_vport_rep_load;
-		rep.unload = mlx5e_vport_rep_unload;
-		mlx5_eswitch_register_vport_rep(esw, vport, &rep);
+		rep_if.load = mlx5e_vport_rep_load;
+		rep_if.unload = mlx5e_vport_rep_unload;
+		mlx5_eswitch_register_vport_rep(esw, vport, &rep_if, REP_ETH);
 	}
 }
 
@@ -1180,24 +1181,24 @@ static void mlx5e_rep_unregister_vf_vports(struct mlx5e_priv *priv)
 	int vport;
 
 	for (vport = 1; vport < total_vfs; vport++)
-		mlx5_eswitch_unregister_vport_rep(esw, vport);
+		mlx5_eswitch_unregister_vport_rep(esw, vport, REP_ETH);
 }
 
 void mlx5e_register_vport_reps(struct mlx5e_priv *priv)
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5_eswitch *esw   = mdev->priv.eswitch;
+	struct mlx5_eswitch_rep_if rep_if;
 	struct mlx5e_rep_priv *rpriv;
-	struct mlx5_eswitch_rep rep;
 
 	rpriv = priv->ppriv;
 	rpriv->netdev = priv->netdev;
 
-	rep.load = mlx5e_nic_rep_load;
-	rep.unload = mlx5e_nic_rep_unload;
-	rep.priv = rpriv;
+	rep_if.load = mlx5e_nic_rep_load;
+	rep_if.unload = mlx5e_nic_rep_unload;
+	rep_if.priv = rpriv;
 	INIT_LIST_HEAD(&rpriv->vport_sqs_list);
-	mlx5_eswitch_register_vport_rep(esw, 0, &rep); /* UPLINK PF vport*/
+	mlx5_eswitch_register_vport_rep(esw, 0, &rep_if, REP_ETH); /* UPLINK PF vport*/
 
 	mlx5e_rep_register_vf_vports(priv); /* VFs vports */
 }
@@ -1208,7 +1209,7 @@ void mlx5e_unregister_vport_reps(struct mlx5e_priv *priv)
 	struct mlx5_eswitch *esw   = mdev->priv.eswitch;
 
 	mlx5e_rep_unregister_vf_vports(priv); /* VFs vports */
-	mlx5_eswitch_unregister_vport_rep(esw, 0); /* UPLINK PF*/
+	mlx5_eswitch_unregister_vport_rep(esw, 0, REP_ETH); /* UPLINK PF*/
 }
 
 void *mlx5e_alloc_nic_rep_priv(struct mlx5_core_dev *mdev)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
index 8db68369367e..e4473a9ebd50 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
@@ -64,7 +64,7 @@ struct mlx5e_rep_priv {
 static inline
 struct mlx5e_rep_priv *mlx5e_rep_to_rep_priv(struct mlx5_eswitch_rep *rep)
 {
-	return (struct mlx5e_rep_priv *)rep->priv;
+	return (struct mlx5e_rep_priv *)rep->rep_if[REP_ETH].priv;
 }
 
 struct mlx5e_neigh {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index f462496cce7a..259e91e2d09a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -617,7 +617,7 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv,
 						  FLOW_DISSECTOR_KEY_ENC_PORTS,
 						  f->mask);
 		struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
-		struct mlx5e_rep_priv *uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw);
+		struct mlx5e_rep_priv *uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
 		struct net_device *up_dev = uplink_rpriv->netdev;
 		struct mlx5e_priv *up_priv = netdev_priv(up_dev);
 
@@ -1522,7 +1522,7 @@ static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv,
 #else
 	return -EOPNOTSUPP;
 #endif
-	uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw);
+	uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
 	/* if the egress device isn't on the same HW e-switch, we use the uplink */
 	if (!switchdev_port_same_parent_id(priv->netdev, rt->dst.dev))
 		*out_dev = uplink_rpriv->netdev;
@@ -1561,7 +1561,7 @@ static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv,
 
 	*out_ttl = ip6_dst_hoplimit(dst);
 
-	uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw);
+	uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
 	/* if the egress device isn't on the same HW e-switch, we use the uplink */
 	if (!switchdev_port_same_parent_id(priv->netdev, dst->dev))
 		*out_dev = uplink_rpriv->netdev;
@@ -1864,7 +1864,8 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
 			      struct mlx5e_tc_flow *flow)
 {
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
-	struct mlx5e_rep_priv *uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw);
+	struct mlx5e_rep_priv *uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw,
+									   REP_ETH);
 	struct net_device *up_dev = uplink_rpriv->netdev;
 	unsigned short family = ip_tunnel_info_af(tun_info);
 	struct mlx5e_priv *up_priv = netdev_priv(up_dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 3a21ea4e4d24..91175965df7f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -45,6 +45,11 @@ enum {
 	SRIOV_OFFLOADS
 };
 
+enum {
+	REP_ETH,
+	NUM_REP_TYPES,
+};
+
 #ifdef CONFIG_MLX5_ESWITCH
 
 #define MLX5_MAX_UC_PER_VPORT(dev) \
@@ -138,16 +143,21 @@ struct mlx5_esw_sq {
 	struct list_head	 list;
 };
 
-struct mlx5_eswitch_rep {
+struct mlx5_eswitch_rep;
+struct mlx5_eswitch_rep_if {
 	int		       (*load)(struct mlx5_core_dev *dev,
 				       struct mlx5_eswitch_rep *rep);
 	void		       (*unload)(struct mlx5_eswitch_rep *rep);
 	void			*priv;
+	bool		       valid;
+};
+
+struct mlx5_eswitch_rep {
+	struct mlx5_eswitch_rep_if rep_if[NUM_REP_TYPES];
 	u16		       vport;
 	u8		       hw_id[ETH_ALEN];
 	u16		       vlan;
 	u32		       vlan_refcount;
-	bool		       valid;
 };
 
 struct mlx5_esw_offload {
@@ -268,10 +278,12 @@ int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, u8 encap);
 int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink, u8 *encap);
 void mlx5_eswitch_register_vport_rep(struct mlx5_eswitch *esw,
 				     int vport_index,
-				     struct mlx5_eswitch_rep *rep);
+				     struct mlx5_eswitch_rep_if *rep_if,
+				     u8 rep_type);
 void mlx5_eswitch_unregister_vport_rep(struct mlx5_eswitch *esw,
-				       int vport_index);
-void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw);
+				       int vport_index,
+				       u8 rep_type);
+void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type);
 
 int mlx5_eswitch_add_vlan_action(struct mlx5_eswitch *esw,
 				 struct mlx5_esw_flow_attr *attr);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 07f26c1986fc..99f583a15cc3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -130,7 +130,7 @@ static int esw_set_global_vlan_pop(struct mlx5_eswitch *esw, u8 val)
 	esw_debug(esw->dev, "%s applying global %s policy\n", __func__, val ? "pop" : "none");
 	for (vf_vport = 1; vf_vport < esw->enabled_vports; vf_vport++) {
 		rep = &esw->offloads.vport_reps[vf_vport];
-		if (!rep->valid)
+		if (!rep->rep_if[REP_ETH].valid)
 			continue;
 
 		err = __mlx5_eswitch_set_vport_vlan(esw, rep->vport, 0, 0, val);
@@ -719,21 +719,31 @@ int esw_offloads_init_reps(struct mlx5_eswitch *esw)
 	return 0;
 }
 
-static void esw_offloads_unload_reps(struct mlx5_eswitch *esw, int nvports)
+static void esw_offloads_unload_reps_type(struct mlx5_eswitch *esw, int nvports,
+					  u8 rep_type)
 {
 	struct mlx5_eswitch_rep *rep;
 	int vport;
 
 	for (vport = nvports - 1; vport >= 0; vport--) {
 		rep = &esw->offloads.vport_reps[vport];
-		if (!rep->valid)
+		if (!rep->rep_if[rep_type].valid)
 			continue;
 
-		rep->unload(rep);
+		rep->rep_if[rep_type].unload(rep);
 	}
 }
 
-static int esw_offloads_load_reps(struct mlx5_eswitch *esw, int nvports)
+static void esw_offloads_unload_reps(struct mlx5_eswitch *esw, int nvports)
+{
+	u8 rep_type = NUM_REP_TYPES;
+
+	while (rep_type-- > 0)
+		esw_offloads_unload_reps_type(esw, nvports, rep_type);
+}
+
+static int esw_offloads_load_reps_type(struct mlx5_eswitch *esw, int nvports,
+				       u8 rep_type)
 {
 	struct mlx5_eswitch_rep *rep;
 	int vport;
@@ -741,10 +751,10 @@ static int esw_offloads_load_reps(struct mlx5_eswitch *esw, int nvports)
 
 	for (vport = 0; vport < nvports; vport++) {
 		rep = &esw->offloads.vport_reps[vport];
-		if (!rep->valid)
+		if (!rep->rep_if[rep_type].valid)
 			continue;
 
-		err = rep->load(esw->dev, rep);
+		err = rep->rep_if[rep_type].load(esw->dev, rep);
 		if (err)
 			goto err_reps;
 	}
@@ -752,7 +762,26 @@ static int esw_offloads_load_reps(struct mlx5_eswitch *esw, int nvports)
 	return 0;
 
 err_reps:
-	esw_offloads_unload_reps(esw, vport);
+	esw_offloads_unload_reps_type(esw, vport, rep_type);
+	return err;
+}
+
+static int esw_offloads_load_reps(struct mlx5_eswitch *esw, int nvports)
+{
+	u8 rep_type = 0;
+	int err;
+
+	for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++) {
+		err = esw_offloads_load_reps_type(esw, nvports, rep_type);
+		if (err)
+			goto err_reps;
+	}
+
+	return err;
+
+err_reps:
+	while (rep_type-- > 0)
+		esw_offloads_unload_reps_type(esw, nvports, rep_type);
 	return err;
 }
 
@@ -1121,22 +1150,23 @@ int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink, u8 *encap)
 
 void mlx5_eswitch_register_vport_rep(struct mlx5_eswitch *esw,
 				     int vport_index,
-				     struct mlx5_eswitch_rep *__rep)
+				     struct mlx5_eswitch_rep_if *__rep_if,
+				     u8 rep_type)
 {
 	struct mlx5_esw_offload *offloads = &esw->offloads;
-	struct mlx5_eswitch_rep *rep;
+	struct mlx5_eswitch_rep_if *rep_if;
 
-	rep = &offloads->vport_reps[vport_index];
+	rep_if = &offloads->vport_reps[vport_index].rep_if[rep_type];
 
-	rep->load   = __rep->load;
-	rep->unload = __rep->unload;
-	rep->priv = __rep->priv;
+	rep_if->load   = __rep_if->load;
+	rep_if->unload = __rep_if->unload;
+	rep_if->priv = __rep_if->priv;
 
-	rep->valid = true;
+	rep_if->valid = true;
 }
 
 void mlx5_eswitch_unregister_vport_rep(struct mlx5_eswitch *esw,
-				       int vport_index)
+				       int vport_index, u8 rep_type)
 {
 	struct mlx5_esw_offload *offloads = &esw->offloads;
 	struct mlx5_eswitch_rep *rep;
@@ -1144,17 +1174,17 @@ void mlx5_eswitch_unregister_vport_rep(struct mlx5_eswitch *esw,
 	rep = &offloads->vport_reps[vport_index];
 
 	if (esw->mode == SRIOV_OFFLOADS && esw->vports[vport_index].enabled)
-		rep->unload(rep);
+		rep->rep_if[rep_type].unload(rep);
 
-	rep->valid = false;
+	rep->rep_if[rep_type].valid = false;
 }
 
-void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw)
+void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type)
 {
 #define UPLINK_REP_INDEX 0
 	struct mlx5_esw_offload *offloads = &esw->offloads;
 	struct mlx5_eswitch_rep *rep;
 
 	rep = &offloads->vport_reps[UPLINK_REP_INDEX];
-	return rep->priv;
+	return rep->rep_if[rep_type].priv;
 }
-- 
2.13.0

^ permalink raw reply related

* [for-next V3 09/11] net/mlx5e: E-Switch, Use the name of static array instead of its address
From: Saeed Mahameed @ 2017-12-28 23:23 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev, linux-rdma, Leon Romanovsky, Gal Pressman, Saeed Mahameed
In-Reply-To: <20171228232314.13678-1-saeedm@mellanox.com>

From: Gal Pressman <galp@mellanox.com>

Using the address of a static array is the same as using its name (in
this specific use-case), but it's confusing and makes the code less
readable.

Fixes: 1bd27b11c1df ("net/mlx5: Introduce E-switch QoS management")
Fixes: bd77bf1cb595 ("net/mlx5: Add SRIOV VF max rate configuration support")
Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 26 +++++++++++------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 6d4cbdb69823..cdf65ed8714c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1290,7 +1290,7 @@ static int esw_create_tsar(struct mlx5_eswitch *esw)
 
 	err = mlx5_create_scheduling_element_cmd(dev,
 						 SCHEDULING_HIERARCHY_E_SWITCH,
-						 &tsar_ctx,
+						 tsar_ctx,
 						 &esw->qos.root_tsar_id);
 	if (err) {
 		esw_warn(esw->dev, "E-Switch create TSAR failed (%d)\n", err);
@@ -1333,20 +1333,20 @@ static int esw_vport_enable_qos(struct mlx5_eswitch *esw, int vport_num,
 	if (vport->qos.enabled)
 		return -EEXIST;
 
-	MLX5_SET(scheduling_context, &sched_ctx, element_type,
+	MLX5_SET(scheduling_context, sched_ctx, element_type,
 		 SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT);
-	vport_elem = MLX5_ADDR_OF(scheduling_context, &sched_ctx,
+	vport_elem = MLX5_ADDR_OF(scheduling_context, sched_ctx,
 				  element_attributes);
 	MLX5_SET(vport_element, vport_elem, vport_number, vport_num);
-	MLX5_SET(scheduling_context, &sched_ctx, parent_element_id,
+	MLX5_SET(scheduling_context, sched_ctx, parent_element_id,
 		 esw->qos.root_tsar_id);
-	MLX5_SET(scheduling_context, &sched_ctx, max_average_bw,
+	MLX5_SET(scheduling_context, sched_ctx, max_average_bw,
 		 initial_max_rate);
-	MLX5_SET(scheduling_context, &sched_ctx, bw_share, initial_bw_share);
+	MLX5_SET(scheduling_context, sched_ctx, bw_share, initial_bw_share);
 
 	err = mlx5_create_scheduling_element_cmd(dev,
 						 SCHEDULING_HIERARCHY_E_SWITCH,
-						 &sched_ctx,
+						 sched_ctx,
 						 &vport->qos.esw_tsar_ix);
 	if (err) {
 		esw_warn(esw->dev, "E-Switch create TSAR vport element failed (vport=%d,err=%d)\n",
@@ -1392,22 +1392,22 @@ static int esw_vport_qos_config(struct mlx5_eswitch *esw, int vport_num,
 	if (!vport->qos.enabled)
 		return -EIO;
 
-	MLX5_SET(scheduling_context, &sched_ctx, element_type,
+	MLX5_SET(scheduling_context, sched_ctx, element_type,
 		 SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT);
-	vport_elem = MLX5_ADDR_OF(scheduling_context, &sched_ctx,
+	vport_elem = MLX5_ADDR_OF(scheduling_context, sched_ctx,
 				  element_attributes);
 	MLX5_SET(vport_element, vport_elem, vport_number, vport_num);
-	MLX5_SET(scheduling_context, &sched_ctx, parent_element_id,
+	MLX5_SET(scheduling_context, sched_ctx, parent_element_id,
 		 esw->qos.root_tsar_id);
-	MLX5_SET(scheduling_context, &sched_ctx, max_average_bw,
+	MLX5_SET(scheduling_context, sched_ctx, max_average_bw,
 		 max_rate);
-	MLX5_SET(scheduling_context, &sched_ctx, bw_share, bw_share);
+	MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share);
 	bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_MAX_AVERAGE_BW;
 	bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_BW_SHARE;
 
 	err = mlx5_modify_scheduling_element_cmd(dev,
 						 SCHEDULING_HIERARCHY_E_SWITCH,
-						 &sched_ctx,
+						 sched_ctx,
 						 vport->qos.esw_tsar_ix,
 						 bitmask);
 	if (err) {
-- 
2.13.0

^ permalink raw reply related

* Re: iproute2 net-next
From: Daniel Borkmann @ 2017-12-28 23:46 UTC (permalink / raw)
  To: Leon Romanovsky; +Cc: Stephen Hemminger, netdev, dsa
In-Reply-To: <20171226093547.GC10734@mtr-leonro.local>

On 12/26/2017 10:35 AM, Leon Romanovsky wrote:
> On Mon, Dec 25, 2017 at 10:14:26PM -0800, Stephen Hemminger wrote:
>> On Tue, 26 Dec 2017 06:47:43 +0200
>> Leon Romanovsky <leon@kernel.org> wrote:
>>
>>> On Mon, Dec 25, 2017 at 10:49:19AM -0800, Stephen Hemminger wrote:
>>>> David Ahern has agreed to take over managing the net-next branch of iproute2.
>>>> The new location is:
>>>>  https://git.kernel.org/pub/scm/linux/kernel/git/dsahern/iproute2-next.git/
>>>>
>>>> In the past, I have accepted new features into iproute2 master branch, but
>>>> am changing the policy so that outside of the merge window (up until -rc1)
>>>> new features will get put into net-next to get some more review and testing
>>>> time. This means that things like the proposed batch streaming mode will
>>>> go through net-next.
>>>
>>> Did you consider to create one shared repo for the iproute2 to allow
>>> multiple committers workflow?
>>
>> For now having separate trees is best, there is no need for multiple
>> committers the load is very light.
>>
>>> It will be much convenient for the users to have one place for
>>> master/stable/net-next branches, instead of actually following two
>>> different repositories.
>>
>> If you are doing network development, you already need to deal with
>> multiple repo's on the kernel side so there is no difference.
> 
> I agree with you that one extra "git remote add .." is not so huge and
> all people who develop for the netdev will do it. My concern is about
> Documentation and newcomers, who will have a hard time to find a right
> tree.

I guess it would certainly help to identify the official repo to rebase
against much quicker if it would be under a common group on korg e.g.

  * iproute2/iproute2.git         - for current cycle
  * iproute2/iproute2-next.git    - for net-next bits

and also be in line with other tooling (ethtool and others), even if
not as high volume, but it would make it unambiguous right away from
the other, private iproute2 repos on korg, imho. Just a thought.

>>> Example, of such shared repo:
>>> BPF: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/
>>> Bluetooth: https://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git/
>>> RDMA: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/
>>
>> Most of these are high volume or vendor silo'd which is not the case here.
Cheers,
Daniel

^ permalink raw reply

* Re: Pravin Shelar
From: Pravin Shelar @ 2017-12-28 23:47 UTC (permalink / raw)
  To: Joe Perches; +Cc: ovs dev, Julia Lawall, Linux Kernel Network Developers
In-Reply-To: <1514399625.7654.9.camel-6d6DIl74uiNBDgjK7y7TUQ@public.gmane.org>

On Wed, Dec 27, 2017 at 10:33 AM, Joe Perches <joe-6d6DIl74uiNBDgjK7y7TUQ@public.gmane.org> wrote:
> On Wed, 2017-12-27 at 10:25 -0800, Ben Pfaff wrote:
>> On Wed, Dec 27, 2017 at 04:22:55PM +0100, Julia Lawall wrote:
>> > The email address pshelar-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org listed for Pravin Shelar in
>> > MAINTAINERS (OPENVSWITCH section) seems to bounce.
>>
>> Pravin has used a newer address recently, so I sent out a suggested
>> update (for OVS):
>>         https://patchwork.ozlabs.org/patch/853232/
>
> As Pravin is still active with acks but not any authored patches in
> the
> last year, this should still be updated in the linux-kernel's
> MAINTAINERS
> file too.
> ---
> diff --git a/MAINTAINERS b/MAINTAINERS
> index
> a6e86e20761e..5869e5f0b930 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@
> -10137,7 +10137,7 @@ F: drivers/irqchip/irq-ompic.c
>  F:     dri
> vers/irqchip/irq-or1k-*
>
>  OPENVSWITCH
> -M:     Pravin Shelar <pshelar@ni
> cira.com>
> +M:     Pravin Shelar <pshelar-LZ6Gd1LRuIk@public.gmane.org>
>  L:     netdev@vge
> r.kernel.org
>  L:     dev-yBygre7rU0TnMu66kgdUjQ@public.gmane.org
>  W:     http://openvswitch.org

Thanks Joe for the patch. But it is corrupted. I will send updated patch soon.

^ permalink raw reply

* Re: [PATCH net-next v6 0/6] net: tcp: sctp: dccp: Replace jprobe usage with trace events
From: Masami Hiramatsu @ 2017-12-28 23:48 UTC (permalink / raw)
  To: David Miller
  Cc: mingo, ian.mcdonald, vyasevich, stephen, rostedt, peterz, tglx,
	linux-kernel, hpa, gerrit, nhorman, dccp, netdev, linux-sctp, sfr
In-Reply-To: <20171228.120613.1048334566479767343.davem@davemloft.net>

On Thu, 28 Dec 2017 12:06:13 -0500 (EST)
David Miller <davem@davemloft.net> wrote:

> From: Masami Hiramatsu <mhiramat@kernel.org>
> Date: Thu, 28 Dec 2017 15:10:00 +0900
> 
> > Changes from v5:
> >   [1/6]: Avoid preprocessor directives in tracepoint macro args
> 
> Patch #1 is not the only patch which has this problem, at a minimum
> patch #5 has it too.

Oops, sorry...

> Please audit the entire series for an issue when it is brought to your
> attention.

Thank you for your kind advice.

> 
> Thank you.


-- 
Masami Hiramatsu <mhiramat@kernel.org>

^ permalink raw reply

* Re: [pull request][for-next V3 00/11] Mellanox, mlx5 E-Switch updates 2017-12-19
From: David Miller @ 2017-12-29  0:46 UTC (permalink / raw)
  To: saeedm; +Cc: dledford, netdev, linux-rdma, leonro
In-Reply-To: <20171228232314.13678-1-saeedm@mellanox.com>

From: Saeed Mahameed <saeedm@mellanox.com>
Date: Fri, 29 Dec 2017 01:23:03 +0200

> ==============
> This series includes updates for mlx5 E-Switch infrastructures,
> to be merged into net-next and rdma-next trees.
> 
> Mark's patches provide E-Switch refactoring that generalize the mlx5
> E-Switch vf representors interfaces and data structures. The series is
> mainly focused on moving ethernet (netdev) specific representors logic out
> of E-Switch (eswitch.c) into mlx5e representor module (en_rep.c), which
> provides better separation and allows future support for other types of vf
> representors (e.g. RDMA).
> 
> Gal's patches at the end of this series provide a simple syntax fix and
> two other patches that handle vport ingress/egress ACL steering name
> spaces to be aligned with the Firmware/Hardware specs.
> ===============
> 
> V1->V2:
>  - Addressed coding style comments in patches #1 and #7
>  - The series is still based on rc4, as now I see net-next is also @rc4.
> 
> V2->V3:
>  - Fixed compilation warning, reported by Dave.
> 
> Please pull and let me know if there's any problem.

Looks good, pulled, thank you.

^ permalink raw reply

* Re: [RFC PATCH bpf-next v2 1/4] tracing/kprobe: bpf: Check error injectable event is on function entry
From: Alexei Starovoitov @ 2017-12-29  1:03 UTC (permalink / raw)
  To: Masami Hiramatsu
  Cc: Steven Rostedt, Alexei Starovoitov, Josef Bacik, mingo, davem,
	netdev, linux-kernel, ast, kernel-team, daniel, linux-btrfs,
	darrick.wong, Josef Bacik, Akinobu Mita
In-Reply-To: <20171228172027.4a8f2f0cf0506499acd26738@kernel.org>

On 12/28/17 12:20 AM, Masami Hiramatsu wrote:
> On Wed, 27 Dec 2017 20:32:07 -0800
> Alexei Starovoitov <ast@fb.com> wrote:
>
>> On 12/27/17 8:16 PM, Steven Rostedt wrote:
>>> On Wed, 27 Dec 2017 19:45:42 -0800
>>> Alexei Starovoitov <ast@fb.com> wrote:
>>>
>>>> I don't think that's the case. My reading of current
>>>> trace_kprobe_ftrace() -> arch_check_ftrace_location()
>>>> is that it will not be true for old mcount case.
>>>
>>> In the old mcount case, you can't use ftrace to return without calling
>>> the function. That is, no modification of the return ip, unless you
>>> created a trampoline that could handle arbitrary stack frames, and
>>> remove them from the stack before returning back to the function.
>>
>> correct. I was saying that trace_kprobe_ftrace() won't let us do
>> bpf_override_return with old mcount.
>
> No, trace_kprobe_ftrace() just checks the given address will be
> managed by ftrace. you can see arch_check_ftrace_location() in kernel/kprobes.c.
>
> FYI, CONFIG_KPROBES_ON_FTRACE depends on DYNAMIC_FTRACE_WITH_REGS, and
> DYNAMIC_FTRACE_WITH_REGS doesn't depend on CC_USING_FENTRY.
> This means if you compile kernel with old gcc and enable DYNAMIC_FTRACE,
> kprobes uses ftrace on mcount address which is NOT the entry point
> of target function.

ok. fair enough. I think we can gate the feature to !mcount only.

> On the other hand, changing IP feature has been implemented originally
> by kprobes with int3 (sw breakpoint). This means you can use kprobes
> at correct address (the entry address of the function) you can hijack
> the function, as jprobe did.
>
>>>> As far as the rest of your arguments it very much puzzles me that
>>>> you claim that this patch suppose to work based on historical
>>>> reasoning whereas you did NOT test it.
>>>
>>> I believe that Masami is saying that the modification of the IP from
>>> kprobes has been very well tested. But I'm guessing that you still want
>>> a test case for using kprobes in this particular instance. It's not the
>>> implementation of modifying the IP that you are worried about, but the
>>> implementation of BPF using it in this case. Right?
>>
>> exactly. No doubt that old code works.
>> But it doesn't mean that bpf_override_return() will continue to
>> work in kprobes that are not ftrace based.
>> I suspect Josef's existing test case will cover this situation.
>> Probably only special .config is needed to disable ftrace, so
>> "kprobe on entry but not ftrace" check will kick in.
>
> Right. If you need to test it, you can run Josef's test case without
> CONFIG_DYNAMIC_FTRACE.

It should be obvious that the person who submits the patch
must run the tests.

>> But I didn't get an impression that this situation was tested.
>> Instead I see only logical reasoning that it's _supposed_ to work.
>> That's not enough.
>
> OK, so would you just ask me to run samples/bpf ?

Please run Josef's test in the !ftrace setup.

^ permalink raw reply

* Re: [RFC PATCH bpf-next v2 4/4] error-injection: Support fault injection framework
From: Alexei Starovoitov @ 2017-12-29  1:11 UTC (permalink / raw)
  To: Masami Hiramatsu
  Cc: Alexei Starovoitov, Josef Bacik, rostedt, mingo, davem, netdev,
	linux-kernel, ast, kernel-team, daniel, linux-btrfs, darrick.wong,
	Josef Bacik, Akinobu Mita
In-Reply-To: <20171228165140.0f9fc7f6067b5581c018e81d@kernel.org>

On 12/27/17 11:51 PM, Masami Hiramatsu wrote:
>
> Then what happen if the user set invalid retval to those functions?
> even if we limit the injectable functions, it can cause a problem,
>
> for example,
>
>  obj = func_return_object();
>  if (!obj) {
>     handling_error...;
>  }
>  obj->field = x;
>
> In this case, obviously func_return_object() must return NULL if there is
> an error, not -ENOMEM. But without the correct retval information, how would
> you check the BPF code doesn't cause a trouble?
> Currently it seems you are expecting only the functions which return error code.
>
>  ret = func_return_state();
>  if (ret < 0) {
>     handling_error...;
>  }
>
> But how we can distinguish those?
>
> If we have the error range for each function, we can ensure what is
> *correct* error code, NULL or errno, or any other error numbers. :)

messing up return values may cause problems and range check is
not going to magically help.
The caller may handle only a certain set of errors or interpret
some of them like EBUSY as a signal to retry.
It's plain impossible to make sure that kernel will be functional
after error injection has been made.
Like kmalloc() unconditionally returning NULL will be deadly
for the kernel, hence this patch 4/4 has very limited practical
use. The bpf program need to make intelligent decisions when
to return an error and what kind of error to return.
Doing blank range check adds a false sense of additional safety.
More so it wastes kilobytes of memory to do this check, hence nack.

^ permalink raw reply

* Re: [RFT net-next v3 0/5] dwmac-meson8b: RGMII clock fixes for Meson8b
From: Emiliano Ingrassia @ 2017-12-29  1:31 UTC (permalink / raw)
  To: Martin Blumenstingl
  Cc: netdev, linus.luessing, khilman, linux-amlogic, jbrunet,
	Neil Armstrong, peppe.cavallaro, alexandre.torgue
In-Reply-To: <20171228222128.15215-1-martin.blumenstingl@googlemail.com>

Hi Martin, Hi Dave,

On Thu, Dec 28, 2017 at 11:21:23PM +0100, Martin Blumenstingl wrote:
> Hi Dave,
> 
> please do not apply this series until it got a Tested-by from Emiliano.
> 
> 
> Hi Emiliano,
> 
> you reported [0] that you couldn't get dwmac-meson8b to work on your
> Odroid-C1. With your findings (register dumps, clk_summary output, etc.)
> I think I was able to find a fix: it consists of two patches (which you
> find in this series)
> 
> Unfortunately I don't have any Meson8b boards with RGMII PHY so I could
> only partially test this (I could only check if the clocks were
> calculated correctly when using a dummy 500002394Hz input clock instead
> of MPLL2).
> 
> Could you please give this series a try and let me know about the
> results?
> You obviously still need your two "ARM: dts: meson8b" patches which
> - add the "amlogic,meson8b-dwmac" compatible to meson8b.dtsi
> - enable Ethernet on the Odroid-C1
> 
> When testing on Meson8b this also needs a fix for the MPLL clock driver:
> "clk: meson: mpll: use 64-bit maths in params_from_rate", see:
> https://patchwork.kernel.org/patch/10131677/
> 
> 
> I have tested this myself on a Khadas VIM (GXL SoC, internal RMII PHY)
> and a Khadas VIM2 (GXM SoC, external RGMII PHY). Both are still working
> fine (so let's hope that this also fixes your Meson8b issue :)).
> 
> 
> changes since v1 at [1]:
> - changed the subject of the cover-letter to indicate that this is all
>   about the RGMII clock
> - added PATCH #1 which ensures that we don't unnecessarily change the
>   parent clocks in RMII mode (and also makes the code easier to
>   understand)
> - changed subject of PATCH #2 (formerly PATCH #1) to state that this
>   is about the RGMII clock
> - added Jerome's Reviewed-by to PATCH #2 (formerly PATCH #1)
> - replaced PATCH #3 (formerly PATCH #2) with one that sets
>   CLK_SET_RATE_PARENT on the mux and thus re-configures the MPLL2 clock
>   on Meson8b correctly
> 
> changes since v2 at [2]:
> - added PATCH #2 to make the following patch easier
> - Emiliano reported that there's currently another bug in the
>   dwmac-meson8b driver which prevents it from working with RGMII PHYs on
>   Meson8b: bit 10 of the PRG_ETH0 register configures a clock gate
>   (instead of a divide by 5 or divide by 10 clock divider). This has not
>   been visible on GXBB and later due to the input clock which always led
>   to a selection of "divide by 10" (which is done internally in the IP
>   block, but the bit actually means "enable RGMII clock output").
>   PATCH #3 was added to address this issue.
> - the commit message of PATCH #4 and #5 (formerly PATCH #2 and #3) were
>   updated and the patch itself rebased because the m25_div clock was
>   removed with the new PATCH #3 (so some of the statements were not
>   valid anymore)
>

Here is the clk_summary relative to ethernet on Odroid-C1+
with this new series applied:

xtal				    1            1    24000000          0 0
 sys_pll			    0            0  1200000000          0 0
  cpu_clk			    0            0  1200000000          0 0
 vid_pll			    0            0   732000000          0 0
 fixed_pll			    2            2  2550000000          0 0
  mpll2				    1            1   249999701          0 0
   c9410000.ethernet#m250_sel       1            1   249999701          0 0
    c9410000.ethernet#m250_div	    1            1   249999701          0 0
     c9410000.ethernet#fixed_div10  1            1    24999970          0 0
      c9410000.ethernet#m25_en	    1            1    24999970          0 0

The ethernet prg0 register is set to 0x74A1 which should be correct with
respect to the information contained in the S805 SoC manual.
Actually, the ethernet is not yet fully functional.
Trying to ping the board, I can see ARP request from host to board using
tcpdump. However, the host can't see any response.

Following the U-Boot value for prg0 register, which is 0x7d21, I also
tried to set bit 11. As expected, this did not have any influence.
Another thing that we should check is the "Ethernet Memory PD" (see S805
manual - sec. 5.4) register which bits 3-2 enable/disable ethernet
normal operation. However, those bits are already cleared by U-Boot.

Thank you for the support.

Best regards,

Emiliano

> 
> [0] http://lists.infradead.org/pipermail/linux-amlogic/2017-December/005596.html
> [1] http://lists.infradead.org/pipermail/linux-amlogic/2017-December/005848.html
> [2] http://lists.infradead.org/pipermail/linux-amlogic/2017-December/005861.html
> 
> 
> Martin Blumenstingl (5):
>   net: stmmac: dwmac-meson8b: only configure the clocks in RGMII mode
>   net: stmmac: dwmac-meson8b: simplify generating the clock names
>   net: stmmac: dwmac-meson8b: fix internal RGMII clock configuration
>   net: stmmac: dwmac-meson8b: fix setting the RGMII clock on Meson8b
>   net: stmmac: dwmac-meson8b: propagate rate changes to the parent clock
> 
>  .../net/ethernet/stmicro/stmmac/dwmac-meson8b.c    | 119 +++++++++++----------
>  1 file changed, 63 insertions(+), 56 deletions(-)
> 
> -- 
> 2.15.1
> 

^ permalink raw reply

* [GIT] Networking
From: David Miller @ 2017-12-29  2:05 UTC (permalink / raw)
  To: torvalds; +Cc: akpm, netdev, linux-kernel


1) IPv6 gre tunnels end up with different default features enabled
   depending upon whether netlink or ioctls are used to bring them
   up.  Fix from Alexey Kodanev.

2) Fix read past end of user control message in RDS, from Avinash
   Repaka.

3) Missing RCU barrier in mini qdisc code, from Cong Wang.

4) Missing policy put when reusing per-cpu route entries, from
   Florian Westphal.

5) Handle nested PCI errors properly in bnx2x driver, from Guilherme
   G. Piccoli.

6) Run nested transport mode IPSEC packets via tasklet, from Herbert
   Xu.

7) Fix handling poll() for stream sockets in tipc, from Parthasarathy
   Bhuvaragan.

8) Fix two stack-out-of-bounds issues in IPSEC, from Steffen Klassert.

9) Another zerocopy ubuf handling fix, from Willem de Bruijn.

Please pull, thanks a lot!

The following changes since commit ead68f216110170ec729e2c4dec0aad6d38259d7:

  Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (2017-12-21 15:57:30 -0800)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/davem/net 

for you to fetch changes up to d5902f6d1fbdb27e6a33c418063466d94be9dfa2:

  Merge branch 'strparser-Fix-lockdep-issue' (2017-12-28 14:28:23 -0500)

----------------------------------------------------------------
Alexey Kodanev (1):
      ip6_gre: fix device features for ioctl setup

Antony Antony (1):
      xfrm: fix xfrm_do_migrate() with AEAD e.g(AES-GCM)

Avinash Repaka (1):
      RDS: Check cmsg_len before dereferencing CMSG_DATA

Aviv Heller (1):
      xfrm: Fix xfrm_input() to verify state is valid when (encap_type < 0)

Cong Wang (2):
      xfrm: check id proto in validate_tmpl()
      net_sched: fix a missing rcu barrier in mini_qdisc_pair_swap()

Daniel Borkmann (1):
      Merge branch 'bpf-bpftool-various-fixes'

David S. Miller (4):
      Merge branch 'master' of git://git.kernel.org/.../klassert/ipsec
      Merge branch 'tg3-fixes'
      Merge git://git.kernel.org/.../bpf/bpf
      Merge branch 'strparser-Fix-lockdep-issue'

Florian Westphal (1):
      xfrm: put policies when reusing pcpu xdst entry

Fugang Duan (1):
      net: fec: unmap the xmit buffer that are not transferred by DMA

Grygorii Strashko (1):
      net: phy: micrel: ksz9031: reconfigure autoneg after phy autoneg workaround

Guilherme G. Piccoli (1):
      bnx2x: Improve reliability in case of nested PCI errors

Herbert Xu (1):
      xfrm: Reinject transport-mode packets through tasklet

Jakub Kicinski (2):
      tools: bpftool: maps: close json array on error paths of show
      tools: bpftool: protect against races with disappearing objects

Jiri Pirko (1):
      net: sched: fix possible null pointer deref in tcf_block_put

Jon Maloy (2):
      tipc: base group replicast ack counter on number of actual receivers
      tipc: fix memory leak of group member when peer node is lost

Mat Martineau (1):
      tcp: Avoid preprocessor directives in tracepoint macro args

Michal Kubecek (1):
      xfrm: fix XFRMA_OUTPUT_MARK policy entry

Parthasarathy Bhuvaragan (1):
      tipc: fix hanging poll() for stream sockets

Quentin Monnet (1):
      selftests/bpf: fix Makefile for passing LLC to the command line

Russell King (2):
      phylink: ensure the PHY interface mode is appropriately set
      phylink: ensure AN is enabled

Siva Reddy Kallam (3):
      tg3: Update copyright
      tg3: Add workaround to restrict 5762 MRRS to 2048
      tg3: Enable PHY reset in MTU change path for 5720

Steffen Klassert (2):
      xfrm: Fix stack-out-of-bounds read on socket policy lookup.
      xfrm: Fix stack-out-of-bounds with misconfigured transport mode policies.

Tom Herbert (2):
      sock: Add sock_owned_by_user_nocheck
      strparser: Call sock_owned_by_user_nocheck

Tommi Rantala (2):
      tipc: error path leak fixes in tipc_enable_bearer()
      tipc: fix tipc_mon_delete() oops in tipc_enable_bearer() error path

Tonghao Zhang (1):
      sctp: Replace use of sockets_allocated with specified macro.

Willem de Bruijn (1):
      skbuff: in skb_copy_ubufs unclone before releasing zerocopy

 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c  |  4 +--
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 14 +++++++++-
 drivers/net/ethernet/broadcom/tg3.c              | 19 +++++++++++--
 drivers/net/ethernet/broadcom/tg3.h              |  7 ++++-
 drivers/net/ethernet/freescale/fec_main.c        |  6 ++++
 drivers/net/phy/micrel.c                         |  1 +
 drivers/net/phy/phylink.c                        |  2 ++
 include/net/sock.h                               |  5 ++++
 include/net/xfrm.h                               |  3 ++
 include/trace/events/tcp.h                       | 97 +++++++++++++++++++++++++----------------------------------------
 net/core/skbuff.c                                |  6 ++--
 net/ipv4/xfrm4_input.c                           | 12 +++++++-
 net/ipv6/ip6_gre.c                               | 57 +++++++++++++++++++++-----------------
 net/ipv6/xfrm6_input.c                           | 10 ++++++-
 net/rds/send.c                                   |  3 ++
 net/sched/cls_api.c                              |  2 ++
 net/sched/sch_generic.c                          |  4 ++-
 net/sctp/socket.c                                |  4 +--
 net/strparser/strparser.c                        |  2 +-
 net/tipc/bearer.c                                |  5 +++-
 net/tipc/group.c                                 | 31 ++++++++++++++-------
 net/tipc/monitor.c                               |  6 +++-
 net/tipc/socket.c                                |  2 +-
 net/xfrm/xfrm_input.c                            | 69 +++++++++++++++++++++++++++++++++++++++++++++-
 net/xfrm/xfrm_policy.c                           |  9 +++++-
 net/xfrm/xfrm_state.c                            |  1 +
 net/xfrm/xfrm_user.c                             | 26 +++++++++++++++++-
 tools/bpf/bpftool/map.c                          |  8 ++++--
 tools/bpf/bpftool/prog.c                         |  2 ++
 tools/testing/selftests/bpf/Makefile             |  2 +-
 30 files changed, 298 insertions(+), 121 deletions(-)

^ permalink raw reply

* [PATCH net-next 0/2] tun: allow to attach eBPF filter
From: Jason Wang @ 2017-12-29  2:44 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: mst, willemb, Jason Wang

Hi all:

This series tries to implement eBPF socket filter for tun. This could
be used for implementing efficient virtio-net receive filter for
vhost-net.

Thanks

Jason Wang (2):
  tuntap: rename struct tun_steering_prog to struct tun_prog
  tun: allow to attach ebpf socket filter

 drivers/net/tun.c           | 58 ++++++++++++++++++++++++++++++++-------------
 include/uapi/linux/if_tun.h |  1 +
 2 files changed, 43 insertions(+), 16 deletions(-)

-- 
2.7.4

^ permalink raw reply

* [PATCH net-next 1/2] tuntap: rename struct tun_steering_prog to struct tun_prog
From: Jason Wang @ 2017-12-29  2:44 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: mst, willemb, Jason Wang
In-Reply-To: <1514515491-6041-1-git-send-email-jasowang@redhat.com>

To be reused by eBPF programs other than queue selection.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index e367d631..0853829 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -195,7 +195,7 @@ struct tun_flow_entry {
 
 #define TUN_NUM_FLOW_ENTRIES 1024
 
-struct tun_steering_prog {
+struct tun_prog {
 	struct rcu_head rcu;
 	struct bpf_prog *prog;
 };
@@ -237,7 +237,7 @@ struct tun_struct {
 	u32 rx_batched;
 	struct tun_pcpu_stats __percpu *pcpu_stats;
 	struct bpf_prog __rcu *xdp_prog;
-	struct tun_steering_prog __rcu *steering_prog;
+	struct tun_prog __rcu *steering_prog;
 };
 
 static int tun_napi_receive(struct napi_struct *napi, int budget)
@@ -571,7 +571,7 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
 
 static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
 {
-	struct tun_steering_prog *prog;
+	struct tun_prog *prog;
 	u16 ret = 0;
 
 	prog = rcu_dereference(tun->steering_prog);
@@ -2027,19 +2027,18 @@ static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	return ret;
 }
 
-static void tun_steering_prog_free(struct rcu_head *rcu)
+static void tun_prog_free(struct rcu_head *rcu)
 {
-	struct tun_steering_prog *prog = container_of(rcu,
-					 struct tun_steering_prog, rcu);
+	struct tun_prog *prog = container_of(rcu, struct tun_prog, rcu);
 
 	bpf_prog_destroy(prog->prog);
 	kfree(prog);
 }
 
-static int __tun_set_steering_ebpf(struct tun_struct *tun,
-				   struct bpf_prog *prog)
+static int __tun_set_ebpf(struct tun_struct *tun, struct tun_prog **prog_p,
+			  struct bpf_prog *prog)
 {
-	struct tun_steering_prog *old, *new = NULL;
+	struct tun_prog *old, *new = NULL;
 
 	if (prog) {
 		new = kmalloc(sizeof(*new), GFP_KERNEL);
@@ -2049,13 +2048,13 @@ static int __tun_set_steering_ebpf(struct tun_struct *tun,
 	}
 
 	spin_lock_bh(&tun->lock);
-	old = rcu_dereference_protected(tun->steering_prog,
+	old = rcu_dereference_protected(*prog_p,
 					lockdep_is_held(&tun->lock));
-	rcu_assign_pointer(tun->steering_prog, new);
+	rcu_assign_pointer(*prog_p, new);
 	spin_unlock_bh(&tun->lock);
 
 	if (old)
-		call_rcu(&old->rcu, tun_steering_prog_free);
+		call_rcu(&old->rcu, tun_prog_free);
 
 	return 0;
 }
@@ -2068,7 +2067,7 @@ static void tun_free_netdev(struct net_device *dev)
 	free_percpu(tun->pcpu_stats);
 	tun_flow_uninit(tun);
 	security_tun_dev_free_security(tun->security);
-	__tun_set_steering_ebpf(tun, NULL);
+	__tun_set_ebpf(tun, &tun->steering_prog, NULL);
 }
 
 static void tun_setup(struct net_device *dev)
@@ -2550,7 +2549,8 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
 	return ret;
 }
 
-static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
+static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog **prog_p,
+			void __user *data)
 {
 	struct bpf_prog *prog;
 	int fd;
@@ -2566,7 +2566,7 @@ static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
 			return PTR_ERR(prog);
 	}
 
-	return __tun_set_steering_ebpf(tun, prog);
+	return __tun_set_ebpf(tun, prog_p, prog);
 }
 
 static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
@@ -2846,7 +2846,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 		break;
 
 	case TUNSETSTEERINGEBPF:
-		ret = tun_set_steering_ebpf(tun, argp);
+		ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
 		break;
 
 	default:
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next 2/2] tun: allow to attach ebpf socket filter
From: Jason Wang @ 2017-12-29  2:44 UTC (permalink / raw)
  To: netdev, linux-kernel; +Cc: mst, willemb, Jason Wang
In-Reply-To: <1514515491-6041-1-git-send-email-jasowang@redhat.com>

This patch allows userspace to attach eBPF filter to tun. This will
allow to implement VM dataplane filtering in a more efficient way
compared to cBPF filter.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c           | 26 ++++++++++++++++++++++++++
 include/uapi/linux/if_tun.h |  1 +
 2 files changed, 27 insertions(+)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 0853829..6e9452b 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -238,6 +238,7 @@ struct tun_struct {
 	struct tun_pcpu_stats __percpu *pcpu_stats;
 	struct bpf_prog __rcu *xdp_prog;
 	struct tun_prog __rcu *steering_prog;
+	struct tun_prog __rcu *filter_prog;
 };
 
 static int tun_napi_receive(struct napi_struct *napi, int budget)
@@ -984,12 +985,25 @@ static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
 #endif
 }
 
+static unsigned int run_ebpf_filter(struct tun_struct *tun,
+				    struct sk_buff *skb,
+				    int len)
+{
+	struct tun_prog *prog = rcu_dereference(tun->filter_prog);
+
+	if (prog)
+		len = bpf_prog_run_clear_cb(prog->prog, skb);
+
+	return len;
+}
+
 /* Net device start xmit */
 static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct tun_struct *tun = netdev_priv(dev);
 	int txq = skb->queue_mapping;
 	struct tun_file *tfile;
+	int len = skb->len;
 
 	rcu_read_lock();
 	tfile = rcu_dereference(tun->tfiles[txq]);
@@ -1015,9 +1029,16 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 	    sk_filter(tfile->socket.sk, skb))
 		goto drop;
 
+	len = run_ebpf_filter(tun, skb, len);
+	if (!len)
+		goto drop;
+
 	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
 		goto drop;
 
+	if (pskb_trim(skb, len))
+		goto drop;
+
 	skb_tx_timestamp(skb);
 
 	/* Orphan the skb - required as we might hang on to it
@@ -2068,6 +2089,7 @@ static void tun_free_netdev(struct net_device *dev)
 	tun_flow_uninit(tun);
 	security_tun_dev_free_security(tun->security);
 	__tun_set_ebpf(tun, &tun->steering_prog, NULL);
+	__tun_set_ebpf(tun, &tun->filter_prog, NULL);
 }
 
 static void tun_setup(struct net_device *dev)
@@ -2849,6 +2871,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 		ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
 		break;
 
+	case TUNSETFILTEREBPF:
+		ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
+		break;
+
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index fb38c17..ee432cd 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -58,6 +58,7 @@
 #define TUNSETVNETBE _IOW('T', 222, int)
 #define TUNGETVNETBE _IOR('T', 223, int)
 #define TUNSETSTEERINGEBPF _IOR('T', 224, int)
+#define TUNSETFILTEREBPF _IOR('T', 225, int)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next v7 0/6] net: tcp: sctp: dccp: Replace jprobe usage with trace events
From: Masami Hiramatsu @ 2017-12-29  2:45 UTC (permalink / raw)
  To: Ingo Molnar, David S . Miller, Ian McDonald, Vlad Yasevich,
	Stephen Hemminger, Steven Rostedt
  Cc: Peter Zijlstra, Thomas Gleixner, LKML, H . Peter Anvin,
	Gerrit Renker, Neil Horman, dccp, netdev, linux-sctp,
	Stephen Rothwell, mhiramat

Hi,

This series is v7 of the replacement of jprobe usage with trace
events. This version fixes net/dccp/trace.h to avoid a sparse
warning. Since the TP_STORE_ADDR_PORTS macro can be shared
with trace/events/tcp.h, it also introduces a new common header
file and moves the definition of that macro there.

Previous version is here;
 https://lkml.org/lkml/2017/12/28/7

Changes from v6:
  [5/6]: Avoid preprocessor directives in tracepoint macro args

Thank you,

---

Masami Hiramatsu (6):
      net: tcp: Add trace events for TCP congestion window tracing
      net: tcp: Remove TCP probe module
      net: sctp: Add SCTP ACK tracking trace event
      net: sctp: Remove debug SCTP probe module
      net: dccp: Add DCCP sendmsg trace event
      net: dccp: Remove dccpprobe module


 include/trace/events/net_probe_common.h |   44 +++++
 include/trace/events/sctp.h             |   99 ++++++++++
 include/trace/events/tcp.h              |   60 ++++++
 net/Kconfig                             |   17 --
 net/dccp/Kconfig                        |   17 --
 net/dccp/Makefile                       |    5 -
 net/dccp/probe.c                        |  203 ---------------------
 net/dccp/proto.c                        |    5 +
 net/dccp/trace.h                        |   84 +++++++++
 net/ipv4/Makefile                       |    1 
 net/ipv4/tcp_input.c                    |    3 
 net/ipv4/tcp_probe.c                    |  301 -------------------------------
 net/sctp/Kconfig                        |   12 -
 net/sctp/Makefile                       |    3 
 net/sctp/probe.c                        |  244 -------------------------
 net/sctp/sm_statefuns.c                 |    5 +
 16 files changed, 303 insertions(+), 800 deletions(-)
 create mode 100644 include/trace/events/net_probe_common.h
 create mode 100644 include/trace/events/sctp.h
 delete mode 100644 net/dccp/probe.c
 create mode 100644 net/dccp/trace.h
 delete mode 100644 net/ipv4/tcp_probe.c
 delete mode 100644 net/sctp/probe.c

--
Masami Hiramatsu (Linaro) <mhiramat@kernel.org>

^ permalink raw reply

* [PATCH net-next v7 1/6] net: tcp: Add trace events for TCP congestion window tracing
From: Masami Hiramatsu @ 2017-12-29  2:45 UTC (permalink / raw)
  To: Ingo Molnar, David S . Miller, Ian McDonald, Vlad Yasevich,
	Stephen Hemminger, Steven Rostedt
  Cc: Peter Zijlstra, Thomas Gleixner, LKML, H . Peter Anvin,
	Gerrit Renker, Neil Horman, dccp, netdev, linux-sctp,
	Stephen Rothwell, mhiramat
In-Reply-To: <151451552014.17912.11834170408829155608.stgit@devbox>

This adds an event to trace TCP stat variables with a
slightly intrusive trace-event. It uses the ftrace/perf
event log buffer to record them, so there is no need to
prepare a dedicated ring-buffer or custom user apps.

User can use ftrace to trace this event as below;

  # cd /sys/kernel/debug/tracing
  # echo 1 > events/tcp/tcp_probe/enable
  (run workloads)
  # cat trace

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
---
 Changes in v6:
  - Avoid preprocessor directives in tracepoint macro args as
    Mat did on net tree.
---
 include/trace/events/tcp.h |   97 ++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_input.c       |    3 +
 2 files changed, 100 insertions(+)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 8e88a1671538..4dea6342f7d4 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM tcp
 
@@ -8,6 +9,7 @@
 #include <linux/tcp.h>
 #include <linux/tracepoint.h>
 #include <net/ipv6.h>
+#include <net/tcp.h>
 
 /*
  * tcp event with arguments sk and skb
@@ -277,6 +279,101 @@ TRACE_EVENT(tcp_retransmit_synack,
 		  __entry->saddr_v6, __entry->daddr_v6)
 );
 
+
+#define TP_STORE_ADDR_PORTS_V4(__entry, inet, sk)			\
+	do {								\
+		struct sockaddr_in *v4 = (void *)__entry->saddr;	\
+									\
+		v4->sin_family = AF_INET;				\
+		v4->sin_port = inet->inet_sport;			\
+		v4->sin_addr.s_addr = inet->inet_saddr;			\
+		v4 = (void *)__entry->daddr;				\
+		v4->sin_family = AF_INET;				\
+		v4->sin_port = inet->inet_dport;			\
+		v4->sin_addr.s_addr = inet->inet_daddr;			\
+	} while (0)
+
+#if IS_ENABLED(CONFIG_IPV6)
+
+#define TP_STORE_ADDR_PORTS(__entry, inet, sk)				\
+	do {								\
+		if (sk->sk_family == AF_INET6) {			\
+			struct sockaddr_in6 *v6 = (void *)__entry->saddr; \
+									\
+			v6->sin6_family = AF_INET6;			\
+			v6->sin6_port = inet->inet_sport;		\
+			v6->sin6_addr = inet6_sk(sk)->saddr;		\
+			v6 = (void *)__entry->daddr;			\
+			v6->sin6_family = AF_INET6;			\
+			v6->sin6_port = inet->inet_dport;		\
+			v6->sin6_addr = sk->sk_v6_daddr;		\
+		} else							\
+			TP_STORE_ADDR_PORTS_V4(__entry, inet, sk);	\
+	} while (0)
+
+#else
+
+#define TP_STORE_ADDR_PORTS(__entry, inet, sk)		\
+	TP_STORE_ADDR_PORTS_V4(__entry, inet, sk);
+
+#endif
+
+TRACE_EVENT(tcp_probe,
+
+	TP_PROTO(struct sock *sk, struct sk_buff *skb),
+
+	TP_ARGS(sk, skb),
+
+	TP_STRUCT__entry(
+		/* sockaddr_in6 is always bigger than sockaddr_in */
+		__array(__u8, saddr, sizeof(struct sockaddr_in6))
+		__array(__u8, daddr, sizeof(struct sockaddr_in6))
+		__field(__u16, sport)
+		__field(__u16, dport)
+		__field(__u32, mark)
+		__field(__u16, length)
+		__field(__u32, snd_nxt)
+		__field(__u32, snd_una)
+		__field(__u32, snd_cwnd)
+		__field(__u32, ssthresh)
+		__field(__u32, snd_wnd)
+		__field(__u32, srtt)
+		__field(__u32, rcv_wnd)
+	),
+
+	TP_fast_assign(
+		const struct tcp_sock *tp = tcp_sk(sk);
+		const struct inet_sock *inet = inet_sk(sk);
+
+		memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
+		memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
+
+		TP_STORE_ADDR_PORTS(__entry, inet, sk);
+
+		/* For filtering use */
+		__entry->sport = ntohs(inet->inet_sport);
+		__entry->dport = ntohs(inet->inet_dport);
+		__entry->mark = skb->mark;
+
+		__entry->length = skb->len;
+		__entry->snd_nxt = tp->snd_nxt;
+		__entry->snd_una = tp->snd_una;
+		__entry->snd_cwnd = tp->snd_cwnd;
+		__entry->snd_wnd = tp->snd_wnd;
+		__entry->rcv_wnd = tp->rcv_wnd;
+		__entry->ssthresh = tcp_current_ssthresh(sk);
+		__entry->srtt = tp->srtt_us >> 3;
+	),
+
+	TP_printk("src=%pISpc dest=%pISpc mark=%#x length=%d snd_nxt=%#x "
+		  "snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u "
+		  "rcv_wnd=%u",
+		  __entry->saddr, __entry->daddr, __entry->mark,
+		  __entry->length, __entry->snd_nxt, __entry->snd_una,
+		  __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd,
+		  __entry->srtt, __entry->rcv_wnd)
+);
+
 #endif /* _TRACE_TCP_H */
 
 /* This part must be outside protection */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4d55c4b338ee..ff71b18d9682 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5299,6 +5299,9 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 	unsigned int len = skb->len;
 	struct tcp_sock *tp = tcp_sk(sk);
 
+	/* TCP congestion window tracking */
+	trace_tcp_probe(sk, skb);
+
 	tcp_mstamp_refresh(tp);
 	if (unlikely(!sk->sk_rx_dst))
 		inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);

^ permalink raw reply related

* [PATCH net-next v7 2/6] net: tcp: Remove TCP probe module
From: Masami Hiramatsu @ 2017-12-29  2:46 UTC (permalink / raw)
  To: Ingo Molnar, David S . Miller, Ian McDonald, Vlad Yasevich,
	Stephen Hemminger, Steven Rostedt
  Cc: Peter Zijlstra, Thomas Gleixner, LKML, H . Peter Anvin,
	Gerrit Renker, Neil Horman, dccp, netdev, linux-sctp,
	Stephen Rothwell, mhiramat
In-Reply-To: <151451552014.17912.11834170408829155608.stgit@devbox>

Remove TCP probe module since jprobe has been deprecated.
That function is now replaced by tcp/tcp_probe trace-event.
You can use it via ftrace or perftools.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
---
 net/Kconfig          |   17 ---
 net/ipv4/Makefile    |    1 
 net/ipv4/tcp_probe.c |  301 --------------------------------------------------
 3 files changed, 319 deletions(-)
 delete mode 100644 net/ipv4/tcp_probe.c

diff --git a/net/Kconfig b/net/Kconfig
index 9dba2715919d..efe930db3c08 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -336,23 +336,6 @@ config NET_PKTGEN
 	  To compile this code as a module, choose M here: the
 	  module will be called pktgen.
 
-config NET_TCPPROBE
-	tristate "TCP connection probing"
-	depends on INET && PROC_FS && KPROBES
-	---help---
-	This module allows for capturing the changes to TCP connection
-	state in response to incoming packets. It is used for debugging
-	TCP congestion avoidance modules. If you don't understand
-	what was just said, you don't need it: say N.
-
-	Documentation on how to use TCP connection probing can be found
-	at:
-	
-	  http://www.linuxfoundation.org/collaborate/workgroups/networking/tcpprobe
-
-	To compile this code as a module, choose M here: the
-	module will be called tcp_probe.
-
 config NET_DROP_MONITOR
 	tristate "Network packet drop alerting service"
 	depends on INET && TRACEPOINTS
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index c6c8ad1d4b6d..47a0a6649a9d 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -43,7 +43,6 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o
 obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
 obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
 obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
-obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
 obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
deleted file mode 100644
index 697f4c67b2e3..000000000000
--- a/net/ipv4/tcp_probe.c
+++ /dev/null
@@ -1,301 +0,0 @@
-/*
- * tcpprobe - Observe the TCP flow with kprobes.
- *
- * The idea for this came from Werner Almesberger's umlsim
- * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/kprobes.h>
-#include <linux/socket.h>
-#include <linux/tcp.h>
-#include <linux/slab.h>
-#include <linux/proc_fs.h>
-#include <linux/module.h>
-#include <linux/ktime.h>
-#include <linux/time.h>
-#include <net/net_namespace.h>
-
-#include <net/tcp.h>
-
-MODULE_AUTHOR("Stephen Hemminger <shemminger@linux-foundation.org>");
-MODULE_DESCRIPTION("TCP cwnd snooper");
-MODULE_LICENSE("GPL");
-MODULE_VERSION("1.1");
-
-static int port __read_mostly;
-MODULE_PARM_DESC(port, "Port to match (0=all)");
-module_param(port, int, 0);
-
-static unsigned int bufsize __read_mostly = 4096;
-MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)");
-module_param(bufsize, uint, 0);
-
-static unsigned int fwmark __read_mostly;
-MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)");
-module_param(fwmark, uint, 0);
-
-static int full __read_mostly;
-MODULE_PARM_DESC(full, "Full log (1=every ack packet received,  0=only cwnd changes)");
-module_param(full, int, 0);
-
-static const char procname[] = "tcpprobe";
-
-struct tcp_log {
-	ktime_t tstamp;
-	union {
-		struct sockaddr		raw;
-		struct sockaddr_in	v4;
-		struct sockaddr_in6	v6;
-	}	src, dst;
-	u16	length;
-	u32	snd_nxt;
-	u32	snd_una;
-	u32	snd_wnd;
-	u32	rcv_wnd;
-	u32	snd_cwnd;
-	u32	ssthresh;
-	u32	srtt;
-};
-
-static struct {
-	spinlock_t	lock;
-	wait_queue_head_t wait;
-	ktime_t		start;
-	u32		lastcwnd;
-
-	unsigned long	head, tail;
-	struct tcp_log	*log;
-} tcp_probe;
-
-static inline int tcp_probe_used(void)
-{
-	return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1);
-}
-
-static inline int tcp_probe_avail(void)
-{
-	return bufsize - tcp_probe_used() - 1;
-}
-
-#define tcp_probe_copy_fl_to_si4(inet, si4, mem)		\
-	do {							\
-		si4.sin_family = AF_INET;			\
-		si4.sin_port = inet->inet_##mem##port;		\
-		si4.sin_addr.s_addr = inet->inet_##mem##addr;	\
-	} while (0)						\
-
-/*
- * Hook inserted to be called before each receive packet.
- * Note: arguments must match tcp_rcv_established()!
- */
-static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
-				 const struct tcphdr *th)
-{
-	unsigned int len = skb->len;
-	const struct tcp_sock *tp = tcp_sk(sk);
-	const struct inet_sock *inet = inet_sk(sk);
-
-	/* Only update if port or skb mark matches */
-	if (((port == 0 && fwmark == 0) ||
-	     ntohs(inet->inet_dport) == port ||
-	     ntohs(inet->inet_sport) == port ||
-	     (fwmark > 0 && skb->mark == fwmark)) &&
-	    (full || tp->snd_cwnd != tcp_probe.lastcwnd)) {
-
-		spin_lock(&tcp_probe.lock);
-		/* If log fills, just silently drop */
-		if (tcp_probe_avail() > 1) {
-			struct tcp_log *p = tcp_probe.log + tcp_probe.head;
-
-			p->tstamp = ktime_get();
-			switch (sk->sk_family) {
-			case AF_INET:
-				tcp_probe_copy_fl_to_si4(inet, p->src.v4, s);
-				tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d);
-				break;
-			case AF_INET6:
-				memset(&p->src.v6, 0, sizeof(p->src.v6));
-				memset(&p->dst.v6, 0, sizeof(p->dst.v6));
-#if IS_ENABLED(CONFIG_IPV6)
-				p->src.v6.sin6_family = AF_INET6;
-				p->src.v6.sin6_port = inet->inet_sport;
-				p->src.v6.sin6_addr = inet6_sk(sk)->saddr;
-
-				p->dst.v6.sin6_family = AF_INET6;
-				p->dst.v6.sin6_port = inet->inet_dport;
-				p->dst.v6.sin6_addr = sk->sk_v6_daddr;
-#endif
-				break;
-			default:
-				BUG();
-			}
-
-			p->length = len;
-			p->snd_nxt = tp->snd_nxt;
-			p->snd_una = tp->snd_una;
-			p->snd_cwnd = tp->snd_cwnd;
-			p->snd_wnd = tp->snd_wnd;
-			p->rcv_wnd = tp->rcv_wnd;
-			p->ssthresh = tcp_current_ssthresh(sk);
-			p->srtt = tp->srtt_us >> 3;
-
-			tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1);
-		}
-		tcp_probe.lastcwnd = tp->snd_cwnd;
-		spin_unlock(&tcp_probe.lock);
-
-		wake_up(&tcp_probe.wait);
-	}
-
-	jprobe_return();
-}
-
-static struct jprobe tcp_jprobe = {
-	.kp = {
-		.symbol_name	= "tcp_rcv_established",
-	},
-	.entry	= jtcp_rcv_established,
-};
-
-static int tcpprobe_open(struct inode *inode, struct file *file)
-{
-	/* Reset (empty) log */
-	spin_lock_bh(&tcp_probe.lock);
-	tcp_probe.head = tcp_probe.tail = 0;
-	tcp_probe.start = ktime_get();
-	spin_unlock_bh(&tcp_probe.lock);
-
-	return 0;
-}
-
-static int tcpprobe_sprint(char *tbuf, int n)
-{
-	const struct tcp_log *p
-		= tcp_probe.log + tcp_probe.tail;
-	struct timespec64 ts
-		= ktime_to_timespec64(ktime_sub(p->tstamp, tcp_probe.start));
-
-	return scnprintf(tbuf, n,
-			"%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n",
-			(unsigned long)ts.tv_sec,
-			(unsigned long)ts.tv_nsec,
-			&p->src, &p->dst, p->length, p->snd_nxt, p->snd_una,
-			p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd);
-}
-
-static ssize_t tcpprobe_read(struct file *file, char __user *buf,
-			     size_t len, loff_t *ppos)
-{
-	int error = 0;
-	size_t cnt = 0;
-
-	if (!buf)
-		return -EINVAL;
-
-	while (cnt < len) {
-		char tbuf[256];
-		int width;
-
-		/* Wait for data in buffer */
-		error = wait_event_interruptible(tcp_probe.wait,
-						 tcp_probe_used() > 0);
-		if (error)
-			break;
-
-		spin_lock_bh(&tcp_probe.lock);
-		if (tcp_probe.head == tcp_probe.tail) {
-			/* multiple readers race? */
-			spin_unlock_bh(&tcp_probe.lock);
-			continue;
-		}
-
-		width = tcpprobe_sprint(tbuf, sizeof(tbuf));
-
-		if (cnt + width < len)
-			tcp_probe.tail = (tcp_probe.tail + 1) & (bufsize - 1);
-
-		spin_unlock_bh(&tcp_probe.lock);
-
-		/* if record greater than space available
-		   return partial buffer (so far) */
-		if (cnt + width >= len)
-			break;
-
-		if (copy_to_user(buf + cnt, tbuf, width))
-			return -EFAULT;
-		cnt += width;
-	}
-
-	return cnt == 0 ? error : cnt;
-}
-
-static const struct file_operations tcpprobe_fops = {
-	.owner	 = THIS_MODULE,
-	.open	 = tcpprobe_open,
-	.read    = tcpprobe_read,
-	.llseek  = noop_llseek,
-};
-
-static __init int tcpprobe_init(void)
-{
-	int ret = -ENOMEM;
-
-	/* Warning: if the function signature of tcp_rcv_established,
-	 * has been changed, you also have to change the signature of
-	 * jtcp_rcv_established, otherwise you end up right here!
-	 */
-	BUILD_BUG_ON(__same_type(tcp_rcv_established,
-				 jtcp_rcv_established) == 0);
-
-	init_waitqueue_head(&tcp_probe.wait);
-	spin_lock_init(&tcp_probe.lock);
-
-	if (bufsize == 0)
-		return -EINVAL;
-
-	bufsize = roundup_pow_of_two(bufsize);
-	tcp_probe.log = kcalloc(bufsize, sizeof(struct tcp_log), GFP_KERNEL);
-	if (!tcp_probe.log)
-		goto err0;
-
-	if (!proc_create(procname, S_IRUSR, init_net.proc_net, &tcpprobe_fops))
-		goto err0;
-
-	ret = register_jprobe(&tcp_jprobe);
-	if (ret)
-		goto err1;
-
-	pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n",
-		port, fwmark, bufsize);
-	return 0;
- err1:
-	remove_proc_entry(procname, init_net.proc_net);
- err0:
-	kfree(tcp_probe.log);
-	return ret;
-}
-module_init(tcpprobe_init);
-
-static __exit void tcpprobe_exit(void)
-{
-	remove_proc_entry(procname, init_net.proc_net);
-	unregister_jprobe(&tcp_jprobe);
-	kfree(tcp_probe.log);
-}
-module_exit(tcpprobe_exit);

^ permalink raw reply related

* [PATCH net-next v7 3/6] net: sctp: Add SCTP ACK tracking trace event
From: Masami Hiramatsu @ 2017-12-29  2:46 UTC (permalink / raw)
  To: Ingo Molnar, David S . Miller, Ian McDonald, Vlad Yasevich,
	Stephen Hemminger, Steven Rostedt
  Cc: Peter Zijlstra, Thomas Gleixner, LKML, H . Peter Anvin,
	Gerrit Renker, Neil Horman, dccp, netdev, linux-sctp,
	Stephen Rothwell, mhiramat
In-Reply-To: <151451552014.17912.11834170408829155608.stgit@devbox>

Add SCTP ACK tracking trace event to trace the changes of SCTP
association state in response to incoming packets.
It is used for debugging SCTP congestion control algorithms,
and will replace sctp_probe module.

Note that this event is a bit tricky. Since it consists of 2
events (sctp_probe and sctp_probe_path), you have to enable
both events as below.

  # cd /sys/kernel/debug/tracing
  # echo 1 > events/sctp/sctp_probe/enable
  # echo 1 > events/sctp/sctp_probe_path/enable

Or, you can enable all the events under sctp.

  # echo 1 > events/sctp/enable

Since sctp_probe_path event is always invoked from sctp_probe
event, you can not see any output if you only enable
sctp_probe_path.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
---
  Changes in v3:
   - Add checking whether sctp_probe_path event is enabled
     before iterating sctp paths to record. Thanks Steven.
  Changes in v4:
   - Move a temporary variable definition into the block.
   - Fix to cast pointer to unsigned long instead of __u64
     for 32bit environment.
---
 include/trace/events/sctp.h |   99 +++++++++++++++++++++++++++++++++++++++++++
 net/sctp/sm_statefuns.c     |    5 ++
 2 files changed, 104 insertions(+)
 create mode 100644 include/trace/events/sctp.h

diff --git a/include/trace/events/sctp.h b/include/trace/events/sctp.h
new file mode 100644
index 000000000000..7475c7be165a
--- /dev/null
+++ b/include/trace/events/sctp.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM sctp
+
+#if !defined(_TRACE_SCTP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SCTP_H
+
+#include <net/sctp/structs.h>
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(sctp_probe_path,
+
+	TP_PROTO(struct sctp_transport *sp,
+		 const struct sctp_association *asoc),
+
+	TP_ARGS(sp, asoc),
+
+	TP_STRUCT__entry(
+		__field(__u64, asoc)
+		__field(__u32, primary)
+		__array(__u8, ipaddr, sizeof(union sctp_addr))
+		__field(__u32, state)
+		__field(__u32, cwnd)
+		__field(__u32, ssthresh)
+		__field(__u32, flight_size)
+		__field(__u32, partial_bytes_acked)
+		__field(__u32, pathmtu)
+	),
+
+	TP_fast_assign(
+		__entry->asoc = (unsigned long)asoc;
+		__entry->primary = (sp == asoc->peer.primary_path);
+		memcpy(__entry->ipaddr, &sp->ipaddr, sizeof(union sctp_addr));
+		__entry->state = sp->state;
+		__entry->cwnd = sp->cwnd;
+		__entry->ssthresh = sp->ssthresh;
+		__entry->flight_size = sp->flight_size;
+		__entry->partial_bytes_acked = sp->partial_bytes_acked;
+		__entry->pathmtu = sp->pathmtu;
+	),
+
+	TP_printk("asoc=%#llx%s ipaddr=%pISpc state=%u cwnd=%u ssthresh=%u "
+		  "flight_size=%u partial_bytes_acked=%u pathmtu=%u",
+		  __entry->asoc, __entry->primary ? "(*)" : "",
+		  __entry->ipaddr, __entry->state, __entry->cwnd,
+		  __entry->ssthresh, __entry->flight_size,
+		  __entry->partial_bytes_acked, __entry->pathmtu)
+);
+
+TRACE_EVENT(sctp_probe,
+
+	TP_PROTO(const struct sctp_endpoint *ep,
+		 const struct sctp_association *asoc,
+		 struct sctp_chunk *chunk),
+
+	TP_ARGS(ep, asoc, chunk),
+
+	TP_STRUCT__entry(
+		__field(__u64, asoc)
+		__field(__u32, mark)
+		__field(__u16, bind_port)
+		__field(__u16, peer_port)
+		__field(__u32, pathmtu)
+		__field(__u32, rwnd)
+		__field(__u16, unack_data)
+	),
+
+	TP_fast_assign(
+		struct sk_buff *skb = chunk->skb;
+
+		__entry->asoc = (unsigned long)asoc;
+		__entry->mark = skb->mark;
+		__entry->bind_port = ep->base.bind_addr.port;
+		__entry->peer_port = asoc->peer.port;
+		__entry->pathmtu = asoc->pathmtu;
+		__entry->rwnd = asoc->peer.rwnd;
+		__entry->unack_data = asoc->unack_data;
+
+		if (trace_sctp_probe_path_enabled()) {
+			struct sctp_transport *sp;
+
+			list_for_each_entry(sp, &asoc->peer.transport_addr_list,
+					    transports) {
+				trace_sctp_probe_path(sp, asoc);
+			}
+		}
+	),
+
+	TP_printk("asoc=%#llx mark=%#x bind_port=%d peer_port=%d pathmtu=%d "
+		  "rwnd=%u unack_data=%d",
+		  __entry->asoc, __entry->mark, __entry->bind_port,
+		  __entry->peer_port, __entry->pathmtu, __entry->rwnd,
+		  __entry->unack_data)
+);
+
+#endif /* _TRACE_SCTP_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 541f34735346..eb7905ffe5f2 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -59,6 +59,9 @@
 #include <net/sctp/sm.h>
 #include <net/sctp/structs.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/sctp.h>
+
 static struct sctp_packet *sctp_abort_pkt_new(
 					struct net *net,
 					const struct sctp_endpoint *ep,
@@ -3219,6 +3222,8 @@ enum sctp_disposition sctp_sf_eat_sack_6_2(struct net *net,
 	struct sctp_sackhdr *sackh;
 	__u32 ctsn;
 
+	trace_sctp_probe(ep, asoc, chunk);
+
 	if (!sctp_vtag_verify(chunk, asoc))
 		return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 

^ permalink raw reply related

* [PATCH net-next v7 4/6] net: sctp: Remove debug SCTP probe module
From: Masami Hiramatsu @ 2017-12-29  2:47 UTC (permalink / raw)
  To: Ingo Molnar, David S . Miller, Ian McDonald, Vlad Yasevich,
	Stephen Hemminger, Steven Rostedt
  Cc: Peter Zijlstra, Thomas Gleixner, LKML, H . Peter Anvin,
	Gerrit Renker, Neil Horman, dccp, netdev, linux-sctp,
	Stephen Rothwell, mhiramat
In-Reply-To: <151451552014.17912.11834170408829155608.stgit@devbox>

Remove SCTP probe module since jprobe has been deprecated.
That function is now replaced by sctp/sctp_probe and
sctp/sctp_probe_path trace-events.
You can use it via ftrace or perftools.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
---
 net/sctp/Kconfig  |   12 ---
 net/sctp/Makefile |    3 -
 net/sctp/probe.c  |  244 -----------------------------------------------------
 3 files changed, 259 deletions(-)
 delete mode 100644 net/sctp/probe.c

diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig
index d9c04dc1b3f3..c740b189d4ba 100644
--- a/net/sctp/Kconfig
+++ b/net/sctp/Kconfig
@@ -37,18 +37,6 @@ menuconfig IP_SCTP
 
 if IP_SCTP
 
-config NET_SCTPPROBE
-	tristate "SCTP: Association probing"
-        depends on PROC_FS && KPROBES
-        ---help---
-        This module allows for capturing the changes to SCTP association
-        state in response to incoming packets. It is used for debugging
-        SCTP congestion control algorithms. If you don't understand
-        what was just said, you don't need it: say N.
-
-        To compile this code as a module, choose M here: the
-        module will be called sctp_probe.
-
 config SCTP_DBG_OBJCNT
 	bool "SCTP: Debug object counts"
 	depends on PROC_FS
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index 54bd9c1a8aa1..6776582ec449 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -4,7 +4,6 @@
 #
 
 obj-$(CONFIG_IP_SCTP) += sctp.o
-obj-$(CONFIG_NET_SCTPPROBE) += sctp_probe.o
 obj-$(CONFIG_INET_SCTP_DIAG) += sctp_diag.o
 
 sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
@@ -16,8 +15,6 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
 	  offload.o stream_sched.o stream_sched_prio.o \
 	  stream_sched_rr.o stream_interleave.o
 
-sctp_probe-y := probe.o
-
 sctp-$(CONFIG_SCTP_DBG_OBJCNT) += objcnt.o
 sctp-$(CONFIG_PROC_FS) += proc.o
 sctp-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/net/sctp/probe.c b/net/sctp/probe.c
deleted file mode 100644
index 1280f85a598d..000000000000
--- a/net/sctp/probe.c
+++ /dev/null
@@ -1,244 +0,0 @@
-/*
- * sctp_probe - Observe the SCTP flow with kprobes.
- *
- * The idea for this came from Werner Almesberger's umlsim
- * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
- *
- * Modified for SCTP from Stephen Hemminger's code
- * Copyright (C) 2010, Wei Yongjun <yjwei@cn.fujitsu.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/kprobes.h>
-#include <linux/socket.h>
-#include <linux/sctp.h>
-#include <linux/proc_fs.h>
-#include <linux/vmalloc.h>
-#include <linux/module.h>
-#include <linux/kfifo.h>
-#include <linux/time.h>
-#include <net/net_namespace.h>
-
-#include <net/sctp/sctp.h>
-#include <net/sctp/sm.h>
-
-MODULE_SOFTDEP("pre: sctp");
-MODULE_AUTHOR("Wei Yongjun <yjwei@cn.fujitsu.com>");
-MODULE_DESCRIPTION("SCTP snooper");
-MODULE_LICENSE("GPL");
-
-static int port __read_mostly = 0;
-MODULE_PARM_DESC(port, "Port to match (0=all)");
-module_param(port, int, 0);
-
-static unsigned int fwmark __read_mostly = 0;
-MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)");
-module_param(fwmark, uint, 0);
-
-static int bufsize __read_mostly = 64 * 1024;
-MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)");
-module_param(bufsize, int, 0);
-
-static int full __read_mostly = 1;
-MODULE_PARM_DESC(full, "Full log (1=every ack packet received,  0=only cwnd changes)");
-module_param(full, int, 0);
-
-static const char procname[] = "sctpprobe";
-
-static struct {
-	struct kfifo	  fifo;
-	spinlock_t	  lock;
-	wait_queue_head_t wait;
-	struct timespec64 tstart;
-} sctpw;
-
-static __printf(1, 2) void printl(const char *fmt, ...)
-{
-	va_list args;
-	int len;
-	char tbuf[256];
-
-	va_start(args, fmt);
-	len = vscnprintf(tbuf, sizeof(tbuf), fmt, args);
-	va_end(args);
-
-	kfifo_in_locked(&sctpw.fifo, tbuf, len, &sctpw.lock);
-	wake_up(&sctpw.wait);
-}
-
-static int sctpprobe_open(struct inode *inode, struct file *file)
-{
-	kfifo_reset(&sctpw.fifo);
-	ktime_get_ts64(&sctpw.tstart);
-
-	return 0;
-}
-
-static ssize_t sctpprobe_read(struct file *file, char __user *buf,
-			      size_t len, loff_t *ppos)
-{
-	int error = 0, cnt = 0;
-	unsigned char *tbuf;
-
-	if (!buf)
-		return -EINVAL;
-
-	if (len == 0)
-		return 0;
-
-	tbuf = vmalloc(len);
-	if (!tbuf)
-		return -ENOMEM;
-
-	error = wait_event_interruptible(sctpw.wait,
-					 kfifo_len(&sctpw.fifo) != 0);
-	if (error)
-		goto out_free;
-
-	cnt = kfifo_out_locked(&sctpw.fifo, tbuf, len, &sctpw.lock);
-	error = copy_to_user(buf, tbuf, cnt) ? -EFAULT : 0;
-
-out_free:
-	vfree(tbuf);
-
-	return error ? error : cnt;
-}
-
-static const struct file_operations sctpprobe_fops = {
-	.owner	= THIS_MODULE,
-	.open	= sctpprobe_open,
-	.read	= sctpprobe_read,
-	.llseek = noop_llseek,
-};
-
-static enum sctp_disposition jsctp_sf_eat_sack(
-					struct net *net,
-					const struct sctp_endpoint *ep,
-					const struct sctp_association *asoc,
-					const union sctp_subtype type,
-					void *arg,
-					struct sctp_cmd_seq *commands)
-{
-	struct sctp_chunk *chunk = arg;
-	struct sk_buff *skb = chunk->skb;
-	struct sctp_transport *sp;
-	static __u32 lcwnd = 0;
-	struct timespec64 now;
-
-	sp = asoc->peer.primary_path;
-
-	if (((port == 0 && fwmark == 0) ||
-	     asoc->peer.port == port ||
-	     ep->base.bind_addr.port == port ||
-	     (fwmark > 0 && skb->mark == fwmark)) &&
-	    (full || sp->cwnd != lcwnd)) {
-		lcwnd = sp->cwnd;
-
-		ktime_get_ts64(&now);
-		now = timespec64_sub(now, sctpw.tstart);
-
-		printl("%lu.%06lu ", (unsigned long) now.tv_sec,
-		       (unsigned long) now.tv_nsec / NSEC_PER_USEC);
-
-		printl("%p %5d %5d %5d %8d %5d ", asoc,
-		       ep->base.bind_addr.port, asoc->peer.port,
-		       asoc->pathmtu, asoc->peer.rwnd, asoc->unack_data);
-
-		list_for_each_entry(sp, &asoc->peer.transport_addr_list,
-					transports) {
-			if (sp == asoc->peer.primary_path)
-				printl("*");
-
-			printl("%pISc %2u %8u %8u %8u %8u %8u ",
-			       &sp->ipaddr, sp->state, sp->cwnd, sp->ssthresh,
-			       sp->flight_size, sp->partial_bytes_acked,
-			       sp->pathmtu);
-		}
-		printl("\n");
-	}
-
-	jprobe_return();
-	return 0;
-}
-
-static struct jprobe sctp_recv_probe = {
-	.kp	= {
-		.symbol_name = "sctp_sf_eat_sack_6_2",
-	},
-	.entry	= jsctp_sf_eat_sack,
-};
-
-static __init int sctp_setup_jprobe(void)
-{
-	int ret = register_jprobe(&sctp_recv_probe);
-
-	if (ret) {
-		if (request_module("sctp"))
-			goto out;
-		ret = register_jprobe(&sctp_recv_probe);
-	}
-
-out:
-	return ret;
-}
-
-static __init int sctpprobe_init(void)
-{
-	int ret = -ENOMEM;
-
-	/* Warning: if the function signature of sctp_sf_eat_sack_6_2,
-	 * has been changed, you also have to change the signature of
-	 * jsctp_sf_eat_sack, otherwise you end up right here!
-	 */
-	BUILD_BUG_ON(__same_type(sctp_sf_eat_sack_6_2,
-				 jsctp_sf_eat_sack) == 0);
-
-	init_waitqueue_head(&sctpw.wait);
-	spin_lock_init(&sctpw.lock);
-	if (kfifo_alloc(&sctpw.fifo, bufsize, GFP_KERNEL))
-		return ret;
-
-	if (!proc_create(procname, S_IRUSR, init_net.proc_net,
-			 &sctpprobe_fops))
-		goto free_kfifo;
-
-	ret = sctp_setup_jprobe();
-	if (ret)
-		goto remove_proc;
-
-	pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n",
-		port, fwmark, bufsize);
-	return 0;
-
-remove_proc:
-	remove_proc_entry(procname, init_net.proc_net);
-free_kfifo:
-	kfifo_free(&sctpw.fifo);
-	return ret;
-}
-
-static __exit void sctpprobe_exit(void)
-{
-	kfifo_free(&sctpw.fifo);
-	remove_proc_entry(procname, init_net.proc_net);
-	unregister_jprobe(&sctp_recv_probe);
-}
-
-module_init(sctpprobe_init);
-module_exit(sctpprobe_exit);

^ permalink raw reply related

* [PATCH net-next v7 5/6] net: dccp: Add DCCP sendmsg trace event
From: Masami Hiramatsu @ 2017-12-29  2:47 UTC (permalink / raw)
  To: Ingo Molnar, David S . Miller, Ian McDonald, Vlad Yasevich,
	Stephen Hemminger, Steven Rostedt
  Cc: Peter Zijlstra, Thomas Gleixner, LKML, H . Peter Anvin,
	Gerrit Renker, Neil Horman, dccp, netdev, linux-sctp,
	Stephen Rothwell, mhiramat
In-Reply-To: <151451552014.17912.11834170408829155608.stgit@devbox>

Add DCCP sendmsg trace event (dccp/dccp_probe) for
replacing dccpprobe. User can trace this event via
ftrace or perftools.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
---
  Changes in v5:
   - Fix to add local directory to include for trace.h.
     Thanks Steven!
  Changes in v7:
   - Avoid preprocessor directives in tracepoint macro args
     by sharing TP_STORE_ADDR_PORTS() macro with tcp.h.
---
 include/trace/events/net_probe_common.h |   44 ++++++++++++++++
 include/trace/events/tcp.h              |   39 --------------
 net/dccp/Makefile                       |    3 +
 net/dccp/proto.c                        |    5 ++
 net/dccp/trace.h                        |   84 +++++++++++++++++++++++++++++++
 5 files changed, 137 insertions(+), 38 deletions(-)
 create mode 100644 include/trace/events/net_probe_common.h
 create mode 100644 net/dccp/trace.h

diff --git a/include/trace/events/net_probe_common.h b/include/trace/events/net_probe_common.h
new file mode 100644
index 000000000000..3930119cab08
--- /dev/null
+++ b/include/trace/events/net_probe_common.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#if !defined(_TRACE_NET_PROBE_COMMON_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NET_PROBE_COMMON_H
+
+#define TP_STORE_ADDR_PORTS_V4(__entry, inet, sk)			\
+	do {								\
+		struct sockaddr_in *v4 = (void *)__entry->saddr;	\
+									\
+		v4->sin_family = AF_INET;				\
+		v4->sin_port = inet->inet_sport;			\
+		v4->sin_addr.s_addr = inet->inet_saddr;			\
+		v4 = (void *)__entry->daddr;				\
+		v4->sin_family = AF_INET;				\
+		v4->sin_port = inet->inet_dport;			\
+		v4->sin_addr.s_addr = inet->inet_daddr;			\
+	} while (0)
+
+#if IS_ENABLED(CONFIG_IPV6)
+
+#define TP_STORE_ADDR_PORTS(__entry, inet, sk)				\
+	do {								\
+		if (sk->sk_family == AF_INET6) {			\
+			struct sockaddr_in6 *v6 = (void *)__entry->saddr; \
+									\
+			v6->sin6_family = AF_INET6;			\
+			v6->sin6_port = inet->inet_sport;		\
+			v6->sin6_addr = inet6_sk(sk)->saddr;		\
+			v6 = (void *)__entry->daddr;			\
+			v6->sin6_family = AF_INET6;			\
+			v6->sin6_port = inet->inet_dport;		\
+			v6->sin6_addr = sk->sk_v6_daddr;		\
+		} else							\
+			TP_STORE_ADDR_PORTS_V4(__entry, inet, sk);	\
+	} while (0)
+
+#else
+
+#define TP_STORE_ADDR_PORTS(__entry, inet, sk)		\
+	TP_STORE_ADDR_PORTS_V4(__entry, inet, sk);
+
+#endif
+
+#endif
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 4dea6342f7d4..1501ca91814f 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -279,44 +279,7 @@ TRACE_EVENT(tcp_retransmit_synack,
 		  __entry->saddr_v6, __entry->daddr_v6)
 );
 
-
-#define TP_STORE_ADDR_PORTS_V4(__entry, inet, sk)			\
-	do {								\
-		struct sockaddr_in *v4 = (void *)__entry->saddr;	\
-									\
-		v4->sin_family = AF_INET;				\
-		v4->sin_port = inet->inet_sport;			\
-		v4->sin_addr.s_addr = inet->inet_saddr;			\
-		v4 = (void *)__entry->daddr;				\
-		v4->sin_family = AF_INET;				\
-		v4->sin_port = inet->inet_dport;			\
-		v4->sin_addr.s_addr = inet->inet_daddr;			\
-	} while (0)
-
-#if IS_ENABLED(CONFIG_IPV6)
-
-#define TP_STORE_ADDR_PORTS(__entry, inet, sk)				\
-	do {								\
-		if (sk->sk_family == AF_INET6) {			\
-			struct sockaddr_in6 *v6 = (void *)__entry->saddr; \
-									\
-			v6->sin6_family = AF_INET6;			\
-			v6->sin6_port = inet->inet_sport;		\
-			v6->sin6_addr = inet6_sk(sk)->saddr;		\
-			v6 = (void *)__entry->daddr;			\
-			v6->sin6_family = AF_INET6;			\
-			v6->sin6_port = inet->inet_dport;		\
-			v6->sin6_addr = sk->sk_v6_daddr;		\
-		} else							\
-			TP_STORE_ADDR_PORTS_V4(__entry, inet, sk);	\
-	} while (0)
-
-#else
-
-#define TP_STORE_ADDR_PORTS(__entry, inet, sk)		\
-	TP_STORE_ADDR_PORTS_V4(__entry, inet, sk);
-
-#endif
+#include <trace/events/net_probe_common.h>
 
 TRACE_EVENT(tcp_probe,
 
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index 2e7b56097bc4..4215f13a63af 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -27,3 +27,6 @@ dccp-$(CONFIG_SYSCTL) += sysctl.o
 
 dccp_diag-y := diag.o
 dccp_probe-y := probe.o
+
+# build with local directory for trace.h
+CFLAGS_proto.o := -I$(src)
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 7a75a1d3568b..fa7e92e08920 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -38,6 +38,9 @@
 #include "dccp.h"
 #include "feat.h"
 
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
 DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
 
 EXPORT_SYMBOL_GPL(dccp_statistics);
@@ -761,6 +764,8 @@ int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	int rc, size;
 	long timeo;
 
+	trace_dccp_probe(sk, len);
+
 	if (len > dp->dccps_mss_cache)
 		return -EMSGSIZE;
 
diff --git a/net/dccp/trace.h b/net/dccp/trace.h
new file mode 100644
index 000000000000..5062421beee9
--- /dev/null
+++ b/net/dccp/trace.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM dccp
+
+#if !defined(_TRACE_DCCP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_DCCP_H
+
+#include <net/sock.h>
+#include "dccp.h"
+#include "ccids/ccid3.h"
+#include <linux/tracepoint.h>
+#include <trace/events/net_probe_common.h>
+
+TRACE_EVENT(dccp_probe,
+
+	TP_PROTO(struct sock *sk, size_t size),
+
+	TP_ARGS(sk, size),
+
+	TP_STRUCT__entry(
+		/* sockaddr_in6 is always bigger than sockaddr_in */
+		__array(__u8, saddr, sizeof(struct sockaddr_in6))
+		__array(__u8, daddr, sizeof(struct sockaddr_in6))
+		__field(__u16, sport)
+		__field(__u16, dport)
+		__field(__u16, size)
+		__field(__u16, tx_s)
+		__field(__u32, tx_rtt)
+		__field(__u32, tx_p)
+		__field(__u32, tx_x_calc)
+		__field(__u64, tx_x_recv)
+		__field(__u64, tx_x)
+		__field(__u32, tx_t_ipi)
+	),
+
+	TP_fast_assign(
+		const struct inet_sock *inet = inet_sk(sk);
+		struct ccid3_hc_tx_sock *hc = NULL;
+
+		if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3)
+			hc = ccid3_hc_tx_sk(sk);
+
+		memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
+		memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
+
+		TP_STORE_ADDR_PORTS(__entry, inet, sk);
+
+		/* For filtering use */
+		__entry->sport = ntohs(inet->inet_sport);
+		__entry->dport = ntohs(inet->inet_dport);
+
+		__entry->size = size;
+		if (hc) {
+			__entry->tx_s = hc->tx_s;
+			__entry->tx_rtt = hc->tx_rtt;
+			__entry->tx_p = hc->tx_p;
+			__entry->tx_x_calc = hc->tx_x_calc;
+			__entry->tx_x_recv = hc->tx_x_recv >> 6;
+			__entry->tx_x = hc->tx_x >> 6;
+			__entry->tx_t_ipi = hc->tx_t_ipi;
+		} else {
+			__entry->tx_s = 0;
+			memset(&__entry->tx_rtt, 0, (void *)&__entry->tx_t_ipi -
+			       (void *)&__entry->tx_rtt +
+			       sizeof(__entry->tx_t_ipi));
+		}
+	),
+
+	TP_printk("src=%pISpc dest=%pISpc size=%d tx_s=%d tx_rtt=%d "
+		  "tx_p=%d tx_x_calc=%u tx_x_recv=%llu tx_x=%llu tx_t_ipi=%d",
+		  __entry->saddr, __entry->daddr, __entry->size,
+		  __entry->tx_s, __entry->tx_rtt, __entry->tx_p,
+		  __entry->tx_x_calc, __entry->tx_x_recv, __entry->tx_x,
+		  __entry->tx_t_ipi)
+);
+
+#endif /* _TRACE_DCCP_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>

^ permalink raw reply related

* [PATCH net-next v7 6/6] net: dccp: Remove dccpprobe module
From: Masami Hiramatsu @ 2017-12-29  2:48 UTC (permalink / raw)
  To: Ingo Molnar, David S . Miller, Ian McDonald, Vlad Yasevich,
	Stephen Hemminger, Steven Rostedt
  Cc: Peter Zijlstra, Thomas Gleixner, LKML, H . Peter Anvin,
	Gerrit Renker, Neil Horman, dccp, netdev, linux-sctp,
	Stephen Rothwell, mhiramat
In-Reply-To: <151451552014.17912.11834170408829155608.stgit@devbox>

Remove the DCCP probe module since jprobe has been deprecated.
Its functionality is now provided by the dccp/dccp_probe trace event,
which you can use via ftrace or perftools.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
---
 Changes in v5:
  - Fix a conflict with previous change in Makefile.
---
 net/dccp/Kconfig  |   17 ----
 net/dccp/Makefile |    2 -
 net/dccp/probe.c  |  203 -----------------------------------------------------
 3 files changed, 222 deletions(-)
 delete mode 100644 net/dccp/probe.c

diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
index 8c0ef71bed2f..b270e84d9c13 100644
--- a/net/dccp/Kconfig
+++ b/net/dccp/Kconfig
@@ -39,23 +39,6 @@ config IP_DCCP_DEBUG
 
 	  Just say N.
 
-config NET_DCCPPROBE
-	tristate "DCCP connection probing"
-	depends on PROC_FS && KPROBES
-	---help---
-	This module allows for capturing the changes to DCCP connection
-	state in response to incoming packets. It is used for debugging
-	DCCP congestion avoidance modules. If you don't understand
-	what was just said, you don't need it: say N.
-
-	Documentation on how to use DCCP connection probing can be found
-	at:
-	
-	  http://www.linuxfoundation.org/collaborate/workgroups/networking/dccpprobe
-
-	To compile this code as a module, choose M here: the
-	module will be called dccp_probe.
-
 
 endmenu
 
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index 4215f13a63af..5b4ff37bc806 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -21,12 +21,10 @@ obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o
 dccp_ipv6-y := ipv6.o
 
 obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o
-obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o
 
 dccp-$(CONFIG_SYSCTL) += sysctl.o
 
 dccp_diag-y := diag.o
-dccp_probe-y := probe.o
 
 # build with local directory for trace.h
 CFLAGS_proto.o := -I$(src)
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
deleted file mode 100644
index 3d3fda05b32d..000000000000
--- a/net/dccp/probe.c
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- * dccp_probe - Observe the DCCP flow with kprobes.
- *
- * The idea for this came from Werner Almesberger's umlsim
- * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
- *
- * Modified for DCCP from Stephen Hemminger's code
- * Copyright (C) 2006, Ian McDonald <ian.mcdonald@jandi.co.nz>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <linux/kernel.h>
-#include <linux/kprobes.h>
-#include <linux/socket.h>
-#include <linux/dccp.h>
-#include <linux/proc_fs.h>
-#include <linux/module.h>
-#include <linux/kfifo.h>
-#include <linux/vmalloc.h>
-#include <linux/time64.h>
-#include <linux/gfp.h>
-#include <net/net_namespace.h>
-
-#include "dccp.h"
-#include "ccid.h"
-#include "ccids/ccid3.h"
-
-static int port;
-
-static int bufsize = 64 * 1024;
-
-static const char procname[] = "dccpprobe";
-
-static struct {
-	struct kfifo	  fifo;
-	spinlock_t	  lock;
-	wait_queue_head_t wait;
-	struct timespec64 tstart;
-} dccpw;
-
-static void printl(const char *fmt, ...)
-{
-	va_list args;
-	int len;
-	struct timespec64 now;
-	char tbuf[256];
-
-	va_start(args, fmt);
-	getnstimeofday64(&now);
-
-	now = timespec64_sub(now, dccpw.tstart);
-
-	len = sprintf(tbuf, "%lu.%06lu ",
-		      (unsigned long) now.tv_sec,
-		      (unsigned long) now.tv_nsec / NSEC_PER_USEC);
-	len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args);
-	va_end(args);
-
-	kfifo_in_locked(&dccpw.fifo, tbuf, len, &dccpw.lock);
-	wake_up(&dccpw.wait);
-}
-
-static int jdccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
-{
-	const struct inet_sock *inet = inet_sk(sk);
-	struct ccid3_hc_tx_sock *hc = NULL;
-
-	if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3)
-		hc = ccid3_hc_tx_sk(sk);
-
-	if (port == 0 || ntohs(inet->inet_dport) == port ||
-	    ntohs(inet->inet_sport) == port) {
-		if (hc)
-			printl("%pI4:%u %pI4:%u %d %d %d %d %u %llu %llu %d\n",
-			       &inet->inet_saddr, ntohs(inet->inet_sport),
-			       &inet->inet_daddr, ntohs(inet->inet_dport), size,
-			       hc->tx_s, hc->tx_rtt, hc->tx_p,
-			       hc->tx_x_calc, hc->tx_x_recv >> 6,
-			       hc->tx_x >> 6, hc->tx_t_ipi);
-		else
-			printl("%pI4:%u %pI4:%u %d\n",
-			       &inet->inet_saddr, ntohs(inet->inet_sport),
-			       &inet->inet_daddr, ntohs(inet->inet_dport),
-			       size);
-	}
-
-	jprobe_return();
-	return 0;
-}
-
-static struct jprobe dccp_send_probe = {
-	.kp	= {
-		.symbol_name = "dccp_sendmsg",
-	},
-	.entry	= jdccp_sendmsg,
-};
-
-static int dccpprobe_open(struct inode *inode, struct file *file)
-{
-	kfifo_reset(&dccpw.fifo);
-	getnstimeofday64(&dccpw.tstart);
-	return 0;
-}
-
-static ssize_t dccpprobe_read(struct file *file, char __user *buf,
-			      size_t len, loff_t *ppos)
-{
-	int error = 0, cnt = 0;
-	unsigned char *tbuf;
-
-	if (!buf)
-		return -EINVAL;
-
-	if (len == 0)
-		return 0;
-
-	tbuf = vmalloc(len);
-	if (!tbuf)
-		return -ENOMEM;
-
-	error = wait_event_interruptible(dccpw.wait,
-					 kfifo_len(&dccpw.fifo) != 0);
-	if (error)
-		goto out_free;
-
-	cnt = kfifo_out_locked(&dccpw.fifo, tbuf, len, &dccpw.lock);
-	error = copy_to_user(buf, tbuf, cnt) ? -EFAULT : 0;
-
-out_free:
-	vfree(tbuf);
-
-	return error ? error : cnt;
-}
-
-static const struct file_operations dccpprobe_fops = {
-	.owner	 = THIS_MODULE,
-	.open	 = dccpprobe_open,
-	.read    = dccpprobe_read,
-	.llseek  = noop_llseek,
-};
-
-static __init int dccpprobe_init(void)
-{
-	int ret = -ENOMEM;
-
-	init_waitqueue_head(&dccpw.wait);
-	spin_lock_init(&dccpw.lock);
-	if (kfifo_alloc(&dccpw.fifo, bufsize, GFP_KERNEL))
-		return ret;
-	if (!proc_create(procname, S_IRUSR, init_net.proc_net, &dccpprobe_fops))
-		goto err0;
-
-	ret = register_jprobe(&dccp_send_probe);
-	if (ret) {
-		ret = request_module("dccp");
-		if (!ret)
-			ret = register_jprobe(&dccp_send_probe);
-	}
-
-	if (ret)
-		goto err1;
-
-	pr_info("DCCP watch registered (port=%d)\n", port);
-	return 0;
-err1:
-	remove_proc_entry(procname, init_net.proc_net);
-err0:
-	kfifo_free(&dccpw.fifo);
-	return ret;
-}
-module_init(dccpprobe_init);
-
-static __exit void dccpprobe_exit(void)
-{
-	kfifo_free(&dccpw.fifo);
-	remove_proc_entry(procname, init_net.proc_net);
-	unregister_jprobe(&dccp_send_probe);
-
-}
-module_exit(dccpprobe_exit);
-
-MODULE_PARM_DESC(port, "Port to match (0=all)");
-module_param(port, int, 0);
-
-MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)");
-module_param(bufsize, int, 0);
-
-MODULE_AUTHOR("Ian McDonald <ian.mcdonald@jandi.co.nz>");
-MODULE_DESCRIPTION("DCCP snooper");
-MODULE_LICENSE("GPL");

^ permalink raw reply related

* Re: [PATCH net-next] virtio_net: implement VIRTIO_CONFIG_S_NEEDS_RESET
From: Jason Wang @ 2017-12-29  3:10 UTC (permalink / raw)
  To: Willem de Bruijn, Michael S. Tsirkin
  Cc: Network Development, David Miller, virtualization,
	Willem de Bruijn
In-Reply-To: <CAF=yD-+z_3Pmsa=6_j4Yzt1QxfgbVwNsDoKLc-2rb7rH63p1WQ@mail.gmail.com>



On 2017年12月29日 03:11, Willem de Bruijn wrote:
> On Mon, Oct 16, 2017 at 11:44 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
>> On Tue, Oct 17, 2017 at 11:05:07AM +0800, Jason Wang wrote:
>>>
>>> On 2017年10月17日 06:34, Willem de Bruijn wrote:
>>>> On Mon, Oct 16, 2017 at 12:38 PM, Michael S. Tsirkin <mst@redhat.com> wrote:
>>>>> On Mon, Oct 16, 2017 at 12:04:57PM -0400, Willem de Bruijn wrote:
>>>>>> On Mon, Oct 16, 2017 at 11:31 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
>>>>>>> On Mon, Oct 16, 2017 at 11:03:18AM -0400, Willem de Bruijn wrote:
>>>>>>>>>> +static int virtnet_reset(struct virtnet_info *vi)
>>>>>>>>>> +{
>>>>>>>>>> +     struct virtio_device *dev = vi->vdev;
>>>>>>>>>> +     int ret;
>>>>>>>>>> +
>>>>>>>>>> +     virtio_config_disable(dev);
>>>>>>>>>> +     dev->failed = dev->config->get_status(dev) & VIRTIO_CONFIG_S_FAILED;
>>>>>>>>>> +     virtnet_freeze_down(dev, true);
>>>>>>>>>> +     remove_vq_common(vi);
>>>>>>>>>> +
>>>>>>>>>> +     virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
>>>>>>>>>> +     virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER);
>>>>>>>>>> +
>>>>>>>>>> +     ret = virtio_finalize_features(dev);
>>>>>>>>>> +     if (ret)
>>>>>>>>>> +             goto err;
>>>>>>>>>> +
>>>>>>>>>> +     ret = virtnet_restore_up(dev);
>>>>>>>>>> +     if (ret)
>>>>>>>>>> +             goto err;
>>>>>>>>>> +
>>>>>>>>>> +     ret = virtnet_set_queues(vi, vi->curr_queue_pairs);
>>>>>>>>>> +     if (ret)
>>>>>>>>>> +             goto err;
>>>>>>>>>> +
>>>>>>>>>> +     virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
>>>>>>>>>> +     virtio_config_enable(dev);
>>>>>>>>>> +     return 0;
>>>>>>>>>> +
>>>>>>>>>> +err:
>>>>>>>>>> +     virtio_add_status(dev, VIRTIO_CONFIG_S_FAILED);
>>>>>>>>>> +     return ret;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>>    static int virtnet_set_guest_offloads(struct virtnet_info *vi, u64 offloads)
>>>>>>>>>>    {
>>>>>>>>>>         struct scatterlist sg;
>>>>>>>>> I have a question here though. How do things like MAC address
>>>>>>>>> get restored?
>>>>>>>>>
>>>>>>>>> What about the rx mode?
>>>>>>>>>
>>>>>>>>> vlans?
>>>>>>>> The function as is releases and reinitializes only ring state.
>>>>>>>> Device configuration such as mac and vlan persist across
>>>>>>>> the reset.
>>>>>>> What gave you this impression? Take a look at e.g. this
>>>>>>> code in qemu:
>>>>>>>
>>>>>>> static void virtio_net_reset(VirtIODevice *vdev)
>>>>>>> {
>>>>>>>       VirtIONet *n = VIRTIO_NET(vdev);
>>>>>>>
>>>>>>>       /* Reset back to compatibility mode */
>>>>>>>       n->promisc = 1;
>>>>>>>       n->allmulti = 0;
>>>>>>>       n->alluni = 0;
>>>>>>>       n->nomulti = 0;
>>>>>>>       n->nouni = 0;
>>>>>>>       n->nobcast = 0;
>>>>>>>       /* multiqueue is disabled by default */
>>>>>>>       n->curr_queues = 1;
>>>>>>>       timer_del(n->announce_timer);
>>>>>>>       n->announce_counter = 0;
>>>>>>>       n->status &= ~VIRTIO_NET_S_ANNOUNCE;
>>>>>>>
>>>>>>>       /* Flush any MAC and VLAN filter table state */
>>>>>>>       n->mac_table.in_use = 0;
>>>>>>>       n->mac_table.first_multi = 0;
>>>>>>>       n->mac_table.multi_overflow = 0;
>>>>>>>       n->mac_table.uni_overflow = 0;
>>>>>>>       memset(n->mac_table.macs, 0, MAC_TABLE_ENTRIES * ETH_ALEN);
>>>>>>>       memcpy(&n->mac[0], &n->nic->conf->macaddr, sizeof(n->mac));
>>>>>>>       qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
>>>>>>>       memset(n->vlans, 0, MAX_VLAN >> 3);
>>>>>>> }
>>>>>>>
>>>>>>> So device seems to lose all state, you have to re-program it.
>>>>>> Oh, indeed! The guest does not reset its state, so it might
>>>>>> be out of sync with the host after the operation. Was this not
>>>>>> an issue when previously resetting in the context of xdp?
>>>>> I suspect it was broken back then, too.
>>>> Okay. I guess that in principle this is all programmable through
>>>> virtnet_set_rx_mode, virtnet_vlan_rx_add_vid, etc. But it's a
>>>> lot more complex than just restoring virtnet_reset. Will need to
>>>> be careful about concurrency issues at the least. Similar to the
>>>> ones you point out below.
>>>>
>>> The problem has been pointed out during developing virtio-net XDP. But it
>>> may not be a big issue since vhost_net ignores all kinds of the filters now.
>>>
>>> Thanks
>> It might not keep doing that in the future though.
>> And virtio-net in userspace doesn't ignore the filters.
> How about the guest honor the request only if no state has been
> offloaded to the host?
>
> This is the common case for vhost_net, and not expected to change
> soon.

FYI, I'm implementing support for using a tun eBPF filter with
virtio-net, so recovering the filter should be considered.

Thanks

>
> Even when it does, we have a graceful degradation strategy: the guest
> reverts state prior to reset and reapplies it. Though for the time being,
> solving this only in the case without state offload would solve my
> use case.

^ permalink raw reply

* Re: [PATCH v6 0/6] Add M_CAN Support for Dra76 platform
From: Yang, Wenyou @ 2017-12-29  3:38 UTC (permalink / raw)
  To: Faiz Abbas, wg, mkl, robh+dt, mark.rutland
  Cc: linux-can, netdev, devicetree, linux-kernel, nsekhar, fcooper,
	robh, sergei.shtylyov
In-Reply-To: <1513949488-13026-1-git-send-email-faiz_abbas@ti.com>



On 2017/12/22 21:31, Faiz Abbas wrote:
> This patch series adds support for M_CAN on the TI Dra76
> platform. Device tree patches will be sent separately.
> A bunch of patches were sent before by
> Franklin Cooper <fcooper@ti.com>. I have clubbed the
> series together and rebased to the latest kernel.
Tested this series on SAMA5D2 Xplained board.

Tested-by: Wenyou Yang <wenyou.yang@microchip.com>

>
> v6 changes:
> Dropped the patches to make hclk optional. Drivers
> which enable hclk as the interface clock using
> pm_runtime calls must still provide a hclk in the
> clocks property.
>
> Support higher speed CAN-FD bitrate:
> The community decided that data sampling point be used
> for the secondary sampling point here
> https://patchwork.kernel.org/patch/9909845/
>
> Franklin S Cooper Jr (6):
>    can: dev: Add support for limiting configured bitrate
>    can: m_can: Add call to of_can_transceiver
>    can: m_can: Add PM Runtime
>    can: m_can: Support higher speed CAN-FD bitrates
>    dt-bindings: can: m_can: Document new can transceiver binding
>    dt-bindings: can: can-transceiver: Document new binding
>
>   .../bindings/net/can/can-transceiver.txt           | 24 +++++++
>   .../devicetree/bindings/net/can/m_can.txt          |  9 +++
>   drivers/net/can/dev.c                              | 39 +++++++++++
>   drivers/net/can/m_can/m_can.c                      | 81 ++++++++++++++++++++--
>   include/linux/can/dev.h                            |  8 +++
>   5 files changed, 156 insertions(+), 5 deletions(-)
>   create mode 100644 Documentation/devicetree/bindings/net/can/can-transceiver.txt
>

Best Regards,
Wenyou Yang

^ permalink raw reply

* [PATCH net-next] cxgb4: Check alignment constraint for T6
From: Ganesh Goudar @ 2017-12-29  7:18 UTC (permalink / raw)
  To: netdev, davem
  Cc: nirranjan, indranil, venkatesh, Ganesh Goudar, Arjun Vynipadath

Update the check for setting IPv4 filters and, in the case of T6,
align filter_id to a multiple of 2 only for IPv6 filters.

Signed-off-by: Arjun Vynipadath <arjun@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
index 5980f30..29178cf 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
@@ -1189,6 +1189,7 @@ int __cxgb4_set_filter(struct net_device *dev, int filter_id,
 		       struct filter_ctx *ctx)
 {
 	struct adapter *adapter = netdev2adap(dev);
+	unsigned int chip_ver = CHELSIO_CHIP_VERSION(adapter->params.chip);
 	unsigned int max_fidx, fidx;
 	struct filter_entry *f;
 	u32 iconf;
@@ -1225,12 +1226,18 @@ int __cxgb4_set_filter(struct net_device *dev, int filter_id,
 	 * insertion.
 	 */
 	if (fs->type == 0) { /* IPv4 */
-		/* If our IPv4 filter isn't being written to a
-		 * multiple of four filter index and there's an IPv6
-		 * filter at the multiple of 4 base slot, then we
-		 * prevent insertion.
+		/* For T6, If our IPv4 filter isn't being written to a
+		 * multiple of two filter index and there's an IPv6
+		 * filter at the multiple of 2 base slot, then we need
+		 * to delete that IPv6 filter ...
+		 * For adapters below T6, IPv6 filter occupies 4 entries.
+		 * Hence we need to delete the filter in multiple of 4 slot.
 		 */
-		fidx = filter_id & ~0x3;
+		if (chip_ver < CHELSIO_T6)
+			fidx = filter_id & ~0x3;
+		else
+			fidx = filter_id & ~0x1;
+
 		if (fidx != filter_id &&
 		    adapter->tids.ftid_tab[fidx].fs.type) {
 			f = &adapter->tids.ftid_tab[fidx];
-- 
2.1.0

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox