* [for-next V3 02/11] net/mlx5: E-Switch, Refactor load/unload of representors
From: Saeed Mahameed @ 2017-12-28 23:23 UTC (permalink / raw)
To: David S. Miller, Doug Ledford
Cc: netdev, linux-rdma, Leon Romanovsky, Mark Bloch, Saeed Mahameed
In-Reply-To: <20171228232314.13678-1-saeedm@mellanox.com>
From: Mark Bloch <markb@mellanox.com>
Refactor the load/unload stages for better code reuse.
Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
.../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 66 +++++++++++++---------
1 file changed, 40 insertions(+), 26 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 7e15854c1087..26fbc50ddc6d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -767,12 +767,47 @@ int esw_offloads_init_reps(struct mlx5_eswitch *esw)
return 0;
}
-int esw_offloads_init(struct mlx5_eswitch *esw, int nvports)
+static void esw_offloads_unload_reps(struct mlx5_eswitch *esw, int nvports)
+{
+ struct mlx5_eswitch_rep *rep;
+ int vport;
+
+ for (vport = nvports - 1; vport >= 0; vport--) {
+ rep = &esw->offloads.vport_reps[vport];
+ if (!rep->valid)
+ continue;
+
+ rep->unload(esw, rep);
+ }
+}
+
+static int esw_offloads_load_reps(struct mlx5_eswitch *esw, int nvports)
{
struct mlx5_eswitch_rep *rep;
int vport;
int err;
+ for (vport = 0; vport < nvports; vport++) {
+ rep = &esw->offloads.vport_reps[vport];
+ if (!rep->valid)
+ continue;
+
+ err = rep->load(esw, rep);
+ if (err)
+ goto err_reps;
+ }
+
+ return 0;
+
+err_reps:
+ esw_offloads_unload_reps(esw, vport);
+ return err;
+}
+
+int esw_offloads_init(struct mlx5_eswitch *esw, int nvports)
+{
+ int err;
+
/* disable PF RoCE so missed packets don't go through RoCE steering */
mlx5_dev_list_lock();
mlx5_remove_dev_by_protocol(esw->dev, MLX5_INTERFACE_PROTOCOL_IB);
@@ -790,25 +825,13 @@ int esw_offloads_init(struct mlx5_eswitch *esw, int nvports)
if (err)
goto create_fg_err;
- for (vport = 0; vport < nvports; vport++) {
- rep = &esw->offloads.vport_reps[vport];
- if (!rep->valid)
- continue;
-
- err = rep->load(esw, rep);
- if (err)
- goto err_reps;
- }
+ err = esw_offloads_load_reps(esw, nvports);
+ if (err)
+ goto err_reps;
return 0;
err_reps:
- for (vport--; vport >= 0; vport--) {
- rep = &esw->offloads.vport_reps[vport];
- if (!rep->valid)
- continue;
- rep->unload(esw, rep);
- }
esw_destroy_vport_rx_group(esw);
create_fg_err:
@@ -849,16 +872,7 @@ static int esw_offloads_stop(struct mlx5_eswitch *esw)
void esw_offloads_cleanup(struct mlx5_eswitch *esw, int nvports)
{
- struct mlx5_eswitch_rep *rep;
- int vport;
-
- for (vport = nvports - 1; vport >= 0; vport--) {
- rep = &esw->offloads.vport_reps[vport];
- if (!rep->valid)
- continue;
- rep->unload(esw, rep);
- }
-
+ esw_offloads_unload_reps(esw, nvports);
esw_destroy_vport_rx_group(esw);
esw_destroy_offloads_table(esw);
esw_destroy_offloads_fdb_tables(esw);
--
2.13.0
^ permalink raw reply related
* [for-next V3 04/11] net/mlx5: E-Switch, Move mlx5e only logic outside E-Switch
From: Saeed Mahameed @ 2017-12-28 23:23 UTC (permalink / raw)
To: David S. Miller, Doug Ledford
Cc: netdev, linux-rdma, Leon Romanovsky, Mark Bloch, Saeed Mahameed
In-Reply-To: <20171228232314.13678-1-saeedm@mellanox.com>
From: Mark Bloch <markb@mellanox.com>
In our pursuit to cleanup e-switch sub-module from mlx5e specific code,
we move the functions that insert/remove the flow steering rules that
allow mlx5e representors to send packets directly to VFs into the EN
driver code.
Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 57 +++++++++++++++++++++-
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 9 ++--
.../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 55 +--------------------
3 files changed, 59 insertions(+), 62 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 6d2219f3acf6..19edaa155062 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -190,6 +190,59 @@ int mlx5e_attr_get(struct net_device *dev, struct switchdev_attr *attr)
return 0;
}
+static void mlx5e_sqs2vport_stop(struct mlx5_eswitch *esw,
+ struct mlx5_eswitch_rep *rep)
+{
+ struct mlx5_esw_sq *esw_sq, *tmp;
+
+ if (esw->mode != SRIOV_OFFLOADS)
+ return;
+
+ list_for_each_entry_safe(esw_sq, tmp, &rep->vport_sqs_list, list) {
+ mlx5_del_flow_rules(esw_sq->send_to_vport_rule);
+ list_del(&esw_sq->list);
+ kfree(esw_sq);
+ }
+}
+
+static int mlx5e_sqs2vport_start(struct mlx5_eswitch *esw,
+ struct mlx5_eswitch_rep *rep,
+ u16 *sqns_array, int sqns_num)
+{
+ struct mlx5_flow_handle *flow_rule;
+ struct mlx5_esw_sq *esw_sq;
+ int err;
+ int i;
+
+ if (esw->mode != SRIOV_OFFLOADS)
+ return 0;
+
+ for (i = 0; i < sqns_num; i++) {
+ esw_sq = kzalloc(sizeof(*esw_sq), GFP_KERNEL);
+ if (!esw_sq) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ /* Add re-inject rule to the PF/representor sqs */
+ flow_rule = mlx5_eswitch_add_send_to_vport_rule(esw,
+ rep->vport,
+ sqns_array[i]);
+ if (IS_ERR(flow_rule)) {
+ err = PTR_ERR(flow_rule);
+ kfree(esw_sq);
+ goto out_err;
+ }
+ esw_sq->send_to_vport_rule = flow_rule;
+ list_add(&esw_sq->list, &rep->vport_sqs_list);
+ }
+ return 0;
+
+out_err:
+ mlx5e_sqs2vport_stop(esw, rep);
+ return err;
+}
+
int mlx5e_add_sqs_fwd_rules(struct mlx5e_priv *priv)
{
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
@@ -210,7 +263,7 @@ int mlx5e_add_sqs_fwd_rules(struct mlx5e_priv *priv)
sqs[num_sqs++] = c->sq[tc].sqn;
}
- err = mlx5_eswitch_sqs2vport_start(esw, rep, sqs, num_sqs);
+ err = mlx5e_sqs2vport_start(esw, rep, sqs, num_sqs);
kfree(sqs);
out:
@@ -225,7 +278,7 @@ void mlx5e_remove_sqs_fwd_rules(struct mlx5e_priv *priv)
struct mlx5e_rep_priv *rpriv = priv->ppriv;
struct mlx5_eswitch_rep *rep = rpriv->rep;
- mlx5_eswitch_sqs2vport_stop(esw, rep);
+ mlx5e_sqs2vport_stop(esw, rep);
}
static void mlx5e_rep_neigh_update_init_interval(struct mlx5e_rep_priv *rpriv)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 23808a65889c..21b506fd2b67 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -222,6 +222,9 @@ int mlx5_eswitch_get_vport_config(struct mlx5_eswitch *esw,
int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw,
int vport,
struct ifla_vf_stats *vf_stats);
+struct mlx5_flow_handle *
+mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, int vport,
+ u32 sqn);
struct mlx5_flow_spec;
struct mlx5_esw_flow_attr;
@@ -258,12 +261,6 @@ struct mlx5_esw_flow_attr {
struct mlx5e_tc_flow_parse_attr *parse_attr;
};
-int mlx5_eswitch_sqs2vport_start(struct mlx5_eswitch *esw,
- struct mlx5_eswitch_rep *rep,
- u16 *sqns_array, int sqns_num);
-void mlx5_eswitch_sqs2vport_stop(struct mlx5_eswitch *esw,
- struct mlx5_eswitch_rep *rep);
-
int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode);
int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode);
int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index aa20f51c0a99..90a30c51d92e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -302,7 +302,7 @@ int mlx5_eswitch_del_vlan_action(struct mlx5_eswitch *esw,
return err;
}
-static struct mlx5_flow_handle *
+struct mlx5_flow_handle *
mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, int vport, u32 sqn)
{
struct mlx5_flow_act flow_act = {0};
@@ -339,59 +339,6 @@ mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, int vport, u32 sqn
return flow_rule;
}
-void mlx5_eswitch_sqs2vport_stop(struct mlx5_eswitch *esw,
- struct mlx5_eswitch_rep *rep)
-{
- struct mlx5_esw_sq *esw_sq, *tmp;
-
- if (esw->mode != SRIOV_OFFLOADS)
- return;
-
- list_for_each_entry_safe(esw_sq, tmp, &rep->vport_sqs_list, list) {
- mlx5_del_flow_rules(esw_sq->send_to_vport_rule);
- list_del(&esw_sq->list);
- kfree(esw_sq);
- }
-}
-
-int mlx5_eswitch_sqs2vport_start(struct mlx5_eswitch *esw,
- struct mlx5_eswitch_rep *rep,
- u16 *sqns_array, int sqns_num)
-{
- struct mlx5_flow_handle *flow_rule;
- struct mlx5_esw_sq *esw_sq;
- int err;
- int i;
-
- if (esw->mode != SRIOV_OFFLOADS)
- return 0;
-
- for (i = 0; i < sqns_num; i++) {
- esw_sq = kzalloc(sizeof(*esw_sq), GFP_KERNEL);
- if (!esw_sq) {
- err = -ENOMEM;
- goto out_err;
- }
-
- /* Add re-inject rule to the PF/representor sqs */
- flow_rule = mlx5_eswitch_add_send_to_vport_rule(esw,
- rep->vport,
- sqns_array[i]);
- if (IS_ERR(flow_rule)) {
- err = PTR_ERR(flow_rule);
- kfree(esw_sq);
- goto out_err;
- }
- esw_sq->send_to_vport_rule = flow_rule;
- list_add(&esw_sq->list, &rep->vport_sqs_list);
- }
- return 0;
-
-out_err:
- mlx5_eswitch_sqs2vport_stop(esw, rep);
- return err;
-}
-
static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw)
{
struct mlx5_flow_act flow_act = {0};
--
2.13.0
^ permalink raw reply related
* [for-next V3 06/11] net/mlx5e: Move ethernet representors data into separate struct
From: Saeed Mahameed @ 2017-12-28 23:23 UTC (permalink / raw)
To: David S. Miller, Doug Ledford
Cc: netdev, linux-rdma, Leon Romanovsky, Mark Bloch, Saeed Mahameed
In-Reply-To: <20171228232314.13678-1-saeedm@mellanox.com>
From: Mark Bloch <markb@mellanox.com>
Ethernet representors have a need to store data which is applicable
only for them. Create a priv void pointer in struct mlx5_eswitch_rep
and move mlx5e to store the relevant data there. As part of this change
we also initialize rep_if in mlx5e_rep_register_vf_vports() as otherwise the
E-Switch code will copy a priv value which is garbage.
We also rename mlx5_eswitch_get_uplink_netdev() to
mlx5_eswitch_get_uplink_priv() and make it return void *.
This way E-Switch code doesn't need to deal with net devices and
we leave the task of getting it to mlx5e.
Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 58 ++++++++++++++--------
drivers/net/ethernet/mellanox/mlx5/core/en_rep.h | 9 ++++
drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 14 ++++--
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 7 +--
.../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 7 ++-
5 files changed, 60 insertions(+), 35 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 01bf4e3c8afa..3c74f0599ad3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -194,11 +194,13 @@ static void mlx5e_sqs2vport_stop(struct mlx5_eswitch *esw,
struct mlx5_eswitch_rep *rep)
{
struct mlx5_esw_sq *esw_sq, *tmp;
+ struct mlx5e_rep_priv *rpriv;
if (esw->mode != SRIOV_OFFLOADS)
return;
- list_for_each_entry_safe(esw_sq, tmp, &rep->vport_sqs_list, list) {
+ rpriv = mlx5e_rep_to_rep_priv(rep);
+ list_for_each_entry_safe(esw_sq, tmp, &rpriv->vport_sqs_list, list) {
mlx5_eswitch_del_send_to_vport_rule(esw_sq->send_to_vport_rule);
list_del(&esw_sq->list);
kfree(esw_sq);
@@ -210,6 +212,7 @@ static int mlx5e_sqs2vport_start(struct mlx5_eswitch *esw,
u16 *sqns_array, int sqns_num)
{
struct mlx5_flow_handle *flow_rule;
+ struct mlx5e_rep_priv *rpriv;
struct mlx5_esw_sq *esw_sq;
int err;
int i;
@@ -217,6 +220,7 @@ static int mlx5e_sqs2vport_start(struct mlx5_eswitch *esw,
if (esw->mode != SRIOV_OFFLOADS)
return 0;
+ rpriv = mlx5e_rep_to_rep_priv(rep);
for (i = 0; i < sqns_num; i++) {
esw_sq = kzalloc(sizeof(*esw_sq), GFP_KERNEL);
if (!esw_sq) {
@@ -234,7 +238,7 @@ static int mlx5e_sqs2vport_start(struct mlx5_eswitch *esw,
goto out_err;
}
esw_sq->send_to_vport_rule = flow_rule;
- list_add(&esw_sq->list, &rep->vport_sqs_list);
+ list_add(&esw_sq->list, &rpriv->vport_sqs_list);
}
return 0;
@@ -291,7 +295,7 @@ static void mlx5e_rep_neigh_update_init_interval(struct mlx5e_rep_priv *rpriv)
#endif
unsigned long ipv4_interval = NEIGH_VAR(&arp_tbl.parms,
DELAY_PROBE_TIME);
- struct net_device *netdev = rpriv->rep->netdev;
+ struct net_device *netdev = rpriv->netdev;
struct mlx5e_priv *priv = netdev_priv(netdev);
rpriv->neigh_update.min_interval = min_t(unsigned long, ipv6_interval, ipv4_interval);
@@ -312,7 +316,7 @@ static void mlx5e_rep_neigh_stats_work(struct work_struct *work)
{
struct mlx5e_rep_priv *rpriv = container_of(work, struct mlx5e_rep_priv,
neigh_update.neigh_stats_work.work);
- struct net_device *netdev = rpriv->rep->netdev;
+ struct net_device *netdev = rpriv->netdev;
struct mlx5e_priv *priv = netdev_priv(netdev);
struct mlx5e_neigh_hash_entry *nhe;
@@ -408,7 +412,7 @@ static int mlx5e_rep_netevent_event(struct notifier_block *nb,
struct mlx5e_rep_priv *rpriv = container_of(nb, struct mlx5e_rep_priv,
neigh_update.netevent_nb);
struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
- struct net_device *netdev = rpriv->rep->netdev;
+ struct net_device *netdev = rpriv->netdev;
struct mlx5e_priv *priv = netdev_priv(netdev);
struct mlx5e_neigh_hash_entry *nhe = NULL;
struct mlx5e_neigh m_neigh = {};
@@ -536,7 +540,7 @@ static int mlx5e_rep_neigh_init(struct mlx5e_rep_priv *rpriv)
static void mlx5e_rep_neigh_cleanup(struct mlx5e_rep_priv *rpriv)
{
struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update;
- struct mlx5e_priv *priv = netdev_priv(rpriv->rep->netdev);
+ struct mlx5e_priv *priv = netdev_priv(rpriv->netdev);
unregister_netevent_notifier(&neigh_update->netevent_nb);
@@ -957,7 +961,7 @@ static int mlx5e_init_rep_rx(struct mlx5e_priv *priv)
err = PTR_ERR(flow_rule);
goto err_destroy_direct_tirs;
}
- rep->vport_rx_rule = flow_rule;
+ rpriv->vport_rx_rule = flow_rule;
err = mlx5e_tc_init(priv);
if (err)
@@ -966,7 +970,7 @@ static int mlx5e_init_rep_rx(struct mlx5e_priv *priv)
return 0;
err_del_flow_rule:
- mlx5_del_flow_rules(rep->vport_rx_rule);
+ mlx5_del_flow_rules(rpriv->vport_rx_rule);
err_destroy_direct_tirs:
mlx5e_destroy_direct_tirs(priv);
err_destroy_direct_rqts:
@@ -977,10 +981,9 @@ static int mlx5e_init_rep_rx(struct mlx5e_priv *priv)
static void mlx5e_cleanup_rep_rx(struct mlx5e_priv *priv)
{
struct mlx5e_rep_priv *rpriv = priv->ppriv;
- struct mlx5_eswitch_rep *rep = rpriv->rep;
mlx5e_tc_cleanup(priv);
- mlx5_del_flow_rules(rep->vport_rx_rule);
+ mlx5_del_flow_rules(rpriv->vport_rx_rule);
mlx5e_destroy_direct_tirs(priv);
mlx5e_destroy_direct_rqts(priv);
}
@@ -1022,8 +1025,8 @@ static const struct mlx5e_profile mlx5e_rep_profile = {
static int
mlx5e_nic_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
{
- struct mlx5e_priv *priv = netdev_priv(rep->netdev);
- struct mlx5e_rep_priv *rpriv = priv->ppriv;
+ struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep);
+ struct mlx5e_priv *priv = netdev_priv(rpriv->netdev);
int err;
@@ -1047,8 +1050,8 @@ mlx5e_nic_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
static void
mlx5e_nic_rep_unload(struct mlx5_eswitch_rep *rep)
{
- struct mlx5e_priv *priv = netdev_priv(rep->netdev);
- struct mlx5e_rep_priv *rpriv = priv->ppriv;
+ struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep);
+ struct mlx5e_priv *priv = netdev_priv(rpriv->netdev);
if (test_bit(MLX5E_STATE_OPENED, &priv->state))
mlx5e_remove_sqs_fwd_rules(priv);
@@ -1063,6 +1066,7 @@ mlx5e_nic_rep_unload(struct mlx5_eswitch_rep *rep)
static int
mlx5e_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
{
+ struct mlx5e_rep_priv *uplink_rpriv;
struct mlx5e_rep_priv *rpriv;
struct net_device *netdev;
struct mlx5e_priv *upriv;
@@ -1080,8 +1084,10 @@ mlx5e_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
return -EINVAL;
}
- rep->netdev = netdev;
+ rpriv->netdev = netdev;
rpriv->rep = rep;
+ rep->priv = rpriv;
+ INIT_LIST_HEAD(&rpriv->vport_sqs_list);
err = mlx5e_attach_netdev(netdev_priv(netdev));
if (err) {
@@ -1097,7 +1103,8 @@ mlx5e_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
goto err_detach_netdev;
}
- upriv = netdev_priv(mlx5_eswitch_get_uplink_netdev(dev->priv.eswitch));
+ uplink_rpriv = mlx5_eswitch_get_uplink_priv(dev->priv.eswitch);
+ upriv = netdev_priv(uplink_rpriv->netdev);
err = tc_setup_cb_egdev_register(netdev, mlx5e_setup_tc_block_cb,
upriv);
if (err)
@@ -1131,14 +1138,16 @@ mlx5e_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
static void
mlx5e_vport_rep_unload(struct mlx5_eswitch_rep *rep)
{
- struct net_device *netdev = rep->netdev;
+ struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep);
+ struct net_device *netdev = rpriv->netdev;
struct mlx5e_priv *priv = netdev_priv(netdev);
- struct mlx5e_rep_priv *rpriv = priv->ppriv;
+ struct mlx5e_rep_priv *uplink_rpriv;
void *ppriv = priv->ppriv;
struct mlx5e_priv *upriv;
- unregister_netdev(rep->netdev);
- upriv = netdev_priv(mlx5_eswitch_get_uplink_netdev(priv->mdev->priv.eswitch));
+ unregister_netdev(netdev);
+ uplink_rpriv = mlx5_eswitch_get_uplink_priv(priv->mdev->priv.eswitch);
+ upriv = netdev_priv(uplink_rpriv->netdev);
tc_setup_cb_egdev_unregister(netdev, mlx5e_setup_tc_block_cb,
upriv);
mlx5e_rep_neigh_cleanup(rpriv);
@@ -1155,7 +1164,7 @@ static void mlx5e_rep_register_vf_vports(struct mlx5e_priv *priv)
int vport;
for (vport = 1; vport < total_vfs; vport++) {
- struct mlx5_eswitch_rep rep;
+ struct mlx5_eswitch_rep rep = {};
rep.load = mlx5e_vport_rep_load;
rep.unload = mlx5e_vport_rep_unload;
@@ -1178,11 +1187,16 @@ void mlx5e_register_vport_reps(struct mlx5e_priv *priv)
{
struct mlx5_core_dev *mdev = priv->mdev;
struct mlx5_eswitch *esw = mdev->priv.eswitch;
+ struct mlx5e_rep_priv *rpriv;
struct mlx5_eswitch_rep rep;
+ rpriv = priv->ppriv;
+ rpriv->netdev = priv->netdev;
+
rep.load = mlx5e_nic_rep_load;
rep.unload = mlx5e_nic_rep_unload;
- rep.netdev = priv->netdev;
+ rep.priv = rpriv;
+ INIT_LIST_HEAD(&rpriv->vport_sqs_list);
mlx5_eswitch_register_vport_rep(esw, 0, &rep); /* UPLINK PF vport*/
mlx5e_rep_register_vf_vports(priv); /* VFs vports */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
index 5659ed9f51e6..8db68369367e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
@@ -56,8 +56,17 @@ struct mlx5e_neigh_update_table {
struct mlx5e_rep_priv {
struct mlx5_eswitch_rep *rep;
struct mlx5e_neigh_update_table neigh_update;
+ struct net_device *netdev;
+ struct mlx5_flow_handle *vport_rx_rule;
+ struct list_head vport_sqs_list;
};
+static inline
+struct mlx5e_rep_priv *mlx5e_rep_to_rep_priv(struct mlx5_eswitch_rep *rep)
+{
+ return (struct mlx5e_rep_priv *)rep->priv;
+}
+
struct mlx5e_neigh {
struct net_device *dev;
union {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 55979ec2e88a..f462496cce7a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -617,7 +617,8 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv,
FLOW_DISSECTOR_KEY_ENC_PORTS,
f->mask);
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
- struct net_device *up_dev = mlx5_eswitch_get_uplink_netdev(esw);
+ struct mlx5e_rep_priv *uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw);
+ struct net_device *up_dev = uplink_rpriv->netdev;
struct mlx5e_priv *up_priv = netdev_priv(up_dev);
/* Full udp dst port must be given */
@@ -1507,6 +1508,7 @@ static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv,
int *out_ttl)
{
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+ struct mlx5e_rep_priv *uplink_rpriv;
struct rtable *rt;
struct neighbour *n = NULL;
@@ -1520,9 +1522,10 @@ static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv,
#else
return -EOPNOTSUPP;
#endif
+ uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw);
/* if the egress device isn't on the same HW e-switch, we use the uplink */
if (!switchdev_port_same_parent_id(priv->netdev, rt->dst.dev))
- *out_dev = mlx5_eswitch_get_uplink_netdev(esw);
+ *out_dev = uplink_rpriv->netdev;
else
*out_dev = rt->dst.dev;
@@ -1543,6 +1546,7 @@ static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv,
struct neighbour **out_n,
int *out_ttl)
{
+ struct mlx5e_rep_priv *uplink_rpriv;
struct neighbour *n = NULL;
struct dst_entry *dst;
@@ -1557,9 +1561,10 @@ static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv,
*out_ttl = ip6_dst_hoplimit(dst);
+ uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw);
/* if the egress device isn't on the same HW e-switch, we use the uplink */
if (!switchdev_port_same_parent_id(priv->netdev, dst->dev))
- *out_dev = mlx5_eswitch_get_uplink_netdev(esw);
+ *out_dev = uplink_rpriv->netdev;
else
*out_dev = dst->dev;
#else
@@ -1859,7 +1864,8 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
struct mlx5e_tc_flow *flow)
{
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
- struct net_device *up_dev = mlx5_eswitch_get_uplink_netdev(esw);
+ struct mlx5e_rep_priv *uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw);
+ struct net_device *up_dev = uplink_rpriv->netdev;
unsigned short family = ip_tunnel_info_af(tun_info);
struct mlx5e_priv *up_priv = netdev_priv(up_dev);
struct mlx5_esw_flow_attr *attr = flow->esw_attr;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 9ed401225225..3a21ea4e4d24 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -142,12 +142,9 @@ struct mlx5_eswitch_rep {
int (*load)(struct mlx5_core_dev *dev,
struct mlx5_eswitch_rep *rep);
void (*unload)(struct mlx5_eswitch_rep *rep);
+ void *priv;
u16 vport;
u8 hw_id[ETH_ALEN];
- struct net_device *netdev;
-
- struct mlx5_flow_handle *vport_rx_rule;
- struct list_head vport_sqs_list;
u16 vlan;
u32 vlan_refcount;
bool valid;
@@ -274,7 +271,7 @@ void mlx5_eswitch_register_vport_rep(struct mlx5_eswitch *esw,
struct mlx5_eswitch_rep *rep);
void mlx5_eswitch_unregister_vport_rep(struct mlx5_eswitch *esw,
int vport_index);
-struct net_device *mlx5_eswitch_get_uplink_netdev(struct mlx5_eswitch *esw);
+void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw);
int mlx5_eswitch_add_vlan_action(struct mlx5_eswitch *esw,
struct mlx5_esw_flow_attr *attr);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 121609b823c6..07f26c1986fc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -1130,9 +1130,8 @@ void mlx5_eswitch_register_vport_rep(struct mlx5_eswitch *esw,
rep->load = __rep->load;
rep->unload = __rep->unload;
- rep->netdev = __rep->netdev;
+ rep->priv = __rep->priv;
- INIT_LIST_HEAD(&rep->vport_sqs_list);
rep->valid = true;
}
@@ -1150,12 +1149,12 @@ void mlx5_eswitch_unregister_vport_rep(struct mlx5_eswitch *esw,
rep->valid = false;
}
-struct net_device *mlx5_eswitch_get_uplink_netdev(struct mlx5_eswitch *esw)
+void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw)
{
#define UPLINK_REP_INDEX 0
struct mlx5_esw_offload *offloads = &esw->offloads;
struct mlx5_eswitch_rep *rep;
rep = &offloads->vport_reps[UPLINK_REP_INDEX];
- return rep->netdev;
+ return rep->priv;
}
--
2.13.0
^ permalink raw reply related
* [for-next V3 05/11] net/mlx5: E-Switch, Create a dedicated send to vport rule deletion function
From: Saeed Mahameed @ 2017-12-28 23:23 UTC (permalink / raw)
To: David S. Miller, Doug Ledford
Cc: netdev, linux-rdma, Leon Romanovsky, Mark Bloch, Saeed Mahameed
In-Reply-To: <20171228232314.13678-1-saeedm@mellanox.com>
From: Mark Bloch <markb@mellanox.com>
In order for representors to send packets directly to VFs we use an
E-Switch function which insert special rules into the HW. For symmetry
create an E-Switch function that deletes these rules as well.
Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 2 +-
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 1 +
drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 5 +++++
3 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 19edaa155062..01bf4e3c8afa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -199,7 +199,7 @@ static void mlx5e_sqs2vport_stop(struct mlx5_eswitch *esw,
return;
list_for_each_entry_safe(esw_sq, tmp, &rep->vport_sqs_list, list) {
- mlx5_del_flow_rules(esw_sq->send_to_vport_rule);
+ mlx5_eswitch_del_send_to_vport_rule(esw_sq->send_to_vport_rule);
list_del(&esw_sq->list);
kfree(esw_sq);
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 21b506fd2b67..9ed401225225 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -225,6 +225,7 @@ int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw,
struct mlx5_flow_handle *
mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, int vport,
u32 sqn);
+void mlx5_eswitch_del_send_to_vport_rule(struct mlx5_flow_handle *rule);
struct mlx5_flow_spec;
struct mlx5_esw_flow_attr;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 90a30c51d92e..121609b823c6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -339,6 +339,11 @@ mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, int vport, u32 sqn
return flow_rule;
}
+void mlx5_eswitch_del_send_to_vport_rule(struct mlx5_flow_handle *rule)
+{
+ mlx5_del_flow_rules(rule);
+}
+
static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw)
{
struct mlx5_flow_act flow_act = {0};
--
2.13.0
^ permalink raw reply related
* [for-next V3 08/11] net/mlx5e: E-Switch, Move send-to-vport rule struct to en_rep
From: Saeed Mahameed @ 2017-12-28 23:23 UTC (permalink / raw)
To: David S. Miller, Doug Ledford
Cc: netdev, linux-rdma, Leon Romanovsky, Mark Bloch, Saeed Mahameed
In-Reply-To: <20171228232314.13678-1-saeedm@mellanox.com>
From: Mark Bloch <markb@mellanox.com>
Move struct mlx5_esw_sq which keeps send-to-vport rule to from the eswitch
code to mlx5e and rename it to better reflect where it belongs
Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 22 +++++++++++-----------
drivers/net/ethernet/mellanox/mlx5/core/en_rep.h | 5 +++++
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 5 -----
3 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 5b2b673c0b13..c6a77f8e99a4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -193,17 +193,17 @@ int mlx5e_attr_get(struct net_device *dev, struct switchdev_attr *attr)
static void mlx5e_sqs2vport_stop(struct mlx5_eswitch *esw,
struct mlx5_eswitch_rep *rep)
{
- struct mlx5_esw_sq *esw_sq, *tmp;
+ struct mlx5e_rep_sq *rep_sq, *tmp;
struct mlx5e_rep_priv *rpriv;
if (esw->mode != SRIOV_OFFLOADS)
return;
rpriv = mlx5e_rep_to_rep_priv(rep);
- list_for_each_entry_safe(esw_sq, tmp, &rpriv->vport_sqs_list, list) {
- mlx5_eswitch_del_send_to_vport_rule(esw_sq->send_to_vport_rule);
- list_del(&esw_sq->list);
- kfree(esw_sq);
+ list_for_each_entry_safe(rep_sq, tmp, &rpriv->vport_sqs_list, list) {
+ mlx5_eswitch_del_send_to_vport_rule(rep_sq->send_to_vport_rule);
+ list_del(&rep_sq->list);
+ kfree(rep_sq);
}
}
@@ -213,7 +213,7 @@ static int mlx5e_sqs2vport_start(struct mlx5_eswitch *esw,
{
struct mlx5_flow_handle *flow_rule;
struct mlx5e_rep_priv *rpriv;
- struct mlx5_esw_sq *esw_sq;
+ struct mlx5e_rep_sq *rep_sq;
int err;
int i;
@@ -222,8 +222,8 @@ static int mlx5e_sqs2vport_start(struct mlx5_eswitch *esw,
rpriv = mlx5e_rep_to_rep_priv(rep);
for (i = 0; i < sqns_num; i++) {
- esw_sq = kzalloc(sizeof(*esw_sq), GFP_KERNEL);
- if (!esw_sq) {
+ rep_sq = kzalloc(sizeof(*rep_sq), GFP_KERNEL);
+ if (!rep_sq) {
err = -ENOMEM;
goto out_err;
}
@@ -234,11 +234,11 @@ static int mlx5e_sqs2vport_start(struct mlx5_eswitch *esw,
sqns_array[i]);
if (IS_ERR(flow_rule)) {
err = PTR_ERR(flow_rule);
- kfree(esw_sq);
+ kfree(rep_sq);
goto out_err;
}
- esw_sq->send_to_vport_rule = flow_rule;
- list_add(&esw_sq->list, &rpriv->vport_sqs_list);
+ rep_sq->send_to_vport_rule = flow_rule;
+ list_add(&rep_sq->list, &rpriv->vport_sqs_list);
}
return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
index e4473a9ebd50..b9b481f2833a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
@@ -133,6 +133,11 @@ struct mlx5e_encap_entry {
int encap_size;
};
+struct mlx5e_rep_sq {
+ struct mlx5_flow_handle *send_to_vport_rule;
+ struct list_head list;
+};
+
void *mlx5e_alloc_nic_rep_priv(struct mlx5_core_dev *mdev);
void mlx5e_register_vport_reps(struct mlx5e_priv *priv);
void mlx5e_unregister_vport_reps(struct mlx5e_priv *priv);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 91175965df7f..3b481182f13a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -138,11 +138,6 @@ struct mlx5_eswitch_fdb {
};
};
-struct mlx5_esw_sq {
- struct mlx5_flow_handle *send_to_vport_rule;
- struct list_head list;
-};
-
struct mlx5_eswitch_rep;
struct mlx5_eswitch_rep_if {
int (*load)(struct mlx5_core_dev *dev,
--
2.13.0
^ permalink raw reply related
* [for-next V3 11/11] net/mlx5: Separate ingress/egress namespaces for each vport
From: Saeed Mahameed @ 2017-12-28 23:23 UTC (permalink / raw)
To: David S. Miller, Doug Ledford
Cc: netdev, linux-rdma, Leon Romanovsky, Gal Pressman, Saeed Mahameed
In-Reply-To: <20171228232314.13678-1-saeedm@mellanox.com>
From: Gal Pressman <galp@mellanox.com>
Each vport has its own root flow table for the ACL flow tables and root
flow table is per namespace, therefore we should create a namespace for
each vport.
Fixes: efdc810ba39d ("net/mlx5: Flow steering, Add vport ACL support")
Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 10 +-
drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 145 ++++++++++++++++++----
drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 4 +-
include/linux/mlx5/fs.h | 4 +
4 files changed, 133 insertions(+), 30 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index cdf65ed8714c..7649e36653d9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -867,9 +867,10 @@ static int esw_vport_enable_egress_acl(struct mlx5_eswitch *esw,
esw_debug(dev, "Create vport[%d] egress ACL log_max_size(%d)\n",
vport->vport, MLX5_CAP_ESW_EGRESS_ACL(dev, log_max_ft_size));
- root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_ESW_EGRESS);
+ root_ns = mlx5_get_flow_vport_acl_namespace(dev, MLX5_FLOW_NAMESPACE_ESW_EGRESS,
+ vport->vport);
if (!root_ns) {
- esw_warn(dev, "Failed to get E-Switch egress flow namespace\n");
+ esw_warn(dev, "Failed to get E-Switch egress flow namespace for vport (%d)\n", vport->vport);
return -EOPNOTSUPP;
}
@@ -984,9 +985,10 @@ static int esw_vport_enable_ingress_acl(struct mlx5_eswitch *esw,
esw_debug(dev, "Create vport[%d] ingress ACL log_max_size(%d)\n",
vport->vport, MLX5_CAP_ESW_INGRESS_ACL(dev, log_max_ft_size));
- root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_ESW_INGRESS);
+ root_ns = mlx5_get_flow_vport_acl_namespace(dev, MLX5_FLOW_NAMESPACE_ESW_INGRESS,
+ vport->vport);
if (!root_ns) {
- esw_warn(dev, "Failed to get E-Switch ingress flow namespace\n");
+ esw_warn(dev, "Failed to get E-Switch ingress flow namespace for vport (%d)\n", vport->vport);
return -EOPNOTSUPP;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 5e786e29f93a..45e75b1010f7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -2014,16 +2014,6 @@ struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev,
return &steering->fdb_root_ns->ns;
else
return NULL;
- case MLX5_FLOW_NAMESPACE_ESW_EGRESS:
- if (steering->esw_egress_root_ns)
- return &steering->esw_egress_root_ns->ns;
- else
- return NULL;
- case MLX5_FLOW_NAMESPACE_ESW_INGRESS:
- if (steering->esw_ingress_root_ns)
- return &steering->esw_ingress_root_ns->ns;
- else
- return NULL;
case MLX5_FLOW_NAMESPACE_SNIFFER_RX:
if (steering->sniffer_rx_root_ns)
return &steering->sniffer_rx_root_ns->ns;
@@ -2054,6 +2044,33 @@ struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev,
}
EXPORT_SYMBOL(mlx5_get_flow_namespace);
+struct mlx5_flow_namespace *mlx5_get_flow_vport_acl_namespace(struct mlx5_core_dev *dev,
+ enum mlx5_flow_namespace_type type,
+ int vport)
+{
+ struct mlx5_flow_steering *steering = dev->priv.steering;
+
+ if (!steering || vport >= MLX5_TOTAL_VPORTS(dev))
+ return NULL;
+
+ switch (type) {
+ case MLX5_FLOW_NAMESPACE_ESW_EGRESS:
+ if (steering->esw_egress_root_ns &&
+ steering->esw_egress_root_ns[vport])
+ return &steering->esw_egress_root_ns[vport]->ns;
+ else
+ return NULL;
+ case MLX5_FLOW_NAMESPACE_ESW_INGRESS:
+ if (steering->esw_ingress_root_ns &&
+ steering->esw_ingress_root_ns[vport])
+ return &steering->esw_ingress_root_ns[vport]->ns;
+ else
+ return NULL;
+ default:
+ return NULL;
+ }
+}
+
static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns,
unsigned int prio, int num_levels)
{
@@ -2331,13 +2348,41 @@ static void cleanup_root_ns(struct mlx5_flow_root_namespace *root_ns)
clean_tree(&root_ns->ns.node);
}
+static void cleanup_egress_acls_root_ns(struct mlx5_core_dev *dev)
+{
+ struct mlx5_flow_steering *steering = dev->priv.steering;
+ int i;
+
+ if (!steering->esw_egress_root_ns)
+ return;
+
+ for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++)
+ cleanup_root_ns(steering->esw_egress_root_ns[i]);
+
+ kfree(steering->esw_egress_root_ns);
+}
+
+static void cleanup_ingress_acls_root_ns(struct mlx5_core_dev *dev)
+{
+ struct mlx5_flow_steering *steering = dev->priv.steering;
+ int i;
+
+ if (!steering->esw_ingress_root_ns)
+ return;
+
+ for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++)
+ cleanup_root_ns(steering->esw_ingress_root_ns[i]);
+
+ kfree(steering->esw_ingress_root_ns);
+}
+
void mlx5_cleanup_fs(struct mlx5_core_dev *dev)
{
struct mlx5_flow_steering *steering = dev->priv.steering;
cleanup_root_ns(steering->root_ns);
- cleanup_root_ns(steering->esw_egress_root_ns);
- cleanup_root_ns(steering->esw_ingress_root_ns);
+ cleanup_egress_acls_root_ns(dev);
+ cleanup_ingress_acls_root_ns(dev);
cleanup_root_ns(steering->fdb_root_ns);
cleanup_root_ns(steering->sniffer_rx_root_ns);
cleanup_root_ns(steering->sniffer_tx_root_ns);
@@ -2406,34 +2451,86 @@ static int init_fdb_root_ns(struct mlx5_flow_steering *steering)
return PTR_ERR(prio);
}
-static int init_egress_acl_root_ns(struct mlx5_flow_steering *steering)
+static int init_egress_acl_root_ns(struct mlx5_flow_steering *steering, int vport)
{
struct fs_prio *prio;
- steering->esw_egress_root_ns = create_root_ns(steering, FS_FT_ESW_EGRESS_ACL);
- if (!steering->esw_egress_root_ns)
+ steering->esw_egress_root_ns[vport] = create_root_ns(steering, FS_FT_ESW_EGRESS_ACL);
+ if (!steering->esw_egress_root_ns[vport])
return -ENOMEM;
/* create 1 prio*/
- prio = fs_create_prio(&steering->esw_egress_root_ns->ns, 0,
- MLX5_TOTAL_VPORTS(steering->dev));
+ prio = fs_create_prio(&steering->esw_egress_root_ns[vport]->ns, 0, 1);
return PTR_ERR_OR_ZERO(prio);
}
-static int init_ingress_acl_root_ns(struct mlx5_flow_steering *steering)
+static int init_ingress_acl_root_ns(struct mlx5_flow_steering *steering, int vport)
{
struct fs_prio *prio;
- steering->esw_ingress_root_ns = create_root_ns(steering, FS_FT_ESW_INGRESS_ACL);
- if (!steering->esw_ingress_root_ns)
+ steering->esw_ingress_root_ns[vport] = create_root_ns(steering, FS_FT_ESW_INGRESS_ACL);
+ if (!steering->esw_ingress_root_ns[vport])
return -ENOMEM;
/* create 1 prio*/
- prio = fs_create_prio(&steering->esw_ingress_root_ns->ns, 0,
- MLX5_TOTAL_VPORTS(steering->dev));
+ prio = fs_create_prio(&steering->esw_ingress_root_ns[vport]->ns, 0, 1);
return PTR_ERR_OR_ZERO(prio);
}
+static int init_egress_acls_root_ns(struct mlx5_core_dev *dev)
+{
+ struct mlx5_flow_steering *steering = dev->priv.steering;
+ int err;
+ int i;
+
+ steering->esw_egress_root_ns = kcalloc(MLX5_TOTAL_VPORTS(dev),
+ sizeof(*steering->esw_egress_root_ns),
+ GFP_KERNEL);
+ if (!steering->esw_egress_root_ns)
+ return -ENOMEM;
+
+ for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++) {
+ err = init_egress_acl_root_ns(steering, i);
+ if (err)
+ goto cleanup_root_ns;
+ }
+
+ return 0;
+
+cleanup_root_ns:
+ for (i--; i >= 0; i--)
+ cleanup_root_ns(steering->esw_egress_root_ns[i]);
+ kfree(steering->esw_egress_root_ns);
+ return err;
+}
+
+static int init_ingress_acls_root_ns(struct mlx5_core_dev *dev)
+{
+ struct mlx5_flow_steering *steering = dev->priv.steering;
+ int err;
+ int i;
+
+ steering->esw_ingress_root_ns = kcalloc(MLX5_TOTAL_VPORTS(dev),
+ sizeof(*steering->esw_ingress_root_ns),
+ GFP_KERNEL);
+ if (!steering->esw_ingress_root_ns)
+ return -ENOMEM;
+
+ for (i = 0; i < MLX5_TOTAL_VPORTS(dev); i++) {
+ err = init_ingress_acl_root_ns(steering, i);
+ if (err)
+ goto cleanup_root_ns;
+ }
+
+ return 0;
+
+cleanup_root_ns:
+ for (i--; i >= 0; i--)
+ cleanup_root_ns(steering->esw_ingress_root_ns[i]);
+ kfree(steering->esw_ingress_root_ns);
+ return err;
+}
+
int mlx5_init_fs(struct mlx5_core_dev *dev)
{
struct mlx5_flow_steering *steering;
@@ -2476,12 +2573,12 @@ int mlx5_init_fs(struct mlx5_core_dev *dev)
goto err;
}
if (MLX5_CAP_ESW_EGRESS_ACL(dev, ft_support)) {
- err = init_egress_acl_root_ns(steering);
+ err = init_egress_acls_root_ns(dev);
if (err)
goto err;
}
if (MLX5_CAP_ESW_INGRESS_ACL(dev, ft_support)) {
- err = init_ingress_acl_root_ns(steering);
+ err = init_ingress_acls_root_ns(dev);
if (err)
goto err;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 397d24a621a4..3e571045626f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -71,8 +71,8 @@ struct mlx5_flow_steering {
struct kmem_cache *ftes_cache;
struct mlx5_flow_root_namespace *root_ns;
struct mlx5_flow_root_namespace *fdb_root_ns;
- struct mlx5_flow_root_namespace *esw_egress_root_ns;
- struct mlx5_flow_root_namespace *esw_ingress_root_ns;
+ struct mlx5_flow_root_namespace **esw_egress_root_ns;
+ struct mlx5_flow_root_namespace **esw_ingress_root_ns;
struct mlx5_flow_root_namespace *sniffer_tx_root_ns;
struct mlx5_flow_root_namespace *sniffer_rx_root_ns;
};
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index b25e7baa273e..a0b48afcb422 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -95,6 +95,10 @@ struct mlx5_flow_destination {
struct mlx5_flow_namespace *
mlx5_get_flow_namespace(struct mlx5_core_dev *dev,
enum mlx5_flow_namespace_type type);
+struct mlx5_flow_namespace *
+mlx5_get_flow_vport_acl_namespace(struct mlx5_core_dev *dev,
+ enum mlx5_flow_namespace_type type,
+ int vport);
struct mlx5_flow_table *
mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns,
--
2.13.0
^ permalink raw reply related
* [for-next V3 07/11] net/mlx5: E-Switch, Create generic header struct to be used by representors
From: Saeed Mahameed @ 2017-12-28 23:23 UTC (permalink / raw)
To: David S. Miller, Doug Ledford
Cc: netdev, linux-rdma, Leon Romanovsky, Mark Bloch, Saeed Mahameed
In-Reply-To: <20171228232314.13678-1-saeedm@mellanox.com>
From: Mark Bloch <markb@mellanox.com>
Now that we don't store type dependent data in struct mlx5_eswitch_rep
we can create a generic interface, and representor type.
struct mlx5_eswitch_rep will store an array of interfaces, each
interface is used by a different representor type.
Once we moved to a more generic interface, rdma driver representors can
be added and utilize the same mechanism as the Ethernet driver
representors use.
Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 29 ++++-----
drivers/net/ethernet/mellanox/mlx5/core/en_rep.h | 2 +-
drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 9 +--
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 22 +++++--
.../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 70 +++++++++++++++-------
5 files changed, 88 insertions(+), 44 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 3c74f0599ad3..5b2b673c0b13 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -1086,7 +1086,7 @@ mlx5e_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
rpriv->netdev = netdev;
rpriv->rep = rep;
- rep->priv = rpriv;
+ rep->rep_if[REP_ETH].priv = rpriv;
INIT_LIST_HEAD(&rpriv->vport_sqs_list);
err = mlx5e_attach_netdev(netdev_priv(netdev));
@@ -1103,7 +1103,7 @@ mlx5e_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
goto err_detach_netdev;
}
- uplink_rpriv = mlx5_eswitch_get_uplink_priv(dev->priv.eswitch);
+ uplink_rpriv = mlx5_eswitch_get_uplink_priv(dev->priv.eswitch, REP_ETH);
upriv = netdev_priv(uplink_rpriv->netdev);
err = tc_setup_cb_egdev_register(netdev, mlx5e_setup_tc_block_cb,
upriv);
@@ -1146,7 +1146,8 @@ mlx5e_vport_rep_unload(struct mlx5_eswitch_rep *rep)
struct mlx5e_priv *upriv;
unregister_netdev(netdev);
- uplink_rpriv = mlx5_eswitch_get_uplink_priv(priv->mdev->priv.eswitch);
+ uplink_rpriv = mlx5_eswitch_get_uplink_priv(priv->mdev->priv.eswitch,
+ REP_ETH);
upriv = netdev_priv(uplink_rpriv->netdev);
tc_setup_cb_egdev_unregister(netdev, mlx5e_setup_tc_block_cb,
upriv);
@@ -1164,11 +1165,11 @@ static void mlx5e_rep_register_vf_vports(struct mlx5e_priv *priv)
int vport;
for (vport = 1; vport < total_vfs; vport++) {
- struct mlx5_eswitch_rep rep = {};
+ struct mlx5_eswitch_rep_if rep_if = {};
- rep.load = mlx5e_vport_rep_load;
- rep.unload = mlx5e_vport_rep_unload;
- mlx5_eswitch_register_vport_rep(esw, vport, &rep);
+ rep_if.load = mlx5e_vport_rep_load;
+ rep_if.unload = mlx5e_vport_rep_unload;
+ mlx5_eswitch_register_vport_rep(esw, vport, &rep_if, REP_ETH);
}
}
@@ -1180,24 +1181,24 @@ static void mlx5e_rep_unregister_vf_vports(struct mlx5e_priv *priv)
int vport;
for (vport = 1; vport < total_vfs; vport++)
- mlx5_eswitch_unregister_vport_rep(esw, vport);
+ mlx5_eswitch_unregister_vport_rep(esw, vport, REP_ETH);
}
void mlx5e_register_vport_reps(struct mlx5e_priv *priv)
{
struct mlx5_core_dev *mdev = priv->mdev;
struct mlx5_eswitch *esw = mdev->priv.eswitch;
+ struct mlx5_eswitch_rep_if rep_if;
struct mlx5e_rep_priv *rpriv;
- struct mlx5_eswitch_rep rep;
rpriv = priv->ppriv;
rpriv->netdev = priv->netdev;
- rep.load = mlx5e_nic_rep_load;
- rep.unload = mlx5e_nic_rep_unload;
- rep.priv = rpriv;
+ rep_if.load = mlx5e_nic_rep_load;
+ rep_if.unload = mlx5e_nic_rep_unload;
+ rep_if.priv = rpriv;
INIT_LIST_HEAD(&rpriv->vport_sqs_list);
- mlx5_eswitch_register_vport_rep(esw, 0, &rep); /* UPLINK PF vport*/
+ mlx5_eswitch_register_vport_rep(esw, 0, &rep_if, REP_ETH); /* UPLINK PF vport*/
mlx5e_rep_register_vf_vports(priv); /* VFs vports */
}
@@ -1208,7 +1209,7 @@ void mlx5e_unregister_vport_reps(struct mlx5e_priv *priv)
struct mlx5_eswitch *esw = mdev->priv.eswitch;
mlx5e_rep_unregister_vf_vports(priv); /* VFs vports */
- mlx5_eswitch_unregister_vport_rep(esw, 0); /* UPLINK PF*/
+ mlx5_eswitch_unregister_vport_rep(esw, 0, REP_ETH); /* UPLINK PF*/
}
void *mlx5e_alloc_nic_rep_priv(struct mlx5_core_dev *mdev)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
index 8db68369367e..e4473a9ebd50 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
@@ -64,7 +64,7 @@ struct mlx5e_rep_priv {
static inline
struct mlx5e_rep_priv *mlx5e_rep_to_rep_priv(struct mlx5_eswitch_rep *rep)
{
- return (struct mlx5e_rep_priv *)rep->priv;
+ return (struct mlx5e_rep_priv *)rep->rep_if[REP_ETH].priv;
}
struct mlx5e_neigh {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index f462496cce7a..259e91e2d09a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -617,7 +617,7 @@ static int parse_tunnel_attr(struct mlx5e_priv *priv,
FLOW_DISSECTOR_KEY_ENC_PORTS,
f->mask);
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
- struct mlx5e_rep_priv *uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw);
+ struct mlx5e_rep_priv *uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
struct net_device *up_dev = uplink_rpriv->netdev;
struct mlx5e_priv *up_priv = netdev_priv(up_dev);
@@ -1522,7 +1522,7 @@ static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv,
#else
return -EOPNOTSUPP;
#endif
- uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw);
+ uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
/* if the egress device isn't on the same HW e-switch, we use the uplink */
if (!switchdev_port_same_parent_id(priv->netdev, rt->dst.dev))
*out_dev = uplink_rpriv->netdev;
@@ -1561,7 +1561,7 @@ static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv,
*out_ttl = ip6_dst_hoplimit(dst);
- uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw);
+ uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
/* if the egress device isn't on the same HW e-switch, we use the uplink */
if (!switchdev_port_same_parent_id(priv->netdev, dst->dev))
*out_dev = uplink_rpriv->netdev;
@@ -1864,7 +1864,8 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
struct mlx5e_tc_flow *flow)
{
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
- struct mlx5e_rep_priv *uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw);
+ struct mlx5e_rep_priv *uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw,
+ REP_ETH);
struct net_device *up_dev = uplink_rpriv->netdev;
unsigned short family = ip_tunnel_info_af(tun_info);
struct mlx5e_priv *up_priv = netdev_priv(up_dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 3a21ea4e4d24..91175965df7f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -45,6 +45,11 @@ enum {
SRIOV_OFFLOADS
};
+enum {
+ REP_ETH,
+ NUM_REP_TYPES,
+};
+
#ifdef CONFIG_MLX5_ESWITCH
#define MLX5_MAX_UC_PER_VPORT(dev) \
@@ -138,16 +143,21 @@ struct mlx5_esw_sq {
struct list_head list;
};
-struct mlx5_eswitch_rep {
+struct mlx5_eswitch_rep;
+struct mlx5_eswitch_rep_if {
int (*load)(struct mlx5_core_dev *dev,
struct mlx5_eswitch_rep *rep);
void (*unload)(struct mlx5_eswitch_rep *rep);
void *priv;
+ bool valid;
+};
+
+struct mlx5_eswitch_rep {
+ struct mlx5_eswitch_rep_if rep_if[NUM_REP_TYPES];
u16 vport;
u8 hw_id[ETH_ALEN];
u16 vlan;
u32 vlan_refcount;
- bool valid;
};
struct mlx5_esw_offload {
@@ -268,10 +278,12 @@ int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink, u8 encap);
int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink, u8 *encap);
void mlx5_eswitch_register_vport_rep(struct mlx5_eswitch *esw,
int vport_index,
- struct mlx5_eswitch_rep *rep);
+ struct mlx5_eswitch_rep_if *rep_if,
+ u8 rep_type);
void mlx5_eswitch_unregister_vport_rep(struct mlx5_eswitch *esw,
- int vport_index);
-void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw);
+ int vport_index,
+ u8 rep_type);
+void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type);
int mlx5_eswitch_add_vlan_action(struct mlx5_eswitch *esw,
struct mlx5_esw_flow_attr *attr);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 07f26c1986fc..99f583a15cc3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -130,7 +130,7 @@ static int esw_set_global_vlan_pop(struct mlx5_eswitch *esw, u8 val)
esw_debug(esw->dev, "%s applying global %s policy\n", __func__, val ? "pop" : "none");
for (vf_vport = 1; vf_vport < esw->enabled_vports; vf_vport++) {
rep = &esw->offloads.vport_reps[vf_vport];
- if (!rep->valid)
+ if (!rep->rep_if[REP_ETH].valid)
continue;
err = __mlx5_eswitch_set_vport_vlan(esw, rep->vport, 0, 0, val);
@@ -719,21 +719,31 @@ int esw_offloads_init_reps(struct mlx5_eswitch *esw)
return 0;
}
-static void esw_offloads_unload_reps(struct mlx5_eswitch *esw, int nvports)
+static void esw_offloads_unload_reps_type(struct mlx5_eswitch *esw, int nvports,
+ u8 rep_type)
{
struct mlx5_eswitch_rep *rep;
int vport;
for (vport = nvports - 1; vport >= 0; vport--) {
rep = &esw->offloads.vport_reps[vport];
- if (!rep->valid)
+ if (!rep->rep_if[rep_type].valid)
continue;
- rep->unload(rep);
+ rep->rep_if[rep_type].unload(rep);
}
}
-static int esw_offloads_load_reps(struct mlx5_eswitch *esw, int nvports)
+static void esw_offloads_unload_reps(struct mlx5_eswitch *esw, int nvports)
+{
+ u8 rep_type = NUM_REP_TYPES;
+
+ while (rep_type-- > 0)
+ esw_offloads_unload_reps_type(esw, nvports, rep_type);
+}
+
+static int esw_offloads_load_reps_type(struct mlx5_eswitch *esw, int nvports,
+ u8 rep_type)
{
struct mlx5_eswitch_rep *rep;
int vport;
@@ -741,10 +751,10 @@ static int esw_offloads_load_reps(struct mlx5_eswitch *esw, int nvports)
for (vport = 0; vport < nvports; vport++) {
rep = &esw->offloads.vport_reps[vport];
- if (!rep->valid)
+ if (!rep->rep_if[rep_type].valid)
continue;
- err = rep->load(esw->dev, rep);
+ err = rep->rep_if[rep_type].load(esw->dev, rep);
if (err)
goto err_reps;
}
@@ -752,7 +762,26 @@ static int esw_offloads_load_reps(struct mlx5_eswitch *esw, int nvports)
return 0;
err_reps:
- esw_offloads_unload_reps(esw, vport);
+ esw_offloads_unload_reps_type(esw, vport, rep_type);
+ return err;
+}
+
+static int esw_offloads_load_reps(struct mlx5_eswitch *esw, int nvports)
+{
+ u8 rep_type = 0;
+ int err;
+
+ for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++) {
+ err = esw_offloads_load_reps_type(esw, nvports, rep_type);
+ if (err)
+ goto err_reps;
+ }
+
+ return err;
+
+err_reps:
+ while (rep_type-- > 0)
+ esw_offloads_unload_reps_type(esw, nvports, rep_type);
return err;
}
@@ -1121,22 +1150,23 @@ int mlx5_devlink_eswitch_encap_mode_get(struct devlink *devlink, u8 *encap)
void mlx5_eswitch_register_vport_rep(struct mlx5_eswitch *esw,
int vport_index,
- struct mlx5_eswitch_rep *__rep)
+ struct mlx5_eswitch_rep_if *__rep_if,
+ u8 rep_type)
{
struct mlx5_esw_offload *offloads = &esw->offloads;
- struct mlx5_eswitch_rep *rep;
+ struct mlx5_eswitch_rep_if *rep_if;
- rep = &offloads->vport_reps[vport_index];
+ rep_if = &offloads->vport_reps[vport_index].rep_if[rep_type];
- rep->load = __rep->load;
- rep->unload = __rep->unload;
- rep->priv = __rep->priv;
+ rep_if->load = __rep_if->load;
+ rep_if->unload = __rep_if->unload;
+ rep_if->priv = __rep_if->priv;
- rep->valid = true;
+ rep_if->valid = true;
}
void mlx5_eswitch_unregister_vport_rep(struct mlx5_eswitch *esw,
- int vport_index)
+ int vport_index, u8 rep_type)
{
struct mlx5_esw_offload *offloads = &esw->offloads;
struct mlx5_eswitch_rep *rep;
@@ -1144,17 +1174,17 @@ void mlx5_eswitch_unregister_vport_rep(struct mlx5_eswitch *esw,
rep = &offloads->vport_reps[vport_index];
if (esw->mode == SRIOV_OFFLOADS && esw->vports[vport_index].enabled)
- rep->unload(rep);
+ rep->rep_if[rep_type].unload(rep);
- rep->valid = false;
+ rep->rep_if[rep_type].valid = false;
}
-void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw)
+void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type)
{
#define UPLINK_REP_INDEX 0
struct mlx5_esw_offload *offloads = &esw->offloads;
struct mlx5_eswitch_rep *rep;
rep = &offloads->vport_reps[UPLINK_REP_INDEX];
- return rep->priv;
+ return rep->rep_if[rep_type].priv;
}
--
2.13.0
^ permalink raw reply related
* [for-next V3 09/11] net/mlx5e: E-Switch, Use the name of static array instead of its address
From: Saeed Mahameed @ 2017-12-28 23:23 UTC (permalink / raw)
To: David S. Miller, Doug Ledford
Cc: netdev, linux-rdma, Leon Romanovsky, Gal Pressman, Saeed Mahameed
In-Reply-To: <20171228232314.13678-1-saeedm@mellanox.com>
From: Gal Pressman <galp@mellanox.com>
Using the address of a static array is the same as using its name (in
this specific use-case), but it's confusing and makes the code less
readable.
Fixes: 1bd27b11c1df ("net/mlx5: Introduce E-switch QoS management")
Fixes: bd77bf1cb595 ("net/mlx5: Add SRIOV VF max rate configuration support")
Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 26 +++++++++++------------
1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 6d4cbdb69823..cdf65ed8714c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1290,7 +1290,7 @@ static int esw_create_tsar(struct mlx5_eswitch *esw)
err = mlx5_create_scheduling_element_cmd(dev,
SCHEDULING_HIERARCHY_E_SWITCH,
- &tsar_ctx,
+ tsar_ctx,
&esw->qos.root_tsar_id);
if (err) {
esw_warn(esw->dev, "E-Switch create TSAR failed (%d)\n", err);
@@ -1333,20 +1333,20 @@ static int esw_vport_enable_qos(struct mlx5_eswitch *esw, int vport_num,
if (vport->qos.enabled)
return -EEXIST;
- MLX5_SET(scheduling_context, &sched_ctx, element_type,
+ MLX5_SET(scheduling_context, sched_ctx, element_type,
SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT);
- vport_elem = MLX5_ADDR_OF(scheduling_context, &sched_ctx,
+ vport_elem = MLX5_ADDR_OF(scheduling_context, sched_ctx,
element_attributes);
MLX5_SET(vport_element, vport_elem, vport_number, vport_num);
- MLX5_SET(scheduling_context, &sched_ctx, parent_element_id,
+ MLX5_SET(scheduling_context, sched_ctx, parent_element_id,
esw->qos.root_tsar_id);
- MLX5_SET(scheduling_context, &sched_ctx, max_average_bw,
+ MLX5_SET(scheduling_context, sched_ctx, max_average_bw,
initial_max_rate);
- MLX5_SET(scheduling_context, &sched_ctx, bw_share, initial_bw_share);
+ MLX5_SET(scheduling_context, sched_ctx, bw_share, initial_bw_share);
err = mlx5_create_scheduling_element_cmd(dev,
SCHEDULING_HIERARCHY_E_SWITCH,
- &sched_ctx,
+ sched_ctx,
&vport->qos.esw_tsar_ix);
if (err) {
esw_warn(esw->dev, "E-Switch create TSAR vport element failed (vport=%d,err=%d)\n",
@@ -1392,22 +1392,22 @@ static int esw_vport_qos_config(struct mlx5_eswitch *esw, int vport_num,
if (!vport->qos.enabled)
return -EIO;
- MLX5_SET(scheduling_context, &sched_ctx, element_type,
+ MLX5_SET(scheduling_context, sched_ctx, element_type,
SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT);
- vport_elem = MLX5_ADDR_OF(scheduling_context, &sched_ctx,
+ vport_elem = MLX5_ADDR_OF(scheduling_context, sched_ctx,
element_attributes);
MLX5_SET(vport_element, vport_elem, vport_number, vport_num);
- MLX5_SET(scheduling_context, &sched_ctx, parent_element_id,
+ MLX5_SET(scheduling_context, sched_ctx, parent_element_id,
esw->qos.root_tsar_id);
- MLX5_SET(scheduling_context, &sched_ctx, max_average_bw,
+ MLX5_SET(scheduling_context, sched_ctx, max_average_bw,
max_rate);
- MLX5_SET(scheduling_context, &sched_ctx, bw_share, bw_share);
+ MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share);
bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_MAX_AVERAGE_BW;
bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_BW_SHARE;
err = mlx5_modify_scheduling_element_cmd(dev,
SCHEDULING_HIERARCHY_E_SWITCH,
- &sched_ctx,
+ sched_ctx,
vport->qos.esw_tsar_ix,
bitmask);
if (err) {
--
2.13.0
^ permalink raw reply related
* Re: iproute2 net-next
From: Daniel Borkmann @ 2017-12-28 23:46 UTC (permalink / raw)
To: Leon Romanovsky; +Cc: Stephen Hemminger, netdev, dsa
In-Reply-To: <20171226093547.GC10734@mtr-leonro.local>
On 12/26/2017 10:35 AM, Leon Romanovsky wrote:
> On Mon, Dec 25, 2017 at 10:14:26PM -0800, Stephen Hemminger wrote:
>> On Tue, 26 Dec 2017 06:47:43 +0200
>> Leon Romanovsky <leon@kernel.org> wrote:
>>
>>> On Mon, Dec 25, 2017 at 10:49:19AM -0800, Stephen Hemminger wrote:
>>>> David Ahern has agreed to take over managing the net-next branch of iproute2.
>>>> The new location is:
>>>> https://git.kernel.org/pub/scm/linux/kernel/git/dsahern/iproute2-next.git/
>>>>
>>>> In the past, I have accepted new features into iproute2 master branch, but
>>>> am changing the policy so that outside of the merge window (up until -rc1)
>>>> new features will get put into net-next to get some more review and testing
>>>> time. This means that things like the proposed batch streaming mode will
>>>> go through net-next.
>>>
>>> Did you consider to create one shared repo for the iproute2 to allow
>>> multiple committers workflow?
>>
>> For now having separate trees is best, there is no need for multiple
>> committers the load is very light.
>>
>>> It will be much convenient for the users to have one place for
>>> master/stable/net-next branches, instead of actually following two
>>> different repositories.
>>
>> If you are doing network development, you already need to deal with
>> multiple repo's on the kernel side so there is no difference.
>
> I agree with you that one extra "git remote add .." is not so huge and
> all people who develop for the netdev will do it. My concern is about
> Documentation and newcomers, who will have a hard time to find a right
> tree.
I guess it would certainly help to identify the official repo to rebase
against much quicker if it would be under a common group on korg e.g.
* iproute2/iproute2.git - for current cycle
* iproute2/iproute2-next.git - for net-next bits
and also be in line with other tooling (ethtool and others), even if
not as high volume, but it would make it unambiguous right away from
the other, private iproute2 repos on korg, imho. Just a thought.
>>> Example, of such shared repo:
>>> BPF: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/
>>> Bluetooth: https://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git/
>>> RDMA: https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git/
>>
>> Most of these are high volume or vendor silo'd which is not the case here.
Cheers,
Daniel
^ permalink raw reply
* Re: Pravin Shelar
From: Pravin Shelar @ 2017-12-28 23:47 UTC (permalink / raw)
To: Joe Perches; +Cc: ovs dev, Julia Lawall, Linux Kernel Network Developers
In-Reply-To: <1514399625.7654.9.camel-6d6DIl74uiNBDgjK7y7TUQ@public.gmane.org>
On Wed, Dec 27, 2017 at 10:33 AM, Joe Perches <joe-6d6DIl74uiNBDgjK7y7TUQ@public.gmane.org> wrote:
> On Wed, 2017-12-27 at 10:25 -0800, Ben Pfaff wrote:
>> On Wed, Dec 27, 2017 at 04:22:55PM +0100, Julia Lawall wrote:
>> > The email address pshelar-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org listed for Pravin Shelar in
>> > MAINTAINERS (OPENVSWITCH section) seems to bounce.
>>
>> Pravin has used a newer address recently, so I sent out a suggested
>> update (for OVS):
>> https://patchwork.ozlabs.org/patch/853232/
>
> As Pravin is still active with acks but not any authored patches in
> the
> last year, this should still be updated in the linux-kernel's
> MAINTAINERS
> file too.
> ---
> diff --git a/MAINTAINERS b/MAINTAINERS
> index
> a6e86e20761e..5869e5f0b930 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@
> -10137,7 +10137,7 @@ F: drivers/irqchip/irq-ompic.c
> F: dri
> vers/irqchip/irq-or1k-*
>
> OPENVSWITCH
> -M: Pravin Shelar <pshelar@ni
> cira.com>
> +M: Pravin Shelar <pshelar-LZ6Gd1LRuIk@public.gmane.org>
> L: netdev@vge
> r.kernel.org
> L: dev-yBygre7rU0TnMu66kgdUjQ@public.gmane.org
> W: http://openvswitch.org
Thanks Joe for the patch. But it is corrupted. I will send updated patch soon.
^ permalink raw reply
* Re: [PATCH net-next v6 0/6] net: tcp: sctp: dccp: Replace jprobe usage with trace events
From: Masami Hiramatsu @ 2017-12-28 23:48 UTC (permalink / raw)
To: David Miller
Cc: mingo, ian.mcdonald, vyasevich, stephen, rostedt, peterz, tglx,
linux-kernel, hpa, gerrit, nhorman, dccp, netdev, linux-sctp, sfr
In-Reply-To: <20171228.120613.1048334566479767343.davem@davemloft.net>
On Thu, 28 Dec 2017 12:06:13 -0500 (EST)
David Miller <davem@davemloft.net> wrote:
> From: Masami Hiramatsu <mhiramat@kernel.org>
> Date: Thu, 28 Dec 2017 15:10:00 +0900
>
> > Changes from v5:
> > [1/6]: Avoid preprocessor directives in tracepoint macro args
>
> Patch #1 is not the only patch which has this problem, at a minimum
> patch #5 has it too.
Oops, sorry...
> Please audit the entire series for an issue when it is brought to your
> attention.
Thank you for your kindly advice.
>
> Thank you.
--
Masami Hiramatsu <mhiramat@kernel.org>
^ permalink raw reply
* Re: [pull request][for-next V3 00/11] Mellanox, mlx5 E-Switch updates 2017-12-19
From: David Miller @ 2017-12-29 0:46 UTC (permalink / raw)
To: saeedm; +Cc: dledford, netdev, linux-rdma, leonro
In-Reply-To: <20171228232314.13678-1-saeedm@mellanox.com>
From: Saeed Mahameed <saeedm@mellanox.com>
Date: Fri, 29 Dec 2017 01:23:03 +0200
> ==============
> This series includes updates for mlx5 E-Switch infrastructures,
> to be merged into net-next and rdma-next trees.
>
> Mark's patches provide E-Switch refactoring that generalize the mlx5
> E-Switch vf representors interfaces and data structures. The serious is
> mainly focused on moving ethernet (netdev) specific representors logic out
> of E-Switch (eswitch.c) into mlx5e representor module (en_rep.c), which
> provides better separation and allows future support for other types of vf
> representors (e.g. RDMA).
>
> Gal's patches at the end of this serious, provide a simple syntax fix and
> two other patches that handles vport ingress/egress ACL steering name
> spaces to be aligned with the Firmware/Hardware specs.
> ===============
>
> V1->V2:
> - Addressed coding style comments in patches #1 and #7
> - The series is still based on rc4, as now I see net-next is also @rc4.
>
> V2->V3:
> - Fixed compilation warning, reported by Dave.
>
> Please pull and let me know if there's any problem.
Looks good, pulled, thank you.
^ permalink raw reply
* Re: [RFC PATCH bpf-next v2 1/4] tracing/kprobe: bpf: Check error injectable event is on function entry
From: Alexei Starovoitov @ 2017-12-29 1:03 UTC (permalink / raw)
To: Masami Hiramatsu
Cc: Steven Rostedt, Alexei Starovoitov, Josef Bacik, mingo, davem,
netdev, linux-kernel, ast, kernel-team, daniel, linux-btrfs,
darrick.wong, Josef Bacik, Akinobu Mita
In-Reply-To: <20171228172027.4a8f2f0cf0506499acd26738@kernel.org>
On 12/28/17 12:20 AM, Masami Hiramatsu wrote:
> On Wed, 27 Dec 2017 20:32:07 -0800
> Alexei Starovoitov <ast@fb.com> wrote:
>
>> On 12/27/17 8:16 PM, Steven Rostedt wrote:
>>> On Wed, 27 Dec 2017 19:45:42 -0800
>>> Alexei Starovoitov <ast@fb.com> wrote:
>>>
>>>> I don't think that's the case. My reading of current
>>>> trace_kprobe_ftrace() -> arch_check_ftrace_location()
>>>> is that it will not be true for old mcount case.
>>>
>>> In the old mcount case, you can't use ftrace to return without calling
>>> the function. That is, no modification of the return ip, unless you
>>> created a trampoline that could handle arbitrary stack frames, and
>>> remove them from the stack before returning back to the function.
>>
>> correct. I was saying that trace_kprobe_ftrace() won't let us do
>> bpf_override_return with old mcount.
>
> No, trace_kprobe_ftrace() just checks the given address will be
> managed by ftrace. you can see arch_check_ftrace_location() in kernel/kprobes.c.
>
> FYI, CONFIG_KPROBES_ON_FTRACE depends on DYNAMIC_FTRACE_WITH_REGS, and
> DYNAMIC_FTRACE_WITH_REGS doesn't depend on CC_USING_FENTRY.
> This means if you compile kernel with old gcc and enable DYNAMIC_FTRACE,
> kprobes uses ftrace on mcount address which is NOT the entry point
> of target function.
ok. fair enough. I think we can gate the feature to !mcount only.
> On the other hand, changing IP feature has been implemented originaly
> by kprobes with int3 (sw breakpoint). This means you can use kprobes
> at correct address (the entry address of the function) you can hijack
> the function, as jprobe did.
>
>>>> As far as the rest of your arguments it very much puzzles me that
>>>> you claim that this patch suppose to work based on historical
>>>> reasoning whereas you did NOT test it.
>>>
>>> I believe that Masami is saying that the modification of the IP from
>>> kprobes has been very well tested. But I'm guessing that you still want
>>> a test case for using kprobes in this particular instance. It's not the
>>> implementation of modifying the IP that you are worried about, but the
>>> implementation of BPF using it in this case. Right?
>>
>> exactly. No doubt that old code works.
>> But it doesn't mean that bpf_override_return() will continue to
>> work in kprobes that are not ftrace based.
>> I suspect Josef's existing test case will cover this situation.
>> Probably only special .config is needed to disable ftrace, so
>> "kprobe on entry but not ftrace" check will kick in.
>
> Right. If you need to test it, you can run Josef's test case without
> CONFIG_DYNAMIC_FTRACE.
It should be obvious that the person who submits the patch
must run the tests.
>> But I didn't get an impression that this situation was tested.
>> Instead I see only logical reasoning that it's _supposed_ to work.
>> That's not enough.
>
> OK, so would you just ask me to run samples/bpf ?
Please run Josef's test in the !ftrace setup.
^ permalink raw reply
* Re: [RFC PATCH bpf-next v2 4/4] error-injection: Support fault injection framework
From: Alexei Starovoitov @ 2017-12-29 1:11 UTC (permalink / raw)
To: Masami Hiramatsu
Cc: Alexei Starovoitov, Josef Bacik, rostedt, mingo, davem, netdev,
linux-kernel, ast, kernel-team, daniel, linux-btrfs, darrick.wong,
Josef Bacik, Akinobu Mita
In-Reply-To: <20171228165140.0f9fc7f6067b5581c018e81d@kernel.org>
On 12/27/17 11:51 PM, Masami Hiramatsu wrote:
>
> Then what happen if the user set invalid retval to those functions?
> even if we limit the injectable functions, it can cause a problem,
>
> for example,
>
> obj = func_return_object();
> if (!obj) {
> handling_error...;
> }
> obj->field = x;
>
> In this case, obviously func_return_object() must return NULL if there is
> an error, not -ENOMEM. But without the correct retval information, how would
> you check the BPF code doesn't cause a trouble?
> Currently it seems you are expecting only the functions which return error code.
>
> ret = func_return_state();
> if (ret < 0) {
> handling_error...;
> }
>
> But how we can distinguish those?
>
> If we have the error range for each function, we can ensure what is
> *correct* error code, NULL or errno, or any other error numbers. :)
messing up return values may cause problems and range check is
not going to magically help.
The caller may handle only a certain set of errors or interpret
some of them like EBUSY as a signal to retry.
It's plain impossible to make sure that kernel will be functional
after error injection has been made.
Like kmalloc() unconditionally returning NULL will be deadly
for the kernel, hence this patch 4/4 has very limited practical
use. The bpf program need to make intelligent decisions when
to return an error and what kind of error to return.
Doing blank range check adds a false sense of additional safety.
More so it wastes kilobytes of memory to do this check, hence nack.
^ permalink raw reply
* Re: [RFT net-next v3 0/5] dwmac-meson8b: RGMII clock fixes for Meson8b
From: Emiliano Ingrassia @ 2017-12-29 1:31 UTC (permalink / raw)
To: Martin Blumenstingl
Cc: netdev, linus.luessing, khilman, linux-amlogic, jbrunet,
Neil Armstrong, peppe.cavallaro, alexandre.torgue
In-Reply-To: <20171228222128.15215-1-martin.blumenstingl@googlemail.com>
Hi Martin, Hi Dave,
On Thu, Dec 28, 2017 at 11:21:23PM +0100, Martin Blumenstingl wrote:
> Hi Dave,
>
> please do not apply this series until it got a Tested-by from Emiliano.
>
>
> Hi Emiliano,
>
> you reported [0] that you couldn't get dwmac-meson8b to work on your
> Odroid-C1. With your findings (register dumps, clk_summary output, etc.)
> I think I was able to find a fix: it consists of two patches (which you
> find in this series)
>
> Unfortunately I don't have any Meson8b boards with RGMII PHY so I could
> only partially test this (I could only check if the clocks were
> calculated correctly when using a dummy 500002394Hz input clock instead
> of MPLL2).
>
> Could you please give this series a try and let me know about the
> results?
> You obviously still need your two "ARM: dts: meson8b" patches which
> - add the amlogic,meson8b-dwmac" compatible to meson8b.dtsi
> - enable Ethernet on the Odroid-C1
>
> When testing on Meson8b this also needs a fix for the MPLL clock driver:
> "clk: meson: mpll: use 64-bit maths in params_from_rate", see:
> https://patchwork.kernel.org/patch/10131677/
>
>
> I have tested this myself on a Khadas VIM (GXL SoC, internal RMII PHY)
> and a Khadas VIM2 (GXM SoC, external RGMII PHY). Both are still working
> fine (so let's hope that this also fixes your Meson8b issue :)).
>
>
> changes since v1 at [1]:
> - changed the subject of the cover-letter to indicate that this is all
> about the RGMII clock
> - added PATCH #1 which ensures that we don't unnecessarily change the
> parent clocks in RMII mode (and also makes the code easier to
> understand)
> - changed subject of PATCH #2 (formerly PATCH #1) to state that this
> is about the RGMII clock
> - added Jerome's Reviewed-by to PATCH #2 (formerly PATCH #1)
> - replaced PATCH #3 (formerly PATCH #2) with one that sets
> CLK_SET_RATE_PARENT on the mux and thus re-configures the MPLL2 clock
> on Meson8b correctly
>
> changes since v2 at [2]:
> - added PATCH #2 to make the following patch easier
> - Emiliano reported that there's currently another bug in the
> dwmac-meson8b driver which prevents it from working with RGMII PHYs on
> Meson8b: bit 10 of the PRG_ETH0 register is configures a clock gate
> (instead of a divide by 5 or divide by 10 clock divider). This has not
> been visible on GXBB and later due to the input clock which always led
> to a selection of "divide by 10" (which is done internally in the IP
> block, but the bit actually means "enable RGMII clock output").
> PATCH #3 was added to address this issue.
> - the commit message of PATCH #4 and #5 (formerly PATCH #2 and #3) were
> updated and the patch itself rebased because the m25_div clock was
> removed with the new PATCH #3 (so some of the statements were not
> valid anymore)
>
Here is the clk_summary relative to ethernet on Odroid-C1+
with this new series applied:
xtal 1 1 24000000 0 0
sys_pll 0 0 1200000000 0 0
cpu_clk 0 0 1200000000 0 0
vid_pll 0 0 732000000 0 0
fixed_pll 2 2 2550000000 0 0
mpll2 1 1 249999701 0 0
c9410000.ethernet#m250_sel 1 1 249999701 0 0
c9410000.ethernet#m250_div 1 1 249999701 0 0
c9410000.ethernet#fixed_div10 1 1 24999970 0 0
c9410000.ethernet#m25_en 1 1 24999970 0 0
The ethernet prg0 register is set to 0x74A1 which should be correct with
respect to the information contained in the S805 SoC manual.
Actually, the ethernet is not yet fully functional.
Trying to ping the board, I can see ARP request from host to board using
tcpdump. However, the host can't see any response.
Following the U-Boot value for prg0 register, which is 0x7d21, I also
tried to set bit 11. As expected, this did not have any influence.
Another thing that we should check is the "Ethernet Memory PD" (see S805
manual - sec. 5.4) register which bits 3-2 enable/disable ethernet
normal operation. However, those bits are already cleared by U-Boot.
Thank you for the support.
Best regards,
Emiliano
>
> [0] http://lists.infradead.org/pipermail/linux-amlogic/2017-December/005596.html
> [1] http://lists.infradead.org/pipermail/linux-amlogic/2017-December/005848.html
> [2] http://lists.infradead.org/pipermail/linux-amlogic/2017-December/005861.html
>
>
> Martin Blumenstingl (5):
> net: stmmac: dwmac-meson8b: only configure the clocks in RGMII mode
> net: stmmac: dwmac-meson8b: simplify generating the clock names
> net: stmmac: dwmac-meson8b: fix internal RGMII clock configuration
> net: stmmac: dwmac-meson8b: fix setting the RGMII clock on Meson8b
> net: stmmac: dwmac-meson8b: propagate rate changes to the parent clock
>
> .../net/ethernet/stmicro/stmmac/dwmac-meson8b.c | 119 +++++++++++----------
> 1 file changed, 63 insertions(+), 56 deletions(-)
>
> --
> 2.15.1
>
^ permalink raw reply
* [GIT] Networking
From: David Miller @ 2017-12-29 2:05 UTC (permalink / raw)
To: torvalds; +Cc: akpm, netdev, linux-kernel
1) IPv6 gre tunnels end up with different default features enabled
depending upon whether netlink or ioctls are used to bring them
up. Fix from Alexey Kodanev.
2) Fix read past end of user control message in RDS< from Avinash
Repaka.
3) Missing RCU barrier in mini qdisc code, from Cong Wang.
4) Missing policy put when reusing per-cpu route entries, from
Florian Westphal.
5) Handle nested PCI errors properly in bnx2x driver, from Guilherme
G. Piccoli.
6) Run nested transport mode IPSEC packets via tasklet, from Herbert
Xu.
7) Fix handling poll() for stream sockets in tipc, from Parthasarathy
Bhuvaragan.
8) Fix two stack-out-of-bounds issues in IPSEC, from Steffen Klassert.
9) Another zerocopy ubuf handling fix, from Willem de Bruijn.
Please pull, thanks a lot!
The following changes since commit ead68f216110170ec729e2c4dec0aad6d38259d7:
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (2017-12-21 15:57:30 -0800)
are available in the git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
for you to fetch changes up to d5902f6d1fbdb27e6a33c418063466d94be9dfa2:
Merge branch 'strparser-Fix-lockdep-issue' (2017-12-28 14:28:23 -0500)
----------------------------------------------------------------
Alexey Kodanev (1):
ip6_gre: fix device features for ioctl setup
Antony Antony (1):
xfrm: fix xfrm_do_migrate() with AEAD e.g(AES-GCM)
Avinash Repaka (1):
RDS: Check cmsg_len before dereferencing CMSG_DATA
Aviv Heller (1):
xfrm: Fix xfrm_input() to verify state is valid when (encap_type < 0)
Cong Wang (2):
xfrm: check id proto in validate_tmpl()
net_sched: fix a missing rcu barrier in mini_qdisc_pair_swap()
Daniel Borkmann (1):
Merge branch 'bpf-bpftool-various-fixes'
David S. Miller (4):
Merge branch 'master' of git://git.kernel.org/.../klassert/ipsec
Merge branch 'tg3-fixes'
Merge git://git.kernel.org/.../bpf/bpf
Merge branch 'strparser-Fix-lockdep-issue'
Florian Westphal (1):
xfrm: put policies when reusing pcpu xdst entry
Fugang Duan (1):
net: fec: unmap the xmit buffer that are not transferred by DMA
Grygorii Strashko (1):
net: phy: micrel: ksz9031: reconfigure autoneg after phy autoneg workaround
Guilherme G. Piccoli (1):
bnx2x: Improve reliability in case of nested PCI errors
Herbert Xu (1):
xfrm: Reinject transport-mode packets through tasklet
Jakub Kicinski (2):
tools: bpftool: maps: close json array on error paths of show
tools: bpftool: protect against races with disappearing objects
Jiri Pirko (1):
net: sched: fix possible null pointer deref in tcf_block_put
Jon Maloy (2):
tipc: base group replicast ack counter on number of actual receivers
tipc: fix memory leak of group member when peer node is lost
Mat Martineau (1):
tcp: Avoid preprocessor directives in tracepoint macro args
Michal Kubecek (1):
xfrm: fix XFRMA_OUTPUT_MARK policy entry
Parthasarathy Bhuvaragan (1):
tipc: fix hanging poll() for stream sockets
Quentin Monnet (1):
selftests/bpf: fix Makefile for passing LLC to the command line
Russell King (2):
phylink: ensure the PHY interface mode is appropriately set
phylink: ensure AN is enabled
Siva Reddy Kallam (3):
tg3: Update copyright
tg3: Add workaround to restrict 5762 MRRS to 2048
tg3: Enable PHY reset in MTU change path for 5720
Steffen Klassert (2):
xfrm: Fix stack-out-of-bounds read on socket policy lookup.
xfrm: Fix stack-out-of-bounds with misconfigured transport mode policies.
Tom Herbert (2):
sock: Add sock_owned_by_user_nocheck
strparser: Call sock_owned_by_user_nocheck
Tommi Rantala (2):
tipc: error path leak fixes in tipc_enable_bearer()
tipc: fix tipc_mon_delete() oops in tipc_enable_bearer() error path
Tonghao Zhang (1):
sctp: Replace use of sockets_allocated with specified macro.
Willem de Bruijn (1):
skbuff: in skb_copy_ubufs unclone before releasing zerocopy
drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 4 +--
drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 14 +++++++++-
drivers/net/ethernet/broadcom/tg3.c | 19 +++++++++++--
drivers/net/ethernet/broadcom/tg3.h | 7 ++++-
drivers/net/ethernet/freescale/fec_main.c | 6 ++++
drivers/net/phy/micrel.c | 1 +
drivers/net/phy/phylink.c | 2 ++
include/net/sock.h | 5 ++++
include/net/xfrm.h | 3 ++
include/trace/events/tcp.h | 97 +++++++++++++++++++++++++----------------------------------------
net/core/skbuff.c | 6 ++--
net/ipv4/xfrm4_input.c | 12 +++++++-
net/ipv6/ip6_gre.c | 57 +++++++++++++++++++++-----------------
net/ipv6/xfrm6_input.c | 10 ++++++-
net/rds/send.c | 3 ++
net/sched/cls_api.c | 2 ++
net/sched/sch_generic.c | 4 ++-
net/sctp/socket.c | 4 +--
net/strparser/strparser.c | 2 +-
net/tipc/bearer.c | 5 +++-
net/tipc/group.c | 31 ++++++++++++++-------
net/tipc/monitor.c | 6 +++-
net/tipc/socket.c | 2 +-
net/xfrm/xfrm_input.c | 69 +++++++++++++++++++++++++++++++++++++++++++++-
net/xfrm/xfrm_policy.c | 9 +++++-
net/xfrm/xfrm_state.c | 1 +
net/xfrm/xfrm_user.c | 26 +++++++++++++++++-
tools/bpf/bpftool/map.c | 8 ++++--
tools/bpf/bpftool/prog.c | 2 ++
tools/testing/selftests/bpf/Makefile | 2 +-
30 files changed, 298 insertions(+), 121 deletions(-)
^ permalink raw reply
* [PATCH net-next 0/2] tun: allow to attach eBPF filter
From: Jason Wang @ 2017-12-29 2:44 UTC (permalink / raw)
To: netdev, linux-kernel; +Cc: mst, willemb, Jason Wang
Hi all:
This series tries to implement eBPF socket filter for tun. This could
be used for implementing efficient virtio-net receive filter for
vhost-net.
Thanks
Jason Wang (2):
tuntap: rename struct tun_steering_prog to struct tun_prog
tun: allow to attach ebpf socket filter
drivers/net/tun.c | 58 ++++++++++++++++++++++++++++++++-------------
include/uapi/linux/if_tun.h | 1 +
2 files changed, 43 insertions(+), 16 deletions(-)
--
2.7.4
^ permalink raw reply
* [PATCH net-next 1/2] tuntap: rename struct tun_steering_prog to struct tun_prog
From: Jason Wang @ 2017-12-29 2:44 UTC (permalink / raw)
To: netdev, linux-kernel; +Cc: mst, willemb, Jason Wang
In-Reply-To: <1514515491-6041-1-git-send-email-jasowang@redhat.com>
To be reused by other eBPF program other than queue selection.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/tun.c | 32 ++++++++++++++++----------------
1 file changed, 16 insertions(+), 16 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index e367d631..0853829 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -195,7 +195,7 @@ struct tun_flow_entry {
#define TUN_NUM_FLOW_ENTRIES 1024
-struct tun_steering_prog {
+struct tun_prog {
struct rcu_head rcu;
struct bpf_prog *prog;
};
@@ -237,7 +237,7 @@ struct tun_struct {
u32 rx_batched;
struct tun_pcpu_stats __percpu *pcpu_stats;
struct bpf_prog __rcu *xdp_prog;
- struct tun_steering_prog __rcu *steering_prog;
+ struct tun_prog __rcu *steering_prog;
};
static int tun_napi_receive(struct napi_struct *napi, int budget)
@@ -571,7 +571,7 @@ static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
{
- struct tun_steering_prog *prog;
+ struct tun_prog *prog;
u16 ret = 0;
prog = rcu_dereference(tun->steering_prog);
@@ -2027,19 +2027,18 @@ static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
return ret;
}
-static void tun_steering_prog_free(struct rcu_head *rcu)
+static void tun_prog_free(struct rcu_head *rcu)
{
- struct tun_steering_prog *prog = container_of(rcu,
- struct tun_steering_prog, rcu);
+ struct tun_prog *prog = container_of(rcu, struct tun_prog, rcu);
bpf_prog_destroy(prog->prog);
kfree(prog);
}
-static int __tun_set_steering_ebpf(struct tun_struct *tun,
- struct bpf_prog *prog)
+static int __tun_set_ebpf(struct tun_struct *tun, struct tun_prog **prog_p,
+ struct bpf_prog *prog)
{
- struct tun_steering_prog *old, *new = NULL;
+ struct tun_prog *old, *new = NULL;
if (prog) {
new = kmalloc(sizeof(*new), GFP_KERNEL);
@@ -2049,13 +2048,13 @@ static int __tun_set_steering_ebpf(struct tun_struct *tun,
}
spin_lock_bh(&tun->lock);
- old = rcu_dereference_protected(tun->steering_prog,
+ old = rcu_dereference_protected(*prog_p,
lockdep_is_held(&tun->lock));
- rcu_assign_pointer(tun->steering_prog, new);
+ rcu_assign_pointer(*prog_p, new);
spin_unlock_bh(&tun->lock);
if (old)
- call_rcu(&old->rcu, tun_steering_prog_free);
+ call_rcu(&old->rcu, tun_prog_free);
return 0;
}
@@ -2068,7 +2067,7 @@ static void tun_free_netdev(struct net_device *dev)
free_percpu(tun->pcpu_stats);
tun_flow_uninit(tun);
security_tun_dev_free_security(tun->security);
- __tun_set_steering_ebpf(tun, NULL);
+ __tun_set_ebpf(tun, &tun->steering_prog, NULL);
}
static void tun_setup(struct net_device *dev)
@@ -2550,7 +2549,8 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
return ret;
}
-static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
+static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog **prog_p,
+ void __user *data)
{
struct bpf_prog *prog;
int fd;
@@ -2566,7 +2566,7 @@ static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
return PTR_ERR(prog);
}
- return __tun_set_steering_ebpf(tun, prog);
+ return __tun_set_ebpf(tun, prog_p, prog);
}
static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
@@ -2846,7 +2846,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
break;
case TUNSETSTEERINGEBPF:
- ret = tun_set_steering_ebpf(tun, argp);
+ ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
break;
default:
--
2.7.4
^ permalink raw reply related
* [PATCH net-next 2/2] tun: allow to attach ebpf socket filter
From: Jason Wang @ 2017-12-29 2:44 UTC (permalink / raw)
To: netdev, linux-kernel; +Cc: mst, willemb, Jason Wang
In-Reply-To: <1514515491-6041-1-git-send-email-jasowang@redhat.com>
This patch allows userspace to attach eBPF filter to tun. This will
allow to implement VM dataplane filtering in a more efficient way
compared to cBPF filter.
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/net/tun.c | 26 ++++++++++++++++++++++++++
include/uapi/linux/if_tun.h | 1 +
2 files changed, 27 insertions(+)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 0853829..6e9452b 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -238,6 +238,7 @@ struct tun_struct {
struct tun_pcpu_stats __percpu *pcpu_stats;
struct bpf_prog __rcu *xdp_prog;
struct tun_prog __rcu *steering_prog;
+ struct tun_prog __rcu *filter_prog;
};
static int tun_napi_receive(struct napi_struct *napi, int budget)
@@ -984,12 +985,25 @@ static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
#endif
}
+static unsigned int run_ebpf_filter(struct tun_struct *tun,
+ struct sk_buff *skb,
+ int len)
+{
+ struct tun_prog *prog = rcu_dereference(tun->filter_prog);
+
+ if (prog)
+ len = bpf_prog_run_clear_cb(prog->prog, skb);
+
+ return len;
+}
+
/* Net device start xmit */
static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct tun_struct *tun = netdev_priv(dev);
int txq = skb->queue_mapping;
struct tun_file *tfile;
+ int len = skb->len;
rcu_read_lock();
tfile = rcu_dereference(tun->tfiles[txq]);
@@ -1015,9 +1029,16 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
sk_filter(tfile->socket.sk, skb))
goto drop;
+ len = run_ebpf_filter(tun, skb, len);
+ if (!len)
+ goto drop;
+
if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
goto drop;
+ if (pskb_trim(skb, len))
+ goto drop;
+
skb_tx_timestamp(skb);
/* Orphan the skb - required as we might hang on to it
@@ -2068,6 +2089,7 @@ static void tun_free_netdev(struct net_device *dev)
tun_flow_uninit(tun);
security_tun_dev_free_security(tun->security);
__tun_set_ebpf(tun, &tun->steering_prog, NULL);
+ __tun_set_ebpf(tun, &tun->filter_prog, NULL);
}
static void tun_setup(struct net_device *dev)
@@ -2849,6 +2871,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
break;
+ case TUNSETFILTEREBPF:
+ ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
+ break;
+
default:
ret = -EINVAL;
break;
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index fb38c17..ee432cd 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -58,6 +58,7 @@
#define TUNSETVNETBE _IOW('T', 222, int)
#define TUNGETVNETBE _IOR('T', 223, int)
#define TUNSETSTEERINGEBPF _IOR('T', 224, int)
+#define TUNSETFILTEREBPF _IOR('T', 225, int)
/* TUNSETIFF ifr flags */
#define IFF_TUN 0x0001
--
2.7.4
^ permalink raw reply related
* [PATCH net-next v7 0/6] net: tcp: sctp: dccp: Replace jprobe usage with trace events
From: Masami Hiramatsu @ 2017-12-29 2:45 UTC (permalink / raw)
To: Ingo Molnar, David S . Miller, Ian McDonald, Vlad Yasevich,
Stephen Hemminger, Steven Rostedt
Cc: Peter Zijlstra, Thomas Gleixner, LKML, H . Peter Anvin,
Gerrit Renker, Neil Horman, dccp, netdev, linux-sctp,
Stephen Rothwell, mhiramat
Hi,
This series is v7 of the replacement of jprobe usage with trace
events. This version fixes net/dccp/trace.h to avoid sparse
warning. Since the TP_STORE_ADDR_PORTS macro can be shared
with trace/events/tcp.h, it also introduce a new common header
file and move the definition of that macro.
Previous version is here;
https://lkml.org/lkml/2017/12/28/7
Changes from v6:
[5/6]: Avoid preprocessor directives in tracepoint macro args
Thank you,
---
Masami Hiramatsu (6):
net: tcp: Add trace events for TCP congestion window tracing
net: tcp: Remove TCP probe module
net: sctp: Add SCTP ACK tracking trace event
net: sctp: Remove debug SCTP probe module
net: dccp: Add DCCP sendmsg trace event
net: dccp: Remove dccpprobe module
include/trace/events/net_probe_common.h | 44 +++++
include/trace/events/sctp.h | 99 ++++++++++
include/trace/events/tcp.h | 60 ++++++
net/Kconfig | 17 --
net/dccp/Kconfig | 17 --
net/dccp/Makefile | 5 -
net/dccp/probe.c | 203 ---------------------
net/dccp/proto.c | 5 +
net/dccp/trace.h | 84 +++++++++
net/ipv4/Makefile | 1
net/ipv4/tcp_input.c | 3
net/ipv4/tcp_probe.c | 301 -------------------------------
net/sctp/Kconfig | 12 -
net/sctp/Makefile | 3
net/sctp/probe.c | 244 -------------------------
net/sctp/sm_statefuns.c | 5 +
16 files changed, 303 insertions(+), 800 deletions(-)
create mode 100644 include/trace/events/net_probe_common.h
create mode 100644 include/trace/events/sctp.h
delete mode 100644 net/dccp/probe.c
create mode 100644 net/dccp/trace.h
delete mode 100644 net/ipv4/tcp_probe.c
delete mode 100644 net/sctp/probe.c
--
Masami Hiramatsu (Linaro) <mhiramat@kernel.org>
^ permalink raw reply
* [PATCH net-next v7 1/6] net: tcp: Add trace events for TCP congestion window tracing
From: Masami Hiramatsu @ 2017-12-29 2:45 UTC (permalink / raw)
To: Ingo Molnar, David S . Miller, Ian McDonald, Vlad Yasevich,
Stephen Hemminger, Steven Rostedt
Cc: Peter Zijlstra, Thomas Gleixner, LKML, H . Peter Anvin,
Gerrit Renker, Neil Horman, dccp, netdev, linux-sctp,
Stephen Rothwell, mhiramat
In-Reply-To: <151451552014.17912.11834170408829155608.stgit@devbox>
This adds an event to trace TCP stat variables with
slightly intrusive trace-event. This uses ftrace/perf
event log buffer to trace those state, no needs to
prepare own ring-buffer, nor custom user apps.
User can use ftrace to trace this event as below;
# cd /sys/kernel/debug/tracing
# echo 1 > events/tcp/tcp_probe/enable
(run workloads)
# cat trace
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
---
Changes in v6:
- Avoid preprocessor directives in tracepoint macro args as
Mat did on net tree.
---
include/trace/events/tcp.h | 97 ++++++++++++++++++++++++++++++++++++++++++++
net/ipv4/tcp_input.c | 3 +
2 files changed, 100 insertions(+)
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 8e88a1671538..4dea6342f7d4 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM tcp
@@ -8,6 +9,7 @@
#include <linux/tcp.h>
#include <linux/tracepoint.h>
#include <net/ipv6.h>
+#include <net/tcp.h>
/*
* tcp event with arguments sk and skb
@@ -277,6 +279,101 @@ TRACE_EVENT(tcp_retransmit_synack,
__entry->saddr_v6, __entry->daddr_v6)
);
+
+#define TP_STORE_ADDR_PORTS_V4(__entry, inet, sk) \
+ do { \
+ struct sockaddr_in *v4 = (void *)__entry->saddr; \
+ \
+ v4->sin_family = AF_INET; \
+ v4->sin_port = inet->inet_sport; \
+ v4->sin_addr.s_addr = inet->inet_saddr; \
+ v4 = (void *)__entry->daddr; \
+ v4->sin_family = AF_INET; \
+ v4->sin_port = inet->inet_dport; \
+ v4->sin_addr.s_addr = inet->inet_daddr; \
+ } while (0)
+
+#if IS_ENABLED(CONFIG_IPV6)
+
+#define TP_STORE_ADDR_PORTS(__entry, inet, sk) \
+ do { \
+ if (sk->sk_family == AF_INET6) { \
+ struct sockaddr_in6 *v6 = (void *)__entry->saddr; \
+ \
+ v6->sin6_family = AF_INET6; \
+ v6->sin6_port = inet->inet_sport; \
+ v6->sin6_addr = inet6_sk(sk)->saddr; \
+ v6 = (void *)__entry->daddr; \
+ v6->sin6_family = AF_INET6; \
+ v6->sin6_port = inet->inet_dport; \
+ v6->sin6_addr = sk->sk_v6_daddr; \
+ } else \
+ TP_STORE_ADDR_PORTS_V4(__entry, inet, sk); \
+ } while (0)
+
+#else
+
+#define TP_STORE_ADDR_PORTS(__entry, inet, sk) \
+ TP_STORE_ADDR_PORTS_V4(__entry, inet, sk);
+
+#endif
+
+TRACE_EVENT(tcp_probe,
+
+ TP_PROTO(struct sock *sk, struct sk_buff *skb),
+
+ TP_ARGS(sk, skb),
+
+ TP_STRUCT__entry(
+ /* sockaddr_in6 is always bigger than sockaddr_in */
+ __array(__u8, saddr, sizeof(struct sockaddr_in6))
+ __array(__u8, daddr, sizeof(struct sockaddr_in6))
+ __field(__u16, sport)
+ __field(__u16, dport)
+ __field(__u32, mark)
+ __field(__u16, length)
+ __field(__u32, snd_nxt)
+ __field(__u32, snd_una)
+ __field(__u32, snd_cwnd)
+ __field(__u32, ssthresh)
+ __field(__u32, snd_wnd)
+ __field(__u32, srtt)
+ __field(__u32, rcv_wnd)
+ ),
+
+ TP_fast_assign(
+ const struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_sock *inet = inet_sk(sk);
+
+ memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
+ memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
+
+ TP_STORE_ADDR_PORTS(__entry, inet, sk);
+
+ /* For filtering use */
+ __entry->sport = ntohs(inet->inet_sport);
+ __entry->dport = ntohs(inet->inet_dport);
+ __entry->mark = skb->mark;
+
+ __entry->length = skb->len;
+ __entry->snd_nxt = tp->snd_nxt;
+ __entry->snd_una = tp->snd_una;
+ __entry->snd_cwnd = tp->snd_cwnd;
+ __entry->snd_wnd = tp->snd_wnd;
+ __entry->rcv_wnd = tp->rcv_wnd;
+ __entry->ssthresh = tcp_current_ssthresh(sk);
+ __entry->srtt = tp->srtt_us >> 3;
+ ),
+
+ TP_printk("src=%pISpc dest=%pISpc mark=%#x length=%d snd_nxt=%#x "
+ "snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u "
+ "rcv_wnd=%u",
+ __entry->saddr, __entry->daddr, __entry->mark,
+ __entry->length, __entry->snd_nxt, __entry->snd_una,
+ __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd,
+ __entry->srtt, __entry->rcv_wnd)
+);
+
#endif /* _TRACE_TCP_H */
/* This part must be outside protection */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4d55c4b338ee..ff71b18d9682 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5299,6 +5299,9 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
unsigned int len = skb->len;
struct tcp_sock *tp = tcp_sk(sk);
+ /* TCP congestion window tracking */
+ trace_tcp_probe(sk, skb);
+
tcp_mstamp_refresh(tp);
if (unlikely(!sk->sk_rx_dst))
inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
^ permalink raw reply related
* [PATCH net-next v7 2/6] net: tcp: Remove TCP probe module
From: Masami Hiramatsu @ 2017-12-29 2:46 UTC (permalink / raw)
To: Ingo Molnar, David S . Miller, Ian McDonald, Vlad Yasevich,
Stephen Hemminger, Steven Rostedt
Cc: Peter Zijlstra, Thomas Gleixner, LKML, H . Peter Anvin,
Gerrit Renker, Neil Horman, dccp, netdev, linux-sctp,
Stephen Rothwell, mhiramat
In-Reply-To: <151451552014.17912.11834170408829155608.stgit@devbox>
Remove TCP probe module since jprobe has been deprecated.
That function is now replaced by tcp/tcp_probe trace-event.
You can use it via ftrace or perftools.
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
---
net/Kconfig | 17 ---
net/ipv4/Makefile | 1
net/ipv4/tcp_probe.c | 301 --------------------------------------------------
3 files changed, 319 deletions(-)
delete mode 100644 net/ipv4/tcp_probe.c
diff --git a/net/Kconfig b/net/Kconfig
index 9dba2715919d..efe930db3c08 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -336,23 +336,6 @@ config NET_PKTGEN
To compile this code as a module, choose M here: the
module will be called pktgen.
-config NET_TCPPROBE
- tristate "TCP connection probing"
- depends on INET && PROC_FS && KPROBES
- ---help---
- This module allows for capturing the changes to TCP connection
- state in response to incoming packets. It is used for debugging
- TCP congestion avoidance modules. If you don't understand
- what was just said, you don't need it: say N.
-
- Documentation on how to use TCP connection probing can be found
- at:
-
- http://www.linuxfoundation.org/collaborate/workgroups/networking/tcpprobe
-
- To compile this code as a module, choose M here: the
- module will be called tcp_probe.
-
config NET_DROP_MONITOR
tristate "Network packet drop alerting service"
depends on INET && TRACEPOINTS
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index c6c8ad1d4b6d..47a0a6649a9d 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -43,7 +43,6 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
-obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
deleted file mode 100644
index 697f4c67b2e3..000000000000
--- a/net/ipv4/tcp_probe.c
+++ /dev/null
@@ -1,301 +0,0 @@
-/*
- * tcpprobe - Observe the TCP flow with kprobes.
- *
- * The idea for this came from Werner Almesberger's umlsim
- * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/kprobes.h>
-#include <linux/socket.h>
-#include <linux/tcp.h>
-#include <linux/slab.h>
-#include <linux/proc_fs.h>
-#include <linux/module.h>
-#include <linux/ktime.h>
-#include <linux/time.h>
-#include <net/net_namespace.h>
-
-#include <net/tcp.h>
-
-MODULE_AUTHOR("Stephen Hemminger <shemminger@linux-foundation.org>");
-MODULE_DESCRIPTION("TCP cwnd snooper");
-MODULE_LICENSE("GPL");
-MODULE_VERSION("1.1");
-
-static int port __read_mostly;
-MODULE_PARM_DESC(port, "Port to match (0=all)");
-module_param(port, int, 0);
-
-static unsigned int bufsize __read_mostly = 4096;
-MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)");
-module_param(bufsize, uint, 0);
-
-static unsigned int fwmark __read_mostly;
-MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)");
-module_param(fwmark, uint, 0);
-
-static int full __read_mostly;
-MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)");
-module_param(full, int, 0);
-
-static const char procname[] = "tcpprobe";
-
-struct tcp_log {
- ktime_t tstamp;
- union {
- struct sockaddr raw;
- struct sockaddr_in v4;
- struct sockaddr_in6 v6;
- } src, dst;
- u16 length;
- u32 snd_nxt;
- u32 snd_una;
- u32 snd_wnd;
- u32 rcv_wnd;
- u32 snd_cwnd;
- u32 ssthresh;
- u32 srtt;
-};
-
-static struct {
- spinlock_t lock;
- wait_queue_head_t wait;
- ktime_t start;
- u32 lastcwnd;
-
- unsigned long head, tail;
- struct tcp_log *log;
-} tcp_probe;
-
-static inline int tcp_probe_used(void)
-{
- return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1);
-}
-
-static inline int tcp_probe_avail(void)
-{
- return bufsize - tcp_probe_used() - 1;
-}
-
-#define tcp_probe_copy_fl_to_si4(inet, si4, mem) \
- do { \
- si4.sin_family = AF_INET; \
- si4.sin_port = inet->inet_##mem##port; \
- si4.sin_addr.s_addr = inet->inet_##mem##addr; \
- } while (0) \
-
-/*
- * Hook inserted to be called before each receive packet.
- * Note: arguments must match tcp_rcv_established()!
- */
-static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct tcphdr *th)
-{
- unsigned int len = skb->len;
- const struct tcp_sock *tp = tcp_sk(sk);
- const struct inet_sock *inet = inet_sk(sk);
-
- /* Only update if port or skb mark matches */
- if (((port == 0 && fwmark == 0) ||
- ntohs(inet->inet_dport) == port ||
- ntohs(inet->inet_sport) == port ||
- (fwmark > 0 && skb->mark == fwmark)) &&
- (full || tp->snd_cwnd != tcp_probe.lastcwnd)) {
-
- spin_lock(&tcp_probe.lock);
- /* If log fills, just silently drop */
- if (tcp_probe_avail() > 1) {
- struct tcp_log *p = tcp_probe.log + tcp_probe.head;
-
- p->tstamp = ktime_get();
- switch (sk->sk_family) {
- case AF_INET:
- tcp_probe_copy_fl_to_si4(inet, p->src.v4, s);
- tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d);
- break;
- case AF_INET6:
- memset(&p->src.v6, 0, sizeof(p->src.v6));
- memset(&p->dst.v6, 0, sizeof(p->dst.v6));
-#if IS_ENABLED(CONFIG_IPV6)
- p->src.v6.sin6_family = AF_INET6;
- p->src.v6.sin6_port = inet->inet_sport;
- p->src.v6.sin6_addr = inet6_sk(sk)->saddr;
-
- p->dst.v6.sin6_family = AF_INET6;
- p->dst.v6.sin6_port = inet->inet_dport;
- p->dst.v6.sin6_addr = sk->sk_v6_daddr;
-#endif
- break;
- default:
- BUG();
- }
-
- p->length = len;
- p->snd_nxt = tp->snd_nxt;
- p->snd_una = tp->snd_una;
- p->snd_cwnd = tp->snd_cwnd;
- p->snd_wnd = tp->snd_wnd;
- p->rcv_wnd = tp->rcv_wnd;
- p->ssthresh = tcp_current_ssthresh(sk);
- p->srtt = tp->srtt_us >> 3;
-
- tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1);
- }
- tcp_probe.lastcwnd = tp->snd_cwnd;
- spin_unlock(&tcp_probe.lock);
-
- wake_up(&tcp_probe.wait);
- }
-
- jprobe_return();
-}
-
-static struct jprobe tcp_jprobe = {
- .kp = {
- .symbol_name = "tcp_rcv_established",
- },
- .entry = jtcp_rcv_established,
-};
-
-static int tcpprobe_open(struct inode *inode, struct file *file)
-{
- /* Reset (empty) log */
- spin_lock_bh(&tcp_probe.lock);
- tcp_probe.head = tcp_probe.tail = 0;
- tcp_probe.start = ktime_get();
- spin_unlock_bh(&tcp_probe.lock);
-
- return 0;
-}
-
-static int tcpprobe_sprint(char *tbuf, int n)
-{
- const struct tcp_log *p
- = tcp_probe.log + tcp_probe.tail;
- struct timespec64 ts
- = ktime_to_timespec64(ktime_sub(p->tstamp, tcp_probe.start));
-
- return scnprintf(tbuf, n,
- "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n",
- (unsigned long)ts.tv_sec,
- (unsigned long)ts.tv_nsec,
- &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una,
- p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd);
-}
-
-static ssize_t tcpprobe_read(struct file *file, char __user *buf,
- size_t len, loff_t *ppos)
-{
- int error = 0;
- size_t cnt = 0;
-
- if (!buf)
- return -EINVAL;
-
- while (cnt < len) {
- char tbuf[256];
- int width;
-
- /* Wait for data in buffer */
- error = wait_event_interruptible(tcp_probe.wait,
- tcp_probe_used() > 0);
- if (error)
- break;
-
- spin_lock_bh(&tcp_probe.lock);
- if (tcp_probe.head == tcp_probe.tail) {
- /* multiple readers race? */
- spin_unlock_bh(&tcp_probe.lock);
- continue;
- }
-
- width = tcpprobe_sprint(tbuf, sizeof(tbuf));
-
- if (cnt + width < len)
- tcp_probe.tail = (tcp_probe.tail + 1) & (bufsize - 1);
-
- spin_unlock_bh(&tcp_probe.lock);
-
- /* if record greater than space available
- return partial buffer (so far) */
- if (cnt + width >= len)
- break;
-
- if (copy_to_user(buf + cnt, tbuf, width))
- return -EFAULT;
- cnt += width;
- }
-
- return cnt == 0 ? error : cnt;
-}
-
-static const struct file_operations tcpprobe_fops = {
- .owner = THIS_MODULE,
- .open = tcpprobe_open,
- .read = tcpprobe_read,
- .llseek = noop_llseek,
-};
-
-static __init int tcpprobe_init(void)
-{
- int ret = -ENOMEM;
-
- /* Warning: if the function signature of tcp_rcv_established,
- * has been changed, you also have to change the signature of
- * jtcp_rcv_established, otherwise you end up right here!
- */
- BUILD_BUG_ON(__same_type(tcp_rcv_established,
- jtcp_rcv_established) == 0);
-
- init_waitqueue_head(&tcp_probe.wait);
- spin_lock_init(&tcp_probe.lock);
-
- if (bufsize == 0)
- return -EINVAL;
-
- bufsize = roundup_pow_of_two(bufsize);
- tcp_probe.log = kcalloc(bufsize, sizeof(struct tcp_log), GFP_KERNEL);
- if (!tcp_probe.log)
- goto err0;
-
- if (!proc_create(procname, S_IRUSR, init_net.proc_net, &tcpprobe_fops))
- goto err0;
-
- ret = register_jprobe(&tcp_jprobe);
- if (ret)
- goto err1;
-
- pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n",
- port, fwmark, bufsize);
- return 0;
- err1:
- remove_proc_entry(procname, init_net.proc_net);
- err0:
- kfree(tcp_probe.log);
- return ret;
-}
-module_init(tcpprobe_init);
-
-static __exit void tcpprobe_exit(void)
-{
- remove_proc_entry(procname, init_net.proc_net);
- unregister_jprobe(&tcp_jprobe);
- kfree(tcp_probe.log);
-}
-module_exit(tcpprobe_exit);
^ permalink raw reply related
* [PATCH net-next v7 3/6] net: sctp: Add SCTP ACK tracking trace event
From: Masami Hiramatsu @ 2017-12-29 2:46 UTC (permalink / raw)
To: Ingo Molnar, David S . Miller, Ian McDonald, Vlad Yasevich,
Stephen Hemminger, Steven Rostedt
Cc: Peter Zijlstra, Thomas Gleixner, LKML, H . Peter Anvin,
Gerrit Renker, Neil Horman, dccp, netdev, linux-sctp,
Stephen Rothwell, mhiramat
In-Reply-To: <151451552014.17912.11834170408829155608.stgit@devbox>
Add SCTP ACK tracking trace event to trace the changes of SCTP
association state in response to incoming packets.
It is used for debugging SCTP congestion control algorithms,
and will replace sctp_probe module.
Note that this event a bit tricky. Since this consists of 2
events (sctp_probe and sctp_probe_path) so you have to enable
both events as below.
# cd /sys/kernel/debug/tracing
# echo 1 > events/sctp/sctp_probe/enable
# echo 1 > events/sctp/sctp_probe_path/enable
Or, you can enable all the events under sctp.
# echo 1 > events/sctp/enable
Since sctp_probe_path event is always invoked from sctp_probe
event, you can not see any output if you only enable
sctp_probe_path.
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
---
Changes in v3:
- Add checking whether sctp_probe_path event is enabled
before iterating sctp paths to record. Thanks Steven.
Changes in v4:
- Move a temporal variable definition in the block.
- Fix to cast pointer to unsigned long instead of __u64
for 32bit environment.
---
include/trace/events/sctp.h | 99 +++++++++++++++++++++++++++++++++++++++++++
net/sctp/sm_statefuns.c | 5 ++
2 files changed, 104 insertions(+)
create mode 100644 include/trace/events/sctp.h
diff --git a/include/trace/events/sctp.h b/include/trace/events/sctp.h
new file mode 100644
index 000000000000..7475c7be165a
--- /dev/null
+++ b/include/trace/events/sctp.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM sctp
+
+#if !defined(_TRACE_SCTP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SCTP_H
+
+#include <net/sctp/structs.h>
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(sctp_probe_path,
+
+ TP_PROTO(struct sctp_transport *sp,
+ const struct sctp_association *asoc),
+
+ TP_ARGS(sp, asoc),
+
+ TP_STRUCT__entry(
+ __field(__u64, asoc)
+ __field(__u32, primary)
+ __array(__u8, ipaddr, sizeof(union sctp_addr))
+ __field(__u32, state)
+ __field(__u32, cwnd)
+ __field(__u32, ssthresh)
+ __field(__u32, flight_size)
+ __field(__u32, partial_bytes_acked)
+ __field(__u32, pathmtu)
+ ),
+
+ TP_fast_assign(
+ __entry->asoc = (unsigned long)asoc;
+ __entry->primary = (sp == asoc->peer.primary_path);
+ memcpy(__entry->ipaddr, &sp->ipaddr, sizeof(union sctp_addr));
+ __entry->state = sp->state;
+ __entry->cwnd = sp->cwnd;
+ __entry->ssthresh = sp->ssthresh;
+ __entry->flight_size = sp->flight_size;
+ __entry->partial_bytes_acked = sp->partial_bytes_acked;
+ __entry->pathmtu = sp->pathmtu;
+ ),
+
+ TP_printk("asoc=%#llx%s ipaddr=%pISpc state=%u cwnd=%u ssthresh=%u "
+ "flight_size=%u partial_bytes_acked=%u pathmtu=%u",
+ __entry->asoc, __entry->primary ? "(*)" : "",
+ __entry->ipaddr, __entry->state, __entry->cwnd,
+ __entry->ssthresh, __entry->flight_size,
+ __entry->partial_bytes_acked, __entry->pathmtu)
+);
+
+TRACE_EVENT(sctp_probe,
+
+ TP_PROTO(const struct sctp_endpoint *ep,
+ const struct sctp_association *asoc,
+ struct sctp_chunk *chunk),
+
+ TP_ARGS(ep, asoc, chunk),
+
+ TP_STRUCT__entry(
+ __field(__u64, asoc)
+ __field(__u32, mark)
+ __field(__u16, bind_port)
+ __field(__u16, peer_port)
+ __field(__u32, pathmtu)
+ __field(__u32, rwnd)
+ __field(__u16, unack_data)
+ ),
+
+ TP_fast_assign(
+ struct sk_buff *skb = chunk->skb;
+
+ __entry->asoc = (unsigned long)asoc;
+ __entry->mark = skb->mark;
+ __entry->bind_port = ep->base.bind_addr.port;
+ __entry->peer_port = asoc->peer.port;
+ __entry->pathmtu = asoc->pathmtu;
+ __entry->rwnd = asoc->peer.rwnd;
+ __entry->unack_data = asoc->unack_data;
+
+ if (trace_sctp_probe_path_enabled()) {
+ struct sctp_transport *sp;
+
+ list_for_each_entry(sp, &asoc->peer.transport_addr_list,
+ transports) {
+ trace_sctp_probe_path(sp, asoc);
+ }
+ }
+ ),
+
+ TP_printk("asoc=%#llx mark=%#x bind_port=%d peer_port=%d pathmtu=%d "
+ "rwnd=%u unack_data=%d",
+ __entry->asoc, __entry->mark, __entry->bind_port,
+ __entry->peer_port, __entry->pathmtu, __entry->rwnd,
+ __entry->unack_data)
+);
+
+#endif /* _TRACE_SCTP_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 541f34735346..eb7905ffe5f2 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -59,6 +59,9 @@
#include <net/sctp/sm.h>
#include <net/sctp/structs.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/sctp.h>
+
static struct sctp_packet *sctp_abort_pkt_new(
struct net *net,
const struct sctp_endpoint *ep,
@@ -3219,6 +3222,8 @@ enum sctp_disposition sctp_sf_eat_sack_6_2(struct net *net,
struct sctp_sackhdr *sackh;
__u32 ctsn;
+ trace_sctp_probe(ep, asoc, chunk);
+
if (!sctp_vtag_verify(chunk, asoc))
return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
^ permalink raw reply related
* [PATCH net-next v7 4/6] net: sctp: Remove debug SCTP probe module
From: Masami Hiramatsu @ 2017-12-29 2:47 UTC (permalink / raw)
To: Ingo Molnar, David S . Miller, Ian McDonald, Vlad Yasevich,
Stephen Hemminger, Steven Rostedt
Cc: Peter Zijlstra, Thomas Gleixner, LKML, H . Peter Anvin,
Gerrit Renker, Neil Horman, dccp, netdev, linux-sctp,
Stephen Rothwell, mhiramat
In-Reply-To: <151451552014.17912.11834170408829155608.stgit@devbox>
Remove SCTP probe module since jprobe has been deprecated.
That function is now replaced by sctp/sctp_probe and
sctp/sctp_probe_path trace-events.
You can use it via ftrace or perftools.
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
---
net/sctp/Kconfig | 12 ---
net/sctp/Makefile | 3 -
net/sctp/probe.c | 244 -----------------------------------------------------
3 files changed, 259 deletions(-)
delete mode 100644 net/sctp/probe.c
diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig
index d9c04dc1b3f3..c740b189d4ba 100644
--- a/net/sctp/Kconfig
+++ b/net/sctp/Kconfig
@@ -37,18 +37,6 @@ menuconfig IP_SCTP
if IP_SCTP
-config NET_SCTPPROBE
- tristate "SCTP: Association probing"
- depends on PROC_FS && KPROBES
- ---help---
- This module allows for capturing the changes to SCTP association
- state in response to incoming packets. It is used for debugging
- SCTP congestion control algorithms. If you don't understand
- what was just said, you don't need it: say N.
-
- To compile this code as a module, choose M here: the
- module will be called sctp_probe.
-
config SCTP_DBG_OBJCNT
bool "SCTP: Debug object counts"
depends on PROC_FS
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index 54bd9c1a8aa1..6776582ec449 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -4,7 +4,6 @@
#
obj-$(CONFIG_IP_SCTP) += sctp.o
-obj-$(CONFIG_NET_SCTPPROBE) += sctp_probe.o
obj-$(CONFIG_INET_SCTP_DIAG) += sctp_diag.o
sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
@@ -16,8 +15,6 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
offload.o stream_sched.o stream_sched_prio.o \
stream_sched_rr.o stream_interleave.o
-sctp_probe-y := probe.o
-
sctp-$(CONFIG_SCTP_DBG_OBJCNT) += objcnt.o
sctp-$(CONFIG_PROC_FS) += proc.o
sctp-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/net/sctp/probe.c b/net/sctp/probe.c
deleted file mode 100644
index 1280f85a598d..000000000000
--- a/net/sctp/probe.c
+++ /dev/null
@@ -1,244 +0,0 @@
-/*
- * sctp_probe - Observe the SCTP flow with kprobes.
- *
- * The idea for this came from Werner Almesberger's umlsim
- * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
- *
- * Modified for SCTP from Stephen Hemminger's code
- * Copyright (C) 2010, Wei Yongjun <yjwei@cn.fujitsu.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/kprobes.h>
-#include <linux/socket.h>
-#include <linux/sctp.h>
-#include <linux/proc_fs.h>
-#include <linux/vmalloc.h>
-#include <linux/module.h>
-#include <linux/kfifo.h>
-#include <linux/time.h>
-#include <net/net_namespace.h>
-
-#include <net/sctp/sctp.h>
-#include <net/sctp/sm.h>
-
-MODULE_SOFTDEP("pre: sctp");
-MODULE_AUTHOR("Wei Yongjun <yjwei@cn.fujitsu.com>");
-MODULE_DESCRIPTION("SCTP snooper");
-MODULE_LICENSE("GPL");
-
-static int port __read_mostly = 0;
-MODULE_PARM_DESC(port, "Port to match (0=all)");
-module_param(port, int, 0);
-
-static unsigned int fwmark __read_mostly = 0;
-MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)");
-module_param(fwmark, uint, 0);
-
-static int bufsize __read_mostly = 64 * 1024;
-MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)");
-module_param(bufsize, int, 0);
-
-static int full __read_mostly = 1;
-MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)");
-module_param(full, int, 0);
-
-static const char procname[] = "sctpprobe";
-
-static struct {
- struct kfifo fifo;
- spinlock_t lock;
- wait_queue_head_t wait;
- struct timespec64 tstart;
-} sctpw;
-
-static __printf(1, 2) void printl(const char *fmt, ...)
-{
- va_list args;
- int len;
- char tbuf[256];
-
- va_start(args, fmt);
- len = vscnprintf(tbuf, sizeof(tbuf), fmt, args);
- va_end(args);
-
- kfifo_in_locked(&sctpw.fifo, tbuf, len, &sctpw.lock);
- wake_up(&sctpw.wait);
-}
-
-static int sctpprobe_open(struct inode *inode, struct file *file)
-{
- kfifo_reset(&sctpw.fifo);
- ktime_get_ts64(&sctpw.tstart);
-
- return 0;
-}
-
-static ssize_t sctpprobe_read(struct file *file, char __user *buf,
- size_t len, loff_t *ppos)
-{
- int error = 0, cnt = 0;
- unsigned char *tbuf;
-
- if (!buf)
- return -EINVAL;
-
- if (len == 0)
- return 0;
-
- tbuf = vmalloc(len);
- if (!tbuf)
- return -ENOMEM;
-
- error = wait_event_interruptible(sctpw.wait,
- kfifo_len(&sctpw.fifo) != 0);
- if (error)
- goto out_free;
-
- cnt = kfifo_out_locked(&sctpw.fifo, tbuf, len, &sctpw.lock);
- error = copy_to_user(buf, tbuf, cnt) ? -EFAULT : 0;
-
-out_free:
- vfree(tbuf);
-
- return error ? error : cnt;
-}
-
-static const struct file_operations sctpprobe_fops = {
- .owner = THIS_MODULE,
- .open = sctpprobe_open,
- .read = sctpprobe_read,
- .llseek = noop_llseek,
-};
-
-static enum sctp_disposition jsctp_sf_eat_sack(
- struct net *net,
- const struct sctp_endpoint *ep,
- const struct sctp_association *asoc,
- const union sctp_subtype type,
- void *arg,
- struct sctp_cmd_seq *commands)
-{
- struct sctp_chunk *chunk = arg;
- struct sk_buff *skb = chunk->skb;
- struct sctp_transport *sp;
- static __u32 lcwnd = 0;
- struct timespec64 now;
-
- sp = asoc->peer.primary_path;
-
- if (((port == 0 && fwmark == 0) ||
- asoc->peer.port == port ||
- ep->base.bind_addr.port == port ||
- (fwmark > 0 && skb->mark == fwmark)) &&
- (full || sp->cwnd != lcwnd)) {
- lcwnd = sp->cwnd;
-
- ktime_get_ts64(&now);
- now = timespec64_sub(now, sctpw.tstart);
-
- printl("%lu.%06lu ", (unsigned long) now.tv_sec,
- (unsigned long) now.tv_nsec / NSEC_PER_USEC);
-
- printl("%p %5d %5d %5d %8d %5d ", asoc,
- ep->base.bind_addr.port, asoc->peer.port,
- asoc->pathmtu, asoc->peer.rwnd, asoc->unack_data);
-
- list_for_each_entry(sp, &asoc->peer.transport_addr_list,
- transports) {
- if (sp == asoc->peer.primary_path)
- printl("*");
-
- printl("%pISc %2u %8u %8u %8u %8u %8u ",
- &sp->ipaddr, sp->state, sp->cwnd, sp->ssthresh,
- sp->flight_size, sp->partial_bytes_acked,
- sp->pathmtu);
- }
- printl("\n");
- }
-
- jprobe_return();
- return 0;
-}
-
-static struct jprobe sctp_recv_probe = {
- .kp = {
- .symbol_name = "sctp_sf_eat_sack_6_2",
- },
- .entry = jsctp_sf_eat_sack,
-};
-
-static __init int sctp_setup_jprobe(void)
-{
- int ret = register_jprobe(&sctp_recv_probe);
-
- if (ret) {
- if (request_module("sctp"))
- goto out;
- ret = register_jprobe(&sctp_recv_probe);
- }
-
-out:
- return ret;
-}
-
-static __init int sctpprobe_init(void)
-{
- int ret = -ENOMEM;
-
- /* Warning: if the function signature of sctp_sf_eat_sack_6_2,
- * has been changed, you also have to change the signature of
- * jsctp_sf_eat_sack, otherwise you end up right here!
- */
- BUILD_BUG_ON(__same_type(sctp_sf_eat_sack_6_2,
- jsctp_sf_eat_sack) == 0);
-
- init_waitqueue_head(&sctpw.wait);
- spin_lock_init(&sctpw.lock);
- if (kfifo_alloc(&sctpw.fifo, bufsize, GFP_KERNEL))
- return ret;
-
- if (!proc_create(procname, S_IRUSR, init_net.proc_net,
- &sctpprobe_fops))
- goto free_kfifo;
-
- ret = sctp_setup_jprobe();
- if (ret)
- goto remove_proc;
-
- pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n",
- port, fwmark, bufsize);
- return 0;
-
-remove_proc:
- remove_proc_entry(procname, init_net.proc_net);
-free_kfifo:
- kfifo_free(&sctpw.fifo);
- return ret;
-}
-
-static __exit void sctpprobe_exit(void)
-{
- kfifo_free(&sctpw.fifo);
- remove_proc_entry(procname, init_net.proc_net);
- unregister_jprobe(&sctp_recv_probe);
-}
-
-module_init(sctpprobe_init);
-module_exit(sctpprobe_exit);
^ permalink raw reply related
* [PATCH net-next v7 5/6] net: dccp: Add DCCP sendmsg trace event
From: Masami Hiramatsu @ 2017-12-29 2:47 UTC (permalink / raw)
To: Ingo Molnar, David S . Miller, Ian McDonald, Vlad Yasevich,
Stephen Hemminger, Steven Rostedt
Cc: Peter Zijlstra, Thomas Gleixner, LKML, H . Peter Anvin,
Gerrit Renker, Neil Horman, dccp, netdev, linux-sctp,
Stephen Rothwell, mhiramat
In-Reply-To: <151451552014.17912.11834170408829155608.stgit@devbox>
Add DCCP sendmsg trace event (dccp/dccp_probe) for
replacing dccpprobe. User can trace this event via
ftrace or perftools.
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
---
Changes in v5:
- Fix to add local directory to include for trace.h.
Thanks Steven!
Changes in v7:
- Avoid preprocessor directives in tracepoint macro args
by sharing TP_STORE_ADDR_PORTS() macro with tcp.h.
---
include/trace/events/net_probe_common.h | 44 ++++++++++++++++
include/trace/events/tcp.h | 39 --------------
net/dccp/Makefile | 3 +
net/dccp/proto.c | 5 ++
net/dccp/trace.h | 84 +++++++++++++++++++++++++++++++
5 files changed, 137 insertions(+), 38 deletions(-)
create mode 100644 include/trace/events/net_probe_common.h
create mode 100644 net/dccp/trace.h
diff --git a/include/trace/events/net_probe_common.h b/include/trace/events/net_probe_common.h
new file mode 100644
index 000000000000..3930119cab08
--- /dev/null
+++ b/include/trace/events/net_probe_common.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#if !defined(_TRACE_NET_PROBE_COMMON_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NET_PROBE_COMMON_H
+
+#define TP_STORE_ADDR_PORTS_V4(__entry, inet, sk) \
+ do { \
+ struct sockaddr_in *v4 = (void *)__entry->saddr; \
+ \
+ v4->sin_family = AF_INET; \
+ v4->sin_port = inet->inet_sport; \
+ v4->sin_addr.s_addr = inet->inet_saddr; \
+ v4 = (void *)__entry->daddr; \
+ v4->sin_family = AF_INET; \
+ v4->sin_port = inet->inet_dport; \
+ v4->sin_addr.s_addr = inet->inet_daddr; \
+ } while (0)
+
+#if IS_ENABLED(CONFIG_IPV6)
+
+#define TP_STORE_ADDR_PORTS(__entry, inet, sk) \
+ do { \
+ if (sk->sk_family == AF_INET6) { \
+ struct sockaddr_in6 *v6 = (void *)__entry->saddr; \
+ \
+ v6->sin6_family = AF_INET6; \
+ v6->sin6_port = inet->inet_sport; \
+ v6->sin6_addr = inet6_sk(sk)->saddr; \
+ v6 = (void *)__entry->daddr; \
+ v6->sin6_family = AF_INET6; \
+ v6->sin6_port = inet->inet_dport; \
+ v6->sin6_addr = sk->sk_v6_daddr; \
+ } else \
+ TP_STORE_ADDR_PORTS_V4(__entry, inet, sk); \
+ } while (0)
+
+#else
+
+#define TP_STORE_ADDR_PORTS(__entry, inet, sk) \
+ TP_STORE_ADDR_PORTS_V4(__entry, inet, sk);
+
+#endif
+
+#endif
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 4dea6342f7d4..1501ca91814f 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -279,44 +279,7 @@ TRACE_EVENT(tcp_retransmit_synack,
__entry->saddr_v6, __entry->daddr_v6)
);
-
-#define TP_STORE_ADDR_PORTS_V4(__entry, inet, sk) \
- do { \
- struct sockaddr_in *v4 = (void *)__entry->saddr; \
- \
- v4->sin_family = AF_INET; \
- v4->sin_port = inet->inet_sport; \
- v4->sin_addr.s_addr = inet->inet_saddr; \
- v4 = (void *)__entry->daddr; \
- v4->sin_family = AF_INET; \
- v4->sin_port = inet->inet_dport; \
- v4->sin_addr.s_addr = inet->inet_daddr; \
- } while (0)
-
-#if IS_ENABLED(CONFIG_IPV6)
-
-#define TP_STORE_ADDR_PORTS(__entry, inet, sk) \
- do { \
- if (sk->sk_family == AF_INET6) { \
- struct sockaddr_in6 *v6 = (void *)__entry->saddr; \
- \
- v6->sin6_family = AF_INET6; \
- v6->sin6_port = inet->inet_sport; \
- v6->sin6_addr = inet6_sk(sk)->saddr; \
- v6 = (void *)__entry->daddr; \
- v6->sin6_family = AF_INET6; \
- v6->sin6_port = inet->inet_dport; \
- v6->sin6_addr = sk->sk_v6_daddr; \
- } else \
- TP_STORE_ADDR_PORTS_V4(__entry, inet, sk); \
- } while (0)
-
-#else
-
-#define TP_STORE_ADDR_PORTS(__entry, inet, sk) \
- TP_STORE_ADDR_PORTS_V4(__entry, inet, sk);
-
-#endif
+#include <trace/events/net_probe_common.h>
TRACE_EVENT(tcp_probe,
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index 2e7b56097bc4..4215f13a63af 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -27,3 +27,6 @@ dccp-$(CONFIG_SYSCTL) += sysctl.o
dccp_diag-y := diag.o
dccp_probe-y := probe.o
+
+# build with local directory for trace.h
+CFLAGS_proto.o := -I$(src)
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 7a75a1d3568b..fa7e92e08920 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -38,6 +38,9 @@
#include "dccp.h"
#include "feat.h"
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
EXPORT_SYMBOL_GPL(dccp_statistics);
@@ -761,6 +764,8 @@ int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int rc, size;
long timeo;
+ trace_dccp_probe(sk, len);
+
if (len > dp->dccps_mss_cache)
return -EMSGSIZE;
diff --git a/net/dccp/trace.h b/net/dccp/trace.h
new file mode 100644
index 000000000000..5062421beee9
--- /dev/null
+++ b/net/dccp/trace.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM dccp
+
+#if !defined(_TRACE_DCCP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_DCCP_H
+
+#include <net/sock.h>
+#include "dccp.h"
+#include "ccids/ccid3.h"
+#include <linux/tracepoint.h>
+#include <trace/events/net_probe_common.h>
+
+TRACE_EVENT(dccp_probe,
+
+ TP_PROTO(struct sock *sk, size_t size),
+
+ TP_ARGS(sk, size),
+
+ TP_STRUCT__entry(
+ /* sockaddr_in6 is always bigger than sockaddr_in */
+ __array(__u8, saddr, sizeof(struct sockaddr_in6))
+ __array(__u8, daddr, sizeof(struct sockaddr_in6))
+ __field(__u16, sport)
+ __field(__u16, dport)
+ __field(__u16, size)
+ __field(__u16, tx_s)
+ __field(__u32, tx_rtt)
+ __field(__u32, tx_p)
+ __field(__u32, tx_x_calc)
+ __field(__u64, tx_x_recv)
+ __field(__u64, tx_x)
+ __field(__u32, tx_t_ipi)
+ ),
+
+ TP_fast_assign(
+ const struct inet_sock *inet = inet_sk(sk);
+ struct ccid3_hc_tx_sock *hc = NULL;
+
+ if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3)
+ hc = ccid3_hc_tx_sk(sk);
+
+ memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
+ memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
+
+ TP_STORE_ADDR_PORTS(__entry, inet, sk);
+
+ /* For filtering use */
+ __entry->sport = ntohs(inet->inet_sport);
+ __entry->dport = ntohs(inet->inet_dport);
+
+ __entry->size = size;
+ if (hc) {
+ __entry->tx_s = hc->tx_s;
+ __entry->tx_rtt = hc->tx_rtt;
+ __entry->tx_p = hc->tx_p;
+ __entry->tx_x_calc = hc->tx_x_calc;
+ __entry->tx_x_recv = hc->tx_x_recv >> 6;
+ __entry->tx_x = hc->tx_x >> 6;
+ __entry->tx_t_ipi = hc->tx_t_ipi;
+ } else {
+ __entry->tx_s = 0;
+ memset(&__entry->tx_rtt, 0, (void *)&__entry->tx_t_ipi -
+ (void *)&__entry->tx_rtt +
+ sizeof(__entry->tx_t_ipi));
+ }
+ ),
+
+ TP_printk("src=%pISpc dest=%pISpc size=%d tx_s=%d tx_rtt=%d "
+ "tx_p=%d tx_x_calc=%u tx_x_recv=%llu tx_x=%llu tx_t_ipi=%d",
+ __entry->saddr, __entry->daddr, __entry->size,
+ __entry->tx_s, __entry->tx_rtt, __entry->tx_p,
+ __entry->tx_x_calc, __entry->tx_x_recv, __entry->tx_x,
+ __entry->tx_t_ipi)
+);
+
+#endif /* _TRACE_TCP_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox