Netdev List

Netdev List
 help / color / mirror / Atom feed

* [net-next V2 16/18] net/mlx5: Add API to set the namespace steering mode
From: Saeed Mahameed @ 2019-09-03 20:05 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev@vger.kernel.org, Alex Vesker, Erez Shitrit, Maor Gottlieb,
	Mark Bloch, Saeed Mahameed
In-Reply-To: <20190903200409.14406-1-saeedm@mellanox.com>

From: Maor Gottlieb <maorg@mellanox.com>

Add API to set the flow steering root namesapce mode.
Setting new mode should be called before any steering operation
is executed on the namespace.
This API is going to be used by steering users such switchdev.

Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/fs_core.c | 49 +++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/fs_core.h | 12 ++++-
 2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index c2d6e9f4cb90..3bbb49354829 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -2995,5 +2995,54 @@ EXPORT_SYMBOL(mlx5_packet_reformat_dealloc);
 int mlx5_flow_namespace_set_peer(struct mlx5_flow_root_namespace *ns,
 				 struct mlx5_flow_root_namespace *peer_ns)
 {
+	if (peer_ns && ns->mode != peer_ns->mode) {
+		mlx5_core_err(ns->dev,
+			      "Can't peer namespace of different steering mode\n");
+		return -EINVAL;
+	}
+
 	return ns->cmds->set_peer(ns, peer_ns);
 }
+
+/* This function should be called only at init stage of the namespace.
+ * It is not safe to call this function while steering operations
+ * are executed in the namespace.
+ */
+int mlx5_flow_namespace_set_mode(struct mlx5_flow_namespace *ns,
+				 enum mlx5_flow_steering_mode mode)
+{
+	struct mlx5_flow_root_namespace *root;
+	const struct mlx5_flow_cmds *cmds;
+	int err;
+
+	root = find_root(&ns->node);
+	if (&root->ns != ns)
+	/* Can't set cmds to non root namespace */
+		return -EINVAL;
+
+	if (root->table_type != FS_FT_FDB)
+		return -EOPNOTSUPP;
+
+	if (root->mode == mode)
+		return 0;
+
+	if (mode == MLX5_FLOW_STEERING_MODE_SMFS)
+		cmds = mlx5_fs_cmd_get_dr_cmds();
+	else
+		cmds = mlx5_fs_cmd_get_fw_cmds();
+	if (!cmds)
+		return -EOPNOTSUPP;
+
+	err = cmds->create_ns(root);
+	if (err) {
+		mlx5_core_err(root->dev, "Failed to create flow namespace (%d)\n",
+			      err);
+		return err;
+	}
+
+	root->cmds->destroy_ns(root);
+	root->cmds = cmds;
+	root->mode = mode;
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index a133ec5487ae..00717eba2256 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -98,9 +98,15 @@ enum fs_fte_status {
 	FS_FTE_STATUS_EXISTING = 1UL << 0,
 };
 
+enum mlx5_flow_steering_mode {
+	MLX5_FLOW_STEERING_MODE_DMFS,
+	MLX5_FLOW_STEERING_MODE_SMFS
+};
+
 struct mlx5_flow_steering {
 	struct mlx5_core_dev *dev;
-	struct kmem_cache               *fgs_cache;
+	enum   mlx5_flow_steering_mode	mode;
+	struct kmem_cache		*fgs_cache;
 	struct kmem_cache               *ftes_cache;
 	struct mlx5_flow_root_namespace *root_ns;
 	struct mlx5_flow_root_namespace *fdb_root_ns;
@@ -235,6 +241,7 @@ struct mlx5_flow_group {
 
 struct mlx5_flow_root_namespace {
 	struct mlx5_flow_namespace	ns;
+	enum   mlx5_flow_steering_mode	mode;
 	struct mlx5_fs_dr_domain	fs_dr_domain;
 	enum   fs_flow_table_type	table_type;
 	struct mlx5_core_dev		*dev;
@@ -258,6 +265,9 @@ const struct mlx5_flow_cmds *mlx5_fs_cmd_get_fw_cmds(void);
 int mlx5_flow_namespace_set_peer(struct mlx5_flow_root_namespace *ns,
 				 struct mlx5_flow_root_namespace *peer_ns);
 
+int mlx5_flow_namespace_set_mode(struct mlx5_flow_namespace *ns,
+				 enum mlx5_flow_steering_mode mode);
+
 int mlx5_init_fs(struct mlx5_core_dev *dev);
 void mlx5_cleanup_fs(struct mlx5_core_dev *dev);
 
-- 
2.21.0


^ permalink raw reply related

* [net-next V2 18/18] net/mlx5: Add devlink flow_steering_mode parameter
From: Saeed Mahameed @ 2019-09-03 20:05 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev@vger.kernel.org, Alex Vesker, Erez Shitrit, Maor Gottlieb,
	Saeed Mahameed
In-Reply-To: <20190903200409.14406-1-saeedm@mellanox.com>

From: Maor Gottlieb <maorg@mellanox.com>

Add new parameter (flow_steering_mode) to control the flow steering
mode of the driver.
Two modes are supported:
1. DMFS - Device managed flow steering
2. SMFS - Software/Driver managed flow steering.

In the DMFS mode, the HW steering entities are created through the
FW. In the SMFS mode this entities are created though the driver
directly.

The driver will use the devlink steering mode only if the steering
domain supports it, for now SMFS will manages only the switchdev eswitch
steering domain.

User command examples:
- Set SMFS flow steering mode::

    $ devlink dev param set pci/0000:06:00.0 name flow_steering_mode value "smfs" cmode runtime

- Read device flow steering mode::

    $ devlink dev param show pci/0000:06:00.0 name flow_steering_mode
      pci/0000:06:00.0:
      name flow_steering_mode type driver-specific
      values:
         cmode runtime value smfs

Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../device_drivers/mellanox/mlx5.rst          |  33 ++++++
 .../net/ethernet/mellanox/mlx5/core/devlink.c | 112 +++++++++++++++++-
 2 files changed, 144 insertions(+), 1 deletion(-)

diff --git a/Documentation/networking/device_drivers/mellanox/mlx5.rst b/Documentation/networking/device_drivers/mellanox/mlx5.rst
index b30a63dbf4b7..d071c6b49e1f 100644
--- a/Documentation/networking/device_drivers/mellanox/mlx5.rst
+++ b/Documentation/networking/device_drivers/mellanox/mlx5.rst
@@ -11,6 +11,7 @@ Contents
 
 - `Enabling the driver and kconfig options`_
 - `Devlink info`_
+- `Devlink parameters`_
 - `Devlink health reporters`_
 - `mlx5 tracepoints`_
 
@@ -122,6 +123,38 @@ User command example::
          stored:
             fw.version 16.26.0100
 
+Devlink parameters
+==================
+
+flow_steering_mode: Device flow steering mode
+---------------------------------------------
+The flow steering mode parameter controls the flow steering mode of the driver.
+Two modes are supported:
+1. 'dmfs' - Device managed flow steering.
+2. 'smfs  - Software/Driver managed flow steering.
+
+In DMFS mode, the HW steering entities are created and managed through the
+Firmware.
+In SMFS mode, the HW steering entities are created and managed though by
+the driver directly into Hardware without firmware intervention.
+
+SMFS mode is faster and provides better rule inserstion rate compared to default DMFS mode.
+
+User command examples:
+
+- Set SMFS flow steering mode::
+
+    $ devlink dev param set pci/0000:06:00.0 name flow_steering_mode value "smfs" cmode runtime
+
+- Read device flow steering mode::
+
+    $ devlink dev param show pci/0000:06:00.0 name flow_steering_mode
+      pci/0000:06:00.0:
+      name flow_steering_mode type driver-specific
+      values:
+         cmode runtime value smfs
+
+
 Devlink health reporters
 ========================
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index a400f4430c28..7bf7b6fbc776 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -4,6 +4,7 @@
 #include <devlink.h>
 
 #include "mlx5_core.h"
+#include "fs_core.h"
 #include "eswitch.h"
 
 static int mlx5_devlink_flash_update(struct devlink *devlink,
@@ -107,12 +108,121 @@ void mlx5_devlink_free(struct devlink *devlink)
 	devlink_free(devlink);
 }
 
+static int mlx5_devlink_fs_mode_validate(struct devlink *devlink, u32 id,
+					 union devlink_param_value val,
+					 struct netlink_ext_ack *extack)
+{
+	struct mlx5_core_dev *dev = devlink_priv(devlink);
+	char *value = val.vstr;
+	int err = 0;
+
+	if (!strcmp(value, "dmfs")) {
+		return 0;
+	} else if (!strcmp(value, "smfs")) {
+		u8 eswitch_mode;
+		bool smfs_cap;
+
+		eswitch_mode = mlx5_eswitch_mode(dev->priv.eswitch);
+		smfs_cap = mlx5_fs_dr_is_supported(dev);
+
+		if (!smfs_cap) {
+			err = -EOPNOTSUPP;
+			NL_SET_ERR_MSG_MOD(extack,
+					   "Software managed steering is not supported by current device");
+		}
+
+		else if (eswitch_mode == MLX5_ESWITCH_OFFLOADS) {
+			NL_SET_ERR_MSG_MOD(extack,
+					   "Software managed steering is not supported when eswitch offlaods enabled.");
+			err = -EOPNOTSUPP;
+		}
+	} else {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Bad parameter: supported values are [\"dmfs\", \"smfs\"]");
+		err = -EINVAL;
+	}
+
+	return err;
+}
+
+static int mlx5_devlink_fs_mode_set(struct devlink *devlink, u32 id,
+				    struct devlink_param_gset_ctx *ctx)
+{
+	struct mlx5_core_dev *dev = devlink_priv(devlink);
+	enum mlx5_flow_steering_mode mode;
+
+	if (!strcmp(ctx->val.vstr, "smfs"))
+		mode = MLX5_FLOW_STEERING_MODE_SMFS;
+	else
+		mode = MLX5_FLOW_STEERING_MODE_DMFS;
+	dev->priv.steering->mode = mode;
+
+	return 0;
+}
+
+static int mlx5_devlink_fs_mode_get(struct devlink *devlink, u32 id,
+				    struct devlink_param_gset_ctx *ctx)
+{
+	struct mlx5_core_dev *dev = devlink_priv(devlink);
+
+	if (dev->priv.steering->mode == MLX5_FLOW_STEERING_MODE_SMFS)
+		strcpy(ctx->val.vstr, "smfs");
+	else
+		strcpy(ctx->val.vstr, "dmfs");
+	return 0;
+}
+
+enum mlx5_devlink_param_id {
+	MLX5_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX,
+	MLX5_DEVLINK_PARAM_FLOW_STEERING_MODE,
+};
+
+static const struct devlink_param mlx5_devlink_params[] = {
+	DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_FLOW_STEERING_MODE,
+			     "flow_steering_mode", DEVLINK_PARAM_TYPE_STRING,
+			     BIT(DEVLINK_PARAM_CMODE_RUNTIME),
+			     mlx5_devlink_fs_mode_get, mlx5_devlink_fs_mode_set,
+			     mlx5_devlink_fs_mode_validate),
+};
+
+static void mlx5_devlink_set_params_init_values(struct devlink *devlink)
+{
+	struct mlx5_core_dev *dev = devlink_priv(devlink);
+	union devlink_param_value value;
+
+	if (dev->priv.steering->mode == MLX5_FLOW_STEERING_MODE_DMFS)
+		strcpy(value.vstr, "dmfs");
+	else
+		strcpy(value.vstr, "smfs");
+	devlink_param_driverinit_value_set(devlink,
+					   MLX5_DEVLINK_PARAM_FLOW_STEERING_MODE,
+					   value);
+}
+
 int mlx5_devlink_register(struct devlink *devlink, struct device *dev)
 {
-	return devlink_register(devlink, dev);
+	int err;
+
+	err = devlink_register(devlink, dev);
+	if (err)
+		return err;
+
+	err = devlink_params_register(devlink, mlx5_devlink_params,
+				      ARRAY_SIZE(mlx5_devlink_params));
+	if (err)
+		goto params_reg_err;
+	mlx5_devlink_set_params_init_values(devlink);
+	devlink_params_publish(devlink);
+	return 0;
+
+params_reg_err:
+	devlink_unregister(devlink);
+	return err;
 }
 
 void mlx5_devlink_unregister(struct devlink *devlink)
 {
+	devlink_params_unregister(devlink, mlx5_devlink_params,
+				  ARRAY_SIZE(mlx5_devlink_params));
 	devlink_unregister(devlink);
 }
-- 
2.21.0


^ permalink raw reply related

* [net-next V2 17/18] net/mlx5: Add support to use SMFS in switchdev mode
From: Saeed Mahameed @ 2019-09-03 20:05 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev@vger.kernel.org, Alex Vesker, Erez Shitrit, Maor Gottlieb,
	Mark Bloch, Saeed Mahameed
In-Reply-To: <20190903200409.14406-1-saeedm@mellanox.com>

From: Maor Gottlieb <maorg@mellanox.com>

In case that flow steering mode of the driver is SMFS (Software Managed
Flow Steering), then use the DR (SW steering) API to create the steering
objects.

In addition, add a call to the set peer namespace when switchdev gets
devcom pair event. It is required to support VF LAG in SMFS.

Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/eswitch.h |  1 +
 .../mellanox/mlx5/core/eswitch_offloads.c     | 61 ++++++++++++++++---
 2 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 4f70202db6af..6bd6f5895244 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -153,6 +153,7 @@ struct mlx5_eswitch_fdb {
 		} legacy;
 
 		struct offloads_fdb {
+			struct mlx5_flow_namespace *ns;
 			struct mlx5_flow_table *slow_fdb;
 			struct mlx5_flow_group *send_to_vport_grp;
 			struct mlx5_flow_group *peer_miss_grp;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index bee67ff58137..afa623b15a38 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -1068,6 +1068,13 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports)
 		err = -EOPNOTSUPP;
 		goto ns_err;
 	}
+	esw->fdb_table.offloads.ns = root_ns;
+	err = mlx5_flow_namespace_set_mode(root_ns,
+					   esw->dev->priv.steering->mode);
+	if (err) {
+		esw_warn(dev, "Failed to set FDB namespace steering mode\n");
+		goto ns_err;
+	}
 
 	max_flow_counter = (MLX5_CAP_GEN(dev, max_flow_counter_31_16) << 16) |
 			    MLX5_CAP_GEN(dev, max_flow_counter_15_0);
@@ -1207,6 +1214,8 @@ static int esw_create_offloads_fdb_tables(struct mlx5_eswitch *esw, int nvports)
 	esw_destroy_offloads_fast_fdb_tables(esw);
 	mlx5_destroy_flow_table(esw->fdb_table.offloads.slow_fdb);
 slow_fdb_err:
+	/* Holds true only as long as DMFS is the default */
+	mlx5_flow_namespace_set_mode(root_ns, MLX5_FLOW_STEERING_MODE_DMFS);
 ns_err:
 	kvfree(flow_group_in);
 	return err;
@@ -1226,6 +1235,9 @@ static void esw_destroy_offloads_fdb_tables(struct mlx5_eswitch *esw)
 
 	mlx5_destroy_flow_table(esw->fdb_table.offloads.slow_fdb);
 	esw_destroy_offloads_fast_fdb_tables(esw);
+	/* Holds true only as long as DMFS is the default */
+	mlx5_flow_namespace_set_mode(esw->fdb_table.offloads.ns,
+				     MLX5_FLOW_STEERING_MODE_DMFS);
 }
 
 static int esw_create_offloads_table(struct mlx5_eswitch *esw, int nvports)
@@ -1623,13 +1635,42 @@ static void mlx5_esw_offloads_unpair(struct mlx5_eswitch *esw)
 	esw_del_fdb_peer_miss_rules(esw);
 }
 
+static int mlx5_esw_offloads_set_ns_peer(struct mlx5_eswitch *esw,
+					 struct mlx5_eswitch *peer_esw,
+					 bool pair)
+{
+	struct mlx5_flow_root_namespace *peer_ns;
+	struct mlx5_flow_root_namespace *ns;
+	int err;
+
+	peer_ns = peer_esw->dev->priv.steering->fdb_root_ns;
+	ns = esw->dev->priv.steering->fdb_root_ns;
+
+	if (pair) {
+		err = mlx5_flow_namespace_set_peer(ns, peer_ns);
+		if (err)
+			return err;
+
+		mlx5_flow_namespace_set_peer(peer_ns, ns);
+		if (err) {
+			mlx5_flow_namespace_set_peer(ns, NULL);
+			return err;
+		}
+	} else {
+		mlx5_flow_namespace_set_peer(ns, NULL);
+		mlx5_flow_namespace_set_peer(peer_ns, NULL);
+	}
+
+	return 0;
+}
+
 static int mlx5_esw_offloads_devcom_event(int event,
 					  void *my_data,
 					  void *event_data)
 {
 	struct mlx5_eswitch *esw = my_data;
-	struct mlx5_eswitch *peer_esw = event_data;
 	struct mlx5_devcom *devcom = esw->dev->priv.devcom;
+	struct mlx5_eswitch *peer_esw = event_data;
 	int err;
 
 	switch (event) {
@@ -1638,9 +1679,12 @@ static int mlx5_esw_offloads_devcom_event(int event,
 		    mlx5_eswitch_vport_match_metadata_enabled(peer_esw))
 			break;
 
-		err = mlx5_esw_offloads_pair(esw, peer_esw);
+		err = mlx5_esw_offloads_set_ns_peer(esw, peer_esw, true);
 		if (err)
 			goto err_out;
+		err = mlx5_esw_offloads_pair(esw, peer_esw);
+		if (err)
+			goto err_peer;
 
 		err = mlx5_esw_offloads_pair(peer_esw, esw);
 		if (err)
@@ -1656,6 +1700,7 @@ static int mlx5_esw_offloads_devcom_event(int event,
 		mlx5_devcom_set_paired(devcom, MLX5_DEVCOM_ESW_OFFLOADS, false);
 		mlx5_esw_offloads_unpair(peer_esw);
 		mlx5_esw_offloads_unpair(esw);
+		mlx5_esw_offloads_set_ns_peer(esw, peer_esw, false);
 		break;
 	}
 
@@ -1663,7 +1708,8 @@ static int mlx5_esw_offloads_devcom_event(int event,
 
 err_pair:
 	mlx5_esw_offloads_unpair(esw);
-
+err_peer:
+	mlx5_esw_offloads_set_ns_peer(esw, peer_esw, false);
 err_out:
 	mlx5_core_err(esw->dev, "esw offloads devcom event failure, event %u err %d",
 		      event, err);
@@ -2115,9 +2161,10 @@ int esw_offloads_enable(struct mlx5_eswitch *esw)
 	else
 		esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_NONE;
 
+	mlx5_rdma_enable_roce(esw->dev);
 	err = esw_offloads_steering_init(esw);
 	if (err)
-		return err;
+		goto err_steering_init;
 
 	err = esw_set_passing_vport_metadata(esw, true);
 	if (err)
@@ -2132,8 +2179,6 @@ int esw_offloads_enable(struct mlx5_eswitch *esw)
 	esw_offloads_devcom_init(esw);
 	mutex_init(&esw->offloads.termtbl_mutex);
 
-	mlx5_rdma_enable_roce(esw->dev);
-
 	return 0;
 
 err_reps:
@@ -2141,6 +2186,8 @@ int esw_offloads_enable(struct mlx5_eswitch *esw)
 	esw_set_passing_vport_metadata(esw, false);
 err_vport_metadata:
 	esw_offloads_steering_cleanup(esw);
+err_steering_init:
+	mlx5_rdma_disable_roce(esw->dev);
 	return err;
 }
 
@@ -2165,12 +2212,12 @@ static int esw_offloads_stop(struct mlx5_eswitch *esw,
 
 void esw_offloads_disable(struct mlx5_eswitch *esw)
 {
-	mlx5_rdma_disable_roce(esw->dev);
 	esw_offloads_devcom_cleanup(esw);
 	esw_offloads_unload_all_reps(esw);
 	mlx5_eswitch_disable_pf_vf_vports(esw);
 	esw_set_passing_vport_metadata(esw, false);
 	esw_offloads_steering_cleanup(esw);
+	mlx5_rdma_disable_roce(esw->dev);
 	esw->offloads.encap = DEVLINK_ESWITCH_ENCAP_MODE_NONE;
 }
 
-- 
2.21.0


^ permalink raw reply related

* [net-next V2 11/18] net/mlx5: DR, Expose steering rule functionality
From: Saeed Mahameed @ 2019-09-03 20:04 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev@vger.kernel.org, Alex Vesker, Erez Shitrit, Mark Bloch,
	Saeed Mahameed
In-Reply-To: <20190903200409.14406-1-saeedm@mellanox.com>

From: Alex Vesker <valex@mellanox.com>

Rules are the actual objects that tie matchers, header values and
actions. Each rule belongs to a matcher, which can hold multiple rules
sharing the same mask. Each rule is a specific set of values and
actions.
When a packet reaches a matcher it is being matched against the
matcher`s rules. In case of a match over a rule its actions will be
executed. Each rule object contains a set of STEs, where each STE is a
definition of match values and actions defined by the rule.
This file handles the rule operations and processing.

Signed-off-by: Alex Vesker <valex@mellanox.com>
Signed-off-by: Erez Shitrit <erezsh@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../mellanox/mlx5/core/steering/dr_rule.c     | 1243 +++++++++++++++++
 1 file changed, 1243 insertions(+)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c
new file mode 100644
index 000000000000..3bc3f66b8fa8
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c
@@ -0,0 +1,1243 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#include "dr_types.h"
+
+#define DR_RULE_MAX_STE_CHAIN (DR_RULE_MAX_STES + DR_ACTION_MAX_STES)
+
+struct mlx5dr_rule_action_member {
+	struct mlx5dr_action *action;
+	struct list_head list;
+};
+
+static int dr_rule_append_to_miss_list(struct mlx5dr_ste *new_last_ste,
+				       struct list_head *miss_list,
+				       struct list_head *send_list)
+{
+	struct mlx5dr_ste_send_info *ste_info_last;
+	struct mlx5dr_ste *last_ste;
+
+	/* The new entry will be inserted after the last */
+	last_ste = list_entry(miss_list->prev, struct mlx5dr_ste, miss_list_node);
+	WARN_ON(!last_ste);
+
+	ste_info_last = kzalloc(sizeof(*ste_info_last), GFP_KERNEL);
+	if (!ste_info_last)
+		return -ENOMEM;
+
+	mlx5dr_ste_set_miss_addr(last_ste->hw_ste,
+				 mlx5dr_ste_get_icm_addr(new_last_ste));
+	list_add_tail(&new_last_ste->miss_list_node, miss_list);
+
+	mlx5dr_send_fill_and_append_ste_send_info(last_ste, DR_STE_SIZE_REDUCED,
+						  0, last_ste->hw_ste,
+						  ste_info_last, send_list, true);
+
+	return 0;
+}
+
+static struct mlx5dr_ste *
+dr_rule_create_collision_htbl(struct mlx5dr_matcher *matcher,
+			      struct mlx5dr_matcher_rx_tx *nic_matcher,
+			      u8 *hw_ste)
+{
+	struct mlx5dr_domain *dmn = matcher->tbl->dmn;
+	struct mlx5dr_ste_htbl *new_htbl;
+	struct mlx5dr_ste *ste;
+
+	/* Create new table for miss entry */
+	new_htbl = mlx5dr_ste_htbl_alloc(dmn->ste_icm_pool,
+					 DR_CHUNK_SIZE_1,
+					 MLX5DR_STE_LU_TYPE_DONT_CARE,
+					 0);
+	if (!new_htbl) {
+		mlx5dr_dbg(dmn, "Failed allocating collision table\n");
+		return NULL;
+	}
+
+	/* One and only entry, never grows */
+	ste = new_htbl->ste_arr;
+	mlx5dr_ste_set_miss_addr(hw_ste, nic_matcher->e_anchor->chunk->icm_addr);
+	mlx5dr_htbl_get(new_htbl);
+
+	return ste;
+}
+
+static struct mlx5dr_ste *
+dr_rule_create_collision_entry(struct mlx5dr_matcher *matcher,
+			       struct mlx5dr_matcher_rx_tx *nic_matcher,
+			       u8 *hw_ste,
+			       struct mlx5dr_ste *orig_ste)
+{
+	struct mlx5dr_ste *ste;
+
+	ste = dr_rule_create_collision_htbl(matcher, nic_matcher, hw_ste);
+	if (!ste) {
+		mlx5dr_dbg(matcher->tbl->dmn, "Failed creating collision entry\n");
+		return NULL;
+	}
+
+	ste->ste_chain_location = orig_ste->ste_chain_location;
+
+	/* In collision entry, all members share the same miss_list_head */
+	ste->htbl->miss_list = mlx5dr_ste_get_miss_list(orig_ste);
+
+	/* Next table */
+	if (mlx5dr_ste_create_next_htbl(matcher, nic_matcher, ste, hw_ste,
+					DR_CHUNK_SIZE_1)) {
+		mlx5dr_dbg(matcher->tbl->dmn, "Failed allocating table\n");
+		goto free_tbl;
+	}
+
+	return ste;
+
+free_tbl:
+	mlx5dr_ste_free(ste, matcher, nic_matcher);
+	return NULL;
+}
+
+static int
+dr_rule_handle_one_ste_in_update_list(struct mlx5dr_ste_send_info *ste_info,
+				      struct mlx5dr_domain *dmn)
+{
+	int ret;
+
+	list_del(&ste_info->send_list);
+	ret = mlx5dr_send_postsend_ste(dmn, ste_info->ste, ste_info->data,
+				       ste_info->size, ste_info->offset);
+	if (ret)
+		goto out;
+	/* Copy data to ste, only reduced size, the last 16B (mask)
+	 * is already written to the hw.
+	 */
+	memcpy(ste_info->ste->hw_ste, ste_info->data, DR_STE_SIZE_REDUCED);
+
+out:
+	kfree(ste_info);
+	return ret;
+}
+
+static int dr_rule_send_update_list(struct list_head *send_ste_list,
+				    struct mlx5dr_domain *dmn,
+				    bool is_reverse)
+{
+	struct mlx5dr_ste_send_info *ste_info, *tmp_ste_info;
+	int ret;
+
+	if (is_reverse) {
+		list_for_each_entry_safe_reverse(ste_info, tmp_ste_info,
+						 send_ste_list, send_list) {
+			ret = dr_rule_handle_one_ste_in_update_list(ste_info,
+								    dmn);
+			if (ret)
+				return ret;
+		}
+	} else {
+		list_for_each_entry_safe(ste_info, tmp_ste_info,
+					 send_ste_list, send_list) {
+			ret = dr_rule_handle_one_ste_in_update_list(ste_info,
+								    dmn);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+static struct mlx5dr_ste *
+dr_rule_find_ste_in_miss_list(struct list_head *miss_list, u8 *hw_ste)
+{
+	struct mlx5dr_ste *ste;
+
+	if (list_empty(miss_list))
+		return NULL;
+
+	/* Check if hw_ste is present in the list */
+	list_for_each_entry(ste, miss_list, miss_list_node) {
+		if (mlx5dr_ste_equal_tag(ste->hw_ste, hw_ste))
+			return ste;
+	}
+
+	return NULL;
+}
+
+static struct mlx5dr_ste *
+dr_rule_rehash_handle_collision(struct mlx5dr_matcher *matcher,
+				struct mlx5dr_matcher_rx_tx *nic_matcher,
+				struct list_head *update_list,
+				struct mlx5dr_ste *col_ste,
+				u8 *hw_ste)
+{
+	struct mlx5dr_ste *new_ste;
+	int ret;
+
+	new_ste = dr_rule_create_collision_htbl(matcher, nic_matcher, hw_ste);
+	if (!new_ste)
+		return NULL;
+
+	/* In collision entry, all members share the same miss_list_head */
+	new_ste->htbl->miss_list = mlx5dr_ste_get_miss_list(col_ste);
+
+	/* Update the previous from the list */
+	ret = dr_rule_append_to_miss_list(new_ste,
+					  mlx5dr_ste_get_miss_list(col_ste),
+					  update_list);
+	if (ret) {
+		mlx5dr_dbg(matcher->tbl->dmn, "Failed update dup entry\n");
+		goto err_exit;
+	}
+
+	return new_ste;
+
+err_exit:
+	mlx5dr_ste_free(new_ste, matcher, nic_matcher);
+	return NULL;
+}
+
+static void dr_rule_rehash_copy_ste_ctrl(struct mlx5dr_matcher *matcher,
+					 struct mlx5dr_matcher_rx_tx *nic_matcher,
+					 struct mlx5dr_ste *cur_ste,
+					 struct mlx5dr_ste *new_ste)
+{
+	new_ste->next_htbl = cur_ste->next_htbl;
+	new_ste->ste_chain_location = cur_ste->ste_chain_location;
+
+	if (!mlx5dr_ste_is_last_in_rule(nic_matcher, new_ste->ste_chain_location))
+		new_ste->next_htbl->pointing_ste = new_ste;
+
+	/* We need to copy the refcount since this ste
+	 * may have been traversed several times
+	 */
+	refcount_set(&new_ste->refcount, refcount_read(&cur_ste->refcount));
+
+	/* Link old STEs rule_mem list to the new ste */
+	mlx5dr_rule_update_rule_member(cur_ste, new_ste);
+	INIT_LIST_HEAD(&new_ste->rule_list);
+	list_splice_tail_init(&cur_ste->rule_list, &new_ste->rule_list);
+}
+
+static struct mlx5dr_ste *
+dr_rule_rehash_copy_ste(struct mlx5dr_matcher *matcher,
+			struct mlx5dr_matcher_rx_tx *nic_matcher,
+			struct mlx5dr_ste *cur_ste,
+			struct mlx5dr_ste_htbl *new_htbl,
+			struct list_head *update_list)
+{
+	struct mlx5dr_ste_send_info *ste_info;
+	bool use_update_list = false;
+	u8 hw_ste[DR_STE_SIZE] = {};
+	struct mlx5dr_ste *new_ste;
+	int new_idx;
+	u8 sb_idx;
+
+	/* Copy STE mask from the matcher */
+	sb_idx = cur_ste->ste_chain_location - 1;
+	mlx5dr_ste_set_bit_mask(hw_ste, nic_matcher->ste_builder[sb_idx].bit_mask);
+
+	/* Copy STE control and tag */
+	memcpy(hw_ste, cur_ste->hw_ste, DR_STE_SIZE_REDUCED);
+	mlx5dr_ste_set_miss_addr(hw_ste, nic_matcher->e_anchor->chunk->icm_addr);
+
+	new_idx = mlx5dr_ste_calc_hash_index(hw_ste, new_htbl);
+	new_ste = &new_htbl->ste_arr[new_idx];
+
+	if (mlx5dr_ste_not_used_ste(new_ste)) {
+		mlx5dr_htbl_get(new_htbl);
+		list_add_tail(&new_ste->miss_list_node,
+			      mlx5dr_ste_get_miss_list(new_ste));
+	} else {
+		new_ste = dr_rule_rehash_handle_collision(matcher,
+							  nic_matcher,
+							  update_list,
+							  new_ste,
+							  hw_ste);
+		if (!new_ste) {
+			mlx5dr_dbg(matcher->tbl->dmn, "Failed adding collision entry, index: %d\n",
+				   new_idx);
+			return NULL;
+		}
+		new_htbl->ctrl.num_of_collisions++;
+		use_update_list = true;
+	}
+
+	memcpy(new_ste->hw_ste, hw_ste, DR_STE_SIZE_REDUCED);
+
+	new_htbl->ctrl.num_of_valid_entries++;
+
+	if (use_update_list) {
+		ste_info = kzalloc(sizeof(*ste_info), GFP_KERNEL);
+		if (!ste_info)
+			goto err_exit;
+
+		mlx5dr_send_fill_and_append_ste_send_info(new_ste, DR_STE_SIZE, 0,
+							  hw_ste, ste_info,
+							  update_list, true);
+	}
+
+	dr_rule_rehash_copy_ste_ctrl(matcher, nic_matcher, cur_ste, new_ste);
+
+	return new_ste;
+
+err_exit:
+	mlx5dr_ste_free(new_ste, matcher, nic_matcher);
+	return NULL;
+}
+
+static int dr_rule_rehash_copy_miss_list(struct mlx5dr_matcher *matcher,
+					 struct mlx5dr_matcher_rx_tx *nic_matcher,
+					 struct list_head *cur_miss_list,
+					 struct mlx5dr_ste_htbl *new_htbl,
+					 struct list_head *update_list)
+{
+	struct mlx5dr_ste *tmp_ste, *cur_ste, *new_ste;
+
+	if (list_empty(cur_miss_list))
+		return 0;
+
+	list_for_each_entry_safe(cur_ste, tmp_ste, cur_miss_list, miss_list_node) {
+		new_ste = dr_rule_rehash_copy_ste(matcher,
+						  nic_matcher,
+						  cur_ste,
+						  new_htbl,
+						  update_list);
+		if (!new_ste)
+			goto err_insert;
+
+		list_del(&cur_ste->miss_list_node);
+		mlx5dr_htbl_put(cur_ste->htbl);
+	}
+	return 0;
+
+err_insert:
+	mlx5dr_err(matcher->tbl->dmn, "Fatal error during resize\n");
+	WARN_ON(true);
+	return -EINVAL;
+}
+
+static int dr_rule_rehash_copy_htbl(struct mlx5dr_matcher *matcher,
+				    struct mlx5dr_matcher_rx_tx *nic_matcher,
+				    struct mlx5dr_ste_htbl *cur_htbl,
+				    struct mlx5dr_ste_htbl *new_htbl,
+				    struct list_head *update_list)
+{
+	struct mlx5dr_ste *cur_ste;
+	int cur_entries;
+	int err = 0;
+	int i;
+
+	cur_entries = mlx5dr_icm_pool_chunk_size_to_entries(cur_htbl->chunk_size);
+
+	if (cur_entries < 1) {
+		mlx5dr_dbg(matcher->tbl->dmn, "Invalid number of entries\n");
+		return -EINVAL;
+	}
+
+	for (i = 0; i < cur_entries; i++) {
+		cur_ste = &cur_htbl->ste_arr[i];
+		if (mlx5dr_ste_not_used_ste(cur_ste)) /* Empty, nothing to copy */
+			continue;
+
+		err = dr_rule_rehash_copy_miss_list(matcher,
+						    nic_matcher,
+						    mlx5dr_ste_get_miss_list(cur_ste),
+						    new_htbl,
+						    update_list);
+		if (err)
+			goto clean_copy;
+	}
+
+clean_copy:
+	return err;
+}
+
+static struct mlx5dr_ste_htbl *
+dr_rule_rehash_htbl(struct mlx5dr_rule *rule,
+		    struct mlx5dr_rule_rx_tx *nic_rule,
+		    struct mlx5dr_ste_htbl *cur_htbl,
+		    u8 ste_location,
+		    struct list_head *update_list,
+		    enum mlx5dr_icm_chunk_size new_size)
+{
+	struct mlx5dr_ste_send_info *del_ste_info, *tmp_ste_info;
+	struct mlx5dr_matcher *matcher = rule->matcher;
+	struct mlx5dr_domain *dmn = matcher->tbl->dmn;
+	struct mlx5dr_matcher_rx_tx *nic_matcher;
+	struct mlx5dr_ste_send_info *ste_info;
+	struct mlx5dr_htbl_connect_info info;
+	struct mlx5dr_domain_rx_tx *nic_dmn;
+	u8 formatted_ste[DR_STE_SIZE] = {};
+	LIST_HEAD(rehash_table_send_list);
+	struct mlx5dr_ste *ste_to_update;
+	struct mlx5dr_ste_htbl *new_htbl;
+	int err;
+
+	nic_matcher = nic_rule->nic_matcher;
+	nic_dmn = nic_matcher->nic_tbl->nic_dmn;
+
+	ste_info = kzalloc(sizeof(*ste_info), GFP_KERNEL);
+	if (!ste_info)
+		return NULL;
+
+	new_htbl = mlx5dr_ste_htbl_alloc(dmn->ste_icm_pool,
+					 new_size,
+					 cur_htbl->lu_type,
+					 cur_htbl->byte_mask);
+	if (!new_htbl) {
+		mlx5dr_err(dmn, "Failed to allocate new hash table\n");
+		goto free_ste_info;
+	}
+
+	/* Write new table to HW */
+	info.type = CONNECT_MISS;
+	info.miss_icm_addr = nic_matcher->e_anchor->chunk->icm_addr;
+	mlx5dr_ste_set_formatted_ste(dmn->info.caps.gvmi,
+				     nic_dmn,
+				     new_htbl,
+				     formatted_ste,
+				     &info);
+
+	new_htbl->pointing_ste = cur_htbl->pointing_ste;
+	new_htbl->pointing_ste->next_htbl = new_htbl;
+	err = dr_rule_rehash_copy_htbl(matcher,
+				       nic_matcher,
+				       cur_htbl,
+				       new_htbl,
+				       &rehash_table_send_list);
+	if (err)
+		goto free_new_htbl;
+
+	if (mlx5dr_send_postsend_htbl(dmn, new_htbl, formatted_ste,
+				      nic_matcher->ste_builder[ste_location - 1].bit_mask)) {
+		mlx5dr_err(dmn, "Failed writing table to HW\n");
+		goto free_new_htbl;
+	}
+
+	/* Writing to the hw is done in regular order of rehash_table_send_list,
+	 * in order to have the origin data written before the miss address of
+	 * collision entries, if exists.
+	 */
+	if (dr_rule_send_update_list(&rehash_table_send_list, dmn, false)) {
+		mlx5dr_err(dmn, "Failed updating table to HW\n");
+		goto free_ste_list;
+	}
+
+	/* Connect previous hash table to current */
+	if (ste_location == 1) {
+		/* The previous table is an anchor, anchors size is always one STE */
+		struct mlx5dr_ste_htbl *prev_htbl = cur_htbl->pointing_ste->htbl;
+
+		/* On matcher s_anchor we keep an extra refcount */
+		mlx5dr_htbl_get(new_htbl);
+		mlx5dr_htbl_put(cur_htbl);
+
+		nic_matcher->s_htbl = new_htbl;
+
+		/* It is safe to operate dr_ste_set_hit_addr on the hw_ste here
+		 * (48B len) which works only on first 32B
+		 */
+		mlx5dr_ste_set_hit_addr(prev_htbl->ste_arr[0].hw_ste,
+					new_htbl->chunk->icm_addr,
+					new_htbl->chunk->num_of_entries);
+
+		ste_to_update = &prev_htbl->ste_arr[0];
+	} else {
+		mlx5dr_ste_set_hit_addr_by_next_htbl(cur_htbl->pointing_ste->hw_ste,
+						     new_htbl);
+		ste_to_update = cur_htbl->pointing_ste;
+	}
+
+	mlx5dr_send_fill_and_append_ste_send_info(ste_to_update, DR_STE_SIZE_REDUCED,
+						  0, ste_to_update->hw_ste, ste_info,
+						  update_list, false);
+
+	return new_htbl;
+
+free_ste_list:
+	/* Clean all ste_info's from the new table */
+	list_for_each_entry_safe(del_ste_info, tmp_ste_info,
+				 &rehash_table_send_list, send_list) {
+		list_del(&del_ste_info->send_list);
+		kfree(del_ste_info);
+	}
+
+free_new_htbl:
+	mlx5dr_ste_htbl_free(new_htbl);
+free_ste_info:
+	kfree(ste_info);
+	mlx5dr_info(dmn, "Failed creating rehash table\n");
+	return NULL;
+}
+
+static struct mlx5dr_ste_htbl *dr_rule_rehash(struct mlx5dr_rule *rule,
+					      struct mlx5dr_rule_rx_tx *nic_rule,
+					      struct mlx5dr_ste_htbl *cur_htbl,
+					      u8 ste_location,
+					      struct list_head *update_list)
+{
+	struct mlx5dr_domain *dmn = rule->matcher->tbl->dmn;
+	enum mlx5dr_icm_chunk_size new_size;
+
+	new_size = mlx5dr_icm_next_higher_chunk(cur_htbl->chunk_size);
+	new_size = min_t(u32, new_size, dmn->info.max_log_sw_icm_sz);
+
+	if (new_size == cur_htbl->chunk_size)
+		return NULL; /* Skip rehash, we already at the max size */
+
+	return dr_rule_rehash_htbl(rule, nic_rule, cur_htbl, ste_location,
+				   update_list, new_size);
+}
+
+static struct mlx5dr_ste *
+dr_rule_handle_collision(struct mlx5dr_matcher *matcher,
+			 struct mlx5dr_matcher_rx_tx *nic_matcher,
+			 struct mlx5dr_ste *ste,
+			 u8 *hw_ste,
+			 struct list_head *miss_list,
+			 struct list_head *send_list)
+{
+	struct mlx5dr_ste_send_info *ste_info;
+	struct mlx5dr_ste *new_ste;
+
+	ste_info = kzalloc(sizeof(*ste_info), GFP_KERNEL);
+	if (!ste_info)
+		return NULL;
+
+	new_ste = dr_rule_create_collision_entry(matcher, nic_matcher, hw_ste, ste);
+	if (!new_ste)
+		goto free_send_info;
+
+	if (dr_rule_append_to_miss_list(new_ste, miss_list, send_list)) {
+		mlx5dr_dbg(matcher->tbl->dmn, "Failed to update prev miss_list\n");
+		goto err_exit;
+	}
+
+	mlx5dr_send_fill_and_append_ste_send_info(new_ste, DR_STE_SIZE, 0, hw_ste,
+						  ste_info, send_list, false);
+
+	ste->htbl->ctrl.num_of_collisions++;
+	ste->htbl->ctrl.num_of_valid_entries++;
+
+	return new_ste;
+
+err_exit:
+	mlx5dr_ste_free(new_ste, matcher, nic_matcher);
+free_send_info:
+	kfree(ste_info);
+	return NULL;
+}
+
+static void dr_rule_remove_action_members(struct mlx5dr_rule *rule)
+{
+	struct mlx5dr_rule_action_member *action_mem;
+	struct mlx5dr_rule_action_member *tmp;
+
+	list_for_each_entry_safe(action_mem, tmp, &rule->rule_actions_list, list) {
+		list_del(&action_mem->list);
+		refcount_dec(&action_mem->action->refcount);
+		kvfree(action_mem);
+	}
+}
+
+static int dr_rule_add_action_members(struct mlx5dr_rule *rule,
+				      size_t num_actions,
+				      struct mlx5dr_action *actions[])
+{
+	struct mlx5dr_rule_action_member *action_mem;
+	int i;
+
+	for (i = 0; i < num_actions; i++) {
+		action_mem = kvzalloc(sizeof(*action_mem), GFP_KERNEL);
+		if (!action_mem)
+			goto free_action_members;
+
+		action_mem->action = actions[i];
+		INIT_LIST_HEAD(&action_mem->list);
+		list_add_tail(&action_mem->list, &rule->rule_actions_list);
+		refcount_inc(&action_mem->action->refcount);
+	}
+
+	return 0;
+
+free_action_members:
+	dr_rule_remove_action_members(rule);
+	return -ENOMEM;
+}
+
+/* While the pointer of ste is no longer valid, like while moving ste to be
+ * the first in the miss_list, and to be in the origin table,
+ * all rule-members that are attached to this ste should update their ste member
+ * to the new pointer
+ */
+void mlx5dr_rule_update_rule_member(struct mlx5dr_ste *ste,
+				    struct mlx5dr_ste *new_ste)
+{
+	struct mlx5dr_rule_member *rule_mem;
+
+	if (!list_empty(&ste->rule_list))
+		list_for_each_entry(rule_mem, &ste->rule_list, use_ste_list)
+			rule_mem->ste = new_ste;
+}
+
+static void dr_rule_clean_rule_members(struct mlx5dr_rule *rule,
+				       struct mlx5dr_rule_rx_tx *nic_rule)
+{
+	struct mlx5dr_rule_member *rule_mem;
+	struct mlx5dr_rule_member *tmp_mem;
+
+	if (list_empty(&nic_rule->rule_members_list))
+		return;
+	list_for_each_entry_safe(rule_mem, tmp_mem, &nic_rule->rule_members_list, list) {
+		list_del(&rule_mem->list);
+		list_del(&rule_mem->use_ste_list);
+		mlx5dr_ste_put(rule_mem->ste, rule->matcher, nic_rule->nic_matcher);
+		kvfree(rule_mem);
+	}
+}
+
+static bool dr_rule_need_enlarge_hash(struct mlx5dr_ste_htbl *htbl,
+				      struct mlx5dr_domain *dmn,
+				      struct mlx5dr_domain_rx_tx *nic_dmn)
+{
+	struct mlx5dr_ste_htbl_ctrl *ctrl = &htbl->ctrl;
+
+	if (dmn->info.max_log_sw_icm_sz <= htbl->chunk_size)
+		return false;
+
+	if (!ctrl->may_grow)
+		return false;
+
+	if (ctrl->num_of_collisions >= ctrl->increase_threshold &&
+	    (ctrl->num_of_valid_entries - ctrl->num_of_collisions) >= ctrl->increase_threshold)
+		return true;
+
+	return false;
+}
+
+static int dr_rule_add_member(struct mlx5dr_rule_rx_tx *nic_rule,
+			      struct mlx5dr_ste *ste)
+{
+	struct mlx5dr_rule_member *rule_mem;
+
+	rule_mem = kvzalloc(sizeof(*rule_mem), GFP_KERNEL);
+	if (!rule_mem)
+		return -ENOMEM;
+
+	rule_mem->ste = ste;
+	list_add_tail(&rule_mem->list, &nic_rule->rule_members_list);
+
+	list_add_tail(&rule_mem->use_ste_list, &ste->rule_list);
+
+	return 0;
+}
+
+static int dr_rule_handle_action_stes(struct mlx5dr_rule *rule,
+				      struct mlx5dr_rule_rx_tx *nic_rule,
+				      struct list_head *send_ste_list,
+				      struct mlx5dr_ste *last_ste,
+				      u8 *hw_ste_arr,
+				      u32 new_hw_ste_arr_sz)
+{
+	struct mlx5dr_matcher_rx_tx *nic_matcher = nic_rule->nic_matcher;
+	struct mlx5dr_ste_send_info *ste_info_arr[DR_ACTION_MAX_STES];
+	u8 num_of_builders = nic_matcher->num_of_builders;
+	struct mlx5dr_matcher *matcher = rule->matcher;
+	u8 *curr_hw_ste, *prev_hw_ste;
+	struct mlx5dr_ste *action_ste;
+	int i, k, ret;
+
+	/* Two cases:
+	 * 1. num_of_builders is equal to new_hw_ste_arr_sz, the action in the ste
+	 * 2. num_of_builders is less then new_hw_ste_arr_sz, new ste was added
+	 *    to support the action.
+	 */
+	if (num_of_builders == new_hw_ste_arr_sz)
+		return 0;
+
+	for (i = num_of_builders, k = 0; i < new_hw_ste_arr_sz; i++, k++) {
+		curr_hw_ste = hw_ste_arr + i * DR_STE_SIZE;
+		prev_hw_ste = (i == 0) ? curr_hw_ste : hw_ste_arr + ((i - 1) * DR_STE_SIZE);
+		action_ste = dr_rule_create_collision_htbl(matcher,
+							   nic_matcher,
+							   curr_hw_ste);
+		if (!action_ste)
+			return -ENOMEM;
+
+		mlx5dr_ste_get(action_ste);
+
+		/* While free ste we go over the miss list, so add this ste to the list */
+		list_add_tail(&action_ste->miss_list_node,
+			      mlx5dr_ste_get_miss_list(action_ste));
+
+		ste_info_arr[k] = kzalloc(sizeof(*ste_info_arr[k]),
+					  GFP_KERNEL);
+		if (!ste_info_arr[k])
+			goto err_exit;
+
+		/* Point current ste to the new action */
+		mlx5dr_ste_set_hit_addr_by_next_htbl(prev_hw_ste, action_ste->htbl);
+		ret = dr_rule_add_member(nic_rule, action_ste);
+		if (ret) {
+			mlx5dr_dbg(matcher->tbl->dmn, "Failed adding rule member\n");
+			goto free_ste_info;
+		}
+		mlx5dr_send_fill_and_append_ste_send_info(action_ste, DR_STE_SIZE, 0,
+							  curr_hw_ste,
+							  ste_info_arr[k],
+							  send_ste_list, false);
+	}
+
+	return 0;
+
+free_ste_info:
+	kfree(ste_info_arr[k]);
+err_exit:
+	mlx5dr_ste_put(action_ste, matcher, nic_matcher);
+	return -ENOMEM;
+}
+
+static int dr_rule_handle_empty_entry(struct mlx5dr_matcher *matcher,
+				      struct mlx5dr_matcher_rx_tx *nic_matcher,
+				      struct mlx5dr_ste_htbl *cur_htbl,
+				      struct mlx5dr_ste *ste,
+				      u8 ste_location,
+				      u8 *hw_ste,
+				      struct list_head *miss_list,
+				      struct list_head *send_list)
+{
+	struct mlx5dr_ste_send_info *ste_info;
+
+	/* Take ref on table, only on first time this ste is used */
+	mlx5dr_htbl_get(cur_htbl);
+
+	/* new entry -> new branch */
+	list_add_tail(&ste->miss_list_node, miss_list);
+
+	mlx5dr_ste_set_miss_addr(hw_ste, nic_matcher->e_anchor->chunk->icm_addr);
+
+	ste->ste_chain_location = ste_location;
+
+	ste_info = kzalloc(sizeof(*ste_info), GFP_KERNEL);
+	if (!ste_info)
+		goto clean_ste_setting;
+
+	if (mlx5dr_ste_create_next_htbl(matcher,
+					nic_matcher,
+					ste,
+					hw_ste,
+					DR_CHUNK_SIZE_1)) {
+		mlx5dr_dbg(matcher->tbl->dmn, "Failed allocating table\n");
+		goto clean_ste_info;
+	}
+
+	cur_htbl->ctrl.num_of_valid_entries++;
+
+	mlx5dr_send_fill_and_append_ste_send_info(ste, DR_STE_SIZE, 0, hw_ste,
+						  ste_info, send_list, false);
+
+	return 0;
+
+clean_ste_info:
+	kfree(ste_info);
+clean_ste_setting:
+	list_del_init(&ste->miss_list_node);
+	mlx5dr_htbl_put(cur_htbl);
+
+	return -ENOMEM;
+}
+
+static struct mlx5dr_ste *
+dr_rule_handle_ste_branch(struct mlx5dr_rule *rule,
+			  struct mlx5dr_rule_rx_tx *nic_rule,
+			  struct list_head *send_ste_list,
+			  struct mlx5dr_ste_htbl *cur_htbl,
+			  u8 *hw_ste,
+			  u8 ste_location,
+			  struct mlx5dr_ste_htbl **put_htbl)
+{
+	struct mlx5dr_matcher *matcher = rule->matcher;
+	struct mlx5dr_domain *dmn = matcher->tbl->dmn;
+	struct mlx5dr_matcher_rx_tx *nic_matcher;
+	struct mlx5dr_domain_rx_tx *nic_dmn;
+	struct mlx5dr_ste_htbl *new_htbl;
+	struct mlx5dr_ste *matched_ste;
+	struct list_head *miss_list;
+	bool skip_rehash = false;
+	struct mlx5dr_ste *ste;
+	int index;
+
+	nic_matcher = nic_rule->nic_matcher;
+	nic_dmn = nic_matcher->nic_tbl->nic_dmn;
+
+again:
+	index = mlx5dr_ste_calc_hash_index(hw_ste, cur_htbl);
+	miss_list = &cur_htbl->chunk->miss_list[index];
+	ste = &cur_htbl->ste_arr[index];
+
+	if (mlx5dr_ste_not_used_ste(ste)) {
+		if (dr_rule_handle_empty_entry(matcher, nic_matcher, cur_htbl,
+					       ste, ste_location,
+					       hw_ste, miss_list,
+					       send_ste_list))
+			return NULL;
+	} else {
+		/* Hash table index in use, check if this ste is in the miss list */
+		matched_ste = dr_rule_find_ste_in_miss_list(miss_list, hw_ste);
+		if (matched_ste) {
+			/* If it is last STE in the chain, and has the same tag
+			 * it means that all the previous stes are the same,
+			 * if so, this rule is duplicated.
+			 */
+			if (mlx5dr_ste_is_last_in_rule(nic_matcher,
+						       matched_ste->ste_chain_location)) {
+				mlx5dr_info(dmn, "Duplicate rule inserted, aborting!!\n");
+				return NULL;
+			}
+			return matched_ste;
+		}
+
+		if (!skip_rehash && dr_rule_need_enlarge_hash(cur_htbl, dmn, nic_dmn)) {
+			/* Hash table index in use, try to resize of the hash */
+			skip_rehash = true;
+
+			/* Hold the table till we update.
+			 * Release in dr_rule_create_rule()
+			 */
+			*put_htbl = cur_htbl;
+			mlx5dr_htbl_get(cur_htbl);
+
+			new_htbl = dr_rule_rehash(rule, nic_rule, cur_htbl,
+						  ste_location, send_ste_list);
+			if (!new_htbl) {
+				mlx5dr_htbl_put(cur_htbl);
+				mlx5dr_info(dmn, "failed creating rehash table, htbl-log_size: %d\n",
+					    cur_htbl->chunk_size);
+			} else {
+				cur_htbl = new_htbl;
+			}
+			goto again;
+		} else {
+			/* Hash table index in use, add another collision (miss) */
+			ste = dr_rule_handle_collision(matcher,
+						       nic_matcher,
+						       ste,
+						       hw_ste,
+						       miss_list,
+						       send_ste_list);
+			if (!ste) {
+				mlx5dr_dbg(dmn, "failed adding collision entry, index: %d\n",
+					   index);
+				return NULL;
+			}
+		}
+	}
+	return ste;
+}
+
+static bool dr_rule_cmp_value_to_mask(u8 *mask, u8 *value,
+				      u32 s_idx, u32 e_idx)
+{
+	u32 i;
+
+	for (i = s_idx; i < e_idx; i++) {
+		if (value[i] & ~mask[i]) {
+			pr_info("Rule parameters contains a value not specified by mask\n");
+			return false;
+		}
+	}
+	return true;
+}
+
+static bool dr_rule_verify(struct mlx5dr_matcher *matcher,
+			   struct mlx5dr_match_parameters *value,
+			   struct mlx5dr_match_param *param)
+{
+	u8 match_criteria = matcher->match_criteria;
+	size_t value_size = value->match_sz;
+	u8 *mask_p = (u8 *)&matcher->mask;
+	u8 *param_p = (u8 *)param;
+	u32 s_idx, e_idx;
+
+	if (!value_size ||
+	    (value_size > sizeof(struct mlx5dr_match_param) ||
+	     (value_size % sizeof(u32)))) {
+		mlx5dr_dbg(matcher->tbl->dmn, "Rule parameters length is incorrect\n");
+		return false;
+	}
+
+	mlx5dr_ste_copy_param(matcher->match_criteria, param, value);
+
+	if (match_criteria & DR_MATCHER_CRITERIA_OUTER) {
+		s_idx = offsetof(struct mlx5dr_match_param, outer);
+		e_idx = min(s_idx + sizeof(param->outer), value_size);
+
+		if (!dr_rule_cmp_value_to_mask(mask_p, param_p, s_idx, e_idx)) {
+			mlx5dr_dbg(matcher->tbl->dmn, "Rule outer parameters contains a value not specified by mask\n");
+			return false;
+		}
+	}
+
+	if (match_criteria & DR_MATCHER_CRITERIA_MISC) {
+		s_idx = offsetof(struct mlx5dr_match_param, misc);
+		e_idx = min(s_idx + sizeof(param->misc), value_size);
+
+		if (!dr_rule_cmp_value_to_mask(mask_p, param_p, s_idx, e_idx)) {
+			mlx5dr_dbg(matcher->tbl->dmn, "Rule misc parameters contains a value not specified by mask\n");
+			return false;
+		}
+	}
+
+	if (match_criteria & DR_MATCHER_CRITERIA_INNER) {
+		s_idx = offsetof(struct mlx5dr_match_param, inner);
+		e_idx = min(s_idx + sizeof(param->inner), value_size);
+
+		if (!dr_rule_cmp_value_to_mask(mask_p, param_p, s_idx, e_idx)) {
+			mlx5dr_dbg(matcher->tbl->dmn, "Rule inner parameters contains a value not specified by mask\n");
+			return false;
+		}
+	}
+
+	if (match_criteria & DR_MATCHER_CRITERIA_MISC2) {
+		s_idx = offsetof(struct mlx5dr_match_param, misc2);
+		e_idx = min(s_idx + sizeof(param->misc2), value_size);
+
+		if (!dr_rule_cmp_value_to_mask(mask_p, param_p, s_idx, e_idx)) {
+			mlx5dr_dbg(matcher->tbl->dmn, "Rule misc2 parameters contains a value not specified by mask\n");
+			return false;
+		}
+	}
+
+	if (match_criteria & DR_MATCHER_CRITERIA_MISC3) {
+		s_idx = offsetof(struct mlx5dr_match_param, misc3);
+		e_idx = min(s_idx + sizeof(param->misc3), value_size);
+
+		if (!dr_rule_cmp_value_to_mask(mask_p, param_p, s_idx, e_idx)) {
+			mlx5dr_dbg(matcher->tbl->dmn, "Rule misc3 parameters contains a value not specified by mask\n");
+			return false;
+		}
+	}
+	return true;
+}
+
+static int dr_rule_destroy_rule_nic(struct mlx5dr_rule *rule,
+				    struct mlx5dr_rule_rx_tx *nic_rule)
+{
+	dr_rule_clean_rule_members(rule, nic_rule);
+	return 0;
+}
+
+static int dr_rule_destroy_rule_fdb(struct mlx5dr_rule *rule)
+{
+	dr_rule_destroy_rule_nic(rule, &rule->rx);
+	dr_rule_destroy_rule_nic(rule, &rule->tx);
+	return 0;
+}
+
+static int dr_rule_destroy_rule(struct mlx5dr_rule *rule)
+{
+	struct mlx5dr_domain *dmn = rule->matcher->tbl->dmn;
+
+	switch (dmn->type) {
+	case MLX5DR_DOMAIN_TYPE_NIC_RX:
+		dr_rule_destroy_rule_nic(rule, &rule->rx);
+		break;
+	case MLX5DR_DOMAIN_TYPE_NIC_TX:
+		dr_rule_destroy_rule_nic(rule, &rule->tx);
+		break;
+	case MLX5DR_DOMAIN_TYPE_FDB:
+		dr_rule_destroy_rule_fdb(rule);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	dr_rule_remove_action_members(rule);
+	kfree(rule);
+	return 0;
+}
+
+static bool dr_rule_is_ipv6(struct mlx5dr_match_param *param)
+{
+	return (param->outer.ip_version == 6 ||
+		param->inner.ip_version == 6 ||
+		param->outer.ethertype == ETH_P_IPV6 ||
+		param->inner.ethertype == ETH_P_IPV6);
+}
+
+static bool dr_rule_skip(enum mlx5dr_domain_type domain,
+			 enum mlx5dr_ste_entry_type ste_type,
+			 struct mlx5dr_match_param *mask,
+			 struct mlx5dr_match_param *value)
+{
+	if (domain != MLX5DR_DOMAIN_TYPE_FDB)
+		return false;
+
+	if (mask->misc.source_port) {
+		if (ste_type == MLX5DR_STE_TYPE_RX)
+			if (value->misc.source_port != WIRE_PORT)
+				return true;
+
+		if (ste_type == MLX5DR_STE_TYPE_TX)
+			if (value->misc.source_port == WIRE_PORT)
+				return true;
+	}
+
+	/* Metadata C can be used to describe the source vport */
+	if (mask->misc2.metadata_reg_c_0) {
+		if (ste_type == MLX5DR_STE_TYPE_RX)
+			if ((value->misc2.metadata_reg_c_0 & WIRE_PORT) != WIRE_PORT)
+				return true;
+
+		if (ste_type == MLX5DR_STE_TYPE_TX)
+			if ((value->misc2.metadata_reg_c_0 & WIRE_PORT) == WIRE_PORT)
+				return true;
+	}
+	return false;
+}
+
+static int
+dr_rule_create_rule_nic(struct mlx5dr_rule *rule,
+			struct mlx5dr_rule_rx_tx *nic_rule,
+			struct mlx5dr_match_param *param,
+			size_t num_actions,
+			struct mlx5dr_action *actions[])
+{
+	struct mlx5dr_ste_send_info *ste_info, *tmp_ste_info;
+	struct mlx5dr_matcher *matcher = rule->matcher;
+	struct mlx5dr_domain *dmn = matcher->tbl->dmn;
+	struct mlx5dr_matcher_rx_tx *nic_matcher;
+	struct mlx5dr_domain_rx_tx *nic_dmn;
+	struct mlx5dr_ste_htbl *htbl = NULL;
+	struct mlx5dr_ste_htbl *cur_htbl;
+	struct mlx5dr_ste *ste = NULL;
+	LIST_HEAD(send_ste_list);
+	u8 *hw_ste_arr = NULL;
+	u32 new_hw_ste_arr_sz;
+	int ret, i;
+
+	nic_matcher = nic_rule->nic_matcher;
+	nic_dmn = nic_matcher->nic_tbl->nic_dmn;
+
+	INIT_LIST_HEAD(&nic_rule->rule_members_list);
+
+	if (dr_rule_skip(dmn->type, nic_dmn->ste_type, &matcher->mask, param))
+		return 0;
+
+	ret = mlx5dr_matcher_select_builders(matcher,
+					     nic_matcher,
+					     dr_rule_is_ipv6(param));
+	if (ret)
+		goto out_err;
+
+	hw_ste_arr = kzalloc(DR_RULE_MAX_STE_CHAIN * DR_STE_SIZE, GFP_KERNEL);
+	if (!hw_ste_arr) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	/* Set the tag values inside the ste array */
+	ret = mlx5dr_ste_build_ste_arr(matcher, nic_matcher, param, hw_ste_arr);
+	if (ret)
+		goto free_hw_ste;
+
+	/* Set the actions values/addresses inside the ste array */
+	ret = mlx5dr_actions_build_ste_arr(matcher, nic_matcher, actions,
+					   num_actions, hw_ste_arr,
+					   &new_hw_ste_arr_sz);
+	if (ret)
+		goto free_hw_ste;
+
+	cur_htbl = nic_matcher->s_htbl;
+
+	/* Go over the array of STEs, and build dr_ste accordingly.
+	 * The loop is over only the builders which are equal or less to the
+	 * number of stes, in case we have actions that lives in other stes.
+	 */
+	for (i = 0; i < nic_matcher->num_of_builders; i++) {
+		/* Calculate CRC and keep new ste entry */
+		u8 *cur_hw_ste_ent = hw_ste_arr + (i * DR_STE_SIZE);
+
+		ste = dr_rule_handle_ste_branch(rule,
+						nic_rule,
+						&send_ste_list,
+						cur_htbl,
+						cur_hw_ste_ent,
+						i + 1,
+						&htbl);
+		if (!ste) {
+			mlx5dr_err(dmn, "Failed creating next branch\n");
+			ret = -ENOENT;
+			goto free_rule;
+		}
+
+		cur_htbl = ste->next_htbl;
+
+		/* Keep all STEs in the rule struct */
+		ret = dr_rule_add_member(nic_rule, ste);
+		if (ret) {
+			mlx5dr_dbg(dmn, "Failed adding rule member index %d\n", i);
+			goto free_ste;
+		}
+
+		mlx5dr_ste_get(ste);
+	}
+
+	/* Connect actions */
+	ret = dr_rule_handle_action_stes(rule, nic_rule, &send_ste_list,
+					 ste, hw_ste_arr, new_hw_ste_arr_sz);
+	if (ret) {
+		mlx5dr_dbg(dmn, "Failed apply actions\n");
+		goto free_rule;
+	}
+	ret = dr_rule_send_update_list(&send_ste_list, dmn, true);
+	if (ret) {
+		mlx5dr_err(dmn, "Failed sending ste!\n");
+		goto free_rule;
+	}
+
+	if (htbl)
+		mlx5dr_htbl_put(htbl);
+
+	return 0;
+
+free_ste:
+	mlx5dr_ste_put(ste, matcher, nic_matcher);
+free_rule:
+	dr_rule_clean_rule_members(rule, nic_rule);
+	/* Clean all ste_info's */
+	list_for_each_entry_safe(ste_info, tmp_ste_info, &send_ste_list, send_list) {
+		list_del(&ste_info->send_list);
+		kfree(ste_info);
+	}
+free_hw_ste:
+	kfree(hw_ste_arr);
+out_err:
+	return ret;
+}
+
+static int
+dr_rule_create_rule_fdb(struct mlx5dr_rule *rule,
+			struct mlx5dr_match_param *param,
+			size_t num_actions,
+			struct mlx5dr_action *actions[])
+{
+	struct mlx5dr_match_param copy_param = {};
+	int ret;
+
+	/* Copy match_param since they will be consumed during the first
+	 * nic_rule insertion.
+	 */
+	memcpy(&copy_param, param, sizeof(struct mlx5dr_match_param));
+
+	ret = dr_rule_create_rule_nic(rule, &rule->rx, param,
+				      num_actions, actions);
+	if (ret)
+		return ret;
+
+	ret = dr_rule_create_rule_nic(rule, &rule->tx, &copy_param,
+				      num_actions, actions);
+	if (ret)
+		goto destroy_rule_nic_rx;
+
+	return 0;
+
+destroy_rule_nic_rx:
+	dr_rule_destroy_rule_nic(rule, &rule->rx);
+	return ret;
+}
+
+static struct mlx5dr_rule *
+dr_rule_create_rule(struct mlx5dr_matcher *matcher,
+		    struct mlx5dr_match_parameters *value,
+		    size_t num_actions,
+		    struct mlx5dr_action *actions[])
+{
+	struct mlx5dr_domain *dmn = matcher->tbl->dmn;
+	struct mlx5dr_match_param param = {};
+	struct mlx5dr_rule *rule;
+	int ret;
+
+	if (!dr_rule_verify(matcher, value, &param))
+		return NULL;
+
+	rule = kzalloc(sizeof(*rule), GFP_KERNEL);
+	if (!rule)
+		return NULL;
+
+	rule->matcher = matcher;
+	INIT_LIST_HEAD(&rule->rule_actions_list);
+
+	ret = dr_rule_add_action_members(rule, num_actions, actions);
+	if (ret)
+		goto free_rule;
+
+	switch (dmn->type) {
+	case MLX5DR_DOMAIN_TYPE_NIC_RX:
+		rule->rx.nic_matcher = &matcher->rx;
+		ret = dr_rule_create_rule_nic(rule, &rule->rx, &param,
+					      num_actions, actions);
+		break;
+	case MLX5DR_DOMAIN_TYPE_NIC_TX:
+		rule->tx.nic_matcher = &matcher->tx;
+		ret = dr_rule_create_rule_nic(rule, &rule->tx, &param,
+					      num_actions, actions);
+		break;
+	case MLX5DR_DOMAIN_TYPE_FDB:
+		rule->rx.nic_matcher = &matcher->rx;
+		rule->tx.nic_matcher = &matcher->tx;
+		ret = dr_rule_create_rule_fdb(rule, &param,
+					      num_actions, actions);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	if (ret)
+		goto remove_action_members;
+
+	return rule;
+
+remove_action_members:
+	dr_rule_remove_action_members(rule);
+free_rule:
+	kfree(rule);
+	mlx5dr_info(dmn, "Failed creating rule\n");
+	return NULL;
+}
+
+struct mlx5dr_rule *mlx5dr_rule_create(struct mlx5dr_matcher *matcher,
+				       struct mlx5dr_match_parameters *value,
+				       size_t num_actions,
+				       struct mlx5dr_action *actions[])
+{
+	struct mlx5dr_rule *rule;
+
+	mutex_lock(&matcher->tbl->dmn->mutex);
+	refcount_inc(&matcher->refcount);
+
+	rule = dr_rule_create_rule(matcher, value, num_actions, actions);
+	if (!rule)
+		refcount_dec(&matcher->refcount);
+
+	mutex_unlock(&matcher->tbl->dmn->mutex);
+
+	return rule;
+}
+
+int mlx5dr_rule_destroy(struct mlx5dr_rule *rule)
+{
+	struct mlx5dr_matcher *matcher = rule->matcher;
+	struct mlx5dr_table *tbl = rule->matcher->tbl;
+	int ret;
+
+	mutex_lock(&tbl->dmn->mutex);
+
+	ret = dr_rule_destroy_rule(rule);
+
+	mutex_unlock(&tbl->dmn->mutex);
+
+	if (!ret)
+		refcount_dec(&matcher->refcount);
+	return ret;
+}
-- 
2.21.0


^ permalink raw reply related

* Re: [net-next 01/18] net/mlx5: Add flow steering actions to fs_cmd shim layer
From: Saeed Mahameed @ 2019-09-03 20:08 UTC (permalink / raw)
  To: davem@davemloft.net
  Cc: Maor Gottlieb, Mark Bloch, netdev@vger.kernel.org, Alex Vesker,
	Erez Shitrit
In-Reply-To: <20190902.121012.1434735697208917415.davem@davemloft.net>

On Mon, 2019-09-02 at 12:10 -0700, David Miller wrote:
> From: Saeed Mahameed <saeedm@mellanox.com>
> Date: Mon, 2 Sep 2019 07:22:52 +0000
> 
> > +     maction->flow_action_raw.pkt_reformat =
> > +             mlx5_packet_reformat_alloc(dev->mdev, prm_prt, len,
> > +                                        in, namespace);
> > +     if (IS_ERR(maction->flow_action_raw.pkt_reformat))
> >               return ret;
> 
> Don't you have to initialize 'ret' to the pointer error here?
> 
> This transformation doesn't look correct.

Right! fixed in V2.

Thanks!

^ permalink raw reply

* Re: [PATCH] net/mlx5: Use PTR_ERR_OR_ZERO rather than its implementation
From: Saeed Mahameed @ 2019-09-03 20:08 UTC (permalink / raw)
  To: zhongjiang@huawei.com, davem@davemloft.net
  Cc: netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	leon@kernel.org
In-Reply-To: <1567493770-20074-1-git-send-email-zhongjiang@huawei.com>

On Tue, 2019-09-03 at 14:56 +0800, zhong jiang wrote:
> PTR_ERR_OR_ZERO contains if(IS_ERR(...)) + PTR_ERR. It is better
> to use it directly. hence just replace it.
> 
> Signed-off-by: zhong jiang <zhongjiang@huawei.com>
> ---
>  drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 5 +----
>  1 file changed, 1 insertion(+), 4 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
> b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
> index 5581a80..2e0b467 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
> @@ -989,10 +989,7 @@ static void mlx5e_hairpin_flow_del(struct
> mlx5e_priv *priv,
>  					    &flow_act, dest, dest_ix);
>  	mutex_unlock(&priv->fs.tc.t_lock);
>  
> -	if (IS_ERR(flow->rule[0]))
> -		return PTR_ERR(flow->rule[0]);
> -
> -	return 0;
> +	return PTR_ERR_OR_ZERO(flow->rule[0]);
>  }
>  
>  static void mlx5e_tc_del_nic_flow(struct mlx5e_priv *priv,

Acked-by: Saeed Mahameed <saeedm@mellanox.com>

^ permalink raw reply

* Re: [PATCH v4 1/2] netfilter: Terminate rule eval if protocol=IPv6 and ipv6 module is disabled
From: Pablo Neira Ayuso @ 2019-09-03 20:19 UTC (permalink / raw)
  To: Florian Westphal
  Cc: Leonardo Bras, netfilter-devel, coreteam, bridge, netdev,
	linux-kernel, Jozsef Kadlecsik, Roopa Prabhu, Nikolay Aleksandrov,
	David S. Miller
In-Reply-To: <20190903194809.GD13660@breakpoint.cc>

On Tue, Sep 03, 2019 at 09:48:09PM +0200, Florian Westphal wrote:
> Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > > > I was expecting we could find a way to handle this from br_netfilter
> > > > alone itself.
> > > 
> > > We can't because we support ipv6 fib lookups from the netdev family
> > > as well.
> > > 
> > > Alternative is to auto-accept ipv6 packets from the nf_tables eval loop,
> > > but I think its worse.
> > 
> > Could we add a restriction for nf_tables + br_netfilter + !ipv6. I
> > mean, if this is an IPv6 packet, nf_tables is on and IPv6 module if
> > off, then drop this packet?
> 
> We could do that from nft_do_chain_netdev().

Indeed, this is all about the netdev case.

Probably add something similar to nf_ip6_route() to deal with
ip6_route_lookup() case? This is the one trigering the problem, right?

BTW, how does nft_fib_ipv6 module kicks in if ipv6 module is not
loaded? The symbol dependency would pull in the IPv6 module anyway.

^ permalink raw reply

* Re: net/mlx5e: bind() always returns EINVAL with XDP_ZEROCOPY
From: Saeed Mahameed @ 2019-09-03 20:19 UTC (permalink / raw)
  To: kal.conley@dectris.com, brouer@redhat.com
  Cc: Maxim Mikityanskiy, magnus.karlsson@intel.com,
	toke.hoiland-jorgensen@kau.se, xdp-newbies@vger.kernel.org,
	Tariq Toukan, gospo@broadcom.com, jakub.kicinski@netronome.com,
	netdev@vger.kernel.org, bjorn.topel@intel.com
In-Reply-To: <20190902110818.2f6a8894@carbon>

On Mon, 2019-09-02 at 11:08 +0200, Jesper Dangaard Brouer wrote:
> On Sun, 1 Sep 2019 18:47:15 +0200
> Kal Cutter Conley <kal.conley@dectris.com> wrote:
> 
> > Hi,
> > I figured out the problem. Let me document the issue here for
> > others
> > and hopefully start a discussion.
> > 
> > The mlx5 driver uses special queue ids for ZC. If N is the number
> > of
> > configured queues, then for XDP_ZEROCOPY the queue ids start at N.
> > So
> > queue ids [0..N) can only be used with XDP_COPY and queue ids
> > [N..2N)
> > can only be used with XDP_ZEROCOPY.
> 
> Thanks for the followup and explanation on how mlx5 AF_XDP queue
> implementation is different from other vendors.
> 
> 
> > sudo ethtool -L eth0 combined 16
> > sudo samples/bpf/xdpsock -r -i eth0 -c -q 0   # OK
> > sudo samples/bpf/xdpsock -r -i eth0 -z -q 0   # ERROR
> > sudo samples/bpf/xdpsock -r -i eth0 -c -q 16  # ERROR
> > sudo samples/bpf/xdpsock -r -i eth0 -z -q 16  # OK
> > 
> > Why was this done? To use zerocopy if available and fallback on
> > copy
> > mode normally you would set sxdp_flags=0. However, here this is no
> > longer possible. To support this driver, you have to first try
> > binding
> > with XDP_ZEROCOPY and the special queue id, then if that fails, you
> > have to try binding again with a normal queue id. Peculiarities
> > like
> > this complicate the XDP user api. Maybe someone can explain the
> > benefits?
> 

in mlx5 we like to keep full functional separation between different
queues. Unlike other implementations in mlx5 kernel standard rx rings
can still function while xsk queues are opened. from user perspective
this should be very simple and very usefull:

queues 0..(N-1): can't be used for XSK ZC since they are standard RX
queues managed by kernel  and driver
queues N..(2N-1): Are XSK user app managed queues, they can't be used
for anything else.

benefits:
- RSS is not interrupted, Ongoing traffic and Current RX queues keeps
going normally when XSK apps are activated/deactivated on the fly.
- Well-defined full logical separation between different types of RX
queue.

as Jesper explained we understand the confusion, and we will come up
with a solution the fits all vendors.

> Thanks for complaining, it is actually valuable. It really illustrate
> the kernel need to improve in this area, which is what our talk[1] at
> LPC2019 (Sep 10) is about.
> 
> Title: "Making Networking Queues a First Class Citizen in the Kernel"
>  [1] https://linuxplumbersconf.org/event/4/contributions/462/
> 
> As you can see, several vendors are actually involved. Kudos to
> Magnus
> for taking initiative here!  It's unfortunately not solved
> "tomorrow",
> as first we have to agree this is needed (facility to register
> queues),
> then agree on API and get commitment from vendors, as this requires
> drivers changes.  There is a long road ahead, but I think it will be
> worthwhile in the end, as effective use of dedicated hardware queues
> (both RX and TX) is key to performance.
> 

^ permalink raw reply

* [net PATCH] net: sock_map, fix missing ulp check in sock hash case
From: John Fastabend @ 2019-09-03 20:24 UTC (permalink / raw)
  To: hdanton, jakub.kicinski, davem; +Cc: netdev, john.fastabend

sock_map and ULP only work together when ULP is loaded after the sock
map is loaded. In the sock_map case we added a check for this to fail
the load if ULP is already set. However, we missed the check on the
sock_hash side.

Add a ULP check to the sock_hash update path.

Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface")
Reported-by: syzbot+7a6ee4d0078eac6bf782@syzkaller.appspotmail.com
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 net/core/sock_map.c |    3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 1330a74..50916f9 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -656,6 +656,7 @@ static int sock_hash_update_common(struct bpf_map *map, void *key,
 				   struct sock *sk, u64 flags)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 key_size = map->key_size, hash;
 	struct bpf_htab_elem *elem, *elem_new;
 	struct bpf_htab_bucket *bucket;
@@ -666,6 +667,8 @@ static int sock_hash_update_common(struct bpf_map *map, void *key,
 	WARN_ON_ONCE(!rcu_read_lock_held());
 	if (unlikely(flags > BPF_EXIST))
 		return -EINVAL;
+	if (unlikely(icsk->icsk_ulp_data))
+		return -EINVAL;
 
 	link = sk_psock_init_link();
 	if (!link)


^ permalink raw reply related

* Re: [PATCH net-next] net: Fail explicit bind to local reserved ports
From: Subash Abhinov Kasiviswanathan @ 2019-09-03 20:34 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, stranche
In-Reply-To: <20190830.142202.1082989152863915040.davem@davemloft.net>

> I don't know how happy I am about this.  Whatever sets up the 
> transparent
> proxy business can block any attempt to communicate over these ports.
> 
> Also, protocols like SCTP need the new handling too.

Hi David

The purpose of this patch was to allow the transparent proxy application
to block the specific socket ranges to prevent the communication on the
specific ports.

Dropping packets for this particular port using iptables could lead to
applications on the system getting stuck without getting a socket error.
If bind fails explicitly, the application can atleast retry for some 
other
port.
Is there some alternate existing mechanism to achieve this already?

-- 
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project

^ permalink raw reply

* Re: [PATCH v4 1/2] netfilter: Terminate rule eval if protocol=IPv6 and ipv6 module is disabled
From: Florian Westphal @ 2019-09-03 20:35 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: Florian Westphal, Leonardo Bras, netfilter-devel, coreteam,
	bridge, netdev, linux-kernel, Jozsef Kadlecsik, Roopa Prabhu,
	Nikolay Aleksandrov, David S. Miller
In-Reply-To: <20190903201904.npna6dt25ug5gwvd@salvia>

Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> On Tue, Sep 03, 2019 at 09:48:09PM +0200, Florian Westphal wrote:
> > We could do that from nft_do_chain_netdev().
> 
> Indeed, this is all about the netdev case.
> 
> Probably add something similar to nf_ip6_route() to deal with
> ip6_route_lookup() case? This is the one trigering the problem, right?

Yes, this particular problem is caused by ipv6 fib not being
initialized due to ipv6.disable=1.  I don't know if there are cases
other than FIB.

> BTW, how does nft_fib_ipv6 module kicks in if ipv6 module is not
> loaded? The symbol dependency would pull in the IPv6 module anyway.

ipv6.disabled=1 does load the ipv6 module, but its non-functional.

^ permalink raw reply

* Re: [PATCH bpf-next V9 1/3] bpf: new helper to obtain namespace data from current task
From: Yonghong Song @ 2019-09-03 20:36 UTC (permalink / raw)
  To: Carlos Antonio Neira Bustos
  Cc: netdev@vger.kernel.org, Eric Biederman, brouer@redhat.com,
	bpf@vger.kernel.org
In-Reply-To: <20190903184502.2vpaqnoubbr7nnf6@ebpf-metal>



On 9/3/19 11:45 AM, Carlos Antonio Neira Bustos wrote:
> Hi Yonghong,
> 
>>> Yes, the samples/bpf test case can be removed.
>>> Could you create a selftest with tracpoint net/netif_receive_skb, which
>>> also uses the proposed helper? net/netif_receive_skb will happen in
>>> interrupt context and it should catch the issue as well if
>>> filename_lookup still get called in interrupt context.
>>
> For this one scenario I just created another selftest with the only difference
> that the tracepoint is /net/netif_receive_skb so this fails with -EPERM.
> Is that enough?.

This should be fine.

> 
> I have made this comment on include/uapi/linux/bpf.h, maybe is too terse?
> 
> struct bpf_pidns_info {
> 	__u32 dev;	/* dev_t from /proc/self/ns/pid inode */
> 	__u32 nsid;
> 	__u32 tgid;
> 	__u32 pid;
> };

Let us keep the above for now. I may have further comments based on
your test code which uses "dev".

> 
> I'm only missing clearing out those questions to be ready to submit v11 of this patch.

Please go ahead to submit the new version.

Thanks.

^ permalink raw reply

* Re: [PATCH v4 1/2] netfilter: Terminate rule eval if protocol=IPv6 and ipv6 module is disabled
From: Pablo Neira Ayuso @ 2019-09-03 20:55 UTC (permalink / raw)
  To: Florian Westphal
  Cc: Leonardo Bras, netfilter-devel, coreteam, bridge, netdev,
	linux-kernel, Jozsef Kadlecsik, Roopa Prabhu, Nikolay Aleksandrov,
	David S. Miller
In-Reply-To: <20190903203531.GF13660@breakpoint.cc>

On Tue, Sep 03, 2019 at 10:35:31PM +0200, Florian Westphal wrote:
> Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > On Tue, Sep 03, 2019 at 09:48:09PM +0200, Florian Westphal wrote:
> > > We could do that from nft_do_chain_netdev().
> > 
> > Indeed, this is all about the netdev case.
> > 
> > Probably add something similar to nf_ip6_route() to deal with
> > ip6_route_lookup() case? This is the one trigering the problem, right?
> 
> Yes, this particular problem is caused by ipv6 fib not being
> initialized due to ipv6.disable=1.  I don't know if there are cases
> other than FIB.
> 
> > BTW, how does nft_fib_ipv6 module kicks in if ipv6 module is not
> > loaded? The symbol dependency would pull in the IPv6 module anyway.
> 
> ipv6.disabled=1 does load the ipv6 module, but its non-functional.

I see, thanks for explaining.

^ permalink raw reply

* Re: [PATCH v4 1/2] netfilter: Terminate rule eval if protocol=IPv6 and ipv6 module is disabled
From: Pablo Neira Ayuso @ 2019-09-03 20:55 UTC (permalink / raw)
  To: Leonardo Bras
  Cc: netfilter-devel, coreteam, bridge, netdev, linux-kernel,
	Jozsef Kadlecsik, Florian Westphal, Roopa Prabhu,
	Nikolay Aleksandrov, David S. Miller
In-Reply-To: <20190830181354.26279-2-leonardo@linux.ibm.com>

On Fri, Aug 30, 2019 at 03:13:53PM -0300, Leonardo Bras wrote:
> If IPv6 is disabled on boot (ipv6.disable=1), but nft_fib_inet ends up
> dealing with a IPv6 packet, it causes a kernel panic in
> fib6_node_lookup_1(), crashing in bad_page_fault.
> 
> The panic is caused by trying to deference a very low address (0x38
> in ppc64le), due to ipv6.fib6_main_tbl = NULL.
> BUG: Kernel NULL pointer dereference at 0x00000038
> 
> The kernel panic was reproduced in a host that disabled IPv6 on boot and
> have to process guest packets (coming from a bridge) using it's ip6tables.
> 
> Terminate rule evaluation when packet protocol is IPv6 but the ipv6 module
> is not loaded.

Patch is applied, thanks.

^ permalink raw reply

* Re: [PATCH bpf-next 00/13] bpf: adding map batch processing support
From: Alexei Starovoitov @ 2019-09-03 21:01 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Yonghong Song, Jakub Kicinski, Brian Vazquez, Alexei Starovoitov,
	bpf@vger.kernel.org, netdev@vger.kernel.org, Daniel Borkmann,
	Kernel Team
In-Reply-To: <20190830211809.GB2101@mini-arch>

On Fri, Aug 30, 2019 at 02:18:09PM -0700, Stanislav Fomichev wrote:
> > > 
> > > I personally like Jakub's/Quentin's proposal more. So if I get to choose
> > > between this series and Jakub's filter+dump in BPF, I'd pick filter+dump
> > > (pending per-cpu issue which we actually care about).
> > > 
> > > But if we can have both, I don't have any objections; this patch

I think we need to have both.
imo Jakub's and Yonghong's approach are solving slightly different cases.

filter+dump via program is better suited for LRU map walks where filter prog
would do some non-trivial logic.
Whereas plain 'delete all' or 'dump all' is much simpler to use without
loading yet another prog just to dump it.
bpf infra today isn't quite ready for this very short lived auxiliary progs.
At prog load pages get read-only mapping, tlbs across cpus flushed,
kallsyms populated, FDs allocated, etc.
Loading the prog is a heavy operation. There was a chatter before to have
built-in progs. This filter+dump could benefit from builtin 'allow all'
or 'delete all' progs, but imo that complicates design and asks even
more questions than it answers. Should this builtin progs show up
in 'bpftool prog show' ? When do they load/unload? Same safety requirements
as normal progs? etc.
imo it's fine to have little bit overlap between apis.
So I think we should proceed with both batching apis.

Having said that I think both are suffering from the important issue pointed out
by Brian: when kernel deletes an element get_next_key iterator over hash/lru
map will produce duplicates.
The amount of duplicates can be huge. When batched iterator is slow and
bpf prog is doing a lot of update/delete, there could be 10x worth of duplicates,
since walk will resume from the beginning.
User space cannot be tasked to deal with it.
I think this issue has to be solved in the kernel first and it may require
different batching api.

One idea is to use bucket spin_lock and batch process it bucket-at-a-time.
From api pov the user space will tell kernel:
- here is the buffer for N element. start dump from the beginning.
- kernel will return <= N elements and an iterator.
- user space will pass this opaque iterator back to get another batch
For well behaved hash/lru map there will be zero or one elements per bucket.
When there are 2+ the batching logic can process them together.
If 'lookup' is requested the kernel can check whether user space provided
enough space for these 2 elements. If not abort the batch earlier.
get_next_key won't be used. Instead some sort of opaque iterator
will be returned to user space, so next batch lookup can start from it.
This iterator could be the index of the last dumped bucket.
This idea won't work for pathological hash tables though.
A lot of elements in a single bucket may be more than room for single batch.
In such case iterator will get stuck, since num_of_elements_in_bucket > batch_buf_size.
May be special error code can be used to solve that?

I hope we can come up with other ideas to have a stable iterator over hash table.
Let's use email to describe the ideas and upcoming LPC conference to
sort out details and finalize the one to use.

^ permalink raw reply

* Re: [PATCH] net: sched: taprio: Fix potential integer overflow in taprio_set_picos_per_byte
From: Vinicius Costa Gomes @ 2019-09-03 21:26 UTC (permalink / raw)
  To: Vladimir Oltean, Eric Dumazet
  Cc: Gustavo A. R. Silva, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
	David S. Miller, netdev, lkml
In-Reply-To: <CA+h21hpCAJhE8xhsgDQ55_MUUiesV=uVY4tD=TzaCE6wynUPoQ@mail.gmail.com>

Hi,

Vladimir Oltean <olteanv@gmail.com> writes:

> Right. And while we're at it, there's still the potential
> division-by-zero problem which I still don't know how to solve without
> implementing a full-blown __ethtool_get_link_ksettings parser that
> checks against all the possible outputs it can have under the "no
> carrier" condition - see "[RFC PATCH 1/1] phylink: Set speed to
> SPEED_UNKNOWN when there is no PHY connected" for details.
> And there's also a third fix to be made: the netdev_dbg should be made
> to print "speed" instead of "ecmd.base.speed".

For the ksettings part I am thinking on adding something like this to
ethtool.c. Do you think anything is missing (apart from the
documentation)?

->

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 95991e43..d37c80b 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -177,6 +177,9 @@ void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst,
 bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
 				     const unsigned long *src);
 
+u32 ethtool_link_ksettings_to_speed(const struct ethtool_link_ksettings *settings,
+				    u32 default_speed);
+
 /**
  * struct ethtool_ops - optional netdev operations
  * @get_drvinfo: Report driver/device information.  Should only set the
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 6288e69..80e3db3 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -539,6 +539,18 @@ struct ethtool_link_usettings {
 	} link_modes;
 };
 
+u32 ethtool_link_ksettings_to_speed(const struct ethtool_link_ksettings *settings,
+				   u32 default_speed)
+{
+	if (settings->base.speed == SPEED_UNKNOWN)
+		return default_speed;
+
+	if (settings->base.speed == 0)
+		return default_speed;
+
+	return settings->base.speed;
+}
+
 /* Internal kernel helper to query a device ethtool_link_settings. */
 int __ethtool_get_link_ksettings(struct net_device *dev,
 				 struct ethtool_link_ksettings *link_ksettings)

^ permalink raw reply related

* Re: Proposal: r8152 firmware patching framework
From: Prashant Malani @ 2019-09-03 21:32 UTC (permalink / raw)
  To: Bambi Yeh, David Miller
  Cc: Hayes Wang, Amber Chen, netdev@vger.kernel.org, Ryankao, Jackc,
	Albertk, marcochen@google.com, nic_swsd, Grant Grundler
In-Reply-To: <BAD4255E2724E442BCB37085A3D9C93AEEA087DF@RTITMBSVM03.realtek.com.tw>

Hi Bambi,

Thank you for your response. We'd be more than happy to assist in
working out a solution that would be acceptable by the upstream
maintainers.
I think having a maintainable and safe way to deploy firmware fixes
would be much appreciated by hardware users as well as upstream devs,
and certainly more manageable than big static byte-arrays in the
source code!

I've moved David to the TO list to hopefully get his suggestions and
guidance about how to design this in a upstream-compatible way.

I'd be happy to implement it too (I feel this can occur concurrent to
Hayes' upstreaming efforts).

David, could you kindly advise the best way to incorporate deploying
these firmware patches? This change link gives an idea of what we're
dealing with: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/1417953

My original strawman is to just have a simple firmware format like so:
<section1><size_in_bytes><address1><data1><address2><data2>...<addressN><dataN><section2>

The driver code can have parts to deal with each section in an
appropriate fashion (e.g is each data entry a word or a byte? does
this section have a key which needs to be written to a certain
register etc.)

We'd be grateful if you can offer your advice about best practices (or
suggestions about who might be a good reviewer), so that we can have a
design in place before sending out any patches.

Thanks and best regards,

-Prashant

On Tue, Sep 3, 2019 at 2:01 AM Bambi Yeh <bambi.yeh@realtek.com> wrote:
>
> Hi Prashant:
>
> We will try to implement your requests.
> Based on our experience, upstream reviewer often reject our modification if they have any concern.
> Do you think you can talk to them about this idea and see if they will accept it or not?
> Or if you can help on this after we submit it?
>
> Also, Hayes is now updating our current upstream driver and it goes back and forth for a while.
> So we will need some time to finish it and the target schedule to have your request done is in the end of this month.
>
> Thank you very much.
>
> Best Regards,
> Bambi Yeh
>
> -----Original Message-----
> From: Hayes Wang <hayeswang@realtek.com>
> Sent: Monday, September 2, 2019 2:31 PM
> To: Amber Chen <amber.chen@realtek.com>; Prashant Malani <pmalani@chromium.org>
> Cc: David Miller <davem@davemloft.net>; netdev@vger.kernel.org; Bambi Yeh <bambi.yeh@realtek.com>; Ryankao <ryankao@realtek.com>; Jackc <jackc@realtek.com>; Albertk <albertk@realtek.com>; marcochen@google.com; nic_swsd <nic_swsd@realtek.com>; Grant Grundler <grundler@chromium.org>
> Subject: RE: Proposal: r8152 firmware patching framework
>
> Prashant Malani <pmalani@chromium.org>
> > >
> > > (Adding a few more Realtek folks)
> > >
> > > Friendly ping. Any thoughts / feedback, Realtek folks (and others) ?
> > >
> > >> On Thu, Aug 29, 2019 at 11:40 AM Prashant Malani
> > <pmalani@chromium.org> wrote:
> > >>
> > >> Hi,
> > >>
> > >> The r8152 driver source code distributed by Realtek (on
> > >> www.realtek.com) contains firmware patches. This involves binary
> > >> byte-arrays being written byte/word-wise to the hardware memory
> > >> Example: grundler@chromium.org (cc-ed) has an experimental patch
> > which
> > >> includes the firmware patching code which was distributed with the
> > >> Realtek source :
> > >>
> > https://chromium-review.googlesource.com/c/chromiumos/third_party/kern
> > el
> > /+/1417953
> > >>
> > >> It would be nice to have a way to incorporate these firmware fixes
> > >> into the upstream code. Since having indecipherable byte-arrays is
> > >> not possible upstream, I propose the following:
> > >> - We use the assistance of Realtek to come up with a format which
> > >> the firmware patch files can follow (this can be documented in the
> > >> comments).
> > >>       - A real simple format could look like this:
> > >>               +
> > >>
> > <section1><size_in_bytes><address1><data1><address2><data2>...<address
> > N
> > ><dataN><section2>...
> > >>                + The driver would be able to understand how to
> > >> parse each section (e.g is each data entry a byte or a word?)
> > >>
> > >> - We use request_firmware() to load the firmware, parse it and
> > >> write the data to the relevant registers.
>
> I plan to finish the patches which I am going to submit, first. Then, I could focus on this. However, I don't think I would start this quickly. There are many preparations and they would take me a lot of time.
>
> Best Regards,
> Hayes
>
>

^ permalink raw reply

* Re: [PATCH] net/skbuff: silence warnings under memory pressure
From: Qian Cai @ 2019-09-03 21:42 UTC (permalink / raw)
  To: Michal Hocko; +Cc: Eric Dumazet, davem, netdev, linux-mm, linux-kernel
In-Reply-To: <20190903185305.GA14028@dhcp22.suse.cz>

On Tue, 2019-09-03 at 20:53 +0200, Michal Hocko wrote:
> On Tue 03-09-19 11:42:22, Qian Cai wrote:
> > On Tue, 2019-09-03 at 15:22 +0200, Michal Hocko wrote:
> > > On Fri 30-08-19 18:15:22, Eric Dumazet wrote:
> > > > If there is a risk of flooding the syslog, we should fix this
> > > > generically
> > > > in mm layer, not adding hundred of __GFP_NOWARN all over the places.
> > > 
> > > We do already ratelimit in warn_alloc. If it isn't sufficient then we
> > > can think of a different parameters. Or maybe it is the ratelimiting
> > > which doesn't work here. Hard to tell and something to explore.
> > 
> > The time-based ratelimit won't work for skb_build() as when a system under
> > memory pressure, and the CPU is fast and IO is so slow, it could take a long
> > time to swap and trigger OOM.
> 
> I really do not understand what does OOM and swapping have to do with
> the ratelimiting here. The sole purpose of the ratelimit is to reduce
> the amount of warnings to be printed. Slow IO might have an effect on
> when the OOM killer is invoked but atomic allocations are not directly
> dependent on IO.

When there is a heavy memory pressure, the system is trying hard to reclaim
memory to fill up the watermark. However, the IO is slow to page out, but the
memory pressure keep draining atomic reservoir, and some of those skb_build()
will fail eventually.

Only if there is a fast IO, it will finish swapping sooner and then invoke the
OOM to end the memory pressure.

> 
> > I suppose what happens is those skb_build() allocations are from softirq,
> > and
> > once one of them failed, it calls printk() which generates more interrupts.
> > Hence, the infinite loop.
> 
> Please elaborate more.
> 

If you look at the original report, the failed allocation dump_stack() is,

 <IRQ>
 warn_alloc.cold.43+0x8a/0x148
 __alloc_pages_nodemask+0x1a5c/0x1bb0
 alloc_pages_current+0x9c/0x110
 allocate_slab+0x34a/0x11f0
 new_slab+0x46/0x70
 ___slab_alloc+0x604/0x950
 __slab_alloc+0x12/0x20
 kmem_cache_alloc+0x32a/0x400
 __build_skb+0x23/0x60
 build_skb+0x1a/0xb0
 igb_clean_rx_irq+0xafc/0x1010 [igb]
 igb_poll+0x4bb/0xe30 [igb]
 net_rx_action+0x244/0x7a0
 __do_softirq+0x1a0/0x60a
 irq_exit+0xb5/0xd0
 do_IRQ+0x81/0x170
 common_interrupt+0xf/0xf
 </IRQ>

Since it has no __GFP_NOWARN to begin with, it will call,

printk
  vprintk_default
    vprintk_emit
      wake_up_klogd
        irq_work_queue
          __irq_work_queue_local
            arch_irq_work_raise
              apic->send_IPI_self(IRQ_WORK_VECTOR)
                smp_irq_work_interrupt
                  exiting_irq
                    irq_exit

and end up processing pending net_rx_action softirqs again which are plenty due
to connected via ssh etc.

^ permalink raw reply

* rtnl_lock() question
From: Jonathan Lemon @ 2019-09-03 21:55 UTC (permalink / raw)
  To: Netdev

How appropriate is it to hold the rtnl_lock() across a sleepable
memory allocation?  On one hand it's just a mutex, but it would
seem like it could block quite a few things.
-- 
Jonathan

^ permalink raw reply

* [PATCH bpf] bpf: fix precision tracking of stack slots
From: Alexei Starovoitov @ 2019-09-03 22:16 UTC (permalink / raw)
  To: davem; +Cc: daniel, netdev, bpf, kernel-team

The problem can be seen in the following two tests:
0: (bf) r3 = r10
1: (55) if r3 != 0x7b goto pc+0
2: (7a) *(u64 *)(r3 -8) = 0
3: (79) r4 = *(u64 *)(r10 -8)
..
0: (85) call bpf_get_prandom_u32#7
1: (bf) r3 = r10
2: (55) if r3 != 0x7b goto pc+0
3: (7b) *(u64 *)(r3 -8) = r0
4: (79) r4 = *(u64 *)(r10 -8)

When backtracking need to mark R4 it will mark slot fp-8.
But ST or STX into fp-8 could belong to the same block of instructions.
When backtracing is done the parent state may have fp-8 slot
as "unallocated stack". Which will cause verifier to warn
and incorrectly reject such programs.

Writes into stack via non-R10 register are rare. llvm always
generates canonical stack spill/fill.
For such pathological case fall back to conservative precision
tracking instead of rejecting.

Reported-by: syzbot+c8d66267fd2b5955287e@syzkaller.appspotmail.com
Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
tests will be submitted to bpf-next.

 kernel/bpf/verifier.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b5c14c9d7b98..c36a719fee6d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1772,16 +1772,21 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
 		bitmap_from_u64(mask, stack_mask);
 		for_each_set_bit(i, mask, 64) {
 			if (i >= func->allocated_stack / BPF_REG_SIZE) {
-				/* This can happen if backtracking
-				 * is propagating stack precision where
-				 * caller has larger stack frame
-				 * than callee, but backtrack_insn() should
-				 * have returned -ENOTSUPP.
+				/* the sequence of instructions:
+				 * 2: (bf) r3 = r10
+				 * 3: (7b) *(u64 *)(r3 -8) = r0
+				 * 4: (79) r4 = *(u64 *)(r10 -8)
+				 * doesn't contain jmps. It's backtracked
+				 * as a single block.
+				 * During backtracking insn 3 is not recognized as
+				 * stack access, so at the end of backtracking
+				 * stack slot fp-8 is still marked in stack_mask.
+				 * However the parent state may not have accessed
+				 * fp-8 and it's "unallocated" stack space.
+				 * In such case fallback to conservative.
 				 */
-				verbose(env, "BUG spi %d stack_size %d\n",
-					i, func->allocated_stack);
-				WARN_ONCE(1, "verifier backtracking bug");
-				return -EFAULT;
+				mark_all_scalars_precise(env, st);
+				return 0;
 			}
 
 			if (func->stack[i].slot_type[0] != STACK_SPILL) {
-- 
2.20.0


^ permalink raw reply related

* Re: [PATCH v2 net] net: Properly update v4 routes with v6 nexthop
From: David Ahern @ 2019-09-03 22:17 UTC (permalink / raw)
  To: Donald Sharp, netdev, dsahern, sworley
In-Reply-To: <20190831122254.29928-1-sharpd@cumulusnetworks.com>

On 8/31/19 6:22 AM, Donald Sharp wrote:
> @@ -1684,7 +1684,8 @@ EXPORT_SYMBOL_GPL(fib_add_nexthop);
>  #endif
>  
>  #ifdef CONFIG_IP_ROUTE_MULTIPATH
> -static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
> +static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi,
> +			     u8 rt_family)

The fib_info argument makes this an IPv4 only function, so the rt_family
is extraneous. Remove it here and the #else path below and use AF_INET
for the 2 calls below.

>  {
>  	struct nlattr *mp;
>  
> @@ -1693,13 +1694,14 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
>  		goto nla_put_failure;
>  
>  	if (unlikely(fi->nh)) {
> -		if (nexthop_mpath_fill_node(skb, fi->nh) < 0)
> +		if (nexthop_mpath_fill_node(skb, fi->nh, rt_family) < 0)
>  			goto nla_put_failure;
>  		goto mp_end;
>  	}
>  
>  	for_nexthops(fi) {
> -		if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight) < 0)
> +		if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight,
> +				    rt_family) < 0)
>  			goto nla_put_failure;
>  #ifdef CONFIG_IP_ROUTE_CLASSID
>  		if (nh->nh_tclassid &&
> @@ -1717,7 +1719,8 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
>  	return -EMSGSIZE;
>  }
>  #else
> -static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
> +static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi,
> +			     u8 family)
>  {
>  	return 0;
>  }

^ permalink raw reply

* [PATCH net 1/2] ipv6: Fix RTA_MULTIPATH with nexthop objects
From: David Ahern @ 2019-09-03 22:22 UTC (permalink / raw)
  To: davem; +Cc: netdev, sharpd, David Ahern
In-Reply-To: <20190903222213.7029-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

A change to the core nla helpers was missed during the push of
the nexthop changes. rt6_fill_node_nexthop should be calling
nla_nest_start_noflag not nla_nest_start. Currently, iproute2
does not print multipath data because of parsing issues with
the attribute.

Fixes: f88d8ea67fbd ("ipv6: Plumb support for nexthop object in a fib6_info")
Signed-off-by: David Ahern <dsahern@gmail.com>
---
 net/ipv6/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 06f2e2d05785..b9b5be1aafff 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -5323,7 +5323,7 @@ static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
 	if (nexthop_is_multipath(nh)) {
 		struct nlattr *mp;
 
-		mp = nla_nest_start(skb, RTA_MULTIPATH);
+		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
 		if (!mp)
 			goto nla_put_failure;
 
-- 
2.11.0


^ permalink raw reply related

* [PATCH net 0/2] nexthops: Fix multipath notifications for IPv6 and selftests
From: David Ahern @ 2019-09-03 22:22 UTC (permalink / raw)
  To: davem; +Cc: netdev, sharpd, David Ahern

From: David Ahern <dsahern@gmail.com>

A couple of bug fixes noticed while testing Donald's patch.

David Ahern (2):
  ipv6: Fix RTA_MULTIPATH with nexthop objects
  selftest: A few cleanups for fib_nexthops.sh

 net/ipv6/route.c                            |  2 +-
 tools/testing/selftests/net/fib_nexthops.sh | 24 +++++++++++++-----------
 2 files changed, 14 insertions(+), 12 deletions(-)

-- 
2.11.0


^ permalink raw reply

* [PATCH net 2/2] selftest: A few cleanups for fib_nexthops.sh
From: David Ahern @ 2019-09-03 22:22 UTC (permalink / raw)
  To: davem; +Cc: netdev, sharpd, David Ahern
In-Reply-To: <20190903222213.7029-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Cleanups of the tests in fib_nexthops.sh
1. Several tests noted unexpected route output, but the
   discrepancy was not showing in the summary output and
   overlooked in the verbose output. Add a WARNING message
   to the summary output to make it clear a test is not showing
   expected output.

2. Several check_* calls are missing extra data like scope and metric
   causing mismatches when the nexthops or routes are correct - some of
   them are a side effect of the evolving iproute2 command. Update the
   data to the expected output.

3. Several check_routes are checking for the wrong nexthop data,
   most likely a copy-paste-update error.

4. A couple of tests were re-using a nexthop id that already existed.
   Fix those to use a new id.

Fixes: 6345266a9989 ("selftests: Add test cases for nexthop objects")
Signed-off-by: David Ahern <dsahern@gmail.com>
---
 tools/testing/selftests/net/fib_nexthops.sh | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh
index c5c93d5fb3ad..f9ebeac1e6f2 100755
--- a/tools/testing/selftests/net/fib_nexthops.sh
+++ b/tools/testing/selftests/net/fib_nexthops.sh
@@ -212,6 +212,8 @@ check_output()
 			printf "        ${out}\n"
 			printf "    Expected:\n"
 			printf "        ${expected}\n\n"
+		else
+			echo "      WARNING: Unexpected route entry"
 		fi
 	fi
 
@@ -274,7 +276,7 @@ ipv6_fcnal()
 
 	run_cmd "$IP nexthop get id 52"
 	log_test $? 0 "Get nexthop by id"
-	check_nexthop "id 52" "id 52 via 2001:db8:91::2 dev veth1"
+	check_nexthop "id 52" "id 52 via 2001:db8:91::2 dev veth1 scope link"
 
 	run_cmd "$IP nexthop del id 52"
 	log_test $? 0 "Delete nexthop by id"
@@ -479,12 +481,12 @@ ipv6_fcnal_runtime()
 	run_cmd "$IP -6 nexthop add id 85 dev veth1"
 	run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 85"
 	log_test $? 0 "IPv6 route with device only nexthop"
-	check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 85 dev veth1"
+	check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 85 dev veth1 metric 1024 pref medium"
 
 	run_cmd "$IP nexthop add id 123 group 81/85"
 	run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 123"
 	log_test $? 0 "IPv6 multipath route with nexthop mix - dev only + gw"
-	check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 85 nexthop via 2001:db8:91::2 dev veth1 nexthop dev veth1"
+	check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 123 metric 1024 nexthop via 2001:db8:91::2 dev veth1 weight 1 nexthop dev veth1 weight 1 pref medium"
 
 	#
 	# IPv6 route with v4 nexthop - not allowed
@@ -538,7 +540,7 @@ ipv4_fcnal()
 
 	run_cmd "$IP nexthop get id 12"
 	log_test $? 0 "Get nexthop by id"
-	check_nexthop "id 12" "id 12 via 172.16.1.2 src 172.16.1.1 dev veth1 scope link"
+	check_nexthop "id 12" "id 12 via 172.16.1.2 dev veth1 scope link"
 
 	run_cmd "$IP nexthop del id 12"
 	log_test $? 0 "Delete nexthop by id"
@@ -685,7 +687,7 @@ ipv4_withv6_fcnal()
 	set +e
 	run_cmd "$IP ro add 172.16.101.1/32 nhid 11"
 	log_test $? 0 "IPv6 nexthop with IPv4 route"
-	check_route "172.16.101.1" "172.16.101.1 nhid 11 via ${lladdr} dev veth1"
+	check_route "172.16.101.1" "172.16.101.1 nhid 11 via inet6 ${lladdr} dev veth1"
 
 	set -e
 	run_cmd "$IP nexthop add id 12 via 172.16.1.2 dev veth1"
@@ -694,11 +696,11 @@ ipv4_withv6_fcnal()
 	run_cmd "$IP ro replace 172.16.101.1/32 nhid 101"
 	log_test $? 0 "IPv6 nexthop with IPv4 route"
 
-	check_route "172.16.101.1" "172.16.101.1 nhid 101 nexthop via ${lladdr} dev veth1 weight 1 nexthop via 172.16.1.2 dev veth1 weight 1"
+	check_route "172.16.101.1" "172.16.101.1 nhid 101 nexthop via inet6 ${lladdr} dev veth1 weight 1 nexthop via 172.16.1.2 dev veth1 weight 1"
 
 	run_cmd "$IP ro replace 172.16.101.1/32 via inet6 ${lladdr} dev veth1"
 	log_test $? 0 "IPv4 route with IPv6 gateway"
-	check_route "172.16.101.1" "172.16.101.1 via ${lladdr} dev veth1"
+	check_route "172.16.101.1" "172.16.101.1 via inet6 ${lladdr} dev veth1"
 
 	run_cmd "$IP ro replace 172.16.101.1/32 via inet6 2001:db8:50::1 dev veth1"
 	log_test $? 2 "IPv4 route with invalid IPv6 gateway"
@@ -785,10 +787,10 @@ ipv4_fcnal_runtime()
 	log_test $? 0 "IPv4 route with device only nexthop"
 	check_route "172.16.101.1" "172.16.101.1 nhid 85 dev veth1"
 
-	run_cmd "$IP nexthop add id 122 group 21/85"
-	run_cmd "$IP ro replace 172.16.101.1/32 nhid 122"
+	run_cmd "$IP nexthop add id 123 group 21/85"
+	run_cmd "$IP ro replace 172.16.101.1/32 nhid 123"
 	log_test $? 0 "IPv4 multipath route with nexthop mix - dev only + gw"
-	check_route "172.16.101.1" "172.16.101.1 nhid 85 nexthop via 172.16.1.2 dev veth1 nexthop dev veth1"
+	check_route "172.16.101.1" "172.16.101.1 nhid 123 nexthop via 172.16.1.2 dev veth1 weight 1 nexthop dev veth1 weight 1"
 
 	#
 	# IPv4 with IPv6
@@ -820,7 +822,7 @@ ipv4_fcnal_runtime()
 	run_cmd "$IP ro replace 172.16.101.1/32 nhid 101"
 	log_test $? 0 "IPv4 route with mixed v4-v6 multipath route"
 
-	check_route "172.16.101.1" "172.16.101.1 nhid 101 nexthop via ${lladdr} dev veth1 weight 1 nexthop via 172.16.1.2 dev veth1 weight 1"
+	check_route "172.16.101.1" "172.16.101.1 nhid 101 nexthop via inet6 ${lladdr} dev veth1 weight 1 nexthop via 172.16.1.2 dev veth1 weight 1"
 
 	run_cmd "ip netns exec me ping -c1 -w1 172.16.101.1"
 	log_test $? 0 "IPv6 nexthop with IPv4 route"
-- 
2.11.0


^ permalink raw reply related

* [PATCH v2] net: fixed_phy: Add forward declaration for struct gpio_desc;
From: Moritz Fischer @ 2019-09-03 22:23 UTC (permalink / raw)
  To: linux-kernel; +Cc: netdev, hkallweit1, davem, Moritz Fischer, Florian Fainelli

From: Moritz Fischer <mdf@kernel.org>

Add forward declaration for struct gpio_desc in order to address
the following:

./include/linux/phy_fixed.h:48:17: error: 'struct gpio_desc' declared inside parameter list [-Werror]
./include/linux/phy_fixed.h:48:17: error: its scope is only this definition or declaration, which is probably not what you want [-Werror]

Fixes: 71bd106d2567 ("net: fixed-phy: Add
fixed_phy_register_with_gpiod() API")
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Moritz Fischer <mdf@kernel.org>
---
 include/linux/phy_fixed.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h
index 1e5d86ebdaeb..52bc8e487ef7 100644
--- a/include/linux/phy_fixed.h
+++ b/include/linux/phy_fixed.h
@@ -11,6 +11,7 @@ struct fixed_phy_status {
 };
 
 struct device_node;
+struct gpio_desc;
 
 #if IS_ENABLED(CONFIG_FIXED_PHY)
 extern int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier);
-- 
2.23.0.187.g17f5b7556c-goog


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox