Linux RDMA and InfiniBand development
 help / color / mirror / Atom feed
* [PATCH for-next V2 11/15] net/mlx5: Refactor find_flow_rule
From: Saeed Mahameed @ 2016-10-30 21:22 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	Or Gerlitz, Leon Romanovsky, Tal Alon, Matan Barak, Mark Bloch,
	Saeed Mahameed, Leon Romanovsky
In-Reply-To: <1477862528-4328-1-git-send-email-saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

From: Mark Bloch <markb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

The way we compare between two dests will need to be used in other
places in the future, so we factor out the comparison logic
between two dests into a separate function.

Signed-off-by: Mark Bloch <markb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Saeed Mahameed <saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Leon Romanovsky <leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 29 ++++++++++++++++-------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index e2bab9d..fca6937 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -153,6 +153,8 @@ static void del_rule(struct fs_node *node);
 static void del_flow_table(struct fs_node *node);
 static void del_flow_group(struct fs_node *node);
 static void del_fte(struct fs_node *node);
+static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1,
+				struct mlx5_flow_destination *d2);
 
 static void tree_init_node(struct fs_node *node,
 			   unsigned int refcount,
@@ -1064,21 +1066,30 @@ static struct mlx5_flow_group *create_autogroup(struct mlx5_flow_table *ft,
 	return fg;
 }
 
+static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1,
+				struct mlx5_flow_destination *d2)
+{
+	if (d1->type == d2->type) {
+		if ((d1->type == MLX5_FLOW_DESTINATION_TYPE_VPORT &&
+		     d1->vport_num == d2->vport_num) ||
+		    (d1->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE &&
+		     d1->ft == d2->ft) ||
+		    (d1->type == MLX5_FLOW_DESTINATION_TYPE_TIR &&
+		     d1->tir_num == d2->tir_num))
+			return true;
+	}
+
+	return false;
+}
+
 static struct mlx5_flow_rule *find_flow_rule(struct fs_fte *fte,
 					     struct mlx5_flow_destination *dest)
 {
 	struct mlx5_flow_rule *rule;
 
 	list_for_each_entry(rule, &fte->node.children, node.list) {
-		if (rule->dest_attr.type == dest->type) {
-			if ((dest->type == MLX5_FLOW_DESTINATION_TYPE_VPORT &&
-			     dest->vport_num == rule->dest_attr.vport_num) ||
-			    (dest->type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE &&
-			     dest->ft == rule->dest_attr.ft) ||
-			    (dest->type == MLX5_FLOW_DESTINATION_TYPE_TIR &&
-			     dest->tir_num == rule->dest_attr.tir_num))
-				return rule;
-		}
+		if (mlx5_flow_dests_cmp(&rule->dest_attr, dest))
+			return rule;
 	}
 	return NULL;
 }
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH for-next V2 12/15] net/mlx5: Group similer rules under the same fte
From: Saeed Mahameed @ 2016-10-30 21:22 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	Or Gerlitz, Leon Romanovsky, Tal Alon, Matan Barak, Mark Bloch,
	Saeed Mahameed, Leon Romanovsky
In-Reply-To: <1477862528-4328-1-git-send-email-saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

From: Mark Bloch <markb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

When adding a new rule, if we can match it with compare_match_value and
flow tag we might be able to insert the rule to the same fte.
In order to do that, there must be an overlap between the actions of the
fte and the new rule.

When updating the action of an existing fte, we must tell the firmware
we are doing so.

Signed-off-by: Mark Bloch <markb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Saeed Mahameed <saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Leon Romanovsky <leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index fca6937..43d7052 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -920,7 +920,8 @@ static struct mlx5_flow_rule *alloc_rule(struct mlx5_flow_destination *dest)
 /* fte should not be deleted while calling this function */
 static struct mlx5_flow_rule *add_rule_fte(struct fs_fte *fte,
 					   struct mlx5_flow_group *fg,
-					   struct mlx5_flow_destination *dest)
+					   struct mlx5_flow_destination *dest,
+					   bool update_action)
 {
 	struct mlx5_flow_table *ft;
 	struct mlx5_flow_rule *rule;
@@ -931,6 +932,9 @@ static struct mlx5_flow_rule *add_rule_fte(struct fs_fte *fte,
 	if (!rule)
 		return ERR_PTR(-ENOMEM);
 
+	if (update_action)
+		modify_mask |= BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION);
+
 	fs_get_obj(ft, fg->node.parent);
 	/* Add dest to dests list- we need flow tables to be in the
 	 * end of the list for forward to next prio rules.
@@ -1109,7 +1113,9 @@ static struct mlx5_flow_rule *add_rule_fg(struct mlx5_flow_group *fg,
 	fs_for_each_fte(fte, fg) {
 		nested_lock_ref_node(&fte->node, FS_MUTEX_CHILD);
 		if (compare_match_value(&fg->mask, match_value, &fte->val) &&
-		    action == fte->action && flow_tag == fte->flow_tag) {
+		    (action & fte->action) && flow_tag == fte->flow_tag) {
+			int old_action = fte->action;
+
 			rule = find_flow_rule(fte, dest);
 			if (rule) {
 				atomic_inc(&rule->node.refcount);
@@ -1117,11 +1123,15 @@ static struct mlx5_flow_rule *add_rule_fg(struct mlx5_flow_group *fg,
 				unlock_ref_node(&fg->node);
 				return rule;
 			}
-			rule = add_rule_fte(fte, fg, dest);
-			if (IS_ERR(rule))
+			fte->action |= action;
+			rule = add_rule_fte(fte, fg, dest,
+					    old_action != action);
+			if (IS_ERR(rule)) {
+				fte->action = old_action;
 				goto unlock_fte;
-			else
+			} else {
 				goto add_rule;
+			}
 		}
 		unlock_ref_node(&fte->node);
 	}
@@ -1138,7 +1148,7 @@ static struct mlx5_flow_rule *add_rule_fg(struct mlx5_flow_group *fg,
 	}
 	tree_init_node(&fte->node, 0, del_fte);
 	nested_lock_ref_node(&fte->node, FS_MUTEX_CHILD);
-	rule = add_rule_fte(fte, fg, dest);
+	rule = add_rule_fte(fte, fg, dest, false);
 	if (IS_ERR(rule)) {
 		kfree(fte);
 		goto unlock_fg;
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH for-next V2 13/15] net/mlx5: Add multi dest support
From: Saeed Mahameed @ 2016-10-30 21:22 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	Or Gerlitz, Leon Romanovsky, Tal Alon, Matan Barak, Mark Bloch,
	Saeed Mahameed, Leon Romanovsky
In-Reply-To: <1477862528-4328-1-git-send-email-saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

From: Mark Bloch <markb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

Currently when calling mlx5_add_flow_rule we accept
only one flow destination, this commit allows to pass
multiple destinations.

This change forces us to change the return structure to a more
flexible one. We introduce a flow handle (struct mlx5_flow_handle),
it holds internally the number for rules created and holds an array
where each cell points the to a flow rule.

>From the consumers (of mlx5_add_flow_rule) point of view this
change is only cosmetic and requires only to change the type
of the returned value they store.

>From the core point of view, we now need to use a loop when
allocating and deleting rules (e.g given to us a flow handler).

Signed-off-by: Mark Bloch <markb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Saeed Mahameed <saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Leon Romanovsky <leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
---
 drivers/infiniband/hw/mlx5/main.c                  |  14 +-
 drivers/infiniband/hw/mlx5/mlx5_ib.h               |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  14 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c  |  38 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_fs.c    |  49 ++--
 .../ethernet/mellanox/mlx5/core/en_fs_ethtool.c    |  19 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   |   6 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |  32 +--
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |  68 ++---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |  22 +-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c |  42 +--
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  | 289 ++++++++++++++-------
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h  |   5 +
 include/linux/mlx5/fs.h                            |  28 +-
 14 files changed, 374 insertions(+), 254 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index d02341e..8e0dbd5 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1771,13 +1771,13 @@ static int mlx5_ib_destroy_flow(struct ib_flow *flow_id)
 	mutex_lock(&dev->flow_db.lock);
 
 	list_for_each_entry_safe(iter, tmp, &handler->list, list) {
-		mlx5_del_flow_rule(iter->rule);
+		mlx5_del_flow_rules(iter->rule);
 		put_flow_table(dev, iter->prio, true);
 		list_del(&iter->list);
 		kfree(iter);
 	}
 
-	mlx5_del_flow_rule(handler->rule);
+	mlx5_del_flow_rules(handler->rule);
 	put_flow_table(dev, handler->prio, true);
 	mutex_unlock(&dev->flow_db.lock);
 
@@ -1907,10 +1907,10 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
 	spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria);
 	action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST :
 		MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
-	handler->rule = mlx5_add_flow_rule(ft, spec,
+	handler->rule = mlx5_add_flow_rules(ft, spec,
 					   action,
 					   MLX5_FS_DEFAULT_FLOW_TAG,
-					   dst);
+					   dst, 1);
 
 	if (IS_ERR(handler->rule)) {
 		err = PTR_ERR(handler->rule);
@@ -1941,7 +1941,7 @@ static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *de
 		handler_dst = create_flow_rule(dev, ft_prio,
 					       flow_attr, dst);
 		if (IS_ERR(handler_dst)) {
-			mlx5_del_flow_rule(handler->rule);
+			mlx5_del_flow_rules(handler->rule);
 			ft_prio->refcount--;
 			kfree(handler);
 			handler = handler_dst;
@@ -2004,7 +2004,7 @@ static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *de
 						 &leftovers_specs[LEFTOVERS_UC].flow_attr,
 						 dst);
 		if (IS_ERR(handler_ucast)) {
-			mlx5_del_flow_rule(handler->rule);
+			mlx5_del_flow_rules(handler->rule);
 			ft_prio->refcount--;
 			kfree(handler);
 			handler = handler_ucast;
@@ -2046,7 +2046,7 @@ static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev,
 	return handler_rx;
 
 err_tx:
-	mlx5_del_flow_rule(handler_rx->rule);
+	mlx5_del_flow_rules(handler_rx->rule);
 	ft_rx->refcount--;
 	kfree(handler_rx);
 err:
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index dcdcd19..d5d0077 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -153,7 +153,7 @@ struct mlx5_ib_flow_handler {
 	struct list_head		list;
 	struct ib_flow			ibflow;
 	struct mlx5_ib_flow_prio	*prio;
-	struct mlx5_flow_rule	*rule;
+	struct mlx5_flow_handle		*rule;
 };
 
 struct mlx5_ib_flow_db {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 460363b..47ee8ff 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -520,7 +520,7 @@ struct mlx5e_vxlan_db {
 
 struct mlx5e_l2_rule {
 	u8  addr[ETH_ALEN + 2];
-	struct mlx5_flow_rule *rule;
+	struct mlx5_flow_handle *rule;
 };
 
 struct mlx5e_flow_table {
@@ -541,10 +541,10 @@ struct mlx5e_tc_table {
 struct mlx5e_vlan_table {
 	struct mlx5e_flow_table		ft;
 	unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
-	struct mlx5_flow_rule	*active_vlans_rule[VLAN_N_VID];
-	struct mlx5_flow_rule	*untagged_rule;
-	struct mlx5_flow_rule	*any_vlan_rule;
-	bool          filter_disabled;
+	struct mlx5_flow_handle	*active_vlans_rule[VLAN_N_VID];
+	struct mlx5_flow_handle	*untagged_rule;
+	struct mlx5_flow_handle	*any_vlan_rule;
+	bool		filter_disabled;
 };
 
 struct mlx5e_l2_table {
@@ -562,14 +562,14 @@ struct mlx5e_l2_table {
 /* L3/L4 traffic type classifier */
 struct mlx5e_ttc_table {
 	struct mlx5e_flow_table  ft;
-	struct mlx5_flow_rule	 *rules[MLX5E_NUM_TT];
+	struct mlx5_flow_handle	 *rules[MLX5E_NUM_TT];
 };
 
 #define ARFS_HASH_SHIFT BITS_PER_BYTE
 #define ARFS_HASH_SIZE BIT(BITS_PER_BYTE)
 struct arfs_table {
 	struct mlx5e_flow_table  ft;
-	struct mlx5_flow_rule    *default_rule;
+	struct mlx5_flow_handle	 *default_rule;
 	struct hlist_head	 rules_hash[ARFS_HASH_SIZE];
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
index a8cb387..8ff22e8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
@@ -56,7 +56,7 @@ struct arfs_tuple {
 struct arfs_rule {
 	struct mlx5e_priv	*priv;
 	struct work_struct      arfs_work;
-	struct mlx5_flow_rule   *rule;
+	struct mlx5_flow_handle *rule;
 	struct hlist_node	hlist;
 	int			rxq;
 	/* Flow ID passed to ndo_rx_flow_steer */
@@ -104,7 +104,7 @@ static int arfs_disable(struct mlx5e_priv *priv)
 		tt = arfs_get_tt(i);
 		/* Modify ttc rules destination to bypass the aRFS tables*/
 		err = mlx5_modify_rule_destination(priv->fs.ttc.rules[tt],
-						   &dest);
+						   &dest, NULL);
 		if (err) {
 			netdev_err(priv->netdev,
 				   "%s: modify ttc destination failed\n",
@@ -137,7 +137,7 @@ int mlx5e_arfs_enable(struct mlx5e_priv *priv)
 		tt = arfs_get_tt(i);
 		/* Modify ttc rules destination to point on the aRFS FTs */
 		err = mlx5_modify_rule_destination(priv->fs.ttc.rules[tt],
-						   &dest);
+						   &dest, NULL);
 		if (err) {
 			netdev_err(priv->netdev,
 				   "%s: modify ttc destination failed err=%d\n",
@@ -151,7 +151,7 @@ int mlx5e_arfs_enable(struct mlx5e_priv *priv)
 
 static void arfs_destroy_table(struct arfs_table *arfs_t)
 {
-	mlx5_del_flow_rule(arfs_t->default_rule);
+	mlx5_del_flow_rules(arfs_t->default_rule);
 	mlx5e_destroy_flow_table(&arfs_t->ft);
 }
 
@@ -205,10 +205,10 @@ static int arfs_add_default_rule(struct mlx5e_priv *priv,
 		goto out;
 	}
 
-	arfs_t->default_rule = mlx5_add_flow_rule(arfs_t->ft.t, spec,
-						  MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-						  MLX5_FS_DEFAULT_FLOW_TAG,
-						  &dest);
+	arfs_t->default_rule = mlx5_add_flow_rules(arfs_t->ft.t, spec,
+						   MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+						   MLX5_FS_DEFAULT_FLOW_TAG,
+						   &dest, 1);
 	if (IS_ERR(arfs_t->default_rule)) {
 		err = PTR_ERR(arfs_t->default_rule);
 		arfs_t->default_rule = NULL;
@@ -396,7 +396,7 @@ static void arfs_may_expire_flow(struct mlx5e_priv *priv)
 	spin_unlock_bh(&priv->fs.arfs.arfs_lock);
 	hlist_for_each_entry_safe(arfs_rule, htmp, &del_list, hlist) {
 		if (arfs_rule->rule)
-			mlx5_del_flow_rule(arfs_rule->rule);
+			mlx5_del_flow_rules(arfs_rule->rule);
 		hlist_del(&arfs_rule->hlist);
 		kfree(arfs_rule);
 	}
@@ -420,7 +420,7 @@ static void arfs_del_rules(struct mlx5e_priv *priv)
 	hlist_for_each_entry_safe(rule, htmp, &del_list, hlist) {
 		cancel_work_sync(&rule->arfs_work);
 		if (rule->rule)
-			mlx5_del_flow_rule(rule->rule);
+			mlx5_del_flow_rules(rule->rule);
 		hlist_del(&rule->hlist);
 		kfree(rule);
 	}
@@ -462,12 +462,12 @@ static struct arfs_table *arfs_get_table(struct mlx5e_arfs_tables *arfs,
 	return NULL;
 }
 
-static struct mlx5_flow_rule *arfs_add_rule(struct mlx5e_priv *priv,
-					    struct arfs_rule *arfs_rule)
+static struct mlx5_flow_handle *arfs_add_rule(struct mlx5e_priv *priv,
+					      struct arfs_rule *arfs_rule)
 {
 	struct mlx5e_arfs_tables *arfs = &priv->fs.arfs;
 	struct arfs_tuple *tuple = &arfs_rule->tuple;
-	struct mlx5_flow_rule *rule = NULL;
+	struct mlx5_flow_handle *rule = NULL;
 	struct mlx5_flow_destination dest;
 	struct arfs_table *arfs_table;
 	struct mlx5_flow_spec *spec;
@@ -544,9 +544,9 @@ static struct mlx5_flow_rule *arfs_add_rule(struct mlx5e_priv *priv,
 	}
 	dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR;
 	dest.tir_num = priv->direct_tir[arfs_rule->rxq].tirn;
-	rule = mlx5_add_flow_rule(ft, spec, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-				  MLX5_FS_DEFAULT_FLOW_TAG,
-				  &dest);
+	rule = mlx5_add_flow_rules(ft, spec, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+				   MLX5_FS_DEFAULT_FLOW_TAG,
+				   &dest, 1);
 	if (IS_ERR(rule)) {
 		err = PTR_ERR(rule);
 		netdev_err(priv->netdev, "%s: add rule(filter id=%d, rq idx=%d) failed, err=%d\n",
@@ -559,14 +559,14 @@ static struct mlx5_flow_rule *arfs_add_rule(struct mlx5e_priv *priv,
 }
 
 static void arfs_modify_rule_rq(struct mlx5e_priv *priv,
-				struct mlx5_flow_rule *rule, u16 rxq)
+				struct mlx5_flow_handle *rule, u16 rxq)
 {
 	struct mlx5_flow_destination dst;
 	int err = 0;
 
 	dst.type = MLX5_FLOW_DESTINATION_TYPE_TIR;
 	dst.tir_num = priv->direct_tir[rxq].tirn;
-	err =  mlx5_modify_rule_destination(rule, &dst);
+	err =  mlx5_modify_rule_destination(rule, &dst, NULL);
 	if (err)
 		netdev_warn(priv->netdev,
 			    "Failed to modfiy aRFS rule destination to rq=%d\n", rxq);
@@ -578,7 +578,7 @@ static void arfs_handle_work(struct work_struct *work)
 						   struct arfs_rule,
 						   arfs_work);
 	struct mlx5e_priv *priv = arfs_rule->priv;
-	struct mlx5_flow_rule *rule;
+	struct mlx5_flow_handle *rule;
 
 	mutex_lock(&priv->state_lock);
 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
index 36fbc6b..bed544d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
@@ -160,7 +160,7 @@ static int __mlx5e_add_vlan_rule(struct mlx5e_priv *priv,
 {
 	struct mlx5_flow_table *ft = priv->fs.vlan.ft.t;
 	struct mlx5_flow_destination dest;
-	struct mlx5_flow_rule **rule_p;
+	struct mlx5_flow_handle **rule_p;
 	int err = 0;
 
 	dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
@@ -187,10 +187,10 @@ static int __mlx5e_add_vlan_rule(struct mlx5e_priv *priv,
 		break;
 	}
 
-	*rule_p = mlx5_add_flow_rule(ft, spec,
-				     MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-				     MLX5_FS_DEFAULT_FLOW_TAG,
-				     &dest);
+	*rule_p = mlx5_add_flow_rules(ft, spec,
+				      MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+				      MLX5_FS_DEFAULT_FLOW_TAG,
+				      &dest, 1);
 
 	if (IS_ERR(*rule_p)) {
 		err = PTR_ERR(*rule_p);
@@ -229,20 +229,20 @@ static void mlx5e_del_vlan_rule(struct mlx5e_priv *priv,
 	switch (rule_type) {
 	case MLX5E_VLAN_RULE_TYPE_UNTAGGED:
 		if (priv->fs.vlan.untagged_rule) {
-			mlx5_del_flow_rule(priv->fs.vlan.untagged_rule);
+			mlx5_del_flow_rules(priv->fs.vlan.untagged_rule);
 			priv->fs.vlan.untagged_rule = NULL;
 		}
 		break;
 	case MLX5E_VLAN_RULE_TYPE_ANY_VID:
 		if (priv->fs.vlan.any_vlan_rule) {
-			mlx5_del_flow_rule(priv->fs.vlan.any_vlan_rule);
+			mlx5_del_flow_rules(priv->fs.vlan.any_vlan_rule);
 			priv->fs.vlan.any_vlan_rule = NULL;
 		}
 		break;
 	case MLX5E_VLAN_RULE_TYPE_MATCH_VID:
 		mlx5e_vport_context_update_vlans(priv);
 		if (priv->fs.vlan.active_vlans_rule[vid]) {
-			mlx5_del_flow_rule(priv->fs.vlan.active_vlans_rule[vid]);
+			mlx5_del_flow_rules(priv->fs.vlan.active_vlans_rule[vid]);
 			priv->fs.vlan.active_vlans_rule[vid] = NULL;
 		}
 		mlx5e_vport_context_update_vlans(priv);
@@ -560,7 +560,7 @@ static void mlx5e_cleanup_ttc_rules(struct mlx5e_ttc_table *ttc)
 
 	for (i = 0; i < MLX5E_NUM_TT; i++) {
 		if (!IS_ERR_OR_NULL(ttc->rules[i])) {
-			mlx5_del_flow_rule(ttc->rules[i]);
+			mlx5_del_flow_rules(ttc->rules[i]);
 			ttc->rules[i] = NULL;
 		}
 	}
@@ -616,13 +616,14 @@ static struct {
 	},
 };
 
-static struct mlx5_flow_rule *mlx5e_generate_ttc_rule(struct mlx5e_priv *priv,
-						      struct mlx5_flow_table *ft,
-						      struct mlx5_flow_destination *dest,
-						      u16 etype,
-						      u8 proto)
+static struct mlx5_flow_handle *
+mlx5e_generate_ttc_rule(struct mlx5e_priv *priv,
+			struct mlx5_flow_table *ft,
+			struct mlx5_flow_destination *dest,
+			u16 etype,
+			u8 proto)
 {
-	struct mlx5_flow_rule *rule;
+	struct mlx5_flow_handle *rule;
 	struct mlx5_flow_spec *spec;
 	int err = 0;
 
@@ -643,10 +644,10 @@ static struct mlx5_flow_rule *mlx5e_generate_ttc_rule(struct mlx5e_priv *priv,
 		MLX5_SET(fte_match_param, spec->match_value, outer_headers.ethertype, etype);
 	}
 
-	rule = mlx5_add_flow_rule(ft, spec,
-				  MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-				  MLX5_FS_DEFAULT_FLOW_TAG,
-				  dest);
+	rule = mlx5_add_flow_rules(ft, spec,
+				   MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+				   MLX5_FS_DEFAULT_FLOW_TAG,
+				   dest, 1);
 	if (IS_ERR(rule)) {
 		err = PTR_ERR(rule);
 		netdev_err(priv->netdev, "%s: add rule failed\n", __func__);
@@ -660,7 +661,7 @@ static int mlx5e_generate_ttc_table_rules(struct mlx5e_priv *priv)
 {
 	struct mlx5_flow_destination dest;
 	struct mlx5e_ttc_table *ttc;
-	struct mlx5_flow_rule **rules;
+	struct mlx5_flow_handle **rules;
 	struct mlx5_flow_table *ft;
 	int tt;
 	int err;
@@ -801,7 +802,7 @@ static void mlx5e_del_l2_flow_rule(struct mlx5e_priv *priv,
 				   struct mlx5e_l2_rule *ai)
 {
 	if (!IS_ERR_OR_NULL(ai->rule)) {
-		mlx5_del_flow_rule(ai->rule);
+		mlx5_del_flow_rules(ai->rule);
 		ai->rule = NULL;
 	}
 }
@@ -847,9 +848,9 @@ static int mlx5e_add_l2_flow_rule(struct mlx5e_priv *priv,
 		break;
 	}
 
-	ai->rule = mlx5_add_flow_rule(ft, spec,
-				      MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-				      MLX5_FS_DEFAULT_FLOW_TAG, &dest);
+	ai->rule = mlx5_add_flow_rules(ft, spec,
+				       MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+				       MLX5_FS_DEFAULT_FLOW_TAG, &dest, 1);
 	if (IS_ERR(ai->rule)) {
 		netdev_err(priv->netdev, "%s: add l2 rule(mac:%pM) failed\n",
 			   __func__, mv_dmac);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
index d17c242..cf52c06 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
@@ -36,7 +36,7 @@
 struct mlx5e_ethtool_rule {
 	struct list_head             list;
 	struct ethtool_rx_flow_spec  flow_spec;
-	struct mlx5_flow_rule        *rule;
+	struct mlx5_flow_handle	     *rule;
 	struct mlx5e_ethtool_table   *eth_ft;
 };
 
@@ -284,13 +284,14 @@ static bool outer_header_zero(u32 *match_criteria)
 						  size - 1);
 }
 
-static struct mlx5_flow_rule *add_ethtool_flow_rule(struct mlx5e_priv *priv,
-						    struct mlx5_flow_table *ft,
-						    struct ethtool_rx_flow_spec *fs)
+static struct mlx5_flow_handle *
+add_ethtool_flow_rule(struct mlx5e_priv *priv,
+		      struct mlx5_flow_table *ft,
+		      struct ethtool_rx_flow_spec *fs)
 {
 	struct mlx5_flow_destination *dst = NULL;
 	struct mlx5_flow_spec *spec;
-	struct mlx5_flow_rule *rule;
+	struct mlx5_flow_handle *rule;
 	int err = 0;
 	u32 action;
 
@@ -317,8 +318,8 @@ static struct mlx5_flow_rule *add_ethtool_flow_rule(struct mlx5e_priv *priv,
 	}
 
 	spec->match_criteria_enable = (!outer_header_zero(spec->match_criteria));
-	rule = mlx5_add_flow_rule(ft, spec, action,
-				  MLX5_FS_DEFAULT_FLOW_TAG, dst);
+	rule = mlx5_add_flow_rules(ft, spec, action,
+				   MLX5_FS_DEFAULT_FLOW_TAG, dst, 1);
 	if (IS_ERR(rule)) {
 		err = PTR_ERR(rule);
 		netdev_err(priv->netdev, "%s: failed to add ethtool steering rule: %d\n",
@@ -335,7 +336,7 @@ static void del_ethtool_rule(struct mlx5e_priv *priv,
 			     struct mlx5e_ethtool_rule *eth_rule)
 {
 	if (eth_rule->rule)
-		mlx5_del_flow_rule(eth_rule->rule);
+		mlx5_del_flow_rules(eth_rule->rule);
 	list_del(&eth_rule->list);
 	priv->fs.ethtool.tot_num_rules--;
 	put_flow_table(eth_rule->eth_ft);
@@ -475,7 +476,7 @@ int mlx5e_ethtool_flow_replace(struct mlx5e_priv *priv,
 {
 	struct mlx5e_ethtool_table *eth_ft;
 	struct mlx5e_ethtool_rule *eth_rule;
-	struct mlx5_flow_rule *rule;
+	struct mlx5_flow_handle *rule;
 	int num_tuples;
 	int err;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 3c97da1..88d3fd1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -328,7 +328,7 @@ static int mlx5e_init_rep_rx(struct mlx5e_priv *priv)
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
 	struct mlx5_eswitch_rep *rep = priv->ppriv;
 	struct mlx5_core_dev *mdev = priv->mdev;
-	struct mlx5_flow_rule *flow_rule;
+	struct mlx5_flow_handle *flow_rule;
 	int err;
 	int i;
 
@@ -360,7 +360,7 @@ static int mlx5e_init_rep_rx(struct mlx5e_priv *priv)
 	return 0;
 
 err_del_flow_rule:
-	mlx5_del_flow_rule(rep->vport_rx_rule);
+	mlx5_del_flow_rules(rep->vport_rx_rule);
 err_destroy_direct_tirs:
 	mlx5e_destroy_direct_tirs(priv);
 err_destroy_direct_rqts:
@@ -375,7 +375,7 @@ static void mlx5e_cleanup_rep_rx(struct mlx5e_priv *priv)
 	int i;
 
 	mlx5e_tc_cleanup(priv);
-	mlx5_del_flow_rule(rep->vport_rx_rule);
+	mlx5_del_flow_rules(rep->vport_rx_rule);
 	mlx5e_destroy_direct_tirs(priv);
 	for (i = 0; i < priv->params.num_channels; i++)
 		mlx5e_destroy_rqt(priv, &priv->direct_tir[i].rqt);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index ce8c54d..5d9ac0d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -47,21 +47,22 @@
 struct mlx5e_tc_flow {
 	struct rhash_head	node;
 	u64			cookie;
-	struct mlx5_flow_rule	*rule;
+	struct mlx5_flow_handle *rule;
 	struct mlx5_esw_flow_attr *attr;
 };
 
 #define MLX5E_TC_TABLE_NUM_ENTRIES 1024
 #define MLX5E_TC_TABLE_NUM_GROUPS 4
 
-static struct mlx5_flow_rule *mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
-						    struct mlx5_flow_spec *spec,
-						    u32 action, u32 flow_tag)
+static struct mlx5_flow_handle *
+mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
+		      struct mlx5_flow_spec *spec,
+		      u32 action, u32 flow_tag)
 {
 	struct mlx5_core_dev *dev = priv->mdev;
 	struct mlx5_flow_destination dest = { 0 };
 	struct mlx5_fc *counter = NULL;
-	struct mlx5_flow_rule *rule;
+	struct mlx5_flow_handle *rule;
 	bool table_created = false;
 
 	if (action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
@@ -94,9 +95,9 @@ static struct mlx5_flow_rule *mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
 	}
 
 	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
-	rule = mlx5_add_flow_rule(priv->fs.tc.t, spec,
-				  action, flow_tag,
-				  &dest);
+	rule = mlx5_add_flow_rules(priv->fs.tc.t, spec,
+				   action, flow_tag,
+				   &dest, 1);
 
 	if (IS_ERR(rule))
 		goto err_add_rule;
@@ -114,9 +115,10 @@ static struct mlx5_flow_rule *mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
 	return rule;
 }
 
-static struct mlx5_flow_rule *mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
-						    struct mlx5_flow_spec *spec,
-						    struct mlx5_esw_flow_attr *attr)
+static struct mlx5_flow_handle *
+mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
+		      struct mlx5_flow_spec *spec,
+		      struct mlx5_esw_flow_attr *attr)
 {
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
 	int err;
@@ -129,7 +131,7 @@ static struct mlx5_flow_rule *mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
 }
 
 static void mlx5e_tc_del_flow(struct mlx5e_priv *priv,
-			      struct mlx5_flow_rule *rule,
+			      struct mlx5_flow_handle *rule,
 			      struct mlx5_esw_flow_attr *attr)
 {
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
@@ -140,7 +142,7 @@ static void mlx5e_tc_del_flow(struct mlx5e_priv *priv,
 	if (esw && esw->mode == SRIOV_OFFLOADS)
 		mlx5_eswitch_del_vlan_action(esw, attr);
 
-	mlx5_del_flow_rule(rule);
+	mlx5_del_flow_rules(rule);
 
 	mlx5_fc_destroy(priv->mdev, counter);
 
@@ -450,7 +452,7 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv, __be16 protocol,
 	u32 flow_tag, action;
 	struct mlx5e_tc_flow *flow;
 	struct mlx5_flow_spec *spec;
-	struct mlx5_flow_rule *old = NULL;
+	struct mlx5_flow_handle *old = NULL;
 	struct mlx5_esw_flow_attr *old_attr = NULL;
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
 
@@ -511,7 +513,7 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv, __be16 protocol,
 	goto out;
 
 err_del_rule:
-	mlx5_del_flow_rule(flow->rule);
+	mlx5_del_flow_rules(flow->rule);
 
 err_free:
 	if (!old)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 9ef01d1..fcd8b15 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -56,7 +56,7 @@ struct esw_uc_addr {
 /* E-Switch MC FDB table hash node */
 struct esw_mc_addr { /* SRIOV only */
 	struct l2addr_node     node;
-	struct mlx5_flow_rule *uplink_rule; /* Forward to uplink rule */
+	struct mlx5_flow_handle *uplink_rule; /* Forward to uplink rule */
 	u32                    refcnt;
 };
 
@@ -65,7 +65,7 @@ struct vport_addr {
 	struct l2addr_node     node;
 	u8                     action;
 	u32                    vport;
-	struct mlx5_flow_rule *flow_rule; /* SRIOV only */
+	struct mlx5_flow_handle *flow_rule; /* SRIOV only */
 	/* A flag indicating that mac was added due to mc promiscuous vport */
 	bool mc_promisc;
 };
@@ -237,13 +237,13 @@ static void del_l2_table_entry(struct mlx5_core_dev *dev, u32 index)
 }
 
 /* E-Switch FDB */
-static struct mlx5_flow_rule *
+static struct mlx5_flow_handle *
 __esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u32 vport, bool rx_rule,
 			 u8 mac_c[ETH_ALEN], u8 mac_v[ETH_ALEN])
 {
 	int match_header = (is_zero_ether_addr(mac_c) ? 0 :
 			    MLX5_MATCH_OUTER_HEADERS);
-	struct mlx5_flow_rule *flow_rule = NULL;
+	struct mlx5_flow_handle *flow_rule = NULL;
 	struct mlx5_flow_destination dest;
 	struct mlx5_flow_spec *spec;
 	void *mv_misc = NULL;
@@ -286,9 +286,9 @@ __esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u32 vport, bool rx_rule,
 		  dmac_v, dmac_c, vport);
 	spec->match_criteria_enable = match_header;
 	flow_rule =
-		mlx5_add_flow_rule(esw->fdb_table.fdb, spec,
-				   MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-				   0, &dest);
+		mlx5_add_flow_rules(esw->fdb_table.fdb, spec,
+				    MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+				    0, &dest, 1);
 	if (IS_ERR(flow_rule)) {
 		esw_warn(esw->dev,
 			 "FDB: Failed to add flow rule: dmac_v(%pM) dmac_c(%pM) -> vport(%d), err(%ld)\n",
@@ -300,7 +300,7 @@ __esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u32 vport, bool rx_rule,
 	return flow_rule;
 }
 
-static struct mlx5_flow_rule *
+static struct mlx5_flow_handle *
 esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u8 mac[ETH_ALEN], u32 vport)
 {
 	u8 mac_c[ETH_ALEN];
@@ -309,7 +309,7 @@ esw_fdb_set_vport_rule(struct mlx5_eswitch *esw, u8 mac[ETH_ALEN], u32 vport)
 	return __esw_fdb_set_vport_rule(esw, vport, false, mac_c, mac);
 }
 
-static struct mlx5_flow_rule *
+static struct mlx5_flow_handle *
 esw_fdb_set_vport_allmulti_rule(struct mlx5_eswitch *esw, u32 vport)
 {
 	u8 mac_c[ETH_ALEN];
@@ -322,7 +322,7 @@ esw_fdb_set_vport_allmulti_rule(struct mlx5_eswitch *esw, u32 vport)
 	return __esw_fdb_set_vport_rule(esw, vport, false, mac_c, mac_v);
 }
 
-static struct mlx5_flow_rule *
+static struct mlx5_flow_handle *
 esw_fdb_set_vport_promisc_rule(struct mlx5_eswitch *esw, u32 vport)
 {
 	u8 mac_c[ETH_ALEN];
@@ -515,7 +515,7 @@ static int esw_del_uc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr)
 	del_l2_table_entry(esw->dev, esw_uc->table_index);
 
 	if (vaddr->flow_rule)
-		mlx5_del_flow_rule(vaddr->flow_rule);
+		mlx5_del_flow_rules(vaddr->flow_rule);
 	vaddr->flow_rule = NULL;
 
 	l2addr_hash_del(esw_uc);
@@ -562,7 +562,7 @@ static void update_allmulti_vports(struct mlx5_eswitch *esw,
 		case MLX5_ACTION_DEL:
 			if (!iter_vaddr)
 				continue;
-			mlx5_del_flow_rule(iter_vaddr->flow_rule);
+			mlx5_del_flow_rules(iter_vaddr->flow_rule);
 			l2addr_hash_del(iter_vaddr);
 			break;
 		}
@@ -632,7 +632,7 @@ static int esw_del_mc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr)
 		  esw_mc->uplink_rule);
 
 	if (vaddr->flow_rule)
-		mlx5_del_flow_rule(vaddr->flow_rule);
+		mlx5_del_flow_rules(vaddr->flow_rule);
 	vaddr->flow_rule = NULL;
 
 	/* If the multicast mac is added as a result of mc promiscuous vport,
@@ -645,7 +645,7 @@ static int esw_del_mc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr)
 	update_allmulti_vports(esw, vaddr, esw_mc);
 
 	if (esw_mc->uplink_rule)
-		mlx5_del_flow_rule(esw_mc->uplink_rule);
+		mlx5_del_flow_rules(esw_mc->uplink_rule);
 
 	l2addr_hash_del(esw_mc);
 	return 0;
@@ -828,14 +828,14 @@ static void esw_apply_vport_rx_mode(struct mlx5_eswitch *esw, u32 vport_num,
 								UPLINK_VPORT);
 		allmulti_addr->refcnt++;
 	} else if (vport->allmulti_rule) {
-		mlx5_del_flow_rule(vport->allmulti_rule);
+		mlx5_del_flow_rules(vport->allmulti_rule);
 		vport->allmulti_rule = NULL;
 
 		if (--allmulti_addr->refcnt > 0)
 			goto promisc;
 
 		if (allmulti_addr->uplink_rule)
-			mlx5_del_flow_rule(allmulti_addr->uplink_rule);
+			mlx5_del_flow_rules(allmulti_addr->uplink_rule);
 		allmulti_addr->uplink_rule = NULL;
 	}
 
@@ -847,7 +847,7 @@ static void esw_apply_vport_rx_mode(struct mlx5_eswitch *esw, u32 vport_num,
 		vport->promisc_rule = esw_fdb_set_vport_promisc_rule(esw,
 								     vport_num);
 	} else if (vport->promisc_rule) {
-		mlx5_del_flow_rule(vport->promisc_rule);
+		mlx5_del_flow_rules(vport->promisc_rule);
 		vport->promisc_rule = NULL;
 	}
 }
@@ -1015,10 +1015,10 @@ static void esw_vport_cleanup_egress_rules(struct mlx5_eswitch *esw,
 					   struct mlx5_vport *vport)
 {
 	if (!IS_ERR_OR_NULL(vport->egress.allowed_vlan))
-		mlx5_del_flow_rule(vport->egress.allowed_vlan);
+		mlx5_del_flow_rules(vport->egress.allowed_vlan);
 
 	if (!IS_ERR_OR_NULL(vport->egress.drop_rule))
-		mlx5_del_flow_rule(vport->egress.drop_rule);
+		mlx5_del_flow_rules(vport->egress.drop_rule);
 
 	vport->egress.allowed_vlan = NULL;
 	vport->egress.drop_rule = NULL;
@@ -1173,10 +1173,10 @@ static void esw_vport_cleanup_ingress_rules(struct mlx5_eswitch *esw,
 					    struct mlx5_vport *vport)
 {
 	if (!IS_ERR_OR_NULL(vport->ingress.drop_rule))
-		mlx5_del_flow_rule(vport->ingress.drop_rule);
+		mlx5_del_flow_rules(vport->ingress.drop_rule);
 
 	if (!IS_ERR_OR_NULL(vport->ingress.allow_rule))
-		mlx5_del_flow_rule(vport->ingress.allow_rule);
+		mlx5_del_flow_rules(vport->ingress.allow_rule);
 
 	vport->ingress.drop_rule = NULL;
 	vport->ingress.allow_rule = NULL;
@@ -1253,9 +1253,9 @@ static int esw_vport_ingress_config(struct mlx5_eswitch *esw,
 
 	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
 	vport->ingress.allow_rule =
-		mlx5_add_flow_rule(vport->ingress.acl, spec,
-				   MLX5_FLOW_CONTEXT_ACTION_ALLOW,
-				   0, NULL);
+		mlx5_add_flow_rules(vport->ingress.acl, spec,
+				    MLX5_FLOW_CONTEXT_ACTION_ALLOW,
+				    0, NULL, 0);
 	if (IS_ERR(vport->ingress.allow_rule)) {
 		err = PTR_ERR(vport->ingress.allow_rule);
 		esw_warn(esw->dev,
@@ -1267,9 +1267,9 @@ static int esw_vport_ingress_config(struct mlx5_eswitch *esw,
 
 	memset(spec, 0, sizeof(*spec));
 	vport->ingress.drop_rule =
-		mlx5_add_flow_rule(vport->ingress.acl, spec,
-				   MLX5_FLOW_CONTEXT_ACTION_DROP,
-				   0, NULL);
+		mlx5_add_flow_rules(vport->ingress.acl, spec,
+				    MLX5_FLOW_CONTEXT_ACTION_DROP,
+				    0, NULL, 0);
 	if (IS_ERR(vport->ingress.drop_rule)) {
 		err = PTR_ERR(vport->ingress.drop_rule);
 		esw_warn(esw->dev,
@@ -1321,9 +1321,9 @@ static int esw_vport_egress_config(struct mlx5_eswitch *esw,
 
 	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
 	vport->egress.allowed_vlan =
-		mlx5_add_flow_rule(vport->egress.acl, spec,
-				   MLX5_FLOW_CONTEXT_ACTION_ALLOW,
-				   0, NULL);
+		mlx5_add_flow_rules(vport->egress.acl, spec,
+				    MLX5_FLOW_CONTEXT_ACTION_ALLOW,
+				    0, NULL, 0);
 	if (IS_ERR(vport->egress.allowed_vlan)) {
 		err = PTR_ERR(vport->egress.allowed_vlan);
 		esw_warn(esw->dev,
@@ -1336,9 +1336,9 @@ static int esw_vport_egress_config(struct mlx5_eswitch *esw,
 	/* Drop others rule (star rule) */
 	memset(spec, 0, sizeof(*spec));
 	vport->egress.drop_rule =
-		mlx5_add_flow_rule(vport->egress.acl, spec,
-				   MLX5_FLOW_CONTEXT_ACTION_DROP,
-				   0, NULL);
+		mlx5_add_flow_rules(vport->egress.acl, spec,
+				    MLX5_FLOW_CONTEXT_ACTION_DROP,
+				    0, NULL, 0);
 	if (IS_ERR(vport->egress.drop_rule)) {
 		err = PTR_ERR(vport->egress.drop_rule);
 		esw_warn(esw->dev,
@@ -1667,7 +1667,7 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw)
 		esw_disable_vport(esw, i);
 
 	if (mc_promisc && mc_promisc->uplink_rule)
-		mlx5_del_flow_rule(mc_promisc->uplink_rule);
+		mlx5_del_flow_rules(mc_promisc->uplink_rule);
 
 	esw_destroy_tsar(esw);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index ddae90c..6d414cb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -97,16 +97,16 @@ struct vport_ingress {
 	struct mlx5_flow_group *allow_spoofchk_only_grp;
 	struct mlx5_flow_group *allow_untagged_only_grp;
 	struct mlx5_flow_group *drop_grp;
-	struct mlx5_flow_rule  *allow_rule;
-	struct mlx5_flow_rule  *drop_rule;
+	struct mlx5_flow_handle  *allow_rule;
+	struct mlx5_flow_handle  *drop_rule;
 };
 
 struct vport_egress {
 	struct mlx5_flow_table *acl;
 	struct mlx5_flow_group *allowed_vlans_grp;
 	struct mlx5_flow_group *drop_grp;
-	struct mlx5_flow_rule  *allowed_vlan;
-	struct mlx5_flow_rule  *drop_rule;
+	struct mlx5_flow_handle  *allowed_vlan;
+	struct mlx5_flow_handle  *drop_rule;
 };
 
 struct mlx5_vport_info {
@@ -125,8 +125,8 @@ struct mlx5_vport {
 	int                     vport;
 	struct hlist_head       uc_list[MLX5_L2_ADDR_HASH_SIZE];
 	struct hlist_head       mc_list[MLX5_L2_ADDR_HASH_SIZE];
-	struct mlx5_flow_rule   *promisc_rule;
-	struct mlx5_flow_rule   *allmulti_rule;
+	struct mlx5_flow_handle *promisc_rule;
+	struct mlx5_flow_handle *allmulti_rule;
 	struct work_struct      vport_change_handler;
 
 	struct vport_ingress    ingress;
@@ -162,7 +162,7 @@ struct mlx5_eswitch_fdb {
 			struct mlx5_flow_table *fdb;
 			struct mlx5_flow_group *send_to_vport_grp;
 			struct mlx5_flow_group *miss_grp;
-			struct mlx5_flow_rule  *miss_rule;
+			struct mlx5_flow_handle *miss_rule;
 			int vlan_push_pop_refcount;
 		} offloads;
 	};
@@ -175,7 +175,7 @@ enum {
 };
 
 struct mlx5_esw_sq {
-	struct mlx5_flow_rule	*send_to_vport_rule;
+	struct mlx5_flow_handle	*send_to_vport_rule;
 	struct list_head	 list;
 };
 
@@ -188,7 +188,7 @@ struct mlx5_eswitch_rep {
 	u8		       hw_id[ETH_ALEN];
 	void		      *priv_data;
 
-	struct mlx5_flow_rule *vport_rx_rule;
+	struct mlx5_flow_handle *vport_rx_rule;
 	struct list_head       vport_sqs_list;
 	u16		       vlan;
 	u32		       vlan_refcount;
@@ -257,11 +257,11 @@ int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw,
 struct mlx5_flow_spec;
 struct mlx5_esw_flow_attr;
 
-struct mlx5_flow_rule *
+struct mlx5_flow_handle *
 mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 				struct mlx5_flow_spec *spec,
 				struct mlx5_esw_flow_attr *attr);
-struct mlx5_flow_rule *
+struct mlx5_flow_handle *
 mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport, u32 tirn);
 
 enum {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index c55ad8d..8b2a383 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -43,14 +43,14 @@ enum {
 	FDB_SLOW_PATH
 };
 
-struct mlx5_flow_rule *
+struct mlx5_flow_handle *
 mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 				struct mlx5_flow_spec *spec,
 				struct mlx5_esw_flow_attr *attr)
 {
 	struct mlx5_flow_destination dest = { 0 };
 	struct mlx5_fc *counter = NULL;
-	struct mlx5_flow_rule *rule;
+	struct mlx5_flow_handle *rule;
 	void *misc;
 	int action;
 
@@ -80,8 +80,8 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS |
 				      MLX5_MATCH_MISC_PARAMETERS;
 
-	rule = mlx5_add_flow_rule((struct mlx5_flow_table *)esw->fdb_table.fdb,
-				  spec, action, 0, &dest);
+	rule = mlx5_add_flow_rules((struct mlx5_flow_table *)esw->fdb_table.fdb,
+				   spec, action, 0, &dest, 1);
 
 	if (IS_ERR(rule))
 		mlx5_fc_destroy(esw->dev, counter);
@@ -269,11 +269,11 @@ int mlx5_eswitch_del_vlan_action(struct mlx5_eswitch *esw,
 	return err;
 }
 
-static struct mlx5_flow_rule *
+static struct mlx5_flow_handle *
 mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, int vport, u32 sqn)
 {
 	struct mlx5_flow_destination dest;
-	struct mlx5_flow_rule *flow_rule;
+	struct mlx5_flow_handle *flow_rule;
 	struct mlx5_flow_spec *spec;
 	void *misc;
 
@@ -296,9 +296,9 @@ mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, int vport, u32 sqn
 	dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
 	dest.vport_num = vport;
 
-	flow_rule = mlx5_add_flow_rule(esw->fdb_table.offloads.fdb, spec,
-				       MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-				       0, &dest);
+	flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.fdb, spec,
+					MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+					0, &dest, 1);
 	if (IS_ERR(flow_rule))
 		esw_warn(esw->dev, "FDB: Failed to add send to vport rule err %ld\n", PTR_ERR(flow_rule));
 out:
@@ -315,7 +315,7 @@ void mlx5_eswitch_sqs2vport_stop(struct mlx5_eswitch *esw,
 		return;
 
 	list_for_each_entry_safe(esw_sq, tmp, &rep->vport_sqs_list, list) {
-		mlx5_del_flow_rule(esw_sq->send_to_vport_rule);
+		mlx5_del_flow_rules(esw_sq->send_to_vport_rule);
 		list_del(&esw_sq->list);
 		kfree(esw_sq);
 	}
@@ -325,7 +325,7 @@ int mlx5_eswitch_sqs2vport_start(struct mlx5_eswitch *esw,
 				 struct mlx5_eswitch_rep *rep,
 				 u16 *sqns_array, int sqns_num)
 {
-	struct mlx5_flow_rule *flow_rule;
+	struct mlx5_flow_handle *flow_rule;
 	struct mlx5_esw_sq *esw_sq;
 	int err;
 	int i;
@@ -362,7 +362,7 @@ int mlx5_eswitch_sqs2vport_start(struct mlx5_eswitch *esw,
 static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw)
 {
 	struct mlx5_flow_destination dest;
-	struct mlx5_flow_rule *flow_rule = NULL;
+	struct mlx5_flow_handle *flow_rule = NULL;
 	struct mlx5_flow_spec *spec;
 	int err = 0;
 
@@ -376,9 +376,9 @@ static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw)
 	dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
 	dest.vport_num = 0;
 
-	flow_rule = mlx5_add_flow_rule(esw->fdb_table.offloads.fdb, spec,
-				       MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-				       0, &dest);
+	flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.fdb, spec,
+					MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+					0, &dest, 1);
 	if (IS_ERR(flow_rule)) {
 		err = PTR_ERR(flow_rule);
 		esw_warn(esw->dev,  "FDB: Failed to add miss flow rule err %d\n", err);
@@ -501,7 +501,7 @@ static void esw_destroy_offloads_fdb_table(struct mlx5_eswitch *esw)
 		return;
 
 	esw_debug(esw->dev, "Destroy offloads FDB Table\n");
-	mlx5_del_flow_rule(esw->fdb_table.offloads.miss_rule);
+	mlx5_del_flow_rules(esw->fdb_table.offloads.miss_rule);
 	mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_grp);
 	mlx5_destroy_flow_group(esw->fdb_table.offloads.miss_grp);
 
@@ -585,11 +585,11 @@ static void esw_destroy_vport_rx_group(struct mlx5_eswitch *esw)
 	mlx5_destroy_flow_group(esw->offloads.vport_rx_group);
 }
 
-struct mlx5_flow_rule *
+struct mlx5_flow_handle *
 mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport, u32 tirn)
 {
 	struct mlx5_flow_destination dest;
-	struct mlx5_flow_rule *flow_rule;
+	struct mlx5_flow_handle *flow_rule;
 	struct mlx5_flow_spec *spec;
 	void *misc;
 
@@ -610,9 +610,9 @@ mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport, u32 tirn)
 	dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR;
 	dest.tir_num = tirn;
 
-	flow_rule = mlx5_add_flow_rule(esw->offloads.ft_offloads, spec,
-				       MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
-				       0, &dest);
+	flow_rule = mlx5_add_flow_rules(esw->offloads.ft_offloads, spec,
+					MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+					0, &dest, 1);
 	if (IS_ERR(flow_rule)) {
 		esw_warn(esw->dev, "fs offloads: Failed to add vport rx rule err %ld\n", PTR_ERR(flow_rule));
 		goto out;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 43d7052..6732287 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -155,6 +155,9 @@ static void del_flow_group(struct fs_node *node);
 static void del_fte(struct fs_node *node);
 static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1,
 				struct mlx5_flow_destination *d2);
+static struct mlx5_flow_rule *
+find_flow_rule(struct fs_fte *fte,
+	       struct mlx5_flow_destination *dest);
 
 static void tree_init_node(struct fs_node *node,
 			   unsigned int refcount,
@@ -640,8 +643,8 @@ static int update_root_ft_create(struct mlx5_flow_table *ft, struct fs_prio
 	return err;
 }
 
-int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
-				 struct mlx5_flow_destination *dest)
+static int _mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
+					 struct mlx5_flow_destination *dest)
 {
 	struct mlx5_flow_table *ft;
 	struct mlx5_flow_group *fg;
@@ -666,6 +669,28 @@ int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
 	return err;
 }
 
+int mlx5_modify_rule_destination(struct mlx5_flow_handle *handle,
+				 struct mlx5_flow_destination *new_dest,
+				 struct mlx5_flow_destination *old_dest)
+{
+	int i;
+
+	if (!old_dest) {
+		if (handle->num_rules != 1)
+			return -EINVAL;
+		return _mlx5_modify_rule_destination(handle->rule[0],
+						     new_dest);
+	}
+
+	for (i = 0; i < handle->num_rules; i++) {
+		if (mlx5_flow_dests_cmp(new_dest, &handle->rule[i]->dest_attr))
+			return _mlx5_modify_rule_destination(handle->rule[i],
+							     new_dest);
+	}
+
+	return -EINVAL;
+}
+
 /* Modify/set FWD rules that point on old_next_ft to point on new_next_ft  */
 static int connect_fwd_rules(struct mlx5_core_dev *dev,
 			     struct mlx5_flow_table *new_next_ft,
@@ -688,7 +713,7 @@ static int connect_fwd_rules(struct mlx5_core_dev *dev,
 	list_splice_init(&old_next_ft->fwd_rules, &new_next_ft->fwd_rules);
 	mutex_unlock(&old_next_ft->lock);
 	list_for_each_entry(iter, &new_next_ft->fwd_rules, next_ft) {
-		err = mlx5_modify_rule_destination(iter, &dest);
+		err = _mlx5_modify_rule_destination(iter, &dest);
 		if (err)
 			pr_err("mlx5_core: failed to modify rule to point on flow table %d\n",
 			       new_next_ft->id);
@@ -917,41 +942,117 @@ static struct mlx5_flow_rule *alloc_rule(struct mlx5_flow_destination *dest)
 	return rule;
 }
 
-/* fte should not be deleted while calling this function */
-static struct mlx5_flow_rule *add_rule_fte(struct fs_fte *fte,
-					   struct mlx5_flow_group *fg,
-					   struct mlx5_flow_destination *dest,
-					   bool update_action)
+static struct mlx5_flow_handle *alloc_handle(int num_rules)
 {
+	struct mlx5_flow_handle *handle;
+
+	handle = kzalloc(sizeof(*handle) + sizeof(handle->rule[0]) *
+			  num_rules, GFP_KERNEL);
+	if (!handle)
+		return NULL;
+
+	handle->num_rules = num_rules;
+
+	return handle;
+}
+
+static void destroy_flow_handle(struct fs_fte *fte,
+				struct mlx5_flow_handle *handle,
+				struct mlx5_flow_destination *dest,
+				int i)
+{
+	for (; --i >= 0;) {
+		if (atomic_dec_and_test(&handle->rule[i]->node.refcount)) {
+			fte->dests_size--;
+			list_del(&handle->rule[i]->node.list);
+			kfree(handle->rule[i]);
+		}
+	}
+	kfree(handle);
+}
+
+static struct mlx5_flow_handle *
+create_flow_handle(struct fs_fte *fte,
+		   struct mlx5_flow_destination *dest,
+		   int dest_num,
+		   int *modify_mask,
+		   bool *new_rule)
+{
+	struct mlx5_flow_handle *handle;
+	struct mlx5_flow_rule *rule = NULL;
+	static int count = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS);
+	static int dst = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST);
+	int type;
+	int i = 0;
+
+	handle = alloc_handle((dest_num) ? dest_num : 1);
+	if (!handle)
+		return ERR_PTR(-ENOMEM);
+
+	do {
+		if (dest) {
+			rule = find_flow_rule(fte, dest + i);
+			if (rule) {
+				atomic_inc(&rule->node.refcount);
+				goto rule_found;
+			}
+		}
+
+		*new_rule = true;
+		rule = alloc_rule(dest + i);
+		if (!rule)
+			goto free_rules;
+
+		/* Add dest to dests list- we need flow tables to be in the
+		 * end of the list for forward to next prio rules.
+		 */
+		tree_init_node(&rule->node, 1, del_rule);
+		if (dest &&
+		    dest[i].type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE)
+			list_add(&rule->node.list, &fte->node.children);
+		else
+			list_add_tail(&rule->node.list, &fte->node.children);
+		if (dest) {
+			fte->dests_size++;
+
+			type = dest[i].type ==
+				MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+			*modify_mask |= type ? count : dst;
+		}
+rule_found:
+		handle->rule[i] = rule;
+	} while (++i < dest_num);
+
+	return handle;
+
+free_rules:
+	destroy_flow_handle(fte, handle, dest, i);
+	return ERR_PTR(-ENOMEM);
+}
+
+/* fte should not be deleted while calling this function */
+static struct mlx5_flow_handle *
+add_rule_fte(struct fs_fte *fte,
+	     struct mlx5_flow_group *fg,
+	     struct mlx5_flow_destination *dest,
+	     int dest_num,
+	     bool update_action)
+{
+	struct mlx5_flow_handle *handle;
 	struct mlx5_flow_table *ft;
-	struct mlx5_flow_rule *rule;
 	int modify_mask = 0;
 	int err;
+	bool new_rule = false;
 
-	rule = alloc_rule(dest);
-	if (!rule)
-		return ERR_PTR(-ENOMEM);
+	handle = create_flow_handle(fte, dest, dest_num, &modify_mask,
+				    &new_rule);
+	if (IS_ERR(handle) || !new_rule)
+		goto out;
 
 	if (update_action)
 		modify_mask |= BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION);
 
 	fs_get_obj(ft, fg->node.parent);
-	/* Add dest to dests list- we need flow tables to be in the
-	 * end of the list for forward to next prio rules.
-	 */
-	tree_init_node(&rule->node, 1, del_rule);
-	if (dest && dest->type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE)
-		list_add(&rule->node.list, &fte->node.children);
-	else
-		list_add_tail(&rule->node.list, &fte->node.children);
-	if (dest) {
-		fte->dests_size++;
-
-		modify_mask |= dest->type == MLX5_FLOW_DESTINATION_TYPE_COUNTER ?
-			BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS) :
-			BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST);
-	}
-
 	if (!(fte->status & FS_FTE_STATUS_EXISTING))
 		err = mlx5_cmd_create_fte(get_dev(&ft->node),
 					  ft, fg->id, fte);
@@ -959,17 +1060,15 @@ static struct mlx5_flow_rule *add_rule_fte(struct fs_fte *fte,
 		err = mlx5_cmd_update_fte(get_dev(&ft->node),
 					  ft, fg->id, modify_mask, fte);
 	if (err)
-		goto free_rule;
+		goto free_handle;
 
 	fte->status |= FS_FTE_STATUS_EXISTING;
 
-	return rule;
+out:
+	return handle;
 
-free_rule:
-	list_del(&rule->node.list);
-	kfree(rule);
-	if (dest)
-		fte->dests_size--;
+free_handle:
+	destroy_flow_handle(fte, handle, dest, handle->num_rules);
 	return ERR_PTR(err);
 }
 
@@ -1098,16 +1197,18 @@ static struct mlx5_flow_rule *find_flow_rule(struct fs_fte *fte,
 	return NULL;
 }
 
-static struct mlx5_flow_rule *add_rule_fg(struct mlx5_flow_group *fg,
-					  u32 *match_value,
-					  u8 action,
-					  u32 flow_tag,
-					  struct mlx5_flow_destination *dest)
+static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
+					    u32 *match_value,
+					    u8 action,
+					    u32 flow_tag,
+					    struct mlx5_flow_destination *dest,
+					    int dest_num)
 {
-	struct fs_fte *fte;
-	struct mlx5_flow_rule *rule;
+	struct mlx5_flow_handle *handle;
 	struct mlx5_flow_table *ft;
 	struct list_head *prev;
+	struct fs_fte *fte;
+	int i;
 
 	nested_lock_ref_node(&fg->node, FS_MUTEX_PARENT);
 	fs_for_each_fte(fte, fg) {
@@ -1116,40 +1217,33 @@ static struct mlx5_flow_rule *add_rule_fg(struct mlx5_flow_group *fg,
 		    (action & fte->action) && flow_tag == fte->flow_tag) {
 			int old_action = fte->action;
 
-			rule = find_flow_rule(fte, dest);
-			if (rule) {
-				atomic_inc(&rule->node.refcount);
-				unlock_ref_node(&fte->node);
-				unlock_ref_node(&fg->node);
-				return rule;
-			}
 			fte->action |= action;
-			rule = add_rule_fte(fte, fg, dest,
-					    old_action != action);
-			if (IS_ERR(rule)) {
+			handle = add_rule_fte(fte, fg, dest, dest_num,
+					      old_action != action);
+			if (IS_ERR(handle)) {
 				fte->action = old_action;
 				goto unlock_fte;
 			} else {
-				goto add_rule;
+				goto add_rules;
 			}
 		}
 		unlock_ref_node(&fte->node);
 	}
 	fs_get_obj(ft, fg->node.parent);
 	if (fg->num_ftes >= fg->max_ftes) {
-		rule = ERR_PTR(-ENOSPC);
+		handle = ERR_PTR(-ENOSPC);
 		goto unlock_fg;
 	}
 
 	fte = create_fte(fg, match_value, action, flow_tag, &prev);
 	if (IS_ERR(fte)) {
-		rule = (void *)fte;
+		handle = (void *)fte;
 		goto unlock_fg;
 	}
 	tree_init_node(&fte->node, 0, del_fte);
 	nested_lock_ref_node(&fte->node, FS_MUTEX_CHILD);
-	rule = add_rule_fte(fte, fg, dest, false);
-	if (IS_ERR(rule)) {
+	handle = add_rule_fte(fte, fg, dest, dest_num, false);
+	if (IS_ERR(handle)) {
 		kfree(fte);
 		goto unlock_fg;
 	}
@@ -1158,21 +1252,24 @@ static struct mlx5_flow_rule *add_rule_fg(struct mlx5_flow_group *fg,
 
 	tree_add_node(&fte->node, &fg->node);
 	list_add(&fte->node.list, prev);
-add_rule:
-	tree_add_node(&rule->node, &fte->node);
+add_rules:
+	for (i = 0; i < handle->num_rules; i++) {
+		if (atomic_read(&handle->rule[i]->node.refcount) == 1)
+			tree_add_node(&handle->rule[i]->node, &fte->node);
+	}
 unlock_fte:
 	unlock_ref_node(&fte->node);
 unlock_fg:
 	unlock_ref_node(&fg->node);
-	return rule;
+	return handle;
 }
 
-struct mlx5_fc *mlx5_flow_rule_counter(struct mlx5_flow_rule *rule)
+struct mlx5_fc *mlx5_flow_rule_counter(struct mlx5_flow_handle *handle)
 {
 	struct mlx5_flow_rule *dst;
 	struct fs_fte *fte;
 
-	fs_get_obj(fte, rule->node.parent);
+	fs_get_obj(fte, handle->rule[0]->node.parent);
 
 	fs_for_each_dst(dst, fte) {
 		if (dst->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_COUNTER)
@@ -1211,18 +1308,22 @@ static bool dest_is_valid(struct mlx5_flow_destination *dest,
 	return true;
 }
 
-static struct mlx5_flow_rule *
-_mlx5_add_flow_rule(struct mlx5_flow_table *ft,
-		   struct mlx5_flow_spec *spec,
-		    u32 action,
-		    u32 flow_tag,
-		    struct mlx5_flow_destination *dest)
+static struct mlx5_flow_handle *
+_mlx5_add_flow_rules(struct mlx5_flow_table *ft,
+		     struct mlx5_flow_spec *spec,
+		     u32 action,
+		     u32 flow_tag,
+		     struct mlx5_flow_destination *dest,
+		     int dest_num)
 {
 	struct mlx5_flow_group *g;
-	struct mlx5_flow_rule *rule;
+	struct mlx5_flow_handle *rule;
+	int i;
 
-	if (!dest_is_valid(dest, action, ft))
-		return ERR_PTR(-EINVAL);
+	for (i = 0; i < dest_num; i++) {
+		if (!dest_is_valid(&dest[i], action, ft))
+			return ERR_PTR(-EINVAL);
+	}
 
 	nested_lock_ref_node(&ft->node, FS_MUTEX_GRANDPARENT);
 	fs_for_each_fg(g, ft)
@@ -1231,7 +1332,7 @@ _mlx5_add_flow_rule(struct mlx5_flow_table *ft,
 					   g->mask.match_criteria,
 					   spec->match_criteria)) {
 			rule = add_rule_fg(g, spec->match_value,
-					   action, flow_tag, dest);
+					   action, flow_tag, dest, dest_num);
 			if (!IS_ERR(rule) || PTR_ERR(rule) != -ENOSPC)
 				goto unlock;
 		}
@@ -1244,7 +1345,7 @@ _mlx5_add_flow_rule(struct mlx5_flow_table *ft,
 	}
 
 	rule = add_rule_fg(g, spec->match_value,
-			   action, flow_tag, dest);
+			   action, flow_tag, dest, dest_num);
 	if (IS_ERR(rule)) {
 		/* Remove assumes refcount > 0 and autogroup creates a group
 		 * with a refcount = 0.
@@ -1265,17 +1366,18 @@ static bool fwd_next_prio_supported(struct mlx5_flow_table *ft)
 		(MLX5_CAP_FLOWTABLE(get_dev(&ft->node), nic_rx_multi_path_tirs)));
 }
 
-struct mlx5_flow_rule *
-mlx5_add_flow_rule(struct mlx5_flow_table *ft,
-		   struct mlx5_flow_spec *spec,
-		   u32 action,
-		   u32 flow_tag,
-		   struct mlx5_flow_destination *dest)
+struct mlx5_flow_handle *
+mlx5_add_flow_rules(struct mlx5_flow_table *ft,
+		    struct mlx5_flow_spec *spec,
+		    u32 action,
+		    u32 flow_tag,
+		    struct mlx5_flow_destination *dest,
+		    int dest_num)
 {
 	struct mlx5_flow_root_namespace *root = find_root(&ft->node);
 	struct mlx5_flow_destination gen_dest;
 	struct mlx5_flow_table *next_ft = NULL;
-	struct mlx5_flow_rule *rule = NULL;
+	struct mlx5_flow_handle *handle = NULL;
 	u32 sw_action = action;
 	struct fs_prio *prio;
 
@@ -1291,6 +1393,7 @@ mlx5_add_flow_rule(struct mlx5_flow_table *ft,
 			gen_dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
 			gen_dest.ft = next_ft;
 			dest = &gen_dest;
+			dest_num = 1;
 			action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
 		} else {
 			mutex_unlock(&root->chain_lock);
@@ -1298,27 +1401,33 @@ mlx5_add_flow_rule(struct mlx5_flow_table *ft,
 		}
 	}
 
-	rule = _mlx5_add_flow_rule(ft, spec, action, flow_tag, dest);
+	handle = _mlx5_add_flow_rules(ft, spec, action, flow_tag, dest,
+				      dest_num);
 
 	if (sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) {
-		if (!IS_ERR_OR_NULL(rule) &&
-		    (list_empty(&rule->next_ft))) {
+		if (!IS_ERR_OR_NULL(handle) &&
+		    (list_empty(&handle->rule[0]->next_ft))) {
 			mutex_lock(&next_ft->lock);
-			list_add(&rule->next_ft, &next_ft->fwd_rules);
+			list_add(&handle->rule[0]->next_ft,
+				 &next_ft->fwd_rules);
 			mutex_unlock(&next_ft->lock);
-			rule->sw_action = MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
+			handle->rule[0]->sw_action = MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
 		}
 		mutex_unlock(&root->chain_lock);
 	}
-	return rule;
+	return handle;
 }
-EXPORT_SYMBOL(mlx5_add_flow_rule);
+EXPORT_SYMBOL(mlx5_add_flow_rules);
 
-void mlx5_del_flow_rule(struct mlx5_flow_rule *rule)
+void mlx5_del_flow_rules(struct mlx5_flow_handle *handle)
 {
-	tree_remove_node(&rule->node);
+	int i;
+
+	for (i = handle->num_rules - 1; i >= 0; i--)
+		tree_remove_node(&handle->rule[i]->node);
+	kfree(handle);
 }
-EXPORT_SYMBOL(mlx5_del_flow_rule);
+EXPORT_SYMBOL(mlx5_del_flow_rules);
 
 /* Assuming prio->node.children(flow tables) is sorted by level */
 static struct mlx5_flow_table *find_next_ft(struct mlx5_flow_table *ft)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 71ff03b..d515088 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -94,6 +94,11 @@ struct mlx5_flow_rule {
 	u32					sw_action;
 };
 
+struct mlx5_flow_handle {
+	int num_rules;
+	struct mlx5_flow_rule *rule[];
+};
+
 /* Type of children is mlx5_flow_group */
 struct mlx5_flow_table {
 	struct fs_node			node;
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 93ebc5e..0dcd287 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -69,8 +69,8 @@ enum mlx5_flow_namespace_type {
 
 struct mlx5_flow_table;
 struct mlx5_flow_group;
-struct mlx5_flow_rule;
 struct mlx5_flow_namespace;
+struct mlx5_flow_handle;
 
 struct mlx5_flow_spec {
 	u8   match_criteria_enable;
@@ -127,18 +127,20 @@ void mlx5_destroy_flow_group(struct mlx5_flow_group *fg);
 /* Single destination per rule.
  * Group ID is implied by the match criteria.
  */
-struct mlx5_flow_rule *
-mlx5_add_flow_rule(struct mlx5_flow_table *ft,
-		   struct mlx5_flow_spec *spec,
-		   u32 action,
-		   u32 flow_tag,
-		   struct mlx5_flow_destination *dest);
-void mlx5_del_flow_rule(struct mlx5_flow_rule *fr);
-
-int mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
-				 struct mlx5_flow_destination *dest);
-
-struct mlx5_fc *mlx5_flow_rule_counter(struct mlx5_flow_rule *rule);
+struct mlx5_flow_handle *
+mlx5_add_flow_rules(struct mlx5_flow_table *ft,
+		    struct mlx5_flow_spec *spec,
+		    u32 action,
+		    u32 flow_tag,
+		    struct mlx5_flow_destination *dest,
+		    int dest_num);
+void mlx5_del_flow_rules(struct mlx5_flow_handle *fr);
+
+int mlx5_modify_rule_destination(struct mlx5_flow_handle *handler,
+				 struct mlx5_flow_destination *new_dest,
+				 struct mlx5_flow_destination *old_dest);
+
+struct mlx5_fc *mlx5_flow_rule_counter(struct mlx5_flow_handle *handler);
 struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging);
 void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter);
 void mlx5_fc_query_cached(struct mlx5_fc *counter,
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH for-next V2 14/15] net/mlx5: Add option to add fwd rule with counter
From: Saeed Mahameed @ 2016-10-30 21:22 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev, linux-rdma, Or Gerlitz, Leon Romanovsky, Tal Alon,
	Matan Barak, Mark Bloch, Saeed Mahameed, Leon Romanovsky
In-Reply-To: <1477862528-4328-1-git-send-email-saeedm@mellanox.com>

From: Mark Bloch <markb@mellanox.com>

Currently the code supports only drop rules to possess counters,
add that ability also for fwd rules.

Signed-off-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 24 +++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 6732287..0dfd998 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -374,6 +374,7 @@ static void del_rule(struct fs_node *node)
 	struct mlx5_core_dev *dev = get_dev(node);
 	int match_len = MLX5_ST_SZ_BYTES(fte_match_param);
 	int err;
+	bool update_fte = false;
 
 	match_value = mlx5_vzalloc(match_len);
 	if (!match_value) {
@@ -392,13 +393,23 @@ static void del_rule(struct fs_node *node)
 		list_del(&rule->next_ft);
 		mutex_unlock(&rule->dest_attr.ft->lock);
 	}
+
+	if (rule->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_COUNTER  &&
+	    --fte->dests_size) {
+		modify_mask = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION);
+		fte->action &= ~MLX5_FLOW_CONTEXT_ACTION_COUNT;
+		update_fte = true;
+		goto out;
+	}
+
 	if ((fte->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) &&
 	    --fte->dests_size) {
 		modify_mask = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST),
-		err = mlx5_cmd_update_fte(dev, ft,
-					  fg->id,
-					  modify_mask,
-					  fte);
+		update_fte = true;
+	}
+out:
+	if (update_fte && fte->dests_size) {
+		err = mlx5_cmd_update_fte(dev, ft, fg->id, modify_mask, fte);
 		if (err)
 			mlx5_core_warn(dev,
 				       "%s can't del rule fg id=%d fte_index=%d\n",
@@ -1287,8 +1298,9 @@ static bool counter_is_valid(struct mlx5_fc *counter, u32 action)
 	if (!counter)
 		return false;
 
-	/* Hardware support counter for a drop action only */
-	return action == (MLX5_FLOW_CONTEXT_ACTION_DROP | MLX5_FLOW_CONTEXT_ACTION_COUNT);
+	return (action & (MLX5_FLOW_CONTEXT_ACTION_DROP |
+			  MLX5_FLOW_CONTEXT_ACTION_FWD_DEST)) &&
+		(action & MLX5_FLOW_CONTEXT_ACTION_COUNT);
 }
 
 static bool dest_is_valid(struct mlx5_flow_destination *dest,
-- 
2.7.4

^ permalink raw reply related

* [PATCH for-next V2 15/15] net/mlx5e: Add tc support for FWD rule with counter
From: Saeed Mahameed @ 2016-10-30 21:22 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	Or Gerlitz, Leon Romanovsky, Tal Alon, Matan Barak, Mark Bloch,
	Saeed Mahameed, Leon Romanovsky
In-Reply-To: <1477862528-4328-1-git-send-email-saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

From: Mark Bloch <markb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

When creating a FWD rule using tc create also a HW counter
for this rule.

Signed-off-by: Mark Bloch <markb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Saeed Mahameed <saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Leon Romanovsky <leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c      |  3 ++-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c   | 20 +++++++++++---------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 5d9ac0d..c2e4728 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -419,7 +419,8 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
 				return -EINVAL;
 			}
 
-			attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+			attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST |
+					MLX5_FLOW_CONTEXT_ACTION_COUNT;
 			out_priv = netdev_priv(out_dev);
 			attr->out_rep = out_priv->ppriv;
 			continue;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 8b2a383..53d9d6c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -48,11 +48,12 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 				struct mlx5_flow_spec *spec,
 				struct mlx5_esw_flow_attr *attr)
 {
-	struct mlx5_flow_destination dest = { 0 };
+	struct mlx5_flow_destination dest[2] = {};
 	struct mlx5_fc *counter = NULL;
 	struct mlx5_flow_handle *rule;
 	void *misc;
 	int action;
+	int i = 0;
 
 	if (esw->mode != SRIOV_OFFLOADS)
 		return ERR_PTR(-EOPNOTSUPP);
@@ -60,15 +61,17 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 	action = attr->action;
 
 	if (action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
-		dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
-		dest.vport_num = attr->out_rep->vport;
-		action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
-	} else if (action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+		dest[i].type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
+		dest[i].vport_num = attr->out_rep->vport;
+		i++;
+	}
+	if (action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
 		counter = mlx5_fc_create(esw->dev, true);
 		if (IS_ERR(counter))
 			return ERR_CAST(counter);
-		dest.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
-		dest.counter = counter;
+		dest[i].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+		dest[i].counter = counter;
+		i++;
 	}
 
 	misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters);
@@ -81,8 +84,7 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw,
 				      MLX5_MATCH_MISC_PARAMETERS;
 
 	rule = mlx5_add_flow_rules((struct mlx5_flow_table *)esw->fdb_table.fdb,
-				   spec, action, 0, &dest, 1);
-
+				   spec, action, 0, dest, i);
 	if (IS_ERR(rule))
 		mlx5_fc_destroy(esw->dev, counter);
 
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: [PATCH for-next V2 00/15][PULL request] Mellanox mlx5 core driver updates 2016-10-25
From: David Miller @ 2016-10-30 21:35 UTC (permalink / raw)
  To: saeedm-VPRAkNaXOzVWk0Htik3J/w
  Cc: dledford-H+wXaHxf7aLQT0dZR+AlfA, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	ogerlitz-VPRAkNaXOzVWk0Htik3J/w, leonro-VPRAkNaXOzVWk0Htik3J/w,
	talal-VPRAkNaXOzVWk0Htik3J/w, matanb-VPRAkNaXOzVWk0Htik3J/w
In-Reply-To: <1477862528-4328-1-git-send-email-saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

From: Saeed Mahameed <saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Date: Sun, 30 Oct 2016 23:21:53 +0200

> This series contains some updates and fixes of mlx5 core and
> IB drivers with the addition of two features that demand
> new low level commands and infrastructure updates.
>  - SRIOV VF max rate limit support
>  - mlx5e tc support for FWD rules with counter.
> 
> Needed for both net and rdma subsystems.

Pulled, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: RDMA developer gatherings around Kernel Summit and Linux Plumbers in Santa Fe
From: Christoph Lameter @ 2016-10-30 23:49 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Doug Ledford,
	skc-YOWKrPYUwWM, Weiny, Ira, Jason Gunthorpe, John Fleck,
	Leon Romanovsky, Liran Liss, Matan Barak, Tzahi Oved
In-Reply-To: <CAJ3xEMiy8+yQTQKpuY8f57VJwWMwXryxQ09ghyTbDr91s=3w+g-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Sat, 29 Oct 2016, Or Gerlitz wrote:

> On Fri, Oct 28, 2016 at 8:17 PM, Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org> wrote:
>
> > - RDMA workshop on Tuesday, 1st of November
> >         Meeting in Sweeney AB from 9am till 5pm.
> >         See https://www.linuxplumbersconf.org/2016/ocw/events/LPC2016/schedule
> >         This is part of the Kernel Summit and the Linux Plumbers
> >         Conference. Only open to KS and LPC attendees with an invitation.
>
> isn't that open to all LPC attendees? didn't follow on the invitation
> requirement.. it's part of LPC
>

If you were invited to the KS or LPC then you can attend the RDMA
workshop.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: SQ overflow seen running isert traffic
From: Nicholas A. Bellinger @ 2016-10-31  3:40 UTC (permalink / raw)
  To: Steve Wise
  Cc: 'Sagi Grimberg', 'Potnuri Bharat Teja',
	target-devel-u79uwXL29TY76Z2rM5mHXA,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <005101d2294c$be5bb460$3b131d20$@opengridcomputing.com>

Hi Steve, Potnuri, & Co,

On Tue, 2016-10-18 at 09:34 -0500, Steve Wise wrote:
> > 
> > > I tried out this change and it works fine with iwarp. I dont see SQ
> > > overflow. Apparently we have increased the sq too big to overflow. I am
> going
> > > to let it run with higher workloads for longer time, to see if it holds
> good.
> > 
> > Actually on second thought, this patch is an overkill. Effectively we
> > now set:
> > 
> > MAX_CMD=266
> > and max_rdma_ctx=128 so together we take 394 which seems to too much.
> > 
> > If we go by the scheme of 1 rdma + 1 send for each IO we need:
> > - 128 sends
> > - 128 rdmas
> > - 10 miscs
> > 
> > so this gives 266.
> > 
> > Perhaps this is due to the fact that iWARP needs to register memory for
> > rdma reads as well? (and also rdma writes > 128k for chelsio HW right?)
> >
> 
> iWARP definitely needs to register memory for the target of reads, due to
> REMOTE_WRITE requirement for the protocol.  The source of a write doesn't need
> to register memory, but the SGE depth can cause multiple WRITE WRs to be
> required to service the IO.  And in theory there should be some threshold where
> it might be better performance-wise to do a memory register + 1 WRITE vs X
> WRITEs.    
> 
> As you mentioned, the RW API should account for this, but perhaps it is still
> off some.  Bharat, have a look into the RDMA-RW API and let us see if we can
> figure out if the additional SQ depth it adds is sufficient.
>  
> > What is the workload you are running? with immediatedata enabled you
> > should issue reg+rdma_read+send only for writes > 8k.
> > 
> > Does this happen when you run only reads for example?
> > 
> > I guess its time to get the sq accounting into shape...
> 
> So to sum up - 2 issues:
> 
> 1) we believe the iSER + RW API correctly sizes the SQ, yet we're seeing SQ
> overflows.  So the SQ sizing needs more investigation.
> 
> 2) if the SQ is full, then the iSER/target code is supposed to resubmit.  And
> apparently that isn't working.
> 

For #2, target-core expects -ENOMEM or -EAGAIN return from fabric driver
callbacks to signal internal queue-full retry logic.  Otherwise, the
extra se_cmd->cmd_kref response SCF_ACK_KREF is leaked until session
shutdown and/or reinstatement occurs.

AFAICT, Potunri's earlier hung task with v4.8.y + ABORT_TASK is likely
the earlier v4.1+ regression:

https://github.com/torvalds/linux/commit/527268df31e57cf2b6d417198717c6d6afdb1e3e

That said, there is room for improvement in target-core queue-full error
signaling, and iscsi-target/iser-target callback error propagation.  

Sending out a series shortly to address these particular items.
Please have a look.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] IBcore/CM: Issue DREQ when receiving REQ/REP for stale QP
From: santosh.shilimkar-QHcLZuEGTsvQT0dZR+AlfA @ 2016-10-31  4:52 UTC (permalink / raw)
  To: Sagi Grimberg, Hans Westgaard Ry, Doug Ledford, Sean Hefty,
	Hal Rosenstock, Matan Barak, Erez Shitrit, Bart Van Assche,
	Ira Weiny, Or Gerlitz, Hakon Bugge, Yuval Shaia,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <8df2c90e-8581-4ce8-fd18-92ac57eb6160-NQWnxTmZq1alnMjI0IkVqw@public.gmane.org>

On 10/30/16 2:06 PM, Sagi Grimberg wrote:
>> from "InfiBand Architecture Specifications Volume 1":
>>
>>   A QP is said to have a stale connection when only one side has
>>   connection information. A stale connection may result if the remote CM
>>   had dropped the connection and sent a DREQ but the DREQ was never
>>   received by the local CM. Alternatively the remote CM may have lost
>>   all record of past connections because its node crashed and rebooted,
>>   while the local CM did not become aware of the remote node's reboot
>>   and therefore did not clean up stale connections.
>>
>> and:
>>
>>    A local CM may receive a REQ/REP for a stale connection. It shall
>>    abort the connection issuing REJ to the REQ/REP. It shall then issue
>>    DREQ with "DREQ:remote QPN” set to the remote QPN from the REQ/REP.
>>
>> This patch solves a problem with reuse of QPN. Current codebase, that
>> is IPoIB, relies on a REAP-mechanism to do cleanup of the structures
>> in CM. A problem with this is the timeconstants governing this
>> mechanism; they are up to 768 seconds and the interface may look
>> inresponsive in that period.  Issuing a DREQ (and receiving a DREP)
>> does the necessary cleanup and the interface comes up.
>
> I like this fix, so,
>
Me too and hence suggested Hans to post it on rdma list when
saw this patch in internal review.

> Reviewed-by: Sagi Grimberg <sagi-NQWnxTmZq1alnMjI0IkVqw@public.gmane.org>
>
> But I think the CM layer still is buggy in this area.
>
> In vol 1 the state transition table specifically states that DREP
> timeouts should move the cm_id to timewait state but the CM doesn't
> seem to maintain response timeouts on disconnect requests. If the
> DREQ happened to fail (send error completion) things are fine, but
> if the DREQ makes it to the peer but it doesn't reply then no one
> will take care of it (i.e. we will never see a TIMEWAIT event from
> this cm_id)...
>
> I recall some debugging session with Hal on this area a ~year ago
> with a new iser target (which didn't reply to DREQs on reboot
> sequences). iser initiator waits for a DISCONNECTED/TIMEWAIT events
> before destroying the cm_id (which never happened because of the
> above). I think I ended up working around that in iser to just go
> ahead and destroy the cm_id after issuing a DREQ (but now I realize
> it was never included so I'll probably dig it up again soon).

There is another fundamental issue with core CM code wrt DREQ
getting dropped. The the mad agent used to send the DREQ is
associated with a port and if this port is down, the IB link
layer will drop that DREQ as per SPEC. Similarly if the destination
port is down where the DREQ is suppose to reach, then the DREQ
gets dropped there too. These dropped CM ids are retried by MAD
agent on same port till the port comes back alive.

Am not sure in your case the ports were going down or not
but it was the case then IIUC, what you are doing for ISER is
still needed (up front destroying the cm id).

Regards,
Santosh


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH 0/3] target: Fix queue-full callback error signaling
From: Nicholas A. Bellinger @ 2016-10-31  5:25 UTC (permalink / raw)
  To: target-devel
  Cc: linux-rdma, Potnuri Bharat Teja, Steve Wise, Sagi Grimberg,
	Nicholas Bellinger

From: Nicholas Bellinger <nab@linux-iscsi.org>

Hi folks,

This series contains target-core queue-full + iscsi-target
callback fixes for a bug reported by Steve and Potnuri
during recent v4.8.y iWARP/iser-target testing.

The first patch fixes target-core queue-full handling response
leaks with non -EAGAIN / -ENOMEM errors.  It uses a new state
TRANSPORT_COMPLETE_QF_ERR to internally generate CHECK_CONDITION
for unknown fabric callback errors, to avoid attempting retry
of fabric data-transfer callbacks for this special case.

This means all non -EAGAIN / -ENOMEM fabric callback errors
during target_core_fabric_ops for:

  *) ->write_pending()
  *) ->queue_data_in()
  *) ->queue_status()

will result in CHECK_CONDITION + LOGICAL_UNIT_COMMUNICATION_FAILURE,
if no non-zero se_cmd->scsi_status was previously set.
It also means target-core ->queue_status() errors retry indefinately,
or until session shutdown explicitly stops outstanding I/O.

The remaining changes are for propagating iscsit_transport
response failure back to target-core queue-full, and updating
iser-target to propagate isert_rdma_rw_ctx_post() errors from
RDMA R/W API back to target-core as well.

Please review.

--nab

Nicholas Bellinger (3):
  target: Fix unknown fabric callback queue-full errors
  iscsi-target: Propigate queue_data_in + queue_status errors
  iser-target: Fix queue-full response handling

 drivers/infiniband/ulp/isert/ib_isert.c      |  53 +++++++++-----
 drivers/target/iscsi/iscsi_target.c          |   3 +-
 drivers/target/iscsi/iscsi_target_configfs.c |  13 ++--
 drivers/target/iscsi/iscsi_target_util.c     |   5 +-
 drivers/target/iscsi/iscsi_target_util.h     |   2 +-
 drivers/target/target_core_transport.c       | 102 ++++++++++++++++++---------
 include/target/target_core_base.h            |   1 +
 7 files changed, 114 insertions(+), 65 deletions(-)

-- 
1.9.1

^ permalink raw reply

* [PATCH 1/3] target: Fix unknown fabric callback queue-full errors
From: Nicholas A. Bellinger @ 2016-10-31  5:25 UTC (permalink / raw)
  To: target-devel
  Cc: linux-rdma, Potnuri Bharat Teja, Steve Wise, Sagi Grimberg,
	Nicholas Bellinger
In-Reply-To: <1477891554-26222-1-git-send-email-nab@linux-iscsi.org>

From: Nicholas Bellinger <nab@linux-iscsi.org>

This patch fixes a set of queue-full response handling
bugs, where outgoing responses are leaked when a fabric
driver is propagating non -EAGAIN or -ENOMEM errors
to target-core.

It introduces TRANSPORT_COMPLETE_QF_ERR state used to
signal when CHECK_CONDITION status should be generated,
when fabric driver ->write_pending(), ->queue_data_in(),
or ->queue_status() callbacks fail with non -EAGAIN or
-ENOMEM errors, and data-transfer should not be retried.

Note all fabric driver -EAGAIN and -ENOMEM errors are
still retried indefinately with associated data-transfer
callbacks, following existing queue-full logic.

Also fix two missing ->queue_status() queue-full cases
related to CMD_T_ABORTED w/ TAS status handling.

Reported-by: Potnuri Bharat Teja <bharat@chelsio.com>
Cc: Potnuri Bharat Teja <bharat@chelsio.com>
Reported-by: Steve Wise <swise@opengridcomputing.com>
Cc: Steve Wise <swise@opengridcomputing.com>
Cc: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/target_core_transport.c | 102 ++++++++++++++++++++++-----------
 include/target/target_core_base.h      |   1 +
 2 files changed, 69 insertions(+), 34 deletions(-)

diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 7dfefd6..489aa95 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -64,8 +64,9 @@
 struct kmem_cache *t10_alua_lba_map_mem_cache;
 
 static void transport_complete_task_attr(struct se_cmd *cmd);
+static int translate_sense_reason(struct se_cmd *cmd, sense_reason_t reason);
 static void transport_handle_queue_full(struct se_cmd *cmd,
-		struct se_device *dev);
+		struct se_device *dev, int err, bool write_pending);
 static int transport_put_cmd(struct se_cmd *cmd);
 static void target_complete_ok_work(struct work_struct *work);
 
@@ -811,7 +812,8 @@ void target_qf_do_work(struct work_struct *work)
 
 		if (cmd->t_state == TRANSPORT_COMPLETE_QF_WP)
 			transport_write_pending_qf(cmd);
-		else if (cmd->t_state == TRANSPORT_COMPLETE_QF_OK)
+		else if (cmd->t_state == TRANSPORT_COMPLETE_QF_OK ||
+			 cmd->t_state == TRANSPORT_COMPLETE_QF_ERR)
 			transport_complete_qf(cmd);
 	}
 }
@@ -1720,7 +1722,7 @@ void transport_generic_request_failure(struct se_cmd *cmd,
 		}
 		trace_target_cmd_complete(cmd);
 		ret = cmd->se_tfo->queue_status(cmd);
-		if (ret == -EAGAIN || ret == -ENOMEM)
+		if (ret)
 			goto queue_full;
 		goto check_stop;
 	default:
@@ -1731,7 +1733,7 @@ void transport_generic_request_failure(struct se_cmd *cmd,
 	}
 
 	ret = transport_send_check_condition_and_sense(cmd, sense_reason, 0);
-	if (ret == -EAGAIN || ret == -ENOMEM)
+	if (ret)
 		goto queue_full;
 
 check_stop:
@@ -1740,8 +1742,7 @@ void transport_generic_request_failure(struct se_cmd *cmd,
 	return;
 
 queue_full:
-	cmd->t_state = TRANSPORT_COMPLETE_QF_OK;
-	transport_handle_queue_full(cmd, cmd->se_dev);
+	transport_handle_queue_full(cmd, cmd->se_dev, ret, false);
 }
 EXPORT_SYMBOL(transport_generic_request_failure);
 
@@ -1980,13 +1981,29 @@ static void transport_complete_qf(struct se_cmd *cmd)
 	int ret = 0;
 
 	transport_complete_task_attr(cmd);
+	/*
+	 * If a fabric driver ->write_pending() or ->queue_data_in() callback
+	 * has returned neither -ENOMEM or -EAGAIN, assume it's fatal and
+	 * the same callbacks should not be retried.  Return CHECK_CONDITION
+	 * if a scsi_status is not already set.
+	 *
+	 * If a fabric driver ->queue_status() has returned non zero, always
+	 * keep retrying no matter what..
+	 */
+	if (cmd->t_state == TRANSPORT_COMPLETE_QF_ERR) {
+		if (cmd->scsi_status)
+			goto queue_status;
 
-	if (cmd->se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) {
-		trace_target_cmd_complete(cmd);
-		ret = cmd->se_tfo->queue_status(cmd);
-		goto out;
+		cmd->se_cmd_flags |= SCF_EMULATED_TASK_SENSE;
+		cmd->scsi_status = SAM_STAT_CHECK_CONDITION;
+		cmd->scsi_sense_length  = TRANSPORT_SENSE_BUFFER;
+		translate_sense_reason(cmd, TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE);
+		goto queue_status;
 	}
 
+	if (cmd->se_cmd_flags & SCF_TRANSPORT_TASK_SENSE)
+		goto queue_status;
+
 	switch (cmd->data_direction) {
 	case DMA_FROM_DEVICE:
 		if (cmd->scsi_status)
@@ -2010,19 +2027,33 @@ static void transport_complete_qf(struct se_cmd *cmd)
 		break;
 	}
 
-out:
 	if (ret < 0) {
-		transport_handle_queue_full(cmd, cmd->se_dev);
+		transport_handle_queue_full(cmd, cmd->se_dev, ret, false);
 		return;
 	}
 	transport_lun_remove_cmd(cmd);
 	transport_cmd_check_stop_to_fabric(cmd);
 }
 
-static void transport_handle_queue_full(
-	struct se_cmd *cmd,
-	struct se_device *dev)
+static void transport_handle_queue_full(struct se_cmd *cmd, struct se_device *dev,
+					int err, bool write_pending)
 {
+	/*
+	 * -EAGAIN or -ENOMEM signals retry of ->write_pending() and/or
+	 * ->queue_data_in() callbacks from new process context.
+	 *
+	 * Otherwise for other errors, transport_complete_qf() will send
+	 * CHECK_CONDITION via ->queue_status() instead of attempting to
+	 * retry associated fabric driver data-transfer callbacks.
+	 */
+	if (err == -EAGAIN || err == -ENOMEM) {
+		cmd->t_state = (write_pending) ? TRANSPORT_COMPLETE_QF_WP :
+						 TRANSPORT_COMPLETE_QF_OK;
+	} else {
+		pr_warn_ratelimited("Got unknown fabric queue status: %d\n", err);
+		cmd->t_state = TRANSPORT_COMPLETE_QF_ERR;
+	}
+
 	spin_lock_irq(&dev->qf_cmd_lock);
 	list_add_tail(&cmd->se_qf_node, &cmd->se_dev->qf_cmd_list);
 	atomic_inc_mb(&dev->dev_qf_count);
@@ -2086,7 +2117,7 @@ static void target_complete_ok_work(struct work_struct *work)
 		WARN_ON(!cmd->scsi_status);
 		ret = transport_send_check_condition_and_sense(
 					cmd, 0, 1);
-		if (ret == -EAGAIN || ret == -ENOMEM)
+		if (ret)
 			goto queue_full;
 
 		transport_lun_remove_cmd(cmd);
@@ -2112,7 +2143,7 @@ static void target_complete_ok_work(struct work_struct *work)
 		} else if (rc) {
 			ret = transport_send_check_condition_and_sense(cmd,
 						rc, 0);
-			if (ret == -EAGAIN || ret == -ENOMEM)
+			if (ret)
 				goto queue_full;
 
 			transport_lun_remove_cmd(cmd);
@@ -2137,7 +2168,7 @@ static void target_complete_ok_work(struct work_struct *work)
 		if (target_read_prot_action(cmd)) {
 			ret = transport_send_check_condition_and_sense(cmd,
 						cmd->pi_err, 0);
-			if (ret == -EAGAIN || ret == -ENOMEM)
+			if (ret)
 				goto queue_full;
 
 			transport_lun_remove_cmd(cmd);
@@ -2147,7 +2178,7 @@ static void target_complete_ok_work(struct work_struct *work)
 
 		trace_target_cmd_complete(cmd);
 		ret = cmd->se_tfo->queue_data_in(cmd);
-		if (ret == -EAGAIN || ret == -ENOMEM)
+		if (ret)
 			goto queue_full;
 		break;
 	case DMA_TO_DEVICE:
@@ -2160,7 +2191,7 @@ static void target_complete_ok_work(struct work_struct *work)
 			atomic_long_add(cmd->data_length,
 					&cmd->se_lun->lun_stats.tx_data_octets);
 			ret = cmd->se_tfo->queue_data_in(cmd);
-			if (ret == -EAGAIN || ret == -ENOMEM)
+			if (ret)
 				goto queue_full;
 			break;
 		}
@@ -2169,7 +2200,7 @@ static void target_complete_ok_work(struct work_struct *work)
 queue_status:
 		trace_target_cmd_complete(cmd);
 		ret = cmd->se_tfo->queue_status(cmd);
-		if (ret == -EAGAIN || ret == -ENOMEM)
+		if (ret)
 			goto queue_full;
 		break;
 	default:
@@ -2183,8 +2214,8 @@ static void target_complete_ok_work(struct work_struct *work)
 queue_full:
 	pr_debug("Handling complete_ok QUEUE_FULL: se_cmd: %p,"
 		" data_direction: %d\n", cmd, cmd->data_direction);
-	cmd->t_state = TRANSPORT_COMPLETE_QF_OK;
-	transport_handle_queue_full(cmd, cmd->se_dev);
+
+	transport_handle_queue_full(cmd, cmd->se_dev, ret, false);
 }
 
 void target_free_sgl(struct scatterlist *sgl, int nents)
@@ -2434,18 +2465,14 @@ void transport_kunmap_data_sg(struct se_cmd *cmd)
 	transport_cmd_check_stop(cmd, false, true);
 
 	ret = cmd->se_tfo->write_pending(cmd);
-	if (ret == -EAGAIN || ret == -ENOMEM)
+	if (ret)
 		goto queue_full;
 
-	/* fabric drivers should only return -EAGAIN or -ENOMEM as error */
-	WARN_ON(ret);
-
-	return (!ret) ? 0 : TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
+	return 0;
 
 queue_full:
 	pr_debug("Handling write_pending QUEUE__FULL: se_cmd: %p\n", cmd);
-	cmd->t_state = TRANSPORT_COMPLETE_QF_WP;
-	transport_handle_queue_full(cmd, cmd->se_dev);
+	transport_handle_queue_full(cmd, cmd->se_dev, ret, true);
 	return 0;
 }
 EXPORT_SYMBOL(transport_generic_new_cmd);
@@ -2455,10 +2482,10 @@ static void transport_write_pending_qf(struct se_cmd *cmd)
 	int ret;
 
 	ret = cmd->se_tfo->write_pending(cmd);
-	if (ret == -EAGAIN || ret == -ENOMEM) {
+	if (ret) {
 		pr_debug("Handling write_pending QUEUE__FULL: se_cmd: %p\n",
 			 cmd);
-		transport_handle_queue_full(cmd, cmd->se_dev);
+		transport_handle_queue_full(cmd, cmd->se_dev, ret, true);
 	}
 }
 
@@ -2951,6 +2978,8 @@ static int __transport_check_aborted_status(struct se_cmd *cmd, int send_status)
 	__releases(&cmd->t_state_lock)
 	__acquires(&cmd->t_state_lock)
 {
+	int ret;
+
 	assert_spin_locked(&cmd->t_state_lock);
 	WARN_ON_ONCE(!irqs_disabled());
 
@@ -2974,7 +3003,9 @@ static int __transport_check_aborted_status(struct se_cmd *cmd, int send_status)
 	trace_target_cmd_complete(cmd);
 
 	spin_unlock_irq(&cmd->t_state_lock);
-	cmd->se_tfo->queue_status(cmd);
+	ret = cmd->se_tfo->queue_status(cmd);
+	if (ret)
+		transport_handle_queue_full(cmd, cmd->se_dev, ret, false);
 	spin_lock_irq(&cmd->t_state_lock);
 
 	return 1;
@@ -2995,6 +3026,7 @@ int transport_check_aborted_status(struct se_cmd *cmd, int send_status)
 void transport_send_task_abort(struct se_cmd *cmd)
 {
 	unsigned long flags;
+	int ret;
 
 	spin_lock_irqsave(&cmd->t_state_lock, flags);
 	if (cmd->se_cmd_flags & (SCF_SENT_CHECK_CONDITION)) {
@@ -3030,7 +3062,9 @@ void transport_send_task_abort(struct se_cmd *cmd)
 		 cmd->t_task_cdb[0], cmd->tag);
 
 	trace_target_cmd_complete(cmd);
-	cmd->se_tfo->queue_status(cmd);
+	ret = cmd->se_tfo->queue_status(cmd);
+	if (ret)
+		transport_handle_queue_full(cmd, cmd->se_dev, ret, false);
 }
 
 static void target_tmr_work(struct work_struct *work)
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index c211900..6a5e439 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -119,6 +119,7 @@ enum transport_state_table {
 	TRANSPORT_ISTATE_PROCESSING = 11,
 	TRANSPORT_COMPLETE_QF_WP = 18,
 	TRANSPORT_COMPLETE_QF_OK = 19,
+	TRANSPORT_COMPLETE_QF_ERR = 20,
 };
 
 /* Used for struct se_cmd->se_cmd_flags */
-- 
1.9.1

^ permalink raw reply related

* [PATCH 2/3] iscsi-target: Propigate queue_data_in + queue_status errors
From: Nicholas A. Bellinger @ 2016-10-31  5:25 UTC (permalink / raw)
  To: target-devel
  Cc: linux-rdma, Potnuri Bharat Teja, Steve Wise, Sagi Grimberg,
	Nicholas Bellinger
In-Reply-To: <1477891554-26222-1-git-send-email-nab@linux-iscsi.org>

From: Nicholas Bellinger <nab@linux-iscsi.org>

This patch changes iscsi-target to propagate iscsit_transport
->iscsit_queue_data_in() and ->iscsit_queue_status() callback
errors, back up into target-core.

This allows target-core to retry failed iscsit_transport
callbacks using internal queue-full logic.

Reported-by: Potnuri Bharat Teja <bharat@chelsio.com>
Cc: Potnuri Bharat Teja <bharat@chelsio.com>
Reported-by: Steve Wise <swise@opengridcomputing.com>
Cc: Steve Wise <swise@opengridcomputing.com>
Cc: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 drivers/target/iscsi/iscsi_target.c          |  3 +--
 drivers/target/iscsi/iscsi_target_configfs.c | 13 +++++--------
 drivers/target/iscsi/iscsi_target_util.c     |  5 +++--
 drivers/target/iscsi/iscsi_target_util.h     |  2 +-
 4 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c
index b7d747e..1b03e85 100644
--- a/drivers/target/iscsi/iscsi_target.c
+++ b/drivers/target/iscsi/iscsi_target.c
@@ -482,8 +482,7 @@ int iscsit_del_np(struct iscsi_np *np)
 
 int iscsit_queue_rsp(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
 {
-	iscsit_add_cmd_to_response_queue(cmd, cmd->conn, cmd->i_state);
-	return 0;
+	return iscsit_add_cmd_to_response_queue(cmd, cmd->conn, cmd->i_state);
 }
 EXPORT_SYMBOL(iscsit_queue_rsp);
 
diff --git a/drivers/target/iscsi/iscsi_target_configfs.c b/drivers/target/iscsi/iscsi_target_configfs.c
index 923c032..38a72d3 100644
--- a/drivers/target/iscsi/iscsi_target_configfs.c
+++ b/drivers/target/iscsi/iscsi_target_configfs.c
@@ -1395,11 +1395,10 @@ static u32 lio_sess_get_initiator_sid(
 static int lio_queue_data_in(struct se_cmd *se_cmd)
 {
 	struct iscsi_cmd *cmd = container_of(se_cmd, struct iscsi_cmd, se_cmd);
+	struct iscsi_conn *conn = cmd->conn;
 
 	cmd->i_state = ISTATE_SEND_DATAIN;
-	cmd->conn->conn_transport->iscsit_queue_data_in(cmd->conn, cmd);
-
-	return 0;
+	return conn->conn_transport->iscsit_queue_data_in(conn, cmd);
 }
 
 static int lio_write_pending(struct se_cmd *se_cmd)
@@ -1428,16 +1427,14 @@ static int lio_write_pending_status(struct se_cmd *se_cmd)
 static int lio_queue_status(struct se_cmd *se_cmd)
 {
 	struct iscsi_cmd *cmd = container_of(se_cmd, struct iscsi_cmd, se_cmd);
+	struct iscsi_conn *conn = cmd->conn;
 
 	cmd->i_state = ISTATE_SEND_STATUS;
 
 	if (cmd->se_cmd.scsi_status || cmd->sense_reason) {
-		iscsit_add_cmd_to_response_queue(cmd, cmd->conn, cmd->i_state);
-		return 0;
+		return iscsit_add_cmd_to_response_queue(cmd, conn, cmd->i_state);
 	}
-	cmd->conn->conn_transport->iscsit_queue_status(cmd->conn, cmd);
-
-	return 0;
+	return conn->conn_transport->iscsit_queue_status(conn, cmd);
 }
 
 static void lio_queue_tm_rsp(struct se_cmd *se_cmd)
diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c
index 1f38177..eb9c49a 100644
--- a/drivers/target/iscsi/iscsi_target_util.c
+++ b/drivers/target/iscsi/iscsi_target_util.c
@@ -565,7 +565,7 @@ static void iscsit_remove_cmd_from_immediate_queue(
 	}
 }
 
-void iscsit_add_cmd_to_response_queue(
+int iscsit_add_cmd_to_response_queue(
 	struct iscsi_cmd *cmd,
 	struct iscsi_conn *conn,
 	u8 state)
@@ -576,7 +576,7 @@ void iscsit_add_cmd_to_response_queue(
 	if (!qr) {
 		pr_err("Unable to allocate memory for"
 			" struct iscsi_queue_req\n");
-		return;
+		return -ENOMEM;
 	}
 	INIT_LIST_HEAD(&qr->qr_list);
 	qr->cmd = cmd;
@@ -588,6 +588,7 @@ void iscsit_add_cmd_to_response_queue(
 	spin_unlock_bh(&conn->response_queue_lock);
 
 	wake_up(&conn->queues_wq);
+	return 0;
 }
 
 struct iscsi_queue_req *iscsit_get_cmd_from_response_queue(struct iscsi_conn *conn)
diff --git a/drivers/target/iscsi/iscsi_target_util.h b/drivers/target/iscsi/iscsi_target_util.h
index 995f1cb..d96ec7d 100644
--- a/drivers/target/iscsi/iscsi_target_util.h
+++ b/drivers/target/iscsi/iscsi_target_util.h
@@ -23,7 +23,7 @@ extern int iscsit_find_cmd_for_recovery(struct iscsi_session *, struct iscsi_cmd
 			struct iscsi_conn_recovery **, itt_t);
 extern void iscsit_add_cmd_to_immediate_queue(struct iscsi_cmd *, struct iscsi_conn *, u8);
 extern struct iscsi_queue_req *iscsit_get_cmd_from_immediate_queue(struct iscsi_conn *);
-extern void iscsit_add_cmd_to_response_queue(struct iscsi_cmd *, struct iscsi_conn *, u8);
+extern int iscsit_add_cmd_to_response_queue(struct iscsi_cmd *, struct iscsi_conn *, u8);
 extern struct iscsi_queue_req *iscsit_get_cmd_from_response_queue(struct iscsi_conn *);
 extern void iscsit_remove_cmd_from_tx_queues(struct iscsi_cmd *, struct iscsi_conn *);
 extern bool iscsit_conn_all_queues_empty(struct iscsi_conn *);
-- 
1.9.1

^ permalink raw reply related

* [PATCH 3/3] iser-target: Fix queue-full response handling
From: Nicholas A. Bellinger @ 2016-10-31  5:25 UTC (permalink / raw)
  To: target-devel
  Cc: linux-rdma, Potnuri Bharat Teja, Steve Wise, Sagi Grimberg,
	Nicholas Bellinger
In-Reply-To: <1477891554-26222-1-git-send-email-nab-IzHhD5pYlfBP7FQvKIMDCQ@public.gmane.org>

From: Nicholas Bellinger <nab-IzHhD5pYlfBP7FQvKIMDCQ@public.gmane.org>

This patch addresses two queue-full handling bugs in iser-target.

The first is propagating isert_rdma_rw_ctx_post() return back
to target-core via isert_put_datain() + isert_get_dataout()
callbacks, in order to trigger queue-full logic in target-core.
Note target-core expects -EAGAIN or -ENOMEM error to signal
RDMA WRITE/READ data-transfer callbacks should be retried,
after queue-full logic been invoked.

Other types of errors propagated up from RDMA RW API will result
in target-core generating internal CHECK_CONDITION status,
avoiding subsequent isert_put_datain() and isert_get_dataout()
iscsit_transport callback retry attempts.

The second is to use transport_generic_request_failure()
during T10-PI hw-offload errors in isert_rdma_write_done()
and isert_rdma_read_done(), so CHECK_CONDITION queue-full
is handled internally by target-core.

Also add isert_put_response() T10-PI failure case fixme in
isert_rdma_write_done(), which is currently not internally
retried or released until session reinstatement.

Reported-by: Potnuri Bharat Teja <bharat-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
Cc: Potnuri Bharat Teja <bharat-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
Reported-by: Steve Wise <swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
Cc: Steve Wise <swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
Cc: Sagi Grimberg <sagi-NQWnxTmZq1alnMjI0IkVqw@public.gmane.org>
Signed-off-by: Nicholas Bellinger <nab-IzHhD5pYlfBP7FQvKIMDCQ@public.gmane.org>
---
 drivers/infiniband/ulp/isert/ib_isert.c | 53 ++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 18 deletions(-)

diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index 6dd43f6..e658248 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -1666,10 +1666,23 @@
 	ret = isert_check_pi_status(cmd, isert_cmd->rw.sig->sig_mr);
 	isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn);
 
-	if (ret)
-		transport_send_check_condition_and_sense(cmd, cmd->pi_err, 0);
-	else
-		isert_put_response(isert_conn->conn, isert_cmd->iscsi_cmd);
+	if (ret) {
+		/*
+		 * transport_generic_request_failure() expects to have
+		 * plus two references to handle queue-full, so re-add
+		 * one here as target-core will have already dropped
+		 * it after the first isert_put_datain() callback.
+		 */
+		kref_get(&cmd->cmd_kref);
+		transport_generic_request_failure(cmd, cmd->pi_err);
+	} else {
+		/*
+		 * XXX: isert_put_response() failure is not retried.
+		 */
+		ret = isert_put_response(isert_conn->conn, isert_cmd->iscsi_cmd);
+		if (ret)
+			pr_warn_ratelimited("isert_put_response() ret: %d\n", ret);
+	}
 }
 
 static void
@@ -1706,13 +1719,15 @@
 	cmd->i_state = ISTATE_RECEIVED_LAST_DATAOUT;
 	spin_unlock_bh(&cmd->istate_lock);
 
-	if (ret) {
-		target_put_sess_cmd(se_cmd);
-		transport_send_check_condition_and_sense(se_cmd,
-							 se_cmd->pi_err, 0);
-	} else {
+	/*
+	 * transport_generic_request_failure() will drop the extra
+	 * se_cmd->cmd_kref reference after T10-PI error, and handle
+	 * any non-zero ->queue_status() callback error retries.
+	 */
+	if (ret)
+		transport_generic_request_failure(se_cmd, se_cmd->pi_err);
+	else
 		target_execute_cmd(se_cmd);
-	}
 }
 
 static void
@@ -2172,26 +2187,28 @@
 		chain_wr = &isert_cmd->tx_desc.send_wr;
 	}
 
-	isert_rdma_rw_ctx_post(isert_cmd, isert_conn, cqe, chain_wr);
-	isert_dbg("Cmd: %p posted RDMA_WRITE for iSER Data READ\n", isert_cmd);
-	return 1;
+	rc = isert_rdma_rw_ctx_post(isert_cmd, isert_conn, cqe, chain_wr);
+	isert_dbg("Cmd: %p posted RDMA_WRITE for iSER Data READ rc: %d\n",
+		  isert_cmd, rc);
+	return rc;
 }
 
 static int
 isert_get_dataout(struct iscsi_conn *conn, struct iscsi_cmd *cmd, bool recovery)
 {
 	struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+	int ret;
 
 	isert_dbg("Cmd: %p RDMA_READ data_length: %u write_data_done: %u\n",
 		 isert_cmd, cmd->se_cmd.data_length, cmd->write_data_done);
 
 	isert_cmd->tx_desc.tx_cqe.done = isert_rdma_read_done;
-	isert_rdma_rw_ctx_post(isert_cmd, conn->context,
-			&isert_cmd->tx_desc.tx_cqe, NULL);
+	ret = isert_rdma_rw_ctx_post(isert_cmd, conn->context,
+				     &isert_cmd->tx_desc.tx_cqe, NULL);
 
-	isert_dbg("Cmd: %p posted RDMA_READ memory for ISER Data WRITE\n",
-		 isert_cmd);
-	return 0;
+	isert_dbg("Cmd: %p posted RDMA_READ memory for ISER Data WRITE rc: %d\n",
+		 isert_cmd, ret);
+	return ret;
 }
 
 static int
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: [PATCH] qedr: Fix possible memory leak in qedr_create_qp()
From: Leon Romanovsky @ 2016-10-31  5:33 UTC (permalink / raw)
  To: Ram Amrani
  Cc: Wei Yongjun, Doug Ledford, Rajesh Borundia, Wei Yongjun,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1477672427-31575-1-git-send-email-weiyj.lk-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

[-- Attachment #1: Type: text/plain, Size: 2596 bytes --]

On Fri, Oct 28, 2016 at 04:33:47PM +0000, Wei Yongjun wrote:
> From: Wei Yongjun <weiyongjun1-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
>
> 'qp' is malloced in qedr_create_qp() and should be freed before leaving
> from the error handling cases, otherwise it will cause memory leak.
>
> Signed-off-by: Wei Yongjun <weiyongjun1-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>


Hi Ram,

While looking on this patch and associated code to it, I noticed the
following code stack:

qedr_create_qp
	-->
	dev->ops->rdma_destroy_qp(dev->rdma_ctx, qp->qed_qp);
		-->
		qed_rdma_destroy_qp
			-->
			qed_roce_destroy_qp
This function will check the QP state and return -INVAL and comment
that this QP needs to be prepared before destroying it.

However immediately after returning, you are calling to kfree(qp)
without any checks.

It looks like an error and it is worth to take a look on it.

And did I miss the fix to memory leak posted during code review?

Thanks

> ---
>  drivers/infiniband/hw/qedr/verbs.c | 12 ++++++++----
>  1 file changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
> index a615142..b60f145 100644
> --- a/drivers/infiniband/hw/qedr/verbs.c
> +++ b/drivers/infiniband/hw/qedr/verbs.c
> @@ -1477,6 +1477,7 @@ struct ib_qp *qedr_create_qp(struct ib_pd *ibpd,
>  	struct qedr_ucontext *ctx = NULL;
>  	struct qedr_create_qp_ureq ureq;
>  	struct qedr_qp *qp;
> +	struct ib_qp *ibqp;
>  	int rc = 0;
>
>  	DP_DEBUG(dev, QEDR_MSG_QP, "create qp: called from %s, pd=%p\n",
> @@ -1486,13 +1487,13 @@ struct ib_qp *qedr_create_qp(struct ib_pd *ibpd,
>  	if (rc)
>  		return ERR_PTR(rc);
>
> +	if (attrs->srq)
> +		return ERR_PTR(-EINVAL);
> +
>  	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
>  	if (!qp)
>  		return ERR_PTR(-ENOMEM);
>
> -	if (attrs->srq)
> -		return ERR_PTR(-EINVAL);
> -
>  	DP_DEBUG(dev, QEDR_MSG_QP,
>  		 "create qp: sq_cq=%p, sq_icid=%d, rq_cq=%p, rq_icid=%d\n",
>  		 get_qedr_cq(attrs->send_cq),
> @@ -1508,7 +1509,10 @@ struct ib_qp *qedr_create_qp(struct ib_pd *ibpd,
>  			       "create qp: unexpected udata when creating GSI QP\n");
>  			goto err0;
>  		}
> -		return qedr_create_gsi_qp(dev, attrs, qp);
> +		ibqp = qedr_create_gsi_qp(dev, attrs, qp);
> +		if (IS_ERR(ibqp))
> +			kfree(qp);
> +		return ibqp;
>  	}
>
>  	memset(&in_params, 0, sizeof(in_params));
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* Re: [PATCH] qedr: Fix missing unlock on error in qedr_post_send()
From: Leon Romanovsky @ 2016-10-31  5:38 UTC (permalink / raw)
  To: Wei Yongjun
  Cc: Doug Ledford, Sean Hefty, Hal Rosenstock, Ram Amrani,
	Rajesh Borundia, Wei Yongjun, linux-rdma-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1477672406-31487-1-git-send-email-weiyj.lk-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

[-- Attachment #1: Type: text/plain, Size: 1439 bytes --]

On Fri, Oct 28, 2016 at 04:33:26PM +0000, Wei Yongjun wrote:
> From: Wei Yongjun <weiyongjun1-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
>
> Add the missing unlock before return from function qedr_post_send()
> in the error handling case.
>
> Signed-off-by: Wei Yongjun <weiyongjun1-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
> ---
>  drivers/infiniband/hw/qedr/verbs.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
> index a615142..e7c7417 100644
> --- a/drivers/infiniband/hw/qedr/verbs.c
> +++ b/drivers/infiniband/hw/qedr/verbs.c
> @@ -2983,7 +2983,8 @@ int qedr_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
>
>  	if (!wr) {
>  		DP_ERR(dev, "Got an empty post send.\n");
> -		return -EINVAL;
> +		rc = -EINVAL;
> +		goto out_unlock;
>  	}

IMHO, this if needs to be moved to be before acquiring spinlock and
avoid introducing new labels for this one case only.

>
>  	while (wr) {
> @@ -3012,6 +3013,7 @@ int qedr_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
>  	/* Make sure write sticks */
>  	mmiowb();
>
> +out_unlock:
>  	spin_unlock_irqrestore(&qp->q_lock, flags);
>
>  	return rc;
>
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* Re: [PATCH rdma-rc 3/6] IB/core: Set routable RoCE gid type for ipv4/ipv6 networks
From: Leon Romanovsky @ 2016-10-31  5:50 UTC (permalink / raw)
  To: dledford-H+wXaHxf7aLQT0dZR+AlfA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Mark Bloch, Maor Gottlieb
In-Reply-To: <1477575391-20134-4-git-send-email-leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>

[-- Attachment #1: Type: text/plain, Size: 3071 bytes --]

On Thu, Oct 27, 2016 at 04:36:28PM +0300, Leon Romanovsky wrote:
> From: Mark Bloch <markb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>
> If the underlying netowrk type is ipv4 or ipv6 and the device supports
> routable RoCE, prefer it so the traffic could cross subnets.
>
> Signed-off-by: Mark Bloch <markb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> Signed-off-by: Maor Gottlieb <maorg-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> Signed-off-by: Leon Romanovsky <leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
> ---

Hi Doug,

Please take the following v1 of this patch where I fixed spelling error
from "netowrk" to be "network".

Thanks.

From 09f96ba3e9b4442cfb44dca04c6726e55525c9c3 Mon Sep 17 00:00:00 2001
From: Mark Bloch <markb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Date: Sun, 11 Sep 2016 06:25:10 +0000
Subject: [PATCH rdma-rc v1 3/6] IB/core: Set routable RoCE gid type for ipv4/ipv6
 networks

If the underlying network type is ipv4 or ipv6 and the device supports
routable RoCE, prefer it so the traffic could cross subnets.

Signed-off-by: Mark Bloch <markb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Maor Gottlieb <maorg-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Leon Romanovsky <leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
---
 Changed from v0:
 - fix spelling
---
 drivers/infiniband/core/cma.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 36bf50e..9ca0da0 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -2436,6 +2436,18 @@ static int iboe_tos_to_sl(struct net_device *ndev, int tos)
 	return 0;
 }

+static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type,
+					   unsigned long supported_gids,
+					   enum ib_gid_type default_gid)
+{
+	if ((network_type == RDMA_NETWORK_IPV4 ||
+	     network_type == RDMA_NETWORK_IPV6) &&
+	    test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids))
+		return IB_GID_TYPE_ROCE_UDP_ENCAP;
+
+	return default_gid;
+}
+
 static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
 {
 	struct rdma_route *route = &id_priv->id.route;
@@ -2461,6 +2473,8 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
 	route->num_paths = 1;

 	if (addr->dev_addr.bound_dev_if) {
+		unsigned long supported_gids;
+
 		ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if);
 		if (!ndev) {
 			ret = -ENODEV;
@@ -2484,7 +2498,12 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)

 		route->path_rec->net = &init_net;
 		route->path_rec->ifindex = ndev->ifindex;
-		route->path_rec->gid_type = id_priv->gid_type;
+		supported_gids = roce_gid_type_mask_support(id_priv->id.device,
+							    id_priv->id.port_num);
+		route->path_rec->gid_type =
+			cma_route_gid_type(addr->dev_addr.network,
+					   supported_gids,
+					   id_priv->gid_type);
 	}
 	if (!ndev) {
 		ret = -ENODEV;
--
2.7.4


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply related

* Re: [PATCH] rdma UAPI: Use __kernel_sockaddr_storage
From: Leon Romanovsky @ 2016-10-31  6:06 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Doug Ledford, linux-rdma-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1477587077-15410-1-git-send-email-jgunthorpe-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>

[-- Attachment #1: Type: text/plain, Size: 2081 bytes --]

On Thu, Oct 27, 2016 at 10:51:17AM -0600, Jason Gunthorpe wrote:
> The kernel side is #ifdef'd to this type, and the UAPI header
> should use it directly. It has slightly different alignment
> requirments from the usual user space version.
>
> Signed-off-by: Jason Gunthorpe <jgunthorpe-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>

Thanks,
Reviewed-by: Leon Romanovsky <leonro-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

> ---
>  include/uapi/rdma/rdma_user_cm.h | 12 ++++++------
>  1 file changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h
> index 01923d463673..d71da36e3cd6 100644
> --- a/include/uapi/rdma/rdma_user_cm.h
> +++ b/include/uapi/rdma/rdma_user_cm.h
> @@ -110,7 +110,7 @@ struct rdma_ucm_bind {
>  	__u32 id;
>  	__u16 addr_size;
>  	__u16 reserved;
> -	struct sockaddr_storage addr;
> +	struct __kernel_sockaddr_storage addr;
>  };
>
>  struct rdma_ucm_resolve_ip {
> @@ -126,8 +126,8 @@ struct rdma_ucm_resolve_addr {
>  	__u16 src_size;
>  	__u16 dst_size;
>  	__u32 reserved;
> -	struct sockaddr_storage src_addr;
> -	struct sockaddr_storage dst_addr;
> +	struct __kernel_sockaddr_storage src_addr;
> +	struct __kernel_sockaddr_storage dst_addr;
>  };
>
>  struct rdma_ucm_resolve_route {
> @@ -164,8 +164,8 @@ struct rdma_ucm_query_addr_resp {
>  	__u16 pkey;
>  	__u16 src_size;
>  	__u16 dst_size;
> -	struct sockaddr_storage src_addr;
> -	struct sockaddr_storage dst_addr;
> +	struct __kernel_sockaddr_storage src_addr;
> +	struct __kernel_sockaddr_storage dst_addr;
>  };
>
>  struct rdma_ucm_query_path_resp {
> @@ -257,7 +257,7 @@ struct rdma_ucm_join_mcast {
>  	__u32 id;
>  	__u16 addr_size;
>  	__u16 join_flags;
> -	struct sockaddr_storage addr;
> +	struct __kernel_sockaddr_storage addr;
>  };
>
>  struct rdma_ucm_get_event {
> --
> 2.1.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* Re: [PATCHv12 0/3] rdmacg: IB/core: rdma controller support
From: Leon Romanovsky @ 2016-10-31  6:54 UTC (permalink / raw)
  To: Parav Pandit
  Cc: Tejun Heo, cgroups-u79uwXL29TY76Z2rM5mHXA, linux-rdma, Li Zefan,
	Johannes Weiner, Doug Ledford, Christoph Hellwig, Liran Liss,
	Hefty, Sean, Jason Gunthorpe, Haggai Eran,
	james.l.morris-QHcLZuEGTsvQT0dZR+AlfA, Or Gerlitz, Matan Barak
In-Reply-To: <CAG53R5XkRKdo-SCaREZvov3AGp5MSd18RpQ+0HEu-htUzqwOOw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

[-- Attachment #1: Type: text/plain, Size: 2061 bytes --]

On Thu, Oct 20, 2016 at 01:48:27AM +0530, Parav Pandit wrote:
> On Thu, Oct 20, 2016 at 1:35 AM, Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org> wrote:
> > Hello, Parav.
> >
> > On Thu, Oct 20, 2016 at 01:24:42AM +0530, Parav Pandit wrote:
> >> userland can get the max numbers using other framework which is used
> >> by control & data plane available in C library form or in form of
> >> system tools.
> >> I was preferring to get and set through same interface because,
> >> It simplifies user land software which is often not written in C so
> >> its likely that it needs to rely on system tools and parse the
> >> content, iterate through devices etc.
> >> Getting these info through rdma.max just makes it simple. There will
> >> be logic built to read/write rdma.max in userland anyway, which can be
> >> leveraged for percentage calculation instead of doing it from two
> >> places.
> >
> > Yeah, I get that this can be convenient in this case but it isn't a
> > generic approach.  I'd much prefer keeping it in line with other
> > resources.
> >
> Hmm. we don't have /proc/sys/kernel/pid_max type of simple interface
> to get the max values for rdma resources.
> rdma.max is close to that simplicity.

Sorry for my late response (very long weekends and piles of mails after it) and
for not clarifying our requirements better, which are very simple.

1. We will have vendor specific vendors objects in the future (new ABI
support it and designed for that).
2. We don't want to fight for every addition of such objects to cgroup list.
3. We don't want to teach and/or rewrite scripts for "average" user after
addition of new objects.
4. Cgroup configuration should be as close as possible to "standard" if
such exists, so all infinite internet guides will work for RDMA too.

From my understanding of current status.
My naive approach of introducing GLOBAL_HCA object is the way to go and the real question
is to understand how to configure it, am I right?

>
>
>
>
> > Thanks.
> >
> > --
> > tejun

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply

* [PATCH 0/2] mm: remove get_user_pages_locked()
From: Lorenzo Stoakes @ 2016-10-31 10:02 UTC (permalink / raw)
  To: linux-mm
  Cc: devel, linux-ia64, Rik van Riel, Jan Kara, kvm, linux-cris-kernel,
	linux-rdma, Mel Gorman, Dave Hansen, Hugh Dickins, linux-kernel,
	dri-devel, Michal Hocko, Paolo Bonzini, Andrew Morton,
	Linus Torvalds, linux-media

by adding an int *locked parameter to get_user_pages() callers to this function
can now utilise VM_FAULT_RETRY functionality.

Taken in conjunction with the patch series adding the same parameter to
get_user_pages_remote() this means all slow-path get_user_pages*() functions
will now have the ability to utilise VM_FAULT_RETRY.

Additionally get_user_pages() and get_user_pages_remote() previously mirrored
one another in functionality differing only in the ability to specify task/mm,
this patch series reinstates this relationship.

This patch series should not introduce any functional changes.

Lorenzo Stoakes (2):
  mm: add locked parameter to get_user_pages()
  mm: remove get_user_pages_locked()

 arch/cris/arch-v32/drivers/cryptocop.c             |  2 +
 arch/ia64/kernel/err_inject.c                      |  2 +-
 arch/x86/mm/mpx.c                                  |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c            |  2 +-
 drivers/gpu/drm/radeon/radeon_ttm.c                |  2 +-
 drivers/gpu/drm/via/via_dmablit.c                  |  2 +-
 drivers/infiniband/core/umem.c                     |  2 +-
 drivers/infiniband/hw/mthca/mthca_memfree.c        |  3 +-
 drivers/infiniband/hw/qib/qib_user_pages.c         |  2 +-
 drivers/infiniband/hw/usnic/usnic_uiom.c           |  2 +-
 drivers/media/v4l2-core/videobuf-dma-sg.c          |  2 +-
 drivers/misc/mic/scif/scif_rma.c                   |  1 +
 drivers/misc/sgi-gru/grufault.c                    |  3 +-
 drivers/platform/goldfish/goldfish_pipe.c          |  2 +-
 drivers/rapidio/devices/rio_mport_cdev.c           |  2 +-
 .../interface/vchiq_arm/vchiq_2835_arm.c           |  3 +-
 .../vc04_services/interface/vchiq_arm/vchiq_arm.c  |  3 +-
 drivers/virt/fsl_hypervisor.c                      |  2 +-
 include/linux/mm.h                                 |  4 +-
 mm/frame_vector.c                                  |  4 +-
 mm/gup.c                                           | 62 ++++++++--------------
 mm/ksm.c                                           |  3 +-
 mm/mempolicy.c                                     |  2 +-
 mm/nommu.c                                         | 10 +---
 virt/kvm/kvm_main.c                                |  4 +-
 25 files changed, 55 insertions(+), 73 deletions(-)

^ permalink raw reply

* [PATCH 1/2] mm: add locked parameter to get_user_pages()
From: Lorenzo Stoakes @ 2016-10-31 10:02 UTC (permalink / raw)
  To: linux-mm
  Cc: devel, linux-ia64, Rik van Riel, Jan Kara, kvm, linux-cris-kernel,
	linux-rdma, Mel Gorman, Dave Hansen, Hugh Dickins, linux-kernel,
	dri-devel, Michal Hocko, Paolo Bonzini, Andrew Morton,
	Linus Torvalds, linux-media
In-Reply-To: <20161031100228.17917-1-lstoakes@gmail.com>

This patch adds an int *locked parameter to get_user_pages() to allow
VM_FAULT_RETRY faulting behaviour similar to get_user_pages_[un]locked().

It additionally clears the way for get_user_pages_locked() to be removed as its
sole remaining useful characteristic was to allow for VM_FAULT_RETRY behaviour
when faulting in pages.

It should not introduce any functional changes, however it does allow for
subsequent changes to get_user_pages() callers to take advantage of
VM_FAULT_RETRY.

Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
---
 arch/cris/arch-v32/drivers/cryptocop.c                            | 2 ++
 arch/ia64/kernel/err_inject.c                                     | 2 +-
 arch/x86/mm/mpx.c                                                 | 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c                           | 2 +-
 drivers/gpu/drm/radeon/radeon_ttm.c                               | 2 +-
 drivers/gpu/drm/via/via_dmablit.c                                 | 2 +-
 drivers/infiniband/core/umem.c                                    | 2 +-
 drivers/infiniband/hw/mthca/mthca_memfree.c                       | 3 ++-
 drivers/infiniband/hw/qib/qib_user_pages.c                        | 2 +-
 drivers/infiniband/hw/usnic/usnic_uiom.c                          | 2 +-
 drivers/media/v4l2-core/videobuf-dma-sg.c                         | 2 +-
 drivers/misc/mic/scif/scif_rma.c                                  | 1 +
 drivers/misc/sgi-gru/grufault.c                                   | 3 ++-
 drivers/platform/goldfish/goldfish_pipe.c                         | 2 +-
 drivers/rapidio/devices/rio_mport_cdev.c                          | 2 +-
 .../staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c    | 3 ++-
 drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c     | 3 ++-
 drivers/virt/fsl_hypervisor.c                                     | 2 +-
 include/linux/mm.h                                                | 2 +-
 mm/gup.c                                                          | 8 ++++----
 mm/ksm.c                                                          | 3 ++-
 mm/mempolicy.c                                                    | 2 +-
 mm/nommu.c                                                        | 4 ++--
 virt/kvm/kvm_main.c                                               | 4 ++--
 24 files changed, 35 insertions(+), 27 deletions(-)

diff --git a/arch/cris/arch-v32/drivers/cryptocop.c b/arch/cris/arch-v32/drivers/cryptocop.c
index 0068fd4..7444b45 100644
--- a/arch/cris/arch-v32/drivers/cryptocop.c
+++ b/arch/cris/arch-v32/drivers/cryptocop.c
@@ -2723,6 +2723,7 @@ static int cryptocop_ioctl_process(struct inode *inode, struct file *filp, unsig
 			     noinpages,
 			     0,  /* read access only for in data */
 			     inpages,
+			     NULL,
 			     NULL);
 
 	if (err < 0) {
@@ -2737,6 +2738,7 @@ static int cryptocop_ioctl_process(struct inode *inode, struct file *filp, unsig
 				     nooutpages,
 				     FOLL_WRITE, /* write access for out data */
 				     outpages,
+				     NULL,
 				     NULL);
 		up_read(&current->mm->mmap_sem);
 		if (err < 0) {
diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c
index 5ed0ea9..99f3fa2 100644
--- a/arch/ia64/kernel/err_inject.c
+++ b/arch/ia64/kernel/err_inject.c
@@ -142,7 +142,7 @@ store_virtual_to_phys(struct device *dev, struct device_attribute *attr,
 	u64 virt_addr=simple_strtoull(buf, NULL, 16);
 	int ret;
 
-	ret = get_user_pages(virt_addr, 1, FOLL_WRITE, NULL, NULL);
+	ret = get_user_pages(virt_addr, 1, FOLL_WRITE, NULL, NULL, NULL);
 	if (ret<=0) {
 #ifdef ERR_INJ_DEBUG
 		printk("Virtual address %lx is not existing.\n",virt_addr);
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index e4f8009..b74a7b2 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -546,7 +546,7 @@ static int mpx_resolve_fault(long __user *addr, int write)
 	int nr_pages = 1;
 
 	gup_ret = get_user_pages((unsigned long)addr, nr_pages,
-			write ? FOLL_WRITE : 0,	NULL, NULL);
+			write ? FOLL_WRITE : 0,	NULL, NULL, NULL);
 	/*
 	 * get_user_pages() returns number of pages gotten.
 	 * 0 means we failed to fault in and get anything,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index dcaf691..3d9a195 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -584,7 +584,7 @@ int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages)
 		list_add(&guptask.list, &gtt->guptasks);
 		spin_unlock(&gtt->guptasklock);
 
-		r = get_user_pages(userptr, num_pages, flags, p, NULL);
+		r = get_user_pages(userptr, num_pages, flags, p, NULL, NULL);
 
 		spin_lock(&gtt->guptasklock);
 		list_del(&guptask.list);
diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c
index 3de5e6e..8b0c069 100644
--- a/drivers/gpu/drm/radeon/radeon_ttm.c
+++ b/drivers/gpu/drm/radeon/radeon_ttm.c
@@ -567,7 +567,7 @@ static int radeon_ttm_tt_pin_userptr(struct ttm_tt *ttm)
 		struct page **pages = ttm->pages + pinned;
 
 		r = get_user_pages(userptr, num_pages, write ? FOLL_WRITE : 0,
-				   pages, NULL);
+				   pages, NULL, NULL);
 		if (r < 0)
 			goto release_pages;
 
diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c
index 1a3ad76..39877fe 100644
--- a/drivers/gpu/drm/via/via_dmablit.c
+++ b/drivers/gpu/drm/via/via_dmablit.c
@@ -242,7 +242,7 @@ via_lock_all_dma_pages(drm_via_sg_info_t *vsg,  drm_via_dmablit_t *xfer)
 	ret = get_user_pages((unsigned long)xfer->mem_addr,
 			     vsg->num_pages,
 			     (vsg->direction == DMA_FROM_DEVICE) ? FOLL_WRITE : 0,
-			     vsg->pages, NULL);
+			     vsg->pages, NULL, NULL);
 
 	up_read(&current->mm->mmap_sem);
 	if (ret != vsg->num_pages) {
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 224ad27..1d5278f 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -194,7 +194,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 		ret = get_user_pages(cur_base,
 				     min_t(unsigned long, npages,
 					   PAGE_SIZE / sizeof (struct page *)),
-				     gup_flags, page_list, vma_list);
+				     gup_flags, page_list, vma_list, NULL);
 
 		if (ret < 0)
 			goto out;
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c
index c6fe89d..45d037f 100644
--- a/drivers/infiniband/hw/mthca/mthca_memfree.c
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -472,7 +472,8 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
 		goto out;
 	}
 
-	ret = get_user_pages(uaddr & PAGE_MASK, 1, FOLL_WRITE, pages, NULL);
+	ret = get_user_pages(uaddr & PAGE_MASK, 1, FOLL_WRITE, pages, NULL,
+			NULL);
 	if (ret < 0)
 		goto out;
 
diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c
index 75f0862..433dc3a 100644
--- a/drivers/infiniband/hw/qib/qib_user_pages.c
+++ b/drivers/infiniband/hw/qib/qib_user_pages.c
@@ -69,7 +69,7 @@ static int __qib_get_user_pages(unsigned long start_page, size_t num_pages,
 		ret = get_user_pages(start_page + got * PAGE_SIZE,
 				     num_pages - got,
 				     FOLL_WRITE | FOLL_FORCE,
-				     p + got, NULL);
+				     p + got, NULL, NULL);
 		if (ret < 0)
 			goto bail_release;
 	}
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index 1ccee6e..68d583f 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -145,7 +145,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
 		ret = get_user_pages(cur_base,
 					min_t(unsigned long, npages,
 					PAGE_SIZE / sizeof(struct page *)),
-					gup_flags, page_list, NULL);
+					gup_flags, page_list, NULL, NULL);
 
 		if (ret < 0)
 			goto out;
diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c
index 1db0af6..66c24e6 100644
--- a/drivers/media/v4l2-core/videobuf-dma-sg.c
+++ b/drivers/media/v4l2-core/videobuf-dma-sg.c
@@ -186,7 +186,7 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma,
 		data, size, dma->nr_pages);
 
 	err = get_user_pages(data & PAGE_MASK, dma->nr_pages,
-			     flags, dma->pages, NULL);
+			     flags, dma->pages, NULL, NULL);
 
 	if (err != dma->nr_pages) {
 		dma->nr_pages = (err >= 0) ? err : 0;
diff --git a/drivers/misc/mic/scif/scif_rma.c b/drivers/misc/mic/scif/scif_rma.c
index f806a44..6719896 100644
--- a/drivers/misc/mic/scif/scif_rma.c
+++ b/drivers/misc/mic/scif/scif_rma.c
@@ -1398,6 +1398,7 @@ int __scif_pin_pages(void *addr, size_t len, int *out_prot,
 				nr_pages,
 				(prot & SCIF_PROT_WRITE) ? FOLL_WRITE : 0,
 				pinned_pages->pages,
+				NULL,
 				NULL);
 		up_write(&mm->mmap_sem);
 		if (nr_pages != pinned_pages->nr_pages) {
diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c
index 6fb773d..c6bda94 100644
--- a/drivers/misc/sgi-gru/grufault.c
+++ b/drivers/misc/sgi-gru/grufault.c
@@ -198,7 +198,8 @@ static int non_atomic_pte_lookup(struct vm_area_struct *vma,
 #else
 	*pageshift = PAGE_SHIFT;
 #endif
-	if (get_user_pages(vaddr, 1, write ? FOLL_WRITE : 0, &page, NULL) <= 0)
+	if (get_user_pages(vaddr, 1, write ? FOLL_WRITE : 0, &page, NULL, NULL)
+		<= 0)
 		return -EFAULT;
 	*paddr = page_to_phys(page);
 	put_page(page);
diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c
index 1aba2c7..bb5c0fe 100644
--- a/drivers/platform/goldfish/goldfish_pipe.c
+++ b/drivers/platform/goldfish/goldfish_pipe.c
@@ -310,7 +310,7 @@ static ssize_t goldfish_pipe_read_write(struct file *filp, char __user *buffer,
 		 */
 		down_read(&current->mm->mmap_sem);
 		ret = get_user_pages(address, 1, is_write ? 0 : FOLL_WRITE,
-				&page, NULL);
+				&page, NULL, NULL);
 		up_read(&current->mm->mmap_sem);
 		if (ret < 0)
 			break;
diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c
index 9013a58..0e507ca 100644
--- a/drivers/rapidio/devices/rio_mport_cdev.c
+++ b/drivers/rapidio/devices/rio_mport_cdev.c
@@ -894,7 +894,7 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode,
 				(unsigned long)xfer->loc_addr & PAGE_MASK,
 				nr_pages,
 				dir == DMA_FROM_DEVICE ? FOLL_WRITE : 0,
-				page_list, NULL);
+				page_list, NULL, NULL);
 		up_read(&current->mm->mmap_sem);
 
 		if (pinned != nr_pages) {
diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c
index 1091b9f..f3f0bab 100644
--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c
+++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_2835_arm.c
@@ -425,7 +425,8 @@ create_pagelist(char __user *buf, size_t count, unsigned short type,
 					  num_pages,
 					  (type == PAGELIST_READ) ? FOLL_WRITE : 0,
 					  pages,
-					  NULL /*vmas */);
+					  NULL /*vmas */,
+					  NULL /* locked */);
 		up_read(&task->mm->mmap_sem);
 
 		if (actual_pages != num_pages) {
diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c
index 7b6cd4d..86828d9 100644
--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c
+++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c
@@ -1479,7 +1479,8 @@ dump_phys_mem(void *virt_addr, uint32_t num_bytes)
 		num_pages,                /* len */
 		0,                        /* gup_flags */
 		pages,                    /* pages (array of page pointers) */
-		NULL);                    /* vmas */
+		NULL,                     /* vmas */
+		NULL);                    /* locked */
 	up_read(&current->mm->mmap_sem);
 
 	prev_idx = -1;
diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c
index 150ce2a..896bc31 100644
--- a/drivers/virt/fsl_hypervisor.c
+++ b/drivers/virt/fsl_hypervisor.c
@@ -246,7 +246,7 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p)
 	down_read(&current->mm->mmap_sem);
 	num_pinned = get_user_pages(param.local_vaddr - lb_offset,
 		num_pages, (param.source == -1) ? 0 : FOLL_WRITE,
-		pages, NULL);
+		pages, NULL, NULL);
 	up_read(&current->mm->mmap_sem);
 
 	if (num_pinned != num_pages) {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a92c8d7..396984e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1277,7 +1277,7 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
 			    struct vm_area_struct **vmas);
 long get_user_pages(unsigned long start, unsigned long nr_pages,
 			    unsigned int gup_flags, struct page **pages,
-			    struct vm_area_struct **vmas);
+			    struct vm_area_struct **vmas, int *locked);
 long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
 		    unsigned int gup_flags, struct page **pages, int *locked);
 long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
diff --git a/mm/gup.c b/mm/gup.c
index ec4f827..6b5539e 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -834,7 +834,7 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
  *
  *      down_read(&mm->mmap_sem);
  *      do_something()
- *      get_user_pages(tsk, mm, ..., pages, NULL);
+ *      get_user_pages(tsk, mm, ..., pages, NULL, NULL);
  *      up_read(&mm->mmap_sem);
  *
  *  to:
@@ -886,7 +886,7 @@ EXPORT_SYMBOL(__get_user_pages_unlocked);
  * get_user_pages_unlocked() is suitable to replace the form:
  *
  *      down_read(&mm->mmap_sem);
- *      get_user_pages(tsk, mm, ..., pages, NULL);
+ *      get_user_pages(tsk, mm, ..., pages, NULL, NULL);
  *      up_read(&mm->mmap_sem);
  *
  *  with:
@@ -979,10 +979,10 @@ EXPORT_SYMBOL(get_user_pages_remote);
  */
 long get_user_pages(unsigned long start, unsigned long nr_pages,
 		unsigned int gup_flags, struct page **pages,
-		struct vm_area_struct **vmas)
+		struct vm_area_struct **vmas, int *locked)
 {
 	return __get_user_pages_locked(current, current->mm, start, nr_pages,
-				       pages, vmas, NULL, false,
+				       pages, vmas, locked, true,
 				       gup_flags | FOLL_TOUCH);
 }
 EXPORT_SYMBOL(get_user_pages);
diff --git a/mm/ksm.c b/mm/ksm.c
index 9ae6011..b7468dd 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -358,7 +358,8 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
 /*
  * We use break_ksm to break COW on a ksm page: it's a stripped down
  *
- *	if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
+ *	if (get_user_pages(addr, 1, FOLL_WRITE | FOLL_FORCE, &page,
+ *		NULL, NULL) == 1)
  *		put_page(page);
  *
  * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0b859af..4be72c7 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -850,7 +850,7 @@ static int lookup_node(unsigned long addr)
 	struct page *p;
 	int err;
 
-	err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
+	err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL, NULL);
 	if (err >= 0) {
 		err = page_to_nid(p);
 		put_page(p);
diff --git a/mm/nommu.c b/mm/nommu.c
index 8b8faaf..82aaa33 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -161,7 +161,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
  */
 long get_user_pages(unsigned long start, unsigned long nr_pages,
 		    unsigned int gup_flags, struct page **pages,
-		    struct vm_area_struct **vmas)
+		    struct vm_area_struct **vmas, int *locked)
 {
 	return __get_user_pages(current, current->mm, start, nr_pages,
 				gup_flags, pages, vmas, NULL);
@@ -172,7 +172,7 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
 			    unsigned int gup_flags, struct page **pages,
 			    int *locked)
 {
-	return get_user_pages(start, nr_pages, gup_flags, pages, NULL);
+	return get_user_pages(start, nr_pages, gup_flags, pages, NULL, NULL);
 }
 EXPORT_SYMBOL(get_user_pages_locked);
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2907b7b..353bcb2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1351,14 +1351,14 @@ static int get_user_page_nowait(unsigned long start, int write,
 	if (write)
 		flags |= FOLL_WRITE;
 
-	return get_user_pages(start, 1, flags, page, NULL);
+	return get_user_pages(start, 1, flags, page, NULL, NULL);
 }
 
 static inline int check_user_page_hwpoison(unsigned long addr)
 {
 	int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
 
-	rc = get_user_pages(addr, 1, flags, NULL, NULL);
+	rc = get_user_pages(addr, 1, flags, NULL, NULL, NULL);
 	return rc == -EHWPOISON;
 }
 
-- 
2.10.1

^ permalink raw reply related

* [PATCH 2/2] mm: remove get_user_pages_locked()
From: Lorenzo Stoakes @ 2016-10-31 10:02 UTC (permalink / raw)
  To: linux-mm
  Cc: Linus Torvalds, Michal Hocko, Jan Kara, Hugh Dickins, Dave Hansen,
	Rik van Riel, Mel Gorman, Andrew Morton, Paolo Bonzini,
	linux-kernel, linux-cris-kernel, linux-ia64, dri-devel,
	linux-rdma, kvm, linux-media, devel, Lorenzo Stoakes
In-Reply-To: <20161031100228.17917-1-lstoakes@gmail.com>

get_user_pages() now has an int *locked parameter which renders
get_user_pages_locked() redundant, so remove it.

This patch should not introduce any functional changes.

Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
---
 include/linux/mm.h |  2 --
 mm/frame_vector.c  |  4 ++--
 mm/gup.c           | 56 +++++++++++++++++++-----------------------------------
 mm/nommu.c         |  8 --------
 4 files changed, 22 insertions(+), 48 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 396984e..4db3147 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1278,8 +1278,6 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
 long get_user_pages(unsigned long start, unsigned long nr_pages,
 			    unsigned int gup_flags, struct page **pages,
 			    struct vm_area_struct **vmas, int *locked);
-long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
-		    unsigned int gup_flags, struct page **pages, int *locked);
 long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
 			       unsigned long start, unsigned long nr_pages,
 			       struct page **pages, unsigned int gup_flags);
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
index db77dcb..c5ce1b1 100644
--- a/mm/frame_vector.c
+++ b/mm/frame_vector.c
@@ -55,8 +55,8 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
 	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
 		vec->got_ref = true;
 		vec->is_pfns = false;
-		ret = get_user_pages_locked(start, nr_frames,
-			gup_flags, (struct page **)(vec->ptrs), &locked);
+		ret = get_user_pages(start, nr_frames,
+			gup_flags, (struct page **)(vec->ptrs), NULL, &locked);
 		goto out;
 	}
 
diff --git a/mm/gup.c b/mm/gup.c
index 6b5539e..6cec693 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -826,37 +826,6 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
 }
 
 /*
- * We can leverage the VM_FAULT_RETRY functionality in the page fault
- * paths better by using either get_user_pages_locked() or
- * get_user_pages_unlocked().
- *
- * get_user_pages_locked() is suitable to replace the form:
- *
- *      down_read(&mm->mmap_sem);
- *      do_something()
- *      get_user_pages(tsk, mm, ..., pages, NULL, NULL);
- *      up_read(&mm->mmap_sem);
- *
- *  to:
- *
- *      int locked = 1;
- *      down_read(&mm->mmap_sem);
- *      do_something()
- *      get_user_pages_locked(tsk, mm, ..., pages, &locked);
- *      if (locked)
- *          up_read(&mm->mmap_sem);
- */
-long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
-			   unsigned int gup_flags, struct page **pages,
-			   int *locked)
-{
-	return __get_user_pages_locked(current, current->mm, start, nr_pages,
-				       pages, NULL, locked, true,
-				       gup_flags | FOLL_TOUCH);
-}
-EXPORT_SYMBOL(get_user_pages_locked);
-
-/*
  * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to
  * pass additional gup_flags as last parameter (like FOLL_HWPOISON).
  *
@@ -954,11 +923,6 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
  * use the correct cache flushing APIs.
  *
  * See also get_user_pages_fast, for performance critical applications.
- *
- * get_user_pages should be phased out in favor of
- * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
- * should use get_user_pages because it cannot pass
- * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
  */
 long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long start, unsigned long nr_pages,
@@ -976,6 +940,26 @@ EXPORT_SYMBOL(get_user_pages_remote);
  * less-flexible calling convention where we assume that the task
  * and mm being operated on are the current task's.  We also
  * obviously don't pass FOLL_REMOTE in here.
+ *
+ * We can leverage the VM_FAULT_RETRY functionality in the page fault
+ * paths better by using either get_user_pages()'s locked parameter or
+ * get_user_pages_unlocked().
+ *
+ * get_user_pages()'s locked parameter is suitable to replace the form:
+ *
+ *      down_read(&mm->mmap_sem);
+ *      do_something()
+ *      get_user_pages(tsk, mm, ..., pages, NULL, NULL);
+ *      up_read(&mm->mmap_sem);
+ *
+ *  to:
+ *
+ *      int locked = 1;
+ *      down_read(&mm->mmap_sem);
+ *      do_something()
+ *      get_user_pages(tsk, mm, ..., pages, NULL, &locked);
+ *      if (locked)
+ *          up_read(&mm->mmap_sem);
  */
 long get_user_pages(unsigned long start, unsigned long nr_pages,
 		unsigned int gup_flags, struct page **pages,
diff --git a/mm/nommu.c b/mm/nommu.c
index 82aaa33..3d38d40 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -168,14 +168,6 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
 }
 EXPORT_SYMBOL(get_user_pages);
 
-long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
-			    unsigned int gup_flags, struct page **pages,
-			    int *locked)
-{
-	return get_user_pages(start, nr_pages, gup_flags, pages, NULL, NULL);
-}
-EXPORT_SYMBOL(get_user_pages_locked);
-
 long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
 			       unsigned long start, unsigned long nr_pages,
 			       struct page **pages, unsigned int gup_flags)
-- 
2.10.1

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH rdma-next 1/2] net/mlx5: Report multi packet WQE capabilities
From: Leon Romanovsky @ 2016-10-31 10:15 UTC (permalink / raw)
  To: dledford-H+wXaHxf7aLQT0dZR+AlfA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Leon Romanovsky, Bodong Wang

From: Leon Romanovsky <leonro-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

Multi packet WQE enables sending multiple fix sized packets
using a single WQE. The exposed field reports such HW support.

Signed-off-by: Bodong Wang <bodong-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Leon Romanovsky <leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
---
 include/linux/mlx5/mlx5_ifc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 6045d4d..2def5b9 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -556,7 +556,7 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits {
 	u8         self_lb_en_modifiable[0x1];
 	u8         reserved_at_9[0x2];
 	u8         max_lso_cap[0x5];
-	u8         reserved_at_10[0x2];
+	u8         multi_pkt_send_wqe[0x2];
 	u8	   wqe_inline_mode[0x2];
 	u8         rss_ind_tbl_cap[0x4];
 	u8         reg_umr_sq[0x1];
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH rdma-next 2/2] IB/mlx5: Report mlx5 multi packet WQE capabilities when querying device
From: Leon Romanovsky @ 2016-10-31 10:15 UTC (permalink / raw)
  To: dledford-H+wXaHxf7aLQT0dZR+AlfA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Bodong Wang
In-Reply-To: <1477908921-12222-1-git-send-email-leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>

From: Bodong Wang <bodong-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

The capabilities whether hardware support multi packet WQE or not is
exposed to user space through query_device by uhw.

Signed-off-by: Bodong Wang <bodong-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Reviewed-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Leon Romanovsky <leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
---
 drivers/infiniband/hw/mlx5/main.c | 11 +++++++++++
 include/uapi/rdma/mlx5-abi.h      |  2 ++
 2 files changed, 13 insertions(+)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 2217477..9245b51 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -669,6 +669,17 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 			1 << MLX5_CAP_GEN(dev->mdev, log_max_rq);
 	}
 
+	if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes,
+			uhw->outlen)) {
+		resp.mlx5_ib_support_multi_pkt_send_wqes =
+			MLX5_CAP_ETH(mdev, multi_pkt_send_wqe);
+		resp.response_length +=
+			sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes);
+	}
+
+	if (field_avail(typeof(resp), reserved, uhw->outlen))
+		resp.response_length += sizeof(resp.reserved);
+
 	if (uhw->outlen) {
 		err = ib_copy_to_udata(uhw, &resp, resp.response_length);
 
diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h
index f5d0f4e..93d6b9f 100644
--- a/include/uapi/rdma/mlx5-abi.h
+++ b/include/uapi/rdma/mlx5-abi.h
@@ -129,6 +129,8 @@ struct mlx5_ib_query_device_resp {
 	__u32	response_length;
 	struct	mlx5_ib_tso_caps tso_caps;
 	struct	mlx5_ib_rss_caps rss_caps;
+	__u32	mlx5_ib_support_multi_pkt_send_wqes;
+	__u32	reserved;
 };
 
 struct mlx5_ib_create_cq {
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH rdma-next 1/2] IB/mlx5: Report mlx5 CQE compression capabilities when querying device
From: Leon Romanovsky @ 2016-10-31 10:16 UTC (permalink / raw)
  To: dledford-H+wXaHxf7aLQT0dZR+AlfA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Bodong Wang

From: Bodong Wang <bodong-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

The capabilities include:
- Max number of compressed and aggregated CQEs in a single session,
  while zero means unsupported.
- For Responder, there are two formats of mini CQE: mini CQE with Rx
  hash and mini CQE with checksum. They're mutual exclusive.

Signed-off-by: Bodong Wang <bodong-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Reviewed-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Leon Romanovsky <leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
---
 drivers/infiniband/hw/mlx5/main.c | 10 ++++++++++
 include/uapi/rdma/mlx5-abi.h      | 12 ++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 2217477..3a8d4b7 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -669,6 +669,16 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
 			1 << MLX5_CAP_GEN(dev->mdev, log_max_rq);
 	}
 
+	if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) {
+		resp.cqe_comp_caps.max_num =
+			MLX5_CAP_GEN(dev->mdev, cqe_compression) ?
+			MLX5_CAP_GEN(dev->mdev, cqe_compression_max_num) : 0;
+		resp.cqe_comp_caps.supported_format =
+			MLX5_IB_CQE_RES_FORMAT_HASH |
+			MLX5_IB_CQE_RES_FORMAT_CSUM;
+		resp.response_length += sizeof(resp.cqe_comp_caps);
+	}
+
 	if (uhw->outlen) {
 		err = ib_copy_to_udata(uhw, &resp, resp.response_length);
 
diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h
index f5d0f4e..24a7435 100644
--- a/include/uapi/rdma/mlx5-abi.h
+++ b/include/uapi/rdma/mlx5-abi.h
@@ -124,11 +124,23 @@ struct mlx5_ib_rss_caps {
 	__u8 reserved[7];
 };
 
+enum mlx5_ib_cqe_comp_res_format {
+	MLX5_IB_CQE_RES_FORMAT_HASH	= 1 << 0,
+	MLX5_IB_CQE_RES_FORMAT_CSUM	= 1 << 1,
+	MLX5_IB_CQE_RES_RESERVED	= 1 << 2,
+};
+
+struct mlx5_ib_cqe_comp_caps {
+	__u32 max_num;
+	__u32 supported_format; /* enum mlx5_ib_cqe_comp_res_format */
+};
+
 struct mlx5_ib_query_device_resp {
 	__u32	comp_mask;
 	__u32	response_length;
 	struct	mlx5_ib_tso_caps tso_caps;
 	struct	mlx5_ib_rss_caps rss_caps;
+	struct	mlx5_ib_cqe_comp_caps cqe_comp_caps;
 };
 
 struct mlx5_ib_create_cq {
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH rdma-next 2/2] IB/mlx5: Add support for CQE compressing
From: Leon Romanovsky @ 2016-10-31 10:16 UTC (permalink / raw)
  To: dledford-H+wXaHxf7aLQT0dZR+AlfA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Bodong Wang
In-Reply-To: <1477909005-12867-1-git-send-email-leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>

From: Bodong Wang <bodong-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

CQE compressing reduces PCI overhead by coalescing and compressing
multiple CQEs into a single merged CQE. Successful compressing
improves message rate especially for small packet traffic.

CQE compressing is supported for all 64B CQE formats (with certain
limitations) generated by RQ/Responder or by SQ/Requestor.

Signed-off-by: Bodong Wang <bodong-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Reviewed-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Leon Romanovsky <leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
---
 drivers/infiniband/hw/mlx5/cq.c | 30 +++++++++++++++++++++++++++++-
 include/uapi/rdma/mlx5-abi.h    |  4 +++-
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 79d017b..9ac36ef 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -731,7 +731,7 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
 			  int entries, u32 **cqb,
 			  int *cqe_size, int *index, int *inlen)
 {
-	struct mlx5_ib_create_cq ucmd;
+	struct mlx5_ib_create_cq ucmd = {};
 	size_t ucmdlen;
 	int page_shift;
 	__be64 *pas;
@@ -792,8 +792,36 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
 
 	*index = to_mucontext(context)->uuari.uars[0].index;
 
+	if (ucmd.cqe_comp_en == 1) {
+		if (unlikely((*cqe_size != 64) ||
+			     !MLX5_CAP_GEN(dev->mdev, cqe_compression))) {
+			err = -EOPNOTSUPP;
+			mlx5_ib_warn(dev, "CQE compression is not supported for size %d!\n",
+				     *cqe_size);
+			goto err_cqb;
+		}
+
+		if (unlikely(!ucmd.cqe_comp_res_format ||
+			     !(ucmd.cqe_comp_res_format <
+			       MLX5_IB_CQE_RES_RESERVED) ||
+			     (ucmd.cqe_comp_res_format &
+			      (ucmd.cqe_comp_res_format - 1)))) {
+			err = -EOPNOTSUPP;
+			mlx5_ib_warn(dev, "CQE compression res format %d is not supported!\n",
+				     ucmd.cqe_comp_res_format);
+			goto err_cqb;
+		}
+
+		MLX5_SET(cqc, cqc, cqe_comp_en, 1);
+		MLX5_SET(cqc, cqc, mini_cqe_res_format,
+			 ilog2(ucmd.cqe_comp_res_format));
+	}
+
 	return 0;
 
+err_cqb:
+	kfree(cqb);
+
 err_db:
 	mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db);
 
diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h
index 24a7435..208710b 100644
--- a/include/uapi/rdma/mlx5-abi.h
+++ b/include/uapi/rdma/mlx5-abi.h
@@ -147,7 +147,9 @@ struct mlx5_ib_create_cq {
 	__u64	buf_addr;
 	__u64	db_addr;
 	__u32	cqe_size;
-	__u32	reserved; /* explicit padding (optional on i386) */
+	__u8    cqe_comp_en;
+	__u8    cqe_comp_res_format;
+	__u16	reserved; /* explicit padding (optional on i386) */
 };
 
 struct mlx5_ib_create_cq_resp {
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox