Netdev List

Netdev List
 help / color / mirror / Atom feed

* [for-next 3/9] net/mlx5: Move the entry index allocator to flow group
From: Saeed Mahameed @ 2017-10-06 23:37 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	Leon Romanovsky, Maor Gottlieb, Saeed Mahameed
In-Reply-To: <20171006233749.25545-1-saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

From: Maor Gottlieb <maorg-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

When a new flow table entry is added, we search for a free index
in the flow group and not in the flow table, therefore we can move
the allocator from the flow table to the flow group.
In downstream patches it will enable us to lock smaller part
of the steering tree.

Signed-off-by: Maor Gottlieb <maorg-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Saeed Mahameed <saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 18 +++++++++---------
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h |  2 +-
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index f390828..2a0b556 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -384,7 +384,6 @@ static void del_flow_table(struct fs_node *node)
 	err = mlx5_cmd_destroy_flow_table(dev, ft);
 	if (err)
 		mlx5_core_warn(dev, "flow steering can't destroy ft\n");
-	ida_destroy(&ft->fte_allocator);
 	rhltable_destroy(&ft->fgs_hash);
 	fs_get_obj(prio, ft->node.parent);
 	prio->num_ft--;
@@ -445,7 +444,7 @@ static void destroy_fte(struct fs_fte *fte, struct mlx5_flow_group *fg)
 	WARN_ON(ret);
 	fte->status = 0;
 	fs_get_obj(ft, fg->node.parent);
-	ida_simple_remove(&ft->fte_allocator, fte->index);
+	ida_simple_remove(&fg->fte_allocator, fte->index - fg->start_index);
 }
 
 static void del_fte(struct fs_node *node)
@@ -488,6 +487,7 @@ static void del_flow_group(struct fs_node *node)
 		ft->autogroup.num_groups--;
 
 	rhashtable_destroy(&fg->ftes_hash);
+	ida_destroy(&fg->fte_allocator);
 	err = rhltable_remove(&ft->fgs_hash,
 			      &fg->hash,
 			      rhash_fg);
@@ -537,6 +537,7 @@ static struct mlx5_flow_group *alloc_flow_group(u32 *create_fg_in)
 		kfree(fg);
 		return ERR_PTR(ret);
 	}
+	ida_init(&fg->fte_allocator);
 	fg->mask.match_criteria_enable = match_criteria_enable;
 	memcpy(&fg->mask.match_criteria, match_criteria,
 	       sizeof(fg->mask.match_criteria));
@@ -575,7 +576,6 @@ static struct mlx5_flow_table *alloc_flow_table(int level, u16 vport, int max_ft
 	ft->flags = flags;
 	INIT_LIST_HEAD(&ft->fwd_rules);
 	mutex_init(&ft->lock);
-	ida_init(&ft->fte_allocator);
 
 	return ft;
 }
@@ -892,7 +892,6 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa
 destroy_ft:
 	mlx5_cmd_destroy_flow_table(root->dev, ft);
 free_ft:
-	ida_destroy(&ft->fte_allocator);
 	kfree(ft);
 unlock_root:
 	mutex_unlock(&root->chain_lock);
@@ -1003,6 +1002,7 @@ static struct mlx5_flow_group *create_flow_group_common(struct mlx5_flow_table *
 				rhash_fg));
 err_free_fg:
 	rhashtable_destroy(&fg->ftes_hash);
+	ida_destroy(&fg->fte_allocator);
 	kfree(fg);
 
 	return ERR_PTR(err);
@@ -1181,18 +1181,18 @@ static struct fs_fte *create_fte(struct mlx5_flow_group *fg,
 				 u32 *match_value,
 				 struct mlx5_flow_act *flow_act)
 {
-	struct mlx5_flow_table *ft;
 	struct fs_fte *fte;
 	int index;
 	int ret;
 
-	fs_get_obj(ft, fg->node.parent);
-	index = ida_simple_get(&ft->fte_allocator, fg->start_index,
-			       fg->start_index + fg->max_ftes,
+	index = ida_simple_get(&fg->fte_allocator, 0,
+			       fg->max_ftes,
 			       GFP_KERNEL);
 	if (index < 0)
 		return ERR_PTR(index);
 
+	index += fg->start_index;
+
 	fte = alloc_fte(flow_act, match_value, index);
 	if (IS_ERR(fte)) {
 		ret = PTR_ERR(fte);
@@ -1207,7 +1207,7 @@ static struct fs_fte *create_fte(struct mlx5_flow_group *fg,
 err_hash:
 	kfree(fte);
 err_alloc:
-	ida_simple_remove(&ft->fte_allocator, index);
+	ida_simple_remove(&fg->fte_allocator, index - fg->start_index);
 	return ERR_PTR(ret);
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 5509a752..02c969c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -119,7 +119,6 @@ struct mlx5_flow_table {
 	/* FWD rules that point on this flow table */
 	struct list_head		fwd_rules;
 	u32				flags;
-	struct ida			fte_allocator;
 	struct rhltable			fgs_hash;
 };
 
@@ -199,6 +198,7 @@ struct mlx5_flow_group {
 	struct mlx5_flow_group_mask	mask;
 	u32				start_index;
 	u32				max_ftes;
+	struct ida			fte_allocator;
 	u32				id;
 	struct rhashtable		ftes_hash;
 	struct rhlist_head		hash;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [for-next 6/9] net/mlx5: Replace fs_node mutex with reader/writer semaphore
From: Saeed Mahameed @ 2017-10-06 23:37 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	Leon Romanovsky, Maor Gottlieb, Saeed Mahameed
In-Reply-To: <20171006233749.25545-1-saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

From: Maor Gottlieb <maorg-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

Currently, the steering object is protected by a mutex lock; replace
the mutex lock with a reader/writer semaphore.
In this patch we still use only write semaphore. In downstream
patches we will switch part of the write locks to read locks.

Signed-off-by: Maor Gottlieb <maorg-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Saeed Mahameed <saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 28 +++++++++++------------
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h |  2 +-
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 41f26f4..9406e72 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -145,10 +145,10 @@ struct node_caps {
 	}
 };
 
-enum fs_i_mutex_lock_class {
-	FS_MUTEX_GRANDPARENT,
-	FS_MUTEX_PARENT,
-	FS_MUTEX_CHILD
+enum fs_i_lock_class {
+	FS_LOCK_GRANDPARENT,
+	FS_LOCK_PARENT,
+	FS_LOCK_CHILD
 };
 
 static const struct rhashtable_params rhash_fte = {
@@ -184,7 +184,7 @@ static void tree_init_node(struct fs_node *node,
 	atomic_set(&node->refcount, 1);
 	INIT_LIST_HEAD(&node->list);
 	INIT_LIST_HEAD(&node->children);
-	mutex_init(&node->lock);
+	init_rwsem(&node->lock);
 	node->remove_func = remove_func;
 	node->active = false;
 }
@@ -208,10 +208,10 @@ static void tree_get_node(struct fs_node *node)
 }
 
 static void nested_lock_ref_node(struct fs_node *node,
-				 enum fs_i_mutex_lock_class class)
+				 enum fs_i_lock_class class)
 {
 	if (node) {
-		mutex_lock_nested(&node->lock, class);
+		down_write_nested(&node->lock, class);
 		atomic_inc(&node->refcount);
 	}
 }
@@ -219,7 +219,7 @@ static void nested_lock_ref_node(struct fs_node *node,
 static void lock_ref_node(struct fs_node *node)
 {
 	if (node) {
-		mutex_lock(&node->lock);
+		down_write(&node->lock);
 		atomic_inc(&node->refcount);
 	}
 }
@@ -228,7 +228,7 @@ static void unlock_ref_node(struct fs_node *node)
 {
 	if (node) {
 		atomic_dec(&node->refcount);
-		mutex_unlock(&node->lock);
+		up_write(&node->lock);
 	}
 }
 
@@ -1376,7 +1376,7 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
 		int old_action;
 		int ret;
 
-		nested_lock_ref_node(&fte->node, FS_MUTEX_CHILD);
+		nested_lock_ref_node(&fte->node, FS_LOCK_CHILD);
 		ret = check_conflicting_ftes(fte, flow_act);
 		if (ret) {
 			handle = ERR_PTR(ret);
@@ -1400,7 +1400,7 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
 	fte = alloc_insert_fte(fg, match_value, flow_act);
 	if (IS_ERR(fte))
 		return (void *)fte;
-	nested_lock_ref_node(&fte->node, FS_MUTEX_CHILD);
+	nested_lock_ref_node(&fte->node, FS_LOCK_CHILD);
 	handle = add_rule_fte(fte, fg, dest, dest_num, false);
 	if (IS_ERR(handle)) {
 		unlock_ref_node(&fte->node);
@@ -1548,7 +1548,7 @@ static int build_match_list(struct match_list_head *match_head,
 		struct fs_fte *fte;
 
 		g = iter->g;
-		nested_lock_ref_node(&g->node, FS_MUTEX_PARENT);
+		nested_lock_ref_node(&g->node, FS_LOCK_PARENT);
 		fte = rhashtable_lookup_fast(&g->ftes_hash, spec->match_value,
 					     rhash_fte);
 		if (fte) {
@@ -1566,7 +1566,7 @@ static int build_match_list(struct match_list_head *match_head,
 	list_for_each_entry(iter, &match_head.list, list) {
 		g = iter->g;
 
-		nested_lock_ref_node(&g->node, FS_MUTEX_PARENT);
+		nested_lock_ref_node(&g->node, FS_LOCK_PARENT);
 		rule = add_rule_fg(g, spec->match_value,
 				   flow_act, dest, dest_num, NULL);
 		if (!IS_ERR(rule) || PTR_ERR(rule) != -ENOSPC) {
@@ -1605,7 +1605,7 @@ static int build_match_list(struct match_list_head *match_head,
 			return ERR_PTR(-EINVAL);
 	}
 
-	nested_lock_ref_node(&ft->node, FS_MUTEX_GRANDPARENT);
+	nested_lock_ref_node(&ft->node, FS_LOCK_GRANDPARENT);
 	rule = try_add_to_existing_fg(ft, spec, flow_act, dest, dest_num);
 	if (!IS_ERR(rule) || PTR_ERR(rule) != -ENOENT)
 		goto unlock;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 6e5d25b..b5c079f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -80,7 +80,7 @@ struct fs_node {
 	struct fs_node		*parent;
 	struct fs_node		*root;
 	/* lock the node for writing and traversing */
-	struct mutex		lock;
+	struct rw_semaphore	lock;
 	atomic_t		refcount;
 	bool			active;
 	void			(*remove_func)(struct fs_node *);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [for-next 8/9] net/mlx5: Allocate FTE object without lock
From: Saeed Mahameed @ 2017-10-06 23:37 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	Leon Romanovsky, Maor Gottlieb, Saeed Mahameed
In-Reply-To: <20171006233749.25545-1-saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

From: Maor Gottlieb <maorg-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

Allocation of a new FTE is a massive operation, and part of
it can be done without taking the flow group write lock.
Split the FTE allocation into two functions: one for the actions
which need to be done under the lock and one for those which don't.

Signed-off-by: Maor Gottlieb <maorg-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Saeed Mahameed <saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 92 +++++++++++------------
 1 file changed, 46 insertions(+), 46 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index e7301cf..bc4bbb7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -546,9 +546,33 @@ static void del_sw_flow_group(struct fs_node *node)
 	WARN_ON(err);
 }
 
-static struct fs_fte *alloc_fte(struct mlx5_flow_act *flow_act,
-				u32 *match_value,
-				unsigned int index)
+static int insert_fte(struct mlx5_flow_group *fg, struct fs_fte *fte)
+{
+	int index;
+	int ret;
+
+	index = ida_simple_get(&fg->fte_allocator, 0, fg->max_ftes, GFP_KERNEL);
+	if (index < 0)
+		return index;
+
+	fte->index = index + fg->start_index;
+	ret = rhashtable_insert_fast(&fg->ftes_hash,
+				     &fte->hash,
+				     rhash_fte);
+	if (ret)
+		goto err_ida_remove;
+
+	tree_add_node(&fte->node, &fg->node);
+	list_add_tail(&fte->node.list, &fg->node.children);
+	return 0;
+
+err_ida_remove:
+	ida_simple_remove(&fg->fte_allocator, index);
+	return ret;
+}
+
+static struct fs_fte *alloc_fte(u32 *match_value,
+				struct mlx5_flow_act *flow_act)
 {
 	struct fs_fte *fte;
 
@@ -559,51 +583,13 @@ static struct fs_fte *alloc_fte(struct mlx5_flow_act *flow_act,
 	memcpy(fte->val, match_value, sizeof(fte->val));
 	fte->node.type =  FS_TYPE_FLOW_ENTRY;
 	fte->flow_tag = flow_act->flow_tag;
-	fte->index = index;
 	fte->action = flow_act->action;
 	fte->encap_id = flow_act->encap_id;
 	fte->modify_id = flow_act->modify_id;
 
-	return fte;
-}
-
-static struct fs_fte *alloc_insert_fte(struct mlx5_flow_group *fg,
-				       u32 *match_value,
-				       struct mlx5_flow_act *flow_act)
-{
-	struct fs_fte *fte;
-	int index;
-	int ret;
-
-	index = ida_simple_get(&fg->fte_allocator, 0,
-			       fg->max_ftes,
-			       GFP_KERNEL);
-	if (index < 0)
-		return ERR_PTR(index);
-
-	fte = alloc_fte(flow_act, match_value, index + fg->start_index);
-	if (IS_ERR(fte)) {
-		ret = PTR_ERR(fte);
-		goto err_ida_remove;
-	}
-
-	ret = rhashtable_insert_fast(&fg->ftes_hash,
-				     &fte->hash,
-				     rhash_fte);
-	if (ret)
-		goto err_free;
-
 	tree_init_node(&fte->node, del_hw_fte, del_sw_fte);
-	tree_add_node(&fte->node, &fg->node);
-	list_add_tail(&fte->node.list, &fg->node.children);
 
 	return fte;
-
-err_free:
-	kfree(fte);
-err_ida_remove:
-	ida_simple_remove(&fg->fte_allocator, index);
-	return ERR_PTR(ret);
 }
 
 static void dealloc_flow_group(struct mlx5_flow_group *fg)
@@ -1589,6 +1575,11 @@ static u64 matched_fgs_get_version(struct list_head *match_head)
 	bool take_write = false;
 	struct fs_fte *fte;
 	u64  version;
+	int err;
+
+	fte = alloc_fte(spec->match_value, flow_act);
+	if (IS_ERR(fte))
+		return  ERR_PTR(-ENOMEM);
 
 	list_for_each_entry(iter, match_head, list) {
 		nested_down_read_ref_node(&iter->g->node, FS_LOCK_PARENT);
@@ -1620,6 +1611,7 @@ static u64 matched_fgs_get_version(struct list_head *match_head)
 				   flow_act, dest, dest_num, fte_tmp);
 		up_write_ref_node(&fte_tmp->node);
 		tree_put_node(&fte_tmp->node);
+		kfree(fte);
 		return rule;
 	}
 
@@ -1655,13 +1647,14 @@ static u64 matched_fgs_get_version(struct list_head *match_head)
 
 		if (!g->node.active)
 			continue;
-		fte = alloc_insert_fte(g, spec->match_value, flow_act);
-		if (IS_ERR(fte)) {
-			if (PTR_ERR(fte) == -ENOSPC)
+		err = insert_fte(g, fte);
+		if (err) {
+			if (err == -ENOSPC)
 				continue;
 			list_for_each_entry(iter, match_head, list)
 				up_write_ref_node(&iter->g->node);
-			return (void *)fte;
+			kfree(fte);
+			return ERR_PTR(err);
 		}
 
 		nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD);
@@ -1677,6 +1670,7 @@ static u64 matched_fgs_get_version(struct list_head *match_head)
 out:
 	list_for_each_entry(iter, match_head, list)
 		up_write_ref_node(&iter->g->node);
+	kfree(fte);
 	return rule;
 }
 
@@ -1746,12 +1740,18 @@ static u64 matched_fgs_get_version(struct list_head *match_head)
 	if (err)
 		goto err_release_fg;
 
-	fte = alloc_insert_fte(g, spec->match_value, flow_act);
+	fte = alloc_fte(spec->match_value, flow_act);
 	if (IS_ERR(fte)) {
 		err = PTR_ERR(fte);
 		goto err_release_fg;
 	}
 
+	err = insert_fte(g, fte);
+	if (err) {
+		kfree(fte);
+		goto err_release_fg;
+	}
+
 	nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD);
 	up_write_ref_node(&g->node);
 	rule = add_rule_fg(g, spec->match_value, flow_act, dest,
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [for-next 9/9] net/mlx5: Add FGs and FTEs memory pool
From: Saeed Mahameed @ 2017-10-06 23:37 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	Leon Romanovsky, Maor Gottlieb, Saeed Mahameed
In-Reply-To: <20171006233749.25545-1-saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

From: Maor Gottlieb <maorg-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

Add memory pool allocation for flow groups and flow
table entries.

It is useful because these objects are not small and could
be allocated/deallocated many times.

Signed-off-by: Maor Gottlieb <maorg-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Saeed Mahameed <saeedm-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 67 +++++++++++++++++------
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h |  2 +
 2 files changed, 53 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index bc4bbb7..7a136ae 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -269,8 +269,9 @@ static void tree_put_node(struct fs_node *node)
 			if (node->del_sw_func)
 				node->del_sw_func(node);
 			up_write_ref_node(parent_node);
+		} else {
+			kfree(node);
 		}
-		kfree(node);
 		node = NULL;
 	}
 	if (!node && parent_node)
@@ -389,6 +390,15 @@ static struct mlx5_flow_root_namespace *find_root(struct fs_node *node)
 	return container_of(ns, struct mlx5_flow_root_namespace, ns);
 }
 
+static inline struct mlx5_flow_steering *get_steering(struct fs_node *node)
+{
+	struct mlx5_flow_root_namespace *root = find_root(node);
+
+	if (root)
+		return root->dev->priv.steering;
+	return NULL;
+}
+
 static inline struct mlx5_core_dev *get_dev(struct fs_node *node)
 {
 	struct mlx5_flow_root_namespace *root = find_root(node);
@@ -424,6 +434,7 @@ static void del_sw_flow_table(struct fs_node *node)
 	rhltable_destroy(&ft->fgs_hash);
 	fs_get_obj(prio, ft->node.parent);
 	prio->num_ft--;
+	kfree(ft);
 }
 
 static void del_sw_hw_rule(struct fs_node *node)
@@ -469,6 +480,7 @@ static void del_sw_hw_rule(struct fs_node *node)
 				       "%s can't del rule fg id=%d fte_index=%d\n",
 				       __func__, fg->id, fte->index);
 	}
+	kfree(rule);
 }
 
 static void del_hw_fte(struct fs_node *node)
@@ -497,6 +509,7 @@ static void del_hw_fte(struct fs_node *node)
 
 static void del_sw_fte(struct fs_node *node)
 {
+	struct mlx5_flow_steering *steering = get_steering(node);
 	struct mlx5_flow_group *fg;
 	struct fs_fte *fte;
 	int err;
@@ -509,6 +522,7 @@ static void del_sw_fte(struct fs_node *node)
 				     rhash_fte);
 	WARN_ON(err);
 	ida_simple_remove(&fg->fte_allocator, fte->index - fg->start_index);
+	kmem_cache_free(steering->ftes_cache, fte);
 }
 
 static void del_hw_flow_group(struct fs_node *node)
@@ -529,6 +543,7 @@ static void del_hw_flow_group(struct fs_node *node)
 
 static void del_sw_flow_group(struct fs_node *node)
 {
+	struct mlx5_flow_steering *steering = get_steering(node);
 	struct mlx5_flow_group *fg;
 	struct mlx5_flow_table *ft;
 	int err;
@@ -544,6 +559,7 @@ static void del_sw_flow_group(struct fs_node *node)
 			      &fg->hash,
 			      rhash_fg);
 	WARN_ON(err);
+	kmem_cache_free(steering->fgs_cache, fg);
 }
 
 static int insert_fte(struct mlx5_flow_group *fg, struct fs_fte *fte)
@@ -571,12 +587,14 @@ static int insert_fte(struct mlx5_flow_group *fg, struct fs_fte *fte)
 	return ret;
 }
 
-static struct fs_fte *alloc_fte(u32 *match_value,
+static struct fs_fte *alloc_fte(struct mlx5_flow_table *ft,
+				u32 *match_value,
 				struct mlx5_flow_act *flow_act)
 {
+	struct mlx5_flow_steering *steering = get_steering(&ft->node);
 	struct fs_fte *fte;
 
-	fte = kzalloc(sizeof(*fte), GFP_KERNEL);
+	fte = kmem_cache_zalloc(steering->ftes_cache, GFP_KERNEL);
 	if (!fte)
 		return ERR_PTR(-ENOMEM);
 
@@ -592,13 +610,15 @@ static struct fs_fte *alloc_fte(u32 *match_value,
 	return fte;
 }
 
-static void dealloc_flow_group(struct mlx5_flow_group *fg)
+static void dealloc_flow_group(struct mlx5_flow_steering *steering,
+			       struct mlx5_flow_group *fg)
 {
 	rhashtable_destroy(&fg->ftes_hash);
-	kfree(fg);
+	kmem_cache_free(steering->fgs_cache, fg);
 }
 
-static struct mlx5_flow_group *alloc_flow_group(u8 match_criteria_enable,
+static struct mlx5_flow_group *alloc_flow_group(struct mlx5_flow_steering *steering,
+						u8 match_criteria_enable,
 						void *match_criteria,
 						int start_index,
 						int end_index)
@@ -606,13 +626,13 @@ static struct mlx5_flow_group *alloc_flow_group(u8 match_criteria_enable,
 	struct mlx5_flow_group *fg;
 	int ret;
 
-	fg = kzalloc(sizeof(*fg), GFP_KERNEL);
+	fg = kmem_cache_zalloc(steering->fgs_cache, GFP_KERNEL);
 	if (!fg)
 		return ERR_PTR(-ENOMEM);
 
 	ret = rhashtable_init(&fg->ftes_hash, &rhash_fte);
 	if (ret) {
-		kfree(fg);
+		kmem_cache_free(steering->fgs_cache, fg);
 		return ERR_PTR(ret);
 }
 	ida_init(&fg->fte_allocator);
@@ -633,10 +653,11 @@ static struct mlx5_flow_group *alloc_insert_flow_group(struct mlx5_flow_table *f
 						       int end_index,
 						       struct list_head *prev)
 {
+	struct mlx5_flow_steering *steering = get_steering(&ft->node);
 	struct mlx5_flow_group *fg;
 	int ret;
 
-	fg = alloc_flow_group(match_criteria_enable, match_criteria,
+	fg = alloc_flow_group(steering, match_criteria_enable, match_criteria,
 			      start_index, end_index);
 	if (IS_ERR(fg))
 		return fg;
@@ -646,7 +667,7 @@ static struct mlx5_flow_group *alloc_insert_flow_group(struct mlx5_flow_table *f
 			      &fg->hash,
 			      rhash_fg);
 	if (ret) {
-		dealloc_flow_group(fg);
+		dealloc_flow_group(steering, fg);
 		return ERR_PTR(ret);
 	}
 
@@ -1569,6 +1590,7 @@ static u64 matched_fgs_get_version(struct list_head *match_head)
 		       int dest_num,
 		       int ft_version)
 {
+	struct mlx5_flow_steering *steering = get_steering(&ft->node);
 	struct mlx5_flow_group *g;
 	struct mlx5_flow_handle *rule;
 	struct match_list *iter;
@@ -1577,7 +1599,7 @@ static u64 matched_fgs_get_version(struct list_head *match_head)
 	u64  version;
 	int err;
 
-	fte = alloc_fte(spec->match_value, flow_act);
+	fte = alloc_fte(ft, spec->match_value, flow_act);
 	if (IS_ERR(fte))
 		return  ERR_PTR(-ENOMEM);
 
@@ -1611,7 +1633,7 @@ static u64 matched_fgs_get_version(struct list_head *match_head)
 				   flow_act, dest, dest_num, fte_tmp);
 		up_write_ref_node(&fte_tmp->node);
 		tree_put_node(&fte_tmp->node);
-		kfree(fte);
+		kmem_cache_free(steering->ftes_cache, fte);
 		return rule;
 	}
 
@@ -1653,7 +1675,7 @@ static u64 matched_fgs_get_version(struct list_head *match_head)
 				continue;
 			list_for_each_entry(iter, match_head, list)
 				up_write_ref_node(&iter->g->node);
-			kfree(fte);
+			kmem_cache_free(steering->ftes_cache, fte);
 			return ERR_PTR(err);
 		}
 
@@ -1670,7 +1692,7 @@ static u64 matched_fgs_get_version(struct list_head *match_head)
 out:
 	list_for_each_entry(iter, match_head, list)
 		up_write_ref_node(&iter->g->node);
-	kfree(fte);
+	kmem_cache_free(steering->ftes_cache, fte);
 	return rule;
 }
 
@@ -1682,6 +1704,7 @@ static u64 matched_fgs_get_version(struct list_head *match_head)
 		     int dest_num)
 
 {
+	struct mlx5_flow_steering *steering = get_steering(&ft->node);
 	struct mlx5_flow_group *g;
 	struct mlx5_flow_handle *rule;
 	struct match_list_head match_head;
@@ -1740,7 +1763,7 @@ static u64 matched_fgs_get_version(struct list_head *match_head)
 	if (err)
 		goto err_release_fg;
 
-	fte = alloc_fte(spec->match_value, flow_act);
+	fte = alloc_fte(ft, spec->match_value, flow_act);
 	if (IS_ERR(fte)) {
 		err = PTR_ERR(fte);
 		goto err_release_fg;
@@ -1748,7 +1771,7 @@ static u64 matched_fgs_get_version(struct list_head *match_head)
 
 	err = insert_fte(g, fte);
 	if (err) {
-		kfree(fte);
+		kmem_cache_free(steering->ftes_cache, fte);
 		goto err_release_fg;
 	}
 
@@ -2281,6 +2304,8 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev)
 	cleanup_root_ns(steering->sniffer_rx_root_ns);
 	cleanup_root_ns(steering->sniffer_tx_root_ns);
 	mlx5_cleanup_fc_stats(dev);
+	kmem_cache_destroy(steering->ftes_cache);
+	kmem_cache_destroy(steering->fgs_cache);
 	kfree(steering);
 }
 
@@ -2386,6 +2411,16 @@ int mlx5_init_fs(struct mlx5_core_dev *dev)
 	steering->dev = dev;
 	dev->priv.steering = steering;
 
+	steering->fgs_cache = kmem_cache_create("mlx5_fs_fgs",
+						sizeof(struct mlx5_flow_group), 0,
+						0, NULL);
+	steering->ftes_cache = kmem_cache_create("mlx5_fs_ftes", sizeof(struct fs_fte), 0,
+						 0, NULL);
+	if (!steering->ftes_cache || !steering->fgs_cache) {
+		err = -ENOMEM;
+		goto err;
+	}
+
 	if ((((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH) &&
 	      (MLX5_CAP_GEN(dev, nic_flow_table))) ||
 	     ((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) &&
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 875b753..ebe1845 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -65,6 +65,8 @@ enum fs_fte_status {
 
 struct mlx5_flow_steering {
 	struct mlx5_core_dev *dev;
+	struct kmem_cache               *fgs_cache;
+	struct kmem_cache               *ftes_cache;
 	struct mlx5_flow_root_namespace *root_ns;
 	struct mlx5_flow_root_namespace *fdb_root_ns;
 	struct mlx5_flow_root_namespace *esw_egress_root_ns;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [pull request][for-next 0/9] Mellanox, mlx5 updates 2017-10-06
From: Saeed Mahameed @ 2017-10-06 23:37 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev, linux-rdma, Leon Romanovsky, Saeed Mahameed

Hi Dave and Doug,

This series includes some shared code updates for kernel 4.15 to both
net-next and rdma-next trees.

The series includes mlx5 low level flow steering updates and optimizations
to support firmware command parallelism for flow steering requests from
Maor Gottlieb and two other small fixes from Matan and Maor.

One fix from Matan adds error handling for when the destination
list of the flow steering rule is full.

Maor introduced a patch to avoid NULL pointer dereference on steering cleanup.

Then some refactoring patches needed by the series for code sharing purposes,
which also split the Flow Table Entry (FTE) and Flow Group (FG) creation code into two parts:
    1) Object allocation - allocate the steering node and initialize
    its resources.

    2) The firmware command execution.

This change will give us the ability to take write lock on the 
parent node (e.g. FG for FTE creation) only on the software data struct allocation
and creation part of the procedure where the synchronization is really required,
and will allow us to execute multiple firmware commands simultaneously and overcome the 
firmware bottleneck.

Refactor the locking scheme of the mlx5 core flow steering as follows:

1) Replace the mutex lock with readers-writers semaphore and take
    the write lock only when necessary (e.g. allocating a new flow
    table entry index or adding a node to the parent's children list).
    When we try to find a suitable child in the parent's children list
    (e.g. search for flow group with the same match_criteria of the rule)
    then we only take the read lock.

2) Add versioning mechanism - each steering entity (FT, FG, FTE, DST)
    will have an incremental version. The version is increased when the 
    entity is changed (e.g. when a new FTE was added to FG - the FG's
    version is increased).
    Versioning is used in order to determine if the last traverse of an
    entity's children is valid or a rescan under write lock is required.

The last patch adds a memory pool for FGs and FTEs. It is useful because these
objects are not small and could be allocated/deallocated many times.

This support improves the insertion rate of steering rules
from ~5k/sec to ~40k/sec.

Please pull and let me know if there's any problem.

Thanks,
Saeed

---

The following changes since commit e19b205be43d11bff638cad4487008c48d21c103:

  Linux 4.14-rc2 (2017-09-24 16:38:56 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git tags/mlx5-updates-2017-10-06

for you to fetch changes up to a369d4ac4dff92129ea0dfa3d66f45a830e29098:

  net/mlx5: Add FGs and FTEs memory pool (2017-09-26 20:52:05 +0300)

----------------------------------------------------------------
Maor Gottlieb (8):
      net/mlx5: Avoid NULL pointer dereference on steering cleanup
      net/mlx5: Move the entry index allocator to flow group
      net/mlx5: Export building of matched flow groups list
      net/mlx5: Refactor FTE and FG creation code
      net/mlx5: Replace fs_node mutex with reader/writer semaphore
      net/mlx5: Support multiple updates of steering rules in parallel
      net/mlx5: Allocate FTE object without lock
      net/mlx5: Add FGs and FTEs memory pool

Matan Barak (1):
      net/mlx5: Fix creating a new FTE when an existing but full FTE exists

 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 853 ++++++++++++++--------
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h |  11 +-
 2 files changed, 547 insertions(+), 317 deletions(-)

^ permalink raw reply

* [for-next 1/9] net/mlx5: Fix creating a new FTE when an existing but full FTE exists
From: Saeed Mahameed @ 2017-10-06 23:37 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev, linux-rdma, Leon Romanovsky, Matan Barak, Saeed Mahameed
In-Reply-To: <20171006233749.25545-1-saeedm@mellanox.com>

From: Matan Barak <matanb@mellanox.com>

Currently, when a flow steering rule is added, we look for a FTE with
an identical value. If we find a match, we try to merge the required
destinations with the existing ones. In a case where the existing
destination list is full, the code should return an error to its
consumer. However, the current code just tries to create another FTE.
Fixing that by returning an error in this special scenario.

Fixes: f478be79a22e ("net/mlx5: Add hash table for flow groups in flow table")
Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 5a7bea6..6ffe925 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1449,7 +1449,7 @@ static bool dest_is_valid(struct mlx5_flow_destination *dest,
 		       int dest_num)
 {
 	struct mlx5_flow_group *g;
-	struct mlx5_flow_handle *rule = ERR_PTR(-ENOENT);
+	struct mlx5_flow_handle *rule;
 	struct rhlist_head *tmp, *list;
 	struct match_list {
 		struct list_head	list;
@@ -1513,6 +1513,8 @@ static bool dest_is_valid(struct mlx5_flow_destination *dest,
 		unlock_ref_node(&g->node);
 	}

+	rule = ERR_PTR(-ENOENT);
+
 free_list:
 	if (!list_empty(&match_head)) {
 		struct match_list *match_tmp;
@@ -1553,7 +1555,7 @@ static bool dest_is_valid(struct mlx5_flow_destination *dest,

 	nested_lock_ref_node(&ft->node, FS_MUTEX_GRANDPARENT);
 	rule = try_add_to_existing_fg(ft, spec, flow_act, dest, dest_num);
-	if (!IS_ERR(rule))
+	if (!IS_ERR(rule) || PTR_ERR(rule) != -ENOENT)
 		goto unlock;

 	g = create_autogroup(ft, spec->match_criteria_enable,
-- 
1.8.3.1

^ permalink raw reply related

* [for-next 5/9] net/mlx5: Refactor FTE and FG creation code
From: Saeed Mahameed @ 2017-10-06 23:37 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev, linux-rdma, Leon Romanovsky, Maor Gottlieb,
	Saeed Mahameed
In-Reply-To: <20171006233749.25545-1-saeedm@mellanox.com>

From: Maor Gottlieb <maorg@mellanox.com>

Split the creation code into two parts:
1) Object allocation - allocate the steering node and initialize
its resources.

2) The firmware command execution.

Add an active flag to each node. This flag indicates whether the
object exists in the hardware; if it does not, we don't free
the hardware resource in the error flow.

This change will give us the ability to take the write lock on the
parent node (e.g. the FG for FTE creation) only during the first part.

Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 356 ++++++++++++----------
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h |   1 +
 2 files changed, 190 insertions(+), 167 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 33bcaca..41f26f4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -179,14 +179,14 @@ static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1,
 	       struct mlx5_flow_destination *dest);
 
 static void tree_init_node(struct fs_node *node,
-			   unsigned int refcount,
 			   void (*remove_func)(struct fs_node *))
 {
-	atomic_set(&node->refcount, refcount);
+	atomic_set(&node->refcount, 1);
 	INIT_LIST_HEAD(&node->list);
 	INIT_LIST_HEAD(&node->children);
 	mutex_init(&node->lock);
 	node->remove_func = remove_func;
+	node->active = false;
 }
 
 static void tree_add_node(struct fs_node *node, struct fs_node *parent)
@@ -381,9 +381,11 @@ static void del_flow_table(struct fs_node *node)
 	fs_get_obj(ft, node);
 	dev = get_dev(&ft->node);
 
-	err = mlx5_cmd_destroy_flow_table(dev, ft);
-	if (err)
-		mlx5_core_warn(dev, "flow steering can't destroy ft\n");
+	if (node->active) {
+		err = mlx5_cmd_destroy_flow_table(dev, ft);
+		if (err)
+			mlx5_core_warn(dev, "flow steering can't destroy ft\n");
+	}
 	rhltable_destroy(&ft->fgs_hash);
 	fs_get_obj(prio, ft->node.parent);
 	prio->num_ft--;
@@ -435,18 +437,6 @@ static void del_rule(struct fs_node *node)
 	}
 }
 
-static void destroy_fte(struct fs_fte *fte, struct mlx5_flow_group *fg)
-{
-	struct mlx5_flow_table *ft;
-	int ret;
-
-	ret = rhashtable_remove_fast(&fg->ftes_hash, &fte->hash, rhash_fte);
-	WARN_ON(ret);
-	fte->status = 0;
-	fs_get_obj(ft, fg->node.parent);
-	ida_simple_remove(&fg->fte_allocator, fte->index - fg->start_index);
-}
-
 static void del_fte(struct fs_node *node)
 {
 	struct mlx5_flow_table *ft;
@@ -461,14 +451,20 @@ static void del_fte(struct fs_node *node)
 	trace_mlx5_fs_del_fte(fte);
 
 	dev = get_dev(&ft->node);
-	err = mlx5_cmd_delete_fte(dev, ft,
-				  fte->index);
-	if (err)
-		mlx5_core_warn(dev,
-			       "flow steering can't delete fte in index %d of flow group id %d\n",
-			       fte->index, fg->id);
+	if (node->active) {
+		err = mlx5_cmd_delete_fte(dev, ft,
+					  fte->index);
+		if (err)
+			mlx5_core_warn(dev,
+				       "flow steering can't delete fte in index %d of flow group id %d\n",
+				       fte->index, fg->id);
+	}
 
-	destroy_fte(fte, fg);
+	err = rhashtable_remove_fast(&fg->ftes_hash,
+				     &fte->hash,
+				     rhash_fte);
+	WARN_ON(err);
+	ida_simple_remove(&fg->fte_allocator, fte->index - fg->start_index);
 }
 
 static void del_flow_group(struct fs_node *node)
@@ -492,7 +488,7 @@ static void del_flow_group(struct fs_node *node)
 			      &fg->hash,
 			      rhash_fg);
 	WARN_ON(err);
-	if (mlx5_cmd_destroy_flow_group(dev, ft, fg->id))
+	if (fg->node.active && mlx5_cmd_destroy_flow_group(dev, ft, fg->id))
 		mlx5_core_warn(dev, "flow steering can't destroy fg %d of ft %d\n",
 			       fg->id, ft->id);
 }
@@ -518,14 +514,57 @@ static struct fs_fte *alloc_fte(struct mlx5_flow_act *flow_act,
 	return fte;
 }
 
-static struct mlx5_flow_group *alloc_flow_group(u32 *create_fg_in)
+static struct fs_fte *alloc_insert_fte(struct mlx5_flow_group *fg,
+				       u32 *match_value,
+				       struct mlx5_flow_act *flow_act)
+{
+	struct fs_fte *fte;
+	int index;
+	int ret;
+
+	index = ida_simple_get(&fg->fte_allocator, 0,
+			       fg->max_ftes,
+			       GFP_KERNEL);
+	if (index < 0)
+		return ERR_PTR(index);
+
+	fte = alloc_fte(flow_act, match_value, index + fg->start_index);
+	if (IS_ERR(fte)) {
+		ret = PTR_ERR(fte);
+		goto err_ida_remove;
+	}
+
+	ret = rhashtable_insert_fast(&fg->ftes_hash,
+				     &fte->hash,
+				     rhash_fte);
+	if (ret)
+		goto err_free;
+
+	tree_init_node(&fte->node, del_fte);
+	tree_add_node(&fte->node, &fg->node);
+	list_add_tail(&fte->node.list, &fg->node.children);
+
+	return fte;
+
+err_free:
+	kfree(fte);
+err_ida_remove:
+	ida_simple_remove(&fg->fte_allocator, index);
+	return ERR_PTR(ret);
+}
+
+static void dealloc_flow_group(struct mlx5_flow_group *fg)
+{
+	rhashtable_destroy(&fg->ftes_hash);
+	kfree(fg);
+}
+
+static struct mlx5_flow_group *alloc_flow_group(u8 match_criteria_enable,
+						void *match_criteria,
+						int start_index,
+						int end_index)
 {
 	struct mlx5_flow_group *fg;
-	void *match_criteria = MLX5_ADDR_OF(create_flow_group_in,
-					    create_fg_in, match_criteria);
-	u8 match_criteria_enable = MLX5_GET(create_flow_group_in,
-					    create_fg_in,
-					    match_criteria_enable);
 	int ret;
 
 	fg = kzalloc(sizeof(*fg), GFP_KERNEL);
@@ -536,16 +575,47 @@ static struct mlx5_flow_group *alloc_flow_group(u32 *create_fg_in)
 	if (ret) {
 		kfree(fg);
 		return ERR_PTR(ret);
-	}
+}
 	ida_init(&fg->fte_allocator);
 	fg->mask.match_criteria_enable = match_criteria_enable;
 	memcpy(&fg->mask.match_criteria, match_criteria,
 	       sizeof(fg->mask.match_criteria));
 	fg->node.type =  FS_TYPE_FLOW_GROUP;
-	fg->start_index = MLX5_GET(create_flow_group_in, create_fg_in,
-				   start_flow_index);
-	fg->max_ftes = MLX5_GET(create_flow_group_in, create_fg_in,
-				end_flow_index) - fg->start_index + 1;
+	fg->start_index = start_index;
+	fg->max_ftes = end_index - start_index + 1;
+
+	return fg;
+}
+
+static struct mlx5_flow_group *alloc_insert_flow_group(struct mlx5_flow_table *ft,
+						       u8 match_criteria_enable,
+						       void *match_criteria,
+						       int start_index,
+						       int end_index,
+						       struct list_head *prev)
+{
+	struct mlx5_flow_group *fg;
+	int ret;
+
+	fg = alloc_flow_group(match_criteria_enable, match_criteria,
+			      start_index, end_index);
+	if (IS_ERR(fg))
+		return fg;
+
+	/* initialize refcnt, add to parent list */
+	ret = rhltable_insert(&ft->fgs_hash,
+			      &fg->hash,
+			      rhash_fg);
+	if (ret) {
+		dealloc_flow_group(fg);
+		return ERR_PTR(ret);
+	}
+
+	tree_init_node(&fg->node, del_flow_group);
+	tree_add_node(&fg->node, &ft->node);
+	/* Add node to group list */
+	list_add(&fg->node.list, prev);
+
 	return fg;
 }
 
@@ -870,7 +940,7 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa
 		goto unlock_root;
 	}
 
-	tree_init_node(&ft->node, 1, del_flow_table);
+	tree_init_node(&ft->node, del_flow_table);
 	log_table_sz = ft->max_fte ? ilog2(ft->max_fte) : 0;
 	next_ft = find_next_chained_ft(fs_prio);
 	err = mlx5_cmd_create_flow_table(root->dev, ft->vport, ft->op_mod, ft->type,
@@ -882,6 +952,7 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa
 	err = connect_flow_table(root->dev, ft, fs_prio);
 	if (err)
 		goto destroy_ft;
+	ft->node.active = true;
 	lock_ref_node(&fs_prio->node);
 	tree_add_node(&ft->node, &fs_prio->node);
 	list_add_flow_table(ft, fs_prio);
@@ -959,55 +1030,6 @@ struct mlx5_flow_table*
 }
 EXPORT_SYMBOL(mlx5_create_auto_grouped_flow_table);
 
-/* Flow table should be locked */
-static struct mlx5_flow_group *create_flow_group_common(struct mlx5_flow_table *ft,
-							u32 *fg_in,
-							struct list_head
-							*prev_fg,
-							bool is_auto_fg)
-{
-	struct mlx5_flow_group *fg;
-	struct mlx5_core_dev *dev = get_dev(&ft->node);
-	int err;
-
-	if (!dev)
-		return ERR_PTR(-ENODEV);
-
-	fg = alloc_flow_group(fg_in);
-	if (IS_ERR(fg))
-		return fg;
-
-	err = rhltable_insert(&ft->fgs_hash, &fg->hash, rhash_fg);
-	if (err)
-		goto err_free_fg;
-
-	err = mlx5_cmd_create_flow_group(dev, ft, fg_in, &fg->id);
-	if (err)
-		goto err_remove_fg;
-
-	if (ft->autogroup.active)
-		ft->autogroup.num_groups++;
-	/* Add node to tree */
-	tree_init_node(&fg->node, !is_auto_fg, del_flow_group);
-	tree_add_node(&fg->node, &ft->node);
-	/* Add node to group list */
-	list_add(&fg->node.list, prev_fg);
-
-	trace_mlx5_fs_add_fg(fg);
-	return fg;
-
-err_remove_fg:
-	WARN_ON(rhltable_remove(&ft->fgs_hash,
-				&fg->hash,
-				rhash_fg));
-err_free_fg:
-	rhashtable_destroy(&fg->ftes_hash);
-	ida_destroy(&fg->fte_allocator);
-	kfree(fg);
-
-	return ERR_PTR(err);
-}
-
 struct mlx5_flow_group *mlx5_create_flow_group(struct mlx5_flow_table *ft,
 					       u32 *fg_in)
 {
@@ -1016,7 +1038,13 @@ struct mlx5_flow_group *mlx5_create_flow_group(struct mlx5_flow_table *ft,
 	u8 match_criteria_enable = MLX5_GET(create_flow_group_in,
 					    fg_in,
 					    match_criteria_enable);
+	int start_index = MLX5_GET(create_flow_group_in, fg_in,
+				   start_flow_index);
+	int end_index = MLX5_GET(create_flow_group_in, fg_in,
+				 end_flow_index);
+	struct mlx5_core_dev *dev = get_dev(&ft->node);
 	struct mlx5_flow_group *fg;
+	int err;
 
 	if (!check_valid_mask(match_criteria_enable, match_criteria))
 		return ERR_PTR(-EINVAL);
@@ -1025,8 +1053,20 @@ struct mlx5_flow_group *mlx5_create_flow_group(struct mlx5_flow_table *ft,
 		return ERR_PTR(-EPERM);
 
 	lock_ref_node(&ft->node);
-	fg = create_flow_group_common(ft, fg_in, ft->node.children.prev, false);
+	fg = alloc_insert_flow_group(ft, match_criteria_enable, match_criteria,
+				     start_index, end_index,
+				     ft->node.children.prev);
 	unlock_ref_node(&ft->node);
+	if (IS_ERR(fg))
+		return fg;
+
+	err = mlx5_cmd_create_flow_group(dev, ft, fg_in, &fg->id);
+	if (err) {
+		tree_put_node(&fg->node);
+		return ERR_PTR(err);
+	}
+	trace_mlx5_fs_add_fg(fg);
+	fg->node.active = true;
 
 	return fg;
 }
@@ -1111,7 +1151,7 @@ static void destroy_flow_handle(struct fs_fte *fte,
 		/* Add dest to dests list- we need flow tables to be in the
 		 * end of the list for forward to next prio rules.
 		 */
-		tree_init_node(&rule->node, 1, del_rule);
+		tree_init_node(&rule->node, del_rule);
 		if (dest &&
 		    dest[i].type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE)
 			list_add(&rule->node.list, &fte->node.children);
@@ -1167,6 +1207,7 @@ static void destroy_flow_handle(struct fs_fte *fte,
 	if (err)
 		goto free_handle;
 
+	fte->node.active = true;
 	fte->status |= FS_FTE_STATUS_EXISTING;
 
 out:
@@ -1177,59 +1218,17 @@ static void destroy_flow_handle(struct fs_fte *fte,
 	return ERR_PTR(err);
 }
 
-static struct fs_fte *create_fte(struct mlx5_flow_group *fg,
-				 u32 *match_value,
-				 struct mlx5_flow_act *flow_act)
+static struct mlx5_flow_group *alloc_auto_flow_group(struct mlx5_flow_table  *ft,
+						     struct mlx5_flow_spec *spec)
 {
-	struct fs_fte *fte;
-	int index;
-	int ret;
-
-	index = ida_simple_get(&fg->fte_allocator, 0,
-			       fg->max_ftes,
-			       GFP_KERNEL);
-	if (index < 0)
-		return ERR_PTR(index);
-
-	index += fg->start_index;
-
-	fte = alloc_fte(flow_act, match_value, index);
-	if (IS_ERR(fte)) {
-		ret = PTR_ERR(fte);
-		goto err_alloc;
-	}
-	ret = rhashtable_insert_fast(&fg->ftes_hash, &fte->hash, rhash_fte);
-	if (ret)
-		goto err_hash;
-
-	return fte;
-
-err_hash:
-	kfree(fte);
-err_alloc:
-	ida_simple_remove(&fg->fte_allocator, index - fg->start_index);
-	return ERR_PTR(ret);
-}
-
-static struct mlx5_flow_group *create_autogroup(struct mlx5_flow_table *ft,
-						u8 match_criteria_enable,
-						u32 *match_criteria)
-{
-	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
 	struct list_head *prev = &ft->node.children;
-	unsigned int candidate_index = 0;
 	struct mlx5_flow_group *fg;
-	void *match_criteria_addr;
+	unsigned int candidate_index = 0;
 	unsigned int group_size = 0;
-	u32 *in;
 
 	if (!ft->autogroup.active)
 		return ERR_PTR(-ENOENT);
 
-	in = kvzalloc(inlen, GFP_KERNEL);
-	if (!in)
-		return ERR_PTR(-ENOMEM);
-
 	if (ft->autogroup.num_groups < ft->autogroup.required_groups)
 		/* We save place for flow groups in addition to max types */
 		group_size = ft->max_fte / (ft->autogroup.required_groups + 1);
@@ -1247,25 +1246,55 @@ static struct mlx5_flow_group *create_autogroup(struct mlx5_flow_table *ft,
 		prev = &fg->node.list;
 	}
 
-	if (candidate_index + group_size > ft->max_fte) {
-		fg = ERR_PTR(-ENOSPC);
+	if (candidate_index + group_size > ft->max_fte)
+		return ERR_PTR(-ENOSPC);
+
+	fg = alloc_insert_flow_group(ft,
+				     spec->match_criteria_enable,
+				     spec->match_criteria,
+				     candidate_index,
+				     candidate_index + group_size - 1,
+				     prev);
+	if (IS_ERR(fg))
 		goto out;
-	}
+
+	ft->autogroup.num_groups++;
+
+out:
+	return fg;
+}
+
+static int create_auto_flow_group(struct mlx5_flow_table *ft,
+				  struct mlx5_flow_group *fg)
+{
+	struct mlx5_core_dev *dev = get_dev(&ft->node);
+	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+	void *match_criteria_addr;
+	int err;
+	u32 *in;
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
 
 	MLX5_SET(create_flow_group_in, in, match_criteria_enable,
-		 match_criteria_enable);
-	MLX5_SET(create_flow_group_in, in, start_flow_index, candidate_index);
-	MLX5_SET(create_flow_group_in, in, end_flow_index,   candidate_index +
-		 group_size - 1);
+		 fg->mask.match_criteria_enable);
+	MLX5_SET(create_flow_group_in, in, start_flow_index, fg->start_index);
+	MLX5_SET(create_flow_group_in, in, end_flow_index,   fg->start_index +
+		 fg->max_ftes - 1);
 	match_criteria_addr = MLX5_ADDR_OF(create_flow_group_in,
 					   in, match_criteria);
-	memcpy(match_criteria_addr, match_criteria,
-	       MLX5_ST_SZ_BYTES(fte_match_param));
+	memcpy(match_criteria_addr, fg->mask.match_criteria,
+	       sizeof(fg->mask.match_criteria));
+
+	err = mlx5_cmd_create_flow_group(dev, ft, in, &fg->id);
+	if (!err) {
+		fg->node.active = true;
+		trace_mlx5_fs_add_fg(fg);
+	}
 
-	fg = create_flow_group_common(ft, in, prev, true);
-out:
 	kvfree(in);
-	return fg;
+	return err;
 }
 
 static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1,
@@ -1368,23 +1397,17 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
 	}
 	fs_get_obj(ft, fg->node.parent);
 
-	fte = create_fte(fg, match_value, flow_act);
+	fte = alloc_insert_fte(fg, match_value, flow_act);
 	if (IS_ERR(fte))
 		return (void *)fte;
-	tree_init_node(&fte->node, 0, del_fte);
 	nested_lock_ref_node(&fte->node, FS_MUTEX_CHILD);
 	handle = add_rule_fte(fte, fg, dest, dest_num, false);
 	if (IS_ERR(handle)) {
 		unlock_ref_node(&fte->node);
-		destroy_fte(fte, fg);
-		kfree(fte);
+		tree_put_node(&fte->node);
 		return handle;
 	}
 
-	tree_add_node(&fte->node, &fg->node);
-	/* fte list isn't sorted */
-	list_add_tail(&fte->node.list, &fg->node.children);
-	trace_mlx5_fs_set_fte(fte, true);
 add_rules:
 	for (i = 0; i < handle->num_rules; i++) {
 		if (atomic_read(&handle->rule[i]->node.refcount) == 1) {
@@ -1571,6 +1594,7 @@ static int build_match_list(struct match_list_head *match_head,
 {
 	struct mlx5_flow_group *g;
 	struct mlx5_flow_handle *rule;
+	int err;
 	int i;
 
 	if (!check_valid_spec(spec))
@@ -1586,24 +1610,22 @@ static int build_match_list(struct match_list_head *match_head,
 	if (!IS_ERR(rule) || PTR_ERR(rule) != -ENOENT)
 		goto unlock;
 
-	g = create_autogroup(ft, spec->match_criteria_enable,
-			     spec->match_criteria);
+	g = alloc_auto_flow_group(ft, spec);
 	if (IS_ERR(g)) {
 		rule = (void *)g;
 		goto unlock;
 	}
 
+	err = create_auto_flow_group(ft, g);
+	if (err) {
+		rule = ERR_PTR(err);
+		goto put_fg;
+	}
+
 	rule = add_rule_fg(g, spec->match_value, flow_act, dest,
 			   dest_num, NULL);
-	if (IS_ERR(rule)) {
-		/* Remove assumes refcount > 0 and autogroup creates a group
-		 * with a refcount = 0.
-		 */
-		unlock_ref_node(&ft->node);
-		tree_get_node(&g->node);
-		tree_remove_node(&g->node);
-		return rule;
-	}
+put_fg:
+	tree_put_node(&g->node);
 unlock:
 	unlock_ref_node(&ft->node);
 	return rule;
@@ -1847,7 +1869,7 @@ static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns,
 		return ERR_PTR(-ENOMEM);
 
 	fs_prio->node.type = FS_TYPE_PRIO;
-	tree_init_node(&fs_prio->node, 1, NULL);
+	tree_init_node(&fs_prio->node, NULL);
 	tree_add_node(&fs_prio->node, &ns->node);
 	fs_prio->num_levels = num_levels;
 	fs_prio->prio = prio;
@@ -1873,7 +1895,7 @@ static struct mlx5_flow_namespace *fs_create_namespace(struct fs_prio *prio)
 		return ERR_PTR(-ENOMEM);
 
 	fs_init_namespace(ns);
-	tree_init_node(&ns->node, 1, NULL);
+	tree_init_node(&ns->node, NULL);
 	tree_add_node(&ns->node, &prio->node);
 	list_add_tail(&ns->node.list, &prio->node.children);
 
@@ -1998,7 +2020,7 @@ static struct mlx5_flow_root_namespace *create_root_ns(struct mlx5_flow_steering
 	ns = &root_ns->ns;
 	fs_init_namespace(ns);
 	mutex_init(&root_ns->chain_lock);
-	tree_init_node(&ns->node, 1, NULL);
+	tree_init_node(&ns->node, NULL);
 	tree_add_node(&ns->node, NULL);
 
 	return root_ns;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index 02c969c..6e5d25b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -82,6 +82,7 @@ struct fs_node {
 	/* lock the node for writing and traversing */
 	struct mutex		lock;
 	atomic_t		refcount;
+	bool			active;
 	void			(*remove_func)(struct fs_node *);
 };
 
-- 
1.8.3.1

^ permalink raw reply related

* [for-next 4/9] net/mlx5: Export building of matched flow groups list
From: Saeed Mahameed @ 2017-10-06 23:37 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev, linux-rdma, Leon Romanovsky, Maor Gottlieb,
	Saeed Mahameed
In-Reply-To: <20171006233749.25545-1-saeedm@mellanox.com>

From: Maor Gottlieb <maorg@mellanox.com>

Refactor the code and export the building of the matched flow groups
list to a separate function.

Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 100 ++++++++++++++--------
 1 file changed, 64 insertions(+), 36 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 2a0b556..33bcaca 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -1441,47 +1441,87 @@ static bool dest_is_valid(struct mlx5_flow_destination *dest,
 	return true;
 }
 
-static struct mlx5_flow_handle *
-try_add_to_existing_fg(struct mlx5_flow_table *ft,
-		       struct mlx5_flow_spec *spec,
-		       struct mlx5_flow_act *flow_act,
-		       struct mlx5_flow_destination *dest,
-		       int dest_num)
-{
+struct match_list {
+	struct list_head	list;
 	struct mlx5_flow_group *g;
-	struct mlx5_flow_handle *rule;
+};
+
+struct match_list_head {
+	struct list_head  list;
+	struct match_list first;
+};
+
+static void free_match_list(struct match_list_head *head)
+{
+	if (!list_empty(&head->list)) {
+		struct match_list *iter, *match_tmp;
+
+		list_del(&head->first.list);
+		list_for_each_entry_safe(iter, match_tmp, &head->list,
+					 list) {
+			list_del(&iter->list);
+			kfree(iter);
+		}
+	}
+}
+
+static int build_match_list(struct match_list_head *match_head,
+			    struct mlx5_flow_table *ft,
+			    struct mlx5_flow_spec *spec)
+{
 	struct rhlist_head *tmp, *list;
-	struct match_list {
-		struct list_head	list;
-		struct mlx5_flow_group *g;
-	} match_list, *iter;
-	LIST_HEAD(match_head);
+	struct mlx5_flow_group *g;
+	int err = 0;
 
 	rcu_read_lock();
+	INIT_LIST_HEAD(&match_head->list);
 	/* Collect all fgs which has a matching match_criteria */
 	list = rhltable_lookup(&ft->fgs_hash, spec, rhash_fg);
+	/* RCU is atomic, we can't execute FW commands here */
 	rhl_for_each_entry_rcu(g, tmp, list, hash) {
 		struct match_list *curr_match;
 
-		if (likely(list_empty(&match_head))) {
-			match_list.g = g;
-			list_add_tail(&match_list.list, &match_head);
+		if (likely(list_empty(&match_head->list))) {
+			match_head->first.g = g;
+			list_add_tail(&match_head->first.list,
+				      &match_head->list);
 			continue;
 		}
-		curr_match = kmalloc(sizeof(*curr_match), GFP_ATOMIC);
 
+		curr_match = kmalloc(sizeof(*curr_match), GFP_ATOMIC);
 		if (!curr_match) {
-			rcu_read_unlock();
-			rule = ERR_PTR(-ENOMEM);
-			goto free_list;
+			free_match_list(match_head);
+			err = -ENOMEM;
+			goto out;
 		}
 		curr_match->g = g;
-		list_add_tail(&curr_match->list, &match_head);
+		list_add_tail(&curr_match->list, &match_head->list);
 	}
+out:
 	rcu_read_unlock();
+	return err;
+}
+
+static struct mlx5_flow_handle *
+try_add_to_existing_fg(struct mlx5_flow_table *ft,
+		       struct mlx5_flow_spec *spec,
+		       struct mlx5_flow_act *flow_act,
+		       struct mlx5_flow_destination *dest,
+		       int dest_num)
+{
+	struct mlx5_flow_group *g;
+	struct mlx5_flow_handle *rule;
+	struct match_list_head match_head;
+	struct match_list *iter;
+	int err;
+
+	/* Collect all fgs which has a matching match_criteria */
+	err = build_match_list(&match_head, ft, spec);
+	if (err)
+		return ERR_PTR(err);
 
 	/* Try to find a fg that already contains a matching fte */
-	list_for_each_entry(iter, &match_head, list) {
+	list_for_each_entry(iter, &match_head.list, list) {
 		struct fs_fte *fte;
 
 		g = iter->g;
@@ -1500,7 +1540,7 @@ static bool dest_is_valid(struct mlx5_flow_destination *dest,
 	/* No group with matching fte found. Try to add a new fte to any
 	 * matching fg.
 	 */
-	list_for_each_entry(iter, &match_head, list) {
+	list_for_each_entry(iter, &match_head.list, list) {
 		g = iter->g;
 
 		nested_lock_ref_node(&g->node, FS_MUTEX_PARENT);
@@ -1516,19 +1556,7 @@ static bool dest_is_valid(struct mlx5_flow_destination *dest,
 	rule = ERR_PTR(-ENOENT);
 
 free_list:
-	if (!list_empty(&match_head)) {
-		struct match_list *match_tmp;
-
-		/* The most common case is having one FG. Since we want to
-		 * optimize this case, we save the first on the stack.
-		 * Therefore, no need to free it.
-		 */
-		list_del(&list_first_entry(&match_head, typeof(*iter), list)->list);
-		list_for_each_entry_safe(iter, match_tmp, &match_head, list) {
-			list_del(&iter->list);
-			kfree(iter);
-		}
-	}
+	free_match_list(&match_head);
 
 	return rule;
 }
-- 
1.8.3.1

^ permalink raw reply related

* [for-next 7/9] net/mlx5: Support multiple updates of steering rules in parallel
From: Saeed Mahameed @ 2017-10-06 23:37 UTC (permalink / raw)
  To: David S. Miller, Doug Ledford
  Cc: netdev, linux-rdma, Leon Romanovsky, Maor Gottlieb,
	Saeed Mahameed
In-Reply-To: <20171006233749.25545-1-saeedm@mellanox.com>

From: Maor Gottlieb <maorg@mellanox.com>

Most of the time spent adding a new flow steering rule
goes to executing the firmware command.
The most common action is adding a new flow steering entry.
In order to improve the update rate, we parallelize the
commands by doing the following:

1) Replace the mutex lock with a readers-writers semaphore and take
the write lock only when necessary (e.g. allocating a new flow
table entry index or adding a node to the parent's children list).
When we try to find a suitable child in the parent's children list
(e.g. search for a flow group with the same match_criteria as the rule),
we only take the read lock.

2) Add a versioning mechanism - each steering entity (FT, FG, FTE, DST)
will have an incremental version. The version is increased when the
entity is changed (e.g. when a new FTE is added to an FG, the FG's
version is increased).
Versioning is used in order to determine if the last traversal of an
entity's children is valid or a rescan under the write lock is required.

This support improves the insertion rate of steering rules
from ~5k/sec to ~40k/sec.

Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 386 +++++++++++++++-------
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.h |   4 +-
 2 files changed, 264 insertions(+), 126 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 9406e72..e7301cf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -168,10 +168,16 @@ enum fs_i_lock_class {
 
 };
 
-static void del_rule(struct fs_node *node);
-static void del_flow_table(struct fs_node *node);
-static void del_flow_group(struct fs_node *node);
-static void del_fte(struct fs_node *node);
+static void del_hw_flow_table(struct fs_node *node);
+static void del_hw_flow_group(struct fs_node *node);
+static void del_hw_fte(struct fs_node *node);
+static void del_sw_flow_table(struct fs_node *node);
+static void del_sw_flow_group(struct fs_node *node);
+static void del_sw_fte(struct fs_node *node);
+/* Delete rule (destination) is special case that 
+ * requires to lock the FTE for all the deletion process.
+ */
+static void del_sw_hw_rule(struct fs_node *node);
 static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1,
 				struct mlx5_flow_destination *d2);
 static struct mlx5_flow_rule *
@@ -179,13 +185,15 @@ static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1,
 	       struct mlx5_flow_destination *dest);
 
 static void tree_init_node(struct fs_node *node,
-			   void (*remove_func)(struct fs_node *))
+			   void (*del_hw_func)(struct fs_node *),
+			   void (*del_sw_func)(struct fs_node *))
 {
 	atomic_set(&node->refcount, 1);
 	INIT_LIST_HEAD(&node->list);
 	INIT_LIST_HEAD(&node->children);
 	init_rwsem(&node->lock);
-	node->remove_func = remove_func;
+	node->del_hw_func = del_hw_func;
+	node->del_sw_func = del_sw_func;
 	node->active = false;
 }
 
@@ -202,50 +210,69 @@ static void tree_add_node(struct fs_node *node, struct fs_node *parent)
 		node->root = parent->root;
 }
 
-static void tree_get_node(struct fs_node *node)
+static int tree_get_node(struct fs_node *node)
 {
-	atomic_inc(&node->refcount);
+	return atomic_add_unless(&node->refcount, 1, 0);
 }
 
-static void nested_lock_ref_node(struct fs_node *node,
-				 enum fs_i_lock_class class)
+static void nested_down_read_ref_node(struct fs_node *node,
+				      enum fs_i_lock_class class)
 {
 	if (node) {
-		down_write_nested(&node->lock, class);
+		down_read_nested(&node->lock, class);
 		atomic_inc(&node->refcount);
 	}
 }
 
-static void lock_ref_node(struct fs_node *node)
+static void nested_down_write_ref_node(struct fs_node *node,
+				       enum fs_i_lock_class class)
 {
 	if (node) {
-		down_write(&node->lock);
+		down_write_nested(&node->lock, class);
 		atomic_inc(&node->refcount);
 	}
 }
 
-static void unlock_ref_node(struct fs_node *node)
+static void down_write_ref_node(struct fs_node *node)
 {
 	if (node) {
-		atomic_dec(&node->refcount);
-		up_write(&node->lock);
+		down_write(&node->lock);
+		atomic_inc(&node->refcount);
 	}
 }
 
+static void up_read_ref_node(struct fs_node *node)
+{
+	atomic_dec(&node->refcount);
+	up_read(&node->lock);
+}
+
+static void up_write_ref_node(struct fs_node *node)
+{
+	atomic_dec(&node->refcount);
+	up_write(&node->lock);
+}
+
 static void tree_put_node(struct fs_node *node)
 {
 	struct fs_node *parent_node = node->parent;
 
-	lock_ref_node(parent_node);
 	if (atomic_dec_and_test(&node->refcount)) {
-		if (parent_node)
+		if (node->del_hw_func)
+			node->del_hw_func(node);
+		if (parent_node) {
+			/* Only root namespace doesn't have parent and we just
+			 * need to free its node.
+			 */
+			down_write_ref_node(parent_node);
 			list_del_init(&node->list);
-		if (node->remove_func)
-			node->remove_func(node);
+			if (node->del_sw_func)
+				node->del_sw_func(node);
+			up_write_ref_node(parent_node);
+		}
 		kfree(node);
 		node = NULL;
 	}
-	unlock_ref_node(parent_node);
 	if (!node && parent_node)
 		tree_put_node(parent_node);
 }
@@ -371,11 +398,10 @@ static inline struct mlx5_core_dev *get_dev(struct fs_node *node)
 	return NULL;
 }
 
-static void del_flow_table(struct fs_node *node)
+static void del_hw_flow_table(struct fs_node *node)
 {
 	struct mlx5_flow_table *ft;
 	struct mlx5_core_dev *dev;
-	struct fs_prio *prio;
 	int err;
 
 	fs_get_obj(ft, node);
@@ -386,12 +412,21 @@ static void del_flow_table(struct fs_node *node)
 		if (err)
 			mlx5_core_warn(dev, "flow steering can't destroy ft\n");
 	}
+}
+
+static void del_sw_flow_table(struct fs_node *node)
+{
+	struct mlx5_flow_table *ft;
+	struct fs_prio *prio;
+
+	fs_get_obj(ft, node);
+
 	rhltable_destroy(&ft->fgs_hash);
 	fs_get_obj(prio, ft->node.parent);
 	prio->num_ft--;
 }
 
-static void del_rule(struct fs_node *node)
+static void del_sw_hw_rule(struct fs_node *node)
 {
 	struct mlx5_flow_rule *rule;
 	struct mlx5_flow_table *ft;
@@ -407,7 +442,6 @@ static void del_rule(struct fs_node *node)
 	fs_get_obj(fg, fte->node.parent);
 	fs_get_obj(ft, fg->node.parent);
 	trace_mlx5_fs_del_rule(rule);
-	list_del(&rule->node.list);
 	if (rule->sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) {
 		mutex_lock(&rule->dest_attr.ft->lock);
 		list_del(&rule->next_ft);
@@ -437,7 +471,7 @@ static void del_rule(struct fs_node *node)
 	}
 }
 
-static void del_fte(struct fs_node *node)
+static void del_hw_fte(struct fs_node *node)
 {
 	struct mlx5_flow_table *ft;
 	struct mlx5_flow_group *fg;
@@ -448,8 +482,8 @@ static void del_fte(struct fs_node *node)
 	fs_get_obj(fte, node);
 	fs_get_obj(fg, fte->node.parent);
 	fs_get_obj(ft, fg->node.parent);
-	trace_mlx5_fs_del_fte(fte);
 
+	trace_mlx5_fs_del_fte(fte);
 	dev = get_dev(&ft->node);
 	if (node->active) {
 		err = mlx5_cmd_delete_fte(dev, ft,
@@ -459,6 +493,16 @@ static void del_fte(struct fs_node *node)
 				       "flow steering can't delete fte in index %d of flow group id %d\n",
 				       fte->index, fg->id);
 	}
+}
+
+static void del_sw_fte(struct fs_node *node)
+{
+	struct mlx5_flow_group *fg;
+	struct fs_fte *fte;
+	int err;
+
+	fs_get_obj(fte, node);
+	fs_get_obj(fg, fte->node.parent);
 
 	err = rhashtable_remove_fast(&fg->ftes_hash,
 				     &fte->hash,
@@ -467,30 +511,39 @@ static void del_fte(struct fs_node *node)
 	ida_simple_remove(&fg->fte_allocator, fte->index - fg->start_index);
 }
 
-static void del_flow_group(struct fs_node *node)
+static void del_hw_flow_group(struct fs_node *node)
 {
 	struct mlx5_flow_group *fg;
 	struct mlx5_flow_table *ft;
 	struct mlx5_core_dev *dev;
-	int err;
 
 	fs_get_obj(fg, node);
 	fs_get_obj(ft, fg->node.parent);
 	dev = get_dev(&ft->node);
 	trace_mlx5_fs_del_fg(fg);
 
-	if (ft->autogroup.active)
-		ft->autogroup.num_groups--;
+	if (fg->node.active && mlx5_cmd_destroy_flow_group(dev, ft, fg->id))
+		mlx5_core_warn(dev, "flow steering can't destroy fg %d of ft %d\n",
+			       fg->id, ft->id);
+}
+
+static void del_sw_flow_group(struct fs_node *node)
+{
+	struct mlx5_flow_group *fg;
+	struct mlx5_flow_table *ft;
+	int err;
+
+	fs_get_obj(fg, node);
+	fs_get_obj(ft, fg->node.parent);
 
 	rhashtable_destroy(&fg->ftes_hash);
 	ida_destroy(&fg->fte_allocator);
+	if (ft->autogroup.active)
+		ft->autogroup.num_groups--;
 	err = rhltable_remove(&ft->fgs_hash,
 			      &fg->hash,
 			      rhash_fg);
 	WARN_ON(err);
-	if (fg->node.active && mlx5_cmd_destroy_flow_group(dev, ft, fg->id))
-		mlx5_core_warn(dev, "flow steering can't destroy fg %d of ft %d\n",
-			       fg->id, ft->id);
 }
 
 static struct fs_fte *alloc_fte(struct mlx5_flow_act *flow_act,
@@ -540,7 +593,7 @@ static struct fs_fte *alloc_insert_fte(struct mlx5_flow_group *fg,
 	if (ret)
 		goto err_free;
 
-	tree_init_node(&fte->node, del_fte);
+	tree_init_node(&fte->node, del_hw_fte, del_sw_fte);
 	tree_add_node(&fte->node, &fg->node);
 	list_add_tail(&fte->node.list, &fg->node.children);
 
@@ -611,10 +664,11 @@ static struct mlx5_flow_group *alloc_insert_flow_group(struct mlx5_flow_table *f
 		return ERR_PTR(ret);
 	}
 
-	tree_init_node(&fg->node, del_flow_group);
+	tree_init_node(&fg->node, del_hw_flow_group, del_sw_flow_group);
 	tree_add_node(&fg->node, &ft->node);
 	/* Add node to group list */
 	list_add(&fg->node.list, prev);
+	atomic_inc(&ft->node.version);
 
 	return fg;
 }
@@ -794,7 +848,7 @@ static int _mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
 	fs_get_obj(fte, rule->node.parent);
 	if (!(fte->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST))
 		return -EINVAL;
-	lock_ref_node(&fte->node);
+	down_write_ref_node(&fte->node);
 	fs_get_obj(fg, fte->node.parent);
 	fs_get_obj(ft, fg->node.parent);
 
@@ -803,7 +857,7 @@ static int _mlx5_modify_rule_destination(struct mlx5_flow_rule *rule,
 				  ft, fg->id,
 				  modify_mask,
 				  fte);
-	unlock_ref_node(&fte->node);
+	up_write_ref_node(&fte->node);
 
 	return err;
 }
@@ -940,7 +994,7 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa
 		goto unlock_root;
 	}
 
-	tree_init_node(&ft->node, del_flow_table);
+	tree_init_node(&ft->node, del_hw_flow_table, del_sw_flow_table);
 	log_table_sz = ft->max_fte ? ilog2(ft->max_fte) : 0;
 	next_ft = find_next_chained_ft(fs_prio);
 	err = mlx5_cmd_create_flow_table(root->dev, ft->vport, ft->op_mod, ft->type,
@@ -953,11 +1007,11 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa
 	if (err)
 		goto destroy_ft;
 	ft->node.active = true;
-	lock_ref_node(&fs_prio->node);
+	down_write_ref_node(&fs_prio->node);
 	tree_add_node(&ft->node, &fs_prio->node);
 	list_add_flow_table(ft, fs_prio);
 	fs_prio->num_ft++;
-	unlock_ref_node(&fs_prio->node);
+	up_write_ref_node(&fs_prio->node);
 	mutex_unlock(&root->chain_lock);
 	return ft;
 destroy_ft:
@@ -1052,11 +1106,11 @@ struct mlx5_flow_group *mlx5_create_flow_group(struct mlx5_flow_table *ft,
 	if (ft->autogroup.active)
 		return ERR_PTR(-EPERM);
 
-	lock_ref_node(&ft->node);
+	down_write_ref_node(&ft->node);
 	fg = alloc_insert_flow_group(ft, match_criteria_enable, match_criteria,
 				     start_index, end_index,
 				     ft->node.children.prev);
-	unlock_ref_node(&ft->node);
+	up_write_ref_node(&ft->node);
 	if (IS_ERR(fg))
 		return fg;
 
@@ -1151,7 +1205,7 @@ static void destroy_flow_handle(struct fs_fte *fte,
 		/* Add dest to dests list- we need flow tables to be in the
 		 * end of the list for forward to next prio rules.
 		 */
-		tree_init_node(&rule->node, del_rule);
+		tree_init_node(&rule->node, NULL, del_sw_hw_rule);
 		if (dest &&
 		    dest[i].type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE)
 			list_add(&rule->node.list, &fte->node.children);
@@ -1209,6 +1263,7 @@ static void destroy_flow_handle(struct fs_fte *fte,
 
 	fte->node.active = true;
 	fte->status |= FS_FTE_STATUS_EXISTING;
+	atomic_inc(&fte->node.version);
 
 out:
 	return handle;
@@ -1369,54 +1424,30 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg,
 					    struct fs_fte *fte)
 {
 	struct mlx5_flow_handle *handle;
-	struct mlx5_flow_table *ft;
+	int old_action;
 	int i;
+	int ret;
 
-	if (fte) {
-		int old_action;
-		int ret;
-
-		nested_lock_ref_node(&fte->node, FS_LOCK_CHILD);
-		ret = check_conflicting_ftes(fte, flow_act);
-		if (ret) {
-			handle = ERR_PTR(ret);
-			goto unlock_fte;
-		}
-
-		old_action = fte->action;
-		fte->action |= flow_act->action;
-		handle = add_rule_fte(fte, fg, dest, dest_num,
-				      old_action != flow_act->action);
-		if (IS_ERR(handle)) {
-			fte->action = old_action;
-			goto unlock_fte;
-		} else {
-			trace_mlx5_fs_set_fte(fte, false);
-			goto add_rules;
-		}
-	}
-	fs_get_obj(ft, fg->node.parent);
+	ret = check_conflicting_ftes(fte, flow_act);
+	if (ret)
+		return ERR_PTR(ret);
 
-	fte = alloc_insert_fte(fg, match_value, flow_act);
-	if (IS_ERR(fte))
-		return (void *)fte;
-	nested_lock_ref_node(&fte->node, FS_LOCK_CHILD);
-	handle = add_rule_fte(fte, fg, dest, dest_num, false);
+	old_action = fte->action;
+	fte->action |= flow_act->action;
+	handle = add_rule_fte(fte, fg, dest, dest_num,
+			      old_action != flow_act->action);
 	if (IS_ERR(handle)) {
-		unlock_ref_node(&fte->node);
-		tree_put_node(&fte->node);
+		fte->action = old_action;
 		return handle;
 	}
+	trace_mlx5_fs_set_fte(fte, false);
 
-add_rules:
 	for (i = 0; i < handle->num_rules; i++) {
 		if (atomic_read(&handle->rule[i]->node.refcount) == 1) {
 			tree_add_node(&handle->rule[i]->node, &fte->node);
 			trace_mlx5_fs_add_rule(handle->rule[i]);
 		}
 	}
-unlock_fte:
-	unlock_ref_node(&fte->node);
 	return handle;
 }
 
@@ -1480,8 +1511,10 @@ static void free_match_list(struct match_list_head *head)
 		struct match_list *iter, *match_tmp;
 
 		list_del(&head->first.list);
+		tree_put_node(&head->first.g->node);
 		list_for_each_entry_safe(iter, match_tmp, &head->list,
 					 list) {
+			tree_put_node(&iter->g->node);
 			list_del(&iter->list);
 			kfree(iter);
 		}
@@ -1505,6 +1538,8 @@ static int build_match_list(struct match_list_head *match_head,
 		struct match_list *curr_match;
 
 		if (likely(list_empty(&match_head->list))) {
+			if (!tree_get_node(&g->node))
+				continue;
 			match_head->first.g = g;
 			list_add_tail(&match_head->first.list,
 				      &match_head->list);
@@ -1517,6 +1552,10 @@ static int build_match_list(struct match_list_head *match_head,
 			err = -ENOMEM;
 			goto out;
 		}
+		if (!tree_get_node(&g->node)) {
+			kfree(curr_match);
+			continue;
+		}
 		curr_match->g = g;
 		list_add_tail(&curr_match->list, &match_head->list);
 	}
@@ -1525,62 +1564,119 @@ static int build_match_list(struct match_list_head *match_head,
 	return err;
 }
 
+static u64 matched_fgs_get_version(struct list_head *match_head)
+{
+	struct match_list *iter;
+	u64 version = 0;
+
+	list_for_each_entry(iter, match_head, list)
+		version += (u64)atomic_read(&iter->g->node.version);
+	return version;
+}
+
 static struct mlx5_flow_handle *
 try_add_to_existing_fg(struct mlx5_flow_table *ft,
+		       struct list_head *match_head,
 		       struct mlx5_flow_spec *spec,
 		       struct mlx5_flow_act *flow_act,
 		       struct mlx5_flow_destination *dest,
-		       int dest_num)
+		       int dest_num,
+		       int ft_version)
 {
 	struct mlx5_flow_group *g;
 	struct mlx5_flow_handle *rule;
-	struct match_list_head match_head;
 	struct match_list *iter;
-	int err;
+	bool take_write = false;
+	struct fs_fte *fte;
+	u64  version;
 
-	/* Collect all fgs which has a matching match_criteria */
-	err = build_match_list(&match_head, ft, spec);
-	if (err)
-		return ERR_PTR(err);
+	list_for_each_entry(iter, match_head, list) {
+		nested_down_read_ref_node(&iter->g->node, FS_LOCK_PARENT);
+		ida_pre_get(&iter->g->fte_allocator, GFP_KERNEL);
+	}
 
+search_again_locked:
+	version = matched_fgs_get_version(match_head);
 	/* Try to find a fg that already contains a matching fte */
-	list_for_each_entry(iter, &match_head.list, list) {
-		struct fs_fte *fte;
+	list_for_each_entry(iter, match_head, list) {
+		struct fs_fte *fte_tmp;
 
 		g = iter->g;
-		nested_lock_ref_node(&g->node, FS_LOCK_PARENT);
-		fte = rhashtable_lookup_fast(&g->ftes_hash, spec->match_value,
-					     rhash_fte);
-		if (fte) {
-			rule = add_rule_fg(g, spec->match_value,
-					   flow_act, dest, dest_num, fte);
-			unlock_ref_node(&g->node);
-			goto free_list;
+		fte_tmp = rhashtable_lookup_fast(&g->ftes_hash, spec->match_value,
+						 rhash_fte);
+		if (!fte_tmp || !tree_get_node(&fte_tmp->node))
+			continue;
+
+		nested_down_write_ref_node(&fte_tmp->node, FS_LOCK_CHILD);
+		if (!take_write) {
+			list_for_each_entry(iter, match_head, list)
+				up_read_ref_node(&iter->g->node);
+		} else {
+			list_for_each_entry(iter, match_head, list)
+				up_write_ref_node(&iter->g->node);
 		}
-		unlock_ref_node(&g->node);
+
+		rule = add_rule_fg(g, spec->match_value,
+				   flow_act, dest, dest_num, fte_tmp);
+		up_write_ref_node(&fte_tmp->node);
+		tree_put_node(&fte_tmp->node);
+		return rule;
 	}
 
 	/* No group with matching fte found. Try to add a new fte to any
 	 * matching fg.
 	 */
-	list_for_each_entry(iter, &match_head.list, list) {
-		g = iter->g;
 
-		nested_lock_ref_node(&g->node, FS_LOCK_PARENT);
-		rule = add_rule_fg(g, spec->match_value,
-				   flow_act, dest, dest_num, NULL);
-		if (!IS_ERR(rule) || PTR_ERR(rule) != -ENOSPC) {
-			unlock_ref_node(&g->node);
-			goto free_list;
-		}
-		unlock_ref_node(&g->node);
+	if (!take_write) {
+		list_for_each_entry(iter, match_head, list)
+			up_read_ref_node(&iter->g->node);
+		list_for_each_entry(iter, match_head, list)
+			nested_down_write_ref_node(&iter->g->node,
+						   FS_LOCK_PARENT);
+		take_write = true;
 	}
 
-	rule = ERR_PTR(-ENOENT);
+	/* Check the ft version, for case that new flow group
+	 * was added while the fgs weren't locked
+	 */
+	if (atomic_read(&ft->node.version) != ft_version) {
+		rule = ERR_PTR(-EAGAIN);
+		goto out;
+	}
 
-free_list:
-	free_match_list(&match_head);
+	/* Check the fgs version, for case the new FTE with the
+	 * same values was added while the fgs weren't locked
+	 */
+	if (version != matched_fgs_get_version(match_head))
+		goto search_again_locked;
+
+	list_for_each_entry(iter, match_head, list) {
+		g = iter->g;
+
+		if (!g->node.active)
+			continue;
+		fte = alloc_insert_fte(g, spec->match_value, flow_act);
+		if (IS_ERR(fte)) {
+			if (PTR_ERR(fte) == -ENOSPC)
+				continue;
+			list_for_each_entry(iter, match_head, list)
+				up_write_ref_node(&iter->g->node);
+			return (void *)fte;
+		}
 
+		nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD);
+		list_for_each_entry(iter, match_head, list)
+			up_write_ref_node(&iter->g->node);
+		rule = add_rule_fg(g, spec->match_value,
+				   flow_act, dest, dest_num, fte);
+		up_write_ref_node(&fte->node);
+		tree_put_node(&fte->node);
+		return rule;
+	}
+	rule = ERR_PTR(-ENOENT);
+out:
+	list_for_each_entry(iter, match_head, list)
+		up_write_ref_node(&iter->g->node);
 	return rule;
 }
 
@@ -1594,6 +1690,10 @@ static int build_match_list(struct match_list_head *match_head,
 {
 	struct mlx5_flow_group *g;
 	struct mlx5_flow_handle *rule;
+	struct match_list_head match_head;
+	bool take_write = false;
+	struct fs_fte *fte;
+	int version;
 	int err;
 	int i;
 
@@ -1604,31 +1704,67 @@ static int build_match_list(struct match_list_head *match_head,
 		if (!dest_is_valid(&dest[i], flow_act->action, ft))
 			return ERR_PTR(-EINVAL);
 	}
+	nested_down_read_ref_node(&ft->node, FS_LOCK_GRANDPARENT);
+search_again_locked:
+	version = atomic_read(&ft->node.version);
 
-	nested_lock_ref_node(&ft->node, FS_LOCK_GRANDPARENT);
-	rule = try_add_to_existing_fg(ft, spec, flow_act, dest, dest_num);
-	if (!IS_ERR(rule) || PTR_ERR(rule) != -ENOENT)
-		goto unlock;
+	/* Collect all fgs which has a matching match_criteria */
+	err = build_match_list(&match_head, ft, spec);
+	if (err)
+		return ERR_PTR(err);
+
+	if (!take_write)
+		up_read_ref_node(&ft->node);
+
+	rule = try_add_to_existing_fg(ft, &match_head.list, spec, flow_act, dest,
+				      dest_num, version);
+	free_match_list(&match_head);
+	if (!IS_ERR(rule) ||
+	    (PTR_ERR(rule) != -ENOENT && PTR_ERR(rule) != -EAGAIN))
+		return rule;
+
+	if (!take_write) {
+		nested_down_write_ref_node(&ft->node, FS_LOCK_GRANDPARENT);
+		take_write = true;
+	}
+
+	if (PTR_ERR(rule) == -EAGAIN ||
+	    version != atomic_read(&ft->node.version))
+		goto search_again_locked;
 
 	g = alloc_auto_flow_group(ft, spec);
 	if (IS_ERR(g)) {
 		rule = (void *)g;
-		goto unlock;
+		up_write_ref_node(&ft->node);
+		return rule;
 	}
 
+	nested_down_write_ref_node(&g->node, FS_LOCK_PARENT);
+	up_write_ref_node(&ft->node);
+
 	err = create_auto_flow_group(ft, g);
-	if (err) {
-		rule = ERR_PTR(err);
-		goto put_fg;
+	if (err)
+		goto err_release_fg;
+
+	fte = alloc_insert_fte(g, spec->match_value, flow_act);
+	if (IS_ERR(fte)) {
+		err = PTR_ERR(fte);
+		goto err_release_fg;
 	}
 
+	nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD);
+	up_write_ref_node(&g->node);
 	rule = add_rule_fg(g, spec->match_value, flow_act, dest,
-			   dest_num, NULL);
-put_fg:
+			   dest_num, fte);
+	up_write_ref_node(&fte->node);
+	tree_put_node(&fte->node);
 	tree_put_node(&g->node);
-unlock:
-	unlock_ref_node(&ft->node);
 	return rule;
+
+err_release_fg:
+	up_write_ref_node(&g->node);
+	tree_put_node(&g->node);
+	return ERR_PTR(err);
 }
 
 static bool fwd_next_prio_supported(struct mlx5_flow_table *ft)
@@ -1869,7 +2005,7 @@ static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns,
 		return ERR_PTR(-ENOMEM);
 
 	fs_prio->node.type = FS_TYPE_PRIO;
-	tree_init_node(&fs_prio->node, NULL);
+	tree_init_node(&fs_prio->node, NULL, NULL);
 	tree_add_node(&fs_prio->node, &ns->node);
 	fs_prio->num_levels = num_levels;
 	fs_prio->prio = prio;
@@ -1895,7 +2031,7 @@ static struct mlx5_flow_namespace *fs_create_namespace(struct fs_prio *prio)
 		return ERR_PTR(-ENOMEM);
 
 	fs_init_namespace(ns);
-	tree_init_node(&ns->node, NULL);
+	tree_init_node(&ns->node, NULL, NULL);
 	tree_add_node(&ns->node, &prio->node);
 	list_add_tail(&ns->node.list, &prio->node.children);
 
@@ -2020,7 +2156,7 @@ static struct mlx5_flow_root_namespace *create_root_ns(struct mlx5_flow_steering
 	ns = &root_ns->ns;
 	fs_init_namespace(ns);
 	mutex_init(&root_ns->chain_lock);
-	tree_init_node(&ns->node, NULL);
+	tree_init_node(&ns->node, NULL, NULL);
 	tree_add_node(&ns->node, NULL);
 
 	return root_ns;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
index b5c079f..875b753 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h
@@ -83,7 +83,9 @@ struct fs_node {
 	struct rw_semaphore	lock;
 	atomic_t		refcount;
 	bool			active;
-	void			(*remove_func)(struct fs_node *);
+	void			(*del_hw_func)(struct fs_node *);
+	void			(*del_sw_func)(struct fs_node *);
+	atomic_t		version;
 };
 
 struct mlx5_flow_rule {
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH net-next 00/16] ipv6: replace rwlock with rcu and spinlock in fib6 table
From: Eric Dumazet @ 2017-10-06 23:49 UTC (permalink / raw)
  To: Wei Wang; +Cc: David Miller, netdev, Eric Dumazet, Martin KaFai Lau
In-Reply-To: <20171006190611.110633-1-tracywwnj@gmail.com>

On Fri, 2017-10-06 at 12:05 -0700, Wei Wang wrote:
> From: Wei Wang <weiwan@google.com>
> 
> Currently, fib6 table is protected by rwlock. During route lookup,
> reader lock is taken and during route insertion, deletion or
> modification, writer lock is taken. This is a very inefficient
> implementation because the fastpath always has to do the operation
> to grab the reader lock.
> According to my latest syn flood test on an iota ivybridge machine
> with 2 10G mlx nics bonded together, each with 8 rx queues on 2 NUMA
> nodes, and with the upstream net-next kernel:
> ipv4 stack can handle around 4.2Mpps
> ipv6 stack can handle around 1.3Mpps
> 
> In order to close the gap of the performance number between ipv4
> and ipv6 stack, this patch series tries to get rid of the usage of
> the rwlock and replace it with rcu and spinlock protection. This will
> greatly speed up the fastpath performance as it only needs to hold
> rcu which is much less expensive than grabbing the reader lock. It
> also makes ipv6 fib implementation more consistent with ipv4.
> 
> In order to be able to replace the current rwlock with rcu and
> spinlock, some preparation work is needed:
> Patch 1-8 introduces a per-route hash table (protected by rcu and a
> different spinlock) to store all cached routes created by pmtu and ip
> redirect under its main route. This makes the main fib6 tree only
> contain static routes.
> Patch 9-14 prepares all the reader path to be ready to tolerate
> concurrent writer.
> Patch 15 finally does the rwlock to rcu and spinlock conversion.
> Patch 16 takes care of rt6_stats.
> 
> After this patch series, in the same syn flood test,
> ipv6 stack can now handle around 3.5Mpps compared to previous 1.3Mpps
> in my test setup.
> 
> After this patch series, there are still some improvements that should
> be done in ipv6 stack:
> 1. During route lookup, dst_use() is called every time on the selected
> route to update dst->__use and dst->lastuse. This dirties the cacheline 
> and causes extra cacheline miss and should be avoided.
> 2. When no route is found in the current table, net->ipv6.ip6_null_entry
> is used and refcnt is taken on it. As there is no pcpu cache for this
> specific route, frequent change on the refcnt for this route causes
> quite some cacheline misses.
> And to make things worse, if CONFIG_IPV6_MULTIPLE_TABLES is defined,
> output path route lookup always starts with local table first and
> guarantees to hit net->ipv6.ip6_null_entry before continuing to do
> lookup in the main table.
> These operations on net->ipv6.ip6_null_entry could potentially be
> avoided.
> 3. ipv6 input path route lookup grabs refcnt on dst. This is different
> from ipv4. We could potentially change this behavior to let ipv6 input
> path route lookup not to grab refcnt on dst. However, it does not give
> us much performance boost as we currently have pcpu route cache for
> input path as well in ipv6. But this work probably is still worth doing
> to unify ipv6 and ipv4 route lookup behavior.
> 
> The above issues will be addressed separately after this patch series
> has been accepted.
> 
> This is a joint work with Martin KaFai Lau and Eric Dumazet. And many
> many thanks to them for their inspiring ideas and big big code review
> efforts.
> 
> Wei Wang (16):
>   ipv6: introduce a new function fib6_update_sernum()
>   ipv6: introduce a hash table to store dst cache
>   ipv6: prepare fib6_remove_prefsrc() for exception table
>   ipv6: prepare rt6_mtu_change() for exception table
>   ipv6: prepare rt6_clean_tohost() for exception table
>   ipv6: prepare fib6_age() for exception table
>   ipv6: prepare fib6_locate() for exception table
>   ipv6: hook up exception table to store dst cache
>   ipv6: grab rt->rt6i_ref before allocating pcpu rt
>   ipv6: don't release rt->rt6i_pcpu memory during rt6_release()
>   ipv6: replace dst_hold() with dst_hold_safe() in routing code
>   ipv6: update fn_sernum after route is inserted to tree
>   ipv6: check fn->leaf before it is used
>   ipv6: add key length check into rt6_select()
>   ipv6: replace rwlock with rcu and spinlock in fib6_table
>   ipv6: take care of rt6_stats
> 
>  include/net/dst.h       |   2 +-
>  include/net/ip6_fib.h   |  79 ++++-
>  include/net/ip6_route.h |   5 +
>  net/ipv6/addrconf.c     |  17 +-
>  net/ipv6/ip6_fib.c      | 645 ++++++++++++++++++----------------
>  net/ipv6/route.c        | 901 ++++++++++++++++++++++++++++++++++++++++--------
>  6 files changed, 1179 insertions(+), 470 deletions(-)
> 

Awesome work Wei.

For the whole series :

Reviewed-by: Eric Dumazet <edumazet@google.com>

Thanks !

^ permalink raw reply

* [PATCH iproute2 1/2] utils: add print_escape_buf to format and print arbitrary bytes
From: Ivan Delalande @ 2017-10-06 23:48 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Ivan Delalande

Keep it as simple as possible for now: just print as an octal escape
sequence any byte that is not isprint-able, that is among the "escape"
parameter, or that is '\'. This should be pretty easy to extend if any
other user needs something more complex in the future.

Signed-off-by: Ivan Delalande <colona@arista.com>
---
 include/utils.h |  2 ++
 lib/utils.c     | 15 +++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/include/utils.h b/include/utils.h
index 76addb32..3d91c50d 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -195,6 +195,8 @@ static inline void __jiffies_to_tv(struct timeval *tv, unsigned long jiffies)
 	tv->tv_usec = tvusec - 1000000 * tv->tv_sec;
 }
 
+void print_escape_buf(const __u8 *buf, size_t len, const char *escape);
+
 int print_timestamp(FILE *fp);
 void print_nlmsg_timestamp(FILE *fp, const struct nlmsghdr *n);
 
diff --git a/lib/utils.c b/lib/utils.c
index 0cf99619..a494190e 100644
--- a/lib/utils.c
+++ b/lib/utils.c
@@ -31,6 +31,7 @@
 #include <time.h>
 #include <sys/time.h>
 #include <errno.h>
+#include <ctype.h>
 
 #include "rt_names.h"
 #include "utils.h"
@@ -1047,6 +1048,20 @@ int addr64_n2a(__u64 addr, char *buff, size_t len)
 	return written;
 }
 
+/* Print buffer and escape bytes that are !isprint or among 'escape' */
+void print_escape_buf(const __u8 *buf, size_t len, const char *escape)
+{
+	size_t i;
+
+	for (i = 0; i < len; ++i) {
+		if (isprint(buf[i]) && buf[i] != '\\' &&
+		    !strchr(escape, buf[i]))
+			printf("%c", buf[i]);
+		else
+			printf("\\%03o", buf[i]);
+	}
+}
+
 int print_timestamp(FILE *fp)
 {
 	struct timeval tv;
-- 
2.14.2

^ permalink raw reply related

* [PATCH iproute2 2/2] ss: print MD5 signature keys configured on TCP sockets
From: Ivan Delalande @ 2017-10-06 23:48 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Ivan Delalande
In-Reply-To: <20171006234820.27567-1-colona@arista.com>

These keys are reported by kernel 4.14 and later under the
INET_DIAG_MD5SIG attribute, when INET_DIAG_INFO is requested (ss -i)
and we have CAP_NET_ADMIN. The additional output looks like:

	md5keys:fe80::/64=signing_key,10.1.2.0/24=foobar,::1/128=Test

Signed-off-by: Ivan Delalande <colona@arista.com>
---
 misc/ss.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/misc/ss.c b/misc/ss.c
index dd8dfaa4..09bff8a7 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -2153,6 +2153,16 @@ static void print_skmeminfo(struct rtattr *tb[], int attrtype)
 	printf(")");
 }
 
+static void print_md5sig(struct tcp_diag_md5sig *sig)
+{
+	printf("%s/%d=",
+	       format_host(sig->tcpm_family,
+			   sig->tcpm_family == AF_INET6 ? 16 : 4,
+			   &sig->tcpm_addr),
+	       sig->tcpm_prefixlen);
+	print_escape_buf(sig->tcpm_key, sig->tcpm_keylen, " ,");
+}
+
 #define TCPI_HAS_OPT(info, opt) !!(info->tcpi_options & (opt))
 
 static void tcp_show_info(const struct nlmsghdr *nlh, struct inet_diag_msg *r,
@@ -2289,6 +2299,17 @@ static void tcp_show_info(const struct nlmsghdr *nlh, struct inet_diag_msg *r,
 		free(s.dctcp);
 		free(s.bbr_info);
 	}
+	if (tb[INET_DIAG_MD5SIG]) {
+		struct tcp_diag_md5sig *sig = RTA_DATA(tb[INET_DIAG_MD5SIG]);
+		int len = RTA_PAYLOAD(tb[INET_DIAG_MD5SIG]);
+
+		printf(" md5keys:");
+		print_md5sig(sig++);
+		for (len -= sizeof(*sig); len > 0; len -= sizeof(*sig)) {
+			printf(",");
+			print_md5sig(sig++);
+		}
+	}
 }
 
 static const char *format_host_sa(struct sockaddr_storage *sa)
-- 
2.14.2

^ permalink raw reply related

* Re: [PATCH net-next 11/16] ipv6: replace dst_hold() with dst_hold_safe() in routing code
From: 吉藤英明 @ 2017-10-06 23:57 UTC (permalink / raw)
  To: Wei Wang
  Cc: David Miller, network dev, Eric Dumazet, Martin KaFai Lau,
	吉藤英明, YOSHIFUJI Hideaki
In-Reply-To: <20171006190611.110633-12-tracywwnj@gmail.com>

Hi,

2017-10-07 4:06 GMT+09:00 Wei Wang <weiwan@google.com>:
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index 941c062389d2..aeb349aea429 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
:
> @@ -1625,12 +1643,17 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
>         if (rt_cache)
>                 rt = rt_cache;
>
> -       if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
> -               dst_use(&rt->dst, jiffies);
> +       if (rt == net->ipv6.ip6_null_entry) {
> +               read_unlock_bh(&table->tb6_lock);
> +               dst_hold(&rt->dst);
> +               trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
> +               return rt;
> +       } else if (rt->rt6i_flags & RTF_CACHE) {
> +               if (ip6_hold_safe(net, &rt, true)) {
> +                       dst_use_noref(&rt->dst, jiffies);
> +                       rt6_dst_from_metrics_check(rt);
> +               }
>                 read_unlock_bh(&table->tb6_lock);
> -
> -               rt6_dst_from_metrics_check(rt);
> -
>                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
>                 return rt;
>         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&

Is it intended to move rt6_dst_from_metrics_check() inside the table lock?

--yoshfuji

^ permalink raw reply

* Re: [PATCH net-next 1/2] bnxt_en: don't consider building bnxt_tc.o if option not enabled
From: Michael Chan @ 2017-10-07  1:01 UTC (permalink / raw)
  To: Jonathan Toppins; +Cc: Netdev, open list
In-Reply-To: <1a4d96f368f199bba0425012f50d55d634484cce.1507319311.git.jtoppins@redhat.com>

On Fri, Oct 6, 2017 at 12:48 PM, Jonathan Toppins <jtoppins@redhat.com> wrote:
> Instead of zeroing out bnxt_tc.c with a #ifdef foo, don't compile
> the file when the option is not enabled. Now make and the preprocessor do
> not have to waste time compiling a no-op.
>
> Signed-off-by: Jonathan Toppins <jtoppins@redhat.com>

Acked-by: Michael Chan <michael.chan@broadcom.com>

^ permalink raw reply

* Re: [PATCH net-next 2/2] bnxt_en: tc: only the function prototypes need to be wrapped in #ifdef
From: Michael Chan @ 2017-10-07  1:27 UTC (permalink / raw)
  To: Jonathan Toppins; +Cc: Netdev, open list
In-Reply-To: <a58d04f653e79b12c86eaa67f768f4ff64b2a975.1507319311.git.jtoppins@redhat.com>

On Fri, Oct 6, 2017 at 12:48 PM, Jonathan Toppins <jtoppins@redhat.com> wrote:
> There is no reason to wrap the data structures inside the ifdef.

What's so bad about wrapping unused data structures inside #ifdef?
These structures are only used if CONFIG_BNXT_FLOWER_OFFLOAD is
defined.

^ permalink raw reply

* Re: [PATCH net-next 1/2] bnxt_en: don't consider building bnxt_tc.o if option not enabled
From: David Miller @ 2017-10-07  2:00 UTC (permalink / raw)
  To: michael.chan; +Cc: jtoppins, netdev, linux-kernel
In-Reply-To: <CACKFLikAM3sfHYA-VnNbHWN_aYaG92MwpHjhN3dzhyONDaoTuw@mail.gmail.com>

From: Michael Chan <michael.chan@broadcom.com>
Date: Fri, 6 Oct 2017 18:01:57 -0700

> On Fri, Oct 6, 2017 at 12:48 PM, Jonathan Toppins <jtoppins@redhat.com> wrote:
>> Instead of zeroing out bnxt_tc.c with a #ifdef foo, don't compile
>> the file when the option is not enabled. Now make and the preprocessor do
>> not have to waste time compiling a no-op.
>>
>> Signed-off-by: Jonathan Toppins <jtoppins@redhat.com>
> 
> Acked-by: Michael Chan <michael.chan@broadcom.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next 2/2] bnxt_en: tc: only the function prototypes need to be wrapped in #ifdef
From: David Miller @ 2017-10-07  2:01 UTC (permalink / raw)
  To: michael.chan; +Cc: jtoppins, netdev, linux-kernel
In-Reply-To: <CACKFLik-dU=1bouTiHhREt4p5X=cGQFjz3fAjHn_pLJQm4FJ2A@mail.gmail.com>

From: Michael Chan <michael.chan@broadcom.com>
Date: Fri, 6 Oct 2017 18:27:31 -0700

> On Fri, Oct 6, 2017 at 12:48 PM, Jonathan Toppins <jtoppins@redhat.com> wrote:
>> There is no reason to wrap the data structures inside the ifdef.
> 
> What's so bad about wrapping unused data structures inside #ifdef?
> These structures are only used if CONFIG_BNXT_FLOWER_OFFLOAD is
> defined.

Yeah I agree, this ifdef is actually a good way to trap unintentional
usage of those data structures so it's doubly best to keep the ifdef.

^ permalink raw reply

* Re: [PATCH net-next 11/16] ipv6: replace dst_hold() with dst_hold_safe() in routing code
From: Wei Wang @ 2017-10-07  2:06 UTC (permalink / raw)
  To: 吉藤英明
  Cc: David Miller, network dev, Eric Dumazet, Martin KaFai Lau,
	YOSHIFUJI Hideaki
In-Reply-To: <CAPA1RqB14m4BERjrOO5Fryu2WAU9iwqAbC8H7Sb_bNUi9jfLAg@mail.gmail.com>

On Fri, Oct 6, 2017 at 4:57 PM, 吉藤英明 <hideaki.yoshifuji@miraclelinux.com> wrote:
> Hi,
>
> 2017-10-07 4:06 GMT+09:00 Wei Wang <weiwan@google.com>:
>> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
>> index 941c062389d2..aeb349aea429 100644
>> --- a/net/ipv6/route.c
>> +++ b/net/ipv6/route.c
> :
>> @@ -1625,12 +1643,17 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
>>         if (rt_cache)
>>                 rt = rt_cache;
>>
>> -       if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
>> -               dst_use(&rt->dst, jiffies);
>> +       if (rt == net->ipv6.ip6_null_entry) {
>> +               read_unlock_bh(&table->tb6_lock);
>> +               dst_hold(&rt->dst);
>> +               trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
>> +               return rt;
>> +       } else if (rt->rt6i_flags & RTF_CACHE) {
>> +               if (ip6_hold_safe(net, &rt, true)) {
>> +                       dst_use_noref(&rt->dst, jiffies);
>> +                       rt6_dst_from_metrics_check(rt);
>> +               }
>>                 read_unlock_bh(&table->tb6_lock);
>> -
>> -               rt6_dst_from_metrics_check(rt);
>> -
>>                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
>>                 return rt;
>>         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
>
> Is it intended to move rt6_dst_from_metrics_check() inside the table lock?
>

I think it doesn't really matter whether rt6_dst_from_metrics_check()
is inside the table lock or not. The code looks cleaner if we put it
inside the if (ip6_hold_safe()) {} block because we don't want to do
rt6_dst_from_metrics_check() if ip6_hold_safe() returns false.

> --yoshfuji

^ permalink raw reply

* Re: [PATCH net-next 11/16] ipv6: replace dst_hold() with dst_hold_safe() in routing code
From: David Miller @ 2017-10-07  2:23 UTC (permalink / raw)
  To: hideaki.yoshifuji; +Cc: weiwan, netdev, edumazet, kafai, yoshfuji
In-Reply-To: <CAPA1RqB14m4BERjrOO5Fryu2WAU9iwqAbC8H7Sb_bNUi9jfLAg@mail.gmail.com>

From: 吉藤英明 <hideaki.yoshifuji@miraclelinux.com>
Date: Sat, 7 Oct 2017 08:57:49 +0900

> Hi,
> 
> 2017-10-07 4:06 GMT+09:00 Wei Wang <weiwan@google.com>:
>> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
>> index 941c062389d2..aeb349aea429 100644
>> --- a/net/ipv6/route.c
>> +++ b/net/ipv6/route.c
> :
>> @@ -1625,12 +1643,17 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
>>         if (rt_cache)
>>                 rt = rt_cache;
>>
>> -       if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
>> -               dst_use(&rt->dst, jiffies);
>> +       if (rt == net->ipv6.ip6_null_entry) {
>> +               read_unlock_bh(&table->tb6_lock);
>> +               dst_hold(&rt->dst);
>> +               trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
>> +               return rt;
>> +       } else if (rt->rt6i_flags & RTF_CACHE) {
>> +               if (ip6_hold_safe(net, &rt, true)) {
>> +                       dst_use_noref(&rt->dst, jiffies);
>> +                       rt6_dst_from_metrics_check(rt);
>> +               }
>>                 read_unlock_bh(&table->tb6_lock);
>> -
>> -               rt6_dst_from_metrics_check(rt);
>> -
>>                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
>>                 return rt;
>>         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
> 
> Is it intended to move rt6_dst_from_metrics_check() inside the table lock?

That lock is replaced with rcu_read_unlock() or similar in patch #15 so I
don't think it matters much in the end.

^ permalink raw reply

* Re: [PATCH net-next v5 0/3] bridge: neigh msg proxy and flood suppression support
From: Roopa Prabhu @ 2017-10-07  2:35 UTC (permalink / raw)
  To: davem@davemloft.net; +Cc: Nikolay Aleksandrov, netdev@vger.kernel.org, bridge
In-Reply-To: <1507314880-16310-1-git-send-email-roopa@cumulusnetworks.com>

On Fri, Oct 6, 2017 at 11:34 AM, Roopa Prabhu <roopa@cumulusnetworks.com> wrote:
> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>
> This series implements arp and nd suppression in the bridge
> driver for ethernet vpns. It implements rfc7432, section 10
> https://tools.ietf.org/html/rfc7432#section-10
> for ethernet VPN deployments. It is similar to the existing
> BR_PROXYARP* flags but has a few semantic differences to conform
> to EVPN standard. Unlike the existing flags, this new flag suppresses
> flood of all neigh discovery packets (arp and nd) to tunnel ports.
> Supports both vlan filtering and non-vlan filtering bridges.
>
> In case of EVPN, it is mainly used to avoid flooding
> of arp and nd packets to tunnel ports like vxlan.
>
> v2 : rebase to latest + address some optimization feedback from Nikolay.
> v3 : fix kbuild reported build errors with CONFIG_INET off
> v4 : simplify port flag mask as suggested by stephen
> v5 : address some feedback from Toshiaki
>

Looks like I missed applying a cleanup done in v5 to the IPv6 ND path.
Dave, I see these patches 'under review' in patchwork. If it's not too
late, please drop them. I will spin v6 in a few hours. Thanks.

^ permalink raw reply

* Re: [PATCH] net: dsa: mv88e6xxx: rework in-chip bridging
From: Florian Fainelli @ 2017-10-07  3:04 UTC (permalink / raw)
  To: Greg Ungerer, Vivien Didelot; +Cc: netdev@vger.kernel.org, Andrew Lunn
In-Reply-To: <ab86ad59-06af-b4ab-0bc6-dace73f04ef0@linux-m68k.org>

Le 10/03/17 à 23:20, Greg Ungerer a écrit :
> Hi Vivien,
> 
> On Wed, Mar 29, 2017 at 04:30:16PM -0400, Vivien Didelot wrote:
>> All ports -- internal and external, for chips featuring a PVT -- have a
>> mask restricting to which internal ports a frame is allowed to egress.
>>
>> Now that DSA exposes the number of ports and their bridge devices, it is
>> possible to extract the code generating the VLAN map and make it generic
>> so that it can be shared later with the cross-chip bridging code.
> 
> This patch changes the behavior of interfaces on startup if they are
> not part of a bridge.
> 
> I have a board with a Marvell 6350 switch with a device tree that sets
> up the 5 ports as lan1, lan2, lan3, lan4, wan. With kernels before
> this patch (so linux-4.12 and older) after system startup I could do:
> 
>   ifconfig lan1 192.168.0.1
> 
> And then ping out that interface with no problems.
> 
> After this patch is applied (affects linux-4.13 and newer), the
> ping fails:
> 
>   PING 192.168.0.22 (192.168.0.22) 56(84) bytes of data.
>   From 192.168.0.1 icmp_seq=1 Destination Host Unreachable
>   From 192.168.0.1 icmp_seq=2 Destination Host Unreachable
>   From 192.168.0.1 icmp_seq=3 Destination Host Unreachable
> 
> If I incorporate an interface into a bridge then it all works ok.
> So simply:
> 
>   brctl addbr br0
>   brctl addif br0 lan1
>   ifconfig lan1 up
>   ifconfig br0 192.168.0.1
> 
> Then pings out work as expected. And if I now remove that lan1
> interface from the bridge and use it alone again then it will
> now work ok:
> 
>   ifconfig br0 down
>   brctl delif br0 lan1
>   ifconfig lan1 192.168.0.1
> 
> And that now pings ok.
> 
> I fixed this with the attached patch. It is probably not the correct
> approach, but it does restore the older behavior.
> 
> What do you think?

This is strange: the dsa_switch_tree and its associated dsa_switch
instances should be fully set up by the time ops->setup() runs in
your driver, but your patch suggests this may not be happening?

Out of curiosity, are you using the new-style Device Tree binding or the
old-style Device Tree binding?

> 
> Regards
> Greg
> 
> 
> 
>> Signed-off-by: Vivien Didelot <vivien.dide...@savoirfairelinux.com>
>> ---
>>  drivers/net/dsa/mv88e6xxx/chip.c | 53 ++++++++++++++++++++++++++--------------
>>  1 file changed, 34 insertions(+), 19 deletions(-)
>>
>> diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
>> index b114bf8e6a11..e5165831e8b5 100644
>> --- a/drivers/net/dsa/mv88e6xxx/chip.c
>> +++ b/drivers/net/dsa/mv88e6xxx/chip.c
>> @@ -1123,27 +1123,42 @@ static int mv88e6xxx_set_eee(struct dsa_switch *ds, int port,
>>         return err;
>>  }
>>  
>> +static u16 mv88e6xxx_port_vlan(struct mv88e6xxx_chip *chip, int dev, int port)
>> +{
>> +       struct dsa_switch *ds = NULL;
>> +       struct net_device *br;
>> +       u16 pvlan;
>> +       int i;
>> +
>> +       if (dev < DSA_MAX_SWITCHES)
>> +               ds = chip->ds->dst->ds[dev];
>> +
>> +       /* Prevent frames from unknown switch or port */
>> +       if (!ds || port >= ds->num_ports)
>> +               return 0;
>> +
>> +       /* Frames from DSA links and CPU ports can egress any local port */
>> +       if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))
>> +               return mv88e6xxx_port_mask(chip);
>> +
>> +       br = ds->ports[port].bridge_dev;
>> +       pvlan = 0;
>> +
>> +       /* Frames from user ports can egress any local DSA links and CPU ports,
>> +        * as well as any local member of their bridge group.
>> +        */
>> +       for (i = 0; i < mv88e6xxx_num_ports(chip); ++i)
>> +               if (dsa_is_cpu_port(chip->ds, i) ||
>> +                   dsa_is_dsa_port(chip->ds, i) ||
>> +                   (br && chip->ds->ports[i].bridge_dev == br))
>> +                       pvlan |= BIT(i);
>> +
>> +       return pvlan;
>> +}
>> +
>>  static int _mv88e6xxx_port_based_vlan_map(struct mv88e6xxx_chip *chip, int port)
>>  {
>> -       struct dsa_switch *ds = chip->ds;
>> -       struct net_device *bridge = ds->ports[port].bridge_dev;
>> -       u16 output_ports = 0;
>> -       int i;
>> -
>> -       /* allow CPU port or DSA link(s) to send frames to every port */
>> -       if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)) {
>> -               output_ports = ~0;
>> -       } else {
>> -               for (i = 0; i < mv88e6xxx_num_ports(chip); ++i) {
>> -                       /* allow sending frames to every group member */
>> -                       if (bridge && ds->ports[i].bridge_dev == bridge)
>> -                               output_ports |= BIT(i);
>> -
>> -                       /* allow sending frames to CPU port and DSA link(s) */
>> -                       if (dsa_is_cpu_port(ds, i) || dsa_is_dsa_port(ds, i))
>> -                               output_ports |= BIT(i);
>> -               }
>> -       }
>> +       u16 output_ports = mv88e6xxx_port_vlan(chip, chip->ds->index, port);
>>  
>>         /* prevent frames from going back out of the port they came in on */
>>         output_ports &= ~BIT(port);
>> -- 
> 


-- 
Florian

^ permalink raw reply

* Re: [PATCH v8 01/20] crypto: change transient busy return code to -EAGAIN
From: Herbert Xu @ 2017-10-07  3:05 UTC (permalink / raw)
  To: Gilad Ben-Yossef
  Cc: David S. Miller, Jonathan Corbet, David Howells, Tom Lendacky,
	Gary Hook, Boris Brezillon, Arnaud Ebalard, Matthias Brugger,
	Alasdair Kergon, Mike Snitzer, dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	Shaohua Li, Steve French, Theodore Y. Ts'o, Jaegeuk Kim,
	Steffen Klassert, Alexey Kuznetsov, Hideaki YOSHIFUJI, Mimi Zohar,
	Dmitry Kasatkin
In-Reply-To: <1504615144-29770-2-git-send-email-gilad-6S/DczAoZh3WXxRugSxzZg@public.gmane.org>

On Tue, Sep 05, 2017 at 03:38:40PM +0300, Gilad Ben-Yossef wrote:
>
> diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c
> index 5e92bd2..3b3c154 100644
> --- a/crypto/algif_hash.c
> +++ b/crypto/algif_hash.c
> @@ -39,6 +39,20 @@ struct algif_hash_tfm {
>  	bool has_key;
>  };
>  
> +/* Previous versions of crypto_* ops used to return -EBUSY
> + * rather than -EAGAIN to indicate being tied up. The in
> + * kernel API changed but we don't want to break the user
> + * space API. As only the hash user interface exposed this
> + * error ever to the user, do the translation here.
> + */
> +static inline int crypto_user_err(int err)
> +{
> +	if (err == -EAGAIN)
> +		return -EBUSY;
> +
> +	return err;

I don't see the need to carry along this baggage.  Does anyone
in user-space actually rely on EBUSY?

Cheers,
-- 
Email: Herbert Xu <herbert-lOAM2aK0SrRLBo1qDEOMRrpzq4S04n8Q@public.gmane.org>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH] mwifiex: Use put_unaligned_le32
From: kbuild test robot @ 2017-10-07  3:31 UTC (permalink / raw)
  To: Himanshu Jha
  Cc: kbuild-all, amitkarwar, nishants, gbhat, huxm, kvalo,
	linux-wireless, netdev, linux-kernel, Himanshu Jha
In-Reply-To: <1507141686-5178-1-git-send-email-himanshujha199640@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 12590 bytes --]

Hi Himanshu,

[auto build test ERROR on wireless-drivers-next/master]
[also build test ERROR on v4.14-rc3 next-20170929]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Himanshu-Jha/mwifiex-Use-put_unaligned_le32/20171007-095017
base:   https://git.kernel.org/pub/scm/linux/kernel/git/kvalo/wireless-drivers-next.git master
config: xtensa-allyesconfig (attached as .config)
compiler: xtensa-linux-gcc (GCC) 4.9.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=xtensa 

All errors (new ones prefixed by >>):

   In file included from arch/xtensa/include/asm/unaligned.h:22:0,
                    from include/linux/etherdevice.h:28,
                    from include/linux/ieee80211.h:22,
                    from drivers/net//wireless/marvell/mwifiex/decl.h:28,
                    from drivers/net//wireless/marvell/mwifiex/cmdevt.c:21:
>> include/linux/unaligned/be_struct.h:6:19: error: redefinition of 'get_unaligned_be16'
    static inline u16 get_unaligned_be16(const void *p)
                      ^
   In file included from drivers/net//wireless/marvell/mwifiex/cmdevt.c:20:0:
   include/linux/unaligned/access_ok.h:22:28: note: previous definition of 'get_unaligned_be16' was here
    static __always_inline u16 get_unaligned_be16(const void *p)
                               ^
   In file included from arch/xtensa/include/asm/unaligned.h:22:0,
                    from include/linux/etherdevice.h:28,
                    from include/linux/ieee80211.h:22,
                    from drivers/net//wireless/marvell/mwifiex/decl.h:28,
                    from drivers/net//wireless/marvell/mwifiex/cmdevt.c:21:
>> include/linux/unaligned/be_struct.h:11:19: error: redefinition of 'get_unaligned_be32'
    static inline u32 get_unaligned_be32(const void *p)
                      ^
   In file included from drivers/net//wireless/marvell/mwifiex/cmdevt.c:20:0:
   include/linux/unaligned/access_ok.h:27:28: note: previous definition of 'get_unaligned_be32' was here
    static __always_inline u32 get_unaligned_be32(const void *p)
                               ^
   In file included from arch/xtensa/include/asm/unaligned.h:22:0,
                    from include/linux/etherdevice.h:28,
                    from include/linux/ieee80211.h:22,
                    from drivers/net//wireless/marvell/mwifiex/decl.h:28,
                    from drivers/net//wireless/marvell/mwifiex/cmdevt.c:21:
>> include/linux/unaligned/be_struct.h:16:19: error: redefinition of 'get_unaligned_be64'
    static inline u64 get_unaligned_be64(const void *p)
                      ^
   In file included from drivers/net//wireless/marvell/mwifiex/cmdevt.c:20:0:
   include/linux/unaligned/access_ok.h:32:28: note: previous definition of 'get_unaligned_be64' was here
    static __always_inline u64 get_unaligned_be64(const void *p)
                               ^
   In file included from arch/xtensa/include/asm/unaligned.h:22:0,
                    from include/linux/etherdevice.h:28,
                    from include/linux/ieee80211.h:22,
                    from drivers/net//wireless/marvell/mwifiex/decl.h:28,
                    from drivers/net//wireless/marvell/mwifiex/cmdevt.c:21:
>> include/linux/unaligned/be_struct.h:21:20: error: redefinition of 'put_unaligned_be16'
    static inline void put_unaligned_be16(u16 val, void *p)
                       ^
   In file included from drivers/net//wireless/marvell/mwifiex/cmdevt.c:20:0:
   include/linux/unaligned/access_ok.h:52:29: note: previous definition of 'put_unaligned_be16' was here
    static __always_inline void put_unaligned_be16(u16 val, void *p)
                                ^
   In file included from arch/xtensa/include/asm/unaligned.h:22:0,
                    from include/linux/etherdevice.h:28,
                    from include/linux/ieee80211.h:22,
                    from drivers/net//wireless/marvell/mwifiex/decl.h:28,
                    from drivers/net//wireless/marvell/mwifiex/cmdevt.c:21:
>> include/linux/unaligned/be_struct.h:26:20: error: redefinition of 'put_unaligned_be32'
    static inline void put_unaligned_be32(u32 val, void *p)
                       ^
   In file included from drivers/net//wireless/marvell/mwifiex/cmdevt.c:20:0:
   include/linux/unaligned/access_ok.h:57:29: note: previous definition of 'put_unaligned_be32' was here
    static __always_inline void put_unaligned_be32(u32 val, void *p)
                                ^
   In file included from arch/xtensa/include/asm/unaligned.h:22:0,
                    from include/linux/etherdevice.h:28,
                    from include/linux/ieee80211.h:22,
                    from drivers/net//wireless/marvell/mwifiex/decl.h:28,
                    from drivers/net//wireless/marvell/mwifiex/cmdevt.c:21:
>> include/linux/unaligned/be_struct.h:31:20: error: redefinition of 'put_unaligned_be64'
    static inline void put_unaligned_be64(u64 val, void *p)
                       ^
   In file included from drivers/net//wireless/marvell/mwifiex/cmdevt.c:20:0:
   include/linux/unaligned/access_ok.h:62:29: note: previous definition of 'put_unaligned_be64' was here
    static __always_inline void put_unaligned_be64(u64 val, void *p)
                                ^
   In file included from arch/xtensa/include/asm/unaligned.h:23:0,
                    from include/linux/etherdevice.h:28,
                    from include/linux/ieee80211.h:22,
                    from drivers/net//wireless/marvell/mwifiex/decl.h:28,
                    from drivers/net//wireless/marvell/mwifiex/cmdevt.c:21:
>> include/linux/unaligned/le_byteshift.h:40:19: error: redefinition of 'get_unaligned_le16'
    static inline u16 get_unaligned_le16(const void *p)
                      ^
   In file included from drivers/net//wireless/marvell/mwifiex/cmdevt.c:20:0:
   include/linux/unaligned/access_ok.h:7:28: note: previous definition of 'get_unaligned_le16' was here
    static __always_inline u16 get_unaligned_le16(const void *p)
                               ^
   In file included from arch/xtensa/include/asm/unaligned.h:23:0,
                    from include/linux/etherdevice.h:28,
                    from include/linux/ieee80211.h:22,
                    from drivers/net//wireless/marvell/mwifiex/decl.h:28,
                    from drivers/net//wireless/marvell/mwifiex/cmdevt.c:21:
>> include/linux/unaligned/le_byteshift.h:45:19: error: redefinition of 'get_unaligned_le32'
    static inline u32 get_unaligned_le32(const void *p)
                      ^
   In file included from drivers/net//wireless/marvell/mwifiex/cmdevt.c:20:0:
   include/linux/unaligned/access_ok.h:12:28: note: previous definition of 'get_unaligned_le32' was here
    static __always_inline u32 get_unaligned_le32(const void *p)
                               ^
   In file included from arch/xtensa/include/asm/unaligned.h:23:0,
                    from include/linux/etherdevice.h:28,
                    from include/linux/ieee80211.h:22,
                    from drivers/net//wireless/marvell/mwifiex/decl.h:28,
                    from drivers/net//wireless/marvell/mwifiex/cmdevt.c:21:
>> include/linux/unaligned/le_byteshift.h:50:19: error: redefinition of 'get_unaligned_le64'
    static inline u64 get_unaligned_le64(const void *p)
                      ^
   In file included from drivers/net//wireless/marvell/mwifiex/cmdevt.c:20:0:
   include/linux/unaligned/access_ok.h:17:28: note: previous definition of 'get_unaligned_le64' was here
    static __always_inline u64 get_unaligned_le64(const void *p)
                               ^
   In file included from arch/xtensa/include/asm/unaligned.h:23:0,
                    from include/linux/etherdevice.h:28,
                    from include/linux/ieee80211.h:22,
                    from drivers/net//wireless/marvell/mwifiex/decl.h:28,
                    from drivers/net//wireless/marvell/mwifiex/cmdevt.c:21:
>> include/linux/unaligned/le_byteshift.h:55:20: error: redefinition of 'put_unaligned_le16'
    static inline void put_unaligned_le16(u16 val, void *p)
                       ^
   In file included from drivers/net//wireless/marvell/mwifiex/cmdevt.c:20:0:
   include/linux/unaligned/access_ok.h:37:29: note: previous definition of 'put_unaligned_le16' was here
    static __always_inline void put_unaligned_le16(u16 val, void *p)
                                ^
   In file included from arch/xtensa/include/asm/unaligned.h:23:0,
                    from include/linux/etherdevice.h:28,
                    from include/linux/ieee80211.h:22,
                    from drivers/net//wireless/marvell/mwifiex/decl.h:28,
                    from drivers/net//wireless/marvell/mwifiex/cmdevt.c:21:
>> include/linux/unaligned/le_byteshift.h:60:20: error: redefinition of 'put_unaligned_le32'
    static inline void put_unaligned_le32(u32 val, void *p)
                       ^
   In file included from drivers/net//wireless/marvell/mwifiex/cmdevt.c:20:0:
   include/linux/unaligned/access_ok.h:42:29: note: previous definition of 'put_unaligned_le32' was here
    static __always_inline void put_unaligned_le32(u32 val, void *p)
                                ^
   In file included from arch/xtensa/include/asm/unaligned.h:23:0,
                    from include/linux/etherdevice.h:28,
                    from include/linux/ieee80211.h:22,
                    from drivers/net//wireless/marvell/mwifiex/decl.h:28,
                    from drivers/net//wireless/marvell/mwifiex/cmdevt.c:21:
>> include/linux/unaligned/le_byteshift.h:65:20: error: redefinition of 'put_unaligned_le64'
    static inline void put_unaligned_le64(u64 val, void *p)
                       ^
   In file included from drivers/net//wireless/marvell/mwifiex/cmdevt.c:20:0:
   include/linux/unaligned/access_ok.h:47:29: note: previous definition of 'put_unaligned_le64' was here
    static __always_inline void put_unaligned_le64(u64 val, void *p)
                                ^

vim +/put_unaligned_le32 +60 include/linux/unaligned/le_byteshift.h

064106a9 Harvey Harrison 2008-04-29  39  
064106a9 Harvey Harrison 2008-04-29 @40  static inline u16 get_unaligned_le16(const void *p)
064106a9 Harvey Harrison 2008-04-29  41  {
064106a9 Harvey Harrison 2008-04-29  42  	return __get_unaligned_le16((const u8 *)p);
064106a9 Harvey Harrison 2008-04-29  43  }
064106a9 Harvey Harrison 2008-04-29  44  
064106a9 Harvey Harrison 2008-04-29 @45  static inline u32 get_unaligned_le32(const void *p)
064106a9 Harvey Harrison 2008-04-29  46  {
064106a9 Harvey Harrison 2008-04-29  47  	return __get_unaligned_le32((const u8 *)p);
064106a9 Harvey Harrison 2008-04-29  48  }
064106a9 Harvey Harrison 2008-04-29  49  
064106a9 Harvey Harrison 2008-04-29 @50  static inline u64 get_unaligned_le64(const void *p)
064106a9 Harvey Harrison 2008-04-29  51  {
064106a9 Harvey Harrison 2008-04-29  52  	return __get_unaligned_le64((const u8 *)p);
064106a9 Harvey Harrison 2008-04-29  53  }
064106a9 Harvey Harrison 2008-04-29  54  
064106a9 Harvey Harrison 2008-04-29 @55  static inline void put_unaligned_le16(u16 val, void *p)
064106a9 Harvey Harrison 2008-04-29  56  {
064106a9 Harvey Harrison 2008-04-29  57  	__put_unaligned_le16(val, p);
064106a9 Harvey Harrison 2008-04-29  58  }
064106a9 Harvey Harrison 2008-04-29  59  
064106a9 Harvey Harrison 2008-04-29 @60  static inline void put_unaligned_le32(u32 val, void *p)
064106a9 Harvey Harrison 2008-04-29  61  {
064106a9 Harvey Harrison 2008-04-29  62  	__put_unaligned_le32(val, p);
064106a9 Harvey Harrison 2008-04-29  63  }
064106a9 Harvey Harrison 2008-04-29  64  
064106a9 Harvey Harrison 2008-04-29 @65  static inline void put_unaligned_le64(u64 val, void *p)
064106a9 Harvey Harrison 2008-04-29  66  {
064106a9 Harvey Harrison 2008-04-29  67  	__put_unaligned_le64(val, p);
064106a9 Harvey Harrison 2008-04-29  68  }
064106a9 Harvey Harrison 2008-04-29  69  

:::::: The code at line 60 was first introduced by commit
:::::: 064106a91be5e76cb42c1ddf5d3871e3a1bd2a23 kernel: add common infrastructure for unaligned access

:::::: TO: Harvey Harrison <harvey.harrison@gmail.com>
:::::: CC: Linus Torvalds <torvalds@linux-foundation.org>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 51721 bytes --]

^ permalink raw reply

* (unknown), 
From: agar2000 @ 2017-10-07  3:40 UTC (permalink / raw)
  To: netdev

[-- Attachment #1: 26521476.zip --]
[-- Type: application/zip, Size: 7189 bytes --]

^ permalink raw reply

* Re: [PATCH 02/10] drivers:crypto: return -ENOMEM on allocation failure.
From: Herbert Xu @ 2017-10-07  4:21 UTC (permalink / raw)
  To: Allen Pais
  Cc: linux-kernel, nouveau, linux-crypto, dri-devel,
	MPT-FusionLinux.pdl, linux-scsi, netdev, megaraidlinux.pdl,
	target-devel, linux-fbdev, linux-btrfs, allen.lkml
In-Reply-To: <1505287939-14106-2-git-send-email-allen.lkml@gmail.com>

Allen Pais <allen.lkml@gmail.com> wrote:
> Signed-off-by: Allen Pais <allen.lkml@gmail.com>

Patch applied.  Thanks.
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox