[PATCH rdma-next 8/8] RDMA/mlx5: Add work to remove temporary entries from the cache

Linux RDMA and InfiniBand development
 help / color / mirror / Atom feed

From: Michael Guralnik <michaelgur@nvidia.com>
To: <jgg@nvidia.com>, <leonro@nvidia.com>
Cc: <maorg@nvidia.com>, <linux-rdma@vger.kernel.org>,
	<saeedm@nvidia.com>, Aharon Landau <aharonl@nvidia.com>,
	Michael Guralnik <michaelgur@nvidia.com>
Subject: [PATCH rdma-next 8/8] RDMA/mlx5: Add work to remove temporary entries from the cache
Date: Thu, 8 Sep 2022 23:54:21 +0300	[thread overview]
Message-ID: <20220908205421.210048-9-michaelgur@nvidia.com> (raw)
In-Reply-To: <20220908205421.210048-1-michaelgur@nvidia.com>

From: Aharon Landau <aharonl@nvidia.com>

Non-cache mkeys are stored in the cache only to shorten application
restart time. Don't store them longer than needed.

Configure cache entries that store non-cache MRs as temporary entries.
If 30 seconds pass and no user has reclaimed the temporarily cached
mkeys, an asynchronous work item destroys the mkeys and the temporary
entries.

When allocating an mkey from a temporary entry, don't keep a pointer to
the entry, as it might be destroyed.

Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Reviewed-by: Michael Guralnik <michaelgur@nvidia.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  2 +
 drivers/infiniband/hw/mlx5/mr.c      | 95 ++++++++++++++++++++++++----
 2 files changed, 83 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 109e3d666264..c1e1e3be6e84 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -748,6 +748,7 @@ struct mlx5_cache_ent {
 
 	char                    name[4];
 
+	u8 is_tmp:1;
 	u8 disabled:1;
 	u8 fill_to_high_water:1;
 
@@ -781,6 +782,7 @@ struct mlx5_mkey_cache {
 	struct mutex		rb_lock;
 	struct dentry		*fs_root;
 	unsigned long		last_add;
+	struct delayed_work	remove_ent_dwork;
 };
 
 struct mlx5_ib_port_resources {
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 1e7b3c2d71a7..c5100b5dcf30 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -141,18 +141,16 @@ static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
 }
 
 
-static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings,
-		     void *to_store)
+static int push_mkey_locked(struct mlx5_cache_ent *ent, bool limit_pendings,
+			    void *to_store)
 {
 	XA_STATE(xas, &ent->mkeys, 0);
 	void *curr;
 
-	xa_lock_irq(&ent->mkeys);
 	if (limit_pendings &&
-	    (ent->reserved - ent->stored) > MAX_PENDING_REG_MR) {
-		xa_unlock_irq(&ent->mkeys);
+	    (ent->reserved - ent->stored) > MAX_PENDING_REG_MR)
 		return -EAGAIN;
-	}
+
 	while (1) {
 		/*
 		 * This is cmpxchg (NULL, XA_ZERO_ENTRY) however this version
@@ -191,6 +189,7 @@ static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings,
 			break;
 		xa_lock_irq(&ent->mkeys);
 	}
+	xa_lock_irq(&ent->mkeys);
 	if (xas_error(&xas))
 		return xas_error(&xas);
 	if (WARN_ON(curr))
@@ -198,6 +197,17 @@ static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings,
 	return 0;
 }
 
+static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings,
+		     void *to_store)
+{
+	int ret;
+
+	xa_lock_irq(&ent->mkeys);
+	ret = push_mkey_locked(ent, limit_pendings, to_store);
+	xa_unlock_irq(&ent->mkeys);
+	return ret;
+}
+
 static void undo_push_reserve_mkey(struct mlx5_cache_ent *ent)
 {
 	void *old;
@@ -542,7 +552,7 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
 {
 	lockdep_assert_held(&ent->mkeys.xa_lock);
 
-	if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
+	if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp)
 		return;
 	if (ent->stored < ent->limit) {
 		ent->fill_to_high_water = true;
@@ -718,9 +728,18 @@ static bool mlx5_ent_get_mkey(struct mlx5_cache_ent *ent, struct mlx5_ib_mr *mr)
 
 	mr->mmkey.key = pop_stored_mkey(ent);
 	mr->mmkey.ndescs = ent->rb_key.ndescs;
-	mr->mmkey.cache_ent = ent;
-	queue_adjust_cache_locked(ent);
-	ent->in_use++;
+	if (!ent->is_tmp) {
+		mr->mmkey.cache_ent = ent;
+		queue_adjust_cache_locked(ent);
+		ent->in_use++;
+
+	} else {
+		mr->mmkey.rb_key = ent->rb_key;
+		mod_delayed_work(ent->dev->cache.wq,
+				 &ent->dev->cache.remove_ent_dwork,
+				 msecs_to_jiffies(30 * 1000));
+	}
+
 	xa_unlock_irq(&ent->mkeys);
 	return true;
 }
@@ -890,6 +909,38 @@ struct mlx5_cache_ent *mlx5r_cache_create_ent(struct mlx5_ib_dev *dev,
 	return ent;
 }
 
+static void remove_ent_work_func(struct work_struct *work)
+{
+	struct mlx5_mkey_cache *cache;
+	struct mlx5_cache_ent *ent;
+	struct rb_node *cur;
+
+	cache = container_of(work, struct mlx5_mkey_cache,
+			     remove_ent_dwork.work);
+	mutex_lock(&cache->rb_lock);
+	cur = rb_last(&cache->rb_root);
+	while (cur) {
+		ent = rb_entry(cur, struct mlx5_cache_ent, node);
+		cur = rb_prev(cur);
+		mutex_unlock(&cache->rb_lock);
+
+		xa_lock_irq(&ent->mkeys);
+		if (!ent->is_tmp) {
+			xa_unlock_irq(&ent->mkeys);
+			mutex_lock(&cache->rb_lock);
+			continue;
+		}
+		ent->disabled = true;
+		xa_unlock_irq(&ent->mkeys);
+
+		clean_keys(ent->dev, ent);
+		mutex_lock(&cache->rb_lock);
+		rb_erase(&ent->node, &cache->rb_root);
+		kfree(ent);
+	}
+	mutex_unlock(&cache->rb_lock);
+}
+
 static int mlx5_cache_init_default_entries(struct mlx5_ib_dev *dev)
 {
 	struct mlx5r_cache_rb_key rb_key = { .access_mode =
@@ -944,6 +995,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
 	mutex_init(&dev->slow_path_mutex);
 	mutex_init(&dev->cache.rb_lock);
 	dev->cache.rb_root = RB_ROOT;
+	INIT_DELAYED_WORK(&dev->cache.remove_ent_dwork, remove_ent_work_func);
 	dev->cache.wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
 	if (!dev->cache.wq) {
 		mlx5_ib_warn(dev, "failed to create work queue\n");
@@ -971,6 +1023,7 @@ int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
 	if (!dev->cache.wq)
 		return 0;
 
+	cancel_delayed_work_sync(&dev->cache.remove_ent_dwork);
 	mutex_lock(&dev->cache.rb_lock);
 	for (node = rb_first(root); node; node = rb_next(node)) {
 		ent = rb_entry(node, struct mlx5_cache_ent, node);
@@ -1730,34 +1783,48 @@ static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
 	struct mlx5_mkey_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent;
 	struct rb_node *node;
+	int ret;
 
 	if (mr->mmkey.cache_ent) {
 		xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
 		mr->mmkey.cache_ent->in_use--;
-		xa_unlock_irq(&mr->mmkey.cache_ent->mkeys);
 		goto end;
 	}
 
 	mutex_lock(&cache->rb_lock);
+	mod_delayed_work(cache->wq, &cache->remove_ent_dwork,
+			 msecs_to_jiffies(30 * 1000));
 	node = mlx5_cache_find_smallest_ent(&dev->cache, mr->mmkey.rb_key);
-	mutex_unlock(&cache->rb_lock);
 	if (node) {
 		ent = rb_entry(node, struct mlx5_cache_ent, node);
 		if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) {
+			if (ent->disabled) {
+				mutex_unlock(&cache->rb_lock);
+				return -EOPNOTSUPP;
+			}
+
 			mr->mmkey.cache_ent = ent;
+			xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
+			mutex_unlock(&cache->rb_lock);
 			goto end;
 		}
 	}
+	mutex_unlock(&cache->rb_lock);
 
 	ent = mlx5r_cache_create_ent(dev, mr->mmkey.rb_key);
 	if (IS_ERR(ent))
 		return PTR_ERR(ent);
 
 	mr->mmkey.cache_ent = ent;
+	xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
+	ent->is_tmp = true;
 
 end:
-	return push_mkey(mr->mmkey.cache_ent, false,
-			 xa_mk_value(mr->mmkey.key));
+	ret = push_mkey_locked(mr->mmkey.cache_ent, false,
+			       xa_mk_value(mr->mmkey.key));
+	xa_unlock_irq(&mr->mmkey.cache_ent->mkeys);
+	return ret;
+
 }
 
 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
-- 
2.17.2

     prev parent reply	other threads:[~2022-09-08 20:55 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-09-08 20:54 [PATCH rdma-next 0/8] RDMA/mlx5: Switch MR cache to use RB-tree Michael Guralnik
2022-09-08 20:54 ` [PATCH rdma-next 1/8] RDMA/mlx5: Don't keep umrable 'page_shift' in cache entries Michael Guralnik
2022-09-08 20:54 ` [PATCH rdma-next 2/8] RDMA/mlx5: Generalize mlx5_cache_cache_mr() to fit all cacheable mkeys Michael Guralnik
2022-09-09 14:47   ` Jason Gunthorpe
2022-09-08 20:54 ` [PATCH rdma-next 3/8] RDMA/mlx5: Remove explicit ODP cache entry Michael Guralnik
2022-09-08 20:54 ` [PATCH rdma-next 4/8] RDMA/mlx5: Allow rereg all the mkeys that can load pas with UMR Michael Guralnik
2022-09-08 20:54 ` [PATCH rdma-next 5/8] RDMA/mlx5: Introduce mlx5r_cache_rb_key Michael Guralnik
2022-09-08 20:54 ` [PATCH rdma-next 6/8] RDMA/mlx5: Change the cache structure to an RB-tree Michael Guralnik
2022-09-08 20:54 ` [PATCH rdma-next 7/8] RDMA/mlx5: Cache all user cacheable mkeys on dereg MR flow Michael Guralnik
2022-09-08 20:54 ` Michael Guralnik [this message]

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:109e3d66626 dfblob:c1e1e3be6e8 dfblob:1e7b3c2d71a
dfblob:c5100b5dcf3 )
 OR (
bs:"[PATCH rdma-next 8/8] RDMA/mlx5: Add work to remove temporary entries from the cache" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220908205421.210048-9-michaelgur@nvidia.com \
    --to=michaelgur@nvidia.com \
    --cc=aharonl@nvidia.com \
    --cc=jgg@nvidia.com \
    --cc=leonro@nvidia.com \
    --cc=linux-rdma@vger.kernel.org \
    --cc=maorg@nvidia.com \
    --cc=saeedm@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox