* [PATCH rdma-next 2/6] IB/umem: Update on demand page (ODP) support
[not found] ` <20170118145811.9136-1-leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
2017-01-18 14:58 ` [PATCH rdma-next 1/6] IB/core: Add implicit MR flag Leon Romanovsky
@ 2017-01-18 14:58 ` Leon Romanovsky
2017-01-18 14:58 ` [PATCH rdma-next 3/6] IB/umem: Indicate that process is being terminated Leon Romanovsky
` (4 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: Leon Romanovsky @ 2017-01-18 14:58 UTC (permalink / raw)
To: dledford-H+wXaHxf7aLQT0dZR+AlfA
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Artemy Kovalyov, Ilya Lesokhin
From: Artemy Kovalyov <artemyko-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Currently an ODP MR may only explicitly register a virtual address space
area of limited length.
This change allows an MR to cover the entire process virtual address space,
dynamically adding/removing translation entries to/from the device MTT.
Add the following changes to support implicit MR:
* Allow a umem to be zero-sized so that it can back an implicit MR.
* Add new function ib_alloc_odp_umem() to add virtual memory regions
to an implicit MR dynamically on demand.
* Add new function rbt_ib_umem_lookup() to find dynamically added
virtual memory regions.
* Expose function rbt_ib_umem_for_each_in_range() to other modules and
make it safe for the callback to remove the node currently being visited
(the next node is fetched before the callback is invoked).
Signed-off-by: Artemy Kovalyov <artemyko-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Leon Romanovsky <leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
---
drivers/infiniband/core/umem.c | 3 --
drivers/infiniband/core/umem_odp.c | 87 +++++++++++++++++++++++++++++++----
drivers/infiniband/core/umem_rbtree.c | 21 +++++++--
include/rdma/ib_umem_odp.h | 21 +++++++--
4 files changed, 113 insertions(+), 19 deletions(-)
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 1e62a5f..9f9630b 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -99,9 +99,6 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
if (dmasync)
dma_attrs |= DMA_ATTR_WRITE_BARRIER;
- if (!size)
- return ERR_PTR(-EINVAL);
-
/*
* If the combination of the addr and size requested for this memory
* region causes an integer overflow, return error.
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 6b079a3..1104d36 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -239,6 +239,71 @@ static const struct mmu_notifier_ops ib_umem_notifiers = {
.invalidate_range_end = ib_umem_notifier_invalidate_range_end,
};
+struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context,
+ unsigned long addr,
+ size_t size)
+{
+ struct ib_umem *umem;
+ struct ib_umem_odp *odp_data;
+ int pages = size >> PAGE_SHIFT;
+ int ret;
+
+ umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+ if (!umem)
+ return ERR_PTR(-ENOMEM);
+
+ umem->context = context;
+ umem->length = size;
+ umem->address = addr;
+ umem->page_size = PAGE_SIZE;
+ umem->writable = 1;
+
+ odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL);
+ if (!odp_data) {
+ ret = -ENOMEM;
+ goto out_umem;
+ }
+ odp_data->umem = umem;
+
+ mutex_init(&odp_data->umem_mutex);
+ init_completion(&odp_data->notifier_completion);
+
+ odp_data->page_list = vzalloc(pages * sizeof(*odp_data->page_list));
+ if (!odp_data->page_list) {
+ ret = -ENOMEM;
+ goto out_odp_data;
+ }
+
+ odp_data->dma_list = vzalloc(pages * sizeof(*odp_data->dma_list));
+ if (!odp_data->dma_list) {
+ ret = -ENOMEM;
+ goto out_page_list;
+ }
+
+ down_write(&context->umem_rwsem);
+ context->odp_mrs_count++;
+ rbt_ib_umem_insert(&odp_data->interval_tree, &context->umem_tree);
+ if (likely(!atomic_read(&context->notifier_count)))
+ odp_data->mn_counters_active = true;
+ else
+ list_add(&odp_data->no_private_counters,
+ &context->no_private_counters);
+ up_write(&context->umem_rwsem);
+
+ umem->odp_data = odp_data;
+
+ return umem;
+
+out_page_list:
+ vfree(odp_data->page_list);
+out_odp_data:
+ kfree(odp_data);
+out_umem:
+ kfree(umem);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_alloc_odp_umem);
+
int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
{
int ret_val;
@@ -270,18 +335,20 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
init_completion(&umem->odp_data->notifier_completion);
- umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) *
+ if (ib_umem_num_pages(umem)) {
+ umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) *
sizeof(*umem->odp_data->page_list));
- if (!umem->odp_data->page_list) {
- ret_val = -ENOMEM;
- goto out_odp_data;
- }
+ if (!umem->odp_data->page_list) {
+ ret_val = -ENOMEM;
+ goto out_odp_data;
+ }
- umem->odp_data->dma_list = vzalloc(ib_umem_num_pages(umem) *
+ umem->odp_data->dma_list = vzalloc(ib_umem_num_pages(umem) *
sizeof(*umem->odp_data->dma_list));
- if (!umem->odp_data->dma_list) {
- ret_val = -ENOMEM;
- goto out_page_list;
+ if (!umem->odp_data->dma_list) {
+ ret_val = -ENOMEM;
+ goto out_page_list;
+ }
}
/*
@@ -466,6 +533,7 @@ static int ib_umem_odp_map_dma_single_page(
}
umem->odp_data->dma_list[page_index] = dma_addr | access_mask;
umem->odp_data->page_list[page_index] = page;
+ umem->npages++;
stored_page = 1;
} else if (umem->odp_data->page_list[page_index] == page) {
umem->odp_data->dma_list[page_index] |= access_mask;
@@ -665,6 +733,7 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt,
put_page(page);
umem->odp_data->page_list[idx] = NULL;
umem->odp_data->dma_list[idx] = 0;
+ umem->npages--;
}
}
mutex_unlock(&umem->odp_data->umem_mutex);
diff --git a/drivers/infiniband/core/umem_rbtree.c b/drivers/infiniband/core/umem_rbtree.c
index 727d788..d176597 100644
--- a/drivers/infiniband/core/umem_rbtree.c
+++ b/drivers/infiniband/core/umem_rbtree.c
@@ -78,17 +78,32 @@ int rbt_ib_umem_for_each_in_range(struct rb_root *root,
void *cookie)
{
int ret_val = 0;
- struct umem_odp_node *node;
+ struct umem_odp_node *node, *next;
struct ib_umem_odp *umem;
if (unlikely(start == last))
return ret_val;
- for (node = rbt_ib_umem_iter_first(root, start, last - 1); node;
- node = rbt_ib_umem_iter_next(node, start, last - 1)) {
+ for (node = rbt_ib_umem_iter_first(root, start, last - 1);
+ node; node = next) {
+ next = rbt_ib_umem_iter_next(node, start, last - 1);
umem = container_of(node, struct ib_umem_odp, interval_tree);
ret_val = cb(umem->umem, start, last, cookie) || ret_val;
}
return ret_val;
}
+EXPORT_SYMBOL(rbt_ib_umem_for_each_in_range);
+
+struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root *root,
+ u64 addr, u64 length)
+{
+ struct umem_odp_node *node;
+
+ node = rbt_ib_umem_iter_first(root, addr, addr + length - 1);
+ if (node)
+ return container_of(node, struct ib_umem_odp, interval_tree);
+ return NULL;
+
+}
+EXPORT_SYMBOL(rbt_ib_umem_lookup);
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index 3da0b16..542cd8b 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -79,11 +79,15 @@ struct ib_umem_odp {
struct completion notifier_completion;
int dying;
+ struct work_struct work;
};
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem);
+struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context,
+ unsigned long addr,
+ size_t size);
void ib_umem_odp_release(struct ib_umem *umem);
@@ -117,10 +121,12 @@ typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end,
int rbt_ib_umem_for_each_in_range(struct rb_root *root, u64 start, u64 end,
umem_call_back cb, void *cookie);
-struct umem_odp_node *rbt_ib_umem_iter_first(struct rb_root *root,
- u64 start, u64 last);
-struct umem_odp_node *rbt_ib_umem_iter_next(struct umem_odp_node *node,
- u64 start, u64 last);
+/*
+ * Find first region intersecting with address range.
+ * Return NULL if not found
+ */
+struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root *root,
+ u64 addr, u64 length);
static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item,
unsigned long mmu_seq)
@@ -153,6 +159,13 @@ static inline int ib_umem_odp_get(struct ib_ucontext *context,
return -EINVAL;
}
+static inline struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context,
+ unsigned long addr,
+ size_t size)
+{
+ return ERR_PTR(-EINVAL);
+}
+
static inline void ib_umem_odp_release(struct ib_umem *umem) {}
#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
--
2.10.2
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH rdma-next 5/6] IB/mlx5: Expose MR cache for mlx5_ib
[not found] ` <20170118145811.9136-1-leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
` (3 preceding siblings ...)
2017-01-18 14:58 ` [PATCH rdma-next 4/6] IB/mlx5: Add null_mkey access Leon Romanovsky
@ 2017-01-18 14:58 ` Leon Romanovsky
2017-01-18 14:58 ` [PATCH rdma-next 6/6] IB/mlx5: Add implicit MR support Leon Romanovsky
2017-02-14 16:45 ` [PATCH rdma-next 0/6] Expand ODP MR to support implicit registration Doug Ledford
6 siblings, 0 replies; 8+ messages in thread
From: Leon Romanovsky @ 2017-01-18 14:58 UTC (permalink / raw)
To: dledford-H+wXaHxf7aLQT0dZR+AlfA
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Artemy Kovalyov, Ilya Lesokhin
From: Artemy Kovalyov <artemyko-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Allow other parts of mlx5_ib to use the MR cache mechanism.
* Add new functions mlx5_mr_cache_alloc() and mlx5_mr_cache_free().
* Traditional MTT MKey buckets are limited by MAX_UMR_CACHE_ENTRY.
Additional buckets may be added above it.
Signed-off-by: Artemy Kovalyov <artemyko-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Leon Romanovsky <leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
---
drivers/infiniband/hw/mlx5/mlx5_ib.h | 9 +++-
drivers/infiniband/hw/mlx5/mr.c | 99 ++++++++++++++++++++++++++----------
include/linux/mlx5/driver.h | 3 +-
3 files changed, 82 insertions(+), 29 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index e1a4b93d..0eda2de 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -535,6 +535,10 @@ struct mlx5_cache_ent {
struct dentry *dir;
char name[4];
u32 order;
+ u32 xlt;
+ u32 access_mode;
+ u32 page;
+
u32 size;
u32 cur;
u32 miss;
@@ -549,6 +553,7 @@ struct mlx5_cache_ent {
struct work_struct work;
struct delayed_work dwork;
int pending;
+ struct completion compl;
};
struct mlx5_mr_cache {
@@ -824,7 +829,9 @@ void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq);
int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);
int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);
-int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift);
+
+struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry);
+void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
struct ib_mr_status *mr_status);
struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 8cf2a67..8f5b94d 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -49,6 +49,7 @@ enum {
static int clean_mr(struct mlx5_ib_mr *mr);
static int use_umr(struct mlx5_ib_dev *dev, int order);
+static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
@@ -149,6 +150,9 @@ static void reg_mr_callback(int status, void *context)
if (err)
pr_err("Error inserting to mkey tree. 0x%x\n", -err);
write_unlock_irqrestore(&table->lock, flags);
+
+ if (!completion_done(&ent->compl))
+ complete(&ent->compl);
}
static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
@@ -157,7 +161,6 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
struct mlx5_cache_ent *ent = &cache->ent[c];
int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
struct mlx5_ib_mr *mr;
- int npages = 1 << ent->order;
void *mkc;
u32 *in;
int err = 0;
@@ -185,11 +188,11 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
MLX5_SET(mkc, mkc, free, 1);
MLX5_SET(mkc, mkc, umr_en, 1);
- MLX5_SET(mkc, mkc, access_mode, MLX5_MKC_ACCESS_MODE_MTT);
+ MLX5_SET(mkc, mkc, access_mode, ent->access_mode);
MLX5_SET(mkc, mkc, qpn, 0xffffff);
- MLX5_SET(mkc, mkc, translations_octword_size, (npages + 1) / 2);
- MLX5_SET(mkc, mkc, log_page_size, 12);
+ MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt);
+ MLX5_SET(mkc, mkc, log_page_size, ent->page);
spin_lock_irq(&ent->lock);
ent->pending++;
@@ -447,6 +450,42 @@ static void cache_work_func(struct work_struct *work)
__cache_work_func(ent);
}
+struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry)
+{
+ struct mlx5_mr_cache *cache = &dev->cache;
+ struct mlx5_cache_ent *ent;
+ struct mlx5_ib_mr *mr;
+ int err;
+
+ if (entry < 0 || entry >= MAX_MR_CACHE_ENTRIES) {
+ mlx5_ib_err(dev, "cache entry %d is out of range\n", entry);
+ return NULL;
+ }
+
+ ent = &cache->ent[entry];
+ while (1) {
+ spin_lock_irq(&ent->lock);
+ if (list_empty(&ent->head)) {
+ spin_unlock_irq(&ent->lock);
+
+ err = add_keys(dev, entry, 1);
+ if (err)
+ return ERR_PTR(err);
+
+ wait_for_completion(&ent->compl);
+ } else {
+ mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
+ list);
+ list_del(&mr->list);
+ ent->cur--;
+ spin_unlock_irq(&ent->lock);
+ if (ent->cur < ent->limit)
+ queue_work(cache->wq, &ent->work);
+ return mr;
+ }
+ }
+}
+
static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
{
struct mlx5_mr_cache *cache = &dev->cache;
@@ -456,12 +495,12 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
int i;
c = order2idx(dev, order);
- if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
+ if (c < 0 || c > MAX_UMR_CACHE_ENTRY) {
mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c);
return NULL;
}
- for (i = c; i < MAX_MR_CACHE_ENTRIES; i++) {
+ for (i = c; i < MAX_UMR_CACHE_ENTRY; i++) {
ent = &cache->ent[i];
mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i);
@@ -488,7 +527,7 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
return mr;
}
-static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
+void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
struct mlx5_mr_cache *cache = &dev->cache;
struct mlx5_cache_ent *ent;
@@ -500,6 +539,10 @@ static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c);
return;
}
+
+ if (unreg_umr(dev, mr))
+ return;
+
ent = &cache->ent[c];
spin_lock_irq(&ent->lock);
list_add_tail(&mr->list, &ent->head);
@@ -602,7 +645,6 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
{
struct mlx5_mr_cache *cache = &dev->cache;
struct mlx5_cache_ent *ent;
- int limit;
int err;
int i;
@@ -615,26 +657,33 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev);
for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
- INIT_LIST_HEAD(&cache->ent[i].head);
- spin_lock_init(&cache->ent[i].lock);
-
ent = &cache->ent[i];
INIT_LIST_HEAD(&ent->head);
spin_lock_init(&ent->lock);
ent->order = i + 2;
ent->dev = dev;
+ ent->limit = 0;
- if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) &&
- mlx5_core_is_pf(dev->mdev) &&
- use_umr(dev, ent->order))
- limit = dev->mdev->profile->mr_cache[i].limit;
- else
- limit = 0;
-
+ init_completion(&ent->compl);
INIT_WORK(&ent->work, cache_work_func);
INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
- ent->limit = limit;
queue_work(cache->wq, &ent->work);
+
+ if (i > MAX_UMR_CACHE_ENTRY)
+ continue;
+
+ if (!use_umr(dev, ent->order))
+ continue;
+
+ ent->page = PAGE_SHIFT;
+ ent->xlt = (1 << ent->order) * sizeof(struct mlx5_mtt) /
+ MLX5_IB_UMR_OCTOWORD;
+ ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
+ if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) &&
+ mlx5_core_is_pf(dev->mdev))
+ ent->limit = dev->mdev->profile->mr_cache[i].limit;
+ else
+ ent->limit = 0;
}
err = mlx5_mr_cache_debugfs_init(dev);
@@ -758,7 +807,7 @@ static int get_octo_len(u64 addr, u64 len, int page_size)
static int use_umr(struct mlx5_ib_dev *dev, int order)
{
if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
- return order < MAX_MR_CACHE_ENTRIES + 2;
+ return order <= MAX_UMR_CACHE_ENTRY + 2;
return order <= MLX5_MAX_UMR_SHIFT;
}
@@ -871,7 +920,7 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
MLX5_IB_UPD_XLT_ENABLE);
if (err) {
- free_cached_mr(dev, mr);
+ mlx5_mr_cache_free(dev, mr);
return ERR_PTR(err);
}
@@ -1091,6 +1140,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
goto err_2;
}
mr->mmkey.type = MLX5_MKEY_MR;
+ mr->desc_size = sizeof(struct mlx5_mtt);
mr->umem = umem;
mr->dev = dev;
mr->live = 1;
@@ -1398,12 +1448,7 @@ static int clean_mr(struct mlx5_ib_mr *mr)
return err;
}
} else {
- err = unreg_umr(dev, mr);
- if (err) {
- mlx5_ib_warn(dev, "failed unregister\n");
- return err;
- }
- free_cached_mr(dev, mr);
+ mlx5_mr_cache_free(dev, mr);
}
if (!umred)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 10e6325..a78ee73 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1052,7 +1052,8 @@ enum {
};
enum {
- MAX_MR_CACHE_ENTRIES = 21,
+ MAX_UMR_CACHE_ENTRY = 20,
+ MAX_MR_CACHE_ENTRIES
};
enum {
--
2.10.2
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH rdma-next 6/6] IB/mlx5: Add implicit MR support
[not found] ` <20170118145811.9136-1-leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
` (4 preceding siblings ...)
2017-01-18 14:58 ` [PATCH rdma-next 5/6] IB/mlx5: Expose MR cache for mlx5_ib Leon Romanovsky
@ 2017-01-18 14:58 ` Leon Romanovsky
2017-02-14 16:45 ` [PATCH rdma-next 0/6] Expand ODP MR to support implicit registration Doug Ledford
6 siblings, 0 replies; 8+ messages in thread
From: Leon Romanovsky @ 2017-01-18 14:58 UTC (permalink / raw)
To: dledford-H+wXaHxf7aLQT0dZR+AlfA
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Artemy Kovalyov, Ilya Lesokhin
From: Artemy Kovalyov <artemyko-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Add implicit MR, covering the entire user address space.
The MR is implemented as an indirect KSM MR consisting of
1GB direct MRs.
Pages and direct MRs are added to/removed from the MR by ODP.
Signed-off-by: Artemy Kovalyov <artemyko-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Leon Romanovsky <leon-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
---
drivers/infiniband/hw/mlx5/main.c | 2 +
drivers/infiniband/hw/mlx5/mlx5_ib.h | 20 +-
drivers/infiniband/hw/mlx5/mr.c | 33 ++-
drivers/infiniband/hw/mlx5/odp.c | 505 ++++++++++++++++++++++++++++++++---
include/linux/mlx5/driver.h | 2 +
5 files changed, 513 insertions(+), 49 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index a191b93..475a458 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -3424,6 +3424,8 @@ static int __init mlx5_ib_init(void)
if (deprecated_prof_sel != 2)
pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n");
+ mlx5_ib_odp_init();
+
err = mlx5_register_interface(&mlx5_ib_interface);
return err;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 0eda2de..560dae0c 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -202,6 +202,7 @@ struct mlx5_ib_flow_db {
#define MLX5_IB_UPD_XLT_ADDR BIT(3)
#define MLX5_IB_UPD_XLT_PD BIT(4)
#define MLX5_IB_UPD_XLT_ACCESS BIT(5)
+#define MLX5_IB_UPD_XLT_INDIRECT BIT(6)
/* Private QP creation flags to be passed in ib_qp_init_attr.create_flags.
*
@@ -497,6 +498,10 @@ struct mlx5_ib_mr {
int live;
void *descs_alloc;
int access_flags; /* Needed for rereg MR */
+
+ struct mlx5_ib_mr *parent;
+ atomic_t num_leaf_free;
+ wait_queue_head_t q_leaf_free;
};
struct mlx5_ib_mw {
@@ -624,6 +629,7 @@ struct mlx5_ib_dev {
* being used by a page fault handler.
*/
struct srcu_struct mr_srcu;
+ u32 null_mkey;
#endif
struct mlx5_ib_flow_db flow_db;
/* protect resources needed as part of reset flow */
@@ -776,6 +782,9 @@ struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
int mlx5_ib_dealloc_mw(struct ib_mw *mw);
int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
int page_shift, int flags);
+struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
+ int access_flags);
+void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr);
int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
u64 length, u64 virt_addr, int access_flags,
struct ib_pd *pd, struct ib_udata *udata);
@@ -855,6 +864,9 @@ int __init mlx5_ib_odp_init(void);
void mlx5_ib_odp_cleanup(void);
void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
unsigned long end);
+void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
+void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
+ size_t nentries, struct mlx5_ib_mr *mr, int flags);
#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
@@ -862,9 +874,13 @@ static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
}
static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; }
-static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {}
+static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {}
static inline int mlx5_ib_odp_init(void) { return 0; }
-static inline void mlx5_ib_odp_cleanup(void) {}
+static inline void mlx5_ib_odp_cleanup(void) {}
+static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {}
+static inline void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
+ size_t nentries, struct mlx5_ib_mr *mr,
+ int flags) {}
#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 8f5b94d..3c1f483 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -469,7 +469,7 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry)
spin_unlock_irq(&ent->lock);
err = add_keys(dev, entry, 1);
- if (err)
+ if (err && err != -EAGAIN)
return ERR_PTR(err);
wait_for_completion(&ent->compl);
@@ -669,8 +669,10 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
queue_work(cache->wq, &ent->work);
- if (i > MAX_UMR_CACHE_ENTRY)
+ if (i > MAX_UMR_CACHE_ENTRY) {
+ mlx5_odp_init_mr_cache_entry(ent);
continue;
+ }
if (!use_umr(dev, ent->order))
continue;
@@ -935,6 +937,10 @@ static inline int populate_xlt(struct mlx5_ib_mr *mr, int idx, int npages,
{
struct mlx5_ib_dev *dev = mr->dev;
struct ib_umem *umem = mr->umem;
+ if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
+ mlx5_odp_populate_klm(xlt, idx, npages, mr, flags);
+ return npages;
+ }
npages = min_t(size_t, npages, ib_umem_num_pages(umem) - idx);
@@ -968,7 +974,9 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
struct mlx5_umr_wr wr;
struct ib_sge sg;
int err = 0;
- int desc_size = sizeof(struct mlx5_mtt);
+ int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT)
+ ? sizeof(struct mlx5_klm)
+ : sizeof(struct mlx5_mtt);
const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size;
const int page_mask = page_align - 1;
size_t pages_mapped = 0;
@@ -1186,6 +1194,18 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
start, virt_addr, length, access_flags);
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ if (!start && length == U64_MAX) {
+ if (!(access_flags & IB_ACCESS_ON_DEMAND) ||
+ !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
+ return ERR_PTR(-EINVAL);
+
+ mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
+ return &mr->ibmr;
+ }
+#endif
+
err = mr_umem_get(pd, start, length, access_flags, &umem, &npages,
&page_shift, &ncont, &order);
@@ -1471,8 +1491,11 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
/* Wait for all running page-fault handlers to finish. */
synchronize_srcu(&dev->mr_srcu);
/* Destroy all page mappings */
- mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
- ib_umem_end(umem));
+ if (umem->odp_data->page_list)
+ mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
+ ib_umem_end(umem));
+ else
+ mlx5_ib_free_implicit_mr(mr);
/*
* We kill the umem before the MR for ODP,
* so that there will not be any invalidations in
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index e5bc267..d7b12f0 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -34,6 +34,7 @@
#include <rdma/ib_umem_odp.h>
#include "mlx5_ib.h"
+#include "cmd.h"
#define MAX_PREFETCH_LEN (4*1024*1024U)
@@ -41,6 +42,140 @@
* a pagefault. */
#define MMU_NOTIFIER_TIMEOUT 1000
+#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT)
+#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT)
+#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS)
+#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT)
+#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1))
+
+#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT
+
+static u64 mlx5_imr_ksm_entries;
+
+static int check_parent(struct ib_umem_odp *odp,
+ struct mlx5_ib_mr *parent)
+{
+ struct mlx5_ib_mr *mr = odp->private;
+
+ return mr && mr->parent == parent;
+}
+
+static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp)
+{
+ struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent;
+ struct ib_ucontext *ctx = odp->umem->context;
+ struct rb_node *rb;
+
+ down_read(&ctx->umem_rwsem);
+ while (1) {
+ rb = rb_next(&odp->interval_tree.rb);
+ if (!rb)
+ goto not_found;
+ odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
+ if (check_parent(odp, parent))
+ goto end;
+ }
+not_found:
+ odp = NULL;
+end:
+ up_read(&ctx->umem_rwsem);
+ return odp;
+}
+
+static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx,
+ u64 start, u64 length,
+ struct mlx5_ib_mr *parent)
+{
+ struct ib_umem_odp *odp;
+ struct rb_node *rb;
+
+ down_read(&ctx->umem_rwsem);
+ odp = rbt_ib_umem_lookup(&ctx->umem_tree, start, length);
+ if (!odp)
+ goto end;
+
+ while (1) {
+ if (check_parent(odp, parent))
+ goto end;
+ rb = rb_next(&odp->interval_tree.rb);
+ if (!rb)
+ goto not_found;
+ odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
+ if (ib_umem_start(odp->umem) > start + length)
+ goto not_found;
+ }
+not_found:
+ odp = NULL;
+end:
+ up_read(&ctx->umem_rwsem);
+ return odp;
+}
+
+void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
+ size_t nentries, struct mlx5_ib_mr *mr, int flags)
+{
+ struct ib_pd *pd = mr->ibmr.pd;
+ struct ib_ucontext *ctx = pd->uobject->context;
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ struct ib_umem_odp *odp;
+ unsigned long va;
+ int i;
+
+ if (flags & MLX5_IB_UPD_XLT_ZAP) {
+ for (i = 0; i < nentries; i++, pklm++) {
+ pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
+ pklm->key = cpu_to_be32(dev->null_mkey);
+ pklm->va = 0;
+ }
+ return;
+ }
+
+ odp = odp_lookup(ctx, offset * MLX5_IMR_MTT_SIZE,
+ nentries * MLX5_IMR_MTT_SIZE, mr);
+
+ for (i = 0; i < nentries; i++, pklm++) {
+ pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
+ va = (offset + i) * MLX5_IMR_MTT_SIZE;
+ if (odp && odp->umem->address == va) {
+ struct mlx5_ib_mr *mtt = odp->private;
+
+ pklm->key = cpu_to_be32(mtt->ibmr.lkey);
+ odp = odp_next(odp);
+ } else {
+ pklm->key = cpu_to_be32(dev->null_mkey);
+ }
+ mlx5_ib_dbg(dev, "[%d] va %lx key %x\n",
+ i, va, be32_to_cpu(pklm->key));
+ }
+}
+
+static void mr_leaf_free_action(struct work_struct *work)
+{
+ struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
+ int idx = ib_umem_start(odp->umem) >> MLX5_IMR_MTT_SHIFT;
+ struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
+
+ mr->parent = NULL;
+ synchronize_srcu(&mr->dev->mr_srcu);
+
+ if (!READ_ONCE(odp->dying)) {
+ mr->parent = imr;
+ if (atomic_dec_and_test(&imr->num_leaf_free))
+ wake_up(&imr->q_leaf_free);
+ return;
+ }
+
+ ib_umem_release(odp->umem);
+ if (imr->live)
+ mlx5_ib_update_xlt(imr, idx, 1, 0,
+ MLX5_IB_UPD_XLT_INDIRECT |
+ MLX5_IB_UPD_XLT_ATOMIC);
+ mlx5_mr_cache_free(mr->dev, mr);
+
+ if (atomic_dec_and_test(&imr->num_leaf_free))
+ wake_up(&imr->q_leaf_free);
+}
+
void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
unsigned long end)
{
@@ -111,6 +246,13 @@ void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
*/
ib_umem_odp_unmap_dma_pages(umem, start, end);
+
+ if (unlikely(!umem->npages && mr->parent &&
+ !umem->odp_data->dying)) {
+ WRITE_ONCE(umem->odp_data->dying, 1);
+ atomic_inc(&mr->parent->num_leaf_free);
+ schedule_work(&umem->odp_data->work);
+ }
}
void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
@@ -147,6 +289,11 @@ void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
+ if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
+ MLX5_CAP_GEN(dev->mdev, null_mkey) &&
+ MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
+ caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
+
return;
}
@@ -184,6 +331,197 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
wq_num);
}
+static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
+ struct ib_umem *umem,
+ bool ksm, int access_flags)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ struct mlx5_ib_mr *mr;
+ int err;
+
+ mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY :
+ MLX5_IMR_MTT_CACHE_ENTRY);
+
+ if (IS_ERR(mr))
+ return mr;
+
+ mr->ibmr.pd = pd;
+
+ mr->dev = dev;
+ mr->access_flags = access_flags;
+ mr->mmkey.iova = 0;
+ mr->umem = umem;
+
+ if (ksm) {
+ err = mlx5_ib_update_xlt(mr, 0,
+ mlx5_imr_ksm_entries,
+ MLX5_KSM_PAGE_SHIFT,
+ MLX5_IB_UPD_XLT_INDIRECT |
+ MLX5_IB_UPD_XLT_ZAP |
+ MLX5_IB_UPD_XLT_ENABLE);
+
+ } else {
+ err = mlx5_ib_update_xlt(mr, 0,
+ MLX5_IMR_MTT_ENTRIES,
+ PAGE_SHIFT,
+ MLX5_IB_UPD_XLT_ZAP |
+ MLX5_IB_UPD_XLT_ENABLE |
+ MLX5_IB_UPD_XLT_ATOMIC);
+ }
+
+ if (err)
+ goto fail;
+
+ mr->ibmr.lkey = mr->mmkey.key;
+ mr->ibmr.rkey = mr->mmkey.key;
+
+ mr->live = 1;
+
+ mlx5_ib_dbg(dev, "key %x dev %p mr %p\n",
+ mr->mmkey.key, dev->mdev, mr);
+
+ return mr;
+
+fail:
+ mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
+ mlx5_mr_cache_free(dev, mr);
+
+ return ERR_PTR(err);
+}
+
+static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr,
+ u64 io_virt, size_t bcnt)
+{
+ struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context;
+ struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device);
+ struct ib_umem_odp *odp, *result = NULL;
+ u64 addr = io_virt & MLX5_IMR_MTT_MASK;
+ int nentries = 0, start_idx = 0, ret;
+ struct mlx5_ib_mr *mtt;
+ struct ib_umem *umem;
+
+ mutex_lock(&mr->umem->odp_data->umem_mutex);
+ odp = odp_lookup(ctx, addr, 1, mr);
+
+ mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n",
+ io_virt, bcnt, addr, odp);
+
+next_mr:
+ if (likely(odp)) {
+ if (nentries)
+ nentries++;
+ } else {
+ umem = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE);
+ if (IS_ERR(umem)) {
+ mutex_unlock(&mr->umem->odp_data->umem_mutex);
+ return ERR_CAST(umem);
+ }
+
+ mtt = implicit_mr_alloc(mr->ibmr.pd, umem, 0, mr->access_flags);
+ if (IS_ERR(mtt)) {
+ mutex_unlock(&mr->umem->odp_data->umem_mutex);
+ ib_umem_release(umem);
+ return ERR_CAST(mtt);
+ }
+
+ odp = umem->odp_data;
+ odp->private = mtt;
+ mtt->umem = umem;
+ mtt->mmkey.iova = addr;
+ mtt->parent = mr;
+ INIT_WORK(&odp->work, mr_leaf_free_action);
+
+ if (!nentries)
+ start_idx = addr >> MLX5_IMR_MTT_SHIFT;
+ nentries++;
+ }
+
+ odp->dying = 0;
+
+ /* Return first odp if region not covered by single one */
+ if (likely(!result))
+ result = odp;
+
+ addr += MLX5_IMR_MTT_SIZE;
+ if (unlikely(addr < io_virt + bcnt)) {
+ odp = odp_next(odp);
+ if (odp && odp->umem->address != addr)
+ odp = NULL;
+ goto next_mr;
+ }
+
+ if (unlikely(nentries)) {
+ ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0,
+ MLX5_IB_UPD_XLT_INDIRECT |
+ MLX5_IB_UPD_XLT_ATOMIC);
+ if (ret) {
+ mlx5_ib_err(dev, "Failed to update PAS\n");
+ result = ERR_PTR(ret);
+ }
+ }
+
+ mutex_unlock(&mr->umem->odp_data->umem_mutex);
+ return result;
+}
+
+struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
+ int access_flags)
+{
+ struct ib_ucontext *ctx = pd->ibpd.uobject->context;
+ struct mlx5_ib_mr *imr;
+ struct ib_umem *umem;
+
+ umem = ib_umem_get(ctx, 0, 0, IB_ACCESS_ON_DEMAND, 0);
+ if (IS_ERR(umem))
+ return ERR_CAST(umem);
+
+ imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags);
+ if (IS_ERR(imr)) {
+ ib_umem_release(umem);
+ return ERR_CAST(imr);
+ }
+
+ imr->umem = umem;
+ init_waitqueue_head(&imr->q_leaf_free);
+ atomic_set(&imr->num_leaf_free, 0);
+
+ return imr;
+}
+
+static int mr_leaf_free(struct ib_umem *umem, u64 start,
+ u64 end, void *cookie)
+{
+ struct mlx5_ib_mr *mr = umem->odp_data->private, *imr = cookie;
+
+ if (mr->parent != imr)
+ return 0;
+
+ ib_umem_odp_unmap_dma_pages(umem,
+ ib_umem_start(umem),
+ ib_umem_end(umem));
+
+ if (umem->odp_data->dying)
+ return 0;
+
+ WRITE_ONCE(umem->odp_data->dying, 1);
+ atomic_inc(&imr->num_leaf_free);
+ schedule_work(&umem->odp_data->work);
+
+ return 0;
+}
+
+void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
+{
+ struct ib_ucontext *ctx = imr->ibmr.pd->uobject->context;
+
+ down_read(&ctx->umem_rwsem);
+ rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX,
+ mr_leaf_free, imr);
+ up_read(&ctx->umem_rwsem);
+
+ wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
+}
+
/*
* Handle a single data segment in a page-fault WQE or RDMA region.
*
@@ -195,47 +533,43 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
* -EFAULT when there's an error mapping the requested pages. The caller will
* abort the page fault handling.
*/
-static int pagefault_single_data_segment(struct mlx5_ib_dev *mib_dev,
+static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
u32 key, u64 io_virt, size_t bcnt,
u32 *bytes_committed,
u32 *bytes_mapped)
{
int srcu_key;
- unsigned int current_seq;
+ unsigned int current_seq = 0;
u64 start_idx;
int npages = 0, ret = 0;
struct mlx5_ib_mr *mr;
u64 access_mask = ODP_READ_ALLOWED_BIT;
+ struct ib_umem_odp *odp;
+ int implicit = 0;
+ size_t size;
- srcu_key = srcu_read_lock(&mib_dev->mr_srcu);
- mr = mlx5_ib_odp_find_mr_lkey(mib_dev, key);
+ srcu_key = srcu_read_lock(&dev->mr_srcu);
+ mr = mlx5_ib_odp_find_mr_lkey(dev, key);
/*
* If we didn't find the MR, it means the MR was closed while we were
* handling the ODP event. In this case we return -EFAULT so that the
* QP will be closed.
*/
if (!mr || !mr->ibmr.pd) {
- pr_err("Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n",
- key);
+ mlx5_ib_dbg(dev, "Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n",
+ key);
ret = -EFAULT;
goto srcu_unlock;
}
if (!mr->umem->odp_data) {
- pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
- key);
+ mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
+ key);
if (bytes_mapped)
*bytes_mapped +=
(bcnt - *bytes_committed);
goto srcu_unlock;
}
- current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq);
- /*
- * Ensure the sequence number is valid for some time before we call
- * gup.
- */
- smp_rmb();
-
/*
* Avoid branches - this code will perform correctly
* in all iterations (in iteration 2 and above,
@@ -244,63 +578,109 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *mib_dev,
io_virt += *bytes_committed;
bcnt -= *bytes_committed;
+ /* An ODP MR without a page_list is handled as an implicit MR. */
+ if (!mr->umem->odp_data->page_list) {
+ odp = implicit_mr_get_data(mr, io_virt, bcnt);
+
+ if (IS_ERR(odp)) {
+ ret = PTR_ERR(odp);
+ /*
+ * odp holds an ERR_PTR here. The -EAGAIN wait at
+ * srcu_unlock dereferences odp, so bypass it.
+ */
+ goto srcu_unlock_no_wait;
+ }
+ mr = odp->private;
+ implicit = 1;
+
+ } else {
+ odp = mr->umem->odp_data;
+ }
+
+ /* One pass per ODP umem; an implicit MR may need several leaves. */
+next_mr:
+ current_seq = READ_ONCE(odp->notifiers_seq);
+ /*
+ * Ensure the sequence number is valid for some time before we call
+ * gup.
+ */
+ smp_rmb();
+
+ size = min_t(size_t, bcnt, ib_umem_end(odp->umem) - io_virt);
start_idx = (io_virt - (mr->mmkey.iova & PAGE_MASK)) >> PAGE_SHIFT;
if (mr->umem->writable)
access_mask |= ODP_WRITE_ALLOWED_BIT;
- npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt,
- access_mask, current_seq);
- if (npages < 0) {
- ret = npages;
+
+ ret = ib_umem_odp_map_dma_pages(mr->umem, io_virt, size,
+ access_mask, current_seq);
+
+ if (ret < 0)
goto srcu_unlock;
- }
- if (npages > 0) {
- mutex_lock(&mr->umem->odp_data->umem_mutex);
+ if (ret > 0) {
+ int np = ret;
+
+ mutex_lock(&odp->umem_mutex);
if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
/*
* No need to check whether the MTTs really belong to
* this MR, since ib_umem_odp_map_dma_pages already
* checks this.
*/
- ret = mlx5_ib_update_xlt(mr, start_idx, npages,
+ ret = mlx5_ib_update_xlt(mr, start_idx, np,
PAGE_SHIFT,
MLX5_IB_UPD_XLT_ATOMIC);
} else {
ret = -EAGAIN;
}
- mutex_unlock(&mr->umem->odp_data->umem_mutex);
+ mutex_unlock(&odp->umem_mutex);
if (ret < 0) {
if (ret != -EAGAIN)
- pr_err("Failed to update mkey page tables\n");
+ mlx5_ib_err(dev, "Failed to update mkey page tables\n");
goto srcu_unlock;
}
if (bytes_mapped) {
- u32 new_mappings = npages * PAGE_SIZE -
+ u32 new_mappings = np * PAGE_SIZE -
(io_virt - round_down(io_virt, PAGE_SIZE));
- *bytes_mapped += min_t(u32, new_mappings, bcnt);
+ *bytes_mapped += min_t(u32, new_mappings, size);
}
+
+ npages += np;
+ }
+
+ /* Bytes left over continue in the leaf that starts right after. */
+ bcnt -= size;
+ if (unlikely(bcnt)) {
+ struct ib_umem_odp *next;
+
+ io_virt += size;
+ next = odp_next(odp);
+ if (unlikely(!next || next->umem->address != io_virt)) {
+ mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
+ io_virt, next);
+ ret = -EAGAIN;
+ goto srcu_unlock_no_wait;
+ }
+ odp = next;
+ mr = odp->private;
+ goto next_mr;
}
srcu_unlock:
if (ret == -EAGAIN) {
- if (!mr->umem->odp_data->dying) {
- struct ib_umem_odp *odp_data = mr->umem->odp_data;
+ if (implicit || !odp->dying) {
unsigned long timeout =
msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
if (!wait_for_completion_timeout(
- &odp_data->notifier_completion,
+ &odp->notifier_completion,
timeout)) {
- pr_warn("timeout waiting for mmu notifier completion\n");
+ mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d\n",
+ current_seq, odp->notifiers_seq);
}
} else {
/* The MR is being killed, kill the QP as well. */
ret = -EFAULT;
}
}
- srcu_read_unlock(&mib_dev->mr_srcu, srcu_key);
+
+srcu_unlock_no_wait:
+ srcu_read_unlock(&dev->mr_srcu, srcu_key);
*bytes_committed = 0;
return ret ? ret : npages;
}
@@ -618,8 +998,8 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
goto resolve_page_fault;
} else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
if (ret != -ENOENT)
- mlx5_ib_err(dev, "Error getting user pages for page fault. Error: %d\n",
- ret);
+ mlx5_ib_err(dev, "PAGE FAULT error: %d. QP 0x%x. type: 0x%x\n",
+ ret, pfault->wqe.wq_num, pfault->type);
goto resolve_page_fault;
}
@@ -627,7 +1007,7 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
resolve_page_fault:
mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
- pfault->token, resume_with_error,
+ pfault->wqe.wq_num, resume_with_error,
pfault->type);
free_page((unsigned long)buffer);
}
@@ -700,10 +1080,9 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
ret = pagefault_single_data_segment(dev, rkey, address,
prefetch_len,
&bytes_committed, NULL);
- if (ret < 0) {
+ if (ret < 0 && ret != -EAGAIN) {
mlx5_ib_warn(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
- ret, pfault->token, address,
- prefetch_len);
+ ret, pfault->token, address, prefetch_len);
}
}
}
@@ -728,19 +1107,61 @@ void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
}
}
-int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev)
+/*
+ * Configure the MR cache entries reserved for implicit ODP.
+ *
+ * Does nothing unless the device reports IB_ODP_SUPPORT_IMPLICIT. The entry
+ * is identified by (ent->order - 2): the MTT entry gets PAGE_SHIFT pages and
+ * MTT access mode, the KSM entry gets MLX5_KSM_PAGE_SHIFT pages and KSM
+ * access mode. Both get a translation-table size expressed in
+ * MLX5_IB_UMR_OCTOWORD units and a limit of 0. Any other entry is left
+ * untouched.
+ */
+void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
+{
+	if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
+		return;
+
+	if (ent->order - 2 == MLX5_IMR_MTT_CACHE_ENTRY) {
+		ent->page = PAGE_SHIFT;
+		ent->xlt = MLX5_IMR_MTT_ENTRIES *
+			   sizeof(struct mlx5_mtt) /
+			   MLX5_IB_UMR_OCTOWORD;
+		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
+		ent->limit = 0;
+	} else if (ent->order - 2 == MLX5_IMR_KSM_CACHE_ENTRY) {
+		ent->page = MLX5_KSM_PAGE_SHIFT;
+		ent->xlt = mlx5_imr_ksm_entries *
+			   sizeof(struct mlx5_klm) /
+			   MLX5_IB_UMR_OCTOWORD;
+		ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
+		ent->limit = 0;
+	}
+}
+
+/*
+ * Per-device ODP setup: initialise the MR SRCU struct and, when the device
+ * reports IB_ODP_SUPPORT_IMPLICIT, obtain dev->null_mkey through
+ * mlx5_cmd_null_mkey().
+ *
+ * Returns 0 on success or the error of the failing step. If the null mkey
+ * step fails, the SRCU struct is cleaned up again before returning, so a
+ * failed call leaves nothing for the caller to undo.
+ */
+int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
{
int ret;
- ret = init_srcu_struct(&ibdev->mr_srcu);
+ ret = init_srcu_struct(&dev->mr_srcu);
if (ret)
return ret;
+ if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
+ ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
+ if (ret) {
+ mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret);
+ /* Undo init_srcu_struct() so mr_srcu is not leaked. */
+ cleanup_srcu_struct(&dev->mr_srcu);
+ return ret;
+ }
+ }
+
return 0;
}
-void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)
+void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *dev)
+{
+ cleanup_srcu_struct(&dev->mr_srcu); /* counterpart of init_srcu_struct() in mlx5_ib_odp_init_one() */
+}
+
+int mlx5_ib_odp_init(void)
{
- cleanup_srcu_struct(&ibdev->mr_srcu);
+ mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
+ MLX5_IMR_MTT_BITS); /* read by mlx5_odp_init_mr_cache_entry() to size the KSM cache entry */
+
+ return 0; /* no failure path */
}
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index a78ee73..d9f928c 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1053,6 +1053,8 @@ enum {
enum {
MAX_UMR_CACHE_ENTRY = 20,
+ MLX5_IMR_MTT_CACHE_ENTRY, /* 21: entry given MTT access mode by mlx5_odp_init_mr_cache_entry() */
+ MLX5_IMR_KSM_CACHE_ENTRY, /* 22: entry given KSM access mode by mlx5_odp_init_mr_cache_entry() */
MAX_MR_CACHE_ENTRIES
};
--
2.10.2
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related [flat|nested] 8+ messages in thread