All of lore.kernel.org
 help / color / mirror / Atom feed
From: Christoph Hellwig <hch-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
To: Yishai Hadas <yishaih-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Cc: dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org,
	talal-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org
Subject: Re: [RFC contig pages support 1/2] IB: Supports contiguous memory operations
Date: Tue, 8 Dec 2015 07:18:52 -0800	[thread overview]
Message-ID: <20151208151852.GA6688@infradead.org> (raw)
In-Reply-To: <1449587707-24214-2-git-send-email-yishaih-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

There is absolutely nothing IB-specific here.  If you want to support
anonymous mmaps to allocate large contiguous pages, work with the MM
folks on providing that in a generic fashion.

[full quote alert for reference:]

On Tue, Dec 08, 2015 at 05:15:06PM +0200, Yishai Hadas wrote:
> New structure 'cmem' represents the contiguous allocated memory.
> It supports:
> Allocate, Free, 'Map to virtual address' operations, etc.
> 
> Signed-off-by: Yishai Hadas <yishaih-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> ---
>  drivers/infiniband/core/Makefile |   2 +-
>  drivers/infiniband/core/cmem.c   | 245 +++++++++++++++++++++++++++++++++++++++
>  include/rdma/ib_cmem.h           |  41 +++++++
>  3 files changed, 287 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/infiniband/core/cmem.c
>  create mode 100644 include/rdma/ib_cmem.h
> 
> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
> index d43a899..8549ea4 100644
> --- a/drivers/infiniband/core/Makefile
> +++ b/drivers/infiniband/core/Makefile
> @@ -11,7 +11,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=	ib_uverbs.o ib_ucm.o \
>  ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
>  				device.o fmr_pool.o cache.o netlink.o \
>  				roce_gid_mgmt.o
> -ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
> +ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o cmem.o
>  ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
>  
>  ib_mad-y :=			mad.o smi.o agent.o mad_rmpp.o
> diff --git a/drivers/infiniband/core/cmem.c b/drivers/infiniband/core/cmem.c
> new file mode 100644
> index 0000000..21d8573
> --- /dev/null
> +++ b/drivers/infiniband/core/cmem.c
> @@ -0,0 +1,245 @@
> +#include <linux/mm.h>
> +#include <linux/dma-mapping.h>
> +#include <linux/sched.h>
> +#include <linux/export.h>
> +#include <linux/dma-attrs.h>
> +#include <linux/slab.h>
> +#include <rdma/ib_cmem.h>
> +#include "uverbs.h"
> +
> +static void ib_cmem_release(struct kref *ref)
> +{
> +	struct ib_cmem *cmem;
> +	struct ib_cmem_block *cmem_block, *tmp;
> +	unsigned long ntotal_pages;
> +
> +	cmem = container_of(ref, struct ib_cmem, refcount);
> +
> +	list_for_each_entry_safe(cmem_block, tmp, &cmem->ib_cmem_block, list) {
> +		__free_pages(cmem_block->page, cmem->block_order);
> +		list_del(&cmem_block->list);
> +		kfree(cmem_block);
> +	}
> +	/* no locking is needed:
> +	  * ib_cmem_release is called from vm_close which is always called
> +	  * with mm->mmap_sem held for writing.
> +	  * The only exception is when the process is shutting down, but in
> +	  * that case the counter is no longer relevant.
> +	  */
> +	if (current->mm) {
> +		ntotal_pages = PAGE_ALIGN(cmem->length) >> PAGE_SHIFT;
> +		current->mm->pinned_vm -= ntotal_pages;
> +	}
> +	kfree(cmem);
> +}
> +
> +/**
> + * ib_cmem_release_contiguous_pages - release memory allocated by
> + *                                              ib_cmem_alloc_contiguous_pages.
> + * @cmem: cmem struct to release
> + */
> +void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem)
> +{
> +	kref_put(&cmem->refcount, ib_cmem_release);
> +}
> +EXPORT_SYMBOL(ib_cmem_release_contiguous_pages);
> +
> +static void cmem_vma_open(struct vm_area_struct *area)
> +{
> +	struct ib_cmem *ib_cmem;
> +
> +	ib_cmem = (struct ib_cmem *)(area->vm_private_data);
> +
> +	/* vm_open and vm_close are always called with mm->mmap_sem held for
> +	  * writing. The only exception is when the process is shutting down, at
> +	  * which point vm_close is called with no locks held, but since it is
> +	  * after the VMAs have been detached, it is impossible that vm_open will
> +	  * be called. Therefore, there is no need to synchronize the kref_get and
> +	  * kref_put calls.
> +	*/
> +	kref_get(&ib_cmem->refcount);
> +}
> +
> +static void cmem_vma_close(struct vm_area_struct *area)
> +{
> +	struct ib_cmem *cmem;
> +
> +	cmem = (struct ib_cmem *)(area->vm_private_data);
> +
> +	ib_cmem_release_contiguous_pages(cmem);
> +}
> +
> +static const struct vm_operations_struct cmem_contig_pages_vm_ops = {
> +	.open = cmem_vma_open,
> +	.close = cmem_vma_close
> +};
> +
> +/**
> + * ib_cmem_map_contiguous_pages_to_vma - map contiguous pages into VMA
> + * @ib_cmem: cmem structure returned by ib_cmem_alloc_contiguous_pages
> + * @vma: VMA to inject pages into.
> + */
> +int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
> +					struct vm_area_struct *vma)
> +{
> +	int ret;
> +	unsigned long page_entry;
> +	unsigned long ntotal_pages;
> +	unsigned long ncontig_pages;
> +	unsigned long total_size;
> +	struct page *page;
> +	unsigned long vma_entry_number = 0;
> +	struct ib_cmem_block *ib_cmem_block = NULL;
> +
> +	total_size = vma->vm_end - vma->vm_start;
> +	if (ib_cmem->length != total_size)
> +		return -EINVAL;
> +
> +	if (total_size != PAGE_ALIGN(total_size)) {
> +		WARN(1,
> +		     "ib_cmem_map: total size %lu not aligned to page size\n",
> +		     total_size);
> +		return -EINVAL;
> +	}
> +
> +	ntotal_pages = total_size >> PAGE_SHIFT;
> +	ncontig_pages = 1 << ib_cmem->block_order;
> +
> +	list_for_each_entry(ib_cmem_block, &ib_cmem->ib_cmem_block, list) {
> +		page = ib_cmem_block->page;
> +		for (page_entry = 0; page_entry < ncontig_pages; page_entry++) {
> +			/* We reached end of vma - going out from both loops */
> +			if (vma_entry_number >= ntotal_pages)
> +				goto end;
> +
> +			ret = vm_insert_page(vma, vma->vm_start +
> +				(vma_entry_number << PAGE_SHIFT), page);
> +			if (ret < 0)
> +				goto err_vm_insert;
> +
> +			vma_entry_number++;
> +			page++;
> +		}
> +	}
> +
> +end:
> +
> +	/* We expect to have enough pages   */
> +	if (vma_entry_number >= ntotal_pages) {
> +		vma->vm_ops =  &cmem_contig_pages_vm_ops;
> +		vma->vm_private_data = ib_cmem;
> +		return 0;
> +	}
> +	/* Not expected but if we reached here
> +	  * not enough contiguous pages were registered
> +	  */
> +	ret = -EINVAL;
> +
> +err_vm_insert:
> +
> +	zap_vma_ptes(vma, vma->vm_start, total_size);
> +	return ret;
> +}
> +EXPORT_SYMBOL(ib_cmem_map_contiguous_pages_to_vma);
> +
> +/**
> + * ib_cmem_alloc_contiguous_pages - allocate contiguous pages
> + * @context: userspace context to allocate memory for
> + * @total_size: total required size for that allocation.
> + * @page_size_order: order of one contiguous page.
> + * @numa_node: From which numa node to allocate memory
> + *             when numa_node < 0 use default numa_node.
> + */
> +struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
> +					       unsigned long total_size,
> +					       unsigned long page_size_order,
> +					       int numa_node)
> +{
> +	struct ib_cmem *cmem;
> +	unsigned long ntotal_pages;
> +	unsigned long ncontiguous_pages;
> +	unsigned long ncontiguous_groups;
> +	struct page *page;
> +	int i;
> +	int ncontiguous_pages_order;
> +	struct ib_cmem_block *ib_cmem_block;
> +	unsigned long locked;
> +	unsigned long lock_limit;
> +
> +	if (page_size_order < PAGE_SHIFT || page_size_order > 31)
> +		return ERR_PTR(-EINVAL);
> +
> +	cmem = kzalloc(sizeof(*cmem), GFP_KERNEL);
> +	if (!cmem)
> +		return ERR_PTR(-ENOMEM);
> +
> +	kref_init(&cmem->refcount);
> +	cmem->context   = context;
> +	INIT_LIST_HEAD(&cmem->ib_cmem_block);
> +
> +	/* Total size is expected to be already page aligned -
> +	  * verifying anyway.
> +	  */
> +	ntotal_pages = PAGE_ALIGN(total_size) >> PAGE_SHIFT;
> +	/* ib_cmem_alloc_contiguous_pages is called as part of mmap
> +	  * with mm->mmap_sem held for writing.
> +	  * No need to lock
> +	  */
> +	locked     = ntotal_pages + current->mm->pinned_vm;
> +	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +
> +	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK))
> +		goto err_alloc;
> +
> +	/* How many contiguous pages do we need in 1 block */
> +	ncontiguous_pages = (1 << page_size_order) >> PAGE_SHIFT;
> +	ncontiguous_pages_order = ilog2(ncontiguous_pages);
> +	ncontiguous_groups = (ntotal_pages >> ncontiguous_pages_order)  +
> +		(!!(ntotal_pages & (ncontiguous_pages - 1)));
> +
> +	/* Checking MAX_ORDER to prevent WARN via calling alloc_pages below */
> +	if (ncontiguous_pages_order >= MAX_ORDER)
> +		goto err_alloc;
> +	/* we set block_order before starting allocation to prevent
> +	  * a leak in a failure flow in ib_cmem_release.
> +	  * cmem->length has at that step value 0 from kzalloc as expected
> +	  */
> +	cmem->block_order = ncontiguous_pages_order;
> +	for (i = 0; i < ncontiguous_groups; i++) {
> +		/* Allocating the managed entry */
> +		ib_cmem_block = kmalloc(sizeof(*ib_cmem_block),
> +					GFP_KERNEL);
> +		if (!ib_cmem_block)
> +			goto err_alloc;
> +
> +		if (numa_node < 0)
> +			page =  alloc_pages(GFP_HIGHUSER | __GFP_ZERO |
> +					    __GFP_COMP | __GFP_NOWARN,
> +					    ncontiguous_pages_order);
> +		else
> +			page =  alloc_pages_node(numa_node,
> +						 GFP_HIGHUSER | __GFP_ZERO |
> +						 __GFP_COMP | __GFP_NOWARN,
> +						 ncontiguous_pages_order);
> +
> +		if (!page) {
> +			kfree(ib_cmem_block);
> +			/* We should deallocate previously succeeded allocations
> +			  * if any exist.
> +			  */
> +			goto err_alloc;
> +		}
> +
> +		ib_cmem_block->page = page;
> +		list_add_tail(&ib_cmem_block->list, &cmem->ib_cmem_block);
> +	}
> +
> +	cmem->length = total_size;
> +	current->mm->pinned_vm = locked;
> +	return cmem;
> +
> +err_alloc:
> +	ib_cmem_release_contiguous_pages(cmem);
> +	return ERR_PTR(-ENOMEM);
> +}
> +EXPORT_SYMBOL(ib_cmem_alloc_contiguous_pages);
> diff --git a/include/rdma/ib_cmem.h b/include/rdma/ib_cmem.h
> new file mode 100644
> index 0000000..5f26a49
> --- /dev/null
> +++ b/include/rdma/ib_cmem.h
> @@ -0,0 +1,41 @@
> +#ifndef IB_CMEM_H
> +#define IB_CMEM_H
> +
> +#include <rdma/ib_umem.h>
> +#include <rdma/ib_verbs.h>
> +
> +/* contiguous memory structure */
> +struct ib_cmem {
> +	struct ib_ucontext     *context;
> +	size_t			length;
> +	/* Link list of contiguous blocks being part of that cmem  */
> +	struct list_head ib_cmem_block;
> +
> +	/* Order of cmem block,  2^ block_order will equal number
> +	  * of physical pages per block
> +	  */
> +	unsigned long    block_order;
> +	/* Reference counter for that memory area
> +	  * When the value reaches 0, the pages are returned to the kernel.
> +	  */
> +	struct kref refcount;
> +};
> +
> +struct ib_cmem_block {
> +	struct list_head	list;
> +	/* page will point to the page struct of the head page
> +	  * in the current compound page.
> +	  * block order is saved once as part of ib_cmem.
> +	  */
> +	struct page            *page;
> +};
> +
> +int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
> +					struct vm_area_struct *vma);
> +struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
> +					       unsigned long total_size,
> +					       unsigned long page_size_order,
> +					       int numa_node);
> +void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem);
> +
> +#endif
> -- 
> 1.8.3.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
---end quoted text---
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

WARNING: multiple messages have this Message-ID (diff)
From: Christoph Hellwig <hch@infradead.org>
To: Yishai Hadas <yishaih@mellanox.com>
Cc: dledford@redhat.com, linux-rdma@vger.kernel.org,
	ogerlitz@mellanox.com, talal@mellanox.com, linux-mm@kvack.org
Subject: Re: [RFC contig pages support 1/2] IB: Supports contiguous memory operations
Date: Tue, 8 Dec 2015 07:18:52 -0800	[thread overview]
Message-ID: <20151208151852.GA6688@infradead.org> (raw)
In-Reply-To: <1449587707-24214-2-git-send-email-yishaih@mellanox.com>

There is absolutely nothing IB-specific here.  If you want to support
anonymous mmaps to allocate large contiguous pages, work with the MM
folks on providing that in a generic fashion.

[full quote alert for reference:]

On Tue, Dec 08, 2015 at 05:15:06PM +0200, Yishai Hadas wrote:
> New structure 'cmem' represents the contiguous allocated memory.
> It supports:
> Allocate, Free, 'Map to virtual address' operations, etc.
> 
> Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
> ---
>  drivers/infiniband/core/Makefile |   2 +-
>  drivers/infiniband/core/cmem.c   | 245 +++++++++++++++++++++++++++++++++++++++
>  include/rdma/ib_cmem.h           |  41 +++++++
>  3 files changed, 287 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/infiniband/core/cmem.c
>  create mode 100644 include/rdma/ib_cmem.h
> 
> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
> index d43a899..8549ea4 100644
> --- a/drivers/infiniband/core/Makefile
> +++ b/drivers/infiniband/core/Makefile
> @@ -11,7 +11,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=	ib_uverbs.o ib_ucm.o \
>  ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
>  				device.o fmr_pool.o cache.o netlink.o \
>  				roce_gid_mgmt.o
> -ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
> +ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o cmem.o
>  ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
>  
>  ib_mad-y :=			mad.o smi.o agent.o mad_rmpp.o
> diff --git a/drivers/infiniband/core/cmem.c b/drivers/infiniband/core/cmem.c
> new file mode 100644
> index 0000000..21d8573
> --- /dev/null
> +++ b/drivers/infiniband/core/cmem.c
> @@ -0,0 +1,245 @@
> +#include <linux/mm.h>
> +#include <linux/dma-mapping.h>
> +#include <linux/sched.h>
> +#include <linux/export.h>
> +#include <linux/dma-attrs.h>
> +#include <linux/slab.h>
> +#include <rdma/ib_cmem.h>
> +#include "uverbs.h"
> +
> +static void ib_cmem_release(struct kref *ref)
> +{
> +	struct ib_cmem *cmem;
> +	struct ib_cmem_block *cmem_block, *tmp;
> +	unsigned long ntotal_pages;
> +
> +	cmem = container_of(ref, struct ib_cmem, refcount);
> +
> +	list_for_each_entry_safe(cmem_block, tmp, &cmem->ib_cmem_block, list) {
> +		__free_pages(cmem_block->page, cmem->block_order);
> +		list_del(&cmem_block->list);
> +		kfree(cmem_block);
> +	}
> +	/* no locking is needed:
> +	  * ib_cmem_release is called from vm_close which is always called
> +	  * with mm->mmap_sem held for writing.
> +	  * The only exception is when the process is shutting down, but in
> +	  * that case the counter is no longer relevant.
> +	  */
> +	if (current->mm) {
> +		ntotal_pages = PAGE_ALIGN(cmem->length) >> PAGE_SHIFT;
> +		current->mm->pinned_vm -= ntotal_pages;
> +	}
> +	kfree(cmem);
> +}
> +
> +/**
> + * ib_cmem_release_contiguous_pages - release memory allocated by
> + *                                              ib_cmem_alloc_contiguous_pages.
> + * @cmem: cmem struct to release
> + */
> +void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem)
> +{
> +	kref_put(&cmem->refcount, ib_cmem_release);
> +}
> +EXPORT_SYMBOL(ib_cmem_release_contiguous_pages);
> +
> +static void cmem_vma_open(struct vm_area_struct *area)
> +{
> +	struct ib_cmem *ib_cmem;
> +
> +	ib_cmem = (struct ib_cmem *)(area->vm_private_data);
> +
> +	/* vm_open and vm_close are always called with mm->mmap_sem held for
> +	  * writing. The only exception is when the process is shutting down, at
> +	  * which point vm_close is called with no locks held, but since it is
> +	  * after the VMAs have been detached, it is impossible that vm_open will
> +	  * be called. Therefore, there is no need to synchronize the kref_get and
> +	  * kref_put calls.
> +	*/
> +	kref_get(&ib_cmem->refcount);
> +}
> +
> +static void cmem_vma_close(struct vm_area_struct *area)
> +{
> +	struct ib_cmem *cmem;
> +
> +	cmem = (struct ib_cmem *)(area->vm_private_data);
> +
> +	ib_cmem_release_contiguous_pages(cmem);
> +}
> +
> +static const struct vm_operations_struct cmem_contig_pages_vm_ops = {
> +	.open = cmem_vma_open,
> +	.close = cmem_vma_close
> +};
> +
> +/**
> + * ib_cmem_map_contiguous_pages_to_vma - map contiguous pages into VMA
> + * @ib_cmem: cmem structure returned by ib_cmem_alloc_contiguous_pages
> + * @vma: VMA to inject pages into.
> + */
> +int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
> +					struct vm_area_struct *vma)
> +{
> +	int ret;
> +	unsigned long page_entry;
> +	unsigned long ntotal_pages;
> +	unsigned long ncontig_pages;
> +	unsigned long total_size;
> +	struct page *page;
> +	unsigned long vma_entry_number = 0;
> +	struct ib_cmem_block *ib_cmem_block = NULL;
> +
> +	total_size = vma->vm_end - vma->vm_start;
> +	if (ib_cmem->length != total_size)
> +		return -EINVAL;
> +
> +	if (total_size != PAGE_ALIGN(total_size)) {
> +		WARN(1,
> +		     "ib_cmem_map: total size %lu not aligned to page size\n",
> +		     total_size);
> +		return -EINVAL;
> +	}
> +
> +	ntotal_pages = total_size >> PAGE_SHIFT;
> +	ncontig_pages = 1 << ib_cmem->block_order;
> +
> +	list_for_each_entry(ib_cmem_block, &ib_cmem->ib_cmem_block, list) {
> +		page = ib_cmem_block->page;
> +		for (page_entry = 0; page_entry < ncontig_pages; page_entry++) {
> +			/* We reached end of vma - going out from both loops */
> +			if (vma_entry_number >= ntotal_pages)
> +				goto end;
> +
> +			ret = vm_insert_page(vma, vma->vm_start +
> +				(vma_entry_number << PAGE_SHIFT), page);
> +			if (ret < 0)
> +				goto err_vm_insert;
> +
> +			vma_entry_number++;
> +			page++;
> +		}
> +	}
> +
> +end:
> +
> +	/* We expect to have enough pages   */
> +	if (vma_entry_number >= ntotal_pages) {
> +		vma->vm_ops =  &cmem_contig_pages_vm_ops;
> +		vma->vm_private_data = ib_cmem;
> +		return 0;
> +	}
> +	/* Not expected but if we reached here
> +	  * not enough contiguous pages were registered
> +	  */
> +	ret = -EINVAL;
> +
> +err_vm_insert:
> +
> +	zap_vma_ptes(vma, vma->vm_start, total_size);
> +	return ret;
> +}
> +EXPORT_SYMBOL(ib_cmem_map_contiguous_pages_to_vma);
> +
> +/**
> + * ib_cmem_alloc_contiguous_pages - allocate contiguous pages
> + * @context: userspace context to allocate memory for
> + * @total_size: total required size for that allocation.
> + * @page_size_order: order of one contiguous page.
> + * @numa_node: From which numa node to allocate memory
> + *             when numa_node < 0 use default numa_node.
> + */
> +struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
> +					       unsigned long total_size,
> +					       unsigned long page_size_order,
> +					       int numa_node)
> +{
> +	struct ib_cmem *cmem;
> +	unsigned long ntotal_pages;
> +	unsigned long ncontiguous_pages;
> +	unsigned long ncontiguous_groups;
> +	struct page *page;
> +	int i;
> +	int ncontiguous_pages_order;
> +	struct ib_cmem_block *ib_cmem_block;
> +	unsigned long locked;
> +	unsigned long lock_limit;
> +
> +	if (page_size_order < PAGE_SHIFT || page_size_order > 31)
> +		return ERR_PTR(-EINVAL);
> +
> +	cmem = kzalloc(sizeof(*cmem), GFP_KERNEL);
> +	if (!cmem)
> +		return ERR_PTR(-ENOMEM);
> +
> +	kref_init(&cmem->refcount);
> +	cmem->context   = context;
> +	INIT_LIST_HEAD(&cmem->ib_cmem_block);
> +
> +	/* Total size is expected to be already page aligned -
> +	  * verifying anyway.
> +	  */
> +	ntotal_pages = PAGE_ALIGN(total_size) >> PAGE_SHIFT;
> +	/* ib_cmem_alloc_contiguous_pages is called as part of mmap
> +	  * with mm->mmap_sem held for writing.
> +	  * No need to lock
> +	  */
> +	locked     = ntotal_pages + current->mm->pinned_vm;
> +	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +
> +	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK))
> +		goto err_alloc;
> +
> +	/* How many contiguous pages do we need in 1 block */
> +	ncontiguous_pages = (1 << page_size_order) >> PAGE_SHIFT;
> +	ncontiguous_pages_order = ilog2(ncontiguous_pages);
> +	ncontiguous_groups = (ntotal_pages >> ncontiguous_pages_order)  +
> +		(!!(ntotal_pages & (ncontiguous_pages - 1)));
> +
> +	/* Checking MAX_ORDER to prevent WARN via calling alloc_pages below */
> +	if (ncontiguous_pages_order >= MAX_ORDER)
> +		goto err_alloc;
> +	/* we set block_order before starting allocation to prevent
> +	  * a leak in a failure flow in ib_cmem_release.
> +	  * cmem->length has at that step value 0 from kzalloc as expected
> +	  */
> +	cmem->block_order = ncontiguous_pages_order;
> +	for (i = 0; i < ncontiguous_groups; i++) {
> +		/* Allocating the managed entry */
> +		ib_cmem_block = kmalloc(sizeof(*ib_cmem_block),
> +					GFP_KERNEL);
> +		if (!ib_cmem_block)
> +			goto err_alloc;
> +
> +		if (numa_node < 0)
> +			page =  alloc_pages(GFP_HIGHUSER | __GFP_ZERO |
> +					    __GFP_COMP | __GFP_NOWARN,
> +					    ncontiguous_pages_order);
> +		else
> +			page =  alloc_pages_node(numa_node,
> +						 GFP_HIGHUSER | __GFP_ZERO |
> +						 __GFP_COMP | __GFP_NOWARN,
> +						 ncontiguous_pages_order);
> +
> +		if (!page) {
> +			kfree(ib_cmem_block);
> +			/* We should deallocate previously succeeded allocations
> +			  * if any exist.
> +			  */
> +			goto err_alloc;
> +		}
> +
> +		ib_cmem_block->page = page;
> +		list_add_tail(&ib_cmem_block->list, &cmem->ib_cmem_block);
> +	}
> +
> +	cmem->length = total_size;
> +	current->mm->pinned_vm = locked;
> +	return cmem;
> +
> +err_alloc:
> +	ib_cmem_release_contiguous_pages(cmem);
> +	return ERR_PTR(-ENOMEM);
> +}
> +EXPORT_SYMBOL(ib_cmem_alloc_contiguous_pages);
> diff --git a/include/rdma/ib_cmem.h b/include/rdma/ib_cmem.h
> new file mode 100644
> index 0000000..5f26a49
> --- /dev/null
> +++ b/include/rdma/ib_cmem.h
> @@ -0,0 +1,41 @@
> +#ifndef IB_CMEM_H
> +#define IB_CMEM_H
> +
> +#include <rdma/ib_umem.h>
> +#include <rdma/ib_verbs.h>
> +
> +/* contiguous memory structure */
> +struct ib_cmem {
> +	struct ib_ucontext     *context;
> +	size_t			length;
> +	/* Link list of contiguous blocks being part of that cmem  */
> +	struct list_head ib_cmem_block;
> +
> +	/* Order of cmem block,  2^ block_order will equal number
> +	  * of physical pages per block
> +	  */
> +	unsigned long    block_order;
> +	/* Reference counter for that memory area
> +	  * When the value reaches 0, the pages are returned to the kernel.
> +	  */
> +	struct kref refcount;
> +};
> +
> +struct ib_cmem_block {
> +	struct list_head	list;
> +	/* page will point to the page struct of the head page
> +	  * in the current compound page.
> +	  * block order is saved once as part of ib_cmem.
> +	  */
> +	struct page            *page;
> +};
> +
> +int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
> +					struct vm_area_struct *vma);
> +struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
> +					       unsigned long total_size,
> +					       unsigned long page_size_order,
> +					       int numa_node);
> +void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem);
> +
> +#endif
> -- 
> 1.8.3.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
---end quoted text---

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2015-12-08 15:18 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-12-08 15:15 [RFC contig pages support 0/2] Add contiguous pages support Yishai Hadas
     [not found] ` <1449587707-24214-1-git-send-email-yishaih-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
2015-12-08 15:15   ` [RFC contig pages support 1/2] IB: Supports contiguous memory operations Yishai Hadas
     [not found]     ` <1449587707-24214-2-git-send-email-yishaih-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
2015-12-08 15:18       ` Christoph Hellwig [this message]
2015-12-08 15:18         ` Christoph Hellwig
     [not found]         ` <20151208151852.GA6688-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
2015-12-08 17:15           ` Jason Gunthorpe
2015-12-08 17:15             ` Jason Gunthorpe
2015-12-09 10:00             ` Shachar Raindel
     [not found]               ` <AM4PR05MB146005B448BEA876519335CDDCE80-n5Jp0YuYvM1n/kQvjoF5G9qRiQSDpxhJvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2015-12-09 17:48                 ` Jason Gunthorpe
2015-12-09 17:48                   ` Jason Gunthorpe
2015-12-09 18:39                 ` Christoph Hellwig
2015-12-09 18:39                   ` Christoph Hellwig
2015-12-13 12:48                   ` Shachar Raindel
     [not found]                     ` <AM4PR05MB14603FC8169D50AD2A8F5AA3DCEC0-n5Jp0YuYvM1n/kQvjoF5G9qRiQSDpxhJvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2015-12-22 14:59                       ` Vlastimil Babka
2015-12-22 14:59                         ` Vlastimil Babka
     [not found]                         ` <56796538.9040906-AlSwsSmVLrQ@public.gmane.org>
2015-12-23 16:30                           ` Shachar Raindel
2015-12-23 16:30                             ` Shachar Raindel
     [not found]                             ` <AM4PR05MB14603CF21CB493086BDEE026DCE60-n5Jp0YuYvM1n/kQvjoF5G9qRiQSDpxhJvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2016-01-04 14:43                               ` Vlastimil Babka
2016-01-04 14:43                                 ` Vlastimil Babka
2016-01-04 14:44                             ` Vlastimil Babka
2015-12-08 15:15   ` [RFC contig pages support 2/2] IB/mlx5: Exporting to user space the contiguous allocation capability Yishai Hadas

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20151208151852.GA6688@infradead.org \
    --to=hch-wegcikhe2lqwvfeawa7xhq@public.gmane.org \
    --cc=dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org \
    --cc=linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org \
    --cc=linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    --cc=ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org \
    --cc=talal-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org \
    --cc=yishaih-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.