From: Tom Tucker <tom-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
To: Ralph Campbell <ralph.campbell-h88ZbnxC6KDQT0dZR+AlfA@public.gmane.org>
Cc: Tom Tucker <tom-/Yg/VP3ZvrM@public.gmane.org>,
"rdreier-FYB4Gu1CFyUAvxtiuMwx3w@public.gmane.org"
<rdreier-FYB4Gu1CFyUAvxtiuMwx3w@public.gmane.org>,
"linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org"
<linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org>,
"brandt-4OHPYypu0djtX7QSmKvirg@public.gmane.org"
<brandt-4OHPYypu0djtX7QSmKvirg@public.gmane.org>,
"swise-/Yg/VP3ZvrM@public.gmane.org"
<swise-/Yg/VP3ZvrM@public.gmane.org>
Subject: Re: [RFC PATCH 2/4] uverbs: Add common ib_iomem_get service
Date: Thu, 29 Jul 2010 14:07:11 -0500 [thread overview]
Message-ID: <4C51D15F.8060708@opengridcomputing.com> (raw)
In-Reply-To: <1280427723.6820.56.camel-/vjeY7uYZjrPXfVEPVhPGq6RkeBMCJyt@public.gmane.org>
On 7/29/10 1:22 PM, Ralph Campbell wrote:
> On Thu, 2010-07-29 at 09:25 -0700, Tom Tucker wrote:
>
>> From: Tom Tucker<tom-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
>>
>> Add an ib_iomem_get service that converts a vma to an array of
>> physical addresses. This makes it easier for each device driver to
>> add support for the reg_io_mr provider method.
>>
>> Signed-off-by: Tom Tucker<tom-/Yg/VP3ZvrM@public.gmane.org>
>> ---
>>
>> drivers/infiniband/core/umem.c | 248 ++++++++++++++++++++++++++++++++++++++--
>> include/rdma/ib_umem.h | 14 ++
>> 2 files changed, 251 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
>> index 415e186..f103956 100644
>> --- a/drivers/infiniband/core/umem.c
>> +++ b/drivers/infiniband/core/umem.c
>>
> ...
>
>> @@ -292,3 +295,226 @@ int ib_umem_page_count(struct ib_umem *umem)
>> return n;
>> }
>> EXPORT_SYMBOL(ib_umem_page_count);
>> +/*
>> + * Return the PFN for the specified address in the vma. This only
>> + * works for a vma that is VM_PFNMAP.
>> + */
>> +static unsigned long follow_io_pfn(struct vm_area_struct *vma,
>> + unsigned long address, int write)
>> +{
>> + pgd_t *pgd;
>> + pud_t *pud;
>> + pmd_t *pmd;
>> + pte_t *ptep, pte;
>> + spinlock_t *ptl;
>> + unsigned long pfn;
>> + struct mm_struct *mm = vma->vm_mm;
>> +
>> + BUG_ON(0 == (vma->vm_flags& VM_PFNMAP));
>>
> Why use BUG_ON?
> WARN_ON is more appropriate but
> if (!(vma->vm_flags& VM_PFNMAP))
> return 0;
> seems better.
> In fact, move it outside the inner do loop in ib_get_io_pfn().
>
>
It's paranoia from the debug phase. It's already in the 'outer loop'. I
should just delete it I think.
>> + pgd = pgd_offset(mm, address);
>> + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
>> + return 0;
>> +
>> + pud = pud_offset(pgd, address);
>> + if (pud_none(*pud))
>> + return 0;
>> + if (unlikely(pud_bad(*pud)))
>> + return 0;
>> +
>> + pmd = pmd_offset(pud, address);
>> + if (pmd_none(*pmd))
>> + return 0;
>> + if (unlikely(pmd_bad(*pmd)))
>> + return 0;
>> +
>> + ptep = pte_offset_map_lock(mm, pmd, address,&ptl);
>> + pte = *ptep;
>> + if (!pte_present(pte))
>> + goto bad;
>> + if (write&& !pte_write(pte))
>> + goto bad;
>> +
>> + pfn = pte_pfn(pte);
>> + pte_unmap_unlock(ptep, ptl);
>> + return pfn;
>> + bad:
>> + pte_unmap_unlock(ptep, ptl);
>> + return 0;
>> +}
>> +
>> +int ib_get_io_pfn(struct task_struct *tsk, struct mm_struct *mm,
>> + unsigned long start, int len, int write, int force,
>> + unsigned long *pfn_list, struct vm_area_struct **vmas)
>> +{
>> + unsigned long pfn;
>> + int i;
>> + if (len<= 0)
>> + return 0;
>> +
>> + i = 0;
>> + do {
>> + struct vm_area_struct *vma;
>> +
>> + vma = find_vma(mm, start);
>> + if (0 == (vma->vm_flags& VM_PFNMAP))
>> + return -EINVAL;
>>
> Style nit: I would use ! instead of 0 ==
>
>
ok.
>> + if (0 == (vma->vm_flags& VM_IO))
>> + return -EFAULT;
>> +
>> + if (is_vm_hugetlb_page(vma))
>> + return -EFAULT;
>> +
>> + do {
>> + cond_resched();
>> + pfn = follow_io_pfn(vma, start, write);
>> + if (!pfn)
>> + return -EFAULT;
>> + if (pfn_list)
>> + pfn_list[i] = pfn;
>> + if (vmas)
>> + vmas[i] = vma;
>> + i++;
>> + start += PAGE_SIZE;
>> + len--;
>> + } while (len&& start< vma->vm_end);
>> + } while (len);
>> + return i;
>> +}
>> +
>> +/**
>> + * ib_iomem_get - DMA map a userspace map of IO memory.
>> + * @context: userspace context to map memory for
>> + * @addr: userspace virtual address to start at
>> + * @size: length of region to map
>> + * @access: IB_ACCESS_xxx flags for memory being mapped
>> + * @dmasync: flush in-flight DMA when the memory region is written
>> + */
>> +struct ib_umem *ib_iomem_get(struct ib_ucontext *context, unsigned long addr,
>> + size_t size, int access, int dmasync)
>> +{
>> + struct ib_umem *umem;
>> + unsigned long *pfn_list;
>> + struct ib_umem_chunk *chunk;
>> + unsigned long locked;
>> + unsigned long lock_limit;
>> + unsigned long cur_base;
>> + unsigned long npages;
>> + int ret;
>> + int off;
>> + int i;
>> + DEFINE_DMA_ATTRS(attrs);
>> +
>> + if (dmasync)
>> + dma_set_attr(DMA_ATTR_WRITE_BARRIER,&attrs);
>> +
>> + if (!can_do_mlock())
>> + return ERR_PTR(-EPERM);
>> +
>> + umem = kmalloc(sizeof *umem, GFP_KERNEL);
>> + if (!umem)
>> + return ERR_PTR(-ENOMEM);
>> +
>> + umem->type = IB_UMEM_IO_MAP;
>> + umem->context = context;
>> + umem->length = size;
>> + umem->offset = addr& ~PAGE_MASK;
>> + umem->page_size = PAGE_SIZE;
>> + /*
>> + * We ask for writable memory if any access flags other than
>> + * "remote read" are set. "Local write" and "remote write"
>> + * obviously require write access. "Remote atomic" can do
>> + * things like fetch and add, which will modify memory, and
>> + * "MW bind" can change permissions by binding a window.
>> + */
>> + umem->writable = !!(access& ~IB_ACCESS_REMOTE_READ);
>> +
>> + /* IO memory is not hugetlb memory */
>> + umem->hugetlb = 0;
>> +
>> + INIT_LIST_HEAD(&umem->chunk_list);
>> +
>> + pfn_list = (unsigned long *) __get_free_page(GFP_KERNEL);
>> + if (!pfn_list) {
>> + kfree(umem);
>> + return ERR_PTR(-ENOMEM);
>> + }
>> +
>> + npages = PAGE_ALIGN(size + umem->offset)>> PAGE_SHIFT;
>> +
>> + down_write(¤t->mm->mmap_sem);
>> +
>> + locked = npages + current->mm->locked_vm;
>> + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur
>> + >> PAGE_SHIFT;
>>
> I think the current kernel code is supposed to look like:
> lock_limit = rlimit(RLIMIT_MEMLOCK)>> PAGE_SHIFT;
>
>
agreed.
>> + if ((locked> lock_limit)&& !capable(CAP_IPC_LOCK)) {
>> + ret = -ENOMEM;
>> + goto out;
>> + }
>> +
>> + cur_base = addr& PAGE_MASK;
>> +
>> + ret = 0;
>> + while (npages) {
>> + ret = ib_get_io_pfn(current, current->mm, cur_base,
>> + min_t(unsigned long, npages,
>> + PAGE_SIZE / sizeof(unsigned long *)),
>> + umem->writable,
>> + !umem->writable, pfn_list, NULL);
>> +
>> + if (ret< 0)
>> + goto out;
>> +
>> + cur_base += ret * PAGE_SIZE;
>> + npages -= ret;
>> +
>> + off = 0;
>> +
>> + while (ret) {
>> + chunk = kmalloc(sizeof *chunk +
>> + sizeof(struct scatterlist) *
>> + min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK),
>> + GFP_KERNEL);
>> + if (!chunk) {
>> + ret = -ENOMEM;
>> + goto out;
>> + }
>> + chunk->nents = min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK);
>> + sg_init_table(chunk->page_list, chunk->nents);
>> + /* The pfn_list we built is a set of Page
>> + * Frame Numbers (PFN) whose physical address
>> + * is PFN<< PAGE_SHIFT. The SG DMA mapping
>> + * services expect page addresses, not PFN,
>> + * therefore, we have to do the dma mapping
>> + * ourselves here. */
>> + for (i = 0; i< chunk->nents; ++i) {
>> + sg_set_page(&chunk->page_list[i], 0,
>> + PAGE_SIZE, 0);
>> + chunk->page_list[i].dma_address =
>> + (pfn_list[i]<< PAGE_SHIFT);
>> + chunk->page_list[i].dma_length = PAGE_SIZE;
>>
> dma_length isn't present if CONFIG_NEED_SG_DMA_LENGTH is undefined.
>
Right.
> In fact, the kmalloc() should probably be kzalloc since the other
> struct scatterlist entries are uninitialized.
>
>
I think chunk->nents handles this.
>> + }
>> + chunk->nmap = chunk->nents;
>> + ret -= chunk->nents;
>> + off += chunk->nents;
>> + list_add_tail(&chunk->list,&umem->chunk_list);
>> + }
>> +
>> + ret = 0;
>>
> This is redundant since ret == 0 at the end of the loop.
>
>
ok.
>> + }
>> +
>> +out:
>> + if (ret< 0) {
>> + __ib_umem_release(context->device, umem, 0);
>> + kfree(umem);
>> + } else
>> + current->mm->locked_vm = locked;
>> + up_write(¤t->mm->mmap_sem);
>> + free_page((unsigned long) pfn_list);
>> +
>> + return ret< 0 ? ERR_PTR(ret) : umem;
>> +}
>> +EXPORT_SYMBOL(ib_iomem_get);
>> +
>> diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
>> index 9ee0d2e..2c64d82 100644
>> --- a/include/rdma/ib_umem.h
>> +++ b/include/rdma/ib_umem.h
>> @@ -39,8 +39,14 @@
>>
>> struct ib_ucontext;
>>
>> +enum ib_umem_type {
>> + IB_UMEM_MEM_MAP = 0,
>> + IB_UMEM_IO_MAP = 1
>> +};
>> +
>> struct ib_umem {
>> struct ib_ucontext *context;
>> + enum ib_umem_type type;
>> size_t length;
>> int offset;
>> int page_size;
>> @@ -61,6 +67,8 @@ struct ib_umem_chunk {
>>
>> #ifdef CONFIG_INFINIBAND_USER_MEM
>>
>> +struct ib_umem *ib_iomem_get(struct ib_ucontext *context, unsigned long addr,
>> + size_t size, int access, int dmasync);
>> struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
>> size_t size, int access, int dmasync);
>> void ib_umem_release(struct ib_umem *umem);
>> @@ -70,6 +78,12 @@ int ib_umem_page_count(struct ib_umem *umem);
>>
>> #include<linux/err.h>
>>
>> +static struct ib_umem *ib_iomem_get(struct ib_ucontext *context,
>> + unsigned long addr, size_t size,
>> + int access, int dmasync) {
>> + return ERR_PTR(-EINVAL);
>> +}
>> +
>> static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context,
>> unsigned long addr, size_t size,
>> int access, int dmasync) {
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
>> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>
>>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
next prev parent reply other threads:[~2010-07-29 19:07 UTC|newest]
Thread overview: 17+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-07-29 16:24 [RFC PATCH 0/4] ibverbs: new verbs for registering I/O memory Tom Tucker
[not found] ` <20100729162339.14674.15788.stgit-T4OLL4TyM9aNDNWfRnPdfg@public.gmane.org>
2010-07-29 16:25 ` [RFC PATCH 1/4] ibverbs: Add new provider verb for I/O memory registration Tom Tucker
2010-07-29 16:25 ` [RFC PATCH 2/4] uverbs: Add common ib_iomem_get service Tom Tucker
[not found] ` <20100729162509.14674.34237.stgit-T4OLL4TyM9aNDNWfRnPdfg@public.gmane.org>
2010-07-29 18:22 ` Ralph Campbell
[not found] ` <1280427723.6820.56.camel-/vjeY7uYZjrPXfVEPVhPGq6RkeBMCJyt@public.gmane.org>
2010-07-29 19:07 ` Tom Tucker [this message]
[not found] ` <4C51D15F.8060708-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
2010-07-29 19:33 ` Ralph Campbell
2010-07-29 20:13 ` Jason Gunthorpe
[not found] ` <20100729201317.GA7920-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2010-07-29 20:29 ` Tom Tucker
[not found] ` <4C51E4B1.6040800-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
2010-07-29 20:41 ` Jason Gunthorpe
[not found] ` <20100729204150.GC7920-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2010-07-29 20:54 ` Tom Tucker
2010-07-29 22:49 ` [Suspected SPAM] " Ralph Campbell
[not found] ` <1280443789.6820.153.camel-/vjeY7uYZjrPXfVEPVhPGq6RkeBMCJyt@public.gmane.org>
2010-07-29 22:57 ` Jason Gunthorpe
[not found] ` <20100729225705.GC26390-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2010-07-29 23:13 ` Ralph Campbell
2010-07-30 2:16 ` Tom Tucker
2010-07-29 21:20 ` Tom Tucker
2010-07-29 16:25 ` [RFC PATCH 3/4] uverbs_cmd: Add uverbs command definitions for reg_io_mr Tom Tucker
2010-07-29 16:25 ` [RFC PATCH 4/4] mthca: Add support for reg_io_mr and unreg_io_mr Tom Tucker
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4C51D15F.8060708@opengridcomputing.com \
--to=tom-7bpotxp6k4+p2yhjcf5u+vpxobypeauw@public.gmane.org \
--cc=brandt-4OHPYypu0djtX7QSmKvirg@public.gmane.org \
--cc=linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
--cc=ralph.campbell-h88ZbnxC6KDQT0dZR+AlfA@public.gmane.org \
--cc=rdreier-FYB4Gu1CFyUAvxtiuMwx3w@public.gmane.org \
--cc=swise-/Yg/VP3ZvrM@public.gmane.org \
--cc=tom-/Yg/VP3ZvrM@public.gmane.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.