* [PATCH mm,net-next 1/3] mm: Refactor insert_page to prepare for batched-lock insert.
@ 2020-01-11 0:28 Arjun Roy
2020-01-11 0:28 ` [PATCH mm,net-next 2/3] mm: Add vm_insert_pages() Arjun Roy
2020-01-11 0:28 ` [PATCH mm,net-next 3/3] net-zerocopy: Use vm_insert_pages() for tcp rcv zerocopy Arjun Roy
0 siblings, 2 replies; 3+ messages in thread
From: Arjun Roy @ 2020-01-11 0:28 UTC (permalink / raw)
To: davem, netdev, akpm, linux-mm
Cc: arjunroy, Arjun Roy, Eric Dumazet, Soheil Hassas Yeganeh
Add helper methods for vm_insert_page()/insert_page() to prepare for
vm_insert_pages(), which batch-inserts pages to reduce spinlock
operations when inserting multiple consecutive pages into the user
page table.
The intention of this patch-set is to reduce atomic ops for
tcp zerocopy receives, which normally hits the same spinlock multiple
times consecutively.
Signed-off-by: Arjun Roy <arjunroy.kdev@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
---
mm/memory.c | 41 ++++++++++++++++++++++++-----------------
1 file changed, 24 insertions(+), 17 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 7c6e19bdc428..7e23a9275386 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1693,6 +1693,27 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
return pte_alloc_map_lock(mm, pmd, addr, ptl);
}
+static int validate_page_before_insert(struct page *page)
+{
+ if (PageAnon(page) || PageSlab(page))
+ return -EINVAL;
+ flush_dcache_page(page);
+ return 0;
+}
+
+static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
+ unsigned long addr, struct page *page, pgprot_t prot)
+{
+ if (!pte_none(*pte))
+ return -EBUSY;
+ /* Ok, finally just insert the thing.. */
+ get_page(page);
+ inc_mm_counter_fast(mm, mm_counter_file(page));
+ page_add_file_rmap(page, false);
+ set_pte_at(mm, addr, pte, mk_pte(page, prot));
+ return 0;
+}
+
/*
* This is the old fallback for page remapping.
*
@@ -1708,28 +1729,14 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
pte_t *pte;
spinlock_t *ptl;
- retval = -EINVAL;
- if (PageAnon(page) || PageSlab(page))
+ retval = validate_page_before_insert(page);
+ if (retval)
goto out;
retval = -ENOMEM;
- flush_dcache_page(page);
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
goto out;
- retval = -EBUSY;
- if (!pte_none(*pte))
- goto out_unlock;
-
- /* Ok, finally just insert the thing.. */
- get_page(page);
- inc_mm_counter_fast(mm, mm_counter_file(page));
- page_add_file_rmap(page, false);
- set_pte_at(mm, addr, pte, mk_pte(page, prot));
-
- retval = 0;
- pte_unmap_unlock(pte, ptl);
- return retval;
-out_unlock:
+ retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
pte_unmap_unlock(pte, ptl);
out:
return retval;
--
2.25.0.rc1.283.g88dfdc4193-goog
^ permalink raw reply related [flat|nested] 3+ messages in thread
* [PATCH mm,net-next 2/3] mm: Add vm_insert_pages().
2020-01-11 0:28 [PATCH mm,net-next 1/3] mm: Refactor insert_page to prepare for batched-lock insert Arjun Roy
@ 2020-01-11 0:28 ` Arjun Roy
2020-01-11 0:28 ` [PATCH mm,net-next 3/3] net-zerocopy: Use vm_insert_pages() for tcp rcv zerocopy Arjun Roy
1 sibling, 0 replies; 3+ messages in thread
From: Arjun Roy @ 2020-01-11 0:28 UTC (permalink / raw)
To: davem, netdev, akpm, linux-mm
Cc: arjunroy, Arjun Roy, Eric Dumazet, Soheil Hassas Yeganeh
Add the ability to insert multiple pages at once to a user VM with
lower PTE spinlock operations.
The intention of this patch-set is to reduce atomic ops for
tcp zerocopy receives, which normally hits the same spinlock multiple
times consecutively.
Signed-off-by: Arjun Roy <arjunroy.kdev@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
---
include/linux/mm.h | 2 +
mm/memory.c | 111 ++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 111 insertions(+), 2 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 29c928ba6b42..a3ac40fbe8fd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2563,6 +2563,8 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t);
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
+int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
+ struct page **pages, unsigned long *num);
int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn);
int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
diff --git a/mm/memory.c b/mm/memory.c
index 7e23a9275386..1655c5feaf32 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1670,8 +1670,7 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);
-pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
- spinlock_t **ptl)
+static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -1690,6 +1689,16 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
return NULL;
VM_BUG_ON(pmd_trans_huge(*pmd));
+ return pmd;
+}
+
+pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
+ spinlock_t **ptl)
+{
+ pmd_t *pmd = walk_to_pmd(mm, addr);
+
+ if (!pmd)
+ return NULL;
return pte_alloc_map_lock(mm, pmd, addr, ptl);
}
@@ -1714,6 +1723,15 @@ static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
return 0;
}
+static int insert_page_in_batch_locked(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, struct page *page, pgprot_t prot)
+{
+ const int err = validate_page_before_insert(page);
+
+ return err ? err : insert_page_into_pte_locked(
+ mm, pte_offset_map(pmd, addr), addr, page, prot);
+}
+
/*
* This is the old fallback for page remapping.
*
@@ -1742,6 +1760,95 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
return retval;
}
+/* insert_pages() amortizes the cost of spinlock operations
+ * when inserting pages in a loop.
+ */
+static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
+ struct page **pages, unsigned long *num, pgprot_t prot)
+{
+ pmd_t *pmd = NULL;
+ spinlock_t *pte_lock = NULL;
+ struct mm_struct *const mm = vma->vm_mm;
+ unsigned long curr_page_idx = 0;
+ unsigned long remaining_pages_total = *num;
+ unsigned long pages_to_write_in_pmd;
+ int ret;
+more:
+ ret = -EFAULT;
+ pmd = walk_to_pmd(mm, addr);
+ if (!pmd)
+ goto out;
+
+ pages_to_write_in_pmd = min_t(unsigned long,
+ remaining_pages_total, PTRS_PER_PTE - pte_index(addr));
+
+ /* Allocate the PTE if necessary; takes PMD lock once only. */
+ ret = -ENOMEM;
+ if (pte_alloc(mm, pmd, addr))
+ goto out;
+ pte_lock = pte_lockptr(mm, pmd);
+
+ while (pages_to_write_in_pmd) {
+ int pte_idx = 0;
+ const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
+
+ spin_lock(pte_lock);
+ for (; pte_idx < batch_size; ++pte_idx) {
+ int err = insert_page_in_batch_locked(mm, pmd,
+ addr, pages[curr_page_idx], prot);
+ if (unlikely(err)) {
+ spin_unlock(pte_lock);
+ ret = err;
+ remaining_pages_total -= pte_idx;
+ goto out;
+ }
+ addr += PAGE_SIZE;
+ ++curr_page_idx;
+ }
+ spin_unlock(pte_lock);
+ pages_to_write_in_pmd -= batch_size;
+ remaining_pages_total -= batch_size;
+ }
+ if (remaining_pages_total)
+ goto more;
+ ret = 0;
+out:
+ *num = remaining_pages_total;
+ return ret;
+}
+
+/**
+ * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
+ * @vma: user vma to map to
+ * @addr: target start user address of these pages
+ * @pages: source kernel pages
+ * @num: in: number of pages to map. out: number of pages that were *not*
+ * mapped. (0 means all pages were successfully mapped).
+ *
+ * Preferred over vm_insert_page() when inserting multiple pages.
+ *
+ * In case of error, we may have mapped a subset of the provided
+ * pages. It is the caller's responsibility to account for this case.
+ *
+ * The same restrictions apply as in vm_insert_page().
+ */
+int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
+ struct page **pages, unsigned long *num)
+{
+ const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1;
+
+ if (addr < vma->vm_start || end_addr >= vma->vm_end)
+ return -EFAULT;
+ if (!(vma->vm_flags & VM_MIXEDMAP)) {
+ BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
+ BUG_ON(vma->vm_flags & VM_PFNMAP);
+ vma->vm_flags |= VM_MIXEDMAP;
+ }
+ /* Defer page refcount checking till we're about to map that page. */
+ return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_insert_pages);
+
/**
* vm_insert_page - insert single page into user vma
* @vma: user vma to map to
--
2.25.0.rc1.283.g88dfdc4193-goog
^ permalink raw reply related [flat|nested] 3+ messages in thread
* [PATCH mm,net-next 3/3] net-zerocopy: Use vm_insert_pages() for tcp rcv zerocopy.
2020-01-11 0:28 [PATCH mm,net-next 1/3] mm: Refactor insert_page to prepare for batched-lock insert Arjun Roy
2020-01-11 0:28 ` [PATCH mm,net-next 2/3] mm: Add vm_insert_pages() Arjun Roy
@ 2020-01-11 0:28 ` Arjun Roy
1 sibling, 0 replies; 3+ messages in thread
From: Arjun Roy @ 2020-01-11 0:28 UTC (permalink / raw)
To: davem, netdev, akpm, linux-mm
Cc: arjunroy, Arjun Roy, Eric Dumazet, Soheil Hassas Yeganeh
Use vm_insert_pages() for tcp receive zerocopy. Spin lock cycles
(as reported by perf) drop from a couple of percentage points
to a fraction of a percent. This results in a roughly 6% increase in
efficiency, measured roughly as zerocopy receive count divided by CPU
utilization.
The intention of this patch-set is to reduce atomic ops for
tcp zerocopy receives, which normally hits the same spinlock multiple
times consecutively.
Signed-off-by: Arjun Roy <arjunroy.kdev@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
---
net/ipv4/tcp.c | 67 +++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 61 insertions(+), 6 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 34490d972758..52f96c3ceab3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1861,14 +1861,48 @@ int tcp_mmap(struct file *file, struct socket *sock,
}
EXPORT_SYMBOL(tcp_mmap);
+static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
+ struct page **pages,
+ unsigned long pages_to_map,
+ unsigned long *insert_addr,
+ u32 *length_with_pending,
+ u32 *seq,
+ struct tcp_zerocopy_receive *zc)
+{
+ unsigned long pages_remaining = pages_to_map;
+ int bytes_mapped;
+ int ret;
+
+ ret = vm_insert_pages(vma, *insert_addr, pages, &pages_remaining);
+ bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining);
+ /* Even if vm_insert_pages fails, it may have partially succeeded in
+ * mapping (some but not all of the pages).
+ */
+ *seq += bytes_mapped;
+ *insert_addr += bytes_mapped;
+ if (ret) {
+ /* But if vm_insert_pages did fail, we have to unroll some state
+ * we speculatively touched before.
+ */
+ const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
+ *length_with_pending -= bytes_not_mapped;
+ zc->recv_skip_hint += bytes_not_mapped;
+ }
+ return ret;
+}
+
static int tcp_zerocopy_receive(struct sock *sk,
struct tcp_zerocopy_receive *zc)
{
unsigned long address = (unsigned long)zc->address;
u32 length = 0, seq, offset, zap_len;
+ #define PAGE_BATCH_SIZE 8
+ struct page *pages[PAGE_BATCH_SIZE];
const skb_frag_t *frags = NULL;
struct vm_area_struct *vma;
struct sk_buff *skb = NULL;
+ unsigned long pg_idx = 0;
+ unsigned long curr_addr;
struct tcp_sock *tp;
int inq;
int ret;
@@ -1901,15 +1935,26 @@ static int tcp_zerocopy_receive(struct sock *sk,
zc->recv_skip_hint = zc->length;
}
ret = 0;
+ curr_addr = address;
while (length + PAGE_SIZE <= zc->length) {
if (zc->recv_skip_hint < PAGE_SIZE) {
+ /* If we're here, finish the current batch. */
+ if (pg_idx) {
+ ret = tcp_zerocopy_vm_insert_batch(vma, pages,
+ pg_idx,
+ &curr_addr,
+ &length,
+ &seq, zc);
+ if (ret)
+ goto out;
+ pg_idx = 0;
+ }
if (skb) {
skb = skb->next;
offset = seq - TCP_SKB_CB(skb)->seq;
} else {
skb = tcp_recv_skb(sk, seq, &offset);
}
-
zc->recv_skip_hint = skb->len - offset;
offset -= skb_headlen(skb);
if ((int)offset < 0 || skb_has_frag_list(skb))
@@ -1933,14 +1978,24 @@ static int tcp_zerocopy_receive(struct sock *sk,
zc->recv_skip_hint -= remaining;
break;
}
- ret = vm_insert_page(vma, address + length,
- skb_frag_page(frags));
- if (ret)
- break;
+ pages[pg_idx] = skb_frag_page(frags);
+ pg_idx++;
length += PAGE_SIZE;
- seq += PAGE_SIZE;
zc->recv_skip_hint -= PAGE_SIZE;
frags++;
+ if (pg_idx == PAGE_BATCH_SIZE) {
+ ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
+ &curr_addr, &length,
+ &seq, zc);
+ if (ret)
+ goto out;
+ pg_idx = 0;
+ }
+ }
+ if (pg_idx) {
+ ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
+ &curr_addr, &length, &seq,
+ zc);
}
out:
up_read(¤t->mm->mmap_sem);
--
2.25.0.rc1.283.g88dfdc4193-goog
^ permalink raw reply related [flat|nested] 3+ messages in thread
end of thread, other threads:[~2020-01-11 0:29 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2020-01-11 0:28 [PATCH mm,net-next 1/3] mm: Refactor insert_page to prepare for batched-lock insert Arjun Roy
2020-01-11 0:28 ` [PATCH mm,net-next 2/3] mm: Add vm_insert_pages() Arjun Roy
2020-01-11 0:28 ` [PATCH mm,net-next 3/3] net-zerocopy: Use vm_insert_pages() for tcp rcv zerocopy Arjun Roy
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.