* [PATCH 1/2] vmsplice: unmap gifted pages for recipient
@ 2013-10-07 20:21 ` Robert C Jennings
0 siblings, 0 replies; 30+ messages in thread
From: Robert C Jennings @ 2013-10-07 20:21 UTC (permalink / raw)
To: linux-kernel
Cc: linux-fsdevel, linux-mm, Alexander Viro, Rik van Riel,
Andrea Arcangeli, Dave Hansen, Robert C Jennings, Matt Helsley,
Anthony Liguori, Michael Roth, Lei Li, Leonardo Garcia,
Vlastimil Babka
Introduce use of the unused SPLICE_F_MOVE flag for vmsplice to zap
pages.
When vmsplice is called with flags (SPLICE_F_GIFT | SPLICE_F_MOVE) the
writer's gift'ed pages would be zapped. This patch supports further work
to move vmsplice'd pages rather than copying them. That patch has the
restriction that the page must not be mapped by the source for the move,
otherwise it will fall back to copying the page.
Signed-off-by: Matt Helsley <matt.helsley@gmail.com>
Signed-off-by: Robert C Jennings <rcj@linux.vnet.ibm.com>
---
Since the RFC went out I have coalesced the zap_page_range() call to
operate on VMAs rather than calling this for each page. For a 256MB
vmsplice this reduced the write side 50% from the RFC.
---
fs/splice.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-
include/linux/splice.h | 1 +
2 files changed, 51 insertions(+), 1 deletion(-)
diff --git a/fs/splice.c b/fs/splice.c
index 3b7ee65..a62d61e 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -188,12 +188,17 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
{
unsigned int spd_pages = spd->nr_pages;
int ret, do_wakeup, page_nr;
+ struct vm_area_struct *vma;
+ unsigned long user_start, user_end;
ret = 0;
do_wakeup = 0;
page_nr = 0;
+ vma = NULL;
+ user_start = user_end = 0;
pipe_lock(pipe);
+ down_read(¤t->mm->mmap_sem);
for (;;) {
if (!pipe->readers) {
@@ -212,8 +217,44 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
buf->len = spd->partial[page_nr].len;
buf->private = spd->partial[page_nr].private;
buf->ops = spd->ops;
- if (spd->flags & SPLICE_F_GIFT)
+ if (spd->flags & SPLICE_F_GIFT) {
+ unsigned long useraddr =
+ spd->partial[page_nr].useraddr;
+
+ if ((spd->flags & SPLICE_F_MOVE) &&
+ !buf->offset &&
+ (buf->len == PAGE_SIZE)) {
+ /* Can move page aligned buf, gather
+ * requests to make a single
+ * zap_page_range() call per VMA
+ */
+ if (vma && (useraddr == user_end) &&
+ ((useraddr + PAGE_SIZE) <=
+ vma->vm_end)) {
+ /* same vma, no holes */
+ user_end += PAGE_SIZE;
+ } else {
+ if (vma)
+ zap_page_range(vma,
+ user_start,
+ (user_end -
+ user_start),
+ NULL);
+ vma = find_vma_intersection(
+ current->mm,
+ useraddr,
+ (useraddr +
+ PAGE_SIZE));
+ if (!IS_ERR_OR_NULL(vma)) {
+ user_start = useraddr;
+ user_end = (useraddr +
+ PAGE_SIZE);
+ } else
+ vma = NULL;
+ }
+ }
buf->flags |= PIPE_BUF_FLAG_GIFT;
+ }
pipe->nrbufs++;
page_nr++;
@@ -255,6 +296,10 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
pipe->waiting_writers--;
}
+ if (vma)
+ zap_page_range(vma, user_start, (user_end - user_start), NULL);
+
+ up_read(¤t->mm->mmap_sem);
pipe_unlock(pipe);
if (do_wakeup)
@@ -485,6 +530,7 @@ fill_it:
spd.partial[page_nr].offset = loff;
spd.partial[page_nr].len = this_len;
+ spd.partial[page_nr].useraddr = index << PAGE_CACHE_SHIFT;
len -= this_len;
loff = 0;
spd.nr_pages++;
@@ -656,6 +702,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
this_len = min_t(size_t, vec[i].iov_len, res);
spd.partial[i].offset = 0;
spd.partial[i].len = this_len;
+ spd.partial[i].useraddr = (unsigned long)vec[i].iov_base;
if (!this_len) {
__free_page(spd.pages[i]);
spd.pages[i] = NULL;
@@ -1475,6 +1522,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
partial[buffers].offset = off;
partial[buffers].len = plen;
+ partial[buffers].useraddr = (unsigned long)base;
+ base = (void*)((unsigned long)base + PAGE_SIZE);
off = 0;
len -= plen;
diff --git a/include/linux/splice.h b/include/linux/splice.h
index 74575cb..56661e3 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -44,6 +44,7 @@ struct partial_page {
unsigned int offset;
unsigned int len;
unsigned long private;
+ unsigned long useraddr;
};
/*
--
1.8.1.2
^ permalink raw reply related [flat|nested] 30+ messages in thread* Re: [PATCH 1/2] vmsplice: unmap gifted pages for recipient
2013-10-07 20:21 ` Robert C Jennings
@ 2013-10-08 16:14 ` Dave Hansen
-1 siblings, 0 replies; 30+ messages in thread
From: Dave Hansen @ 2013-10-08 16:14 UTC (permalink / raw)
To: Robert C Jennings, linux-kernel
Cc: linux-fsdevel, linux-mm, Alexander Viro, Rik van Riel,
Andrea Arcangeli, Matt Helsley, Anthony Liguori, Michael Roth,
Lei Li, Leonardo Garcia, Vlastimil Babka
On 10/07/2013 01:21 PM, Robert C Jennings wrote:
> + } else {
> + if (vma)
> + zap_page_range(vma,
> + user_start,
> + (user_end -
> + user_start),
> + NULL);
> + vma = find_vma_intersection(
> + current->mm,
> + useraddr,
> + (useraddr +
> + PAGE_SIZE));
> + if (!IS_ERR_OR_NULL(vma)) {
> + user_start = useraddr;
> + user_end = (useraddr +
> + PAGE_SIZE);
> + } else
> + vma = NULL;
> + }
This is pretty unspeakably hideous. Was there truly no better way to do
this?
^ permalink raw reply [flat|nested] 30+ messages in thread* Re: [PATCH 1/2] vmsplice: unmap gifted pages for recipient
@ 2013-10-08 16:14 ` Dave Hansen
0 siblings, 0 replies; 30+ messages in thread
From: Dave Hansen @ 2013-10-08 16:14 UTC (permalink / raw)
To: Robert C Jennings, linux-kernel
Cc: linux-fsdevel, linux-mm, Alexander Viro, Rik van Riel,
Andrea Arcangeli, Matt Helsley, Anthony Liguori, Michael Roth,
Lei Li, Leonardo Garcia, Vlastimil Babka
On 10/07/2013 01:21 PM, Robert C Jennings wrote:
> + } else {
> + if (vma)
> + zap_page_range(vma,
> + user_start,
> + (user_end -
> + user_start),
> + NULL);
> + vma = find_vma_intersection(
> + current->mm,
> + useraddr,
> + (useraddr +
> + PAGE_SIZE));
> + if (!IS_ERR_OR_NULL(vma)) {
> + user_start = useraddr;
> + user_end = (useraddr +
> + PAGE_SIZE);
> + } else
> + vma = NULL;
> + }
This is pretty unspeakably hideous. Was there truly no better way to do
this?
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 30+ messages in thread* Re: [PATCH 1/2] vmsplice: unmap gifted pages for recipient
2013-10-08 16:14 ` Dave Hansen
@ 2013-10-08 19:48 ` Robert Jennings
-1 siblings, 0 replies; 30+ messages in thread
From: Robert Jennings @ 2013-10-08 19:48 UTC (permalink / raw)
To: Dave Hansen
Cc: linux-kernel, linux-fsdevel, linux-mm, Alexander Viro,
Rik van Riel, Andrea Arcangeli, Matt Helsley, Anthony Liguori,
Michael Roth, Lei Li, Leonardo Garcia, Vlastimil Babka
* Dave Hansen (dave@sr71.net) wrote:
> On 10/07/2013 01:21 PM, Robert C Jennings wrote:
> > + } else {
> > + if (vma)
> > + zap_page_range(vma,
> > + user_start,
> > + (user_end -
> > + user_start),
> > + NULL);
> > + vma = find_vma_intersection(
> > + current->mm,
> > + useraddr,
> > + (useraddr +
> > + PAGE_SIZE));
> > + if (!IS_ERR_OR_NULL(vma)) {
> > + user_start = useraddr;
> > + user_end = (useraddr +
> > + PAGE_SIZE);
> > + } else
> > + vma = NULL;
> > + }
>
> This is pretty unspeakably hideous. Was there truly no better way to do
> this?
I was hoping to find a better way to coalesce pipe buffers and zap
entire VMAs (and it needs better documentation but your argument is with
structure and I agree). I would love suggestions for improving this but
that is not to say that I've abandoned it; I'm still looking for ways
to make this cleaner.
Doing find_vma() on a single page in the VMA rather than on each and
then zapping once provides a 50% runtime reduction for the writer when
tested with a 256MB vmsplice operation. Based on the result I felt that
coalescing was justfied but the implementation is ugly.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 30+ messages in thread* Re: [PATCH 1/2] vmsplice: unmap gifted pages for recipient
@ 2013-10-08 19:48 ` Robert Jennings
0 siblings, 0 replies; 30+ messages in thread
From: Robert Jennings @ 2013-10-08 19:48 UTC (permalink / raw)
To: Dave Hansen
Cc: linux-kernel, linux-fsdevel, linux-mm, Alexander Viro,
Rik van Riel, Andrea Arcangeli, Matt Helsley, Anthony Liguori,
Michael Roth, Lei Li, Leonardo Garcia, Vlastimil Babka
* Dave Hansen (dave@sr71.net) wrote:
> On 10/07/2013 01:21 PM, Robert C Jennings wrote:
> > + } else {
> > + if (vma)
> > + zap_page_range(vma,
> > + user_start,
> > + (user_end -
> > + user_start),
> > + NULL);
> > + vma = find_vma_intersection(
> > + current->mm,
> > + useraddr,
> > + (useraddr +
> > + PAGE_SIZE));
> > + if (!IS_ERR_OR_NULL(vma)) {
> > + user_start = useraddr;
> > + user_end = (useraddr +
> > + PAGE_SIZE);
> > + } else
> > + vma = NULL;
> > + }
>
> This is pretty unspeakably hideous. Was there truly no better way to do
> this?
I was hoping to find a better way to coalesce pipe buffers and zap
entire VMAs (and it needs better documentation but your argument is with
structure and I agree). I would love suggestions for improving this but
that is not to say that I've abandoned it; I'm still looking for ways
to make this cleaner.
Doing find_vma() on a single page in the VMA rather than on each and
then zapping once provides a 50% runtime reduction for the writer when
tested with a 256MB vmsplice operation. Based on the result I felt that
coalescing was justfied but the implementation is ugly.
^ permalink raw reply [flat|nested] 30+ messages in thread* Re: [PATCH 1/2] vmsplice: unmap gifted pages for recipient
2013-10-08 19:48 ` Robert Jennings
@ 2013-10-08 21:22 ` Dave Hansen
-1 siblings, 0 replies; 30+ messages in thread
From: Dave Hansen @ 2013-10-08 21:22 UTC (permalink / raw)
To: linux-kernel, linux-fsdevel, linux-mm, Alexander Viro,
Rik van Riel, Andrea Arcangeli, Matt Helsley, Anthony Liguori,
Michael Roth, Lei Li, Leonardo Garcia, Vlastimil Babka
On 10/08/2013 12:48 PM, Robert Jennings wrote:
> * Dave Hansen (dave@sr71.net) wrote:
>> On 10/07/2013 01:21 PM, Robert C Jennings wrote:
>>> + } else {
>>> + if (vma)
>>> + zap_page_range(vma,
>>> + user_start,
>>> + (user_end -
>>> + user_start),
>>> + NULL);
>>> + vma = find_vma_intersection(
>>> + current->mm,
>>> + useraddr,
>>> + (useraddr +
>>> + PAGE_SIZE));
>>> + if (!IS_ERR_OR_NULL(vma)) {
>>> + user_start = useraddr;
>>> + user_end = (useraddr +
>>> + PAGE_SIZE);
>>> + } else
>>> + vma = NULL;
>>> + }
>>
>> This is pretty unspeakably hideous. Was there truly no better way to do
>> this?
>
> I was hoping to find a better way to coalesce pipe buffers and zap
> entire VMAs (and it needs better documentation but your argument is with
> structure and I agree). I would love suggestions for improving this but
> that is not to say that I've abandoned it; I'm still looking for ways
> to make this cleaner.
Doing the VMA search each and every time seems a bit silly. Do one
find_vma(), the look at the _end_ virtual address of the VMA. You can
continue to collect your set of zap_page_range() addresses as long as
you do not hit the end of that address range.
If and only if you hit the end of the vma, do the zap_page_range(), and
then look up the VMA again.
Storing the .useraddr still seems odd to me, and you haven't fully
explained why you're doing it or how it is safe, or why you store both
virtual addresses and file locations in it.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 30+ messages in thread* Re: [PATCH 1/2] vmsplice: unmap gifted pages for recipient
@ 2013-10-08 21:22 ` Dave Hansen
0 siblings, 0 replies; 30+ messages in thread
From: Dave Hansen @ 2013-10-08 21:22 UTC (permalink / raw)
To: linux-kernel, linux-fsdevel, linux-mm, Alexander Viro,
Rik van Riel, Andrea Arcangeli, Matt Helsley, Anthony Liguori,
Michael Roth, Lei Li, Leonardo Garcia, Vlastimil Babka
On 10/08/2013 12:48 PM, Robert Jennings wrote:
> * Dave Hansen (dave@sr71.net) wrote:
>> On 10/07/2013 01:21 PM, Robert C Jennings wrote:
>>> + } else {
>>> + if (vma)
>>> + zap_page_range(vma,
>>> + user_start,
>>> + (user_end -
>>> + user_start),
>>> + NULL);
>>> + vma = find_vma_intersection(
>>> + current->mm,
>>> + useraddr,
>>> + (useraddr +
>>> + PAGE_SIZE));
>>> + if (!IS_ERR_OR_NULL(vma)) {
>>> + user_start = useraddr;
>>> + user_end = (useraddr +
>>> + PAGE_SIZE);
>>> + } else
>>> + vma = NULL;
>>> + }
>>
>> This is pretty unspeakably hideous. Was there truly no better way to do
>> this?
>
> I was hoping to find a better way to coalesce pipe buffers and zap
> entire VMAs (and it needs better documentation but your argument is with
> structure and I agree). I would love suggestions for improving this but
> that is not to say that I've abandoned it; I'm still looking for ways
> to make this cleaner.
Doing the VMA search each and every time seems a bit silly. Do one
find_vma(), the look at the _end_ virtual address of the VMA. You can
continue to collect your set of zap_page_range() addresses as long as
you do not hit the end of that address range.
If and only if you hit the end of the vma, do the zap_page_range(), and
then look up the VMA again.
Storing the .useraddr still seems odd to me, and you haven't fully
explained why you're doing it or how it is safe, or why you store both
virtual addresses and file locations in it.
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 1/2] vmsplice: unmap gifted pages for recipient
2013-10-07 20:21 ` Robert C Jennings
@ 2013-10-08 16:23 ` Dave Hansen
-1 siblings, 0 replies; 30+ messages in thread
From: Dave Hansen @ 2013-10-08 16:23 UTC (permalink / raw)
To: Robert C Jennings, linux-kernel
Cc: linux-fsdevel, linux-mm, Alexander Viro, Rik van Riel,
Andrea Arcangeli, Matt Helsley, Anthony Liguori, Michael Roth,
Lei Li, Leonardo Garcia, Vlastimil Babka
On 10/07/2013 01:21 PM, Robert C Jennings wrote:
> spd.partial[page_nr].offset = loff;
> spd.partial[page_nr].len = this_len;
> + spd.partial[page_nr].useraddr = index << PAGE_CACHE_SHIFT;
> len -= this_len;
> loff = 0;
> spd.nr_pages++;
> @@ -656,6 +702,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
> this_len = min_t(size_t, vec[i].iov_len, res);
> spd.partial[i].offset = 0;
> spd.partial[i].len = this_len;
> + spd.partial[i].useraddr = (unsigned long)vec[i].iov_base;
> if (!this_len) {
> __free_page(spd.pages[i]);
> spd.pages[i] = NULL;
> @@ -1475,6 +1522,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
>
> partial[buffers].offset = off;
> partial[buffers].len = plen;
> + partial[buffers].useraddr = (unsigned long)base;
> + base = (void*)((unsigned long)base + PAGE_SIZE);
>
> off = 0;
> len -= plen;
> diff --git a/include/linux/splice.h b/include/linux/splice.h
> index 74575cb..56661e3 100644
> --- a/include/linux/splice.h
> +++ b/include/linux/splice.h
> @@ -44,6 +44,7 @@ struct partial_page {
> unsigned int offset;
> unsigned int len;
> unsigned long private;
> + unsigned long useraddr;
> };
"useraddr" confuses me. You make it an 'unsigned long', yet two of the
three assignments are from "void __user *". The other assignment:
spd.partial[page_nr].useraddr = index << PAGE_CACHE_SHIFT;
'index' looks to be the offset inside the file, not a user address, so
I'm confused what that is doing.
Could you elaborate a little more on why 'useraddr' is suddenly needed
in these patches? How is "index << PAGE_CACHE_SHIFT" a virtual address?
Also, are we losing any of the advantages of sparse checking since
'useraddr' is without the __user annotation?
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 30+ messages in thread* Re: [PATCH 1/2] vmsplice: unmap gifted pages for recipient
@ 2013-10-08 16:23 ` Dave Hansen
0 siblings, 0 replies; 30+ messages in thread
From: Dave Hansen @ 2013-10-08 16:23 UTC (permalink / raw)
To: Robert C Jennings, linux-kernel
Cc: linux-fsdevel, linux-mm, Alexander Viro, Rik van Riel,
Andrea Arcangeli, Matt Helsley, Anthony Liguori, Michael Roth,
Lei Li, Leonardo Garcia, Vlastimil Babka
On 10/07/2013 01:21 PM, Robert C Jennings wrote:
> spd.partial[page_nr].offset = loff;
> spd.partial[page_nr].len = this_len;
> + spd.partial[page_nr].useraddr = index << PAGE_CACHE_SHIFT;
> len -= this_len;
> loff = 0;
> spd.nr_pages++;
> @@ -656,6 +702,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
> this_len = min_t(size_t, vec[i].iov_len, res);
> spd.partial[i].offset = 0;
> spd.partial[i].len = this_len;
> + spd.partial[i].useraddr = (unsigned long)vec[i].iov_base;
> if (!this_len) {
> __free_page(spd.pages[i]);
> spd.pages[i] = NULL;
> @@ -1475,6 +1522,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
>
> partial[buffers].offset = off;
> partial[buffers].len = plen;
> + partial[buffers].useraddr = (unsigned long)base;
> + base = (void*)((unsigned long)base + PAGE_SIZE);
>
> off = 0;
> len -= plen;
> diff --git a/include/linux/splice.h b/include/linux/splice.h
> index 74575cb..56661e3 100644
> --- a/include/linux/splice.h
> +++ b/include/linux/splice.h
> @@ -44,6 +44,7 @@ struct partial_page {
> unsigned int offset;
> unsigned int len;
> unsigned long private;
> + unsigned long useraddr;
> };
"useraddr" confuses me. You make it an 'unsigned long', yet two of the
three assignments are from "void __user *". The other assignment:
spd.partial[page_nr].useraddr = index << PAGE_CACHE_SHIFT;
'index' looks to be the offset inside the file, not a user address, so
I'm confused what that is doing.
Could you elaborate a little more on why 'useraddr' is suddenly needed
in these patches? How is "index << PAGE_CACHE_SHIFT" a virtual address?
Also, are we losing any of the advantages of sparse checking since
'useraddr' is without the __user annotation?
^ permalink raw reply [flat|nested] 30+ messages in thread* Re: [PATCH 1/2] vmsplice: unmap gifted pages for recipient
2013-10-08 16:23 ` Dave Hansen
@ 2013-10-17 13:54 ` Robert Jennings
-1 siblings, 0 replies; 30+ messages in thread
From: Robert Jennings @ 2013-10-17 13:54 UTC (permalink / raw)
To: Dave Hansen
Cc: linux-kernel, linux-fsdevel, linux-mm, Alexander Viro,
Rik van Riel, Andrea Arcangeli, Matt Helsley, Anthony Liguori,
Michael Roth, Lei Li, Leonardo Garcia, Vlastimil Babka
* Dave Hansen (dave@sr71.net) wrote:
> On 10/07/2013 01:21 PM, Robert C Jennings wrote:
> > spd.partial[page_nr].offset = loff;
> > spd.partial[page_nr].len = this_len;
> > + spd.partial[page_nr].useraddr = index << PAGE_CACHE_SHIFT;
> > len -= this_len;
> > loff = 0;
> > spd.nr_pages++;
> > @@ -656,6 +702,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
> > this_len = min_t(size_t, vec[i].iov_len, res);
> > spd.partial[i].offset = 0;
> > spd.partial[i].len = this_len;
> > + spd.partial[i].useraddr = (unsigned long)vec[i].iov_base;
> > if (!this_len) {
> > __free_page(spd.pages[i]);
> > spd.pages[i] = NULL;
> > @@ -1475,6 +1522,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
> >
> > partial[buffers].offset = off;
> > partial[buffers].len = plen;
> > + partial[buffers].useraddr = (unsigned long)base;
> > + base = (void*)((unsigned long)base + PAGE_SIZE);
> >
> > off = 0;
> > len -= plen;
> > diff --git a/include/linux/splice.h b/include/linux/splice.h
> > index 74575cb..56661e3 100644
> > --- a/include/linux/splice.h
> > +++ b/include/linux/splice.h
> > @@ -44,6 +44,7 @@ struct partial_page {
> > unsigned int offset;
> > unsigned int len;
> > unsigned long private;
> > + unsigned long useraddr;
> > };
>
> "useraddr" confuses me. You make it an 'unsigned long', yet two of the
> three assignments are from "void __user *". The other assignment:
>
> spd.partial[page_nr].useraddr = index << PAGE_CACHE_SHIFT;
>
> 'index' looks to be the offset inside the file, not a user address, so
> I'm confused what that is doing.
>
> Could you elaborate a little more on why 'useraddr' is suddenly needed
> in these patches? How is "index << PAGE_CACHE_SHIFT" a virtual address?
> Also, are we losing any of the advantages of sparse checking since
> 'useraddr' is without the __user annotation?
>
I'm working on cleaning this up. Trying to remove useraddr altogher
through the use of the existing 'private' field just for the
splice_to_user/pipe_to_user flow without upsetting other uses of the
private field.
^ permalink raw reply [flat|nested] 30+ messages in thread* Re: [PATCH 1/2] vmsplice: unmap gifted pages for recipient
@ 2013-10-17 13:54 ` Robert Jennings
0 siblings, 0 replies; 30+ messages in thread
From: Robert Jennings @ 2013-10-17 13:54 UTC (permalink / raw)
To: Dave Hansen
Cc: linux-kernel, linux-fsdevel, linux-mm, Alexander Viro,
Rik van Riel, Andrea Arcangeli, Matt Helsley, Anthony Liguori,
Michael Roth, Lei Li, Leonardo Garcia, Vlastimil Babka
* Dave Hansen (dave@sr71.net) wrote:
> On 10/07/2013 01:21 PM, Robert C Jennings wrote:
> > spd.partial[page_nr].offset = loff;
> > spd.partial[page_nr].len = this_len;
> > + spd.partial[page_nr].useraddr = index << PAGE_CACHE_SHIFT;
> > len -= this_len;
> > loff = 0;
> > spd.nr_pages++;
> > @@ -656,6 +702,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
> > this_len = min_t(size_t, vec[i].iov_len, res);
> > spd.partial[i].offset = 0;
> > spd.partial[i].len = this_len;
> > + spd.partial[i].useraddr = (unsigned long)vec[i].iov_base;
> > if (!this_len) {
> > __free_page(spd.pages[i]);
> > spd.pages[i] = NULL;
> > @@ -1475,6 +1522,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
> >
> > partial[buffers].offset = off;
> > partial[buffers].len = plen;
> > + partial[buffers].useraddr = (unsigned long)base;
> > + base = (void*)((unsigned long)base + PAGE_SIZE);
> >
> > off = 0;
> > len -= plen;
> > diff --git a/include/linux/splice.h b/include/linux/splice.h
> > index 74575cb..56661e3 100644
> > --- a/include/linux/splice.h
> > +++ b/include/linux/splice.h
> > @@ -44,6 +44,7 @@ struct partial_page {
> > unsigned int offset;
> > unsigned int len;
> > unsigned long private;
> > + unsigned long useraddr;
> > };
>
> "useraddr" confuses me. You make it an 'unsigned long', yet two of the
> three assignments are from "void __user *". The other assignment:
>
> spd.partial[page_nr].useraddr = index << PAGE_CACHE_SHIFT;
>
> 'index' looks to be the offset inside the file, not a user address, so
> I'm confused what that is doing.
>
> Could you elaborate a little more on why 'useraddr' is suddenly needed
> in these patches? How is "index << PAGE_CACHE_SHIFT" a virtual address?
> Also, are we losing any of the advantages of sparse checking since
> 'useraddr' is without the __user annotation?
>
I'm working on cleaning this up. Trying to remove useraddr altogher
through the use of the existing 'private' field just for the
splice_to_user/pipe_to_user flow without upsetting other uses of the
private field.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 30+ messages in thread
* Re: [PATCH 1/2] vmsplice: unmap gifted pages for recipient
2013-10-07 20:21 ` Robert C Jennings
@ 2013-10-17 10:20 ` Vlastimil Babka
-1 siblings, 0 replies; 30+ messages in thread
From: Vlastimil Babka @ 2013-10-17 10:20 UTC (permalink / raw)
To: Robert C Jennings, linux-kernel
Cc: linux-fsdevel, linux-mm, Alexander Viro, Rik van Riel,
Andrea Arcangeli, Dave Hansen, Matt Helsley, Anthony Liguori,
Michael Roth, Lei Li, Leonardo Garcia
On 10/07/2013 10:21 PM, Robert C Jennings wrote:
> Introduce use of the unused SPLICE_F_MOVE flag for vmsplice to zap
> pages.
>
> When vmsplice is called with flags (SPLICE_F_GIFT | SPLICE_F_MOVE) the
> writer's gift'ed pages would be zapped. This patch supports further work
> to move vmsplice'd pages rather than copying them. That patch has the
> restriction that the page must not be mapped by the source for the move,
> otherwise it will fall back to copying the page.
>
> Signed-off-by: Matt Helsley <matt.helsley@gmail.com>
> Signed-off-by: Robert C Jennings <rcj@linux.vnet.ibm.com>
> ---
> Since the RFC went out I have coalesced the zap_page_range() call to
> operate on VMAs rather than calling this for each page. For a 256MB
> vmsplice this reduced the write side 50% from the RFC.
> ---
> fs/splice.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-
> include/linux/splice.h | 1 +
> 2 files changed, 51 insertions(+), 1 deletion(-)
>
> diff --git a/fs/splice.c b/fs/splice.c
> index 3b7ee65..a62d61e 100644
> --- a/fs/splice.c
> +++ b/fs/splice.c
> @@ -188,12 +188,17 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
> {
> unsigned int spd_pages = spd->nr_pages;
> int ret, do_wakeup, page_nr;
> + struct vm_area_struct *vma;
> + unsigned long user_start, user_end;
>
> ret = 0;
> do_wakeup = 0;
> page_nr = 0;
> + vma = NULL;
> + user_start = user_end = 0;
>
> pipe_lock(pipe);
> + down_read(¤t->mm->mmap_sem);
Seems like you could take the mmap_sem only when GIFT and MOVE is set.
Maybe it won't help that much for performance but at least serve as
documenting the reason it's needed?
Vlastimil
> for (;;) {
> if (!pipe->readers) {
> @@ -212,8 +217,44 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
> buf->len = spd->partial[page_nr].len;
> buf->private = spd->partial[page_nr].private;
> buf->ops = spd->ops;
> - if (spd->flags & SPLICE_F_GIFT)
> + if (spd->flags & SPLICE_F_GIFT) {
> + unsigned long useraddr =
> + spd->partial[page_nr].useraddr;
> +
> + if ((spd->flags & SPLICE_F_MOVE) &&
> + !buf->offset &&
> + (buf->len == PAGE_SIZE)) {
> + /* Can move page aligned buf, gather
> + * requests to make a single
> + * zap_page_range() call per VMA
> + */
> + if (vma && (useraddr == user_end) &&
> + ((useraddr + PAGE_SIZE) <=
> + vma->vm_end)) {
> + /* same vma, no holes */
> + user_end += PAGE_SIZE;
> + } else {
> + if (vma)
> + zap_page_range(vma,
> + user_start,
> + (user_end -
> + user_start),
> + NULL);
> + vma = find_vma_intersection(
> + current->mm,
> + useraddr,
> + (useraddr +
> + PAGE_SIZE));
> + if (!IS_ERR_OR_NULL(vma)) {
> + user_start = useraddr;
> + user_end = (useraddr +
> + PAGE_SIZE);
> + } else
> + vma = NULL;
> + }
> + }
> buf->flags |= PIPE_BUF_FLAG_GIFT;
> + }
>
> pipe->nrbufs++;
> page_nr++;
> @@ -255,6 +296,10 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
> pipe->waiting_writers--;
> }
>
> + if (vma)
> + zap_page_range(vma, user_start, (user_end - user_start), NULL);
> +
> + up_read(¤t->mm->mmap_sem);
> pipe_unlock(pipe);
>
> if (do_wakeup)
> @@ -485,6 +530,7 @@ fill_it:
>
> spd.partial[page_nr].offset = loff;
> spd.partial[page_nr].len = this_len;
> + spd.partial[page_nr].useraddr = index << PAGE_CACHE_SHIFT;
> len -= this_len;
> loff = 0;
> spd.nr_pages++;
> @@ -656,6 +702,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
> this_len = min_t(size_t, vec[i].iov_len, res);
> spd.partial[i].offset = 0;
> spd.partial[i].len = this_len;
> + spd.partial[i].useraddr = (unsigned long)vec[i].iov_base;
> if (!this_len) {
> __free_page(spd.pages[i]);
> spd.pages[i] = NULL;
> @@ -1475,6 +1522,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
>
> partial[buffers].offset = off;
> partial[buffers].len = plen;
> + partial[buffers].useraddr = (unsigned long)base;
> + base = (void*)((unsigned long)base + PAGE_SIZE);
>
> off = 0;
> len -= plen;
> diff --git a/include/linux/splice.h b/include/linux/splice.h
> index 74575cb..56661e3 100644
> --- a/include/linux/splice.h
> +++ b/include/linux/splice.h
> @@ -44,6 +44,7 @@ struct partial_page {
> unsigned int offset;
> unsigned int len;
> unsigned long private;
> + unsigned long useraddr;
> };
>
> /*
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 30+ messages in thread* Re: [PATCH 1/2] vmsplice: unmap gifted pages for recipient
@ 2013-10-17 10:20 ` Vlastimil Babka
0 siblings, 0 replies; 30+ messages in thread
From: Vlastimil Babka @ 2013-10-17 10:20 UTC (permalink / raw)
To: Robert C Jennings, linux-kernel
Cc: linux-fsdevel, linux-mm, Alexander Viro, Rik van Riel,
Andrea Arcangeli, Dave Hansen, Matt Helsley, Anthony Liguori,
Michael Roth, Lei Li, Leonardo Garcia
On 10/07/2013 10:21 PM, Robert C Jennings wrote:
> Introduce use of the unused SPLICE_F_MOVE flag for vmsplice to zap
> pages.
>
> When vmsplice is called with flags (SPLICE_F_GIFT | SPLICE_F_MOVE) the
> writer's gift'ed pages would be zapped. This patch supports further work
> to move vmsplice'd pages rather than copying them. That patch has the
> restriction that the page must not be mapped by the source for the move,
> otherwise it will fall back to copying the page.
>
> Signed-off-by: Matt Helsley <matt.helsley@gmail.com>
> Signed-off-by: Robert C Jennings <rcj@linux.vnet.ibm.com>
> ---
> Since the RFC went out I have coalesced the zap_page_range() call to
> operate on VMAs rather than calling this for each page. For a 256MB
> vmsplice this reduced the write side 50% from the RFC.
> ---
> fs/splice.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-
> include/linux/splice.h | 1 +
> 2 files changed, 51 insertions(+), 1 deletion(-)
>
> diff --git a/fs/splice.c b/fs/splice.c
> index 3b7ee65..a62d61e 100644
> --- a/fs/splice.c
> +++ b/fs/splice.c
> @@ -188,12 +188,17 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
> {
> unsigned int spd_pages = spd->nr_pages;
> int ret, do_wakeup, page_nr;
> + struct vm_area_struct *vma;
> + unsigned long user_start, user_end;
>
> ret = 0;
> do_wakeup = 0;
> page_nr = 0;
> + vma = NULL;
> + user_start = user_end = 0;
>
> pipe_lock(pipe);
> + down_read(¤t->mm->mmap_sem);
Seems like you could take the mmap_sem only when GIFT and MOVE is set.
Maybe it won't help that much for performance but at least serve as
documenting the reason it's needed?
Vlastimil
> for (;;) {
> if (!pipe->readers) {
> @@ -212,8 +217,44 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
> buf->len = spd->partial[page_nr].len;
> buf->private = spd->partial[page_nr].private;
> buf->ops = spd->ops;
> - if (spd->flags & SPLICE_F_GIFT)
> + if (spd->flags & SPLICE_F_GIFT) {
> + unsigned long useraddr =
> + spd->partial[page_nr].useraddr;
> +
> + if ((spd->flags & SPLICE_F_MOVE) &&
> + !buf->offset &&
> + (buf->len == PAGE_SIZE)) {
> + /* Can move page aligned buf, gather
> + * requests to make a single
> + * zap_page_range() call per VMA
> + */
> + if (vma && (useraddr == user_end) &&
> + ((useraddr + PAGE_SIZE) <=
> + vma->vm_end)) {
> + /* same vma, no holes */
> + user_end += PAGE_SIZE;
> + } else {
> + if (vma)
> + zap_page_range(vma,
> + user_start,
> + (user_end -
> + user_start),
> + NULL);
> + vma = find_vma_intersection(
> + current->mm,
> + useraddr,
> + (useraddr +
> + PAGE_SIZE));
> + if (!IS_ERR_OR_NULL(vma)) {
> + user_start = useraddr;
> + user_end = (useraddr +
> + PAGE_SIZE);
> + } else
> + vma = NULL;
> + }
> + }
> buf->flags |= PIPE_BUF_FLAG_GIFT;
> + }
>
> pipe->nrbufs++;
> page_nr++;
> @@ -255,6 +296,10 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
> pipe->waiting_writers--;
> }
>
> + if (vma)
> + zap_page_range(vma, user_start, (user_end - user_start), NULL);
> +
> + up_read(¤t->mm->mmap_sem);
> pipe_unlock(pipe);
>
> if (do_wakeup)
> @@ -485,6 +530,7 @@ fill_it:
>
> spd.partial[page_nr].offset = loff;
> spd.partial[page_nr].len = this_len;
> + spd.partial[page_nr].useraddr = index << PAGE_CACHE_SHIFT;
> len -= this_len;
> loff = 0;
> spd.nr_pages++;
> @@ -656,6 +702,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
> this_len = min_t(size_t, vec[i].iov_len, res);
> spd.partial[i].offset = 0;
> spd.partial[i].len = this_len;
> + spd.partial[i].useraddr = (unsigned long)vec[i].iov_base;
> if (!this_len) {
> __free_page(spd.pages[i]);
> spd.pages[i] = NULL;
> @@ -1475,6 +1522,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
>
> partial[buffers].offset = off;
> partial[buffers].len = plen;
> + partial[buffers].useraddr = (unsigned long)base;
> + base = (void*)((unsigned long)base + PAGE_SIZE);
>
> off = 0;
> len -= plen;
> diff --git a/include/linux/splice.h b/include/linux/splice.h
> index 74575cb..56661e3 100644
> --- a/include/linux/splice.h
> +++ b/include/linux/splice.h
> @@ -44,6 +44,7 @@ struct partial_page {
> unsigned int offset;
> unsigned int len;
> unsigned long private;
> + unsigned long useraddr;
> };
>
> /*
>
^ permalink raw reply [flat|nested] 30+ messages in thread* Re: [PATCH 1/2] vmsplice: unmap gifted pages for recipient
2013-10-17 10:20 ` Vlastimil Babka
@ 2013-10-17 13:48 ` Robert Jennings
-1 siblings, 0 replies; 30+ messages in thread
From: Robert Jennings @ 2013-10-17 13:48 UTC (permalink / raw)
To: Vlastimil Babka
Cc: linux-kernel, linux-fsdevel, linux-mm, Alexander Viro,
Rik van Riel, Andrea Arcangeli, Dave Hansen, Matt Helsley,
Anthony Liguori, Michael Roth, Lei Li, Leonardo Garcia
* Vlastimil Babka (vbabka@suse.cz) wrote:
> On 10/07/2013 10:21 PM, Robert C Jennings wrote:
> > Introduce use of the unused SPLICE_F_MOVE flag for vmsplice to zap
> > pages.
> >
> > When vmsplice is called with flags (SPLICE_F_GIFT | SPLICE_F_MOVE) the
> > writer's gift'ed pages would be zapped. This patch supports further work
> > to move vmsplice'd pages rather than copying them. That patch has the
> > restriction that the page must not be mapped by the source for the move,
> > otherwise it will fall back to copying the page.
> >
> > Signed-off-by: Matt Helsley <matt.helsley@gmail.com>
> > Signed-off-by: Robert C Jennings <rcj@linux.vnet.ibm.com>
> > ---
> > Since the RFC went out I have coalesced the zap_page_range() call to
> > operate on VMAs rather than calling this for each page. For a 256MB
> > vmsplice this reduced the write side 50% from the RFC.
> > ---
> > fs/splice.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-
> > include/linux/splice.h | 1 +
> > 2 files changed, 51 insertions(+), 1 deletion(-)
> >
> > diff --git a/fs/splice.c b/fs/splice.c
> > index 3b7ee65..a62d61e 100644
> > --- a/fs/splice.c
> > +++ b/fs/splice.c
> > @@ -188,12 +188,17 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
> > {
> > unsigned int spd_pages = spd->nr_pages;
> > int ret, do_wakeup, page_nr;
> > + struct vm_area_struct *vma;
> > + unsigned long user_start, user_end;
> >
> > ret = 0;
> > do_wakeup = 0;
> > page_nr = 0;
> > + vma = NULL;
> > + user_start = user_end = 0;
> >
> > pipe_lock(pipe);
> > + down_read(¤t->mm->mmap_sem);
>
> Seems like you could take the mmap_sem only when GIFT and MOVE is set.
> Maybe it won't help that much for performance but at least serve as
> documenting the reason it's needed?
>
> Vlastimil
>
I had been doing that previously but moving this outside the loop and
acquiring it once did improve performance. I'll add a comment on
down_read() as to the reason for taking this though.
-Rob
> > for (;;) {
> > if (!pipe->readers) {
> > @@ -212,8 +217,44 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
> > buf->len = spd->partial[page_nr].len;
> > buf->private = spd->partial[page_nr].private;
> > buf->ops = spd->ops;
> > - if (spd->flags & SPLICE_F_GIFT)
> > + if (spd->flags & SPLICE_F_GIFT) {
> > + unsigned long useraddr =
> > + spd->partial[page_nr].useraddr;
> > +
> > + if ((spd->flags & SPLICE_F_MOVE) &&
> > + !buf->offset &&
> > + (buf->len == PAGE_SIZE)) {
> > + /* Can move page aligned buf, gather
> > + * requests to make a single
> > + * zap_page_range() call per VMA
> > + */
> > + if (vma && (useraddr == user_end) &&
> > + ((useraddr + PAGE_SIZE) <=
> > + vma->vm_end)) {
> > + /* same vma, no holes */
> > + user_end += PAGE_SIZE;
> > + } else {
> > + if (vma)
> > + zap_page_range(vma,
> > + user_start,
> > + (user_end -
> > + user_start),
> > + NULL);
> > + vma = find_vma_intersection(
> > + current->mm,
> > + useraddr,
> > + (useraddr +
> > + PAGE_SIZE));
> > + if (!IS_ERR_OR_NULL(vma)) {
> > + user_start = useraddr;
> > + user_end = (useraddr +
> > + PAGE_SIZE);
> > + } else
> > + vma = NULL;
> > + }
> > + }
> > buf->flags |= PIPE_BUF_FLAG_GIFT;
> > + }
> >
> > pipe->nrbufs++;
> > page_nr++;
> > @@ -255,6 +296,10 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
> > pipe->waiting_writers--;
> > }
> >
> > + if (vma)
> > + zap_page_range(vma, user_start, (user_end - user_start), NULL);
> > +
> > + up_read(¤t->mm->mmap_sem);
> > pipe_unlock(pipe);
> >
> > if (do_wakeup)
> > @@ -485,6 +530,7 @@ fill_it:
> >
> > spd.partial[page_nr].offset = loff;
> > spd.partial[page_nr].len = this_len;
> > + spd.partial[page_nr].useraddr = index << PAGE_CACHE_SHIFT;
> > len -= this_len;
> > loff = 0;
> > spd.nr_pages++;
> > @@ -656,6 +702,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
> > this_len = min_t(size_t, vec[i].iov_len, res);
> > spd.partial[i].offset = 0;
> > spd.partial[i].len = this_len;
> > + spd.partial[i].useraddr = (unsigned long)vec[i].iov_base;
> > if (!this_len) {
> > __free_page(spd.pages[i]);
> > spd.pages[i] = NULL;
> > @@ -1475,6 +1522,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
> >
> > partial[buffers].offset = off;
> > partial[buffers].len = plen;
> > + partial[buffers].useraddr = (unsigned long)base;
> > + base = (void*)((unsigned long)base + PAGE_SIZE);
> >
> > off = 0;
> > len -= plen;
> > diff --git a/include/linux/splice.h b/include/linux/splice.h
> > index 74575cb..56661e3 100644
> > --- a/include/linux/splice.h
> > +++ b/include/linux/splice.h
> > @@ -44,6 +44,7 @@ struct partial_page {
> > unsigned int offset;
> > unsigned int len;
> > unsigned long private;
> > + unsigned long useraddr;
> > };
> >
> > /*
> >
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 30+ messages in thread* Re: [PATCH 1/2] vmsplice: unmap gifted pages for recipient
@ 2013-10-17 13:48 ` Robert Jennings
0 siblings, 0 replies; 30+ messages in thread
From: Robert Jennings @ 2013-10-17 13:48 UTC (permalink / raw)
To: Vlastimil Babka
Cc: linux-kernel, linux-fsdevel, linux-mm, Alexander Viro,
Rik van Riel, Andrea Arcangeli, Dave Hansen, Matt Helsley,
Anthony Liguori, Michael Roth, Lei Li, Leonardo Garcia
* Vlastimil Babka (vbabka@suse.cz) wrote:
> On 10/07/2013 10:21 PM, Robert C Jennings wrote:
> > Introduce use of the unused SPLICE_F_MOVE flag for vmsplice to zap
> > pages.
> >
> > When vmsplice is called with flags (SPLICE_F_GIFT | SPLICE_F_MOVE) the
> > writer's gift'ed pages would be zapped. This patch supports further work
> > to move vmsplice'd pages rather than copying them. That patch has the
> > restriction that the page must not be mapped by the source for the move,
> > otherwise it will fall back to copying the page.
> >
> > Signed-off-by: Matt Helsley <matt.helsley@gmail.com>
> > Signed-off-by: Robert C Jennings <rcj@linux.vnet.ibm.com>
> > ---
> > Since the RFC went out I have coalesced the zap_page_range() call to
> > operate on VMAs rather than calling this for each page. For a 256MB
> > vmsplice this reduced the write side 50% from the RFC.
> > ---
> > fs/splice.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-
> > include/linux/splice.h | 1 +
> > 2 files changed, 51 insertions(+), 1 deletion(-)
> >
> > diff --git a/fs/splice.c b/fs/splice.c
> > index 3b7ee65..a62d61e 100644
> > --- a/fs/splice.c
> > +++ b/fs/splice.c
> > @@ -188,12 +188,17 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
> > {
> > unsigned int spd_pages = spd->nr_pages;
> > int ret, do_wakeup, page_nr;
> > + struct vm_area_struct *vma;
> > + unsigned long user_start, user_end;
> >
> > ret = 0;
> > do_wakeup = 0;
> > page_nr = 0;
> > + vma = NULL;
> > + user_start = user_end = 0;
> >
> > pipe_lock(pipe);
> > + down_read(¤t->mm->mmap_sem);
>
> Seems like you could take the mmap_sem only when GIFT and MOVE is set.
> Maybe it won't help that much for performance but at least serve as
> documenting the reason it's needed?
>
> Vlastimil
>
I had been doing that previously but moving this outside the loop and
acquiring it once did improve performance. I'll add a comment on
down_read() as to the reason for taking this though.
-Rob
> > for (;;) {
> > if (!pipe->readers) {
> > @@ -212,8 +217,44 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
> > buf->len = spd->partial[page_nr].len;
> > buf->private = spd->partial[page_nr].private;
> > buf->ops = spd->ops;
> > - if (spd->flags & SPLICE_F_GIFT)
> > + if (spd->flags & SPLICE_F_GIFT) {
> > + unsigned long useraddr =
> > + spd->partial[page_nr].useraddr;
> > +
> > + if ((spd->flags & SPLICE_F_MOVE) &&
> > + !buf->offset &&
> > + (buf->len == PAGE_SIZE)) {
> > + /* Can move page aligned buf, gather
> > + * requests to make a single
> > + * zap_page_range() call per VMA
> > + */
> > + if (vma && (useraddr == user_end) &&
> > + ((useraddr + PAGE_SIZE) <=
> > + vma->vm_end)) {
> > + /* same vma, no holes */
> > + user_end += PAGE_SIZE;
> > + } else {
> > + if (vma)
> > + zap_page_range(vma,
> > + user_start,
> > + (user_end -
> > + user_start),
> > + NULL);
> > + vma = find_vma_intersection(
> > + current->mm,
> > + useraddr,
> > + (useraddr +
> > + PAGE_SIZE));
> > + if (!IS_ERR_OR_NULL(vma)) {
> > + user_start = useraddr;
> > + user_end = (useraddr +
> > + PAGE_SIZE);
> > + } else
> > + vma = NULL;
> > + }
> > + }
> > buf->flags |= PIPE_BUF_FLAG_GIFT;
> > + }
> >
> > pipe->nrbufs++;
> > page_nr++;
> > @@ -255,6 +296,10 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
> > pipe->waiting_writers--;
> > }
> >
> > + if (vma)
> > + zap_page_range(vma, user_start, (user_end - user_start), NULL);
> > +
> > + up_read(¤t->mm->mmap_sem);
> > pipe_unlock(pipe);
> >
> > if (do_wakeup)
> > @@ -485,6 +530,7 @@ fill_it:
> >
> > spd.partial[page_nr].offset = loff;
> > spd.partial[page_nr].len = this_len;
> > + spd.partial[page_nr].useraddr = index << PAGE_CACHE_SHIFT;
> > len -= this_len;
> > loff = 0;
> > spd.nr_pages++;
> > @@ -656,6 +702,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
> > this_len = min_t(size_t, vec[i].iov_len, res);
> > spd.partial[i].offset = 0;
> > spd.partial[i].len = this_len;
> > + spd.partial[i].useraddr = (unsigned long)vec[i].iov_base;
> > if (!this_len) {
> > __free_page(spd.pages[i]);
> > spd.pages[i] = NULL;
> > @@ -1475,6 +1522,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
> >
> > partial[buffers].offset = off;
> > partial[buffers].len = plen;
> > + partial[buffers].useraddr = (unsigned long)base;
> > + base = (void*)((unsigned long)base + PAGE_SIZE);
> >
> > off = 0;
> > len -= plen;
> > diff --git a/include/linux/splice.h b/include/linux/splice.h
> > index 74575cb..56661e3 100644
> > --- a/include/linux/splice.h
> > +++ b/include/linux/splice.h
> > @@ -44,6 +44,7 @@ struct partial_page {
> > unsigned int offset;
> > unsigned int len;
> > unsigned long private;
> > + unsigned long useraddr;
> > };
> >
> > /*
> >
>
^ permalink raw reply [flat|nested] 30+ messages in thread* Re: [PATCH 1/2] vmsplice: unmap gifted pages for recipient
2013-10-17 13:48 ` Robert Jennings
@ 2013-10-18 8:10 ` Vlastimil Babka
-1 siblings, 0 replies; 30+ messages in thread
From: Vlastimil Babka @ 2013-10-18 8:10 UTC (permalink / raw)
To: linux-kernel, linux-fsdevel, linux-mm, Alexander Viro,
Rik van Riel, Andrea Arcangeli, Dave Hansen, Matt Helsley,
Anthony Liguori, Michael Roth, Lei Li, Leonardo Garcia
On 10/17/2013 03:48 PM, Robert Jennings wrote:
> * Vlastimil Babka (vbabka@suse.cz) wrote:
>> On 10/07/2013 10:21 PM, Robert C Jennings wrote:
>>> Introduce use of the unused SPLICE_F_MOVE flag for vmsplice to zap
>>> pages.
>>>
>>> When vmsplice is called with flags (SPLICE_F_GIFT | SPLICE_F_MOVE) the
>>> writer's gift'ed pages would be zapped. This patch supports further work
>>> to move vmsplice'd pages rather than copying them. That patch has the
>>> restriction that the page must not be mapped by the source for the move,
>>> otherwise it will fall back to copying the page.
>>>
>>> Signed-off-by: Matt Helsley <matt.helsley@gmail.com>
>>> Signed-off-by: Robert C Jennings <rcj@linux.vnet.ibm.com>
>>> ---
>>> Since the RFC went out I have coalesced the zap_page_range() call to
>>> operate on VMAs rather than calling this for each page. For a 256MB
>>> vmsplice this reduced the write side 50% from the RFC.
>>> ---
>>> fs/splice.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-
>>> include/linux/splice.h | 1 +
>>> 2 files changed, 51 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/fs/splice.c b/fs/splice.c
>>> index 3b7ee65..a62d61e 100644
>>> --- a/fs/splice.c
>>> +++ b/fs/splice.c
>>> @@ -188,12 +188,17 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
>>> {
>>> unsigned int spd_pages = spd->nr_pages;
>>> int ret, do_wakeup, page_nr;
>>> + struct vm_area_struct *vma;
>>> + unsigned long user_start, user_end;
>>>
>>> ret = 0;
>>> do_wakeup = 0;
>>> page_nr = 0;
>>> + vma = NULL;
>>> + user_start = user_end = 0;
>>>
>>> pipe_lock(pipe);
>>> + down_read(¤t->mm->mmap_sem);
>>
>> Seems like you could take the mmap_sem only when GIFT and MOVE is set.
>> Maybe it won't help that much for performance but at least serve as
>> documenting the reason it's needed?
>>
>> Vlastimil
>>
>
> I had been doing that previously but moving this outside the loop and
> acquiring it once did improve performance. I'll add a comment on
> down_read() as to the reason for taking this though.
>
> -Rob
Hm perhaps in light of recent patches to reduce mmap_sem usage only to
really critical regions, maybe it really shouldn't be taken at all if
not needed.
Vlastimil
>>> for (;;) {
>>> if (!pipe->readers) {
>>> @@ -212,8 +217,44 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
>>> buf->len = spd->partial[page_nr].len;
>>> buf->private = spd->partial[page_nr].private;
>>> buf->ops = spd->ops;
>>> - if (spd->flags & SPLICE_F_GIFT)
>>> + if (spd->flags & SPLICE_F_GIFT) {
>>> + unsigned long useraddr =
>>> + spd->partial[page_nr].useraddr;
>>> +
>>> + if ((spd->flags & SPLICE_F_MOVE) &&
>>> + !buf->offset &&
>>> + (buf->len == PAGE_SIZE)) {
>>> + /* Can move page aligned buf, gather
>>> + * requests to make a single
>>> + * zap_page_range() call per VMA
>>> + */
>>> + if (vma && (useraddr == user_end) &&
>>> + ((useraddr + PAGE_SIZE) <=
>>> + vma->vm_end)) {
>>> + /* same vma, no holes */
>>> + user_end += PAGE_SIZE;
>>> + } else {
>>> + if (vma)
>>> + zap_page_range(vma,
>>> + user_start,
>>> + (user_end -
>>> + user_start),
>>> + NULL);
>>> + vma = find_vma_intersection(
>>> + current->mm,
>>> + useraddr,
>>> + (useraddr +
>>> + PAGE_SIZE));
>>> + if (!IS_ERR_OR_NULL(vma)) {
>>> + user_start = useraddr;
>>> + user_end = (useraddr +
>>> + PAGE_SIZE);
>>> + } else
>>> + vma = NULL;
>>> + }
>>> + }
>>> buf->flags |= PIPE_BUF_FLAG_GIFT;
>>> + }
>>>
>>> pipe->nrbufs++;
>>> page_nr++;
>>> @@ -255,6 +296,10 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
>>> pipe->waiting_writers--;
>>> }
>>>
>>> + if (vma)
>>> + zap_page_range(vma, user_start, (user_end - user_start), NULL);
>>> +
>>> + up_read(¤t->mm->mmap_sem);
>>> pipe_unlock(pipe);
>>>
>>> if (do_wakeup)
>>> @@ -485,6 +530,7 @@ fill_it:
>>>
>>> spd.partial[page_nr].offset = loff;
>>> spd.partial[page_nr].len = this_len;
>>> + spd.partial[page_nr].useraddr = index << PAGE_CACHE_SHIFT;
>>> len -= this_len;
>>> loff = 0;
>>> spd.nr_pages++;
>>> @@ -656,6 +702,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
>>> this_len = min_t(size_t, vec[i].iov_len, res);
>>> spd.partial[i].offset = 0;
>>> spd.partial[i].len = this_len;
>>> + spd.partial[i].useraddr = (unsigned long)vec[i].iov_base;
>>> if (!this_len) {
>>> __free_page(spd.pages[i]);
>>> spd.pages[i] = NULL;
>>> @@ -1475,6 +1522,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
>>>
>>> partial[buffers].offset = off;
>>> partial[buffers].len = plen;
>>> + partial[buffers].useraddr = (unsigned long)base;
>>> + base = (void*)((unsigned long)base + PAGE_SIZE);
>>>
>>> off = 0;
>>> len -= plen;
>>> diff --git a/include/linux/splice.h b/include/linux/splice.h
>>> index 74575cb..56661e3 100644
>>> --- a/include/linux/splice.h
>>> +++ b/include/linux/splice.h
>>> @@ -44,6 +44,7 @@ struct partial_page {
>>> unsigned int offset;
>>> unsigned int len;
>>> unsigned long private;
>>> + unsigned long useraddr;
>>> };
>>>
>>> /*
>>>
>>
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
>
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 30+ messages in thread* Re: [PATCH 1/2] vmsplice: unmap gifted pages for recipient
@ 2013-10-18 8:10 ` Vlastimil Babka
0 siblings, 0 replies; 30+ messages in thread
From: Vlastimil Babka @ 2013-10-18 8:10 UTC (permalink / raw)
To: linux-kernel, linux-fsdevel, linux-mm, Alexander Viro,
Rik van Riel, Andrea Arcangeli, Dave Hansen, Matt Helsley,
Anthony Liguori, Michael Roth, Lei Li, Leonardo Garcia
On 10/17/2013 03:48 PM, Robert Jennings wrote:
> * Vlastimil Babka (vbabka@suse.cz) wrote:
>> On 10/07/2013 10:21 PM, Robert C Jennings wrote:
>>> Introduce use of the unused SPLICE_F_MOVE flag for vmsplice to zap
>>> pages.
>>>
>>> When vmsplice is called with flags (SPLICE_F_GIFT | SPLICE_F_MOVE) the
>>> writer's gift'ed pages would be zapped. This patch supports further work
>>> to move vmsplice'd pages rather than copying them. That patch has the
>>> restriction that the page must not be mapped by the source for the move,
>>> otherwise it will fall back to copying the page.
>>>
>>> Signed-off-by: Matt Helsley <matt.helsley@gmail.com>
>>> Signed-off-by: Robert C Jennings <rcj@linux.vnet.ibm.com>
>>> ---
>>> Since the RFC went out I have coalesced the zap_page_range() call to
>>> operate on VMAs rather than calling this for each page. For a 256MB
>>> vmsplice this reduced the write side 50% from the RFC.
>>> ---
>>> fs/splice.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-
>>> include/linux/splice.h | 1 +
>>> 2 files changed, 51 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/fs/splice.c b/fs/splice.c
>>> index 3b7ee65..a62d61e 100644
>>> --- a/fs/splice.c
>>> +++ b/fs/splice.c
>>> @@ -188,12 +188,17 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
>>> {
>>> unsigned int spd_pages = spd->nr_pages;
>>> int ret, do_wakeup, page_nr;
>>> + struct vm_area_struct *vma;
>>> + unsigned long user_start, user_end;
>>>
>>> ret = 0;
>>> do_wakeup = 0;
>>> page_nr = 0;
>>> + vma = NULL;
>>> + user_start = user_end = 0;
>>>
>>> pipe_lock(pipe);
>>> + down_read(¤t->mm->mmap_sem);
>>
>> Seems like you could take the mmap_sem only when GIFT and MOVE is set.
>> Maybe it won't help that much for performance but at least serve as
>> documenting the reason it's needed?
>>
>> Vlastimil
>>
>
> I had been doing that previously but moving this outside the loop and
> acquiring it once did improve performance. I'll add a comment on
> down_read() as to the reason for taking this though.
>
> -Rob
Hm perhaps in light of recent patches to reduce mmap_sem usage only to
really critical regions, maybe it really shouldn't be taken at all if
not needed.
Vlastimil
>>> for (;;) {
>>> if (!pipe->readers) {
>>> @@ -212,8 +217,44 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
>>> buf->len = spd->partial[page_nr].len;
>>> buf->private = spd->partial[page_nr].private;
>>> buf->ops = spd->ops;
>>> - if (spd->flags & SPLICE_F_GIFT)
>>> + if (spd->flags & SPLICE_F_GIFT) {
>>> + unsigned long useraddr =
>>> + spd->partial[page_nr].useraddr;
>>> +
>>> + if ((spd->flags & SPLICE_F_MOVE) &&
>>> + !buf->offset &&
>>> + (buf->len == PAGE_SIZE)) {
>>> + /* Can move page aligned buf, gather
>>> + * requests to make a single
>>> + * zap_page_range() call per VMA
>>> + */
>>> + if (vma && (useraddr == user_end) &&
>>> + ((useraddr + PAGE_SIZE) <=
>>> + vma->vm_end)) {
>>> + /* same vma, no holes */
>>> + user_end += PAGE_SIZE;
>>> + } else {
>>> + if (vma)
>>> + zap_page_range(vma,
>>> + user_start,
>>> + (user_end -
>>> + user_start),
>>> + NULL);
>>> + vma = find_vma_intersection(
>>> + current->mm,
>>> + useraddr,
>>> + (useraddr +
>>> + PAGE_SIZE));
>>> + if (!IS_ERR_OR_NULL(vma)) {
>>> + user_start = useraddr;
>>> + user_end = (useraddr +
>>> + PAGE_SIZE);
>>> + } else
>>> + vma = NULL;
>>> + }
>>> + }
>>> buf->flags |= PIPE_BUF_FLAG_GIFT;
>>> + }
>>>
>>> pipe->nrbufs++;
>>> page_nr++;
>>> @@ -255,6 +296,10 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
>>> pipe->waiting_writers--;
>>> }
>>>
>>> + if (vma)
>>> + zap_page_range(vma, user_start, (user_end - user_start), NULL);
>>> +
>>> + up_read(¤t->mm->mmap_sem);
>>> pipe_unlock(pipe);
>>>
>>> if (do_wakeup)
>>> @@ -485,6 +530,7 @@ fill_it:
>>>
>>> spd.partial[page_nr].offset = loff;
>>> spd.partial[page_nr].len = this_len;
>>> + spd.partial[page_nr].useraddr = index << PAGE_CACHE_SHIFT;
>>> len -= this_len;
>>> loff = 0;
>>> spd.nr_pages++;
>>> @@ -656,6 +702,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
>>> this_len = min_t(size_t, vec[i].iov_len, res);
>>> spd.partial[i].offset = 0;
>>> spd.partial[i].len = this_len;
>>> + spd.partial[i].useraddr = (unsigned long)vec[i].iov_base;
>>> if (!this_len) {
>>> __free_page(spd.pages[i]);
>>> spd.pages[i] = NULL;
>>> @@ -1475,6 +1522,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
>>>
>>> partial[buffers].offset = off;
>>> partial[buffers].len = plen;
>>> + partial[buffers].useraddr = (unsigned long)base;
>>> + base = (void*)((unsigned long)base + PAGE_SIZE);
>>>
>>> off = 0;
>>> len -= plen;
>>> diff --git a/include/linux/splice.h b/include/linux/splice.h
>>> index 74575cb..56661e3 100644
>>> --- a/include/linux/splice.h
>>> +++ b/include/linux/splice.h
>>> @@ -44,6 +44,7 @@ struct partial_page {
>>> unsigned int offset;
>>> unsigned int len;
>>> unsigned long private;
>>> + unsigned long useraddr;
>>> };
>>>
>>> /*
>>>
>>
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
>
^ permalink raw reply [flat|nested] 30+ messages in thread