* [patch] pipe: add support for shrinking and growing pipes @ 2010-05-19 16:45 Miklos Szeredi 2010-05-19 16:49 ` Linus Torvalds 2010-05-20 12:52 ` Andi Kleen 0 siblings, 2 replies; 47+ messages in thread From: Miklos Szeredi @ 2010-05-19 16:45 UTC (permalink / raw) To: linux-fsdevel; +Cc: linux-kernel, jens.axboe, torvalds, akpm This is a forward port to 2.6.34 of this patch from Jens: http://git.kernel.dk/?p=linux-2.6-block.git;a=commitdiff_plain;h=24547ac4d97bebb58caf9ce58bd507a95c812a3f One issue I see is that it's possible to grow pipes indefinitely. Should this be restricted to privileged users? Thanks, Miklos From: Jens Axboe <jens.axboe@oracle.com> This patch adds F_GETPIPE_SZ and F_SETPIPE_SZ fcntl() actions for growing and shrinking the size of a pipe and adjusts pipe.c and splice.c (and relay and network splice) usage to work with these larger (or smaller) pipes. Signed-off-by: Jens Axboe <jens.axboe@oracle.com> Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> --- fs/fcntl.c | 5 + fs/pipe.c | 107 ++++++++++++++++++++++++++--- fs/splice.c | 160 +++++++++++++++++++++++++++++--------------- include/asm-generic/fcntl.h | 7 + include/linux/pipe_fs_i.h | 11 +-- include/linux/splice.h | 7 + kernel/relay.c | 19 +++-- kernel/trace/trace.c | 60 +++++++++------- net/core/skbuff.c | 39 ++++++---- 9 files changed, 299 insertions(+), 116 deletions(-) Index: linux-2.6/fs/fcntl.c =================================================================== --- linux-2.6.orig/fs/fcntl.c 2010-05-14 18:23:05.000000000 +0200 +++ linux-2.6/fs/fcntl.c 2010-05-14 18:23:06.000000000 +0200 @@ -14,6 +14,7 @@ #include <linux/dnotify.h> #include <linux/slab.h> #include <linux/module.h> +#include <linux/pipe_fs_i.h> #include <linux/security.h> #include <linux/ptrace.h> #include <linux/signal.h> @@ -412,6 +413,10 @@ static long do_fcntl(int fd, unsigned in case F_NOTIFY: err = fcntl_dirnotify(fd, filp, arg); break; + case F_SETPIPE_SZ: + case F_GETPIPE_SZ: + err = pipe_fcntl(filp, cmd, arg); + break; default: break; } Index: linux-2.6/fs/pipe.c =================================================================== --- linux-2.6.orig/fs/pipe.c 2010-05-14 18:23:05.000000000 +0200 +++ linux-2.6/fs/pipe.c 2010-05-14 18:23:06.000000000 +0200 @@ -11,6 +11,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/fs.h> +#include <linux/log2.h> #include <linux/mount.h> #include <linux/pipe_fs_i.h> #include <linux/uio.h> @@ -390,7 +391,7 @@ redo: if (!buf->len) { buf->ops = NULL; ops->release(pipe, buf); - curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); + curbuf = (curbuf + 1) & (pipe->buffers - 1); pipe->curbuf = curbuf; pipe->nrbufs = --bufs; do_wakeup = 1; @@ -472,7 +473,7 @@ pipe_write(struct kiocb *iocb, const str chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ if (pipe->nrbufs && chars != 0) { int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & - (PIPE_BUFFERS-1); + (pipe->buffers - 1); struct pipe_buffer *buf = pipe->bufs + lastbuf; const struct pipe_buf_operations *ops = buf->ops; int offset = buf->offset + buf->len; @@ -518,8 +519,8 @@ redo1: break; } bufs = pipe->nrbufs; - if (bufs < PIPE_BUFFERS) { - int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); + if (bufs < pipe->buffers) { + int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1); struct pipe_buffer *buf = pipe->bufs + newbuf; struct page *page = pipe->tmp_page; char *src; @@ -580,7 +581,7 @@ redo2: if (!total_len) break; } - if (bufs < PIPE_BUFFERS) + if (bufs < pipe->buffers) continue; if (filp->f_flags & O_NONBLOCK) { if (!ret) @@ -640,7 +641,7 @@ static long pipe_ioctl(struct file *filp nrbufs = pipe->nrbufs; while (--nrbufs >= 0) { count += pipe->bufs[buf].len; - buf = (buf+1) & (PIPE_BUFFERS-1); + buf = (buf+1) & (pipe->buffers - 1); } mutex_unlock(&inode->i_mutex); @@ -671,7 +672,7 @@ pipe_poll(struct file *filp, poll_table } if (filp->f_mode & FMODE_WRITE) { - mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0; + mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0; /* * Most Unices do not set POLLERR for FIFOs but on Linux they * behave exactly like pipes for poll(). @@ -877,25 +878,32 @@ struct pipe_inode_info * alloc_pipe_info pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); if (pipe) { - init_waitqueue_head(&pipe->wait); - pipe->r_counter = pipe->w_counter = 1; - pipe->inode = inode; + pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL); + if (pipe->bufs) { + init_waitqueue_head(&pipe->wait); + pipe->r_counter = pipe->w_counter = 1; + pipe->inode = inode; + pipe->buffers = PIPE_DEF_BUFFERS; + return pipe; + } + kfree(pipe); } - return pipe; + return NULL; } void __free_pipe_info(struct pipe_inode_info *pipe) { int i; - for (i = 0; i < PIPE_BUFFERS; i++) { + for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) buf->ops->release(pipe, buf); } if (pipe->tmp_page) __free_page(pipe->tmp_page); + kfree(pipe->bufs); kfree(pipe); } @@ -1094,6 +1102,81 @@ SYSCALL_DEFINE1(pipe, int __user *, fild } /* + * Allocate a new array of pipe buffers and copy the info over. Returns the + * pipe size if successful, or return -ERROR on error. + */ +static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) +{ + struct pipe_buffer *bufs; + + /* + * Must be a power-of-2 currently + */ + if (!is_power_of_2(arg)) + return -EINVAL; + + /* + * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't + * expect a lot of shrink+grow operations, just free and allocate + * again like we would do for growing. If the pipe currently + * contains more buffers than arg, then return busy. + */ + if (arg < pipe->nrbufs) + return -EBUSY; + + bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL); + if (unlikely(!bufs)) + return -ENOMEM; + + /* + * The pipe array wraps around, so just start the new one at zero + * and adjust the indexes. + */ + if (pipe->nrbufs) { + const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1); + const unsigned int head = pipe->nrbufs - tail; + + if (head) + memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer)); + if (tail) + memcpy(bufs + head, pipe->bufs + pipe->curbuf, tail * sizeof(struct pipe_buffer)); + } + + pipe->curbuf = 0; + kfree(pipe->bufs); + pipe->bufs = bufs; + pipe->buffers = arg; + return arg; +} + +long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct pipe_inode_info *pipe; + long ret; + + pipe = file->f_path.dentry->d_inode->i_pipe; + if (!pipe) + return -EBADF; + + mutex_lock(&pipe->inode->i_mutex); + + switch (cmd) { + case F_SETPIPE_SZ: + ret = pipe_set_size(pipe, arg); + break; + case F_GETPIPE_SZ: + ret = pipe->buffers; + break; + default: + ret = -EINVAL; + break; + } + + mutex_unlock(&pipe->inode->i_mutex); + return ret; +} + +/* * pipefs should _never_ be mounted by userland - too much of security hassle, * no real gain from having the whole whorehouse mounted. So we don't need * any operations on the root directory. However, we need a non-trivial Index: linux-2.6/fs/splice.c =================================================================== --- linux-2.6.orig/fs/splice.c 2010-05-14 18:23:05.000000000 +0200 +++ linux-2.6/fs/splice.c 2010-05-14 18:23:06.000000000 +0200 @@ -193,8 +193,8 @@ ssize_t splice_to_pipe(struct pipe_inode break; } - if (pipe->nrbufs < PIPE_BUFFERS) { - int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); + if (pipe->nrbufs < pipe->buffers) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); struct pipe_buffer *buf = pipe->bufs + newbuf; buf->page = spd->pages[page_nr]; @@ -214,7 +214,7 @@ ssize_t splice_to_pipe(struct pipe_inode if (!--spd->nr_pages) break; - if (pipe->nrbufs < PIPE_BUFFERS) + if (pipe->nrbufs < pipe->buffers) continue; break; @@ -265,6 +265,36 @@ static void spd_release_page(struct spli page_cache_release(spd->pages[i]); } +/* + * Check if we need to grow the arrays holding pages and partial page + * descriptions. + */ +int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) +{ + if (pipe->buffers <= PIPE_DEF_BUFFERS) + return 0; + + spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL); + spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL); + + if (spd->pages && spd->partial) + return 0; + + kfree(spd->pages); + kfree(spd->partial); + return -ENOMEM; +} + +void splice_shrink_spd(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + if (pipe->buffers <= PIPE_DEF_BUFFERS) + return; + + kfree(spd->pages); + kfree(spd->partial); +} + static int __generic_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, @@ -272,29 +302,32 @@ __generic_file_splice_read(struct file * { struct address_space *mapping = in->f_mapping; unsigned int loff, nr_pages, req_pages; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; + struct page *pages_def[PIPE_DEF_BUFFERS]; + struct partial_page partial_def[PIPE_DEF_BUFFERS]; struct page *page; pgoff_t index, end_index; loff_t isize; int error, page_nr; struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, + .pages = pages_def, + .partial = partial_def, .flags = flags, .ops = &page_cache_pipe_buf_ops, .spd_release = spd_release_page, }; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + index = *ppos >> PAGE_CACHE_SHIFT; loff = *ppos & ~PAGE_CACHE_MASK; req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS); + nr_pages = min(req_pages, pipe->buffers); /* * Lookup the (hopefully) full range of pages we need. */ - spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); + spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages); index += spd.nr_pages; /* @@ -335,7 +368,7 @@ __generic_file_splice_read(struct file * unlock_page(page); } - pages[spd.nr_pages++] = page; + spd.pages[spd.nr_pages++] = page; index++; } @@ -356,7 +389,7 @@ __generic_file_splice_read(struct file * * this_len is the max we'll use from this page */ this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); - page = pages[page_nr]; + page = spd.pages[page_nr]; if (PageReadahead(page)) page_cache_async_readahead(mapping, &in->f_ra, in, @@ -393,8 +426,8 @@ __generic_file_splice_read(struct file * error = -ENOMEM; break; } - page_cache_release(pages[page_nr]); - pages[page_nr] = page; + page_cache_release(spd.pages[page_nr]); + spd.pages[page_nr] = page; } /* * page was already under io and is now done, great @@ -451,8 +484,8 @@ fill_it: len = this_len; } - partial[page_nr].offset = loff; - partial[page_nr].len = this_len; + spd.partial[page_nr].offset = loff; + spd.partial[page_nr].len = this_len; len -= this_len; loff = 0; spd.nr_pages++; @@ -464,12 +497,13 @@ fill_it: * we got, 'nr_pages' is how many pages are in the map. */ while (page_nr < nr_pages) - page_cache_release(pages[page_nr++]); + page_cache_release(spd.pages[page_nr++]); in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; if (spd.nr_pages) - return splice_to_pipe(pipe, &spd); + error = splice_to_pipe(pipe, &spd); + splice_shrink_spd(pipe, &spd); return error; } @@ -560,27 +594,36 @@ ssize_t default_file_splice_read(struct unsigned int nr_pages; unsigned int nr_freed; size_t offset; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; - struct iovec vec[PIPE_BUFFERS]; + struct page *pages_def[PIPE_DEF_BUFFERS]; + struct partial_page partial_def[PIPE_DEF_BUFFERS]; + struct iovec *vec; pgoff_t index; ssize_t res; size_t this_len; int error; int i; struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, + .pages = pages_def, + .partial = partial_def, .flags = flags, .ops = &default_pipe_buf_ops, .spd_release = spd_release_page, }; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL); + if (!vec) { + res = -ENOMEM; + goto out; + } + index = *ppos >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) { + for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) { struct page *page; page = alloc_page(GFP_USER); @@ -591,7 +634,7 @@ ssize_t default_file_splice_read(struct this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); vec[i].iov_base = (void __user *) page_address(page); vec[i].iov_len = this_len; - pages[i] = page; + spd.pages[i] = page; spd.nr_pages++; len -= this_len; offset = 0; @@ -610,11 +653,11 @@ ssize_t default_file_splice_read(struct nr_freed = 0; for (i = 0; i < spd.nr_pages; i++) { this_len = min_t(size_t, vec[i].iov_len, res); - partial[i].offset = 0; - partial[i].len = this_len; + spd.partial[i].offset = 0; + spd.partial[i].len = this_len; if (!this_len) { - __free_page(pages[i]); - pages[i] = NULL; + __free_page(spd.pages[i]); + spd.pages[i] = NULL; nr_freed++; } res -= this_len; @@ -625,13 +668,17 @@ ssize_t default_file_splice_read(struct if (res > 0) *ppos += res; +out: + splice_shrink_spd(pipe, &spd); + kfree(vec); return res; err: for (i = 0; i < spd.nr_pages; i++) - __free_page(pages[i]); + __free_page(spd.pages[i]); - return error; + res = error; + goto out; } EXPORT_SYMBOL(default_file_splice_read); @@ -784,7 +831,7 @@ int splice_from_pipe_feed(struct pipe_in if (!buf->len) { buf->ops = NULL; ops->release(pipe, buf); - pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; if (pipe->inode) sd->need_wakeup = true; @@ -1211,7 +1258,7 @@ out_release: * If we did an incomplete transfer we must release * the pipe buffers in question: */ - for (i = 0; i < PIPE_BUFFERS; i++) { + for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) { @@ -1371,7 +1418,8 @@ static long do_splice(struct file *in, l */ static int get_iovec_page_array(const struct iovec __user *iov, unsigned int nr_vecs, struct page **pages, - struct partial_page *partial, int aligned) + struct partial_page *partial, int aligned, + unsigned int pipe_buffers) { int buffers = 0, error = 0; @@ -1414,8 +1462,8 @@ static int get_iovec_page_array(const st break; npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (npages > PIPE_BUFFERS - buffers) - npages = PIPE_BUFFERS - buffers; + if (npages > pipe_buffers - buffers) + npages = pipe_buffers - buffers; error = get_user_pages_fast((unsigned long)base, npages, 0, &pages[buffers]); @@ -1450,7 +1498,7 @@ static int get_iovec_page_array(const st * or if we mapped the max number of pages that we have * room for. */ - if (error < npages || buffers == PIPE_BUFFERS) + if (error < npages || buffers == pipe_buffers) break; nr_vecs--; @@ -1593,26 +1641,34 @@ static long vmsplice_to_pipe(struct file unsigned long nr_segs, unsigned int flags) { struct pipe_inode_info *pipe; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; + struct page *pages_def[PIPE_DEF_BUFFERS]; + struct partial_page partial_def[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, + .pages = pages_def, + .partial = partial_def, .flags = flags, .ops = &user_page_pipe_buf_ops, .spd_release = spd_release_page, }; + long ret; pipe = pipe_info(file->f_path.dentry->d_inode); if (!pipe) return -EBADF; - spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, - flags & SPLICE_F_GIFT); + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages, + spd.partial, flags & SPLICE_F_GIFT, + pipe->buffers); if (spd.nr_pages <= 0) - return spd.nr_pages; + ret = spd.nr_pages; + else + ret = splice_to_pipe(pipe, &spd); - return splice_to_pipe(pipe, &spd); + splice_shrink_spd(pipe, &spd); + return ret; } /* @@ -1738,13 +1794,13 @@ static int opipe_prep(struct pipe_inode_ * Check ->nrbufs without the inode lock first. This function * is speculative anyways, so missing one is ok. */ - if (pipe->nrbufs < PIPE_BUFFERS) + if (pipe->nrbufs < pipe->buffers) return 0; ret = 0; pipe_lock(pipe); - while (pipe->nrbufs >= PIPE_BUFFERS) { + while (pipe->nrbufs >= pipe->buffers) { if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; @@ -1810,7 +1866,7 @@ retry: * Cannot make any progress, because either the input * pipe is empty or the output pipe is full. */ - if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) { + if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) { /* Already processed some buffers, break */ if (ret) break; @@ -1831,7 +1887,7 @@ retry: } ibuf = ipipe->bufs + ipipe->curbuf; - nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS; + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); obuf = opipe->bufs + nbuf; if (len >= ibuf->len) { @@ -1841,7 +1897,7 @@ retry: *obuf = *ibuf; ibuf->ops = NULL; opipe->nrbufs++; - ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS; + ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1); ipipe->nrbufs--; input_wakeup = true; } else { @@ -1914,11 +1970,11 @@ static int link_pipe(struct pipe_inode_i * If we have iterated all input buffers or ran out of * output room, break. */ - if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) + if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) break; - ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); - nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1)); + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); /* * Get a reference to this pipe buffer, Index: linux-2.6/include/asm-generic/fcntl.h =================================================================== --- linux-2.6.orig/include/asm-generic/fcntl.h 2010-05-14 18:23:05.000000000 +0200 +++ linux-2.6/include/asm-generic/fcntl.h 2010-05-14 18:23:06.000000000 +0200 @@ -104,6 +104,13 @@ #define F_GETOWN_EX 16 #endif +#ifndef F_SETPIPE_SZ /* for growing pipe sizes */ +#define F_SETPIPE_SZ 17 +#endif +#ifndef F_GETPIPE_SZ +#define F_GETPIPE_SZ 18 +#endif + #define F_OWNER_TID 0 #define F_OWNER_PID 1 #define F_OWNER_PGRP 2 Index: linux-2.6/include/linux/pipe_fs_i.h =================================================================== --- linux-2.6.orig/include/linux/pipe_fs_i.h 2010-05-14 18:23:05.000000000 +0200 +++ linux-2.6/include/linux/pipe_fs_i.h 2010-05-14 18:23:06.000000000 +0200 @@ -3,7 +3,7 @@ #define PIPEFS_MAGIC 0x50495045 -#define PIPE_BUFFERS (16) +#define PIPE_DEF_BUFFERS 16 #define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ @@ -44,17 +44,17 @@ struct pipe_buffer { **/ struct pipe_inode_info { wait_queue_head_t wait; - unsigned int nrbufs, curbuf; - struct page *tmp_page; + unsigned int nrbufs, curbuf, buffers; unsigned int readers; unsigned int writers; unsigned int waiting_writers; unsigned int r_counter; unsigned int w_counter; + struct page *tmp_page; struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; struct inode *inode; - struct pipe_buffer bufs[PIPE_BUFFERS]; + struct pipe_buffer *bufs; }; /* @@ -154,4 +154,7 @@ int generic_pipe_buf_confirm(struct pipe int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); +/* for F_SETPIPE_SZ and F_GETPIPE_SZ */ +long pipe_fcntl(struct file *, unsigned int, unsigned long arg); + #endif Index: linux-2.6/include/linux/splice.h =================================================================== --- linux-2.6.orig/include/linux/splice.h 2010-05-14 18:23:05.000000000 +0200 +++ linux-2.6/include/linux/splice.h 2010-05-14 18:23:06.000000000 +0200 @@ -82,4 +82,11 @@ extern ssize_t splice_to_pipe(struct pip extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, splice_direct_actor *); +/* + * for dynamic pipe sizing + */ +extern int splice_grow_spd(struct pipe_inode_info *, struct splice_pipe_desc *); +extern void splice_shrink_spd(struct pipe_inode_info *, + struct splice_pipe_desc *); + #endif Index: linux-2.6/kernel/relay.c =================================================================== --- linux-2.6.orig/kernel/relay.c 2010-05-14 18:23:05.000000000 +0200 +++ linux-2.6/kernel/relay.c 2010-05-14 18:23:06.000000000 +0200 @@ -1231,12 +1231,12 @@ static ssize_t subbuf_splice_actor(struc size_t read_subbuf = read_start / subbuf_size; size_t padding = rbuf->padding[read_subbuf]; size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; + struct page *pages_def[PIPE_DEF_BUFFERS]; + struct partial_page partial_def[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { - .pages = pages, + .pages = pages_def, .nr_pages = 0, - .partial = partial, + .partial = partial_def, .flags = flags, .ops = &relay_pipe_buf_ops, .spd_release = relay_page_release, @@ -1245,6 +1245,8 @@ static ssize_t subbuf_splice_actor(struc if (rbuf->subbufs_produced == rbuf->subbufs_consumed) return 0; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; /* * Adjust read len, if longer than what is available @@ -1255,7 +1257,7 @@ static ssize_t subbuf_splice_actor(struc subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; pidx = (read_start / PAGE_SIZE) % subbuf_pages; poff = read_start & ~PAGE_MASK; - nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS); + nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers); for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { unsigned int this_len, this_end, private; @@ -1289,16 +1291,19 @@ static ssize_t subbuf_splice_actor(struc } } + ret = 0; if (!spd.nr_pages) - return 0; + goto out; ret = *nonpad_ret = splice_to_pipe(pipe, &spd); if (ret < 0 || ret < total_len) - return ret; + goto out; if (read_start + ret == nonpad_end) ret += padding; +out: + splice_shrink_spd(pipe, &spd); return ret; } Index: linux-2.6/net/core/skbuff.c =================================================================== --- linux-2.6.orig/net/core/skbuff.c 2010-05-14 18:23:05.000000000 +0200 +++ linux-2.6/net/core/skbuff.c 2010-05-14 18:23:06.000000000 +0200 @@ -1417,12 +1417,13 @@ new_page: /* * Fill page/offset/length into spd, if it can hold more pages. */ -static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page, +static inline int spd_fill_page(struct splice_pipe_desc *spd, + struct pipe_inode_info *pipe, struct page *page, unsigned int *len, unsigned int offset, struct sk_buff *skb, int linear, struct sock *sk) { - if (unlikely(spd->nr_pages == PIPE_BUFFERS)) + if (unlikely(spd->nr_pages == pipe->buffers)) return 1; if (linear) { @@ -1457,8 +1458,9 @@ static inline void __segment_seek(struct static inline int __splice_segment(struct page *page, unsigned int poff, unsigned int plen, unsigned int *off, unsigned int *len, struct sk_buff *skb, - struct splice_pipe_desc *spd, int linear, - struct sock *sk) + struct splice_pipe_desc *spd, + struct pipe_inode_info *pipe, + int linear, struct sock *sk) { if (!*len) return 1; @@ -1481,7 +1483,7 @@ static inline int __splice_segment(struc /* the linear region may spread across several pages */ flen = min_t(unsigned int, flen, PAGE_SIZE - poff); - if (spd_fill_page(spd, page, &flen, poff, skb, linear, sk)) + if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk)) return 1; __segment_seek(&page, &poff, &plen, flen); @@ -1498,7 +1500,7 @@ static inline int __splice_segment(struc */ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, unsigned int *len, struct splice_pipe_desc *spd, - struct sock *sk) + struct pipe_inode_info *pipe, struct sock *sk) { int seg; @@ -1508,7 +1510,7 @@ static int __skb_splice_bits(struct sk_b if (__splice_segment(virt_to_page(skb->data), (unsigned long) skb->data & (PAGE_SIZE - 1), skb_headlen(skb), - offset, len, skb, spd, 1, sk)) + offset, len, skb, spd, pipe, 1, sk)) return 1; /* @@ -1518,7 +1520,7 @@ static int __skb_splice_bits(struct sk_b const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; if (__splice_segment(f->page, f->page_offset, f->size, - offset, len, skb, spd, 0, sk)) + offset, len, skb, spd, pipe, 0, sk)) return 1; } @@ -1535,23 +1537,27 @@ int skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, unsigned int tlen, unsigned int flags) { - struct partial_page partial[PIPE_BUFFERS]; - struct page *pages[PIPE_BUFFERS]; + struct partial_page partial_def[PIPE_DEF_BUFFERS]; + struct page *pages_def[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, + .pages = pages_def, + .partial = partial_def, .flags = flags, .ops = &sock_pipe_buf_ops, .spd_release = sock_spd_release, }; struct sk_buff *frag_iter; struct sock *sk = skb->sk; + int ret = 0; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; /* * __skb_splice_bits() only fails if the output has no room left, * so no point in going over the frag_list for the error case. */ - if (__skb_splice_bits(skb, &offset, &tlen, &spd, sk)) + if (__skb_splice_bits(skb, &offset, &tlen, &spd, pipe, sk)) goto done; else if (!tlen) goto done; @@ -1562,13 +1568,12 @@ int skb_splice_bits(struct sk_buff *skb, skb_walk_frags(skb, frag_iter) { if (!tlen) break; - if (__skb_splice_bits(frag_iter, &offset, &tlen, &spd, sk)) + if (__skb_splice_bits(frag_iter, &offset, &tlen, &spd, pipe, sk)) break; } done: if (spd.nr_pages) { - int ret; /* * Drop the socket lock, otherwise we have reverse @@ -1582,10 +1587,10 @@ done: release_sock(sk); ret = splice_to_pipe(pipe, &spd); lock_sock(sk); - return ret; } - return 0; + splice_shrink_spd(pipe, &spd); + return ret; } /** Index: linux-2.6/kernel/trace/trace.c =================================================================== --- linux-2.6.orig/kernel/trace/trace.c 2010-05-12 11:19:25.000000000 +0200 +++ linux-2.6/kernel/trace/trace.c 2010-05-14 18:27:42.000000000 +0200 @@ -3269,12 +3269,12 @@ static ssize_t tracing_splice_read_pipe( size_t len, unsigned int flags) { - struct page *pages[PIPE_BUFFERS]; - struct partial_page partial[PIPE_BUFFERS]; + struct page *pages_def[PIPE_DEF_BUFFERS]; + struct partial_page partial_def[PIPE_DEF_BUFFERS]; struct trace_iterator *iter = filp->private_data; struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, + .pages = pages_def, + .partial = partial_def, .nr_pages = 0, /* This gets updated below. */ .flags = flags, .ops = &tracing_pipe_buf_ops, @@ -3285,6 +3285,9 @@ static ssize_t tracing_splice_read_pipe( size_t rem; unsigned int i; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); if (unlikely(old_tracer != current_trace && current_trace)) { @@ -3315,23 +3318,23 @@ static ssize_t tracing_splice_read_pipe( trace_access_lock(iter->cpu_file); /* Fill as many pages as possible. */ - for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { - pages[i] = alloc_page(GFP_KERNEL); - if (!pages[i]) + for (i = 0, rem = len; i < pipe->buffers && rem; i++) { + spd.pages[i] = alloc_page(GFP_KERNEL); + if (!spd.pages[i]) break; rem = tracing_fill_pipe_page(rem, iter); /* Copy the data into the page, so we can start over. */ ret = trace_seq_to_buffer(&iter->seq, - page_address(pages[i]), + page_address(spd.pages[i]), iter->seq.len); if (ret < 0) { - __free_page(pages[i]); + __free_page(spd.pages[i]); break; } - partial[i].offset = 0; - partial[i].len = iter->seq.len; + spd.partial[i].offset = 0; + spd.partial[i].len = iter->seq.len; trace_seq_init(&iter->seq); } @@ -3342,12 +3345,14 @@ static ssize_t tracing_splice_read_pipe( spd.nr_pages = i; - return splice_to_pipe(pipe, &spd); + ret = splice_to_pipe(pipe, &spd); +out: + splice_shrink_spd(pipe, &spd); + return ret; out_err: mutex_unlock(&iter->mutex); - - return ret; + goto out; } static ssize_t @@ -3746,11 +3751,11 @@ tracing_buffers_splice_read(struct file unsigned int flags) { struct ftrace_buffer_info *info = file->private_data; - struct partial_page partial[PIPE_BUFFERS]; - struct page *pages[PIPE_BUFFERS]; + struct partial_page partial_def[PIPE_DEF_BUFFERS]; + struct page *pages_def[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, + .pages = pages_def, + .partial = partial_def, .flags = flags, .ops = &buffer_pipe_buf_ops, .spd_release = buffer_spd_release, @@ -3759,22 +3764,28 @@ tracing_buffers_splice_read(struct file int entries, size, i; size_t ret; + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + if (*ppos & (PAGE_SIZE - 1)) { WARN_ONCE(1, "Ftrace: previous read must page-align\n"); - return -EINVAL; + ret = -EINVAL; + goto out; } if (len & (PAGE_SIZE - 1)) { WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); - if (len < PAGE_SIZE) - return -EINVAL; + if (len < PAGE_SIZE) { + ret = -EINVAL; + goto out; + } len &= PAGE_MASK; } trace_access_lock(info->cpu); entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); - for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { + for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { struct page *page; int r; @@ -3829,11 +3840,12 @@ tracing_buffers_splice_read(struct file else ret = 0; /* TODO: block */ - return ret; + goto out; } ret = splice_to_pipe(pipe, &spd); - + splice_shrink_spd(pipe, &spd); +out: return ret; } ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-19 16:45 [patch] pipe: add support for shrinking and growing pipes Miklos Szeredi @ 2010-05-19 16:49 ` Linus Torvalds 2010-05-19 18:05 ` Jens Axboe 2010-05-23 5:30 ` Michael Kerrisk 2010-05-20 12:52 ` Andi Kleen 1 sibling, 2 replies; 47+ messages in thread From: Linus Torvalds @ 2010-05-19 16:49 UTC (permalink / raw) To: Miklos Szeredi; +Cc: linux-fsdevel, linux-kernel, jens.axboe, akpm On Wed, 19 May 2010, Miklos Szeredi wrote: > > One issue I see is that it's possible to grow pipes indefinitely. > Should this be restricted to privileged users? Yes. But perhaps only if it grows past the default (or perhaps "default*2" or similar). That way a normal user could shrink the pipe buffers, and then grow them again if he wants to. Oh, and I think you need to also require that there be at least two buffers. Otherwise we can't guarantee POSIX behavior, I think. Linus ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-19 16:49 ` Linus Torvalds @ 2010-05-19 18:05 ` Jens Axboe 2010-05-19 19:05 ` Jens Axboe 2010-05-23 5:30 ` Michael Kerrisk 1 sibling, 1 reply; 47+ messages in thread From: Jens Axboe @ 2010-05-19 18:05 UTC (permalink / raw) To: Linus Torvalds; +Cc: Miklos Szeredi, linux-fsdevel, linux-kernel, akpm On Wed, May 19 2010, Linus Torvalds wrote: > > > On Wed, 19 May 2010, Miklos Szeredi wrote: > > > > One issue I see is that it's possible to grow pipes indefinitely. > > Should this be restricted to privileged users? > > Yes. But perhaps only if it grows past the default (or perhaps "default*2" > or similar). That way a normal user could shrink the pipe buffers, and > then grow them again if he wants to. That's still a bit arbitrary, I don't think allowing default*2 only for non-root is going to be hugely interesting. But limiting makes sense, but lets at least allow a larger max limit for the normal user. I'm suspecting that the media application that wants to use this will not be running as root, and we don't make the feature properly available to the ones that want to use it, then we may as well not do it. Or we could expose a sysctl for instance that holds the max non-root size. And make that default to default*16 or something. How does that sound? > Oh, and I think you need to also require that there be at least two > buffers. Otherwise we can't guarantee POSIX behavior, I think. Good point, and at least that part is easily doable :-) -- Jens Axboe ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-19 18:05 ` Jens Axboe @ 2010-05-19 19:05 ` Jens Axboe 2010-05-20 8:33 ` Miklos Szeredi 0 siblings, 1 reply; 47+ messages in thread From: Jens Axboe @ 2010-05-19 19:05 UTC (permalink / raw) To: Linus Torvalds; +Cc: Miklos Szeredi, linux-fsdevel, linux-kernel, akpm On Wed, May 19 2010, Jens Axboe wrote: > On Wed, May 19 2010, Linus Torvalds wrote: > > > > > > On Wed, 19 May 2010, Miklos Szeredi wrote: > > > > > > One issue I see is that it's possible to grow pipes indefinitely. > > > Should this be restricted to privileged users? > > > > Yes. But perhaps only if it grows past the default (or perhaps "default*2" > > or similar). That way a normal user could shrink the pipe buffers, and > > then grow them again if he wants to. > > That's still a bit arbitrary, I don't think allowing default*2 only for > non-root is going to be hugely interesting. But limiting makes sense, > but lets at least allow a larger max limit for the normal user. I'm > suspecting that the media application that wants to use this will not be > running as root, and we don't make the feature properly available to the > ones that want to use it, then we may as well not do it. > > Or we could expose a sysctl for instance that holds the max non-root > size. And make that default to default*16 or something. How does that > sound? > > > Oh, and I think you need to also require that there be at least two > > buffers. Otherwise we can't guarantee POSIX behavior, I think. > > Good point, and at least that part is easily doable :-) So I updated the patch, that branch was pretty ancient... The fcntl pipe numbers were also screwed up, so got that fixed. New patch is here: http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=23dcb845246946aeda5a5e398c6911381ad28365 and I implemented a /proc/sys/fs/pipe-max-pages addon, that part is here: http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=6ef1fd2ea2fdba7a47706a46f0ca564ab2c46a52 Totally untested. -- Jens Axboe ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-19 19:05 ` Jens Axboe @ 2010-05-20 8:33 ` Miklos Szeredi 2010-05-20 8:37 ` Jens Axboe 0 siblings, 1 reply; 47+ messages in thread From: Miklos Szeredi @ 2010-05-20 8:33 UTC (permalink / raw) To: Jens Axboe; +Cc: torvalds, miklos, linux-fsdevel, linux-kernel, akpm On Wed, 19 May 2010, Jens Axboe wrote: > On Wed, May 19 2010, Jens Axboe wrote: > > On Wed, May 19 2010, Linus Torvalds wrote: > > > > > > > > > On Wed, 19 May 2010, Miklos Szeredi wrote: > > > > > > > > One issue I see is that it's possible to grow pipes indefinitely. > > > > Should this be restricted to privileged users? > > > > > > Yes. But perhaps only if it grows past the default (or perhaps "default*2" > > > or similar). That way a normal user could shrink the pipe buffers, and > > > then grow them again if he wants to. > > > > That's still a bit arbitrary, I don't think allowing default*2 only for > > non-root is going to be hugely interesting. But limiting makes sense, > > but lets at least allow a larger max limit for the normal user. I'm > > suspecting that the media application that wants to use this will not be > > running as root, and we don't make the feature properly available to the > > ones that want to use it, then we may as well not do it. > > > > Or we could expose a sysctl for instance that holds the max non-root > > size. And make that default to default*16 or something. How does that > > sound? > > > > > Oh, and I think you need to also require that there be at least two > > > buffers. Otherwise we can't guarantee POSIX behavior, I think. > > > > Good point, and at least that part is easily doable :-) > > So I updated the patch, that branch was pretty ancient... The fcntl pipe > numbers were also screwed up, so got that fixed. New patch is here: > > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=23dcb845246946aeda5a5e398c6911381ad28365 Not sure why you didn't take my updated patch. Yours misses conversion of kernel/trace/trace.c as well as some hunks below (that renaming the "pages" and "partial" arrays and a compile test would have revealed). Thanks, Miklos > From: Jens Axboe <jens.axboe@oracle.com> > Date: Wed, 19 May 2010 18:47:30 +0000 (+0200) > Subject: pipe: add support for shrinking and growing pipes > X-Git-Url: http://git.kernel.dk/?p=linux-2.6-block.git;a=commitdiff_plain;h=23dcb845246946aeda5a5e398c6911381ad28365 > > pipe: add support for shrinking and growing pipes > > This patch adds F_GETPIPE_SZ and F_SETPIPE_SZ fcntl() actions for > growing and shrinking the size of a pipe and adjusts pipe.c and splice.c > (and relay and network splice) usage to work with these larger (or smaller) > pipes. > > Signed-off-by: Jens Axboe <jens.axboe@oracle.com> > --- > > diff --git a/fs/fcntl.c b/fs/fcntl.c > index 452d02f..bcba960 100644 > --- a/fs/fcntl.c > +++ b/fs/fcntl.c > @@ -14,6 +14,7 @@ > #include <linux/dnotify.h> > #include <linux/slab.h> > #include <linux/module.h> > +#include <linux/pipe_fs_i.h> > #include <linux/security.h> > #include <linux/ptrace.h> > #include <linux/signal.h> > @@ -412,6 +413,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, > case F_NOTIFY: > err = fcntl_dirnotify(fd, filp, arg); > break; > + case F_SETPIPE_SZ: > + case F_GETPIPE_SZ: > + err = pipe_fcntl(filp, cmd, arg); > + break; > default: > break; > } > diff --git a/fs/pipe.c b/fs/pipe.c > index 37ba29f..054b8a6 100644 > --- a/fs/pipe.c > +++ b/fs/pipe.c > @@ -11,6 +11,7 @@ > #include <linux/module.h> > #include <linux/init.h> > #include <linux/fs.h> > +#include <linux/log2.h> > #include <linux/mount.h> > #include <linux/pipe_fs_i.h> > #include <linux/uio.h> > @@ -390,7 +391,7 @@ redo: > if (!buf->len) { > buf->ops = NULL; > ops->release(pipe, buf); > - curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); > + curbuf = (curbuf + 1) & (pipe->buffers - 1); > pipe->curbuf = curbuf; > pipe->nrbufs = --bufs; > do_wakeup = 1; > @@ -472,7 +473,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, > chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ > if (pipe->nrbufs && chars != 0) { > int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & > - (PIPE_BUFFERS-1); > + (pipe->buffers - 1); > struct pipe_buffer *buf = pipe->bufs + lastbuf; > const struct pipe_buf_operations *ops = buf->ops; > int offset = buf->offset + buf->len; > @@ -518,8 +519,8 @@ redo1: > break; > } > bufs = pipe->nrbufs; > - if (bufs < PIPE_BUFFERS) { > - int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); > + if (bufs < pipe->buffers) { > + int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1); > struct pipe_buffer *buf = pipe->bufs + newbuf; > struct page *page = pipe->tmp_page; > char *src; > @@ -580,7 +581,7 @@ redo2: > if (!total_len) > break; > } > - if (bufs < PIPE_BUFFERS) > + if (bufs < pipe->buffers) > continue; > if (filp->f_flags & O_NONBLOCK) { > if (!ret) > @@ -640,7 +641,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) > nrbufs = pipe->nrbufs; > while (--nrbufs >= 0) { > count += pipe->bufs[buf].len; > - buf = (buf+1) & (PIPE_BUFFERS-1); > + buf = (buf+1) & (pipe->buffers - 1); > } > mutex_unlock(&inode->i_mutex); > > @@ -671,7 +672,7 @@ pipe_poll(struct file *filp, poll_table *wait) > } > > if (filp->f_mode & FMODE_WRITE) { > - mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0; > + mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0; > /* > * Most Unices do not set POLLERR for FIFOs but on Linux they > * behave exactly like pipes for poll(). > @@ -877,25 +878,32 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode) > > pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); > if (pipe) { > - init_waitqueue_head(&pipe->wait); > - pipe->r_counter = pipe->w_counter = 1; > - pipe->inode = inode; > + pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL); > + if (pipe->bufs) { > + init_waitqueue_head(&pipe->wait); > + pipe->r_counter = pipe->w_counter = 1; > + pipe->inode = inode; > + pipe->buffers = PIPE_DEF_BUFFERS; > + return pipe; > + } > + kfree(pipe); > } > > - return pipe; > + return NULL; > } > > void __free_pipe_info(struct pipe_inode_info *pipe) > { > int i; > > - for (i = 0; i < PIPE_BUFFERS; i++) { > + for (i = 0; i < pipe->buffers; i++) { > struct pipe_buffer *buf = pipe->bufs + i; > if (buf->ops) > buf->ops->release(pipe, buf); > } > if (pipe->tmp_page) > __free_page(pipe->tmp_page); > + kfree(pipe->bufs); > kfree(pipe); > } > > @@ -1094,6 +1102,81 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes) > } > > /* > + * Allocate a new array of pipe buffers and copy the info over. Returns the > + * pipe size if successful, or return -ERROR on error. > + */ > +static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) > +{ > + struct pipe_buffer *bufs; > + > + /* > + * Must be a power-of-2 currently > + */ > + if (!is_power_of_2(arg)) > + return -EINVAL; > + > + /* > + * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't > + * expect a lot of shrink+grow operations, just free and allocate > + * again like we would do for growing. If the pipe currently > + * contains more buffers than arg, then return busy. > + */ > + if (arg < pipe->nrbufs) > + return -EBUSY; > + > + bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL); > + if (unlikely(!bufs)) > + return -ENOMEM; > + > + /* > + * The pipe array wraps around, so just start the new one at zero > + * and adjust the indexes. > + */ > + if (pipe->nrbufs) { > + const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1); > + const unsigned int head = pipe->nrbufs - tail; > + > + if (head) > + memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer)); > + if (tail) > + memcpy(bufs + head, pipe->bufs + pipe->curbuf, tail * sizeof(struct pipe_buffer)); > + } > + > + pipe->curbuf = 0; > + kfree(pipe->bufs); > + pipe->bufs = bufs; > + pipe->buffers = arg; > + return arg; > +} > + > +long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) > +{ > + struct pipe_inode_info *pipe; > + long ret; > + > + pipe = file->f_path.dentry->d_inode->i_pipe; > + if (!pipe) > + return -EBADF; > + > + mutex_lock(&pipe->inode->i_mutex); > + > + switch (cmd) { > + case F_SETPIPE_SZ: > + ret = pipe_set_size(pipe, arg); > + break; > + case F_GETPIPE_SZ: > + ret = pipe->buffers; > + break; > + default: > + ret = -EINVAL; > + break; > + } > + > + mutex_unlock(&pipe->inode->i_mutex); > + return ret; > +} > + > +/* > * pipefs should _never_ be mounted by userland - too much of security hassle, > * no real gain from having the whole whorehouse mounted. So we don't need > * any operations on the root directory. However, we need a non-trivial > diff --git a/fs/splice.c b/fs/splice.c > index 9313b61..39f907d 100644 > --- a/fs/splice.c > +++ b/fs/splice.c > @@ -193,8 +193,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, > break; > } > > - if (pipe->nrbufs < PIPE_BUFFERS) { > - int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); > + if (pipe->nrbufs < pipe->buffers) { > + int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); > struct pipe_buffer *buf = pipe->bufs + newbuf; > > buf->page = spd->pages[page_nr]; > @@ -214,7 +214,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, > > if (!--spd->nr_pages) > break; > - if (pipe->nrbufs < PIPE_BUFFERS) > + if (pipe->nrbufs < pipe->buffers) > continue; > > break; > @@ -265,6 +265,36 @@ static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) > page_cache_release(spd->pages[i]); > } > > +/* > + * Check if we need to grow the arrays holding pages and partial page > + * descriptions. > + */ > +int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) > +{ > + if (pipe->buffers <= PIPE_DEF_BUFFERS) > + return 0; > + > + spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL); > + spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL); > + > + if (spd->pages && spd->partial) > + return 0; > + > + kfree(spd->pages); > + kfree(spd->partial); > + return -ENOMEM; > +} > + > +void splice_shrink_spd(struct pipe_inode_info *pipe, > + struct splice_pipe_desc *spd) > +{ > + if (pipe->buffers <= PIPE_DEF_BUFFERS) > + return; > + > + kfree(spd->pages); > + kfree(spd->partial); > +} > + > static int > __generic_file_splice_read(struct file *in, loff_t *ppos, > struct pipe_inode_info *pipe, size_t len, > @@ -272,8 +302,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, > { > struct address_space *mapping = in->f_mapping; > unsigned int loff, nr_pages, req_pages; > - struct page *pages[PIPE_BUFFERS]; > - struct partial_page partial[PIPE_BUFFERS]; > + struct page *pages[PIPE_DEF_BUFFERS]; > + struct partial_page partial[PIPE_DEF_BUFFERS]; > struct page *page; > pgoff_t index, end_index; > loff_t isize; > @@ -286,15 +316,18 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, > .spd_release = spd_release_page, > }; > > + if (splice_grow_spd(pipe, &spd)) > + return -ENOMEM; > + > index = *ppos >> PAGE_CACHE_SHIFT; > loff = *ppos & ~PAGE_CACHE_MASK; > req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; > - nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS); > + nr_pages = min(req_pages, pipe->buffers); > > /* > * Lookup the (hopefully) full range of pages we need. > */ > - spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); > + spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages); > index += spd.nr_pages; > > /* > @@ -335,7 +368,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, > unlock_page(page); > } > > - pages[spd.nr_pages++] = page; > + spd.pages[spd.nr_pages++] = page; > index++; > } > > @@ -356,7 +389,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, > * this_len is the max we'll use from this page > */ > this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); > - page = pages[page_nr]; > + page = spd.pages[page_nr]; > > if (PageReadahead(page)) > page_cache_async_readahead(mapping, &in->f_ra, in, missing a hunk here: @@ -393,8 +426,8 @@ __generic_file_splice_read(struct file * error = -ENOMEM; break; } - page_cache_release(pages[page_nr]); - pages[page_nr] = page; + page_cache_release(spd.pages[page_nr]); + spd.pages[page_nr] = page; } /* * page was already under io and is now done, great > @@ -451,8 +484,8 @@ fill_it: > len = this_len; > } > > - partial[page_nr].offset = loff; > - partial[page_nr].len = this_len; > + spd.partial[page_nr].offset = loff; > + spd.partial[page_nr].len = this_len; > len -= this_len; > loff = 0; > spd.nr_pages++; > @@ -464,12 +497,13 @@ fill_it: > * we got, 'nr_pages' is how many pages are in the map. > */ > while (page_nr < nr_pages) > - page_cache_release(pages[page_nr++]); > + page_cache_release(spd.pages[page_nr++]); > in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; > > if (spd.nr_pages) > - return splice_to_pipe(pipe, &spd); > + error = splice_to_pipe(pipe, &spd); > > + splice_shrink_spd(pipe, &spd); > return error; > } > > @@ -560,9 +594,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, > unsigned int nr_pages; > unsigned int nr_freed; > size_t offset; > - struct page *pages[PIPE_BUFFERS]; > - struct partial_page partial[PIPE_BUFFERS]; > - struct iovec vec[PIPE_BUFFERS]; > + struct page *pages[PIPE_DEF_BUFFERS]; > + struct partial_page partial[PIPE_DEF_BUFFERS]; > + struct iovec *vec, __vec[PIPE_DEF_BUFFERS]; > pgoff_t index; > ssize_t res; > size_t this_len; > @@ -576,11 +610,22 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, > .spd_release = spd_release_page, > }; > > + if (splice_grow_spd(pipe, &spd)) > + return -ENOMEM; > + > + res = -ENOMEM; > + vec = __vec; > + if (pipe->buffers > PIPE_DEF_BUFFERS) { > + vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL); > + if (!vec) > + goto shrink_ret; > + } > + > index = *ppos >> PAGE_CACHE_SHIFT; > offset = *ppos & ~PAGE_CACHE_MASK; > nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; > > - for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) { > + for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) { > struct page *page; > > page = alloc_page(GFP_USER); and another two: @@ -591,7 +634,7 @@ ssize_t default_file_splice_read(struct this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); vec[i].iov_base = (void __user *) page_address(page); vec[i].iov_len = this_len; - pages[i] = page; + spd.pages[i] = page; spd.nr_pages++; len -= this_len; offset = 0; @@ -610,11 +653,11 @@ ssize_t default_file_splice_read(struct nr_freed = 0; for (i = 0; i < spd.nr_pages; i++) { this_len = min_t(size_t, vec[i].iov_len, res); - partial[i].offset = 0; - partial[i].len = this_len; + spd.partial[i].offset = 0; + spd.partial[i].len = this_len; if (!this_len) { - __free_page(pages[i]); - pages[i] = NULL; + __free_page(spd.pages[i]); + spd.pages[i] = NULL; nr_freed++; } res -= this_len; > @@ -625,13 +670,18 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, > if (res > 0) > *ppos += res; > > +shrink_ret: > + if (vec != __vec) > + kfree(vec); > + splice_shrink_spd(pipe, &spd); > return res; > > err: > for (i = 0; i < spd.nr_pages; i++) > __free_page(pages[i]); > > - return error; > + res = error; > + goto shrink_ret; > } > EXPORT_SYMBOL(default_file_splice_read); > > @@ -784,7 +834,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, > if (!buf->len) { > buf->ops = NULL; > ops->release(pipe, buf); > - pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); > + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); > pipe->nrbufs--; > if (pipe->inode) > sd->need_wakeup = true; > @@ -1211,7 +1261,7 @@ out_release: > * If we did an incomplete transfer we must release > * the pipe buffers in question: > */ > - for (i = 0; i < PIPE_BUFFERS; i++) { > + for (i = 0; i < pipe->buffers; i++) { > struct pipe_buffer *buf = pipe->bufs + i; > > if (buf->ops) { > @@ -1371,7 +1421,8 @@ static long do_splice(struct file *in, loff_t __user *off_in, > */ > static int get_iovec_page_array(const struct iovec __user *iov, > unsigned int nr_vecs, struct page **pages, > - struct partial_page *partial, int aligned) > + struct partial_page *partial, int aligned, > + unsigned int pipe_buffers) > { > int buffers = 0, error = 0; > > @@ -1414,8 +1465,8 @@ static int get_iovec_page_array(const struct iovec __user *iov, > break; > > npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; > - if (npages > PIPE_BUFFERS - buffers) > - npages = PIPE_BUFFERS - buffers; > + if (npages > pipe_buffers - buffers) > + npages = pipe_buffers - buffers; > > error = get_user_pages_fast((unsigned long)base, npages, > 0, &pages[buffers]); > @@ -1450,7 +1501,7 @@ static int get_iovec_page_array(const struct iovec __user *iov, > * or if we mapped the max number of pages that we have > * room for. > */ > - if (error < npages || buffers == PIPE_BUFFERS) > + if (error < npages || buffers == pipe_buffers) > break; > > nr_vecs--; > @@ -1593,8 +1644,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, > unsigned long nr_segs, unsigned int flags) > { > struct pipe_inode_info *pipe; > - struct page *pages[PIPE_BUFFERS]; > - struct partial_page partial[PIPE_BUFFERS]; > + struct page *pages[PIPE_DEF_BUFFERS]; > + struct partial_page partial[PIPE_DEF_BUFFERS]; > struct splice_pipe_desc spd = { > .pages = pages, > .partial = partial, > @@ -1602,17 +1653,25 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, > .ops = &user_page_pipe_buf_ops, > .spd_release = spd_release_page, > }; > + long ret; > > pipe = pipe_info(file->f_path.dentry->d_inode); > if (!pipe) > return -EBADF; > > - spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, > - flags & SPLICE_F_GIFT); > + if (splice_grow_spd(pipe, &spd)) > + return -ENOMEM; > + > + spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages, > + spd.partial, flags & SPLICE_F_GIFT, > + pipe->buffers); > if (spd.nr_pages <= 0) > - return spd.nr_pages; > + ret = spd.nr_pages; > + else > + ret = splice_to_pipe(pipe, &spd); > > - return splice_to_pipe(pipe, &spd); > + splice_shrink_spd(pipe, &spd); > + return ret; > } > > /* > @@ -1738,13 +1797,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) > * Check ->nrbufs without the inode lock first. This function > * is speculative anyways, so missing one is ok. > */ > - if (pipe->nrbufs < PIPE_BUFFERS) > + if (pipe->nrbufs < pipe->buffers) > return 0; > > ret = 0; > pipe_lock(pipe); > > - while (pipe->nrbufs >= PIPE_BUFFERS) { > + while (pipe->nrbufs >= pipe->buffers) { > if (!pipe->readers) { > send_sig(SIGPIPE, current, 0); > ret = -EPIPE; > @@ -1810,7 +1869,7 @@ retry: > * Cannot make any progress, because either the input > * pipe is empty or the output pipe is full. > */ > - if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) { > + if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) { > /* Already processed some buffers, break */ > if (ret) > break; > @@ -1831,7 +1890,7 @@ retry: > } > > ibuf = ipipe->bufs + ipipe->curbuf; > - nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS; > + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); > obuf = opipe->bufs + nbuf; > > if (len >= ibuf->len) { > @@ -1841,7 +1900,7 @@ retry: > *obuf = *ibuf; > ibuf->ops = NULL; > opipe->nrbufs++; > - ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS; > + ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1); > ipipe->nrbufs--; > input_wakeup = true; > } else { > @@ -1914,11 +1973,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, > * If we have iterated all input buffers or ran out of > * output room, break. > */ > - if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) > + if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) > break; > > - ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); > - nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); > + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1)); > + nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); > > /* > * Get a reference to this pipe buffer, > diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h > index 8603740..afc00af 100644 > --- a/include/linux/fcntl.h > +++ b/include/linux/fcntl.h > @@ -22,6 +22,12 @@ > #define F_NOTIFY (F_LINUX_SPECIFIC_BASE+2) > > /* > + * Set and get of pipe page size array > + */ > +#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7) > +#define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8) > + > +/* > * Types of directory notifications that may be requested. > */ > #define DN_ACCESS 0x00000001 /* File accessed */ > diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h > index b43a9e0..65f4282 100644 > --- a/include/linux/pipe_fs_i.h > +++ b/include/linux/pipe_fs_i.h > @@ -3,7 +3,7 @@ > > #define PIPEFS_MAGIC 0x50495045 > > -#define PIPE_BUFFERS (16) > +#define PIPE_DEF_BUFFERS 16 > > #define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ > #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ > @@ -44,17 +44,17 @@ struct pipe_buffer { > **/ > struct pipe_inode_info { > wait_queue_head_t wait; > - unsigned int nrbufs, curbuf; > - struct page *tmp_page; > + unsigned int nrbufs, curbuf, buffers; > unsigned int readers; > unsigned int writers; > unsigned int waiting_writers; > unsigned int r_counter; > unsigned int w_counter; > + struct page *tmp_page; > struct fasync_struct *fasync_readers; > struct fasync_struct *fasync_writers; > struct inode *inode; > - struct pipe_buffer bufs[PIPE_BUFFERS]; > + struct pipe_buffer *bufs; > }; > > /* > @@ -154,4 +154,7 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); > int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); > void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); > > +/* for F_SETPIPE_SZ and F_GETPIPE_SZ */ > +long pipe_fcntl(struct file *, unsigned int, unsigned long arg); > + > #endif > diff --git a/include/linux/splice.h b/include/linux/splice.h > index 18e7c7c..997c3b4 100644 > --- a/include/linux/splice.h > +++ b/include/linux/splice.h > @@ -82,4 +82,11 @@ extern ssize_t splice_to_pipe(struct pipe_inode_info *, > extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, > splice_direct_actor *); > > +/* > + * for dynamic pipe sizing > + */ > +extern int splice_grow_spd(struct pipe_inode_info *, struct splice_pipe_desc *); > +extern void splice_shrink_spd(struct pipe_inode_info *, > + struct splice_pipe_desc *); > + > #endif > diff --git a/kernel/relay.c b/kernel/relay.c > index 3d97f28..4268287 100644 > --- a/kernel/relay.c > +++ b/kernel/relay.c > @@ -1231,8 +1231,8 @@ static ssize_t subbuf_splice_actor(struct file *in, > size_t read_subbuf = read_start / subbuf_size; > size_t padding = rbuf->padding[read_subbuf]; > size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; > - struct page *pages[PIPE_BUFFERS]; > - struct partial_page partial[PIPE_BUFFERS]; > + struct page *pages[PIPE_DEF_BUFFERS]; > + struct partial_page partial[PIPE_DEF_BUFFERS]; > struct splice_pipe_desc spd = { > .pages = pages, > .nr_pages = 0, > @@ -1245,6 +1245,8 @@ static ssize_t subbuf_splice_actor(struct file *in, > > if (rbuf->subbufs_produced == rbuf->subbufs_consumed) > return 0; > + if (splice_grow_spd(pipe, &spd)) > + return -ENOMEM; > > /* > * Adjust read len, if longer than what is available > @@ -1255,7 +1257,7 @@ static ssize_t subbuf_splice_actor(struct file *in, > subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; > pidx = (read_start / PAGE_SIZE) % subbuf_pages; > poff = read_start & ~PAGE_MASK; > - nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS); > + nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers); > > for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { > unsigned int this_len, this_end, private; > @@ -1289,16 +1291,19 @@ static ssize_t subbuf_splice_actor(struct file *in, > } > } > > + ret = 0; > if (!spd.nr_pages) > - return 0; > + goto out; > > ret = *nonpad_ret = splice_to_pipe(pipe, &spd); > if (ret < 0 || ret < total_len) > - return ret; > + goto out; > > if (read_start + ret == nonpad_end) > ret += padding; > > +out: > + splice_shrink_spd(pipe, &spd); > return ret; > } > > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > index 93c4e06..9319817 100644 > --- a/net/core/skbuff.c > +++ b/net/core/skbuff.c > @@ -1417,12 +1417,13 @@ new_page: > /* > * Fill page/offset/length into spd, if it can hold more pages. > */ > -static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page, > +static inline int spd_fill_page(struct splice_pipe_desc *spd, > + struct pipe_inode_info *pipe, struct page *page, > unsigned int *len, unsigned int offset, > struct sk_buff *skb, int linear, > struct sock *sk) > { > - if (unlikely(spd->nr_pages == PIPE_BUFFERS)) > + if (unlikely(spd->nr_pages == pipe->buffers)) > return 1; > > if (linear) { > @@ -1458,7 +1459,8 @@ static inline int __splice_segment(struct page *page, unsigned int poff, > unsigned int plen, unsigned int *off, > unsigned int *len, struct sk_buff *skb, > struct splice_pipe_desc *spd, int linear, > - struct sock *sk) > + struct sock *sk, > + struct pipe_inode_info *pipe) > { > if (!*len) > return 1; > @@ -1481,7 +1483,7 @@ static inline int __splice_segment(struct page *page, unsigned int poff, > /* the linear region may spread across several pages */ > flen = min_t(unsigned int, flen, PAGE_SIZE - poff); > > - if (spd_fill_page(spd, page, &flen, poff, skb, linear, sk)) > + if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk)) > return 1; > > __segment_seek(&page, &poff, &plen, flen); > @@ -1496,9 +1498,9 @@ static inline int __splice_segment(struct page *page, unsigned int poff, > * Map linear and fragment data from the skb to spd. It reports failure if the > * pipe is full or if we already spliced the requested length. > */ > -static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, > - unsigned int *len, struct splice_pipe_desc *spd, > - struct sock *sk) > +static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, > + unsigned int *offset, unsigned int *len, > + struct splice_pipe_desc *spd, struct sock *sk) > { > int seg; > > @@ -1508,7 +1510,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, > if (__splice_segment(virt_to_page(skb->data), > (unsigned long) skb->data & (PAGE_SIZE - 1), > skb_headlen(skb), > - offset, len, skb, spd, 1, sk)) > + offset, len, skb, spd, 1, sk, pipe)) > return 1; > > /* > @@ -1518,7 +1520,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset, > const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; > > if (__splice_segment(f->page, f->page_offset, f->size, > - offset, len, skb, spd, 0, sk)) > + offset, len, skb, spd, 0, sk, pipe)) > return 1; > } > > @@ -1535,8 +1537,8 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, > struct pipe_inode_info *pipe, unsigned int tlen, > unsigned int flags) > { > - struct partial_page partial[PIPE_BUFFERS]; > - struct page *pages[PIPE_BUFFERS]; > + struct partial_page partial[PIPE_DEF_BUFFERS]; > + struct page *pages[PIPE_DEF_BUFFERS]; > struct splice_pipe_desc spd = { > .pages = pages, > .partial = partial, > @@ -1546,12 +1548,16 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, > }; > struct sk_buff *frag_iter; > struct sock *sk = skb->sk; > + int ret = 0; > + > + if (splice_grow_spd(pipe, &spd)) > + return -ENOMEM; > > /* > * __skb_splice_bits() only fails if the output has no room left, > * so no point in going over the frag_list for the error case. > */ > - if (__skb_splice_bits(skb, &offset, &tlen, &spd, sk)) > + if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk)) > goto done; > else if (!tlen) > goto done; > @@ -1562,14 +1568,12 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, > skb_walk_frags(skb, frag_iter) { > if (!tlen) > break; > - if (__skb_splice_bits(frag_iter, &offset, &tlen, &spd, sk)) > + if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk)) > break; > } > > done: > if (spd.nr_pages) { > - int ret; > - > /* > * Drop the socket lock, otherwise we have reverse > * locking dependencies between sk_lock and i_mutex > @@ -1582,10 +1586,10 @@ done: > release_sock(sk); > ret = splice_to_pipe(pipe, &spd); > lock_sock(sk); > - return ret; > } > > - return 0; > + splice_shrink_spd(pipe, &spd); > + return ret; > } > > /** ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-20 8:33 ` Miklos Szeredi @ 2010-05-20 8:37 ` Jens Axboe 2010-05-20 17:42 ` Linus Torvalds 0 siblings, 1 reply; 47+ messages in thread From: Jens Axboe @ 2010-05-20 8:37 UTC (permalink / raw) To: Miklos Szeredi; +Cc: torvalds, linux-fsdevel, linux-kernel, akpm On Thu, May 20 2010, Miklos Szeredi wrote: > On Wed, 19 May 2010, Jens Axboe wrote: > > On Wed, May 19 2010, Jens Axboe wrote: > > > On Wed, May 19 2010, Linus Torvalds wrote: > > > > > > > > > > > > On Wed, 19 May 2010, Miklos Szeredi wrote: > > > > > > > > > > One issue I see is that it's possible to grow pipes indefinitely. > > > > > Should this be restricted to privileged users? > > > > > > > > Yes. But perhaps only if it grows past the default (or perhaps "default*2" > > > > or similar). That way a normal user could shrink the pipe buffers, and > > > > then grow them again if he wants to. > > > > > > That's still a bit arbitrary, I don't think allowing default*2 only for > > > non-root is going to be hugely interesting. But limiting makes sense, > > > but lets at least allow a larger max limit for the normal user. I'm > > > suspecting that the media application that wants to use this will not be > > > running as root, and we don't make the feature properly available to the > > > ones that want to use it, then we may as well not do it. > > > > > > Or we could expose a sysctl for instance that holds the max non-root > > > size. And make that default to default*16 or something. How does that > > > sound? > > > > > > > Oh, and I think you need to also require that there be at least two > > > > buffers. Otherwise we can't guarantee POSIX behavior, I think. > > > > > > Good point, and at least that part is easily doable :-) > > > > So I updated the patch, that branch was pretty ancient... The fcntl pipe > > numbers were also screwed up, so got that fixed. New patch is here: > > > > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=23dcb845246946aeda5a5e398c6911381ad28365 > > Not sure why you didn't take my updated patch. Yours misses > conversion of kernel/trace/trace.c as well as some hunks below (that > renaming the "pages" and "partial" arrays and a compile test would > have revealed). Sorry, I missed that you had done extra changes. I had to rebase the whole thing anyway, and as I said it was totally untested. I'll get it updated. The main reason I didn't push it before is that I wasn't completely sure that we wanted to use fcntl() for this. But Linus doesn't seem to object on that side at least. -- Jens Axboe ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-20 8:37 ` Jens Axboe @ 2010-05-20 17:42 ` Linus Torvalds 2010-05-20 17:48 ` Jens Axboe 0 siblings, 1 reply; 47+ messages in thread From: Linus Torvalds @ 2010-05-20 17:42 UTC (permalink / raw) To: Jens Axboe; +Cc: Miklos Szeredi, linux-fsdevel, linux-kernel, akpm On Thu, 20 May 2010, Jens Axboe wrote: > > The main reason I didn't push it before is that I wasn't completely sure > that we wanted to use fcntl() for this. But Linus doesn't seem to object > on that side at least. Well, I don't object as long as there is some real use-case. If it's a "feature just for the sake of the feature" kind of theoretical "shouldn't we be able to do this" thing, I don't think we should do it. But it sounded like Miklos actually had a reason for the request. Linus ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-20 17:42 ` Linus Torvalds @ 2010-05-20 17:48 ` Jens Axboe 2010-05-21 17:13 ` Rick Sherm 0 siblings, 1 reply; 47+ messages in thread From: Jens Axboe @ 2010-05-20 17:48 UTC (permalink / raw) To: Linus Torvalds; +Cc: Miklos Szeredi, linux-fsdevel, linux-kernel, akpm On Thu, May 20 2010, Linus Torvalds wrote: > > > On Thu, 20 May 2010, Jens Axboe wrote: > > > > The main reason I didn't push it before is that I wasn't completely sure > > that we wanted to use fcntl() for this. But Linus doesn't seem to object > > on that side at least. > > Well, I don't object as long as there is some real use-case. If it's a > "feature just for the sake of the feature" kind of theoretical "shouldn't > we be able to do this" thing, I don't think we should do it. > > But it sounded like Miklos actually had a reason for the request. Yeah, I'd say his 4x results pretty much speak for themselves. I've observed splice being slower as well here for high bandwidth testing, the 64kb max buffer size ends up hurting there. So I'm very convinced that this is a good idea. I have been for a while, was just waiting for someone else to pop up with an independent need and results to help shove it in :-) -- Jens Axboe ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-20 17:48 ` Jens Axboe @ 2010-05-21 17:13 ` Rick Sherm 0 siblings, 0 replies; 47+ messages in thread From: Rick Sherm @ 2010-05-21 17:13 UTC (permalink / raw) To: Linus Torvalds, Jens Axboe Cc: Miklos Szeredi, linux-fsdevel, linux-kernel, akpm --- On Thu, 5/20/10, Jens Axboe <jens.axboe@oracle.com> wrote: > Yeah, I'd say his 4x results pretty much speak for > themselves. I've > observed splice being slower as well here for high > bandwidth testing, > the 64kb max buffer size ends up hurting there. So I'm very > convinced > that this is a good idea. I have been for a while, was just > waiting for > someone else to pop up with an independent need and results > to help > shove it in :-) > I see a very minor reduction in CPU usage(used same code referenced in this < http://marc.info/?l=linux-kernel&m=127143736527459&w=2 > thread but with new fcntl args): I'm attempting a 1G file-to-file copy: Case1) O_DIRECT copy time ./pcap_file_to_file direct 2048 0.000u 0.200s 0:01.93 10.3% 0+0k 2097176+2097152io 0pf+0w ========================== iostat -d /dev/sdb1 -x 1 ========================== Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 0.00 7.00 0.00 4096.00 0.00 585.14 0.13 17.71 3.86 2.70 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 0.00 1947.00 1949.00 1137816.00 1139072.00 584.42 5.18 1.33 0.25 96.70 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 0.00 1633.00 1641.00 955264.00 958848.00 584.64 4.87 1.49 0.26 86.50 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 Case2) with new fcntl-splice calls: 2.1) splice: 64 pipe buffers tcsh# time ./pcap_splice_2_splice direct 2048 64 0.008u 3.137s 0:07.38 42.4% 0+0k 2097176+2097152io 0pf+0w ========================== iostat -d /dev/sdb1 -x 1 ========================== Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 3397.00 257.00 1141.00 65048.00 129024.00 138.82 0.38 0.27 0.19 25.90 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 15733.00 1153.00 5027.00 295168.00 590464.00 143.31 0.96 0.16 0.13 81.30 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 15123.00 1093.00 4552.00 279808.00 559744.00 148.72 0.88 0.16 0.14 76.70 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 14163.00 1025.00 4296.00 262400.00 525056.00 147.99 0.93 0.18 0.15 80.40 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 14866.00 1074.00 4464.00 274944.00 549632.00 148.89 0.91 0.16 0.14 79.40 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 15169.00 1098.00 4596.00 281088.00 562304.00 148.12 0.93 0.16 0.14 79.50 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 16299.00 1216.00 5581.00 311296.00 622464.00 137.38 0.93 0.14 0.11 77.80 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 15505.00 1162.00 5418.00 297472.00 594944.00 135.63 0.92 0.14 0.12 77.10 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 1596.00 117.00 530.00 29952.00 60672.00 140.07 0.08 0.13 0.11 7.10 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 2.2) splice: 256 pipe buffers tcsh# time ./pcap_splice_2_splice direct 2048 256 0.002u 3.149s 0:07.52 41.7% 0+0k 2097176+2097152io 0pf+0w ========================== iostat -d /dev/sdb1 -x 1 ========================== Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 8022.00 606.00 2817.00 154392.00 308224.00 135.15 0.66 0.19 0.14 47.70 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 15958.00 1201.00 5662.00 307456.00 614938.00 134.40 1.02 0.15 0.12 83.40 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 15877.00 1170.00 5173.00 299520.00 598656.00 141.60 0.93 0.15 0.12 78.70 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 14945.00 1075.00 4415.00 275200.00 550784.00 150.45 0.97 0.18 0.15 84.30 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 14796.00 1066.00 4399.00 272896.00 545792.00 149.81 0.93 0.17 0.15 80.10 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 13767.00 993.00 4090.00 254208.00 508032.00 149.96 0.95 0.17 0.17 84.40 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 14830.00 1068.00 4404.00 273408.00 547200.00 149.96 0.94 0.18 0.15 82.80 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 14113.00 1016.00 4193.00 260096.00 520704.00 149.89 0.93 0.18 0.16 81.70 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 2.3) 512 pipe-buffers tcsh# time ./pcap_splice_2_splice direct 2048 512 0.005u 3.222s 0:06.91 46.5% 0+0k 2097176+2097152io 0pf+0w Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 0.00 0.00 1.00 0.00 16.00 16.00 0.00 0.00 0.00 0.00 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 4687.00 352.00 1576.00 89368.00 178048.00 138.70 0.40 0.21 0.15 28.80 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 15265.00 1133.00 5124.00 290048.00 579985.00 139.05 0.91 0.15 0.11 71.60 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 15080.00 1094.00 4621.00 280064.00 560384.00 147.06 0.70 0.12 0.12 66.10 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 16200.00 1189.00 5201.00 304384.00 608640.00 142.88 0.78 0.12 0.11 72.50 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 17009.00 1246.00 5419.00 318976.00 637952.00 143.58 0.75 0.11 0.10 69.10 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 16014.00 1170.00 5038.00 299520.00 598912.00 144.72 0.75 0.12 0.11 70.20 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 17363.00 1268.00 5470.00 324608.00 649472.00 144.57 0.74 0.11 0.10 66.90 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 10231.00 743.00 3161.00 190208.00 380928.00 146.30 0.44 0.11 0.10 40.50 Device: rrqm/s wrqm/s r/s w/s rsec/s wsec/s avgrq-sz avgqu-sz await svctm %util sdb1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-19 16:49 ` Linus Torvalds 2010-05-19 18:05 ` Jens Axboe @ 2010-05-23 5:30 ` Michael Kerrisk 2010-05-23 2:38 ` Andrew Morton 1 sibling, 1 reply; 47+ messages in thread From: Michael Kerrisk @ 2010-05-23 5:30 UTC (permalink / raw) To: Linus Torvalds Cc: Miklos Szeredi, linux-fsdevel, linux-kernel, jens.axboe, akpm, Michael Kerrisk Hi all, I see that this patch has hit Linus's git, so some questions On Wed, May 19, 2010 at 6:49 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote: > > > On Wed, 19 May 2010, Miklos Szeredi wrote: >> >> One issue I see is that it's possible to grow pipes indefinitely. >> Should this be restricted to privileged users? > > Yes. But perhaps only if it grows past the default (or perhaps "default*2" > or similar). That way a normal user could shrink the pipe buffers, and > then grow them again if he wants to. > > Oh, and I think you need to also require that there be at least two > buffers. Otherwise we can't guarantee POSIX behavior, I think. Is there any documentation (e.g., a man-pages patch) for these changes? The argument of the fcntl() operations is expressed in pages. I take it that this means that the semantics of the argument will very depending on the system page size? So for example, 2 on x86 will mean 8192 bytes, but will mean 32768 of ia64? That seems very weird. (And what about architectures where the page size is switchable?) Such changes in semantics should not be silent for the use, IMO. I'm not so sure about Linus's assertion above about needing at least two buffers (pages?) to guarantee POSIX behavior. Back in 2.16.10 and earlier, the buffer size was a page (4096 bytes) on x86-32, and we had POSIX-compliant behavior. Cheers, Michael -- Michael Kerrisk Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/ Author of "The Linux Programming Interface", http://blog.man7.org/ ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-23 5:30 ` Michael Kerrisk @ 2010-05-23 2:38 ` Andrew Morton 2010-05-23 5:52 ` Michael Kerrisk 0 siblings, 1 reply; 47+ messages in thread From: Andrew Morton @ 2010-05-23 2:38 UTC (permalink / raw) To: Michael Kerrisk Cc: Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel, jens.axboe On Sun, 23 May 2010 07:30:01 +0200 Michael Kerrisk <mtk.manpages@gmail.com> wrote: > Hi all, > > I see that this patch has hit Linus's git, so some questions > > On Wed, May 19, 2010 at 6:49 PM, Linus Torvalds > <torvalds@linux-foundation.org> wrote: > > > > > > On Wed, 19 May 2010, Miklos Szeredi wrote: > >> > >> One issue I see is that it's possible to grow pipes indefinitely. > >> Should this be restricted to privileged users? > > > > Yes. But perhaps only if it grows past the default (or perhaps "default*2" > > or similar). That way a normal user could shrink the pipe buffers, and > > then grow them again if he wants to. > > > > Oh, and I think you need to also require that there be at least two > > buffers. Otherwise we can't guarantee POSIX behavior, I think. > > Is there any documentation (e.g., a man-pages patch) for these changes? > > The argument of the fcntl() operations is expressed in pages. I take > it that this means that the semantics of the argument will very > depending on the system page size? So for example, 2 on x86 will mean > 8192 bytes, but will mean 32768 of ia64? That seems very weird. (And > what about architectures where the page size is switchable?) Such > changes in semantics should not be silent for the use, IMO. Well, there is getpagesize(). But I agree - this interface is just asking (x86) people to write non-portable code. otoh, if the arg was in bytes, they'd just hard-code "8192". They're clever like that. But we have gone to some lengths to avoid exposing things like PAGE_SIZE and HZ in procfs, so it makes sense to take the same approach to syscalls. ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-23 2:38 ` Andrew Morton @ 2010-05-23 5:52 ` Michael Kerrisk 2010-05-23 7:09 ` Jens Axboe 0 siblings, 1 reply; 47+ messages in thread From: Michael Kerrisk @ 2010-05-23 5:52 UTC (permalink / raw) To: Andrew Morton Cc: Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel, jens.axboe On Sun, May 23, 2010 at 4:38 AM, Andrew Morton <akpm@linux-foundation.org> wrote: > On Sun, 23 May 2010 07:30:01 +0200 Michael Kerrisk <mtk.manpages@gmail.com> wrote: > >> Hi all, >> >> I see that this patch has hit Linus's git, so some questions >> >> On Wed, May 19, 2010 at 6:49 PM, Linus Torvalds >> <torvalds@linux-foundation.org> wrote: >> > >> > >> > On Wed, 19 May 2010, Miklos Szeredi wrote: >> >> >> >> One issue I see is that it's possible to grow pipes indefinitely. >> >> Should this be restricted to privileged users? >> > >> > Yes. But perhaps only if it grows past the default (or perhaps "default*2" >> > or similar). That way a normal user could shrink the pipe buffers, and >> > then grow them again if he wants to. >> > >> > Oh, and I think you need to also require that there be at least two >> > buffers. Otherwise we can't guarantee POSIX behavior, I think. >> >> Is there any documentation (e.g., a man-pages patch) for these changes? >> >> The argument of the fcntl() operations is expressed in pages. I take >> it that this means that the semantics of the argument will very >> depending on the system page size? So for example, 2 on x86 will mean >> 8192 bytes, but will mean 32768 of ia64? That seems very weird. (And >> what about architectures where the page size is switchable?) Such >> changes in semantics should not be silent for the use, IMO. > > Well, there is getpagesize(). But I agree - this interface is just > asking (x86) people to write non-portable code. > > otoh, if the arg was in bytes, they'd just hard-code "8192". They're > clever like that. > > But we have gone to some lengths to avoid exposing things like > PAGE_SIZE and HZ in procfs, so it makes sense to take the same approach > to syscalls. Quite. All of the other memory-related APIs that I can think of require the user to express the info in bytes. (mlock(), remap_file_pages(), mmap(), mremap(), mprotect(), shmget(), and so on). Not doing the same for this interface is needlessly inconsistent. And while there will be the silly users you mention above, smart users will know how to do the right thing with a consistently designed interface. -- Michael Kerrisk Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/ Author of "The Linux Programming Interface" http://blog.man7.org/ ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-23 5:52 ` Michael Kerrisk @ 2010-05-23 7:09 ` Jens Axboe 2010-05-23 9:24 ` Michael Kerrisk 0 siblings, 1 reply; 47+ messages in thread From: Jens Axboe @ 2010-05-23 7:09 UTC (permalink / raw) To: mtk.manpages Cc: Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel On Sun, May 23 2010, Michael Kerrisk wrote: > On Sun, May 23, 2010 at 4:38 AM, Andrew Morton > <akpm@linux-foundation.org> wrote: > > On Sun, 23 May 2010 07:30:01 +0200 Michael Kerrisk <mtk.manpages@gmail.com> wrote: > > > >> Hi all, > >> > >> I see that this patch has hit Linus's git, so some questions > >> > >> On Wed, May 19, 2010 at 6:49 PM, Linus Torvalds > >> <torvalds@linux-foundation.org> wrote: > >> > > >> > > >> > On Wed, 19 May 2010, Miklos Szeredi wrote: > >> >> > >> >> One issue I see is that it's possible to grow pipes indefinitely. > >> >> Should this be restricted to privileged users? > >> > > >> > Yes. But perhaps only if it grows past the default (or perhaps "default*2" > >> > or similar). That way a normal user could shrink the pipe buffers, and > >> > then grow them again if he wants to. > >> > > >> > Oh, and I think you need to also require that there be at least two > >> > buffers. Otherwise we can't guarantee POSIX behavior, I think. > >> > >> Is there any documentation (e.g., a man-pages patch) for these changes? > >> > >> The argument of the fcntl() operations is expressed in pages. I take > >> it that this means that the semantics of the argument will very > >> depending on the system page size? So for example, 2 on x86 will mean > >> 8192 bytes, but will mean 32768 of ia64? That seems very weird. (And > >> what about architectures where the page size is switchable?) Such > >> changes in semantics should not be silent for the use, IMO. > > > > Well, there is getpagesize(). But I agree - this interface is just > > asking (x86) people to write non-portable code. > > > > otoh, if the arg was in bytes, they'd just hard-code "8192". They're > > clever like that. > > > > But we have gone to some lengths to avoid exposing things like > > PAGE_SIZE and HZ in procfs, so it makes sense to take the same approach > > to syscalls. > > Quite. All of the other memory-related APIs that I can think of > require the user to express the info in bytes. (mlock(), > remap_file_pages(), mmap(), mremap(), mprotect(), shmget(), and so > on). Not doing the same for this interface is needlessly inconsistent. > And while there will be the silly users you mention above, smart users > will know how to do the right thing with a consistently designed > interface. We can easily make F_GETPIPE_SZ return bytes, but I don't think passing in bytes to F_SETPIPE_SZ makes a lot of sense. The pipe array must be a power of 2 in pages. So the question is if that makes the API cleaner, passing in number of pages but returning bytes? Or pass in bytes all around, but have F_SETPIPE_SZ round to the nearest multiple of pow2 in pages if need be. Then it would return a size at least what was passed in, or error. -- Jens Axboe ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-23 7:09 ` Jens Axboe @ 2010-05-23 9:24 ` Michael Kerrisk 2010-05-23 17:47 ` Jens Axboe 0 siblings, 1 reply; 47+ messages in thread From: Michael Kerrisk @ 2010-05-23 9:24 UTC (permalink / raw) To: Jens Axboe Cc: Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel On Sun, May 23, 2010 at 9:09 AM, Jens Axboe <jens.axboe@oracle.com> wrote: > On Sun, May 23 2010, Michael Kerrisk wrote: >> On Sun, May 23, 2010 at 4:38 AM, Andrew Morton >> <akpm@linux-foundation.org> wrote: >> > On Sun, 23 May 2010 07:30:01 +0200 Michael Kerrisk <mtk.manpages@gmail.com> wrote: >> > >> >> Hi all, >> >> >> >> I see that this patch has hit Linus's git, so some questions >> >> >> >> On Wed, May 19, 2010 at 6:49 PM, Linus Torvalds >> >> <torvalds@linux-foundation.org> wrote: >> >> > >> >> > >> >> > On Wed, 19 May 2010, Miklos Szeredi wrote: >> >> >> >> >> >> One issue I see is that it's possible to grow pipes indefinitely. >> >> >> Should this be restricted to privileged users? >> >> > >> >> > Yes. But perhaps only if it grows past the default (or perhaps "default*2" >> >> > or similar). That way a normal user could shrink the pipe buffers, and >> >> > then grow them again if he wants to. >> >> > >> >> > Oh, and I think you need to also require that there be at least two >> >> > buffers. Otherwise we can't guarantee POSIX behavior, I think. >> >> >> >> Is there any documentation (e.g., a man-pages patch) for these changes? >> >> >> >> The argument of the fcntl() operations is expressed in pages. I take >> >> it that this means that the semantics of the argument will very >> >> depending on the system page size? So for example, 2 on x86 will mean >> >> 8192 bytes, but will mean 32768 of ia64? That seems very weird. (And >> >> what about architectures where the page size is switchable?) Such >> >> changes in semantics should not be silent for the use, IMO. >> > >> > Well, there is getpagesize(). But I agree - this interface is just >> > asking (x86) people to write non-portable code. >> > >> > otoh, if the arg was in bytes, they'd just hard-code "8192". They're >> > clever like that. >> > >> > But we have gone to some lengths to avoid exposing things like >> > PAGE_SIZE and HZ in procfs, so it makes sense to take the same approach >> > to syscalls. >> >> Quite. All of the other memory-related APIs that I can think of >> require the user to express the info in bytes. (mlock(), >> remap_file_pages(), mmap(), mremap(), mprotect(), shmget(), and so >> on). Not doing the same for this interface is needlessly inconsistent. >> And while there will be the silly users you mention above, smart users >> will know how to do the right thing with a consistently designed >> interface. > > We can easily make F_GETPIPE_SZ return bytes, but I don't think passing > in bytes to F_SETPIPE_SZ makes a lot of sense. The pipe array must be a > power of 2 in pages. So the question is if that makes the API cleaner, > passing in number of pages but returning bytes? Or pass in bytes all > around, but have F_SETPIPE_SZ round to the nearest multiple of pow2 in > pages if need be. Then it would return a size at least what was passed > in, or error. I'd recommend this: Pass it in and out in bytes. Don't round to a power of 2. Require the user to know what they are doing. Give an error if the user doesn't supply a power-of-2 * page-size for F_SETPIPE_SZ. (Again, consider the case of architectures with switchable page sizes.) Cheers, Michael -- Michael Kerrisk Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/ Author of "The Linux Programming Interface" http://blog.man7.org/ ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-23 9:24 ` Michael Kerrisk @ 2010-05-23 17:47 ` Jens Axboe 2010-05-24 1:43 ` OGAWA Hirofumi 0 siblings, 1 reply; 47+ messages in thread From: Jens Axboe @ 2010-05-23 17:47 UTC (permalink / raw) To: mtk.manpages Cc: Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel On Sun, May 23 2010, Michael Kerrisk wrote: > On Sun, May 23, 2010 at 9:09 AM, Jens Axboe <jens.axboe@oracle.com> wrote: > > On Sun, May 23 2010, Michael Kerrisk wrote: > >> On Sun, May 23, 2010 at 4:38 AM, Andrew Morton > >> <akpm@linux-foundation.org> wrote: > >> > On Sun, 23 May 2010 07:30:01 +0200 Michael Kerrisk <mtk.manpages@gmail.com> wrote: > >> > > >> >> Hi all, > >> >> > >> >> I see that this patch has hit Linus's git, so some questions > >> >> > >> >> On Wed, May 19, 2010 at 6:49 PM, Linus Torvalds > >> >> <torvalds@linux-foundation.org> wrote: > >> >> > > >> >> > > >> >> > On Wed, 19 May 2010, Miklos Szeredi wrote: > >> >> >> > >> >> >> One issue I see is that it's possible to grow pipes indefinitely. > >> >> >> Should this be restricted to privileged users? > >> >> > > >> >> > Yes. But perhaps only if it grows past the default (or perhaps "default*2" > >> >> > or similar). That way a normal user could shrink the pipe buffers, and > >> >> > then grow them again if he wants to. > >> >> > > >> >> > Oh, and I think you need to also require that there be at least two > >> >> > buffers. Otherwise we can't guarantee POSIX behavior, I think. > >> >> > >> >> Is there any documentation (e.g., a man-pages patch) for these changes? > >> >> > >> >> The argument of the fcntl() operations is expressed in pages. I take > >> >> it that this means that the semantics of the argument will very > >> >> depending on the system page size? So for example, 2 on x86 will mean > >> >> 8192 bytes, but will mean 32768 of ia64? That seems very weird. (And > >> >> what about architectures where the page size is switchable?) Such > >> >> changes in semantics should not be silent for the use, IMO. > >> > > >> > Well, there is getpagesize(). But I agree - this interface is just > >> > asking (x86) people to write non-portable code. > >> > > >> > otoh, if the arg was in bytes, they'd just hard-code "8192". They're > >> > clever like that. > >> > > >> > But we have gone to some lengths to avoid exposing things like > >> > PAGE_SIZE and HZ in procfs, so it makes sense to take the same approach > >> > to syscalls. > >> > >> Quite. All of the other memory-related APIs that I can think of > >> require the user to express the info in bytes. (mlock(), > >> remap_file_pages(), mmap(), mremap(), mprotect(), shmget(), and so > >> on). Not doing the same for this interface is needlessly inconsistent. > >> And while there will be the silly users you mention above, smart users > >> will know how to do the right thing with a consistently designed > >> interface. > > > > We can easily make F_GETPIPE_SZ return bytes, but I don't think passing > > in bytes to F_SETPIPE_SZ makes a lot of sense. The pipe array must be a > > power of 2 in pages. So the question is if that makes the API cleaner, > > passing in number of pages but returning bytes? Or pass in bytes all > > around, but have F_SETPIPE_SZ round to the nearest multiple of pow2 in > > pages if need be. Then it would return a size at least what was passed > > in, or error. > > I'd recommend this: Pass it in and out in bytes. Don't round to a > power of 2. Require the user to know what they are doing. Give an > error if the user doesn't supply a power-of-2 * page-size for > F_SETPIPE_SZ. (Again, consider the case of architectures with > switchable page sizes.) But is there much point in erroring on an incorrect size? If the application says "I need at least 120kb of space in there", kernel returns "OK, you got 128kb". Would returning -1/EINVAL for that case really make a better API? Doesn't seem like it to me. -- Jens Axboe ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-23 17:47 ` Jens Axboe @ 2010-05-24 1:43 ` OGAWA Hirofumi 2010-05-24 4:43 ` Michael Kerrisk 2010-05-24 7:04 ` Jens Axboe 0 siblings, 2 replies; 47+ messages in thread From: OGAWA Hirofumi @ 2010-05-24 1:43 UTC (permalink / raw) To: Jens Axboe Cc: mtk.manpages, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel Jens Axboe <jens.axboe@oracle.com> writes: >> > We can easily make F_GETPIPE_SZ return bytes, but I don't think passing >> > in bytes to F_SETPIPE_SZ makes a lot of sense. The pipe array must be a >> > power of 2 in pages. So the question is if that makes the API cleaner, >> > passing in number of pages but returning bytes? Or pass in bytes all >> > around, but have F_SETPIPE_SZ round to the nearest multiple of pow2 in >> > pages if need be. Then it would return a size at least what was passed >> > in, or error. I really think "power of 2 in pages" is simply current implementation detail, not detail of pipe API. >> I'd recommend this: Pass it in and out in bytes. Don't round to a >> power of 2. Require the user to know what they are doing. Give an >> error if the user doesn't supply a power-of-2 * page-size for >> F_SETPIPE_SZ. (Again, consider the case of architectures with >> switchable page sizes.) > > But is there much point in erroring on an incorrect size? If the > application says "I need at least 120kb of space in there", kernel > returns "OK, you got 128kb". Would returning -1/EINVAL for that case > really make a better API? Doesn't seem like it to me. FWIW, my first impression of this was setsockopt(SO_RCV/SNDBUF) of unix socket. Well, API itself wouldn't say "at least this size" or "exactly this size", so, in here, important thing is consistency of interfaces, I think. (And the both is sane API at least for me if those had consistency in the system.) Well, so how about set/get in bytes, and kernel will set "at least specified size" actually like setsockopt(SO_RCV/SNDBUF)? Thanks. -- OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-24 1:43 ` OGAWA Hirofumi @ 2010-05-24 4:43 ` Michael Kerrisk 2010-05-24 7:05 ` Jens Axboe 2010-05-24 7:04 ` Jens Axboe 1 sibling, 1 reply; 47+ messages in thread From: Michael Kerrisk @ 2010-05-24 4:43 UTC (permalink / raw) To: OGAWA Hirofumi Cc: Jens Axboe, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel On Mon, May 24, 2010 at 3:43 AM, OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> wrote: > Jens Axboe <jens.axboe@oracle.com> writes: > >>> > We can easily make F_GETPIPE_SZ return bytes, but I don't think passing >>> > in bytes to F_SETPIPE_SZ makes a lot of sense. The pipe array must be a >>> > power of 2 in pages. So the question is if that makes the API cleaner, >>> > passing in number of pages but returning bytes? Or pass in bytes all >>> > around, but have F_SETPIPE_SZ round to the nearest multiple of pow2 in >>> > pages if need be. Then it would return a size at least what was passed >>> > in, or error. > > I really think "power of 2 in pages" is simply current implementation > detail, not detail of pipe API. That's a good point. >>> I'd recommend this: Pass it in and out in bytes. Don't round to a >>> power of 2. Require the user to know what they are doing. Give an >>> error if the user doesn't supply a power-of-2 * page-size for >>> F_SETPIPE_SZ. (Again, consider the case of architectures with >>> switchable page sizes.) >> >> But is there much point in erroring on an incorrect size? If the >> application says "I need at least 120kb of space in there", kernel >> returns "OK, you got 128kb". Would returning -1/EINVAL for that case >> really make a better API? Doesn't seem like it to me. > > FWIW, my first impression of this was setsockopt(SO_RCV/SNDBUF) of unix > socket. Well, API itself wouldn't say "at least this size" or "exactly > this size", so, in here, important thing is consistency of interfaces, I > think. (And the both is sane API at least for me if those had > consistency in the system.) > > Well, so how about set/get in bytes, and kernel will set "at least > specified size" actually like setsockopt(SO_RCV/SNDBUF)? The "at least" idea makes sense. So, I'd change my recommendation to: Pass the buffer size in and out in bytes (for consistency with other APIs). Round the input (F_SETPIPE_SZ) value up as required by the implementation. For the output (F_GETPIPE_SZ) value do one of the following: a) Return the value given on input. b) Return the rounded up value actually used by the kernel. I suspect (b) might be more useful: if an application cares enough about pipe size to want to change it, then at least some such applications might care to know exactly the size that the kernel used. (And: I can't see any downside to (b).) One other comment about the interface. We have if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages) return -EINVAL; The usual error on a capability denied is EPERM. Please change. Cheers, Michael -- Michael Kerrisk Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/ Author of "The Linux Programming Interface" http://blog.man7.org/ -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-24 4:43 ` Michael Kerrisk @ 2010-05-24 7:05 ` Jens Axboe 2010-05-24 7:27 ` Michael Kerrisk 0 siblings, 1 reply; 47+ messages in thread From: Jens Axboe @ 2010-05-24 7:05 UTC (permalink / raw) To: mtk.manpages Cc: OGAWA Hirofumi, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel > The "at least" idea makes sense. So, I'd change my recommendation to: > > Pass the buffer size in and out in bytes (for consistency with other > APIs). Round the input (F_SETPIPE_SZ) value up as required by the > implementation. For the output (F_GETPIPE_SZ) value do one of the > following: > a) Return the value given on input. > b) Return the rounded up value actually used by the kernel. > > I suspect (b) might be more useful: if an application cares enough > about pipe size to want to change it, then at least some such > applications might care to know exactly the size that the kernel used. > (And: I can't see any downside to (b).) b definitely, since it's the real size (plus then we don't have to track the passed in size). > One other comment about the interface. We have > > if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages) > return -EINVAL; > > The usual error on a capability denied is EPERM. Please change. Right, that looks like a thinko. I'll submit a patch changing it to bytes and the agreed API and fix this -Eerror. Thanks for your comments and suggestions! -- Jens Axboe ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-24 7:05 ` Jens Axboe @ 2010-05-24 7:27 ` Michael Kerrisk 2010-05-24 17:35 ` Jens Axboe 0 siblings, 1 reply; 47+ messages in thread From: Michael Kerrisk @ 2010-05-24 7:27 UTC (permalink / raw) To: Jens Axboe Cc: OGAWA Hirofumi, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel On Mon, May 24, 2010 at 9:05 AM, Jens Axboe <jens.axboe@oracle.com> wrote: >> The "at least" idea makes sense. So, I'd change my recommendation to: >> >> Pass the buffer size in and out in bytes (for consistency with other >> APIs). Round the input (F_SETPIPE_SZ) value up as required by the >> implementation. For the output (F_GETPIPE_SZ) value do one of the >> following: >> a) Return the value given on input. >> b) Return the rounded up value actually used by the kernel. >> >> I suspect (b) might be more useful: if an application cares enough >> about pipe size to want to change it, then at least some such >> applications might care to know exactly the size that the kernel used. >> (And: I can't see any downside to (b).) > > b definitely, since it's the real size (plus then we don't have to track > the passed in size). Okay. >> One other comment about the interface. We have >> >> if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages) >> return -EINVAL; >> >> The usual error on a capability denied is EPERM. Please change. > > Right, that looks like a thinko. > > I'll submit a patch changing it to bytes and the agreed API and fix this > -Eerror. Thanks for your comments and suggestions! Thanks. And of course you are welcome. (Please CC linux-api@vger on this patche (and all patches that change the API/ABI.) Cheers, Michael -- Michael Kerrisk Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/ Author of "The Linux Programming Interface" http://blog.man7.org/ -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-24 7:27 ` Michael Kerrisk @ 2010-05-24 17:35 ` Jens Axboe 2010-05-24 17:52 ` Michael Kerrisk 0 siblings, 1 reply; 47+ messages in thread From: Jens Axboe @ 2010-05-24 17:35 UTC (permalink / raw) To: mtk.manpages Cc: OGAWA Hirofumi, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel On Mon, May 24 2010, Michael Kerrisk wrote: > > Right, that looks like a thinko. > > > > I'll submit a patch changing it to bytes and the agreed API and fix this > > -Eerror. Thanks for your comments and suggestions! > > Thanks. And of course you are welcome. (Please CC linux-api@vger on > this patche (and all patches that change the API/ABI.) The first change is this: http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=0191f8697bbdfefcd36e7b8dc3eeddfe82893e4b and the one dealing with the pages vs bytes API is this: http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=b9598db3401282bb27b4aef77e3eee12015f7f29 Not tested yet, will do so before sending in of course. -- Jens Axboe ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-24 17:35 ` Jens Axboe @ 2010-05-24 17:52 ` Michael Kerrisk 2010-05-24 17:56 ` Jens Axboe 0 siblings, 1 reply; 47+ messages in thread From: Michael Kerrisk @ 2010-05-24 17:52 UTC (permalink / raw) To: Jens Axboe Cc: OGAWA Hirofumi, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel On Mon, May 24, 2010 at 7:35 PM, Jens Axboe <jens.axboe@oracle.com> wrote: > On Mon, May 24 2010, Michael Kerrisk wrote: >> > Right, that looks like a thinko. >> > >> > I'll submit a patch changing it to bytes and the agreed API and fix this >> > -Eerror. Thanks for your comments and suggestions! >> >> Thanks. And of course you are welcome. (Please CC linux-api@vger on >> this patche (and all patches that change the API/ABI.) > > The first change is this: > > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=0191f8697bbdfefcd36e7b8dc3eeddfe82893e4b > > and the one dealing with the pages vs bytes API is this: > > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=b9598db3401282bb27b4aef77e3eee12015f7f29 > > Not tested yet, will do so before sending in of course. Eyeballing it quickly, these changes look right. Do you have some test programs you can make available? Thanks, Michael -- Michael Kerrisk Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/ Author of "The Linux Programming Interface" http://blog.man7.org/ ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-24 17:52 ` Michael Kerrisk @ 2010-05-24 17:56 ` Jens Axboe 2010-05-25 4:01 ` Michael Kerrisk 2010-05-27 6:49 ` Michael Kerrisk 0 siblings, 2 replies; 47+ messages in thread From: Jens Axboe @ 2010-05-24 17:56 UTC (permalink / raw) To: mtk.manpages Cc: OGAWA Hirofumi, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel On Mon, May 24 2010, Michael Kerrisk wrote: > On Mon, May 24, 2010 at 7:35 PM, Jens Axboe <jens.axboe@oracle.com> wrote: > > On Mon, May 24 2010, Michael Kerrisk wrote: > >> > Right, that looks like a thinko. > >> > > >> > I'll submit a patch changing it to bytes and the agreed API and fix this > >> > -Eerror. Thanks for your comments and suggestions! > >> > >> Thanks. And of course you are welcome. (Please CC linux-api@vger on > >> this patche (and all patches that change the API/ABI.) > > > > The first change is this: > > > > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=0191f8697bbdfefcd36e7b8dc3eeddfe82893e4b > > > > and the one dealing with the pages vs bytes API is this: > > > > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=b9598db3401282bb27b4aef77e3eee12015f7f29 > > > > Not tested yet, will do so before sending in of course. > > Eyeballing it quickly, these changes look right. Good, thanks. > Do you have some test programs you can make available? Actually I don't, I test it by modifying fio's splice engine to set/get the pipe size and test the resulting transfers. -- Jens Axboe ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-24 17:56 ` Jens Axboe @ 2010-05-25 4:01 ` Michael Kerrisk 2010-06-01 7:48 ` Jens Axboe 2010-05-27 6:49 ` Michael Kerrisk 1 sibling, 1 reply; 47+ messages in thread From: Michael Kerrisk @ 2010-05-25 4:01 UTC (permalink / raw) To: Jens Axboe Cc: OGAWA Hirofumi, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel On Mon, May 24, 2010 at 7:56 PM, Jens Axboe <jens.axboe@oracle.com> wrote: > On Mon, May 24 2010, Michael Kerrisk wrote: >> On Mon, May 24, 2010 at 7:35 PM, Jens Axboe <jens.axboe@oracle.com> wrote: >> > On Mon, May 24 2010, Michael Kerrisk wrote: >> >> > Right, that looks like a thinko. >> >> > >> >> > I'll submit a patch changing it to bytes and the agreed API and fix this >> >> > -Eerror. Thanks for your comments and suggestions! >> >> >> >> Thanks. And of course you are welcome. (Please CC linux-api@vger on >> >> this patche (and all patches that change the API/ABI.) >> > >> > The first change is this: >> > >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=0191f8697bbdfefcd36e7b8dc3eeddfe82893e4b >> > >> > and the one dealing with the pages vs bytes API is this: >> > >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=b9598db3401282bb27b4aef77e3eee12015f7f29 >> > >> > Not tested yet, will do so before sending in of course. >> >> Eyeballing it quickly, these changes look right. > > Good, thanks. > >> Do you have some test programs you can make available? > > Actually I don't, I test it by modifying fio's splice engine to set/get > the pipe size and test the resulting transfers. Two more questions: is the rationale for this feature written up somewhere? I could not find it. Is it primarily intended for splice/vmsplice/tee, with the effect for pipe(2) being a side effect? Also, the minuimum size of the buffer is 2 pages. Why is it not 1? (Notwithstanding Linus's assertion, a buffer size of 1 page did give us POSIX compliance in kernels before 2.6.10.) Cheers, Michael -- Michael Kerrisk Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/ Author of "The Linux Programming Interface" http://blog.man7.org/ ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-25 4:01 ` Michael Kerrisk @ 2010-06-01 7:48 ` Jens Axboe 2010-06-01 15:22 ` Linus Torvalds 0 siblings, 1 reply; 47+ messages in thread From: Jens Axboe @ 2010-06-01 7:48 UTC (permalink / raw) To: mtk.manpages Cc: OGAWA Hirofumi, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel On Tue, May 25 2010, Michael Kerrisk wrote: > On Mon, May 24, 2010 at 7:56 PM, Jens Axboe <jens.axboe@oracle.com> wrote: > > On Mon, May 24 2010, Michael Kerrisk wrote: > >> On Mon, May 24, 2010 at 7:35 PM, Jens Axboe <jens.axboe@oracle.com> wrote: > >> > On Mon, May 24 2010, Michael Kerrisk wrote: > >> >> > Right, that looks like a thinko. > >> >> > > >> >> > I'll submit a patch changing it to bytes and the agreed API and fix this > >> >> > -Eerror. Thanks for your comments and suggestions! > >> >> > >> >> Thanks. And of course you are welcome. (Please CC linux-api@vger on > >> >> this patche (and all patches that change the API/ABI.) > >> > > >> > The first change is this: > >> > > >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=0191f8697bbdfefcd36e7b8dc3eeddfe82893e4b > >> > > >> > and the one dealing with the pages vs bytes API is this: > >> > > >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=b9598db3401282bb27b4aef77e3eee12015f7f29 > >> > > >> > Not tested yet, will do so before sending in of course. > >> > >> Eyeballing it quickly, these changes look right. > > > > Good, thanks. > > > >> Do you have some test programs you can make available? > > > > Actually I don't, I test it by modifying fio's splice engine to set/get > > the pipe size and test the resulting transfers. > > Two more questions: is the rationale for this feature written up > somewhere? I could not find it. Is it primarily intended for > splice/vmsplice/tee, with the effect for pipe(2) being a side effect? Yes it's primarily for splice, where the 64kb size can sometimes become a limiting factor because of the pipe mutex lock/unlocking. > Also, the minuimum size of the buffer is 2 pages. Why is it not 1? > (Notwithstanding Linus's assertion, a buffer size of 1 page did give > us POSIX compliance in kernels before 2.6.10.) I'll defer to Linus on that, I remember some emails on that part from way back when. As far as I can tell, POSIX wants atomic writes of "less than a page size", which would make more sense as "of a page size and less". And since it should not be a page size from either side on a uni-directional pipe, then 1 page seems enough for that guarantee at least. -- Jens Axboe ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-06-01 7:48 ` Jens Axboe @ 2010-06-01 15:22 ` Linus Torvalds 2010-06-01 16:36 ` Loke, Chetan 0 siblings, 1 reply; 47+ messages in thread From: Linus Torvalds @ 2010-06-01 15:22 UTC (permalink / raw) To: Jens Axboe Cc: mtk.manpages, OGAWA Hirofumi, Andrew Morton, Miklos Szeredi, linux-fsdevel, linux-kernel On Tue, 1 Jun 2010, Jens Axboe wrote: > > > Also, the minuimum size of the buffer is 2 pages. Why is it not 1? > > (Notwithstanding Linus's assertion, a buffer size of 1 page did give > > us POSIX compliance in kernels before 2.6.10.) > > I'll defer to Linus on that, I remember some emails on that part from > way back when. As far as I can tell, POSIX wants atomic writes of "less > than a page size", which would make more sense as "of a page size and > less". And since it should not be a page size from either side on a > uni-directional pipe, then 1 page seems enough for that guarantee at > least. Hmm. You guys may well be right that a single slot is sufficient. It still gives us PIPE_BUF worth of data for writing atomically. I had this memory that we needed two because of the merging logic (we have that special case for re-using the previous page, so that we don't use waste of memory for lots of small writes), but looking at the code there is no reason at all for me to hav thought so. So I don't know why I thought we needed the extra slot, and a single slot (if anybody really wants slow writes) looks to be fine. Linus ^ permalink raw reply [flat|nested] 47+ messages in thread
* RE: [patch] pipe: add support for shrinking and growing pipes 2010-06-01 15:22 ` Linus Torvalds @ 2010-06-01 16:36 ` Loke, Chetan 0 siblings, 0 replies; 47+ messages in thread From: Loke, Chetan @ 2010-06-01 16:36 UTC (permalink / raw) To: Linus Torvalds, Jens Axboe Cc: mtk.manpages, OGAWA Hirofumi, Andrew Morton, Miklos Szeredi, linux-fsdevel, linux-kernel > -----Original Message----- > From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel- > owner@vger.kernel.org] On Behalf Of Linus Torvalds > Sent: June 01, 2010 11:22 AM > > On Tue, 1 Jun 2010, Jens Axboe wrote: > > > > > Also, the minuimum size of the buffer is 2 pages. Why is it not 1? > > > (Notwithstanding Linus's assertion, a buffer size of 1 page did > give us POSIX compliance in kernels before 2.6.10.) > > > > I'll defer to Linus on that, I remember some emails on that part from > > way back when. As far as I can tell, POSIX wants atomic writes of > > "less than a page size", which would make more sense as "of a page size and > > less". And since it should not be a page size from either side on a > > uni-directional pipe, then 1 page seems enough for that guarantee at > > least. > > Hmm. You guys may well be right that a single slot is sufficient. It > still gives us PIPE_BUF worth of data for writing atomically. I had this > memory that we needed two because of the merging logic (we have that special > case for re-using the previous page, so that we don't use waste of memory > for lots of small writes), but looking at the code there is no reason at > all for me to hav thought so. > > So I don't know why I thought we needed the extra slot, and a single > slot (if anybody really wants slow writes) looks to be fine. > Ok, I have a really dumb/basic question. The reason we are letting users grow the pipe->buffers is to decrease the number of splice-calls. This implies the user has fnctl'd(when he/she wants performance). Can we not have an option where we don't have to 'alloc pipe->buffers' worth pages every single time? As an example look at 'default_file_splice_read'. Is it possible to enhance the existing functionality by defining a new cmd and a flag(in struct pipe_xxx etc) and allowing an user to control that? Something like 'fcntl->F_SETPIPE_SZ_AND_LOCK_PIPE_PAGES'? Does this make sense? regards Chetan Loke ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-24 17:56 ` Jens Axboe 2010-05-25 4:01 ` Michael Kerrisk @ 2010-05-27 6:49 ` Michael Kerrisk 2010-06-01 7:45 ` Jens Axboe 1 sibling, 1 reply; 47+ messages in thread From: Michael Kerrisk @ 2010-05-27 6:49 UTC (permalink / raw) To: Jens Axboe Cc: OGAWA Hirofumi, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel Jens, On Mon, May 24, 2010 at 7:56 PM, Jens Axboe <jens.axboe@oracle.com> wrote: > On Mon, May 24 2010, Michael Kerrisk wrote: >> On Mon, May 24, 2010 at 7:35 PM, Jens Axboe <jens.axboe@oracle.com> wrote: >> > On Mon, May 24 2010, Michael Kerrisk wrote: >> >> > Right, that looks like a thinko. >> >> > >> >> > I'll submit a patch changing it to bytes and the agreed API and fix this >> >> > -Eerror. Thanks for your comments and suggestions! >> >> >> >> Thanks. And of course you are welcome. (Please CC linux-api@vger on >> >> this patche (and all patches that change the API/ABI.) >> > >> > The first change is this: >> > >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=0191f8697bbdfefcd36e7b8dc3eeddfe82893e4b >> > >> > and the one dealing with the pages vs bytes API is this: >> > >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=b9598db3401282bb27b4aef77e3eee12015f7f29 >> > >> > Not tested yet, will do so before sending in of course. >> >> Eyeballing it quickly, these changes look right. > > Good, thanks. > >> Do you have some test programs you can make available? > > Actually I don't, I test it by modifying fio's splice engine to set/get > the pipe size and test the resulting transfers. An afterthought. Do there not also need to be fixes to the /proc interfaces. I don't think they were included in your revised patches. Cheers, Michael -- Michael Kerrisk Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/ Author of "The Linux Programming Interface" http://blog.man7.org/ ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-27 6:49 ` Michael Kerrisk @ 2010-06-01 7:45 ` Jens Axboe 2010-06-02 19:25 ` Michael Kerrisk 0 siblings, 1 reply; 47+ messages in thread From: Jens Axboe @ 2010-06-01 7:45 UTC (permalink / raw) To: mtk.manpages Cc: OGAWA Hirofumi, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel On Thu, May 27 2010, Michael Kerrisk wrote: > Jens, > > On Mon, May 24, 2010 at 7:56 PM, Jens Axboe <jens.axboe@oracle.com> wrote: > > On Mon, May 24 2010, Michael Kerrisk wrote: > >> On Mon, May 24, 2010 at 7:35 PM, Jens Axboe <jens.axboe@oracle.com> wrote: > >> > On Mon, May 24 2010, Michael Kerrisk wrote: > >> >> > Right, that looks like a thinko. > >> >> > > >> >> > I'll submit a patch changing it to bytes and the agreed API and fix this > >> >> > -Eerror. Thanks for your comments and suggestions! > >> >> > >> >> Thanks. And of course you are welcome. (Please CC linux-api@vger on > >> >> this patche (and all patches that change the API/ABI.) > >> > > >> > The first change is this: > >> > > >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=0191f8697bbdfefcd36e7b8dc3eeddfe82893e4b > >> > > >> > and the one dealing with the pages vs bytes API is this: > >> > > >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=b9598db3401282bb27b4aef77e3eee12015f7f29 > >> > > >> > Not tested yet, will do so before sending in of course. > >> > >> Eyeballing it quickly, these changes look right. > > > > Good, thanks. > > > >> Do you have some test programs you can make available? > > > > Actually I don't, I test it by modifying fio's splice engine to set/get > > the pipe size and test the resulting transfers. > > An afterthought. Do there not also need to be fixes to the /proc > interfaces. I don't think they were included in your revised patches. I think the proc part can be sanely left in pages, since it's just a memory limiter. -- Jens Axboe ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-06-01 7:45 ` Jens Axboe @ 2010-06-02 19:25 ` Michael Kerrisk 2010-06-03 6:10 ` Jens Axboe 0 siblings, 1 reply; 47+ messages in thread From: Michael Kerrisk @ 2010-06-02 19:25 UTC (permalink / raw) To: Jens Axboe Cc: OGAWA Hirofumi, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel On Tue, Jun 1, 2010 at 9:45 AM, Jens Axboe <axboe@kernel.dk> wrote: > On Thu, May 27 2010, Michael Kerrisk wrote: >> Jens, >> >> On Mon, May 24, 2010 at 7:56 PM, Jens Axboe <jens.axboe@oracle.com> wrote: >> > On Mon, May 24 2010, Michael Kerrisk wrote: >> >> On Mon, May 24, 2010 at 7:35 PM, Jens Axboe <jens.axboe@oracle.com> wrote: >> >> > On Mon, May 24 2010, Michael Kerrisk wrote: >> >> >> > Right, that looks like a thinko. >> >> >> > >> >> >> > I'll submit a patch changing it to bytes and the agreed API and fix this >> >> >> > -Eerror. Thanks for your comments and suggestions! >> >> >> >> >> >> Thanks. And of course you are welcome. (Please CC linux-api@vger on >> >> >> this patche (and all patches that change the API/ABI.) >> >> > >> >> > The first change is this: >> >> > >> >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=0191f8697bbdfefcd36e7b8dc3eeddfe82893e4b >> >> > >> >> > and the one dealing with the pages vs bytes API is this: >> >> > >> >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=b9598db3401282bb27b4aef77e3eee12015f7f29 >> >> > >> >> > Not tested yet, will do so before sending in of course. >> >> >> >> Eyeballing it quickly, these changes look right. >> > >> > Good, thanks. >> > >> >> Do you have some test programs you can make available? >> > >> > Actually I don't, I test it by modifying fio's splice engine to set/get >> > the pipe size and test the resulting transfers. >> >> An afterthought. Do there not also need to be fixes to the /proc >> interfaces. I don't think they were included in your revised patches. > > I think the proc part can be sanely left in pages, since it's just a > memory limiter. I can't see any advantage to using two different units for these closely related APIs, and it does seem like it could be a source of confusion. Similar APIs that I can think of like RLIMIT_MEMLOCK and shmget() SHMMAX that impose per-process memory-related limits use bytes. Best to be consistent, don't you think? Cheers, Michael -- Michael Kerrisk Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/ Author of "The Linux Programming Interface" http://blog.man7.org/ ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-06-02 19:25 ` Michael Kerrisk @ 2010-06-03 6:10 ` Jens Axboe 2010-06-03 6:46 ` Michael Kerrisk 0 siblings, 1 reply; 47+ messages in thread From: Jens Axboe @ 2010-06-03 6:10 UTC (permalink / raw) To: mtk.manpages Cc: OGAWA Hirofumi, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel On Wed, Jun 02 2010, Michael Kerrisk wrote: > On Tue, Jun 1, 2010 at 9:45 AM, Jens Axboe <axboe@kernel.dk> wrote: > > On Thu, May 27 2010, Michael Kerrisk wrote: > >> Jens, > >> > >> On Mon, May 24, 2010 at 7:56 PM, Jens Axboe <jens.axboe@oracle.com> wrote: > >> > On Mon, May 24 2010, Michael Kerrisk wrote: > >> >> On Mon, May 24, 2010 at 7:35 PM, Jens Axboe <jens.axboe@oracle.com> wrote: > >> >> > On Mon, May 24 2010, Michael Kerrisk wrote: > >> >> >> > Right, that looks like a thinko. > >> >> >> > > >> >> >> > I'll submit a patch changing it to bytes and the agreed API and fix this > >> >> >> > -Eerror. Thanks for your comments and suggestions! > >> >> >> > >> >> >> Thanks. And of course you are welcome. (Please CC linux-api@vger on > >> >> >> this patche (and all patches that change the API/ABI.) > >> >> > > >> >> > The first change is this: > >> >> > > >> >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=0191f8697bbdfefcd36e7b8dc3eeddfe82893e4b > >> >> > > >> >> > and the one dealing with the pages vs bytes API is this: > >> >> > > >> >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=b9598db3401282bb27b4aef77e3eee12015f7f29 > >> >> > > >> >> > Not tested yet, will do so before sending in of course. > >> >> > >> >> Eyeballing it quickly, these changes look right. > >> > > >> > Good, thanks. > >> > > >> >> Do you have some test programs you can make available? > >> > > >> > Actually I don't, I test it by modifying fio's splice engine to set/get > >> > the pipe size and test the resulting transfers. > >> > >> An afterthought. Do there not also need to be fixes to the /proc > >> interfaces. I don't think they were included in your revised patches. > > > > I think the proc part can be sanely left in pages, since it's just a > > memory limiter. > > I can't see any advantage to using two different units for these > closely related APIs, and it does seem like it could be a source of > confusion. Similar APIs that I can think of like RLIMIT_MEMLOCK and > shmget() SHMMAX that impose per-process memory-related limits use > bytes. Best to be consistent, don't you think? But they are different interfaces. I think the 'pass in required size, return actual size' where actual size is >= required size makes sense for the syscall part, but for an "admin" interface it is more logical to deal in pages. Perhaps that's just me and the average admin does not agree. So while it's just detail, it's also an interface so has some importance. And if there's consensus that bytes is a cleaner interface on the proc side as well, then lets change it. -- Jens Axboe ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-06-03 6:10 ` Jens Axboe @ 2010-06-03 6:46 ` Michael Kerrisk 2010-06-03 7:01 ` Jens Axboe 0 siblings, 1 reply; 47+ messages in thread From: Michael Kerrisk @ 2010-06-03 6:46 UTC (permalink / raw) To: Jens Axboe Cc: OGAWA Hirofumi, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel Hi Jens, On Thu, Jun 3, 2010 at 8:10 AM, Jens Axboe <jaxboe@fusionio.com> wrote: > On Wed, Jun 02 2010, Michael Kerrisk wrote: >> On Tue, Jun 1, 2010 at 9:45 AM, Jens Axboe <axboe@kernel.dk> wrote: >> > On Thu, May 27 2010, Michael Kerrisk wrote: >> >> Jens, >> >> >> >> On Mon, May 24, 2010 at 7:56 PM, Jens Axboe <jens.axboe@oracle.com> wrote: >> >> > On Mon, May 24 2010, Michael Kerrisk wrote: >> >> >> On Mon, May 24, 2010 at 7:35 PM, Jens Axboe <jens.axboe@oracle.com> wrote: >> >> >> > On Mon, May 24 2010, Michael Kerrisk wrote: >> >> >> >> > Right, that looks like a thinko. >> >> >> >> > >> >> >> >> > I'll submit a patch changing it to bytes and the agreed API and fix this >> >> >> >> > -Eerror. Thanks for your comments and suggestions! >> >> >> >> >> >> >> >> Thanks. And of course you are welcome. (Please CC linux-api@vger on >> >> >> >> this patche (and all patches that change the API/ABI.) >> >> >> > >> >> >> > The first change is this: >> >> >> > >> >> >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=0191f8697bbdfefcd36e7b8dc3eeddfe82893e4b >> >> >> > >> >> >> > and the one dealing with the pages vs bytes API is this: >> >> >> > >> >> >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=b9598db3401282bb27b4aef77e3eee12015f7f29 >> >> >> > >> >> >> > Not tested yet, will do so before sending in of course. >> >> >> >> >> >> Eyeballing it quickly, these changes look right. >> >> > >> >> > Good, thanks. >> >> > >> >> >> Do you have some test programs you can make available? >> >> > >> >> > Actually I don't, I test it by modifying fio's splice engine to set/get >> >> > the pipe size and test the resulting transfers. >> >> >> >> An afterthought. Do there not also need to be fixes to the /proc >> >> interfaces. I don't think they were included in your revised patches. >> > >> > I think the proc part can be sanely left in pages, since it's just a >> > memory limiter. >> >> I can't see any advantage to using two different units for these >> closely related APIs, and it does seem like it could be a source of >> confusion. Similar APIs that I can think of like RLIMIT_MEMLOCK and >> shmget() SHMMAX that impose per-process memory-related limits use >> bytes. Best to be consistent, don't you think? > > But they are different interfaces. I think the 'pass in required size, > return actual size' where actual size is >= required size makes sense > for the syscall part, but for an "admin" interface it is more logical to > deal in pages. Perhaps that's just me and the average admin does not > agree. So while it's just detail, it's also an interface so has some > importance. And if there's consensus that bytes is a cleaner interface > on the proc side as well, then lets change it. I'll add one more datapoint to those that I already mentioned. RLIMIT_STACK and RLIMIT_DATA (getrlimit()) is also expressed in bytes. There was only one vaguely related limit that I could find that measured things in pages. Consider these two System V shared memory limits: SHMMAX This is the maximum size (in bytes) of a shared memory segment. SHMALL This is a system-wide limit on the total number of pages of shared memory. But in a way this almost confirms my point. SHMMAX is a limit the governs the behavior of individual processes (like your /proc file), while SHMALL is a limit that governs the behavior of the system as a whole. There is a (sort of) logic to using bytes for one and pages for the other. I think that I've said all I need to say on the topic. I'm inclined to think yours /proc file should use bytes, since it seems consistent with other simialr APIs. Others may confirm, or someone else mught have a different insight. Cheers, Michael PS I hope you are going to set the lower limit for the /proc file to 4096B (a page) (?). -- Michael Kerrisk Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/ Author of "The Linux Programming Interface" http://blog.man7.org/ ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-06-03 6:46 ` Michael Kerrisk @ 2010-06-03 7:01 ` Jens Axboe 2010-06-03 7:05 ` Michael Kerrisk 0 siblings, 1 reply; 47+ messages in thread From: Jens Axboe @ 2010-06-03 7:01 UTC (permalink / raw) To: mtk.manpages@gmail.com Cc: OGAWA Hirofumi, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org On Thu, Jun 03 2010, Michael Kerrisk wrote: > Hi Jens, > > On Thu, Jun 3, 2010 at 8:10 AM, Jens Axboe <jaxboe@fusionio.com> wrote: > > On Wed, Jun 02 2010, Michael Kerrisk wrote: > >> On Tue, Jun 1, 2010 at 9:45 AM, Jens Axboe <axboe@kernel.dk> wrote: > >> > On Thu, May 27 2010, Michael Kerrisk wrote: > >> >> Jens, > >> >> > >> >> On Mon, May 24, 2010 at 7:56 PM, Jens Axboe <jens.axboe@oracle.com> wrote: > >> >> > On Mon, May 24 2010, Michael Kerrisk wrote: > >> >> >> On Mon, May 24, 2010 at 7:35 PM, Jens Axboe <jens.axboe@oracle.com> wrote: > >> >> >> > On Mon, May 24 2010, Michael Kerrisk wrote: > >> >> >> >> > Right, that looks like a thinko. > >> >> >> >> > > >> >> >> >> > I'll submit a patch changing it to bytes and the agreed API and fix this > >> >> >> >> > -Eerror. Thanks for your comments and suggestions! > >> >> >> >> > >> >> >> >> Thanks. And of course you are welcome. (Please CC linux-api@vger on > >> >> >> >> this patche (and all patches that change the API/ABI.) > >> >> >> > > >> >> >> > The first change is this: > >> >> >> > > >> >> >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=0191f8697bbdfefcd36e7b8dc3eeddfe82893e4b > >> >> >> > > >> >> >> > and the one dealing with the pages vs bytes API is this: > >> >> >> > > >> >> >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=b9598db3401282bb27b4aef77e3eee12015f7f29 > >> >> >> > > >> >> >> > Not tested yet, will do so before sending in of course. > >> >> >> > >> >> >> Eyeballing it quickly, these changes look right. > >> >> > > >> >> > Good, thanks. > >> >> > > >> >> >> Do you have some test programs you can make available? > >> >> > > >> >> > Actually I don't, I test it by modifying fio's splice engine to set/get > >> >> > the pipe size and test the resulting transfers. > >> >> > >> >> An afterthought. Do there not also need to be fixes to the /proc > >> >> interfaces. I don't think they were included in your revised patches. > >> > > >> > I think the proc part can be sanely left in pages, since it's just a > >> > memory limiter. > >> > >> I can't see any advantage to using two different units for these > >> closely related APIs, and it does seem like it could be a source of > >> confusion. Similar APIs that I can think of like RLIMIT_MEMLOCK and > >> shmget() SHMMAX that impose per-process memory-related limits use > >> bytes. Best to be consistent, don't you think? > > > > But they are different interfaces. I think the 'pass in required size, > > return actual size' where actual size is >= required size makes sense > > for the syscall part, but for an "admin" interface it is more logical to > > deal in pages. Perhaps that's just me and the average admin does not > > agree. So while it's just detail, it's also an interface so has some > > importance. And if there's consensus that bytes is a cleaner interface > > on the proc side as well, then lets change it. > > I'll add one more datapoint to those that I already mentioned. > RLIMIT_STACK and RLIMIT_DATA (getrlimit()) is also expressed in bytes. > > There was only one vaguely related limit that I could find that > measured things in pages. Consider these two System V shared memory > limits: > > SHMMAX > This is the maximum size (in bytes) of a shared memory segment. > > SHMALL > This is a system-wide limit on the total number of pages of shared memory. > > But in a way this almost confirms my point. SHMMAX is a limit the > governs the behavior of individual processes (like your /proc file), > while SHMALL is a limit that governs the behavior of the system as a > whole. There is a (sort of) logic to using bytes for one and pages for > the other. > > I think that I've said all I need to say on the topic. I'm inclined to > think yours /proc file should use bytes, since it seems consistent > with other simialr APIs. Others may confirm, or someone else mught > have a different insight. I'll commit a patch to change it to bytes. > PS I hope you are going to set the lower limit for the /proc file to > 4096B (a page) (?). Yes, I think I'll do that as a separate patch up front. -- Jens Axboe -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-06-03 7:01 ` Jens Axboe @ 2010-06-03 7:05 ` Michael Kerrisk 2010-06-03 7:48 ` Michael Kerrisk 0 siblings, 1 reply; 47+ messages in thread From: Michael Kerrisk @ 2010-06-03 7:05 UTC (permalink / raw) To: Jens Axboe Cc: OGAWA Hirofumi, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org On Thu, Jun 3, 2010 at 9:01 AM, Jens Axboe <jaxboe@fusionio.com> wrote: > On Thu, Jun 03 2010, Michael Kerrisk wrote: >> Hi Jens, >> >> On Thu, Jun 3, 2010 at 8:10 AM, Jens Axboe <jaxboe@fusionio.com> wrote: >> > On Wed, Jun 02 2010, Michael Kerrisk wrote: >> >> On Tue, Jun 1, 2010 at 9:45 AM, Jens Axboe <axboe@kernel.dk> wrote: >> >> > On Thu, May 27 2010, Michael Kerrisk wrote: >> >> >> Jens, >> >> >> >> >> >> On Mon, May 24, 2010 at 7:56 PM, Jens Axboe <jens.axboe@oracle.com> wrote: >> >> >> > On Mon, May 24 2010, Michael Kerrisk wrote: >> >> >> >> On Mon, May 24, 2010 at 7:35 PM, Jens Axboe <jens.axboe@oracle.com> wrote: >> >> >> >> > On Mon, May 24 2010, Michael Kerrisk wrote: >> >> >> >> >> > Right, that looks like a thinko. >> >> >> >> >> > >> >> >> >> >> > I'll submit a patch changing it to bytes and the agreed API and fix this >> >> >> >> >> > -Eerror. Thanks for your comments and suggestions! >> >> >> >> >> >> >> >> >> >> Thanks. And of course you are welcome. (Please CC linux-api@vger on >> >> >> >> >> this patche (and all patches that change the API/ABI.) >> >> >> >> > >> >> >> >> > The first change is this: >> >> >> >> > >> >> >> >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=0191f8697bbdfefcd36e7b8dc3eeddfe82893e4b >> >> >> >> > >> >> >> >> > and the one dealing with the pages vs bytes API is this: >> >> >> >> > >> >> >> >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=b9598db3401282bb27b4aef77e3eee12015f7f29 >> >> >> >> > >> >> >> >> > Not tested yet, will do so before sending in of course. >> >> >> >> >> >> >> >> Eyeballing it quickly, these changes look right. >> >> >> > >> >> >> > Good, thanks. >> >> >> > >> >> >> >> Do you have some test programs you can make available? >> >> >> > >> >> >> > Actually I don't, I test it by modifying fio's splice engine to set/get >> >> >> > the pipe size and test the resulting transfers. >> >> >> >> >> >> An afterthought. Do there not also need to be fixes to the /proc >> >> >> interfaces. I don't think they were included in your revised patches. >> >> > >> >> > I think the proc part can be sanely left in pages, since it's just a >> >> > memory limiter. >> >> >> >> I can't see any advantage to using two different units for these >> >> closely related APIs, and it does seem like it could be a source of >> >> confusion. Similar APIs that I can think of like RLIMIT_MEMLOCK and >> >> shmget() SHMMAX that impose per-process memory-related limits use >> >> bytes. Best to be consistent, don't you think? >> > >> > But they are different interfaces. I think the 'pass in required size, >> > return actual size' where actual size is >= required size makes sense >> > for the syscall part, but for an "admin" interface it is more logical to >> > deal in pages. Perhaps that's just me and the average admin does not >> > agree. So while it's just detail, it's also an interface so has some >> > importance. And if there's consensus that bytes is a cleaner interface >> > on the proc side as well, then lets change it. >> >> I'll add one more datapoint to those that I already mentioned. >> RLIMIT_STACK and RLIMIT_DATA (getrlimit()) is also expressed in bytes. >> >> There was only one vaguely related limit that I could find that >> measured things in pages. Consider these two System V shared memory >> limits: >> >> SHMMAX >> This is the maximum size (in bytes) of a shared memory segment. >> >> SHMALL >> This is a system-wide limit on the total number of pages of shared memory. >> >> But in a way this almost confirms my point. SHMMAX is a limit the >> governs the behavior of individual processes (like your /proc file), >> while SHMALL is a limit that governs the behavior of the system as a >> whole. There is a (sort of) logic to using bytes for one and pages for >> the other. >> >> I think that I've said all I need to say on the topic. I'm inclined to >> think yours /proc file should use bytes, since it seems consistent >> with other simialr APIs. Others may confirm, or someone else mught >> have a different insight. > > I'll commit a patch to change it to bytes. Thanks Jens. >> PS I hope you are going to set the lower limit for the /proc file to >> 4096B (a page) (?). > > Yes, I think I'll do that as a separate patch up front. Okay. Cheers, Michael -- Michael Kerrisk Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/ Author of "The Linux Programming Interface" http://blog.man7.org/ -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-06-03 7:05 ` Michael Kerrisk @ 2010-06-03 7:48 ` Michael Kerrisk 2010-06-03 7:58 ` Michael Kerrisk 0 siblings, 1 reply; 47+ messages in thread From: Michael Kerrisk @ 2010-06-03 7:48 UTC (permalink / raw) To: Jens Axboe Cc: OGAWA Hirofumi, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org On Thu, Jun 3, 2010 at 9:05 AM, Michael Kerrisk <mtk.manpages@googlemail.com> wrote: > On Thu, Jun 3, 2010 at 9:01 AM, Jens Axboe <jaxboe@fusionio.com> wrote: >> On Thu, Jun 03 2010, Michael Kerrisk wrote: >>> Hi Jens, >>> >>> On Thu, Jun 3, 2010 at 8:10 AM, Jens Axboe <jaxboe@fusionio.com> wrote: >>> > On Wed, Jun 02 2010, Michael Kerrisk wrote: >>> >> On Tue, Jun 1, 2010 at 9:45 AM, Jens Axboe <axboe@kernel.dk> wrote: >>> >> > On Thu, May 27 2010, Michael Kerrisk wrote: >>> >> >> Jens, >>> >> >> >>> >> >> On Mon, May 24, 2010 at 7:56 PM, Jens Axboe <jens.axboe@oracle.com> wrote: >>> >> >> > On Mon, May 24 2010, Michael Kerrisk wrote: >>> >> >> >> On Mon, May 24, 2010 at 7:35 PM, Jens Axboe <jens.axboe@oracle.com> wrote: >>> >> >> >> > On Mon, May 24 2010, Michael Kerrisk wrote: >>> >> >> >> >> > Right, that looks like a thinko. >>> >> >> >> >> > >>> >> >> >> >> > I'll submit a patch changing it to bytes and the agreed API and fix this >>> >> >> >> >> > -Eerror. Thanks for your comments and suggestions! >>> >> >> >> >> >>> >> >> >> >> Thanks. And of course you are welcome. (Please CC linux-api@vger on >>> >> >> >> >> this patche (and all patches that change the API/ABI.) >>> >> >> >> > >>> >> >> >> > The first change is this: >>> >> >> >> > >>> >> >> >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=0191f8697bbdfefcd36e7b8dc3eeddfe82893e4b >>> >> >> >> > >>> >> >> >> > and the one dealing with the pages vs bytes API is this: >>> >> >> >> > >>> >> >> >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=b9598db3401282bb27b4aef77e3eee12015f7f29 >>> >> >> >> > >>> >> >> >> > Not tested yet, will do so before sending in of course. >>> >> >> >> >>> >> >> >> Eyeballing it quickly, these changes look right. >>> >> >> > >>> >> >> > Good, thanks. >>> >> >> > >>> >> >> >> Do you have some test programs you can make available? >>> >> >> > >>> >> >> > Actually I don't, I test it by modifying fio's splice engine to set/get >>> >> >> > the pipe size and test the resulting transfers. >>> >> >> >>> >> >> An afterthought. Do there not also need to be fixes to the /proc >>> >> >> interfaces. I don't think they were included in your revised patches. >>> >> > >>> >> > I think the proc part can be sanely left in pages, since it's just a >>> >> > memory limiter. >>> >> >>> >> I can't see any advantage to using two different units for these >>> >> closely related APIs, and it does seem like it could be a source of >>> >> confusion. Similar APIs that I can think of like RLIMIT_MEMLOCK and >>> >> shmget() SHMMAX that impose per-process memory-related limits use >>> >> bytes. Best to be consistent, don't you think? >>> > >>> > But they are different interfaces. I think the 'pass in required size, >>> > return actual size' where actual size is >= required size makes sense >>> > for the syscall part, but for an "admin" interface it is more logical to >>> > deal in pages. Perhaps that's just me and the average admin does not >>> > agree. So while it's just detail, it's also an interface so has some >>> > importance. And if there's consensus that bytes is a cleaner interface >>> > on the proc side as well, then lets change it. >>> >>> I'll add one more datapoint to those that I already mentioned. >>> RLIMIT_STACK and RLIMIT_DATA (getrlimit()) is also expressed in bytes. >>> >>> There was only one vaguely related limit that I could find that >>> measured things in pages. Consider these two System V shared memory >>> limits: >>> >>> SHMMAX >>> This is the maximum size (in bytes) of a shared memory segment. >>> >>> SHMALL >>> This is a system-wide limit on the total number of pages of shared memory. >>> >>> But in a way this almost confirms my point. SHMMAX is a limit the >>> governs the behavior of individual processes (like your /proc file), >>> while SHMALL is a limit that governs the behavior of the system as a >>> whole. There is a (sort of) logic to using bytes for one and pages for >>> the other. >>> >>> I think that I've said all I need to say on the topic. I'm inclined to >>> think yours /proc file should use bytes, since it seems consistent >>> with other simialr APIs. Others may confirm, or someone else mught >>> have a different insight. >> >> I'll commit a patch to change it to bytes. > > Thanks Jens. Since I'm going to document the /proc file, it occurred to me... What are you going to call this file now? "pipe_max_pages" no longer makes sense. "pipe_size_ceiling" may be more expressive than simply "pipe_max". Cheers, Michael -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-06-03 7:48 ` Michael Kerrisk @ 2010-06-03 7:58 ` Michael Kerrisk 2010-06-03 8:29 ` Michael Kerrisk 0 siblings, 1 reply; 47+ messages in thread From: Michael Kerrisk @ 2010-06-03 7:58 UTC (permalink / raw) To: Jens Axboe Cc: OGAWA Hirofumi, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org Hi Jens, On Thu, Jun 3, 2010 at 9:48 AM, Michael Kerrisk <mtk.manpages@googlemail.com> wrote: > On Thu, Jun 3, 2010 at 9:05 AM, Michael Kerrisk > <mtk.manpages@googlemail.com> wrote: >> On Thu, Jun 3, 2010 at 9:01 AM, Jens Axboe <jaxboe@fusionio.com> wrote: >>> On Thu, Jun 03 2010, Michael Kerrisk wrote: >>>> Hi Jens, >>>> >>>> On Thu, Jun 3, 2010 at 8:10 AM, Jens Axboe <jaxboe@fusionio.com> wrote: >>>> > On Wed, Jun 02 2010, Michael Kerrisk wrote: >>>> >> On Tue, Jun 1, 2010 at 9:45 AM, Jens Axboe <axboe@kernel.dk> wrote: >>>> >> > On Thu, May 27 2010, Michael Kerrisk wrote: >>>> >> >> Jens, >>>> >> >> >>>> >> >> On Mon, May 24, 2010 at 7:56 PM, Jens Axboe <jens.axboe@oracle.com> wrote: >>>> >> >> > On Mon, May 24 2010, Michael Kerrisk wrote: >>>> >> >> >> On Mon, May 24, 2010 at 7:35 PM, Jens Axboe <jens.axboe@oracle.com> wrote: >>>> >> >> >> > On Mon, May 24 2010, Michael Kerrisk wrote: >>>> >> >> >> >> > Right, that looks like a thinko. >>>> >> >> >> >> > >>>> >> >> >> >> > I'll submit a patch changing it to bytes and the agreed API and fix this >>>> >> >> >> >> > -Eerror. Thanks for your comments and suggestions! >>>> >> >> >> >> >>>> >> >> >> >> Thanks. And of course you are welcome. (Please CC linux-api@vger on >>>> >> >> >> >> this patche (and all patches that change the API/ABI.) >>>> >> >> >> > >>>> >> >> >> > The first change is this: >>>> >> >> >> > >>>> >> >> >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=0191f8697bbdfefcd36e7b8dc3eeddfe82893e4b >>>> >> >> >> > >>>> >> >> >> > and the one dealing with the pages vs bytes API is this: >>>> >> >> >> > >>>> >> >> >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=b9598db3401282bb27b4aef77e3eee12015f7f29 >>>> >> >> >> > >>>> >> >> >> > Not tested yet, will do so before sending in of course. >>>> >> >> >> >>>> >> >> >> Eyeballing it quickly, these changes look right. >>>> >> >> > >>>> >> >> > Good, thanks. >>>> >> >> > >>>> >> >> >> Do you have some test programs you can make available? >>>> >> >> > >>>> >> >> > Actually I don't, I test it by modifying fio's splice engine to set/get >>>> >> >> > the pipe size and test the resulting transfers. >>>> >> >> >>>> >> >> An afterthought. Do there not also need to be fixes to the /proc >>>> >> >> interfaces. I don't think they were included in your revised patches. >>>> >> > >>>> >> > I think the proc part can be sanely left in pages, since it's just a >>>> >> > memory limiter. >>>> >> >>>> >> I can't see any advantage to using two different units for these >>>> >> closely related APIs, and it does seem like it could be a source of >>>> >> confusion. Similar APIs that I can think of like RLIMIT_MEMLOCK and >>>> >> shmget() SHMMAX that impose per-process memory-related limits use >>>> >> bytes. Best to be consistent, don't you think? >>>> > >>>> > But they are different interfaces. I think the 'pass in required size, >>>> > return actual size' where actual size is >= required size makes sense >>>> > for the syscall part, but for an "admin" interface it is more logical to >>>> > deal in pages. Perhaps that's just me and the average admin does not >>>> > agree. So while it's just detail, it's also an interface so has some >>>> > importance. And if there's consensus that bytes is a cleaner interface >>>> > on the proc side as well, then lets change it. >>>> >>>> I'll add one more datapoint to those that I already mentioned. >>>> RLIMIT_STACK and RLIMIT_DATA (getrlimit()) is also expressed in bytes. >>>> >>>> There was only one vaguely related limit that I could find that >>>> measured things in pages. Consider these two System V shared memory >>>> limits: >>>> >>>> SHMMAX >>>> This is the maximum size (in bytes) of a shared memory segment. >>>> >>>> SHMALL >>>> This is a system-wide limit on the total number of pages of shared memory. >>>> >>>> But in a way this almost confirms my point. SHMMAX is a limit the >>>> governs the behavior of individual processes (like your /proc file), >>>> while SHMALL is a limit that governs the behavior of the system as a >>>> whole. There is a (sort of) logic to using bytes for one and pages for >>>> the other. >>>> >>>> I think that I've said all I need to say on the topic. I'm inclined to >>>> think yours /proc file should use bytes, since it seems consistent >>>> with other simialr APIs. Others may confirm, or someone else mught >>>> have a different insight. >>> >>> I'll commit a patch to change it to bytes. >> >> Thanks Jens. > > Since I'm going to document the /proc file, it occurred to me... What > are you going to call this file now? "pipe_max_pages" no longer makes > sense. "pipe_size_ceiling" may be more expressive than simply > "pipe_max". So, I'm looking at this interface still more closely now. How about using CAP_SYS_RESOURCE, rather than the hugely overloaded CAP_SYS_ADMIN as the governor for the capability check? Again, it's about consistency. Here's what CAP_SYS_RESOURCE currently governs: CAP_SYS_RESOURCE * Use reserved space on ext2 file systems; * make ioctl(2) calls controlling ext3 journaling; * override disk quota limits; * increase resource limits (see setrlimit(2)); * override RLIMIT_NPROC resource limit; * raise msg_qbytes limit for a System V message queue above the limit in /proc/sys/kernel/msgmnb (see msgop(2) and msgctl(2)). Including the pipe size limit in this list makes sense. Cheers, Michael -- Michael Kerrisk Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/ Author of "The Linux Programming Interface" http://blog.man7.org/ -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-06-03 7:58 ` Michael Kerrisk @ 2010-06-03 8:29 ` Michael Kerrisk 2010-06-03 8:53 ` Michael Kerrisk 0 siblings, 1 reply; 47+ messages in thread From: Michael Kerrisk @ 2010-06-03 8:29 UTC (permalink / raw) To: Jens Axboe Cc: OGAWA Hirofumi, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org Hi Jens, On Thu, Jun 3, 2010 at 9:58 AM, Michael Kerrisk <mtk.manpages@googlemail.com> wrote: > Hi Jens, > > On Thu, Jun 3, 2010 at 9:48 AM, Michael Kerrisk > <mtk.manpages@googlemail.com> wrote: >> On Thu, Jun 3, 2010 at 9:05 AM, Michael Kerrisk >> <mtk.manpages@googlemail.com> wrote: >>> On Thu, Jun 3, 2010 at 9:01 AM, Jens Axboe <jaxboe@fusionio.com> wrote: >>>> On Thu, Jun 03 2010, Michael Kerrisk wrote: >>>>> Hi Jens, >>>>> >>>>> On Thu, Jun 3, 2010 at 8:10 AM, Jens Axboe <jaxboe@fusionio.com> wrote: >>>>> > On Wed, Jun 02 2010, Michael Kerrisk wrote: >>>>> >> On Tue, Jun 1, 2010 at 9:45 AM, Jens Axboe <axboe@kernel.dk> wrote: >>>>> >> > On Thu, May 27 2010, Michael Kerrisk wrote: >>>>> >> >> Jens, >>>>> >> >> >>>>> >> >> On Mon, May 24, 2010 at 7:56 PM, Jens Axboe <jens.axboe@oracle.com> wrote: >>>>> >> >> > On Mon, May 24 2010, Michael Kerrisk wrote: >>>>> >> >> >> On Mon, May 24, 2010 at 7:35 PM, Jens Axboe <jens.axboe@oracle.com> wrote: >>>>> >> >> >> > On Mon, May 24 2010, Michael Kerrisk wrote: >>>>> >> >> >> >> > Right, that looks like a thinko. >>>>> >> >> >> >> > >>>>> >> >> >> >> > I'll submit a patch changing it to bytes and the agreed API and fix this >>>>> >> >> >> >> > -Eerror. Thanks for your comments and suggestions! >>>>> >> >> >> >> >>>>> >> >> >> >> Thanks. And of course you are welcome. (Please CC linux-api@vger on >>>>> >> >> >> >> this patche (and all patches that change the API/ABI.) >>>>> >> >> >> > >>>>> >> >> >> > The first change is this: >>>>> >> >> >> > >>>>> >> >> >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=0191f8697bbdfefcd36e7b8dc3eeddfe82893e4b >>>>> >> >> >> > >>>>> >> >> >> > and the one dealing with the pages vs bytes API is this: >>>>> >> >> >> > >>>>> >> >> >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=b9598db3401282bb27b4aef77e3eee12015f7f29 >>>>> >> >> >> > >>>>> >> >> >> > Not tested yet, will do so before sending in of course. >>>>> >> >> >> >>>>> >> >> >> Eyeballing it quickly, these changes look right. >>>>> >> >> > >>>>> >> >> > Good, thanks. >>>>> >> >> > >>>>> >> >> >> Do you have some test programs you can make available? >>>>> >> >> > >>>>> >> >> > Actually I don't, I test it by modifying fio's splice engine to set/get >>>>> >> >> > the pipe size and test the resulting transfers. >>>>> >> >> >>>>> >> >> An afterthought. Do there not also need to be fixes to the /proc >>>>> >> >> interfaces. I don't think they were included in your revised patches. >>>>> >> > >>>>> >> > I think the proc part can be sanely left in pages, since it's just a >>>>> >> > memory limiter. >>>>> >> >>>>> >> I can't see any advantage to using two different units for these >>>>> >> closely related APIs, and it does seem like it could be a source of >>>>> >> confusion. Similar APIs that I can think of like RLIMIT_MEMLOCK and >>>>> >> shmget() SHMMAX that impose per-process memory-related limits use >>>>> >> bytes. Best to be consistent, don't you think? >>>>> > >>>>> > But they are different interfaces. I think the 'pass in required size, >>>>> > return actual size' where actual size is >= required size makes sense >>>>> > for the syscall part, but for an "admin" interface it is more logical to >>>>> > deal in pages. Perhaps that's just me and the average admin does not >>>>> > agree. So while it's just detail, it's also an interface so has some >>>>> > importance. And if there's consensus that bytes is a cleaner interface >>>>> > on the proc side as well, then lets change it. >>>>> >>>>> I'll add one more datapoint to those that I already mentioned. >>>>> RLIMIT_STACK and RLIMIT_DATA (getrlimit()) is also expressed in bytes. >>>>> >>>>> There was only one vaguely related limit that I could find that >>>>> measured things in pages. Consider these two System V shared memory >>>>> limits: >>>>> >>>>> SHMMAX >>>>> This is the maximum size (in bytes) of a shared memory segment. >>>>> >>>>> SHMALL >>>>> This is a system-wide limit on the total number of pages of shared memory. >>>>> >>>>> But in a way this almost confirms my point. SHMMAX is a limit the >>>>> governs the behavior of individual processes (like your /proc file), >>>>> while SHMALL is a limit that governs the behavior of the system as a >>>>> whole. There is a (sort of) logic to using bytes for one and pages for >>>>> the other. >>>>> >>>>> I think that I've said all I need to say on the topic. I'm inclined to >>>>> think yours /proc file should use bytes, since it seems consistent >>>>> with other simialr APIs. Others may confirm, or someone else mught >>>>> have a different insight. >>>> >>>> I'll commit a patch to change it to bytes. >>> >>> Thanks Jens. >> >> Since I'm going to document the /proc file, it occurred to me... What >> are you going to call this file now? "pipe_max_pages" no longer makes >> sense. "pipe_size_ceiling" may be more expressive than simply >> "pipe_max". > > So, I'm looking at this interface still more closely now. How about > using CAP_SYS_RESOURCE, rather than the hugely overloaded > CAP_SYS_ADMIN as the governor for the capability check? Again, it's > about consistency. Here's what CAP_SYS_RESOURCE currently governs: > > CAP_SYS_RESOURCE > * Use reserved space on ext2 file systems; > * make ioctl(2) calls controlling ext3 journaling; > * override disk quota limits; > * increase resource limits (see setrlimit(2)); > * override RLIMIT_NPROC resource limit; > * raise msg_qbytes limit for a System V message queue > above the limit > in /proc/sys/kernel/msgmnb (see msgop(2) and msgctl(2)). > > Including the pipe size limit in this list makes sense. Another question: What happens if we adjust the capacity of a pipe to a value that is smaller than the number of bytes currently in the pipe? Cheers, Michael ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-06-03 8:29 ` Michael Kerrisk @ 2010-06-03 8:53 ` Michael Kerrisk 0 siblings, 0 replies; 47+ messages in thread From: Michael Kerrisk @ 2010-06-03 8:53 UTC (permalink / raw) To: Jens Axboe Cc: OGAWA Hirofumi, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org On Thu, Jun 3, 2010 at 10:29 AM, Michael Kerrisk <mtk.manpages@googlemail.com> wrote: > Hi Jens, > > On Thu, Jun 3, 2010 at 9:58 AM, Michael Kerrisk > <mtk.manpages@googlemail.com> wrote: >> Hi Jens, >> >> On Thu, Jun 3, 2010 at 9:48 AM, Michael Kerrisk >> <mtk.manpages@googlemail.com> wrote: >>> On Thu, Jun 3, 2010 at 9:05 AM, Michael Kerrisk >>> <mtk.manpages@googlemail.com> wrote: >>>> On Thu, Jun 3, 2010 at 9:01 AM, Jens Axboe <jaxboe@fusionio.com> wrote: >>>>> On Thu, Jun 03 2010, Michael Kerrisk wrote: >>>>>> Hi Jens, >>>>>> >>>>>> On Thu, Jun 3, 2010 at 8:10 AM, Jens Axboe <jaxboe@fusionio.com> wrote: >>>>>> > On Wed, Jun 02 2010, Michael Kerrisk wrote: >>>>>> >> On Tue, Jun 1, 2010 at 9:45 AM, Jens Axboe <axboe@kernel.dk> wrote: >>>>>> >> > On Thu, May 27 2010, Michael Kerrisk wrote: >>>>>> >> >> Jens, >>>>>> >> >> >>>>>> >> >> On Mon, May 24, 2010 at 7:56 PM, Jens Axboe <jens.axboe@oracle.com> wrote: >>>>>> >> >> > On Mon, May 24 2010, Michael Kerrisk wrote: >>>>>> >> >> >> On Mon, May 24, 2010 at 7:35 PM, Jens Axboe <jens.axboe@oracle.com> wrote: >>>>>> >> >> >> > On Mon, May 24 2010, Michael Kerrisk wrote: >>>>>> >> >> >> >> > Right, that looks like a thinko. >>>>>> >> >> >> >> > >>>>>> >> >> >> >> > I'll submit a patch changing it to bytes and the agreed API and fix this >>>>>> >> >> >> >> > -Eerror. Thanks for your comments and suggestions! >>>>>> >> >> >> >> >>>>>> >> >> >> >> Thanks. And of course you are welcome. (Please CC linux-api@vger on >>>>>> >> >> >> >> this patche (and all patches that change the API/ABI.) >>>>>> >> >> >> > >>>>>> >> >> >> > The first change is this: >>>>>> >> >> >> > >>>>>> >> >> >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=0191f8697bbdfefcd36e7b8dc3eeddfe82893e4b >>>>>> >> >> >> > >>>>>> >> >> >> > and the one dealing with the pages vs bytes API is this: >>>>>> >> >> >> > >>>>>> >> >> >> > http://git.kernel.dk/?p=linux-2.6-block.git;a=commit;h=b9598db3401282bb27b4aef77e3eee12015f7f29 >>>>>> >> >> >> > >>>>>> >> >> >> > Not tested yet, will do so before sending in of course. >>>>>> >> >> >> >>>>>> >> >> >> Eyeballing it quickly, these changes look right. >>>>>> >> >> > >>>>>> >> >> > Good, thanks. >>>>>> >> >> > >>>>>> >> >> >> Do you have some test programs you can make available? >>>>>> >> >> > >>>>>> >> >> > Actually I don't, I test it by modifying fio's splice engine to set/get >>>>>> >> >> > the pipe size and test the resulting transfers. >>>>>> >> >> >>>>>> >> >> An afterthought. Do there not also need to be fixes to the /proc >>>>>> >> >> interfaces. I don't think they were included in your revised patches. >>>>>> >> > >>>>>> >> > I think the proc part can be sanely left in pages, since it's just a >>>>>> >> > memory limiter. >>>>>> >> >>>>>> >> I can't see any advantage to using two different units for these >>>>>> >> closely related APIs, and it does seem like it could be a source of >>>>>> >> confusion. Similar APIs that I can think of like RLIMIT_MEMLOCK and >>>>>> >> shmget() SHMMAX that impose per-process memory-related limits use >>>>>> >> bytes. Best to be consistent, don't you think? >>>>>> > >>>>>> > But they are different interfaces. I think the 'pass in required size, >>>>>> > return actual size' where actual size is >= required size makes sense >>>>>> > for the syscall part, but for an "admin" interface it is more logical to >>>>>> > deal in pages. Perhaps that's just me and the average admin does not >>>>>> > agree. So while it's just detail, it's also an interface so has some >>>>>> > importance. And if there's consensus that bytes is a cleaner interface >>>>>> > on the proc side as well, then lets change it. >>>>>> >>>>>> I'll add one more datapoint to those that I already mentioned. >>>>>> RLIMIT_STACK and RLIMIT_DATA (getrlimit()) is also expressed in bytes. >>>>>> >>>>>> There was only one vaguely related limit that I could find that >>>>>> measured things in pages. Consider these two System V shared memory >>>>>> limits: >>>>>> >>>>>> SHMMAX >>>>>> This is the maximum size (in bytes) of a shared memory segment. >>>>>> >>>>>> SHMALL >>>>>> This is a system-wide limit on the total number of pages of shared memory. >>>>>> >>>>>> But in a way this almost confirms my point. SHMMAX is a limit the >>>>>> governs the behavior of individual processes (like your /proc file), >>>>>> while SHMALL is a limit that governs the behavior of the system as a >>>>>> whole. There is a (sort of) logic to using bytes for one and pages for >>>>>> the other. >>>>>> >>>>>> I think that I've said all I need to say on the topic. I'm inclined to >>>>>> think yours /proc file should use bytes, since it seems consistent >>>>>> with other simialr APIs. Others may confirm, or someone else mught >>>>>> have a different insight. >>>>> >>>>> I'll commit a patch to change it to bytes. >>>> >>>> Thanks Jens. >>> >>> Since I'm going to document the /proc file, it occurred to me... What >>> are you going to call this file now? "pipe_max_pages" no longer makes >>> sense. "pipe_size_ceiling" may be more expressive than simply >>> "pipe_max". >> >> So, I'm looking at this interface still more closely now. How about >> using CAP_SYS_RESOURCE, rather than the hugely overloaded >> CAP_SYS_ADMIN as the governor for the capability check? Again, it's >> about consistency. Here's what CAP_SYS_RESOURCE currently governs: >> >> CAP_SYS_RESOURCE >> * Use reserved space on ext2 file systems; >> * make ioctl(2) calls controlling ext3 journaling; >> * override disk quota limits; >> * increase resource limits (see setrlimit(2)); >> * override RLIMIT_NPROC resource limit; >> * raise msg_qbytes limit for a System V message queue >> above the limit >> in /proc/sys/kernel/msgmnb (see msgop(2) and msgctl(2)). >> >> Including the pipe size limit in this list makes sense. > > Another question: What happens if we adjust the capacity of a pipe to > a value that is smaller than the number of bytes currently in the > pipe? Hi Jens, This is my first cut at trying to describe the new interface: [[ Starting with kernel 2.6.35, Linux allows the capacity of a pipe to be modified. The Linux-specific fcntl(fd, F_SETPIPE_SZ, size) call changes the capacity of the pipe referred to by fd to be at least size bytes. An unprivileged process can change the pipe capacity to any value in the range from the system page size up to the value in /proc/sys/fs/pipe-max-pages???. The default value for pipe-max-pages???? is ???? bytes. A privileged (CAP_SYS_RESOURCE???) process can override this limit. When allocating space for the pipe, the kernel may round size up to some value convenient for the implementation. (In the initial implementation, size is rounded up to the next power-of-two multiple of the system page size.) The fcntl(fd, F_GETPIPE_SZ) call returns the actual size allocated for the pipe referred to by fd. ]] Obviously there's a few things that are still not final, such as * the resource name, * the name of the /proc file, and the behavior if 'size' is less than the number of bytes currently in the pipe. But other than that, how does this look? Cheers, Michael ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-24 1:43 ` OGAWA Hirofumi 2010-05-24 4:43 ` Michael Kerrisk @ 2010-05-24 7:04 ` Jens Axboe 2010-05-24 7:28 ` Michael Kerrisk 2010-05-24 7:46 ` OGAWA Hirofumi 1 sibling, 2 replies; 47+ messages in thread From: Jens Axboe @ 2010-05-24 7:04 UTC (permalink / raw) To: OGAWA Hirofumi Cc: mtk.manpages, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel On Mon, May 24 2010, OGAWA Hirofumi wrote: > Jens Axboe <jens.axboe@oracle.com> writes: > > >> > We can easily make F_GETPIPE_SZ return bytes, but I don't think passing > >> > in bytes to F_SETPIPE_SZ makes a lot of sense. The pipe array must be a > >> > power of 2 in pages. So the question is if that makes the API cleaner, > >> > passing in number of pages but returning bytes? Or pass in bytes all > >> > around, but have F_SETPIPE_SZ round to the nearest multiple of pow2 in > >> > pages if need be. Then it would return a size at least what was passed > >> > in, or error. > > I really think "power of 2 in pages" is simply current implementation > detail, not detail of pipe API. Completely agree, one more reason more to make that dependency exposed in the API. > >> I'd recommend this: Pass it in and out in bytes. Don't round to a > >> power of 2. Require the user to know what they are doing. Give an > >> error if the user doesn't supply a power-of-2 * page-size for > >> F_SETPIPE_SZ. (Again, consider the case of architectures with > >> switchable page sizes.) > > > > But is there much point in erroring on an incorrect size? If the > > application says "I need at least 120kb of space in there", kernel > > returns "OK, you got 128kb". Would returning -1/EINVAL for that case > > really make a better API? Doesn't seem like it to me. > > FWIW, my first impression of this was setsockopt(SO_RCV/SNDBUF) of unix > socket. Well, API itself wouldn't say "at least this size" or "exactly > this size", so, in here, important thing is consistency of interfaces, I > think. (And the both is sane API at least for me if those had > consistency in the system.) > > Well, so how about set/get in bytes, and kernel will set "at least > specified size" actually like setsockopt(SO_RCV/SNDBUF)? Isn't that pretty much what I described? -- Jens Axboe ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-24 7:04 ` Jens Axboe @ 2010-05-24 7:28 ` Michael Kerrisk 2010-05-24 7:49 ` OGAWA Hirofumi 2010-05-24 14:51 ` Brian Bloniarz 2010-05-24 7:46 ` OGAWA Hirofumi 1 sibling, 2 replies; 47+ messages in thread From: Michael Kerrisk @ 2010-05-24 7:28 UTC (permalink / raw) To: Jens Axboe Cc: OGAWA Hirofumi, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel On Mon, May 24, 2010 at 9:04 AM, Jens Axboe <jens.axboe@oracle.com> wrote: > On Mon, May 24 2010, OGAWA Hirofumi wrote: >> Jens Axboe <jens.axboe@oracle.com> writes: >> >> >> > We can easily make F_GETPIPE_SZ return bytes, but I don't think passing >> >> > in bytes to F_SETPIPE_SZ makes a lot of sense. The pipe array must be a >> >> > power of 2 in pages. So the question is if that makes the API cleaner, >> >> > passing in number of pages but returning bytes? Or pass in bytes all >> >> > around, but have F_SETPIPE_SZ round to the nearest multiple of pow2 in >> >> > pages if need be. Then it would return a size at least what was passed >> >> > in, or error. >> >> I really think "power of 2 in pages" is simply current implementation >> detail, not detail of pipe API. > > Completely agree, one more reason more to make that dependency exposed > in the API. > >> >> I'd recommend this: Pass it in and out in bytes. Don't round to a >> >> power of 2. Require the user to know what they are doing. Give an >> >> error if the user doesn't supply a power-of-2 * page-size for >> >> F_SETPIPE_SZ. (Again, consider the case of architectures with >> >> switchable page sizes.) >> > >> > But is there much point in erroring on an incorrect size? If the >> > application says "I need at least 120kb of space in there", kernel >> > returns "OK, you got 128kb". Would returning -1/EINVAL for that case >> > really make a better API? Doesn't seem like it to me. >> >> FWIW, my first impression of this was setsockopt(SO_RCV/SNDBUF) of unix >> socket. Well, API itself wouldn't say "at least this size" or "exactly >> this size", so, in here, important thing is consistency of interfaces, I >> think. (And the both is sane API at least for me if those had >> consistency in the system.) >> >> Well, so how about set/get in bytes, and kernel will set "at least >> specified size" actually like setsockopt(SO_RCV/SNDBUF)? > > Isn't that pretty much what I described? Actually, SO_*BUF is pretty weird. It returns double what was supplied. It's not simply a matter of rounding up: it always doubles what was supplied. -- Michael Kerrisk Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/ Author of "The Linux Programming Interface" http://blog.man7.org/ -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-24 7:28 ` Michael Kerrisk @ 2010-05-24 7:49 ` OGAWA Hirofumi 2010-05-24 14:51 ` Brian Bloniarz 1 sibling, 0 replies; 47+ messages in thread From: OGAWA Hirofumi @ 2010-05-24 7:49 UTC (permalink / raw) To: mtk.manpages Cc: Jens Axboe, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel Michael Kerrisk <mtk.manpages@googlemail.com> writes: > Actually, SO_*BUF is pretty weird. It returns double what was > supplied. It's not simply a matter of rounding up: it always doubles > what was supplied. Yes. However, well, I'm feeling it also is implementation detail of "at least". :) -- OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-24 7:28 ` Michael Kerrisk 2010-05-24 7:49 ` OGAWA Hirofumi @ 2010-05-24 14:51 ` Brian Bloniarz 2010-05-24 15:43 ` Michael Kerrisk 1 sibling, 1 reply; 47+ messages in thread From: Brian Bloniarz @ 2010-05-24 14:51 UTC (permalink / raw) To: mtk.manpages Cc: Michael Kerrisk, Jens Axboe, OGAWA Hirofumi, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel On 05/24/2010 03:28 AM, Michael Kerrisk wrote: > Actually, SO_*BUF is pretty weird. It returns double what was > supplied. It's not simply a matter of rounding up: it always doubles > what was supplied. Rationale in net/core/sock.c: set_rcvbuf: sk->sk_userlocks |= SOCK_RCVBUF_LOCK; /* * We double it on the way in to account for * "struct sk_buff" etc. overhead. Applications * assume that the SO_RCVBUF setting they make will * allow that much actual data to be received on that * socket. * * Applications are unaware that "struct sk_buff" and * other overheads allocate from the receive buffer * during socket buffer allocation. * * And after considering the possible alternatives, * returning the value we actually used in getsockopt * is the most desirable behavior. */ if ((val * 2) < SOCK_MIN_RCVBUF) sk->sk_rcvbuf = SOCK_MIN_RCVBUF; else sk->sk_rcvbuf = val * 2; break; I'm guessing pipes don't have this kind of wrinkle. ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-24 14:51 ` Brian Bloniarz @ 2010-05-24 15:43 ` Michael Kerrisk 0 siblings, 0 replies; 47+ messages in thread From: Michael Kerrisk @ 2010-05-24 15:43 UTC (permalink / raw) To: Brian Bloniarz Cc: Jens Axboe, OGAWA Hirofumi, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel On Mon, May 24, 2010 at 4:51 PM, Brian Bloniarz <bmb@athenacr.com> wrote: > On 05/24/2010 03:28 AM, Michael Kerrisk wrote: >> Actually, SO_*BUF is pretty weird. It returns double what was >> supplied. It's not simply a matter of rounding up: it always doubles >> what was supplied. > > Rationale in net/core/sock.c: > > set_rcvbuf: > sk->sk_userlocks |= SOCK_RCVBUF_LOCK; > /* > * We double it on the way in to account for > * "struct sk_buff" etc. overhead. Applications > * assume that the SO_RCVBUF setting they make will > * allow that much actual data to be received on that > * socket. > * > * Applications are unaware that "struct sk_buff" and > * other overheads allocate from the receive buffer > * during socket buffer allocation. > * > * And after considering the possible alternatives, > * returning the value we actually used in getsockopt > * is the most desirable behavior. > */ > if ((val * 2) < SOCK_MIN_RCVBUF) > sk->sk_rcvbuf = SOCK_MIN_RCVBUF; > else > sk->sk_rcvbuf = val * 2; > break; > > I'm guessing pipes don't have this kind of wrinkle. Yes, all of the above is understood. It's exposing these details to userspace that's weird... -- Michael Kerrisk Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/ Author of "The Linux Programming Interface" http://blog.man7.org/ ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-24 7:04 ` Jens Axboe 2010-05-24 7:28 ` Michael Kerrisk @ 2010-05-24 7:46 ` OGAWA Hirofumi 2010-05-24 17:15 ` Jens Axboe 1 sibling, 1 reply; 47+ messages in thread From: OGAWA Hirofumi @ 2010-05-24 7:46 UTC (permalink / raw) To: Jens Axboe Cc: mtk.manpages, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel Jens Axboe <jens.axboe@oracle.com> writes: >> >> I'd recommend this: Pass it in and out in bytes. Don't round to a >> >> power of 2. Require the user to know what they are doing. Give an >> >> error if the user doesn't supply a power-of-2 * page-size for >> >> F_SETPIPE_SZ. (Again, consider the case of architectures with >> >> switchable page sizes.) >> > >> > But is there much point in erroring on an incorrect size? If the >> > application says "I need at least 120kb of space in there", kernel >> > returns "OK, you got 128kb". Would returning -1/EINVAL for that case >> > really make a better API? Doesn't seem like it to me. >> >> FWIW, my first impression of this was setsockopt(SO_RCV/SNDBUF) of unix >> socket. Well, API itself wouldn't say "at least this size" or "exactly >> this size", so, in here, important thing is consistency of interfaces, I >> think. (And the both is sane API at least for me if those had >> consistency in the system.) >> >> Well, so how about set/get in bytes, and kernel will set "at least >> specified size" actually like setsockopt(SO_RCV/SNDBUF)? > > Isn't that pretty much what I described? Yes, probably. Well, 120kb was still multiple of page size. :) Thanks. -- OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-24 7:46 ` OGAWA Hirofumi @ 2010-05-24 17:15 ` Jens Axboe 2010-05-24 18:12 ` OGAWA Hirofumi 0 siblings, 1 reply; 47+ messages in thread From: Jens Axboe @ 2010-05-24 17:15 UTC (permalink / raw) To: OGAWA Hirofumi Cc: mtk.manpages, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel On Mon, May 24 2010, OGAWA Hirofumi wrote: > Jens Axboe <jens.axboe@oracle.com> writes: > > >> >> I'd recommend this: Pass it in and out in bytes. Don't round to a > >> >> power of 2. Require the user to know what they are doing. Give an > >> >> error if the user doesn't supply a power-of-2 * page-size for > >> >> F_SETPIPE_SZ. (Again, consider the case of architectures with > >> >> switchable page sizes.) > >> > > >> > But is there much point in erroring on an incorrect size? If the > >> > application says "I need at least 120kb of space in there", kernel > >> > returns "OK, you got 128kb". Would returning -1/EINVAL for that case > >> > really make a better API? Doesn't seem like it to me. > >> > >> FWIW, my first impression of this was setsockopt(SO_RCV/SNDBUF) of unix > >> socket. Well, API itself wouldn't say "at least this size" or "exactly > >> this size", so, in here, important thing is consistency of interfaces, I > >> think. (And the both is sane API at least for me if those had > >> consistency in the system.) > >> > >> Well, so how about set/get in bytes, and kernel will set "at least > >> specified size" actually like setsockopt(SO_RCV/SNDBUF)? > > > > Isn't that pretty much what I described? > > Yes, probably. Well, 120kb was still multiple of page size. :) It is, but 120KB/page_size is not (which is the power-of-2 of interest here). -- Jens Axboe ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-24 17:15 ` Jens Axboe @ 2010-05-24 18:12 ` OGAWA Hirofumi 2010-05-24 18:16 ` Michael Kerrisk 0 siblings, 1 reply; 47+ messages in thread From: OGAWA Hirofumi @ 2010-05-24 18:12 UTC (permalink / raw) To: Jens Axboe Cc: mtk.manpages, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel Jens Axboe <jens.axboe@oracle.com> writes: >> > Isn't that pretty much what I described? >> >> Yes, probably. Well, 120kb was still multiple of page size. :) > > It is, but 120KB/page_size is not (which is the power-of-2 of interest > here). Um... I have no interest whether rounded-up-size is power-of-2 or not. Because it is implementation detail. Anyway, we agreed with "in bytes" and "at least"? If so, I'm happy with it, and I'm sorry if I was just noise. -- OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-24 18:12 ` OGAWA Hirofumi @ 2010-05-24 18:16 ` Michael Kerrisk 0 siblings, 0 replies; 47+ messages in thread From: Michael Kerrisk @ 2010-05-24 18:16 UTC (permalink / raw) To: OGAWA Hirofumi Cc: Jens Axboe, Andrew Morton, Linus Torvalds, Miklos Szeredi, linux-fsdevel, linux-kernel On Mon, May 24, 2010 at 8:12 PM, OGAWA Hirofumi <hirofumi@mail.parknet.co.jp> wrote: > Jens Axboe <jens.axboe@oracle.com> writes: > >>> > Isn't that pretty much what I described? >>> >>> Yes, probably. Well, 120kb was still multiple of page size. :) >> >> It is, but 120KB/page_size is not (which is the power-of-2 of interest >> here). > > Um... I have no interest whether rounded-up-size is power-of-2 or > not. Because it is implementation detail. > > Anyway, we agreed with "in bytes" and "at least"? If so, I'm happy with > it, and I'm sorry if I was just noise. I thought it was good input! -- Michael Kerrisk Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/ Author of "The Linux Programming Interface" http://blog.man7.org/ ^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [patch] pipe: add support for shrinking and growing pipes 2010-05-19 16:45 [patch] pipe: add support for shrinking and growing pipes Miklos Szeredi 2010-05-19 16:49 ` Linus Torvalds @ 2010-05-20 12:52 ` Andi Kleen 1 sibling, 0 replies; 47+ messages in thread From: Andi Kleen @ 2010-05-20 12:52 UTC (permalink / raw) To: Miklos Szeredi; +Cc: linux-fsdevel, linux-kernel, jens.axboe, torvalds, akpm Miklos Szeredi <miklos@szeredi.hu> writes: > + > + /* > + * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't > + * expect a lot of shrink+grow operations, just free and allocate > + * again like we would do for growing. If the pipe currently > + * contains more buffers than arg, then return busy. > + */ > + if (arg < pipe->nrbufs) > + return -EBUSY; > + > + bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL); While this is conceptually like socket buffers, socket buffers have sophisticated mechanisms to throttle their memory use under low memory conditions. That's not there here? It means every user could pin a lot of memory. -Andi -- ak@linux.intel.com -- Speaking for myself only. ^ permalink raw reply [flat|nested] 47+ messages in thread
end of thread, other threads:[~2010-06-03 8:53 UTC | newest] Thread overview: 47+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2010-05-19 16:45 [patch] pipe: add support for shrinking and growing pipes Miklos Szeredi 2010-05-19 16:49 ` Linus Torvalds 2010-05-19 18:05 ` Jens Axboe 2010-05-19 19:05 ` Jens Axboe 2010-05-20 8:33 ` Miklos Szeredi 2010-05-20 8:37 ` Jens Axboe 2010-05-20 17:42 ` Linus Torvalds 2010-05-20 17:48 ` Jens Axboe 2010-05-21 17:13 ` Rick Sherm 2010-05-23 5:30 ` Michael Kerrisk 2010-05-23 2:38 ` Andrew Morton 2010-05-23 5:52 ` Michael Kerrisk 2010-05-23 7:09 ` Jens Axboe 2010-05-23 9:24 ` Michael Kerrisk 2010-05-23 17:47 ` Jens Axboe 2010-05-24 1:43 ` OGAWA Hirofumi 2010-05-24 4:43 ` Michael Kerrisk 2010-05-24 7:05 ` Jens Axboe 2010-05-24 7:27 ` Michael Kerrisk 2010-05-24 17:35 ` Jens Axboe 2010-05-24 17:52 ` Michael Kerrisk 2010-05-24 17:56 ` Jens Axboe 2010-05-25 4:01 ` Michael Kerrisk 2010-06-01 7:48 ` Jens Axboe 2010-06-01 15:22 ` Linus Torvalds 2010-06-01 16:36 ` Loke, Chetan 2010-05-27 6:49 ` Michael Kerrisk 2010-06-01 7:45 ` Jens Axboe 2010-06-02 19:25 ` Michael Kerrisk 2010-06-03 6:10 ` Jens Axboe 2010-06-03 6:46 ` Michael Kerrisk 2010-06-03 7:01 ` Jens Axboe 2010-06-03 7:05 ` Michael Kerrisk 2010-06-03 7:48 ` Michael Kerrisk 2010-06-03 7:58 ` Michael Kerrisk 2010-06-03 8:29 ` Michael Kerrisk 2010-06-03 8:53 ` Michael Kerrisk 2010-05-24 7:04 ` Jens Axboe 2010-05-24 7:28 ` Michael Kerrisk 2010-05-24 7:49 ` OGAWA Hirofumi 2010-05-24 14:51 ` Brian Bloniarz 2010-05-24 15:43 ` Michael Kerrisk 2010-05-24 7:46 ` OGAWA Hirofumi 2010-05-24 17:15 ` Jens Axboe 2010-05-24 18:12 ` OGAWA Hirofumi 2010-05-24 18:16 ` Michael Kerrisk 2010-05-20 12:52 ` Andi Kleen
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).