From: majianpeng <majianpeng@gmail.com>
To: "Yan, Zheng" <zheng.z.yan@intel.com>
Cc: sage <sage@inktank.com>, ceph-devel <ceph-devel@vger.kernel.org>,
linux-fsdevel <linux-fsdevel@vger.kernel.org>
Subject: Re: Re: [PATCH 1/2] ceph: Implement readv/preadv for sync operation.
Date: Thu, 5 Sep 2013 08:28:24 +0800 [thread overview]
Message-ID: <201309050828216701631@gmail.com> (raw)
In-Reply-To: 52272E1D.9090908@intel.com
>On 09/03/2013 04:52 PM, majianpeng wrote:
>> For readv/preadv sync operations, ceph only handles the first iov.
>> It ignores the other iovs. Now implement this.
>>
>> Signed-off-by: Jianpeng Ma <majianpeng@gmail.com>
>> ---
>> fs/ceph/file.c | 175 ++++++++++++++++++++++++++++++++++++++++-----------------
>> 1 file changed, 123 insertions(+), 52 deletions(-)
>>
>> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
>> index 3de8982..7d6a3ee 100644
>> --- a/fs/ceph/file.c
>> +++ b/fs/ceph/file.c
>> @@ -408,51 +408,95 @@ more:
>> *
>> * If the read spans object boundary, just do multiple reads.
>> */
>> -static ssize_t ceph_sync_read(struct file *file, char __user *data,
>> - unsigned len, loff_t *poff, int *checkeof)
>> +static ssize_t ceph_sync_read(struct kiocb *iocb, struct iovec *iov,
>> + unsigned long nr_segs, int *checkeof)
>> {
>> + struct file *file = iocb->ki_filp;
>> struct inode *inode = file_inode(file);
>> struct page **pages;
>> - u64 off = *poff;
>> - int num_pages, ret;
>> + u64 off = iocb->ki_pos;
>> + int num_pages, ret, i;
>>
>> - dout("sync_read on file %p %llu~%u %s\n", file, off, len,
>> + dout("sync_read on file %p %llu~%u %s\n", file, off,
>> + (unsigned)iocb->ki_left,
>> (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
>> -
>> - if (file->f_flags & O_DIRECT) {
>> - num_pages = calc_pages_for((unsigned long)data, len);
>> - pages = ceph_get_direct_page_vector(data, num_pages, true);
>> - } else {
>> - num_pages = calc_pages_for(off, len);
>> - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
>> - }
>> - if (IS_ERR(pages))
>> - return PTR_ERR(pages);
>> -
>> /*
>> * flush any page cache pages in this range. this
>> * will make concurrent normal and sync io slow,
>> * but it will at least behave sensibly when they are
>> * in sequence.
>> */
>> - ret = filemap_write_and_wait(inode->i_mapping);
>> + ret = filemap_write_and_wait_range(inode->i_mapping, off,
>> + off + iocb->ki_left);
>> if (ret < 0)
>> - goto done;
>> -
>> - ret = striped_read(inode, off, len, pages, num_pages, checkeof,
>> - file->f_flags & O_DIRECT,
>> - (unsigned long)data & ~PAGE_MASK);
>> + return ret;
>>
>> - if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
>> - ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
>> - if (ret >= 0)
>> - *poff = off + ret;
>> + if (file->f_flags & O_DIRECT) {
>> + for (i = 0; i < nr_segs; i++) {
>> + void __user *data = iov[i].iov_base;
>> + size_t len = iov[i].iov_len;
>> +
>> + num_pages = calc_pages_for((unsigned long)data, len);
>> + pages = ceph_get_direct_page_vector(data,
>> + num_pages, true);
>> + if (IS_ERR(pages))
>> + return PTR_ERR(pages);
>> +
>> + ret = striped_read(inode, off, len,
>> + pages, num_pages, checkeof,
>> + 1, (unsigned long)data & ~PAGE_MASK);
>> + ceph_put_page_vector(pages, num_pages, true);
>> +
>> + if (ret <= 0)
>> + break;
>> + off += ret;
>> + if (ret < len)
>> + break;
>> + }
>> + if (off > iocb->ki_pos) {
>> + ret = off - iocb->ki_pos;
>> + iocb->ki_pos = off;
>> + iocb->ki_left -= ret;
>> + }
>> + } else {
>> + size_t len = iocb->ki_left;
>>
>> -done:
>> - if (file->f_flags & O_DIRECT)
>> - ceph_put_page_vector(pages, num_pages, true);
>> - else
>> + num_pages = calc_pages_for(off, len);
>> + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
>> + if (IS_ERR(pages))
>> + return PTR_ERR(pages);
>> + ret = striped_read(inode, off, len, pages,
>> + num_pages, checkeof, 0, 0);
>> + len = ret;
>> + if (len) {
>> + int i, l, k = 0;
>> + size_t left = len;
>> +
>> + for (i = 0; i < nr_segs && left; i++) {
>> + void __user *data = iov[i].iov_base;
>> + l = min(left, iov[i].iov_len);
>> + ret = ceph_copy_page_vector_to_user(&pages[k],
>> + data, off,
>> + l);
>> + if (ret > 0) {
>> + left -= ret;
>> + off += ret;
>> + k = calc_pages_for(iocb->ki_pos,
>> + len - left + 1) - 1;
>> + BUG_ON(k >= num_pages && left);
>> + } else
>> + break;
>> + }
>> +
>> + if (left == 0) {
>> + iocb->ki_pos += len;
>> + iocb->ki_left -= len;
>> + ret = len;
>> + }
>
>If the user program provides an invalid buffer to the readv syscall, 'left' can be larger than 0.
>In this case, we should return the size of the data that was successfully copied.
OK, we should add this code before "if (left == 0)":
left = len - left;
>
>> + }
>> ceph_release_page_vector(pages, num_pages);
>> + }
>> +
>> dout("sync_read result %d\n", ret);
>> return ret;
>> }
>> @@ -647,55 +691,82 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
>> {
>> struct file *filp = iocb->ki_filp;
>> struct ceph_file_info *fi = filp->private_data;
>> - loff_t *ppos = &iocb->ki_pos;
>> - size_t len = iov->iov_len;
>> + size_t len = iocb->ki_left;
>> struct inode *inode = file_inode(filp);
>> struct ceph_inode_info *ci = ceph_inode(inode);
>> - void __user *base = iov->iov_base;
>> ssize_t ret;
>> int want, got = 0;
>> int checkeof = 0, read = 0;
>>
>> +
>> dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
>> inode, ceph_vinop(inode), pos, (unsigned)len, inode);
>> -again:
>> +
>> if (fi->fmode & CEPH_FILE_MODE_LAZY)
>> want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
>> else
>> want = CEPH_CAP_FILE_CACHE;
>> ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
>> if (ret < 0)
>> - goto out;
>> + return ret;
>> +
>> dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
>> inode, ceph_vinop(inode), pos, (unsigned)len,
>> ceph_cap_string(got));
>>
>> if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
>> (iocb->ki_filp->f_flags & O_DIRECT) ||
>> - (fi->flags & CEPH_F_SYNC))
>> + (fi->flags & CEPH_F_SYNC)) {
>> + unsigned long curr_seg = 0;
>> + struct iovec *iov_clone;
>> +
>> + iov_clone = kmalloc(nr_segs * sizeof(struct iovec), GFP_KERNEL);
>> + if (iov_clone == NULL) {
>> + ret = -ENOMEM;
>> + goto out;
>> + }
>> + memcpy(iov_clone, iov, nr_segs * sizeof(struct iovec));
>
>why do we need to clone the iov ?
>
I noticed that you asked the same question in patch 2.
The reason is that I need to modify the iov, but the iov is const, so I have to clone it.
Or do you have a better method?
>Regards
>Yan, Zheng
>
>> +again:
>> /* hmm, this isn't really async... */
>> - ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
>> - else
>> - ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
>> + ret = ceph_sync_read(iocb, &iov_clone[curr_seg],
>> + nr_segs - curr_seg, &checkeof);
>> +
>> + if (checkeof && ret >= 0) {
>> + int statret = ceph_do_getattr(inode,
>> + CEPH_STAT_CAP_SIZE);
>> +
>> + /* hit EOF or hole? */
>> + if (statret == 0 && iocb->ki_pos < inode->i_size &&
>> + iocb->ki_left) {
>> + size_t tmp = 0;
>> + dout("%s sync_read hit hole, ppos %lld < size %lld"
>> + ", reading more\n", __func__, iocb->ki_pos,
>> + inode->i_size);
>> +
>> + read += ret;
>> + for (; curr_seg < nr_segs; curr_seg++) {
>> + if ((tmp + iov_clone[curr_seg].iov_len)
>> + > ret)
>> + break;
>> + tmp += iov_clone[curr_seg].iov_len;
>> + }
>> +
>> + BUG_ON(curr_seg == nr_segs);
>> + iov_clone[curr_seg].iov_base += ret - tmp;
>> + iov_clone[curr_seg].iov_len -= ret - tmp;
>> + checkeof = 0;
>> + goto again;
>> + }
>> + }
>> + kfree(iov_clone);
>>
>> + } else
>> + ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
>> out:
>> dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
>> inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
>> ceph_put_cap_refs(ci, got);
>>
>> - if (checkeof && ret >= 0) {
>> - int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
>> -
>> - /* hit EOF or hole? */
>> - if (statret == 0 && *ppos < inode->i_size) {
>> - dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size);
>> - read += ret;
>> - base += ret;
>> - len -= ret;
>> - checkeof = 0;
>> - goto again;
>> - }
>> - }
>> if (ret >= 0)
>> ret += read;
>>
>>
>
next prev parent reply other threads:[~2013-09-05 0:28 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-09-03 8:52 [PATCH 1/2] ceph: Implement readv/preadv for sync operation majianpeng
2013-09-04 12:57 ` Yan, Zheng
2013-09-05 0:28 ` majianpeng [this message]
2013-09-05 2:51 ` Yan, Zheng
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=201309050828216701631@gmail.com \
--to=majianpeng@gmail.com \
--cc=ceph-devel@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=sage@inktank.com \
--cc=zheng.z.yan@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.