Re: [PATCH v5 10/10] fs: add support for copy file range in zonefs

linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: Nitesh Shetty <nj.shetty@samsung.com>
To: Amir Goldstein <amir73il@gmail.com>
Cc: axboe@kernel.dk, agk@redhat.com, snitzer@kernel.org,
	dm-devel@redhat.com, kbusch@kernel.org, hch@lst.de,
	sagi@grimberg.me, james.smart@broadcom.com, kch@nvidia.com,
	damien.lemoal@opensource.wdc.com, naohiro.aota@wdc.com,
	jth@kernel.org, viro@zeniv.linux.org.uk,
	linux-block@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-nvme@lists.infradead.org, linux-fsdevel@vger.kernel.org,
	anuj20.g@samsung.com, joshi.k@samsung.com, p.raghav@samsung.com,
	nitheshshetty@gmail.com, gost.dev@samsung.com
Subject: Re: [PATCH v5 10/10] fs: add support for copy file range in zonefs
Date: Wed, 23 Nov 2022 15:43:13 +0530	[thread overview]
Message-ID: <20221123101313.GB26377@test-zns> (raw)
In-Reply-To: <CAOQ4uxhMX9MF0+6DD7NO5QzqDRwESkhiY5f9CB7DXFVa22Za+w@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 9677 bytes --]

On Wed, Nov 23, 2022 at 08:53:14AM +0200, Amir Goldstein wrote:
> On Wed, Nov 23, 2022 at 8:26 AM Nitesh Shetty <nj.shetty@samsung.com> wrote:
> >
> > copy_file_range is implemented using copy offload,
> > copy offloading to device is always enabled.
> > To disable copy offloading mount with "no_copy_offload" mount option.
> > At present copy offload is only used, if the source and destination files
> > are on same block device, otherwise copy file range is completed by
> > generic copy file range.
> >
> > copy file range implemented as following:
> >         - write pending writes on the src and dest files
> >         - drop page cache for dest file if its conv zone
> >         - copy the range using offload
> >         - update dest file info
> >
> > For all failure cases we fallback to generic file copy range
> > At present this implementation does not support conv aggregation
> >
> > Signed-off-by: Nitesh Shetty <nj.shetty@samsung.com>
> > Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
> > ---
> >  fs/zonefs/super.c | 179 ++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 179 insertions(+)
> >
> > diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
> > index abc9a85106f2..15613433d4ae 100644
> > --- a/fs/zonefs/super.c
> > +++ b/fs/zonefs/super.c
> > @@ -1223,6 +1223,183 @@ static int zonefs_file_release(struct inode *inode, struct file *file)
> >         return 0;
> >  }
> >
> > +static int zonefs_is_file_copy_offset_ok(struct inode *src_inode,
> > +               struct inode *dst_inode, loff_t src_off, loff_t dst_off,
> > +               size_t *len)
> > +{
> > +       loff_t size, endoff;
> > +       struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
> > +
> > +       inode_lock(src_inode);
> > +       size = i_size_read(src_inode);
> > +       inode_unlock(src_inode);
> > +       /* Don't copy beyond source file EOF. */
> > +       if (src_off < size) {
> > +               if (src_off + *len > size)
> > +                       *len = (size - (src_off + *len));
> > +       } else
> > +               *len = 0;
> > +
> > +       mutex_lock(&dst_zi->i_truncate_mutex);
> > +       if (dst_zi->i_ztype == ZONEFS_ZTYPE_SEQ) {
> > +               if (*len > dst_zi->i_max_size - dst_zi->i_wpoffset)
> > +                       *len -= dst_zi->i_max_size - dst_zi->i_wpoffset;
> > +
> > +               if (dst_off != dst_zi->i_wpoffset)
> > +                       goto err;
> > +       }
> > +       mutex_unlock(&dst_zi->i_truncate_mutex);
> > +
> > +       endoff = dst_off + *len;
> > +       inode_lock(dst_inode);
> > +       if (endoff > dst_zi->i_max_size ||
> > +                       inode_newsize_ok(dst_inode, endoff)) {
> > +               inode_unlock(dst_inode);
> > +               goto err;
> > +       }
> > +       inode_unlock(dst_inode);
> > +
> > +       return 0;
> > +err:
> > +       mutex_unlock(&dst_zi->i_truncate_mutex);
> > +       return -EINVAL;
> > +}
> > +
> > +static ssize_t zonefs_issue_copy(struct zonefs_inode_info *src_zi,
> > +               loff_t src_off, struct zonefs_inode_info *dst_zi,
> > +               loff_t dst_off, size_t len)
> > +{
> > +       struct block_device *src_bdev = src_zi->i_vnode.i_sb->s_bdev;
> > +       struct block_device *dst_bdev = dst_zi->i_vnode.i_sb->s_bdev;
> > +       struct range_entry *rlist = NULL;
> > +       int ret = len;
> > +
> > +       rlist = kmalloc(sizeof(*rlist), GFP_KERNEL);
> > +       if (!rlist)
> > +               return -ENOMEM;
> > +
> > +       rlist[0].dst = (dst_zi->i_zsector << SECTOR_SHIFT) + dst_off;
> > +       rlist[0].src = (src_zi->i_zsector << SECTOR_SHIFT) + src_off;
> > +       rlist[0].len = len;
> > +       rlist[0].comp_len = 0;
> > +       ret = blkdev_issue_copy(src_bdev, dst_bdev, rlist, 1, NULL, NULL,
> > +                       GFP_KERNEL);
> > +       if (rlist[0].comp_len > 0)
> > +               ret = rlist[0].comp_len;
> > +       kfree(rlist);
> > +
> > +       return ret;
> > +}
> > +
> > +/* Returns length of possible copy, else returns error */
> > +static ssize_t zonefs_copy_file_checks(struct file *src_file, loff_t src_off,
> > +                                       struct file *dst_file, loff_t dst_off,
> > +                                       size_t *len, unsigned int flags)
> > +{
> > +       struct inode *src_inode = file_inode(src_file);
> > +       struct inode *dst_inode = file_inode(dst_file);
> > +       struct zonefs_inode_info *src_zi = ZONEFS_I(src_inode);
> > +       struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
> > +       ssize_t ret;
> > +
> > +       if (src_inode->i_sb != dst_inode->i_sb)
> > +               return -EXDEV;
> > +
> > +       /* Start by sync'ing the source and destination files for conv zones */
> > +       if (src_zi->i_ztype == ZONEFS_ZTYPE_CNV) {
> > +               ret = file_write_and_wait_range(src_file, src_off,
> > +                               (src_off + *len));
> > +               if (ret < 0)
> > +                       goto io_error;
> > +       }
> > +       inode_dio_wait(src_inode);
> > +
> > +       /* Start by sync'ing the source and destination files for conv zones */
> > +       if (dst_zi->i_ztype == ZONEFS_ZTYPE_CNV) {
> > +               ret = file_write_and_wait_range(dst_file, dst_off,
> > +                               (dst_off + *len));
> > +               if (ret < 0)
> > +                       goto io_error;
> > +       }
> > +       inode_dio_wait(dst_inode);
> > +
> > +       /* Drop dst file cached pages for a conv zone*/
> > +       if (dst_zi->i_ztype == ZONEFS_ZTYPE_CNV) {
> > +               ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
> > +                               dst_off >> PAGE_SHIFT,
> > +                               (dst_off + *len) >> PAGE_SHIFT);
> > +               if (ret < 0)
> > +                       goto io_error;
> > +       }
> > +
> > +       ret = zonefs_is_file_copy_offset_ok(src_inode, dst_inode, src_off,
> > +                       dst_off, len);
> > +       if (ret < 0)
> > +               return ret;
> > +
> > +       return *len;
> > +
> > +io_error:
> > +       zonefs_io_error(dst_inode, true);
> > +       return ret;
> > +}
> > +
> > +static ssize_t zonefs_copy_file(struct file *src_file, loff_t src_off,
> > +               struct file *dst_file, loff_t dst_off,
> > +               size_t len, unsigned int flags)
> > +{
> > +       struct inode *src_inode = file_inode(src_file);
> > +       struct inode *dst_inode = file_inode(dst_file);
> > +       struct zonefs_inode_info *src_zi = ZONEFS_I(src_inode);
> > +       struct zonefs_inode_info *dst_zi = ZONEFS_I(dst_inode);
> > +       ssize_t ret = 0, bytes;
> > +
> > +       inode_lock(src_inode);
> > +       inode_lock(dst_inode);
> > +       bytes = zonefs_issue_copy(src_zi, src_off, dst_zi, dst_off, len);
> > +       if (bytes < 0)
> > +               goto unlock_exit;
> > +
> > +       ret += bytes;
> > +
> > +       file_update_time(dst_file);
> > +       mutex_lock(&dst_zi->i_truncate_mutex);
> > +       zonefs_update_stats(dst_inode, dst_off + bytes);
> > +       zonefs_i_size_write(dst_inode, dst_off + bytes);
> > +       dst_zi->i_wpoffset += bytes;
> > +       mutex_unlock(&dst_zi->i_truncate_mutex);
> > +       /* if we still have some bytes left, do splice copy */
> > +       if (bytes && (bytes < len)) {
> > +               bytes = do_splice_direct(src_file, &src_off, dst_file,
> > +                                        &dst_off, len, flags);
> > +               if (bytes > 0)
> > +                       ret += bytes;
> > +       }
> > +unlock_exit:
> > +       if (ret < 0)
> > +               zonefs_io_error(dst_inode, true);
> > +       inode_unlock(src_inode);
> > +       inode_unlock(dst_inode);
> > +       return ret;
> > +}
> > +
> > +static ssize_t zonefs_copy_file_range(struct file *src_file, loff_t src_off,
> > +                                     struct file *dst_file, loff_t dst_off,
> > +                                     size_t len, unsigned int flags)
> > +{
> > +       ssize_t ret = -EIO;
> > +
> > +       ret = zonefs_copy_file_checks(src_file, src_off, dst_file, dst_off,
> > +                                    &len, flags);
> > +       if (ret > 0)
> > +               ret = zonefs_copy_file(src_file, src_off, dst_file, dst_off,
> > +                                    len, flags);
> > +       else if (ret < 0 && ret == -EXDEV)
> 
> First of all, ret < 0 is redundant.
> 

acked

> > +               ret = generic_copy_file_range(src_file, src_off, dst_file,
> > +                                             dst_off, len, flags);
> 
> But more importantly, why do you want to fall back to
> do_splice_direct() in zonefs copy_file_range?
> How does it serve your patch set or the prospect consumers
> of zonefs copy_file_range?
> 
> The reason I am asking is because commit 5dae222a5ff0
> ("vfs: allow copy_file_range to copy across devices")
> turned out to be an API mistake that was later reverted by
> 868f9f2f8e00 ("vfs: fix copy_file_range() regression in cross-fs copies")
> 
> It is always better to return EXDEV to userspace which can
> always fallback to splice itself, but maybe it has something
> smarter to do.
> 
> The places where it made sense for kernel to fallback to
> direct splice was for network servers server-side-copy, but that
> is independent of any specific filesystem copy_file_range()
> implementation.
> 
> Thanks,
> Amir.
> 

At present we don't handle a few cases, such as I/O getting split in the
case of copy offload, so we wanted to fall back to the existing mechanism.
So we went with the default operation, do_splice_direct.

Regards,
Nitesh Shetty


[-- Attachment #2: Type: text/plain, Size: 0 bytes --]

next prev parent reply	other threads:[~2022-11-23 12:14 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <CGME20221123061010epcas5p21cef9d23e4362b01f2b19d1117e1cdf5@epcas5p2.samsung.com>
2022-11-23  5:58 ` [PATCH v5 00/10] Implement copy offload support Nitesh Shetty
     [not found]   ` <CGME20221123061014epcas5p150fd8add12fe6d09b63c56972818e6a2@epcas5p1.samsung.com>
2022-11-23  5:58     ` [PATCH v5 01/10] block: Introduce queue limits for copy-offload support Nitesh Shetty
     [not found]   ` <CGME20221123061017epcas5p246a589e20eac655ac340cfda6028ff35@epcas5p2.samsung.com>
2022-11-23  5:58     ` [PATCH v5 02/10] block: Add copy offload support infrastructure Nitesh Shetty
2022-11-23  8:04       ` Ming Lei
2022-11-23 10:07         ` Nitesh Shetty
2022-11-24  0:03           ` Ming Lei
2022-11-29 11:44             ` Nitesh Shetty
2022-12-07  5:54               ` Nitesh Shetty
2022-12-07 11:19                 ` Ming Lei
2022-12-09  8:16                   ` Nitesh Shetty
     [not found]   ` <CGME20221123061021epcas5p276b6d48db889932282d017b27c9a3291@epcas5p2.samsung.com>
2022-11-23  5:58     ` [PATCH v5 03/10] block: add emulation for copy Nitesh Shetty
     [not found]   ` <CGME20221123061024epcas5p28fd0296018950d722b5a97e2875cf391@epcas5p2.samsung.com>
2022-11-23  5:58     ` [PATCH v5 04/10] block: Introduce a new ioctl " Nitesh Shetty
     [not found]   ` <CGME20221123061028epcas5p1aecd27b2f4f694b5a18b51d3df5d7432@epcas5p1.samsung.com>
2022-11-23  5:58     ` [PATCH v5 05/10] nvme: add copy offload support Nitesh Shetty
     [not found]   ` <CGME20221123061031epcas5p3745558c2caffd2fd21d15feff00495e9@epcas5p3.samsung.com>
2022-11-23  5:58     ` [PATCH v5 06/10] nvmet: add copy command support for bdev and file ns Nitesh Shetty
     [not found]       ` <482586a3-f45d-a17b-7630-341fb0e1ee96@linux.alibaba.com>
2022-11-23  9:39         ` Nitesh Shetty
2022-12-06  9:22       ` kernel test robot
     [not found]   ` <CGME20221123061034epcas5p3fe90293ad08df4901f98bae2d7cfc1ba@epcas5p3.samsung.com>
2022-11-23  5:58     ` [PATCH v5 07/10] dm: Add support for copy offload Nitesh Shetty
     [not found]   ` <CGME20221123061037epcas5p4d57436204fbe0065819b156eeeddbfac@epcas5p4.samsung.com>
2022-11-23  5:58     ` [PATCH v5 08/10] dm: Enable copy offload for dm-linear target Nitesh Shetty
     [not found]   ` <CGME20221123061041epcas5p4413569a46ee730cd3033a9025c8f134a@epcas5p4.samsung.com>
2022-11-23  5:58     ` [PATCH v5 09/10] dm kcopyd: use copy offload support Nitesh Shetty
     [not found]   ` <CGME20221123061044epcas5p2ac082a91fc8197821f29e84278b6203c@epcas5p2.samsung.com>
2022-11-23  5:58     ` [PATCH v5 10/10] fs: add support for copy file range in zonefs Nitesh Shetty
2022-11-23  6:53       ` Amir Goldstein
2022-11-23 10:13         ` Nitesh Shetty [this message]
2022-11-24  1:32       ` Damien Le Moal
2022-11-24  1:47         ` Damien Le Moal
2022-11-25  4:18           ` Al Viro
2022-11-29 12:22           ` Nitesh Shetty
2022-11-29 23:45             ` Damien Le Moal
2022-11-30  4:17               ` Nitesh Shetty
2022-11-30  9:55                 ` Damien Le Moal
2022-11-23 22:56   ` [PATCH v5 00/10] Implement copy offload support Chaitanya Kulkarni
2022-11-29 12:16     ` Nitesh Shetty
2022-11-30  0:05       ` Chaitanya Kulkarni
2022-11-30  4:14         ` Nitesh Shetty

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20221123101313.GB26377@test-zns \
    --to=nj.shetty@samsung.com \
    --cc=agk@redhat.com \
    --cc=amir73il@gmail.com \
    --cc=anuj20.g@samsung.com \
    --cc=axboe@kernel.dk \
    --cc=damien.lemoal@opensource.wdc.com \
    --cc=dm-devel@redhat.com \
    --cc=gost.dev@samsung.com \
    --cc=hch@lst.de \
    --cc=james.smart@broadcom.com \
    --cc=joshi.k@samsung.com \
    --cc=jth@kernel.org \
    --cc=kbusch@kernel.org \
    --cc=kch@nvidia.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=naohiro.aota@wdc.com \
    --cc=nitheshshetty@gmail.com \
    --cc=p.raghav@samsung.com \
    --cc=sagi@grimberg.me \
    --cc=snitzer@kernel.org \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).