From: Shan Hai <shan.hai@oracle.com>
To: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: linux-xfs@vger.kernel.org
Subject: Re: [PATCH RFC 4/8] xfs: implement inline data read write code
Date: Fri, 6 Jul 2018 12:05:34 +0800 [thread overview]
Message-ID: <97dee0ab-3ecf-54f4-22a2-821c9663fb07@oracle.com> (raw)
In-Reply-To: <20180706033328.GM32415@magnolia>
On 2018年07月06日 11:33, Darrick J. Wong wrote:
> On Fri, Jul 06, 2018 at 11:12:25AM +0800, Shan Hai wrote:
>> Implement read/write functions for inline data access and use them
>> for buffered/DIO/DAX operations.
>>
>> Signed-off-by: Shan Hai <shan.hai@oracle.com>
>> ---
>> fs/xfs/xfs_file.c | 244 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
>> 1 file changed, 242 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
>> index a3e7767a5715..55047928b720 100644
>> --- a/fs/xfs/xfs_file.c
>> +++ b/fs/xfs/xfs_file.c
>> @@ -27,6 +27,7 @@
>> #include "xfs_pnfs.h"
>> #include "xfs_iomap.h"
>> #include "xfs_reflink.h"
>> +#include "xfs_defer.h"
>>
>> #include <linux/dcache.h>
>> #include <linux/falloc.h>
>> @@ -175,6 +176,185 @@ xfs_file_fsync(
>> return error;
>> }
>>
>> +/*
>> + * Read the inline data of a local format file.
>> + */
>> +STATIC ssize_t
>> +xfs_file_inline_data_read(
>> + struct xfs_inode *ip,
>> + int whichfork,
>> + struct kiocb *iocb,
>> + struct iov_iter *to)
>> +{
>> + xfs_ifork_t *ifp = XFS_IFORK_PTR(ip, whichfork);
>> + struct inode *inode = VFS_I(ip);
>> + loff_t isize = i_size_read(inode);
>> + loff_t *ppos = &iocb->ki_pos;
>> + size_t count = iov_iter_count(to);
>> + ssize_t ret = 0;
>> +
>> + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
>> + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
>> +
>> + if (!count)
>> + return 0;
>> +
>> + if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
>> + return 0;
>> +
>> + if (*ppos > isize)
>> + return 0;
>> +
>> + iov_iter_truncate(to, inode->i_sb->s_maxbytes);
>> + count = (*ppos + count > isize) ? isize - *ppos : count;
>> + ret = copy_to_iter(ifp->if_u1.if_data + *ppos, count, to);
>> + *ppos += ret;
>> +
>> + return ret;
>> +}
>> +
>> +/*
>> + * Try to inline the data into the xfs inode data fork when it has enough
>> + * space to hold it. The data inlining happens at below points:
>> + *
>> + * - writeback: for buffered writes
>> + * - write_iter: for dax/dio writes
>> + *
>> + * The extents to local format conversion is done here for DAX/DIO write,
>> + * while the same conversion is done in the writeback path for buffered write.
>> + * This function also converts inode from local to extents when the data
>> + * fork could not hold the data inline anymore.
>> + */
>> +STATIC ssize_t
>> +xfs_file_inline_data_write(
>> + struct xfs_inode *ip,
>> + int whichfork,
>> + struct kiocb *iocb,
>> + struct iov_iter *from)
>> +{
>> + struct xfs_mount *mp = ip->i_mount;
>> + xfs_ifork_t *ifp = XFS_IFORK_PTR(ip, whichfork);
>> + struct inode *inode = VFS_I(ip);
>> + struct address_space *mapping = inode->i_mapping;
>> + loff_t pos = iocb->ki_pos;
>> + size_t count = iov_iter_count(from);
>> + struct xfs_trans *tp;
>> + struct xfs_defer_ops dfops;
>> + xfs_fsblock_t first_block;
>> + int ret;
>> + int error = 0;
>> + int logflags = 0;
>> + loff_t written = 0;
>> + xfs_fileoff_t bno = 0;
>> + struct xfs_bmbt_irec map;
>> + int nmap;
>> + struct page *page;
>> + char *kaddr;
>> +
>> + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
>> + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
>> +
>> + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 1, 0, 0, &tp);
>> + if (error) {
>> + ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
>> + return error;
>> + }
>> +
>> + xfs_ilock(ip, XFS_ILOCK_EXCL);
>> + xfs_trans_ijoin(tp, ip, 0);
>> + xfs_defer_init(&dfops, &first_block);
>> +
>> + if (pos + count > XFS_IFORK_DSIZE(ip)) {
>> + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS) {
>> + error = 0;
>> + goto trans_cancel;
>> + }
>> +
>> + page = find_or_create_page(mapping, pos >> PAGE_SHIFT, GFP_NOFS);
>> + if (!page) {
>> + error = -ENOMEM;
>> + goto trans_cancel;
>> + }
>> + kaddr = kmap_atomic(page);
>> + memcpy(kaddr, ifp->if_u1.if_data, ifp->if_bytes);
>> + kunmap_atomic(kaddr);
>> + SetPageUptodate(page);
>> + unlock_page(page);
>> + put_page(page);
>> +
>> + xfs_bmap_forkoff_reset(ip, whichfork);
>> + ifp->if_flags &= ~XFS_IFINLINE;
>> + ifp->if_flags |= XFS_IFEXTENTS;
>> + ifp->if_u1.if_root = NULL;
>> + ifp->if_height = 0;
>> + ifp->if_bytes = 0;
>> + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
>> +
>> + written = ifp->if_bytes;
>> + error = xfs_bmap_first_unused(tp, ip, 1, &bno, whichfork);
>> + if (error)
>> + goto trans_cancel;
>> + nmap = 1;
>> + error = xfs_bmapi_write(tp, ip, bno, 1, XFS_BMAPI_ENTIRE,
>> + &first_block, 1, &map, &nmap, &dfops);
>> + if (error)
>> + goto trans_cancel;
>> +
>> + goto out_local_to_extents;
>> + }
>> +
>> + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
>> + xfs_idata_realloc(ip, (pos + count) - ifp->if_bytes, whichfork);
>> + ret = copy_from_iter(ifp->if_u1.if_data + pos, count, from);
>> + written = ret;
>> + goto out;
>> + }
>> +
>> + if (IS_DAX(inode) || iocb->ki_flags & IOCB_DIRECT) {
>> + struct page *page = alloc_page(GFP_KERNEL | GFP_NOFS);
>> + if (!page) {
>> + error = -ENOMEM;
>> + goto trans_cancel;
>> + }
>> +
>> + kaddr = kmap_atomic(page);
>> + ret = copy_from_iter(kaddr + pos, count, from);
>> + kunmap_atomic(kaddr);
>> + written = ret;
>> + error = xfs_bmap_extents_to_local(tp, ip, &dfops, &logflags,
>> + XFS_DATA_FORK, page);
>> + if (error) {
>> + __free_page(page);
>> + goto trans_cancel;
>> + }
>> +
>> + __free_page(page);
>> + goto out;
>> + }
>> +trans_cancel:
>> + xfs_defer_cancel(&dfops);
>> + xfs_trans_cancel(tp);
>> + xfs_iunlock(ip, XFS_ILOCK_EXCL);
>> + return error;
>> +out:
>> + if (pos + written > i_size_read(inode))
>> + i_size_write(inode, pos + written);
>> + ip->i_d.di_size = pos + written;
>> +
>> +out_local_to_extents:
>> + logflags |= (XFS_ILOG_DDATA | XFS_ILOG_CORE);
>> + xfs_trans_log_inode(tp, ip, logflags);
>> +
>> + error = xfs_defer_finish(&tp, &dfops);
>> + if (error)
>> + goto trans_cancel;
>> + error = xfs_trans_commit(tp);
>> + xfs_iunlock(ip, XFS_ILOCK_EXCL);
>> +
>> + return written > 0 ? written : error;
>> +}
>> +
>> +
>> STATIC ssize_t
>> xfs_file_dio_aio_read(
>> struct kiocb *iocb,
>> @@ -192,7 +372,12 @@ xfs_file_dio_aio_read(
>> file_accessed(iocb->ki_filp);
>>
>> xfs_ilock(ip, XFS_IOLOCK_SHARED);
>> - ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
>> + if (xfs_sb_version_hasinlinedata(&ip->i_mount->m_sb) &&
>> + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_LOCAL) {
>> + ret = xfs_file_inline_data_read(ip, XFS_DATA_FORK, iocb, to);
> I thought iomap was growing the ability to handle inline data? At least
> for gfs2?
>
> Hmm, you might ask Christoph about how to support that... also these
> read/write paths are about to get a lot of rip/smash this cycle.
>
>> + } else {
>> + ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
>> + }
>> xfs_iunlock(ip, XFS_IOLOCK_SHARED);
>>
>> return ret;
>> @@ -219,6 +404,13 @@ xfs_file_dax_read(
>> xfs_ilock(ip, XFS_IOLOCK_SHARED);
>> }
>>
>> + if (xfs_sb_version_hasinlinedata(&ip->i_mount->m_sb) &&
>> + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_LOCAL) {
>> + ret = xfs_file_inline_data_read(ip, XFS_DATA_FORK, iocb, to);
>> + } else {
>> + ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
>> + }
>> +
>> ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
>> xfs_iunlock(ip, XFS_IOLOCK_SHARED);
>>
>> @@ -242,7 +434,12 @@ xfs_file_buffered_aio_read(
>> } else {
>> xfs_ilock(ip, XFS_IOLOCK_SHARED);
>> }
>> - ret = generic_file_read_iter(iocb, to);
>> +
>> + if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_LOCAL) {
>> + ret = xfs_file_inline_data_read(ip, XFS_DATA_FORK, iocb, to);
>> + } else {
>> + ret = generic_file_read_iter(iocb, to);
>> + }
>> xfs_iunlock(ip, XFS_IOLOCK_SHARED);
>>
>> return ret;
>> @@ -487,6 +684,7 @@ xfs_file_dio_aio_write(
>> size_t count = iov_iter_count(from);
>> struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
>> mp->m_rtdev_targp : mp->m_ddev_targp;
>> + int error;
>>
>> /* DIO must be aligned to device logical sector size */
>> if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
>> @@ -547,6 +745,22 @@ xfs_file_dio_aio_write(
>> }
>>
>> trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
>> +
>> + if (xfs_sb_version_hasinlinedata(&mp->m_sb) &&
>> + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE &&
> != BTREE? Why not == INLINE?
The xfs inode is born with EXTENTS format and the very first write to a
file will
encounter the EXTENTS since it's not converted to INLINE yet, the format
conversion happens in the following xfs_file_inline_data_write.
>> + S_ISREG(inode->i_mode)) {
>> + error = xfs_file_inline_data_write(ip, XFS_DATA_FORK, iocb,
>> + from);
> int != ssize_t... also, if we decided we needed extents format but the
> file turned out to be extents format already then did we lose the write
> here?
If the file is already in the extents format then the above
xfs_file_inline_data_write
just returns without format conversion and the control flow falls
through to the normal
extents handling because of the following if statement found that the
format is not LOCAL.
>> + if (error) {
>> + ret = error;
>> + goto out;
>> + }
>> + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
>> + ret = count;
>> + goto out;
>> + }
>> + }
>> +
>> ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
>> out:
>> xfs_iunlock(ip, iolock);
>> @@ -586,6 +800,20 @@ xfs_file_dax_write(
>> count = iov_iter_count(from);
>>
>> trace_xfs_file_dax_write(ip, count, pos);
>> +
>> + if (xfs_sb_version_hasinlinedata(&ip->i_mount->m_sb) &&
>> + ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
>> + S_ISREG(inode->i_mode)) {
>> + error = xfs_file_inline_data_write(ip, XFS_DATA_FORK,
>> + iocb, from);
>> + if (error)
>> + goto out;
>> + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
>> + ret = count;
>> + goto out;
>> + }
>> + }
>> +
>> ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops);
>> if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
>> i_size_write(inode, iocb->ki_pos);
>> @@ -614,6 +842,7 @@ xfs_file_buffered_aio_write(
>> struct address_space *mapping = file->f_mapping;
>> struct inode *inode = mapping->host;
>> struct xfs_inode *ip = XFS_I(inode);
>> + struct xfs_mount *mp = ip->i_mount;
>> ssize_t ret;
>> int enospc = 0;
>> int iolock;
>> @@ -629,6 +858,17 @@ xfs_file_buffered_aio_write(
>> if (ret)
>> goto out;
>>
>> + if (xfs_sb_version_hasinlinedata(&mp->m_sb) &&
>> + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE) {
>> + ret = xfs_file_inline_data_write(ip, XFS_DATA_FORK, iocb, from);
>> + if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) ==
>> + XFS_DINODE_FMT_LOCAL){
> What's going on with the indenting here?
Oops, sorry, I will fix it.
Thanks
Shan Hai
>
> --D
>
>> + if (likely(ret >= 0))
>> + iocb->ki_pos += ret;
>> + goto out;
>> + }
>> + }
>> +
>> /* We can write back this queue in page reclaim */
>> current->backing_dev_info = inode_to_bdi(inode);
>>
>> --
>> 2.11.0
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
next prev parent reply other threads:[~2018-07-06 4:05 UTC|newest]
Thread overview: 50+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-07-06 3:12 [PATCH RFC 0/8] xfs: introduce inode data inline feature Shan Hai
2018-07-06 3:12 ` [PATCH RFC 1/8] xfs: introduce inline data superblock feature bit Shan Hai
2018-07-06 3:34 ` Darrick J. Wong
2018-07-06 4:06 ` Shan Hai
2018-07-06 3:12 ` [PATCH RFC 2/8] xfs: introduce extents to local conversion helper Shan Hai
2018-07-06 3:45 ` Darrick J. Wong
2018-07-06 4:15 ` Shan Hai
2018-07-08 15:42 ` Christoph Hellwig
2018-07-09 1:58 ` Shan Hai
2018-07-06 3:12 ` [PATCH RFC 3/8] xfs: convert inode from extents to local format Shan Hai
2018-07-06 3:47 ` Darrick J. Wong
2018-07-06 4:24 ` Shan Hai
2018-07-06 3:12 ` [PATCH RFC 4/8] xfs: implement inline data read write code Shan Hai
2018-07-06 3:33 ` Darrick J. Wong
2018-07-06 4:05 ` Shan Hai [this message]
2018-07-08 15:45 ` Christoph Hellwig
2018-07-09 2:08 ` Shan Hai
2018-07-06 3:12 ` [PATCH RFC 5/8] xfs: consider the local format inode in misc operations Shan Hai
2018-07-06 3:40 ` Darrick J. Wong
2018-07-06 4:40 ` Shan Hai
2018-07-08 15:51 ` Christoph Hellwig
2018-07-09 3:06 ` Shan Hai
2018-07-06 3:12 ` [PATCH RFC 6/8] xfs: fix imbalanced locking Shan Hai
2018-07-08 15:53 ` Christoph Hellwig
2018-07-09 3:07 ` Shan Hai
2018-07-06 3:12 ` [PATCH RFC 7/8] xfs: return non-zero blocks for inline data Shan Hai
2018-07-08 15:54 ` Christoph Hellwig
2018-07-11 13:08 ` Carlos Maiolino
2018-07-12 1:03 ` Shan Hai
2018-07-12 1:13 ` Shan Hai
2018-07-12 1:31 ` Darrick J. Wong
2018-07-12 1:46 ` Shan Hai
2018-07-12 9:08 ` Carlos Maiolino
2018-07-12 10:48 ` Shan Hai
2018-07-13 12:39 ` Carlos Maiolino
2018-07-17 13:57 ` Christoph Hellwig
2018-07-18 15:03 ` Carlos Maiolino
2018-07-06 3:12 ` [PATCH RFC 8/8] xfs: skip local format inode for reflinking Shan Hai
2018-07-06 3:26 ` Darrick J. Wong
2018-07-06 3:54 ` Shan Hai
2018-07-08 16:00 ` Christoph Hellwig
2018-07-06 3:12 ` [PATCH RFC 1/1] xfsprogs: add inode inline data support Shan Hai
2018-07-06 3:35 ` Darrick J. Wong
2018-07-06 19:14 ` Eric Sandeen
2018-07-06 3:51 ` [PATCH RFC 0/8] xfs: introduce inode data inline feature Darrick J. Wong
2018-07-06 4:09 ` Shan Hai
2018-07-06 5:42 ` Dave Chinner
2018-07-06 6:39 ` Shan Hai
2018-07-06 7:11 ` Shan Hai
2018-07-08 15:58 ` Christoph Hellwig
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=97dee0ab-3ecf-54f4-22a2-821c9663fb07@oracle.com \
--to=shan.hai@oracle.com \
--cc=darrick.wong@oracle.com \
--cc=linux-xfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).