* [PATCH] fs/xfs: Add support for passing write life-time hint with log
[not found] <CGME20181203131558epcas2p14b6b38cb67d4915b1ba782e11ce7ffe6@epcas2p1.samsung.com>
@ 2018-12-03 13:12 ` Kanchan Joshi
2018-12-03 15:48 ` Holger Hoffstätte
0 siblings, 1 reply; 7+ messages in thread
From: Kanchan Joshi @ 2018-12-03 13:12 UTC (permalink / raw)
To: darrick.wong; +Cc: linux-xfs, Kanchan Joshi
Log gets updated in a circular fashion, and that makes life-time
of log-data different from other types of meta/user-data.
By passing a write life-time hint with log, GC efficiency of multi-stream SSD
gets improved, leading to endurance/performance benefits.
It is described in greater detail (along with results) in this "FAST 2018"
paper -
https://www.usenix.org/conference/fast18/presentation/rho
This patch introduces new mount option "logwritehint" to pass write hint
with XFS log.
Among other Linux file-systems, F2FS supports passing down such write
hints. While for Ext4 journal, I am preparing similar proposal.
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
---
fs/xfs/xfs_buf.c | 2 ++
fs/xfs/xfs_buf.h | 1 +
fs/xfs/xfs_log.c | 3 +++
fs/xfs/xfs_log_recover.c | 1 +
fs/xfs/xfs_mount.h | 2 ++
fs/xfs/xfs_super.c | 15 +++++++++++++--
6 files changed, 22 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index b21ea2b..00d17f6 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1370,6 +1370,8 @@ xfs_buf_ioapply_map(
bio->bi_end_io = xfs_buf_bio_end_io;
bio->bi_private = bp;
bio_set_op_attrs(bio, op, op_flags);
+ /* set write hint in bio */
+ bio->bi_write_hint = bp->b_write_hint;
for (; size && nr_pages; nr_pages--, page_index++) {
int rbytes, nbytes = PAGE_SIZE - offset;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index b9f5511..ba9c78c 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -196,6 +196,7 @@ typedef struct xfs_buf {
int b_retries;
unsigned long b_first_retry_time; /* in jiffies */
int b_last_error;
+ enum rw_hint b_write_hint; /* write hint for I/O */
const struct xfs_buf_ops *b_ops;
} xfs_buf_t;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index c3b610b..45e220d 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1881,6 +1881,8 @@ xlog_sync(
XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count));
+ /* set write hint in buffer */
+ bp->b_write_hint = log->l_mp->m_logwritehint;
/* Do we need to split this write into 2 parts? */
if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
@@ -1971,6 +1973,7 @@ xlog_sync(
bp->b_log_item = iclog;
bp->b_flags &= ~XBF_FLUSH;
bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA);
+ bp->b_write_hint = log->l_mp->m_logwritehint;
ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1fc9e90..8bf89fa 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -282,6 +282,7 @@ xlog_bwrite(
xfs_buf_lock(bp);
bp->b_io_length = nbblks;
bp->b_error = 0;
+ bp->b_write_hint = log->l_mp->m_logwritehint;
error = xfs_bwrite(bp);
if (error)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7964513..7f6b2b8 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -171,6 +171,8 @@ typedef struct xfs_mount {
struct workqueue_struct *m_log_workqueue;
struct workqueue_struct *m_eofblocks_workqueue;
struct workqueue_struct *m_sync_workqueue;
+ /* To store write hint (for log writes) passed during mount */
+ int m_logwritehint;
/*
* Generation of the filesysyem layout. This is incremented by each
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d3e6cd0..6449d213 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -71,7 +71,7 @@ enum {
Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
- Opt_discard, Opt_nodiscard, Opt_dax, Opt_err,
+ Opt_discard, Opt_nodiscard, Opt_dax, Opt_logwritehint, Opt_err,
};
static const match_table_t tokens = {
@@ -119,6 +119,7 @@ static const match_table_t tokens = {
{Opt_discard, "discard"}, /* Discard unused blocks */
{Opt_nodiscard, "nodiscard"}, /* Do not discard unused blocks */
{Opt_dax, "dax"}, /* Enable direct access to bdev pages */
+ {Opt_logwritehint, "logwritehint=%u"},/* Write-hint for log */
{Opt_err, NULL},
};
@@ -225,6 +226,10 @@ xfs_parseargs(
if (match_int(args, &mp->m_logbufs))
return -EINVAL;
break;
+ case Opt_logwritehint:
+ if (match_int(args, &mp->m_logwritehint))
+ return -EINVAL;
+ break;
case Opt_logbsize:
if (suffix_kstrtoint(args, 10, &mp->m_logbsize))
return -EINVAL;
@@ -405,7 +410,6 @@ xfs_parseargs(
mp->m_dalign = dsunit;
mp->m_swidth = dswidth;
}
-
if (mp->m_logbufs != -1 &&
mp->m_logbufs != 0 &&
(mp->m_logbufs < XLOG_MIN_ICLOGS ||
@@ -438,6 +442,13 @@ xfs_parseargs(
mp->m_readio_log = iosizelog;
mp->m_writeio_log = iosizelog;
}
+ if (mp->m_logwritehint < WRITE_LIFE_NOT_SET ||
+ mp->m_logwritehint > WRITE_LIFE_EXTREME) {
+ xfs_warn(mp, "invalid logwritehint value: %d [not %d-%d]",
+ mp->m_logwritehint, WRITE_LIFE_NOT_SET, WRITE_LIFE_EXTREME);
+ return -EINVAL;
+
+ }
return 0;
}
--
2.7.4
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH] fs/xfs: Add support for passing write life-time hint with log
2018-12-03 13:12 ` [PATCH] fs/xfs: Add support for passing write life-time hint with log Kanchan Joshi
@ 2018-12-03 15:48 ` Holger Hoffstätte
2018-12-03 16:34 ` Darrick J. Wong
0 siblings, 1 reply; 7+ messages in thread
From: Holger Hoffstätte @ 2018-12-03 15:48 UTC (permalink / raw)
To: Kanchan Joshi, darrick.wong; +Cc: linux-xfs
On 12/3/18 2:12 PM, Kanchan Joshi wrote:
> Log gets updated in a circular fashion, and that makes life-time
> of log-data different from other types of meta/user-data.
> By passing a write life-time hint with log, GC efficiency of multi-stream SSD
> gets improved, leading to endurance/performance benefits.
> It is described in greater detail (along with results) in this "FAST 2018"
> paper -
> https://www.usenix.org/conference/fast18/presentation/rho
>
> This patch introduces new mount option "logwritehint" to pass write hint
> with XFS log.
Is there any downside to passing the hints unconditionally?
Introducing a new mount option which depends on the internals of
an SSD seems .. unlikely to gain many friends.
Otherwise a great idea. :)
-h
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] fs/xfs: Add support for passing write life-time hint with log
2018-12-03 15:48 ` Holger Hoffstätte
@ 2018-12-03 16:34 ` Darrick J. Wong
2018-12-03 20:09 ` Dave Chinner
0 siblings, 1 reply; 7+ messages in thread
From: Darrick J. Wong @ 2018-12-03 16:34 UTC (permalink / raw)
To: Holger Hoffstätte; +Cc: Kanchan Joshi, linux-xfs
On Mon, Dec 03, 2018 at 04:48:12PM +0100, Holger Hoffstätte wrote:
> On 12/3/18 2:12 PM, Kanchan Joshi wrote:
> > Log gets updated in a circular fashion, and that makes life-time
> > of log-data different from other types of meta/user-data.
> > By passing a write life-time hint with log, GC efficiency of multi-stream SSD
> > gets improved, leading to endurance/performance benefits.
> > It is described in greater detail (along with results) in this "FAST 2018"
> > paper -
> > https://www.usenix.org/conference/fast18/presentation/rho
> > This patch introduces new mount option "logwritehint" to pass write hint
> > with XFS log.
>
> Is there any downside to passing the hints unconditionally?
Why wouldn't we always pass LIFE_EXTREME? Do people have setups where,
say, hint <= LIFE_MEDIUM gets a disk but anything longer than that gets
a big slow stone tablet, which is not where we'd want the metadata log?
For that matter, should we be passing write hints for other fs metadata?
Fixed AG headers never move, should they be LIFE_whateverthelogis ? How
about space and file metadata, which aren't fixed to certain locations?
> Introducing a new mount option which depends on the internals of
> an SSD seems .. unlikely to gain many friends.
> Otherwise a great idea. :)
Likewise, I'm not wild about adding mount options or passing raw
integers via mount(8) command line:
mount /dev/fd0 /mnt -o logwritehint=3 # ???
--D
> -h
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] fs/xfs: Add support for passing write life-time hint with log
2018-12-03 16:34 ` Darrick J. Wong
@ 2018-12-03 20:09 ` Dave Chinner
2018-12-04 12:11 ` Kanchan Joshi
0 siblings, 1 reply; 7+ messages in thread
From: Dave Chinner @ 2018-12-03 20:09 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: Holger Hoffstätte, Kanchan Joshi, linux-xfs
On Mon, Dec 03, 2018 at 08:34:57AM -0800, Darrick J. Wong wrote:
> On Mon, Dec 03, 2018 at 04:48:12PM +0100, Holger Hoffstätte wrote:
> > On 12/3/18 2:12 PM, Kanchan Joshi wrote:
> > > Log gets updated in a circular fashion, and that makes life-time
> > > of log-data different from other types of meta/user-data.
> > > By passing a write life-time hint with log, GC efficiency of multi-stream SSD
> > > gets improved, leading to endurance/performance benefits.
> > > It is described in greater detail (along with results) in this "FAST 2018"
> > > paper -
> > > https://www.usenix.org/conference/fast18/presentation/rho
> > > This patch introduces new mount option "logwritehint" to pass write hint
> > > with XFS log.
> >
> > Is there any downside to passing the hints unconditionally?
>
> Why wouldn't we always pass LIFE_EXTREME? Do people have setups where,
> say, hint <= LIFE_MEDIUM gets a disk but anything longer than that gets
> a big slow stone tablet, which is not where we'd want the metadata log?
>
> For that matter, should we be passing write hints for other fs metadata?
> Fixed AG headers never move, should they be LIFE_whateverthelogis ? How
> about space and file metadata, which aren't fixed to certain locations?
I started looking at this recently because of the problems that were
being had with the XFS allocator interleaving short term and long
term data for certain applications. Part of this was getting the
userspace hints plumbed through to the inode, which then can be used
by the allocator to make high level placement decisions (e.g. AG
level) and then the hint gets plumbed through to the user data bios
as well.
Metadata is largely static, even the dynamic metadata, because we
overwrite in place and it doesn't move about all that much in common
workloads. So it was just looking at treating all the metadata as
the same, given that there are only 4 or 5 hint levels available.
> > Introducing a new mount option which depends on the internals of
> > an SSD seems .. unlikely to gain many friends.
> > Otherwise a great idea. :)
>
> Likewise, I'm not wild about adding mount options or passing raw
> integers via mount(8) command line:
>
> mount /dev/fd0 /mnt -o logwritehint=3 # ???
No mount option, please. Fix the log and metadata as "always
overwritten in place" write type hints, let user data be specified
by the dynamic per-inode hinting interface we already have.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] fs/xfs: Add support for passing write life-time hint with log
2018-12-03 20:09 ` Dave Chinner
@ 2018-12-04 12:11 ` Kanchan Joshi
2018-12-04 22:09 ` Dave Chinner
0 siblings, 1 reply; 7+ messages in thread
From: Kanchan Joshi @ 2018-12-04 12:11 UTC (permalink / raw)
To: Dave Chinner, Darrick J. Wong
Cc: Holger Hoffstätte, linux-xfs, jooyoung.hwang, chur.lee,
prakash.v
I expect log to have lifetime as "SHORT" in general. Log is bound to be
overwritten, as XFS continues performing transaction. So it is not good
idea to place it (inside SSD) with some other meta/data that is more
stable (or less stable, for that matter).
By assigning a distinct write-hint (SHORT, or anything else than NONE)
to log, this problem of mixing is solved.
Keeping a mount option seemed to offer more flexibility to
admin/system-designers. Assuming a single large SSD, hosting two XFS
volumes - one catering to fsync-heavy workloads, while another one with
reduced frequency of log writes. In that situation, one would not want
to mix the writes of two logs and instead prefer to configure one log as
"SHORT" and another one as "MEDIUM or EXTREME".
Also, this way (through mount option) seemed more in sync with how rest
of the kernel currently deals with streams/write-hints. In order to be
useful, write-hints need to be converted to specific stream numbers. For
NVMe SSDs, this is done by nvme-core module, but only if it is loaded
with "streams=1" option. F2FS has mount option for passing write-hints.
Default behavior is passing no write-hint.
To summarize, I have listed three schemes below. Please let me know
which one sounds more acceptable for patch -
1. [Current proposal] Keep write-hint (NONE) as default, and make it
overridable through mount option.
2. Keep immutable write-hint (say SHORT). Provide no mount option.
3. Keep write-hint (SHORT) as default, and make it overridable through
mount option.
Thanks,
On Tuesday 04 December 2018 01:39 AM, Dave Chinner wrote:
> On Mon, Dec 03, 2018 at 08:34:57AM -0800, Darrick J. Wong wrote:
>> On Mon, Dec 03, 2018 at 04:48:12PM +0100, Holger Hoffstätte wrote:
>>> On 12/3/18 2:12 PM, Kanchan Joshi wrote:
>>>> Log gets updated in a circular fashion, and that makes life-time
>>>> of log-data different from other types of meta/user-data.
>>>> By passing a write life-time hint with log, GC efficiency of multi-stream SSD
>>>> gets improved, leading to endurance/performance benefits.
>>>> It is described in greater detail (along with results) in this "FAST 2018"
>>>> paper -
>>>> https://www.usenix.org/conference/fast18/presentation/rho
>>>> This patch introduces new mount option "logwritehint" to pass write hint
>>>> with XFS log.
>>>
>>> Is there any downside to passing the hints unconditionally?
>>
>> Why wouldn't we always pass LIFE_EXTREME? Do people have setups where,
>> say, hint <= LIFE_MEDIUM gets a disk but anything longer than that gets
>> a big slow stone tablet, which is not where we'd want the metadata log?
>>
>> For that matter, should we be passing write hints for other fs metadata?
>> Fixed AG headers never move, should they be LIFE_whateverthelogis ? How
>> about space and file metadata, which aren't fixed to certain locations?
>
> I started looking at this recently because of the problems that were
> being had with the XFS allocator interleaving short term and long
> term data for certain applications. Part of this was getting the
> userspace hints plumbed through to the inode, which then can be used
> by the allocator to make high level placement decisions (e.g. AG
> level) and then the hint gets plumbed through to the user data bios
> as well.
>
> Metadata is largely static, even the dynamic metadata, because we
> overwrite in place and it doesn't move about all that much in common
> workloads. So it was just looking at treating all the metadata as
> the same, given that there are only 4 or 5 hint levels available.
>
>>> Introducing a new mount option which depends on the internals of
>>> an SSD seems .. unlikely to gain many friends.
>>> Otherwise a great idea. :)
>>
>> Likewise, I'm not wild about adding mount options or passing raw
>> integers via mount(8) command line:
>>
>> mount /dev/fd0 /mnt -o logwritehint=3 # ???
>
> No mount option, please. Fix the log and metadata as "always
> overwritten in place" write type hints, let user data be specified
> by the dynamic per-inode hinting interface we already have.
>
> Cheers,
>
> Dave.
>
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] fs/xfs: Add support for passing write life-time hint with log
2018-12-04 12:11 ` Kanchan Joshi
@ 2018-12-04 22:09 ` Dave Chinner
2018-12-10 15:15 ` Kanchan Joshi
0 siblings, 1 reply; 7+ messages in thread
From: Dave Chinner @ 2018-12-04 22:09 UTC (permalink / raw)
To: Kanchan Joshi
Cc: Darrick J. Wong, Holger Hoffstätte, linux-xfs,
jooyoung.hwang, chur.lee, prakash.v
On Tue, Dec 04, 2018 at 05:41:26PM +0530, Kanchan Joshi wrote:
> I expect log to have lifetime as "SHORT" in general. Log is bound to
> be overwritten, as XFS continues performing transaction. So it is
> not good idea to place it (inside SSD) with some other meta/data
> that is more stable (or less stable, for that matter).
> By assigning a distinct write-hint (SHORT, or anything else than
> NONE) to log, this problem of mixing is solved.
So, we have different definitions of what is "short lived"
and what is "long lived". The log is a -static allocation- it never
moves and so it always gets overwritten in place. It exists for the
life of the filesystem, so it's a long-lived structure. Some
metadata moves around - it's allocated and freed on demand, but is
still overwritten in place while it's in use.
The in-use life time of metadata can be very short, but it can also
be very long. It may never get overwritten, or it could be
overwritten multiple times a second. We have no real idea what is
going to happen with each individual piece of metadata because it is
completely dependent on user workloads.
So from a metadata perspective, life-time refers to how long the
metadata is in use in the filesystem, not how often it is accessed
or written. There's no "one-size-fits-all" bucket here.
> Keeping a mount option seemed to offer more flexibility to
> admin/system-designers.
OTOH, it gives everyone who is not an expert in storage and
filesystem implementations an opportunity to screw up in new and
exciting ways that are difficult to detect and impossible for XFS
developers to reproduce or debug.
>
> Assuming a single large SSD, hosting two XFS
> volumes - one catering to fsync-heavy workloads, while another one
> with reduced frequency of log writes. In that situation, one would
> not want to mix the writes of two logs and instead prefer to
> configure one log as "SHORT" and another one as "MEDIUM or EXTREME".
Here's the problem: you're making an assumption that "frequency of
log writes" equates to "the log is overwritten more often", and
that's not true. Frequent fsyncs typically mean lots of small log
writes that block each other, while applications that don't use
fsync will be doing lots of large async log writes and potentially
writing a lot more metadata to the log because nothing is blocking
waiting on journal IO completion......
Filesystems rarely behave in the ways non-filesystem developers
expect them to.
> Also, this way (through mount option) seemed more in sync with how
> rest of the kernel currently deals with streams/write-hints. In
> order to be useful, write-hints need to be converted to specific
> stream numbers. For NVMe SSDs, this is done by nvme-core module, but
> only if it is loaded with "streams=1" option. F2FS has mount option
> for passing write-hints. Default behavior is passing no write-hint.
There is no need for mount options, because we already have a
fcntl() interface that applications can use for setting write hints
on files. It was introduced in 4.13, and XFS already plumbs it
through for buffered write IO.
FYI:
$ man fcntl
....
File read/write hints
Write lifetime hints can be used to inform the kernel about
the relative expected lifetime of writes on a given inode or
via a particular open file description. (See open(2)
for an explanation of open file descriptions.) In this
context, the term "write lifetime" means the expected time
the data will live on media, before being overwritten or
erased.
.....
And the interfaces are:
F_GET_RW_HINT (uint64_t *; since Linux 4.13)
F_SET_RW_HINT (uint64_t *; since Linux 4.13)
F_GET_FILE_RW_HINT (uint64_t *; since Linux 4.13)
F_SET_FILE_RW_HINT (uint64_t *; since Linux 4.13)
And the types are:
RWH_WRITE_LIFE_NOT_SET
RWH_WRITE_LIFE_NONE
RWH_WRITE_LIFE_SHORT
RWH_WRITE_LIFE_MEDIUM
RWH_WRITE_LIFE_LONG
RWH_WRITE_LIFE_EXTREME
We probably also should make sure direct IO uses this hint, too, and
ideally we want set the write hint for the metadata in that file to
the same value as the user data being written, as the file metadata
is likely to have a similar lifetime to the user data it refers to.
IOWs, we want different metadata to have appropriately different
write hints, some of it will be controllable by the user per-file
write hints, others will be controlled by the filesystem itself as
userspace has no visibility or control over how that internal
metadata is managed.
> To summarize, I have listed three schemes below. Please let me know
> which one sounds more acceptable for patch -
> 1. [Current proposal] Keep write-hint (NONE) as default, and make it
> overridable through mount option.
> 2. Keep immutable write-hint (say SHORT). Provide no mount option.
> 3. Keep write-hint (SHORT) as default, and make it overridable
> through mount option.
Option 4: let the filesystem decide what is best dynamically,
because the lifetime of metadata and how often it is written is
a dynamic property of the specific metadata type.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] fs/xfs: Add support for passing write life-time hint with log
2018-12-04 22:09 ` Dave Chinner
@ 2018-12-10 15:15 ` Kanchan Joshi
0 siblings, 0 replies; 7+ messages in thread
From: Kanchan Joshi @ 2018-12-10 15:15 UTC (permalink / raw)
To: Dave Chinner
Cc: Darrick J. Wong, Holger Hoffstätte, linux-xfs,
jooyoung.hwang, chur.lee, prakash.v
Write life-time hint is not a feature in itself, it's an abstraction
built over a SSD feature "stream". And this abstraction is more rigid
than the feature, in terms of defining life-time buckets. Feature-wise,
it is sufficient to assign two stream numbers X and Y to isolate one
data from other (and to reap the benefits). While abstraction compels us
to debate on relative hotness-level between these two types of data.
Deciding relative hotness gets trickier as data-types increase, and
worse, it may not bring any goodness. If aim of the change is to get
goodness from SSD, we should consider lifetime from SSD's point-of-view.
And that is based on "overwrites".
Please refer figure 1 in this paper -
https://www.usenix.org/system/files/conference/fast18/fast18-rho.pdf
If a block is not overwritten by Host, it stays valid inside SSD; If it
gets overwritten, it becomes invalid and creates a hole. No holes are
good. All holes are also good. Intermixing of _few_ holes with _few_
valid blocks is bad.
Due to the way log is written, it stays valid (i.e. no overwrites) until
roll-over. After roll-over, it starts getting overwritten. If volume is
meta-light, log will stay valid for long. If volume is meta-heavy,
log-writes will start creating holes (invalid data). But either of the
situation is not problematic in itself. Problematic situation is when,
along with log updates, we start getting other data/meta updates. This
meta/data may or may not be as stable or transient. But point is, why to
bother about whether log is as hot/cold as something else. Problem can
be solved by isolating log-data in its own chamber, in its own stream.
It will either remain all-valid or turn all-invalid, unaffected by
everything else that goes around.
> Option 4: let the filesystem decide what is best dynamically,
> because the lifetime of metadata and how often it is written is
> a dynamic property of the specific metadata type.
I think log should be treated independently than any other meta/data.
Matching dynamic nature of meta-data with life-time hints seems harder
(than log) to get right. Abstraction-wise, FS can try to be very
accurate about changing life-time hints (change something from warm to
cold to hot etc.). But one should note that streams come with allocation
granularity. One can refer "SGS" in NVMe spec, page 275 -
https://nvmexpress.org/wp-content/uploads/NVM_Express_Revision_1.3.pdf.
Or, as seen in above figure 1, internally each write-hint/stream is
assigned on a fixed-size large region. Therefore possibility of internal
fragmentation needs to be considered while hopping from one hint to
another.
On Wednesday 05 December 2018 03:39 AM, Dave Chinner wrote:
> On Tue, Dec 04, 2018 at 05:41:26PM +0530, Kanchan Joshi wrote:
>> I expect log to have lifetime as "SHORT" in general. Log is bound to
>> be overwritten, as XFS continues performing transaction. So it is
>> not good idea to place it (inside SSD) with some other meta/data
>> that is more stable (or less stable, for that matter).
>> By assigning a distinct write-hint (SHORT, or anything else than
>> NONE) to log, this problem of mixing is solved.
>
> So, we have different definitions of what is "short lived"
> and what is "long lived". The log is a -static allocation- it never
> moves and so it always gets overwritten in place. It exists for the
> life of the filesystem, so it's a long-lived structure. Some
> metadata moves around - it's allocated and freed on demand, but is
> still overwritten in place while it's in use.
>
> The in-use life time of metadata can be very short, but it can also
> be very long. It may never get overwritten, or it could be
> overwritten multiple times a second. We have no real idea what is
> going to happen with each individual piece of metadata because it is
> completely dependent on user workloads.
>
> So from a metadata perspective, life-time refers to how long the
> metadata is in use in the filesystem, not how often it is accessed
> or written. There's no "one-size-fits-all" bucket here.
>
>> Keeping a mount option seemed to offer more flexibility to
>> admin/system-designers.
>
> OTOH, it gives everyone who is not an expert in storage and
> filesystem implementations an opportunity to screw up in new and
> exciting ways that are difficult to detect and impossible for XFS
> developers to reproduce or debug.
>
>>
>> Assuming a single large SSD, hosting two XFS
>> volumes - one catering to fsync-heavy workloads, while another one
>> with reduced frequency of log writes. In that situation, one would
>> not want to mix the writes of two logs and instead prefer to
>> configure one log as "SHORT" and another one as "MEDIUM or EXTREME".
>
> Here's the problem: you're making an assumption that "frequency of
> log writes" equates to "the log is overwritten more often", and
> that's not true. Frequent fsyncs typically mean lots of small log
> writes that block each other, while applications that don't use
> fsync will be doing lots of large async log writes and potentially
> writing a lot more metadata to the log because nothing is blocking
> waiting on journal IO completion......
>
> Filesystems rarely behave in the ways non-filesystem developers
> expect them to.
>
>> Also, this way (through mount option) seemed more in sync with how
>> rest of the kernel currently deals with streams/write-hints. In
>> order to be useful, write-hints need to be converted to specific
>> stream numbers. For NVMe SSDs, this is done by nvme-core module, but
>> only if it is loaded with "streams=1" option. F2FS has mount option
>> for passing write-hints. Default behavior is passing no write-hint.
>
> There is no need for mount options, because we already have a
> fcntl() interface that applications can use for setting write hints
> on files. It was introduced in 4.13, and XFS already plumbs it
> through for buffered write IO.
>
> FYI:
>
> $ man fcntl
> ....
> File read/write hints
>
> Write lifetime hints can be used to inform the kernel about
> the relative expected lifetime of writes on a given inode or
> via a particular open file description. (See open(2)
> for an explanation of open file descriptions.) In this
> context, the term "write lifetime" means the expected time
> the data will live on media, before being overwritten or
> erased.
> .....
>
> And the interfaces are:
>
> F_GET_RW_HINT (uint64_t *; since Linux 4.13)
> F_SET_RW_HINT (uint64_t *; since Linux 4.13)
> F_GET_FILE_RW_HINT (uint64_t *; since Linux 4.13)
> F_SET_FILE_RW_HINT (uint64_t *; since Linux 4.13)
>
> And the types are:
>
> RWH_WRITE_LIFE_NOT_SET
> RWH_WRITE_LIFE_NONE
> RWH_WRITE_LIFE_SHORT
> RWH_WRITE_LIFE_MEDIUM
> RWH_WRITE_LIFE_LONG
> RWH_WRITE_LIFE_EXTREME
>
> We probably also should make sure direct IO uses this hint, too, and
> ideally we want set the write hint for the metadata in that file to
> the same value as the user data being written, as the file metadata
> is likely to have a similar lifetime to the user data it refers to.
>
> IOWs, we want different metadata to have appropriately different
> write hints, some of it will be controllable by the user per-file
> write hints, others will be controlled by the filesystem itself as
> userspace has no visibility or control over how that internal
> metadata is managed.
>
>> To summarize, I have listed three schemes below. Please let me know
>> which one sounds more acceptable for patch -
>> 1. [Current proposal] Keep write-hint (NONE) as default, and make it
>> overridable through mount option.
>> 2. Keep immutable write-hint (say SHORT). Provide no mount option.
>> 3. Keep write-hint (SHORT) as default, and make it overridable
>> through mount option.
>
> Option 4: let the filesystem decide what is best dynamically,
> because the lifetime of metadata and how often it is written is
> a dynamic property of the specific metadata type.
>
> Cheers,
>
> Dave.
>
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2018-12-10 15:18 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
[not found] <CGME20181203131558epcas2p14b6b38cb67d4915b1ba782e11ce7ffe6@epcas2p1.samsung.com>
2018-12-03 13:12 ` [PATCH] fs/xfs: Add support for passing write life-time hint with log Kanchan Joshi
2018-12-03 15:48 ` Holger Hoffstätte
2018-12-03 16:34 ` Darrick J. Wong
2018-12-03 20:09 ` Dave Chinner
2018-12-04 12:11 ` Kanchan Joshi
2018-12-04 22:09 ` Dave Chinner
2018-12-10 15:15 ` Kanchan Joshi
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox